2021-11-12 08:11:16 +01:00
|
|
|
# Data
|
|
|
|
## Import from previous work
|
2022-09-28 16:03:58 +02:00
|
|
|
# dta<-read.csv("/Volumes/Data/exercise/source/background.csv",na.strings = c("NA","","unknown"),colClasses = "character")
|
|
|
|
|
|
|
|
library(REDCapR)
|
|
|
|
library(lubridate)
|
|
|
|
library(dplyr)
|
|
|
|
library(daDoctoR)
|
|
|
|
|
|
|
|
# source("https://raw.githubusercontent.com/agdamsbo/daDoctoR/master/R/dob_extract_cpr_function.R")
|
|
|
|
|
|
|
|
# Read the selected baseline fields from the REDCap API, derive age at
# admission and drop the personal identifier (cpr) in the same pipeline.
# The API token is taken from the first column of a local, header-less CSV.
dta <- redcap_read_oneshot(
  redcap_uri = "https://redcap.au.dk/api/",
  token = read.csv(
    "/Users/au301842/talos_redcap_token.csv",
    header = FALSE
  )[[1]],
  fields = c(
    "talos_basis02a", # Admission date
    "cpr",
    "talos_nihss16_0", # Acute NIHSS
    "basis_kon",
    "reg_hojde", # All "reg_" fields come from "reg(ister/DAP)"
    "reg_vaegt",
    "reg_vaegt_anslaaet",
    "reg_rygning",
    "reg_alkohol",
    "reg_civil",
    "reg_bolig",
    "reg_diabetes",
    "reg_hyperten",
    "reg_perifer_arteriel",
    "reg_atriefli",
    "reg_ami",
    "reg_tidl_tci",
    "reg_trombolyse",
    "reg_trombektomi",
    "rtreat" # Trial treatment
  )
)$data |>
  # Age in years: time from the date returned by dob_extract_cpr(cpr)
  # (presumably the date of birth encoded in the CPR number -- confirm against
  # daDoctoR) to the admission date.
  mutate(
    age = time_length(
      talos_basis02a - dob_extract_cpr(cpr),
      unit = "year"
    )
  ) |>
  # The identifier is only needed for the age calculation above.
  select(-"cpr")
|
|
|
|
|
|
|
|
|
2021-11-12 08:11:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
## Cleaning and enhancing
|
|
|
|
# pase_drop is "yes" when pase_0_q is q_2, q_3 or q_4 AND pase_06_q is q_1,
# and "no" otherwise (missing comparisons stay NA).
# NOTE(review): `pase_06_q` is named differently from `pase_0_q` / `pase_6`;
# confirm this is the intended column name.
dta$pase_drop <- factor(
  (dta$pase_0_q == "q_2" | dta$pase_0_q == "q_3" | dta$pase_0_q == "q_4") &
    dta$pase_06_q == "q_1",
  levels = c(FALSE, TRUE),
  labels = c("no", "yes")
)

# The flag is undefined when either underlying PASE value is missing.
dta$pase_drop[is.na(dta$pase_0)] <- NA
dta$pase_drop[is.na(dta$pase_6)] <- NA
|
|
|
|
|
|
|
|
## Selection of data set and formatting
|
|
|
|
library(dplyr)
|
|
|
|
# Keep rows whose pase_0_q is not "q_1" and whose pase_drop flag is defined.
dta_f <- dta |>
  filter(
    pase_0_q != "q_1",
    !is.na(pase_drop)
  )
|
|
|
|
|
|
|
|
|
|
|
|
# Names of the columns kept for the modelling data set; used further down to
# subset dta_f. pase_drop (last entry) is the outcome.
variable_names <- c(
  "age",
  "sex",
  "weight",
  "height",
  "bmi",
  "smoke_ever",
  "civil",
  "diabetes",
  "hypertension",
  "pad",
  "afli",
  "ami",
  "tci",
  "nihss_0",
  "thrombolysis",
  "thrombechtomy",
  "rep_any",
  "pase_0_q",
  "pase_drop"
)
|
|
|
|
|
|
|
|
|
|
|
|
library(daDoctoR)
|
|
|
|
# Restrict the filtered data to the modelling columns.
dta2 <- dta_f[variable_names]

# Set column types with the daDoctoR helpers: col_num() for the continuous
# measures, col_fact() for the categorical ones (presumably conversion to
# numeric / factor -- confirm against daDoctoR).
dta2 <- col_num(
  c("age", "weight", "height", "bmi", "nihss_0"),
  dta2
)
dta2 <- col_fact(
  c(
    "sex", "smoke_ever", "civil", "diabetes", "hypertension", "pad",
    "afli", "ami", "tci", "thrombolysis", "thrombechtomy", "rep_any",
    "pase_0_q", "pase_drop"
  ),
  dta2
)
|
|
|
|
|
|
|
|
## Partitioning
|
|
|
|
library(caret)
|
|
|
|
# Fix the RNG seed so the train/test split below is reproducible.
set.seed(100)

## Step 1: Get row numbers for the training data
# 80% of the rows go to training; createDataPartition() samples on the
# outcome, and list = FALSE returns the indices as a matrix rather than a list.
trainRowNumbers <- createDataPartition(dta2$pase_drop, p = 0.8, list = FALSE)

## Step 2: Create the training dataset
trainData <- dta2[trainRowNumbers, ]

## Step 3: Create the test dataset
testData <- dta2[-trainRowNumbers, ]

# Keep the test outcome aside; `[[` always yields a plain vector, whether
# dta2 is a base data.frame or a tibble.
y_test <- testData[["pase_drop"]]

# Store X and Y for later use.
x <- trainData %>% select(!matches("pase_drop"))
y <- trainData[["pase_drop"]]
|
|
|
|
|
|
|
|
# Normalization and dummy binaries
|
|
|
|
|
|
|
|
# One-Hot Encoding
|
|
|
|
# Creating dummy variables converts a categorical variable into as many binary variables as there are categories.
|
|
|
|
# All preprocessing models (one-hot encoder, kNN imputer, range scaler) are
# fitted on the training data only and then applied, unchanged and in the same
# order, to the test data. This keeps both sets on identical columns and
# scales, and keeps test-set information out of the preprocessing.

# Fit the one-hot encoder on the training data.
dummies_model <- dummyVars(pase_drop ~ ., data = trainData)

# Create the dummy variables using predict. The outcome (pase_drop) will not
# be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)

# Convert to data frame
trainData <- data.frame(trainData_mat)

# See the structure of the new dataset
str(trainData)

# Imputation

library(RANN) # required for knnImpute
preProcess_missingdata_model <- preProcess(trainData, method = "knnImpute")
# preProcess_missingdata_model

# NOTE(review): the original author flagged this step as possibly raising an
# error ("Giver fejl??"); unresolved -- confirm it runs on the current data.
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
anyNA(trainData)

# skimr::skim(trainData)
# skimr::skim(x)

# Rescale the predictors with the "range" method, fitted on the training data.
preProcess_range_model <- preProcess(trainData, method = "range")
trainData <- predict(preProcess_range_model, newdata = trainData)

# Append the Y variable
trainData$pase_drop <- y

# Test data: encode, impute and rescale with the models fitted on the training
# data above. Test values can therefore fall outside the training range.
testData_mat <- predict(dummies_model, newdata = testData)
testData <- data.frame(testData_mat)
testData <- predict(preProcess_missingdata_model, newdata = testData)
testData <- predict(preProcess_range_model, newdata = testData)

# Append the Y variable
testData$pase_drop <- y_test
|
|
|
|
|
|
|
|
|
|
|
|
# Export
|
|
|
|
# Write the preprocessed training and test sets to CSV, without row names.
write.csv(
  x = trainData,
  file = "/Users/au301842/PhysicalActivityandStrokeOutcome/data/trainData.csv",
  row.names = FALSE
)
write.csv(
  x = testData,
  file = "/Users/au301842/PhysicalActivityandStrokeOutcome/data/testData.csv",
  row.names = FALSE
)
|