# PhysicalActivityandStrokeOu.../1 PA Decline/archive/dataset.R

# Data
## Import from previous work
dta <- read.csv("/Volumes/Data/exercise/source/background.csv",
                na.strings = c("NA", "", "unknown"),
                colClasses = "character")
## Cleaning and enhancing
# pase_drop: "yes" if pase_0_q was q_2-q_4 at baseline and pase_06_q is q_1 at follow-up
dta$pase_drop <- factor(ifelse((dta$pase_0_q == "q_2" | dta$pase_0_q == "q_3" | dta$pase_0_q == "q_4") &
                                 dta$pase_06_q == "q_1", "yes", "no"),
                        levels = c("no", "yes"))
# The outcome is undefined when either PASE score is missing
dta$pase_drop[is.na(dta$pase_6)] <- NA
dta$pase_drop[is.na(dta$pase_0)] <- NA
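# Optional sanity check of the derived outcome (counts, including missing):
table(dta$pase_drop, useNA = "ifany")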
## Selection of data set and formatting
library(dplyr)
# Keep subjects above the lowest baseline PASE group with a defined outcome
dta_f <- dta %>% filter(pase_0_q != "q_1" & !is.na(pase_drop))
variable_names <- c("age", "sex", "weight", "height", "bmi",
                    "smoke_ever", "civil", "diabetes", "hypertension",
                    "pad", "afli", "ami", "tci", "nihss_0",
                    "thrombolysis", "thrombechtomy", "rep_any",
                    "pase_0_q", "pase_drop")
library(daDoctoR)
dta2 <- dta_f[, variable_names]
# Convert columns to numeric and factor, respectively (helpers from daDoctoR)
dta2 <- col_num(c("age", "weight", "height", "bmi", "nihss_0"), dta2)
dta2 <- col_fact(c("sex", "smoke_ever", "civil", "diabetes", "hypertension", "pad",
                   "afli", "ami", "tci", "thrombolysis", "thrombechtomy",
                   "rep_any", "pase_0_q", "pase_drop"), dta2)
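# Optional: confirm the intended column classes after conversion
str(dta2)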
## Partitioning
library(caret)
set.seed(100)
## Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(dta2$pase_drop, p=0.8, list=FALSE)
## Step 2: Create the training dataset
trainData <- dta2[trainRowNumbers,]
## Step 3: Create the test dataset
testData <- dta2[-trainRowNumbers,]
y_test <- testData[, "pase_drop"]
# Store X and Y for later use.
x <- trainData %>% select(!matches("pase_drop"))
y <- trainData[, "pase_drop"]
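# createDataPartition stratifies on the outcome, so class proportions should be
# similar in the training and test data; optional check:
prop.table(table(y))
prop.table(table(y_test))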
# Normalization and dummy binaries
# One-Hot Encoding
# Creating dummy variables converts a categorical variable into as many binary variables as there are categories.
dummies_model <- dummyVars(pase_drop ~ ., data=trainData)
# Create the dummy variables using predict. The outcome variable (pase_drop) will not be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)
# Convert to data frame
trainData <- data.frame(trainData_mat)
# See the structure of the encoded training set
str(trainData)
# Apply the encoding fitted on the training data to the test set
testData_mat <- predict(dummies_model, newdata = testData)
testData <- data.frame(testData_mat)
# Rescale test predictors to [0, 1]; note this range model is fitted on the test set itself
preProcess_range_model <- preProcess(testData, method = 'range')
testData <- predict(preProcess_range_model, newdata = testData)
testData$pase_drop <- y_test
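# The test set is not passed through the kNN imputation below, so missing
# values may remain; optional check:
anyNA(testData)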
# Imputation
library(RANN) # required for knnImpute
# knnImpute in caret also centers and scales the predictors as part of imputation
preProcess_missingdata_model <- preProcess(trainData, method = 'knnImpute')
# preProcess_missingdata_model
trainData <- predict(preProcess_missingdata_model, newdata = trainData) # Gives an error??
anyNA(trainData)
# skimr::skim(trainData)
# skimr::skim(x)
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
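# Optional check: after range scaling, all encoded predictors should lie in [0, 1]
range(as.matrix(trainData))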
# Append the Y variable
trainData$pase_drop <- y
# Export
write.csv(trainData,"/Users/au301842/PhysicalActivityandStrokeOutcome/data/trainData.csv",row.names = FALSE)
write.csv(testData,"/Users/au301842/PhysicalActivityandStrokeOutcome/data/testData.csv",row.names = FALSE)
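# Downstream scripts can re-import the exported sets; read.csv drops the factor
# class, so the outcome would need to be re-declared, e.g. (sketch):
# trainData <- read.csv("/Users/au301842/PhysicalActivityandStrokeOutcome/data/trainData.csv")
# trainData$pase_drop <- factor(trainData$pase_drop, levels = c("no", "yes"))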