first baby steps in building a predictive model
This commit is contained in:
parent 364bbe290e
commit ee38f4ce89

PA Decline/dataset.R (new file)
@@ -0,0 +1,99 @@
# Data

## Import from previous work

dta <- read.csv("/Volumes/Data/exercise/source/background.csv",
                na.strings = c("NA", "", "unknown"),
                colClasses = "character")
## Cleaning and enhancing

dta$pase_drop <- factor(
  ifelse((dta$pase_0_q == "q_2" | dta$pase_0_q == "q_3" | dta$pase_0_q == "q_4") &
           dta$pase_06_q == "q_1", "yes", "no"),
  levels = c("no", "yes")
)

# The outcome is only defined when PASE is observed at both time points
dta$pase_drop[is.na(dta$pase_6)] <- NA
dta$pase_drop[is.na(dta$pase_0)] <- NA
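# Quick sanity check of the derived outcome (a minimal sketch added here;
# the printed counts depend on the actual data):
table(dta$pase_drop, useNA = "ifany")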
## Selection of data set and formatting

library(dplyr)

dta_f <- dta %>% filter(pase_0_q != "q_1" & !is.na(pase_drop))
variable_names <- c("age", "sex", "weight", "height", "bmi",
                    "smoke_ever", "civil", "diabetes", "hypertension",
                    "pad", "afli", "ami", "tci", "nihss_0",
                    "thrombolysis", "thrombechtomy", "rep_any",
                    "pase_0_q", "pase_drop")
library(daDoctoR)

dta2 <- dta_f[, variable_names]

# Coerce columns to the proper classes (helpers from the daDoctoR package)
dta2 <- col_num(c("age", "weight", "height", "bmi", "nihss_0"), dta2)
dta2 <- col_fact(c("sex", "smoke_ever", "civil", "diabetes", "hypertension",
                   "pad", "afli", "ami", "tci", "thrombolysis",
                   "thrombechtomy", "rep_any", "pase_0_q", "pase_drop"), dta2)
## Partitioning

library(caret)
set.seed(100)

## Step 1: Get row numbers for the training data
# createDataPartition samples within each outcome class, preserving the
# yes/no balance in both splits
trainRowNumbers <- createDataPartition(dta2$pase_drop, p = 0.8, list = FALSE)

## Step 2: Create the training dataset
trainData <- dta2[trainRowNumbers, ]

## Step 3: Create the test dataset
testData <- dta2[-trainRowNumbers, ]
y_test <- testData[, "pase_drop"]
# Store X and Y for later use
x <- trainData %>% select(!matches("pase_drop"))
y <- trainData[, "pase_drop"]
# Normalization and dummy binaries

# One-Hot Encoding
# Creating dummy variables converts a categorical variable into as many
# binary variables as there are categories.
dummies_model <- dummyVars(pase_drop ~ ., data = trainData)

# Create the dummy variables using predict. The Y variable (pase_drop)
# will not be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)

# Convert to dataframe
trainData <- data.frame(trainData_mat)

# See the structure of the new dataset
str(trainData)
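# To see what dummyVars does, a minimal toy example (names and values
# here are illustrative only, not part of the study data):
toy <- data.frame(g = factor(c("a", "b", "a")))
predict(dummyVars(~ g, data = toy), newdata = toy)
# returns two 0/1 indicator columns, g.a and g.b, one per factor level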
# Repeat the encoding and rescaling on the test set. Note that this refits
# the dummy and range models on the test data itself, overwriting
# dummies_model from above.
dummies_model <- dummyVars(pase_drop ~ ., data = testData)
testData_mat <- predict(dummies_model, newdata = testData)
testData <- data.frame(testData_mat)

preProcess_range_model <- preProcess(testData, method = "range")
testData <- predict(preProcess_range_model, newdata = testData)
testData$pase_drop <- y_test
# Imputation

library(RANN) # required for knnImpute

# knnImpute also centers and scales the predictors, since k-nearest
# neighbours needs comparable distances
preProcess_missingdata_model <- preProcess(trainData, method = "knnImpute")
# preProcess_missingdata_model

trainData <- predict(preProcess_missingdata_model, newdata = trainData) # Throws an error??
anyNA(trainData)

# skimr::skim(trainData)
# skimr::skim(x)
# Rescale all predictors to the 0-1 range
preProcess_range_model <- preProcess(trainData, method = "range")
trainData <- predict(preProcess_range_model, newdata = trainData)
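# What method = "range" does, on a toy vector (illustrative values only):
predict(preProcess(data.frame(v = c(0, 5, 10)), method = "range"),
        data.frame(v = c(0, 5, 10)))
# maps 0, 5, 10 to 0.0, 0.5, 1.0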
# Append the Y variable
trainData$pase_drop <- y
# Export
write.csv(trainData, "/Users/au301842/PhysicalActivityandStrokeOutcome/data/trainData.csv", row.names = FALSE)
write.csv(testData, "/Users/au301842/PhysicalActivityandStrokeOutcome/data/testData.csv", row.names = FALSE)
@@ -0,0 +1,142 @@
# https://www.machinelearningplus.com/machine-learning/caret-package/

# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))

# Load the caret package
library(caret)
# Import dataset
orange <- read.csv('https://raw.githubusercontent.com/selva86/datasets/master/orange_juice_withmissing.csv')

# Structure of the dataframe
str(orange)

# See top 6 rows and 10 columns
head(orange[, 1:10])
# Create the training and test datasets
set.seed(100)

# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(orange$Purchase, p=0.8, list=FALSE)

# Step 2: Create the training dataset
trainData <- orange[trainRowNumbers,]

# Step 3: Create the test dataset
testData <- orange[-trainRowNumbers,]

# Store X and Y for later use.
x = trainData[, 2:18]
y = trainData$Purchase
library(skimr)
skimmed <- skim(trainData)
skimmed
# Create the knn imputation model on the training data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
preProcess_missingdata_model

# Use the imputation model to predict the values of missing data points
library(RANN) # required for knnImpute
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
anyNA(trainData)
# One-Hot Encoding
# Creating dummy variables converts a categorical variable into as many
# binary variables as there are categories.
dummies_model <- dummyVars(Purchase ~ ., data=trainData)

# Create the dummy variables using predict. The Y variable (Purchase) will not be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)

# Convert to dataframe
trainData <- data.frame(trainData_mat)

# See the structure of the new dataset
str(trainData)
# Rescale all predictors to the 0-1 range
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)

# Append the Y variable
trainData$Purchase <- y

# Confirm that every predictor now spans 0 to 1
apply(trainData[, 1:10], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
# Box plots of each predictor against the outcome
featurePlot(x=trainData[,1:18],
            y=factor(trainData$Purchase),
            plot="box",
            strip=strip.custom(par.strip.text=list(cex=.7)),
            scales = list(x = list(relation="free"),
                          y = list(relation="free")))

# Density plots of each predictor against the outcome
featurePlot(x=trainData[,1:18],
            y=factor(trainData$Purchase),
            plot="density",
            strip=strip.custom(par.strip.text=list(cex=.7)),
            scales = list(x = list(relation="free"),
                          y = list(relation="free")))
# 5: Recursive feature elimination (RFE) with random forest functions
set.seed(100)
options(warn=-1)

subsets <- c(1:5, 10, 15, 18)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)

lmProfile <- rfe(x=trainData[, 1:18], y=factor(trainData$Purchase),
                 sizes = subsets,
                 rfeControl = ctrl)

lmProfile
# See available algorithms in caret
modelnames <- dput(names(getModelInfo()))
# modelnames <- paste(names(getModelInfo()), collapse=', ')
modelnames
# Set the seed for reproducibility
set.seed(100)

# Train a MARS model (method='earth') and predict on the training data itself.
model_mars = train(Purchase ~ ., data=trainData, method='earth')
fitted <- predict(model_mars)

model_mars

plot(model_mars, main="Model Accuracies with MARS")

varimp_mars <- varImp(model_mars)
plot(varimp_mars, main="Variable Importance with MARS")
## 6.4: Prepare the test set with the models fitted on the training data

# Step 1: Impute missing values
testData2 <- predict(preProcess_missingdata_model, testData)

# Step 2: Create one-hot encodings (dummy variables)
testData3 <- predict(dummies_model, testData2)

# Step 3: Transform the features to range between 0 and 1
testData4 <- predict(preProcess_range_model, testData3)

# View
head(testData4[, 1:10])

predicted <- predict(model_mars, testData4)
head(predicted)

# Compute the confusion matrix
confusionMatrix(reference = factor(testData$Purchase), data = predicted, mode='everything', positive='MM')
PA Decline/predictive_model.Rmd (new file)
@@ -0,0 +1,100 @@
---
title: "predictive_model"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Data

```{r}
library(caret)
library(pROC)
library(dplyr)
library(daDoctoR)
```
Import
```{r}
trainData <- read.csv("/Users/au301842/PhysicalActivityandStrokeOutcome/data/trainData.csv")
testData <- read.csv("/Users/au301842/PhysicalActivityandStrokeOutcome/data/testData.csv")
```
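
The outcome comes back from `read.csv` as plain character, so it is recoded as a factor before modelling (levels taken from how `pase_drop` was defined in dataset.R):

```{r}
trainData$pase_drop <- factor(trainData$pase_drop, levels = c("no", "yes"))
testData$pase_drop <- factor(testData$pase_drop, levels = c("no", "yes"))
```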

# Prediction

Inspiration: https://stackoverflow.com/questions/30366143/how-to-compute-roc-and-auc-under-roc-after-training-using-caret-in-r and https://www.machinelearningplus.com/machine-learning/caret-package/
## Early visualisation

```{r}
featurePlot(x = trainData %>% select(!matches("pase_drop")),
            y = factor(trainData$pase_drop),
            plot = "box",
            strip = strip.custom(par.strip.text = list(cex = .7)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))

featurePlot(x = trainData %>% select(!matches("pase_drop")),
            y = factor(trainData$pase_drop),
            plot = "density",
            strip = strip.custom(par.strip.text = list(cex = .7)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))
```
```{r}
subsets <- c(1:10, 15, 18, 33)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)

lmProfile <- rfe(x = trainData %>% select(!matches("pase_drop")),
                 y = trainData$pase_drop,
                 sizes = subsets,
                 rfeControl = ctrl)

lmProfile
```
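
To list the variables retained by the RFE run (a caret accessor; the result depends on the resampling above):

```{r}
predictors(lmProfile)
```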
```{r}
set.seed(1000)

forest.model <- train(pase_drop ~ ., trainData)

result.predicted.prob <- predict(forest.model, testData, type = "prob") # Prediction

result.roc <- roc(factor(testData$pase_drop), result.predicted.prob$no) # Draw ROC curve

plot(result.roc, print.thres = "best", print.thres.best.method = "closest.topleft")

result.coords <- coords(result.roc, "best", best.method = "closest.topleft", ret = c("threshold", "accuracy"))
print(result.coords) # to get threshold and accuracy
```
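
The area under the curve can be read straight off the `pROC` object (a one-liner added for completeness):

```{r}
auc(result.roc)
```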
```{r}
library(MLeval)

myTrainingControl <- trainControl(method = "cv",
                                  number = 10,
                                  savePredictions = TRUE,
                                  classProbs = TRUE,
                                  verboseIter = TRUE)

randomForestFit = train(x = trainData[, 1:32],
                        y = as.factor(trainData$pase_drop),
                        method = "rf",
                        trControl = myTrainingControl,
                        preProcess = c("center", "scale"),
                        ntree = 50)

x <- evalm(randomForestFit)

x$roc

x$stdres
```