first baby steps in building a predictive model
This commit is contained in:
parent
364bbe290e
commit
ee38f4ce89
99
1 PA Decline/dataset.R
Normal file
99
1 PA Decline/dataset.R
Normal file
@ -0,0 +1,99 @@
|
||||
# Data
|
||||
## Import from previous work
|
||||
dta<-read.csv("/Volumes/Data/exercise/source/background.csv",na.strings = c("NA","","unknown"),colClasses = "character")
|
||||
|
||||
|
||||
## Cleaning and enhancing
|
||||
dta$pase_drop<-factor(ifelse((dta$pase_0_q=="q_2"|dta$pase_0_q=="q_3"|dta$pase_0_q=="q_4")&dta$pase_06_q=="q_1","yes","no"),levels = c("no","yes"))
|
||||
dta$pase_drop[is.na(dta$pase_6)]<-NA
|
||||
dta$pase_drop[is.na(dta$pase_0)]<-NA
|
||||
|
||||
## Selection of data set and formatting
|
||||
library(dplyr)
|
||||
dta_f<-dta %>% filter(pase_0_q != "q_1" & !is.na(pase_drop))
|
||||
|
||||
|
||||
variable_names<-c("age","sex","weight","height",
|
||||
"bmi",
|
||||
"smoke_ever",
|
||||
"civil",
|
||||
"diabetes",
|
||||
"hypertension",
|
||||
"pad",
|
||||
"afli",
|
||||
"ami",
|
||||
"tci",
|
||||
"nihss_0",
|
||||
"thrombolysis",
|
||||
"thrombechtomy",
|
||||
"rep_any","pase_0_q","pase_drop")
|
||||
|
||||
|
||||
library(daDoctoR)
|
||||
dta2<-dta_f[,variable_names]
|
||||
|
||||
dta2<-col_num(c("age","weight","height","bmi","nihss_0"),dta2)
|
||||
dta2<-col_fact(c("sex","smoke_ever","civil","diabetes", "hypertension","pad", "afli", "ami", "tci","thrombolysis", "thrombechtomy","rep_any","pase_0_q","pase_drop"),dta2)
|
||||
|
||||
## Partitioning
|
||||
library(caret)
|
||||
set.seed(100)
|
||||
|
||||
## Step 1: Get row numbers for the training data
|
||||
trainRowNumbers <- createDataPartition(dta2$pase_drop, p=0.8, list=FALSE)
|
||||
|
||||
## Step 2: Create the training dataset
|
||||
trainData <- dta2[trainRowNumbers,]
|
||||
|
||||
## Step 3: Create the test dataset
|
||||
testData <- dta2[-trainRowNumbers,]
|
||||
y_test = testData[,"pase_drop"]
|
||||
|
||||
# Store X and Y for later use.
|
||||
x = trainData %>% select(!matches("pase_drop"))
|
||||
y = trainData[,"pase_drop"]
|
||||
|
||||
# Normalization and dummy binaries
|
||||
|
||||
# One-Hot Encoding
|
||||
# Creating dummy variables is converting a categorical variable to as many binary variables as here are categories.
|
||||
dummies_model <- dummyVars(pase_drop ~ ., data=trainData)
|
||||
|
||||
# Create the dummy variables using predict. The Y variable (Purchase) will not be present in trainData_mat.
|
||||
trainData_mat <- predict(dummies_model, newdata = trainData)
|
||||
|
||||
# # Convert to dataframe
|
||||
trainData <- data.frame(trainData_mat)
|
||||
|
||||
# # See the structure of the new dataset
|
||||
str(trainData)
|
||||
|
||||
dummies_model <- dummyVars(pase_drop ~ ., data=testData)
|
||||
testData_mat <- predict(dummies_model, newdata = testData)
|
||||
testData <- data.frame(testData_mat)
|
||||
preProcess_range_model <- preProcess(testData, method='range')
|
||||
testData <- predict(preProcess_range_model, newdata = testData)
|
||||
testData$pase_drop<-y_test
|
||||
|
||||
# Imputation
|
||||
|
||||
library(RANN) # required for knnInpute
|
||||
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
|
||||
# preProcess_missingdata_model
|
||||
|
||||
trainData <- predict(preProcess_missingdata_model, newdata = trainData) # Giver fejl??
|
||||
anyNA(trainData)
|
||||
|
||||
# skimr::skim(trainData)
|
||||
# skimr::skim(x)
|
||||
|
||||
preProcess_range_model <- preProcess(trainData, method='range')
|
||||
trainData <- predict(preProcess_range_model, newdata = trainData)
|
||||
|
||||
# Append the Y variable
|
||||
trainData$pase_drop <- y
|
||||
|
||||
|
||||
# Export
|
||||
write.csv(trainData,"/Users/au301842/PhysicalActivityandStrokeOutcome/data/trainData.csv",row.names = FALSE)
|
||||
write.csv(testData,"/Users/au301842/PhysicalActivityandStrokeOutcome/data/testData.csv",row.names = FALSE)
|
@ -0,0 +1,142 @@
|
||||
# https://www.machinelearningplus.com/machine-learning/caret-package/
|
||||
|
||||
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
|
||||
|
||||
# Load the caret package
|
||||
library(caret)
|
||||
|
||||
# Import dataset
|
||||
orange <- read.csv('https://raw.githubusercontent.com/selva86/datasets/master/orange_juice_withmissing.csv')
|
||||
|
||||
# Structure of the dataframe
|
||||
str(orange)
|
||||
|
||||
# See top 6 rows and 10 columns
|
||||
head(orange[, 1:10])
|
||||
|
||||
# Create the training and test datasets
|
||||
set.seed(100)
|
||||
|
||||
# Step 1: Get row numbers for the training data
|
||||
trainRowNumbers <- createDataPartition(orange$Purchase, p=0.8, list=FALSE)
|
||||
|
||||
# Step 2: Create the training dataset
|
||||
trainData <- orange[trainRowNumbers,]
|
||||
|
||||
# Step 3: Create the test dataset
|
||||
testData <- orange[-trainRowNumbers,]
|
||||
|
||||
# Store X and Y for later use.
|
||||
x = trainData[, 2:18]
|
||||
y = trainData$Purchase
|
||||
|
||||
library(skimr)
|
||||
skimmed <- skim(trainData)
|
||||
skimmed
|
||||
|
||||
# Create the knn imputation model on the training data
|
||||
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
|
||||
preProcess_missingdata_model
|
||||
|
||||
# Use the imputation model to predict the values of missing data points
|
||||
library(RANN) # required for knnInpute
|
||||
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
|
||||
anyNA(trainData)
|
||||
|
||||
# One-Hot Encoding
|
||||
# Creating dummy variables is converting a categorical variable to as many binary variables as here are categories.
|
||||
dummies_model <- dummyVars(Purchase ~ ., data=trainData)
|
||||
|
||||
# Create the dummy variables using predict. The Y variable (Purchase) will not be present in trainData_mat.
|
||||
trainData_mat <- predict(dummies_model, newdata = trainData)
|
||||
|
||||
# # Convert to dataframe
|
||||
trainData <- data.frame(trainData_mat)
|
||||
|
||||
# # See the structure of the new dataset
|
||||
str(trainData)
|
||||
|
||||
|
||||
preProcess_range_model <- preProcess(trainData, method='range')
|
||||
trainData <- predict(preProcess_range_model, newdata = trainData)
|
||||
|
||||
# Append the Y variable
|
||||
trainData$Purchase <- y
|
||||
|
||||
apply(trainData[, 1:10], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
|
||||
|
||||
|
||||
featurePlot(x=trainData[,1:18],
|
||||
y=factor(trainData$Purchase),
|
||||
plot="box",
|
||||
strip=strip.custom(par.strip.text=list(cex=.7)),
|
||||
scales = list(x = list(relation="free"),
|
||||
y = list(relation="free")))
|
||||
|
||||
featurePlot(x=trainData[,1:18],
|
||||
y=factor(trainData$Purchase),
|
||||
plot="density",
|
||||
strip=strip.custom(par.strip.text=list(cex=.7)),
|
||||
scales = list(x = list(relation="free"),
|
||||
y = list(relation="free")))
|
||||
|
||||
# 5
|
||||
|
||||
set.seed(100)
|
||||
options(warn=-1)
|
||||
|
||||
subsets <- c(1:5, 10, 15, 18)
|
||||
|
||||
ctrl <- rfeControl(functions = rfFuncs,
|
||||
method = "repeatedcv",
|
||||
repeats = 5,
|
||||
verbose = FALSE)
|
||||
|
||||
lmProfile <- rfe(x=trainData[, 1:18], y=factor(trainData$Purchase),
|
||||
sizes = subsets,
|
||||
rfeControl = ctrl)
|
||||
|
||||
lmProfile
|
||||
|
||||
|
||||
# See available algorithms in caret
|
||||
modelnames <- dput(names(getModelInfo()))
|
||||
# modelnames <- paste(names(getModelInfo()), collapse=', ')
|
||||
modelnames
|
||||
|
||||
|
||||
# Set the seed for reproducibility
|
||||
set.seed(100)
|
||||
|
||||
# Train the model using randomForest and predict on the training data itself.
|
||||
model_mars = train(Purchase ~ ., data=trainData, method='earth')
|
||||
fitted <- predict(model_mars)
|
||||
|
||||
model_mars
|
||||
|
||||
|
||||
plot(model_mars, main="Model Accuracies with MARS")
|
||||
|
||||
varimp_mars <- varImp(model_mars)
|
||||
plot(varimp_mars, main="Variable Importance with MARS")
|
||||
|
||||
|
||||
## 6.4
|
||||
|
||||
# Step 1: Impute missing values
|
||||
testData2 <- predict(preProcess_missingdata_model, testData)
|
||||
|
||||
# Step 2: Create one-hot encodings (dummy variables)
|
||||
testData3 <- predict(dummies_model, testData2)
|
||||
|
||||
# Step 3: Transform the features to range between 0 and 1
|
||||
testData4 <- predict(preProcess_range_model, testData3)
|
||||
|
||||
# View
|
||||
head(testData4[, 1:10])
|
||||
|
||||
predicted <- predict(model_mars, testData4)
|
||||
head(predicted)
|
||||
|
||||
# Compute the confusion matrix
|
||||
confusionMatrix(reference = factor(testData$Purchase), data = predicted, mode='everything', positive='MM')
|
100
1 PA Decline/predictive_model.Rmd
Normal file
100
1 PA Decline/predictive_model.Rmd
Normal file
@ -0,0 +1,100 @@
|
||||
---
|
||||
title: "predictive_model"
|
||||
output: pdf_document
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
# Data
|
||||
```{r}
|
||||
library(caret)
|
||||
library(pROC)
|
||||
library(daDoctoR)
|
||||
```
|
||||
|
||||
Import
|
||||
```{r}
|
||||
trainData<-read.csv("/Users/au301842/PhysicalActivityandStrokeOutcome/data/trainData.csv",)
|
||||
testData<-read.csv("/Users/au301842/PhysicalActivityandStrokeOutcome/data/testData.csv",)
|
||||
```
|
||||
|
||||
|
||||
# Prediction
|
||||
Inspiration: https://stackoverflow.com/questions/30366143/how-to-compute-roc-and-auc-under-roc-after-training-using-caret-in-r and https://www.machinelearningplus.com/machine-learning/caret-package/
|
||||
|
||||
## Early visualisation
|
||||
|
||||
```{r}
|
||||
featurePlot(x = trainData %>% select(!matches("pase_drop")),
|
||||
y = factor(trainData$pase_drop),
|
||||
plot = "box",
|
||||
strip=strip.custom(par.strip.text=list(cex=.7)),
|
||||
scales = list(x = list(relation="free"),
|
||||
y = list(relation="free")))
|
||||
|
||||
featurePlot(x = trainData %>% select(!matches("pase_drop")),
|
||||
y = factor(trainData$pase_drop),
|
||||
plot = "density",
|
||||
strip=strip.custom(par.strip.text=list(cex=.7)),
|
||||
scales = list(x = list(relation="free"),
|
||||
y = list(relation="free")))
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
subsets <- c(1:10, 15, 18,33)
|
||||
|
||||
ctrl <- rfeControl(functions = rfFuncs,
|
||||
method = "repeatedcv",
|
||||
repeats = 5,
|
||||
verbose = FALSE)
|
||||
|
||||
lmProfile <- rfe(x = trainData %>% select(!matches("pase_drop")),
|
||||
y = trainData$pase_drop,
|
||||
sizes = subsets,
|
||||
rfeControl = ctrl)
|
||||
|
||||
lmProfile
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
set.seed(1000)
|
||||
|
||||
forest.model <- train(pase_drop ~., trainData)
|
||||
|
||||
result.predicted.prob <- predict(forest.model, testData, type="prob") # Prediction
|
||||
|
||||
result.roc <- roc(factor(testData$pase_drop), result.predicted.prob$no) # Draw ROC curve.
|
||||
|
||||
plot(result.roc, print.thres="best", print.thres.best.method="closest.topleft")
|
||||
|
||||
result.coords <- coords(result.roc, "best", best.method="closest.topleft", ret=c("threshold", "accuracy"))
|
||||
print(result.coords)#to get threshold and accuracy
|
||||
```
|
||||
|
||||
```{r}
|
||||
library(MLeval)
|
||||
|
||||
myTrainingControl <- trainControl(method = "cv",
|
||||
number = 10,
|
||||
savePredictions = TRUE,
|
||||
classProbs = TRUE,
|
||||
verboseIter = TRUE)
|
||||
|
||||
randomForestFit = train(x = trainData[,1:32],
|
||||
y = as.factor(trainData$pase_drop),
|
||||
method = "rf",
|
||||
trControl = myTrainingControl,
|
||||
preProcess = c("center","scale"),
|
||||
ntree = 50)
|
||||
|
||||
x <- evalm(randomForestFit)
|
||||
|
||||
x$roc
|
||||
|
||||
x$stdres
|
||||
```
|
||||
|
Loading…
Reference in New Issue
Block a user