# prediction exercise.R

# https://www.machinelearningplus.com/machine-learning/caret-package/
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
# Load the caret package
library(caret)
# Import dataset
orange <- read.csv('https://raw.githubusercontent.com/selva86/datasets/master/orange_juice_withmissing.csv')
# Structure of the dataframe
str(orange)
# See top 6 rows and 10 columns
head(orange[, 1:10])
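# The outcome we will model is Purchase (CH = Citrus Hill, MM = Minute Maid);
# check its class balance up front.
table(orange$Purchase)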
# Create the training and test datasets
set.seed(100)
# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(orange$Purchase, p=0.8, list=FALSE)
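# Optional check (not in the original script): createDataPartition samples
# within each class, so the training split should preserve the CH/MM proportions.
prop.table(table(orange$Purchase[trainRowNumbers]))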
# Step 2: Create the training dataset
trainData <- orange[trainRowNumbers,]
# Step 3: Create the test dataset
testData <- orange[-trainRowNumbers,]
# Store X and Y for later use.
x <- trainData[, 2:18]
y <- trainData$Purchase
library(skimr)
skimmed <- skim(trainData)
skimmed
# Create the knn imputation model on the training data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
preProcess_missingdata_model
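# Note: with method='knnImpute', caret automatically adds centering and
# scaling, since k-nearest-neighbour distances need comparable units.
# The applied methods are stored on the model object:
preProcess_missingdata_model$method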
# Use the imputation model to predict the values of missing data points
library(RANN) # required for knnImpute
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
anyNA(trainData)
# One-Hot Encoding
# Creating dummy variables converts a categorical variable into as many binary variables as there are categories.
dummies_model <- dummyVars(Purchase ~ ., data=trainData)
# Create the dummy variables using predict. The Y variable (Purchase) will not be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)
# Convert to dataframe
trainData <- data.frame(trainData_mat)
# See the structure of the new dataset
str(trainData)
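# Optional sanity check (not in the tutorial): dummyVars expands every factor
# into one column per level, so the predictor count should have grown.
dim(trainData)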
# Scale all predictors to the 0-1 range
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
# Append the Y variable
trainData$Purchase <- y
# The first 10 predictors should now range exactly from 0 to 1
apply(trainData[, 1:10], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
# Box plots of each predictor, split by the outcome class
featurePlot(x = trainData[, 1:18],
            y = factor(trainData$Purchase),
            plot = "box",
            strip = strip.custom(par.strip.text = list(cex = .7)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))
# Density plots of each predictor, split by the outcome class
featurePlot(x = trainData[, 1:18],
            y = factor(trainData$Purchase),
            plot = "density",
            strip = strip.custom(par.strip.text = list(cex = .7)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))
# 5. Feature selection using recursive feature elimination (RFE)
set.seed(100)
options(warn=-1) # silence warnings during the RFE run
# Candidate predictor-subset sizes to evaluate
subsets <- c(1:5, 10, 15, 18)
ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)
lmProfile <- rfe(x = trainData[, 1:18], y = factor(trainData$Purchase),
                 sizes = subsets,
                 rfeControl = ctrl)
lmProfile
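# Helper views of the RFE result (both are standard caret functions):
# the chosen predictors and the accuracy profile across subset sizes.
predictors(lmProfile)
plot(lmProfile, type = c("g", "o"))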
# See available algorithms in caret
modelnames <- dput(names(getModelInfo()))
# modelnames <- paste(names(getModelInfo()), collapse=', ')
modelnames
# Set the seed for reproducibility
set.seed(100)
# Train a MARS model (method='earth') and predict on the training data itself.
model_mars <- train(Purchase ~ ., data=trainData, method='earth')
fitted <- predict(model_mars)
model_mars
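# A hedged variation, not part of the original script: train() above used
# caret's default bootstrap resampling. An explicit 5-fold cross-validation
# setup would look like this ('tc' and 'model_mars_cv' are names introduced
# here for illustration).
tc <- trainControl(method = 'cv', number = 5)
model_mars_cv <- train(Purchase ~ ., data = trainData, method = 'earth',
                       trControl = tc, tuneLength = 5)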
plot(model_mars, main="Model Accuracies with MARS")
varimp_mars <- varImp(model_mars)
plot(varimp_mars, main="Variable Importance with MARS")
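# Training-set accuracy from the fitted values computed above; this is an
# optimistic estimate because the model has already seen these rows.
mean(fitted == y)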
## 6.4 Prepare the test dataset and predict
# Step 1: Impute missing values
testData2 <- predict(preProcess_missingdata_model, testData)
# Step 2: Create one-hot encodings (dummy variables)
testData3 <- predict(dummies_model, testData2)
# Step 3: Transform the features to range between 0 and 1
testData4 <- predict(preProcess_range_model, testData3)
# View
head(testData4[, 1:10])
# Step 4: Predict with the trained MARS model
predicted <- predict(model_mars, testData4)
head(predicted)
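# For models that support it (MARS via earth does), predict() can also
# return class probabilities instead of hard labels:
head(predict(model_mars, testData4, type = 'prob'))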
# Compute the confusion matrix
confusionMatrix(reference = factor(testData$Purchase), data = predicted, mode='everything', positive='MM')