# File listing metadata: R script, 143 lines, 3.9 KiB
# Reference tutorial:
# https://www.machinelearningplus.com/machine-learning/caret-package/

# One-time setup (uncomment to install the packages listed for the tutorial):
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))

# Load the caret package
library(caret)

# Import dataset (read directly from a URL, so network access is required)
orange <- read.csv(
  "https://raw.githubusercontent.com/selva86/datasets/master/orange_juice_withmissing.csv"
)

# Structure of the data frame
str(orange)

# See the top 6 rows and the first 10 columns
head(orange[, 1:10])

# Create the training and test datasets.
# The seed is fixed so the random partition below is repeatable.
set.seed(100)

# Step 1: Get row numbers for the training data
# (p = 0.8 requests 80% of the rows; list = FALSE asks for a matrix of
# indices rather than a list, so it can be used directly for subsetting)
trainRowNumbers <- createDataPartition(orange$Purchase, p = 0.8, list = FALSE)

# Step 2: Create the training dataset
trainData <- orange[trainRowNumbers, ]

# Step 3: Create the test dataset (every row not selected for training)
testData <- orange[-trainRowNumbers, ]

# Store X and Y for later use.
# `y` is kept separately because Purchase is re-attached to trainData after
# the preprocessing steps further down.
x <- trainData[, 2:18]
y <- trainData$Purchase

# Summarise the training data with skimr; keep the result and print it
library(skimr)
skimmed <- skim(trainData)
skimmed

# Create the knn imputation model on the training data
preProcess_missingdata_model <- preProcess(trainData, method = "knnImpute")
preProcess_missingdata_model

# Use the imputation model to predict the values of missing data points
library(RANN) # required for knnImpute
trainData <- predict(preProcess_missingdata_model, newdata = trainData)

# Check whether any missing values remain after imputation
anyNA(trainData)

# One-Hot Encoding
# Creating dummy variables is converting a categorical variable to as many
# binary variables as there are categories.
dummies_model <- dummyVars(Purchase ~ ., data = trainData)

# Create the dummy variables using predict. The Y variable (Purchase) will not
# be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)

# Convert to dataframe
trainData <- data.frame(trainData_mat)

# See the structure of the new dataset
str(trainData)

# Fit a "range" preprocessing model on the training predictors and apply it.
# The fitted model is kept so the same transformation can be applied to the
# test data later.
preProcess_range_model <- preProcess(trainData, method = "range")
trainData <- predict(preProcess_range_model, newdata = trainData)

# Append the Y variable
trainData$Purchase <- y

# Show the min and max of the first 10 columns as a 2-row matrix.
# vapply() iterates over the data frame's columns directly (no coercion of
# the whole frame to a matrix) and checks that each result is a numeric pair.
vapply(
  trainData[, 1:10],
  function(col) c(min = min(col), max = max(col)),
  FUN.VALUE = c(min = 0, max = 0)
)

# Box plots of the predictors (columns 1-18), grouped by Purchase.
# Each panel gets its own x and y scale (relation = "free").
featurePlot(
  x = trainData[, 1:18],
  y = factor(trainData$Purchase),
  plot = "box",
  strip = strip.custom(par.strip.text = list(cex = .7)),
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  )
)

# Density plots of the same predictors, grouped by Purchase
featurePlot(
  x = trainData[, 1:18],
  y = factor(trainData$Purchase),
  plot = "density",
  strip = strip.custom(par.strip.text = list(cex = .7)),
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  )
)

# 5. Feature selection using recursive feature elimination (rfe)

set.seed(100)

# Silence warnings only while rfe() runs. options() returns the previous
# settings, which are restored below so that warnings are not left switched
# off for the rest of the session.
old_warn_opts <- options(warn = -1)

# Predictor-subset sizes to evaluate
subsets <- c(1:5, 10, 15, 18)

# rfe configuration: rfFuncs helper functions, repeated cross-validation
# with 5 repeats
ctrl <- rfeControl(
  functions = rfFuncs,
  method = "repeatedcv",
  repeats = 5,
  verbose = FALSE
)

# NOTE: the object is named `lmProfile`, but the control object above uses
# rfFuncs rather than lmFuncs.
# `finally` restores the warning level even if rfe() fails.
lmProfile <- tryCatch(
  rfe(
    x = trainData[, 1:18],
    y = factor(trainData$Purchase),
    sizes = subsets,
    rfeControl = ctrl
  ),
  finally = options(old_warn_opts)
)

lmProfile

# See available algorithms in caret.
# dput() writes the names to the console as an R expression and also returns
# them, so `modelnames` holds the character vector of model names.
modelnames <- dput(names(getModelInfo()))
# modelnames <- paste(names(getModelInfo()), collapse=', ')
modelnames

# Set the seed for reproducibility
set.seed(100)

# Train the model using MARS (method = "earth") and predict on the training
# data itself.
model_mars <- train(Purchase ~ ., data = trainData, method = "earth")
fitted <- predict(model_mars)

# Print the fitted model summary
model_mars

# Plot the fitted train object
plot(model_mars, main = "Model Accuracies with MARS")

# Compute and plot variable importance for the MARS model
varimp_mars <- varImp(model_mars)
plot(varimp_mars, main = "Variable Importance with MARS")

## 6.4
# Apply to the test data the same preprocessing models that were fitted on
# the training data, in the same order, then predict and evaluate.

# Step 1: Impute missing values
testData2 <- predict(preProcess_missingdata_model, testData)

# Step 2: Create one-hot encodings (dummy variables)
testData3 <- predict(dummies_model, testData2)

# Step 3: Transform the features to range between 0 and 1
testData4 <- predict(preProcess_range_model, testData3)

# View
head(testData4[, 1:10])

# Predict on the preprocessed test data with the MARS model
predicted <- predict(model_mars, testData4)
head(predicted)

# Compute the confusion matrix, with "MM" as the positive class
confusionMatrix(
  reference = factor(testData$Purchase),
  data = predicted,
  mode = "everything",
  positive = "MM"
)