# https://www.machinelearningplus.com/machine-learning/caret-package/
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))

# Load the caret package
library(caret)

# Import dataset
orange <- read.csv('https://raw.githubusercontent.com/selva86/datasets/master/orange_juice_withmissing.csv')

# Structure of the dataframe
str(orange)

# See top 6 rows and 10 columns
head(orange[, 1:10])

# Create the training and test datasets
set.seed(100)

# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(orange$Purchase, p=0.8, list=FALSE)

# Step 2: Create the training dataset
trainData <- orange[trainRowNumbers,]

# Step 3: Create the test dataset
testData <- orange[-trainRowNumbers,]

# Store X and Y for later use
x <- trainData[, 2:18]
y <- trainData$Purchase

# Get a descriptive summary of the training data
library(skimr)
skimmed <- skim(trainData)
skimmed

# Create the knn imputation model on the training data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
preProcess_missingdata_model

# Use the imputation model to predict the values of missing data points
library(RANN)  # required for knnImpute
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
anyNA(trainData)

# One-Hot Encoding
# Creating dummy variables converts a categorical variable into as many binary variables as there are categories.
dummies_model <- dummyVars(Purchase ~ ., data=trainData)

# Create the dummy variables using predict. The Y variable (Purchase) will not be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)

# Convert to dataframe
trainData <- data.frame(trainData_mat)

# See the structure of the new dataset
str(trainData)

# Transform all features to range between 0 and 1
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)

# Append the Y variable, stored as a factor (read.csv no longer converts strings automatically)
trainData$Purchase <- factor(y)

# Confirm every predictor now lies between 0 and 1
apply(trainData[, 1:10], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})

# Visualise how each predictor separates the two classes
featurePlot(x = trainData[, 1:18],
            y = factor(trainData$Purchase),
            plot = "box",
            strip = strip.custom(par.strip.text = list(cex = .7)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))

featurePlot(x = trainData[, 1:18],
            y = factor(trainData$Purchase),
            plot = "density",
            strip = strip.custom(par.strip.text = list(cex = .7)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))

# Feature selection using recursive feature elimination (rfe) with random forests
set.seed(100)
options(warn=-1)

subsets <- c(1:5, 10, 15, 18)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)

lmProfile <- rfe(x=trainData[, 1:18], y=factor(trainData$Purchase),
                 sizes = subsets,
                 rfeControl = ctrl)

lmProfile

# See available algorithms in caret
modelnames <- dput(names(getModelInfo()))
# modelnames <- paste(names(getModelInfo()), collapse=', ')
modelnames
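# A hedged aside, not from the original tutorial: by default train() tunes over
# a small internal grid using bootstrap resampling. A minimal sketch of making
# both explicit with 5-fold cross-validation; the 'nprune' and 'degree' values
# (MARS's tuning parameters in caret) are illustrative assumptions, not
# recommendations.
fitControl <- trainControl(method = "cv", number = 5)
marsGrid <- expand.grid(nprune = c(2, 4, 6, 8, 10), degree = 1:3)
model_mars_cv <- train(Purchase ~ ., data = trainData, method = 'earth',
                       trControl = fitControl, tuneGrid = marsGrid)
model_mars_cv$bestTune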
# Set the seed for reproducibility
set.seed(100)

# Train the model using MARS (method='earth') and predict on the training data itself
model_mars <- train(Purchase ~ ., data=trainData, method='earth')
fitted <- predict(model_mars)
model_mars

plot(model_mars, main="Model Accuracies with MARS")

# Compute and plot variable importance
varimp_mars <- varImp(model_mars)
plot(varimp_mars, main="Variable Importance with MARS")

# Prepare the test dataset and predict

# Step 1: Impute missing values
testData2 <- predict(preProcess_missingdata_model, testData)

# Step 2: Create one-hot encodings (dummy variables)
testData3 <- predict(dummies_model, testData2)

# Step 3: Transform the features to range between 0 and 1
testData4 <- predict(preProcess_range_model, testData3)

# View the processed test data
head(testData4[, 1:10])

# Predict on the test data
predicted <- predict(model_mars, testData4)
head(predicted)

# Compute the confusion matrix
confusionMatrix(reference = factor(testData$Purchase), data = predicted,
                mode='everything', positive='MM')
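# A hedged follow-up, assuming you also want class probabilities rather than
# hard labels: caret's predict.train accepts type = 'prob' for classifiers
# that support probability output, which MARS does here. Handy for ROC curves
# or for choosing a custom probability cutoff for the 'MM' class.
probabilities <- predict(model_mars, testData4, type = 'prob')
head(probabilities)  # one column per class: CH and MM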