## ItMLiHSmar2022
## regularisation_steps.R, child script
## Regularised model building and analysis for assignment
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
##
## Now modified to use in publication
##
## ====================================================================
## Step 0: data import and wrangling
## ====================================================================

setwd("/Users/au301842/PhysicalActivityandStrokeOutcome/1 PA Decline/")
# source("data_format.R")

library(glmnet)   # cv.glmnet(), roc.glmnet()
library(caret)    # createFolds(), confusionMatrix()
library(ggplot2)  # ROC plot
library(dplyr)    # %>% pipe (and the optional arrange() step in Step 5)
library(tibble)   # tibble()
library(gt)       # gt()

y1 <- factor(as.integer(y) - 1) # Outcome is required to be a factor of 0 or 1.

## ====================================================================
## Step 1: settings
## ====================================================================

## Folds
K <- 10
set.seed(3)
foldids <- caret::createFolds(y = y, k = K, list = FALSE, returnTrain = TRUE) # Fold ids, kept fixed for alpha tuning

## Defining tuning parameters
lambdas <- 2^seq(-10, 5, 1)
alphas <- seq(0, 1, .1)

## Weights for models
weighted <- TRUE
if (weighted) {
  wght <- as.vector(1 - (table(y)[y] / length(y))) # Inverse class-frequency weights
} else {
  wght <- rep(1, length(y)) # Unweighted: one weight per observation
}

## Numeric predictors are standardised (centred and scaled) by cv.glmnet
## through standardize = TRUE below.

## ====================================================================
## Step 2: all cross-validations for each alpha
## ====================================================================

library(furrr)
library(purrr)
library(doMC)
registerDoMC(cores = 6) # foreach backend; only used by cv.glmnet when parallel = TRUE
# future::plan(multisession) # Needed for future_map() to actually run in parallel

# Nested CVs with analysis over all lambdas for each alpha
# set.seed(3)
cvs <- future_map(alphas, function(a) {
  cv.glmnet(model.matrix(~ . - 1, X), y1,
            weights = wght,
            lambda = lambdas,
            type.measure = "deviance", # Default measure and recommended for tuning
            foldid = foldids, # Per recommendation, the folds are kept fixed for alpha optimisation
            alpha = a,
            standardize = TRUE,
            family = quasibinomial(), # Same as binomial, but not as picky
            keep = TRUE)
})

## ====================================================================
## Step 3: optimum lambda for each alpha
## ====================================================================

# For each alpha, lambda is chosen for the lowest measure (deviance)
each_alpha <- sapply(seq_along(alphas), function(id) {
  each_cv <- cvs[[id]]
  alpha_val <- alphas[id]
  index_lmin <- match(each_cv$lambda.min, each_cv$lambda)
  c(lamb = each_cv$lambda.min,
    alph = alpha_val,
    cvm = each_cv$cvm[index_lmin])
})

# Best lambda
best_lamb <- min(each_alpha["lamb", ])

# Alpha is chosen as the one reaching the lowest model deviance (each_alpha["cvm", ]) at the best lambda
best_alph <- each_alpha["alph", ][
  each_alpha["cvm", ] == min(each_alpha["cvm", ][each_alpha["lamb", ] %in% best_lamb])]

## https://stackoverflow.com/questions/42007313/plot-an-roc-curve-in-r-with-ggplot2
# Plots performance of the model with the best alpha at the best lambda
best_cv <- cvs[[match(best_alph, alphas)]]
p_roc <- roc.glmnet(best_cv$fit.preval, newy = y)[[match(best_lamb, best_cv$lambda)]] |>
  ggplot(aes(FPR, TPR)) +
  geom_step() +
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
  geom_abline() +
  theme_bw()

## ====================================================================
## Step 4: Creating the final model
## ====================================================================

source("regular_fun.R") # Custom function

optimised_model <- regular_fun(X, y1, K, lambdas = best_lamb, alpha = best_alph)
# With lambda and alpha specified, the function is just a k-fold cross-validation wrapper,
# but it keeps model performance figures from each fold.

list2env(optimised_model, .GlobalEnv) # The function outputs a list, which is unwrapped into the environment.
# See the source script for reference; an illustrative, assumed sketch follows below.
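
## --------------------------------------------------------------------
## Illustration only: regular_fun() is defined in regular_fun.R, which is
## not reproduced here. The sketch below (never called) shows roughly what
## such a k-fold wrapper could look like; the name regular_fun_sketch, the
## use of pROC for AUC, and the returned elements (B, auc_test, cMatTest)
## are assumptions matching how the objects are used in Steps 5-6, not the
## actual published implementation.
## --------------------------------------------------------------------
regular_fun_sketch <- function(X, y1, K, lambdas, alpha) {
  mm <- model.matrix(~ . - 1, X)
  folds <- caret::createFolds(y1, k = K, list = TRUE) # Held-out indices per fold
  B <- list()                  # Coefficients per fold
  auc_test <- matrix(NA, K, 1) # Test-fold AUC per fold
  cMatTest <- NULL             # Pooled test-fold confusion table
  for (k in seq_len(K)) {
    test <- folds[[k]]
    fit <- glmnet::glmnet(mm[-test, ], y1[-test], family = "binomial",
                          lambda = lambdas, alpha = alpha) # Fixed tuning parameters
    prob <- predict(fit, mm[test, ], type = "response")[, 1]
    pred <- factor(as.integer(prob > 0.5), levels = levels(y1))
    B[[k]] <- as.vector(coef(fit))
    auc_test[k, 1] <- as.numeric(pROC::auc(y1[test], prob, quiet = TRUE)) # pROC assumed available
    tab <- table(pred, y1[test])
    cMatTest <- if (is.null(cMatTest)) tab else cMatTest + tab
  }
  list(B = B, auc_test = auc_test, cMatTest = cMatTest)
}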
## ====================================================================
## Step 5: creating table of coefficients for inference
## ====================================================================

Bmatrix <- matrix(unlist(B), ncol = K) # One column of coefficients per fold
Bmedian <- apply(Bmatrix, 1, median)
Bmean <- apply(Bmatrix, 1, mean)

# medianX/meanX: median/mean coefficient across folds; ORmed/ORmea: corresponding odds ratios
reg_coef_tbl <- tibble(
  name = c("Intercept", Hmisc::label(X)),
  medianX = round(Bmedian, 5),
  ORmed = round(exp(Bmedian), 5),
  meanX = round(Bmean, 5),
  ORmea = round(exp(Bmean), 5)) %>%
  # arrange(desc(abs(medianX))) %>%
  gt()

## ====================================================================
## Step 6: summarising predictive performance
## ====================================================================

reg_cfm <- confusionMatrix(cMatTest)
reg_auc_sum <- summary(auc_test[, 1])

## ====================================================================
## Step 7: Packing list to save in loop
## ====================================================================

ls[[i]] <- list("RegularisedCoefs" = reg_coef_tbl,
                "bestA" = best_alph,
                "bestL" = best_lamb,
                "ConfusionMatrix" = reg_cfm,
                "AUROC" = reg_auc_sum)
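
## --------------------------------------------------------------------
## Usage note (assumed, for illustration): as a child script, this file
## expects the caller to provide `X`, `y`, `i` and the collector list `ls`,
## e.g. a parent script looping over outcomes. The object names below are
## hypothetical placeholders, not part of the published pipeline.
## --------------------------------------------------------------------
# ls <- list()
# for (i in seq_along(outcome_list)) {  # `outcome_list` is a placeholder name
#   y <- outcome_list[[i]]              # outcome factor for this iteration
#   X <- predictor_data                 # predictor data.frame, assumed fixed
#   source("regularisation_steps.R")    # runs Steps 0-7 and fills ls[[i]]
# }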