2022-09-28 16:03:58 +02:00
|
|
|
## ItMLiHSmar2022
|
|
|
|
## regularisation_steps.R, child script
|
|
|
|
## Regularised model building and analysation for assignment
|
|
|
|
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
|
|
|
|
##
|
|
|
|
## Now modified to use in publication
|
|
|
|
##
|
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
## Step 0: data import and wrangling
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
setwd("/Users/au301842/PhysicalActivityandStrokeOutcome/1 PA Decline/")
|
|
|
|
|
|
|
|
# source("data_format.R")
|
|
|
|
y1<-factor(as.integer(y)-1) ## Outcome is required to be factor of 0 or 1.
|
|
|
|
|
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
## Step 1: settings
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
## Folds
|
|
|
|
K=10
|
|
|
|
set.seed(3)
|
|
|
|
c<-caret::createFolds(y=y,
|
|
|
|
k = K,
|
|
|
|
list = FALSE,
|
|
|
|
returnTrain = TRUE) # Foldids for alpha tuning
|
|
|
|
|
|
|
|
## Defining tuning parameters
|
|
|
|
lambdas=2^seq(-10, 5, 1)
|
|
|
|
alphas<-seq(0,1,.1)
|
|
|
|
|
|
|
|
## Weights for models
|
|
|
|
weighted=TRUE
|
|
|
|
if (weighted == TRUE) {
|
|
|
|
wght<-as.vector(1 - (table(y)[y] / length(y)))
|
|
|
|
} else {
|
|
|
|
wght <- rep(1, nrow(y))
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
## Standardise numeric
|
|
|
|
## Centered and
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
## Step 2: all cross validations for each alpha
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
library(furrr)
|
|
|
|
library(purrr)
|
|
|
|
library(doMC)
|
|
|
|
registerDoMC(cores=6)
|
|
|
|
|
|
|
|
# Nested CVs with analysis for all lambdas for each alpha
|
|
|
|
#
|
|
|
|
set.seed(3)
|
|
|
|
cvs <- future_map(alphas, function(a){
|
|
|
|
cv.glmnet(model.matrix(~.-1,X),
|
|
|
|
y1,
|
|
|
|
weights = wght,
|
|
|
|
lambda=lambdas,
|
|
|
|
type.measure = "deviance", # This is standard measure and recommended for tuning
|
|
|
|
foldid = c, # Per recommendation the folds are kept for alpha optimisation
|
|
|
|
alpha=a,
|
|
|
|
standardize=TRUE,
|
|
|
|
family=quasibinomial,
|
|
|
|
keep=TRUE) # Same as binomial, but not as picky
|
|
|
|
})
|
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
# Step 3: optimum lambda for each alpha
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
|
|
|
|
# For each alpha, lambda is chosen for the lowest meassure (deviance)
|
|
|
|
each_alpha <- sapply(seq_along(alphas), function(id) {
|
|
|
|
each_cv <- cvs[[id]]
|
|
|
|
alpha_val <- alphas[id]
|
|
|
|
index_lmin <- match(each_cv$lambda.min,
|
|
|
|
each_cv$lambda)
|
|
|
|
c(lamb = each_cv$lambda.min,
|
|
|
|
alph = alpha_val,
|
|
|
|
cvm = each_cv$cvm[index_lmin])
|
|
|
|
})
|
|
|
|
|
|
|
|
# Best lambda
|
|
|
|
best_lamb <- min(each_alpha["lamb", ])
|
|
|
|
|
|
|
|
# Alpha is chosen for best lambda with lowest model deviance, each_alpha["cvm",]
|
|
|
|
best_alph <- each_alpha["alph",][each_alpha["cvm",]==min(each_alpha["cvm",]
|
|
|
|
[each_alpha["lamb",] %in% best_lamb])]
|
|
|
|
|
|
|
|
## https://stackoverflow.com/questions/42007313/plot-an-roc-curve-in-r-with-ggplot2
|
|
|
|
p_roc<-roc.glmnet(cvs[[1]]$fit.preval, newy = y)[[match(best_alph,alphas)]]|> # Plots performance from model with best alpha
|
|
|
|
ggplot(aes(FPR,TPR)) +
|
|
|
|
geom_step() +
|
|
|
|
coord_cartesian(xlim=c(0,1), ylim=c(0,1)) +
|
|
|
|
geom_abline()+
|
|
|
|
theme_bw()
|
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
# Step 4: Creating the final model
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
source("regular_fun.R") # Custom function
|
|
|
|
optimised_model<-regular_fun(X,y1,K,lambdas=best_lamb,alpha=best_alph)
|
|
|
|
# With lambda and alpha specified, the function is just a k-fold cross-validation wrapper,
|
|
|
|
# but keeps model performance figures from each fold.
|
|
|
|
|
|
|
|
list2env(optimised_model,.GlobalEnv)
|
|
|
|
# Function outputs a list, which is unwrapped to Env.
|
|
|
|
# See source script for reference.
|
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
# Step 5: creating table of coefficients for inference
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
Bmatrix<-matrix(unlist(B),ncol=10)
|
|
|
|
Bmedian<-apply(Bmatrix,1,median)
|
|
|
|
Bmean<-apply(Bmatrix,1,mean)
|
|
|
|
|
|
|
|
reg_coef_tbl<-tibble(
|
|
|
|
name = c("Intercept",Hmisc::label(X)),
|
|
|
|
medianX = round(Bmedian,5),
|
|
|
|
ORmed = round(exp(Bmedian),5),
|
|
|
|
meanX = round(Bmean,5),
|
|
|
|
ORmea = round(exp(Bmean),5))%>%
|
|
|
|
# arrange(desc(abs(medianX)))%>%
|
|
|
|
gt()
|
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
# Step 6: plotting predictive performance
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
reg_cfm<-confusionMatrix(cMatTest)
|
|
|
|
reg_auc_sum<-summary(auc_test[,1])
|
2022-12-02 16:43:31 +01:00
|
|
|
|
|
|
|
## ====================================================================
|
|
|
|
# Step 7: Packing list to save in loop
|
|
|
|
## ====================================================================
|
|
|
|
|
|
|
|
ls[[i]] <- list("RegularisedCoefs"=reg_coef_tbl,
|
|
|
|
"bestA"=best_alph,
|
|
|
|
"bestL"=best_lamb,
|
|
|
|
"ConfusionMatrx"=reg_cfm,
|
|
|
|
"AUROC"=reg_auc_sum)
|