PhysicalActivityandStrokeOu.../1 PA Decline/regularisation_steps.R

## ItMLiHSmar2022
## regularisation_steps.R, child script
## Regularised model building and analysation for assignment
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
## 
## Now modified to use in publication
## 

## ====================================================================
## Step 0: data import and wrangling
## ====================================================================

setwd("/Users/au301842/PhysicalActivityandStrokeOutcome/1 PA Decline/")

# source("data_format.R")
y1<-factor(as.integer(y)-1) ## Outcome is required to be factor of 0 or 1.


## ====================================================================
## Step 1: settings
## ====================================================================

## Folds
K=10
set.seed(3)
c<-caret::createFolds(y=y, 
                      k = K, 
                      list = FALSE, 
                      returnTrain = TRUE) # Foldids for alpha tuning

## Defining tuning parameters
lambdas=2^seq(-10, 5, 1)
alphas<-seq(0,1,.1)

## Weights for models
weighted=TRUE
if (weighted == TRUE) {
  wght<-as.vector(1 - (table(y)[y] / length(y)))
} else {
  wght <- rep(1, nrow(y))
}


## Standardise numeric
## Centered and 


## ====================================================================
## Step 2: all cross validations for each alpha
## ====================================================================

library(furrr)
library(purrr)
library(doMC)
registerDoMC(cores=6)

# Nested CVs with analysis for all lambdas for each alpha
# 
set.seed(3)
cvs <- future_map(alphas, function(a){
  cv.glmnet(model.matrix(~.-1,X),
            y1,
            weights = wght,
            lambda=lambdas, 
            type.measure = "deviance", # This is standard measure and recommended for tuning
            foldid = c, # Per recommendation the folds are kept for alpha optimisation
            alpha=a,
            standardize=TRUE,
            family=quasibinomial,
            keep=TRUE) # Same as binomial, but not as picky
})

## ====================================================================
# Step 3: optimum lambda for each alpha
## ====================================================================


# For each alpha, lambda is chosen for the lowest meassure (deviance)
each_alpha <- sapply(seq_along(alphas), function(id) {
  each_cv <- cvs[[id]]
  alpha_val <- alphas[id]
  index_lmin <- match(each_cv$lambda.min, 
                      each_cv$lambda)
  c(lamb = each_cv$lambda.min, 
    alph = alpha_val,
    cvm = each_cv$cvm[index_lmin])
})

# Best lambda
best_lamb <- min(each_alpha["lamb", ])

# Alpha is chosen for best lambda with lowest model deviance, each_alpha["cvm",]
best_alph <- each_alpha["alph",][each_alpha["cvm",]==min(each_alpha["cvm",]
                                                         [each_alpha["lamb",] %in% best_lamb])]

## https://stackoverflow.com/questions/42007313/plot-an-roc-curve-in-r-with-ggplot2
p_roc<-roc.glmnet(cvs[[1]]$fit.preval, newy = y)[[match(best_alph,alphas)]]|> # Plots performance from model with best alpha
  ggplot(aes(FPR,TPR)) + 
  geom_step() +
  coord_cartesian(xlim=c(0,1), ylim=c(0,1)) +
  geom_abline()+
  theme_bw()

## ====================================================================
# Step 4: Creating the final model
## ====================================================================

source("regular_fun.R") # Custom function
optimised_model<-regular_fun(X,y1,K,lambdas=best_lamb,alpha=best_alph) 
# With lambda and alpha specified, the function is just a k-fold cross-validation wrapper, 
# but keeps model performance figures from each fold.

list2env(optimised_model,.GlobalEnv)
# Function outputs a list, which is unwrapped to Env.
# See source script for reference.

## ====================================================================
# Step 5: creating table of coefficients for inference
## ====================================================================

Bmatrix<-matrix(unlist(B),ncol=10)
Bmedian<-apply(Bmatrix,1,median)
Bmean<-apply(Bmatrix,1,mean)

reg_coef_tbl<-tibble(
  name = c("Intercept",Hmisc::label(X)),
  medianX = round(Bmedian,5),
  ORmed = round(exp(Bmedian),5),
  meanX = round(Bmean,5),
  ORmea = round(exp(Bmean),5))%>%
  # arrange(desc(abs(medianX)))%>%
  gt()

## ====================================================================
# Step 6: plotting predictive performance
## ====================================================================

reg_cfm<-confusionMatrix(cMatTest)
reg_auc_sum<-summary(auc_test[,1])

## ====================================================================
# Step 7: Packing list to save in loop
## ====================================================================

ls[[i]] <- list("RegularisedCoefs"=reg_coef_tbl,
                "bestA"=best_alph,
                "bestL"=best_lamb,
                "ConfusionMatrx"=reg_cfm,
                "AUROC"=reg_auc_sum)
updated analysis and much more 2022-09-28 16:03:58 +02:00			`## ItMLiHSmar2022`
			`## regularisation_steps.R, child script`
			`## Regularised model building and analysation for assignment`
			`## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk`
			`##`
			`## Now modified to use in publication`
			`##`

			`## ====================================================================`
			`## Step 0: data import and wrangling`
			`## ====================================================================`

			`setwd("/Users/au301842/PhysicalActivityandStrokeOutcome/1 PA Decline/")`

			`# source("data_format.R")`
			`y1<-factor(as.integer(y)-1) ## Outcome is required to be factor of 0 or 1.`


			`## ====================================================================`
			`## Step 1: settings`
			`## ====================================================================`

			`## Folds`
			`K=10`
			`set.seed(3)`
			`c<-caret::createFolds(y=y,`
			`k = K,`
			`list = FALSE,`
			`returnTrain = TRUE) # Foldids for alpha tuning`

			`## Defining tuning parameters`
			`lambdas=2^seq(-10, 5, 1)`
			`alphas<-seq(0,1,.1)`

			`## Weights for models`
			`weighted=TRUE`
			`if (weighted == TRUE) {`
			`wght<-as.vector(1 - (table(y)[y] / length(y)))`
			`} else {`
			`wght <- rep(1, nrow(y))`
			`}`


			`## Standardise numeric`
			`## Centered and`



			`## ====================================================================`
			`## Step 2: all cross validations for each alpha`
			`## ====================================================================`

			`library(furrr)`
			`library(purrr)`
			`library(doMC)`
			`registerDoMC(cores=6)`

			`# Nested CVs with analysis for all lambdas for each alpha`
			`#`
			`set.seed(3)`
			`cvs <- future_map(alphas, function(a){`
			`cv.glmnet(model.matrix(~.-1,X),`
			`y1,`
			`weights = wght,`
			`lambda=lambdas,`
			`type.measure = "deviance", # This is standard measure and recommended for tuning`
			`foldid = c, # Per recommendation the folds are kept for alpha optimisation`
			`alpha=a,`
			`standardize=TRUE,`
			`family=quasibinomial,`
			`keep=TRUE) # Same as binomial, but not as picky`
			`})`

			`## ====================================================================`
			`# Step 3: optimum lambda for each alpha`
			`## ====================================================================`


			`# For each alpha, lambda is chosen for the lowest meassure (deviance)`
			`each_alpha <- sapply(seq_along(alphas), function(id) {`
			`each_cv <- cvs[[id]]`
			`alpha_val <- alphas[id]`
			`index_lmin <- match(each_cv$lambda.min,`
			`each_cv$lambda)`
			`c(lamb = each_cv$lambda.min,`
			`alph = alpha_val,`
			`cvm = each_cv$cvm[index_lmin])`
			`})`

			`# Best lambda`
			`best_lamb <- min(each_alpha["lamb", ])`

			`# Alpha is chosen for best lambda with lowest model deviance, each_alpha["cvm",]`
			`best_alph <- each_alpha["alph",][each_alpha["cvm",]==min(each_alpha["cvm",]`
			`[each_alpha["lamb",] %in% best_lamb])]`

			`## https://stackoverflow.com/questions/42007313/plot-an-roc-curve-in-r-with-ggplot2`
			`p_roc<-roc.glmnet(cvs[[1]]$fit.preval, newy = y)[[match(best_alph,alphas)]]\|> # Plots performance from model with best alpha`
			`ggplot(aes(FPR,TPR)) +`
			`geom_step() +`
			`coord_cartesian(xlim=c(0,1), ylim=c(0,1)) +`
			`geom_abline()+`
			`theme_bw()`

			`## ====================================================================`
			`# Step 4: Creating the final model`
			`## ====================================================================`

			`source("regular_fun.R") # Custom function`
			`optimised_model<-regular_fun(X,y1,K,lambdas=best_lamb,alpha=best_alph)`
			`# With lambda and alpha specified, the function is just a k-fold cross-validation wrapper,`
			`# but keeps model performance figures from each fold.`

			`list2env(optimised_model,.GlobalEnv)`
			`# Function outputs a list, which is unwrapped to Env.`
			`# See source script for reference.`

			`## ====================================================================`
			`# Step 5: creating table of coefficients for inference`
			`## ====================================================================`

			`Bmatrix<-matrix(unlist(B),ncol=10)`
			`Bmedian<-apply(Bmatrix,1,median)`
			`Bmean<-apply(Bmatrix,1,mean)`

			`reg_coef_tbl<-tibble(`
			`name = c("Intercept",Hmisc::label(X)),`
			`medianX = round(Bmedian,5),`
			`ORmed = round(exp(Bmedian),5),`
			`meanX = round(Bmean,5),`
			`ORmea = round(exp(Bmean),5))%>%`
			`# arrange(desc(abs(medianX)))%>%`
			`gt()`

			`## ====================================================================`
			`# Step 6: plotting predictive performance`
			`## ====================================================================`

			`reg_cfm<-confusionMatrix(cMatTest)`
			`reg_auc_sum<-summary(auc_test[,1])`
all analyses of project one moved to new generation (old saved as gen_1) 2022-12-02 16:43:31 +01:00
			`## ====================================================================`
			`# Step 7: Packing list to save in loop`
			`## ====================================================================`

			`ls[[i]] <- list("RegularisedCoefs"=reg_coef_tbl,`
			`"bestA"=best_alph,`
			`"bestL"=best_lamb,`
			`"ConfusionMatrx"=reg_cfm,`
			`"AUROC"=reg_auc_sum)`