142 lines
4.0 KiB
R
142 lines
4.0 KiB
R
## ItMLiHSmar2022

## assign_full.R, child script

## Full model building and analysis for the assignment

## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk

## ====================================================================

## Step 0: data import and wrangling

## ====================================================================

# source("data_format.R")
|
||
|
|
||
|
## ====================================================================

## Step 1: settings

## ====================================================================

# Number of cross-validation folds
K <- 10

n <- nrow(X)

set.seed(321)

# Using caret function to ensure both levels represented in all folds.
# NOTE(review): `c` shadows base::c(); kept as-is because downstream code
# indexes the fold vector by this name -- consider renaming to `folds`.
c <- createFolds(y = y, k = K, list = FALSE, returnTrain = TRUE)

# Container for the fitted model of each fold
B <- list()

# Per-fold AUROC results; preallocated to length K (previously c(),
# which grew the vectors inside the loop)
auc_train <- auc_test <- numeric(K)

# Outcome factor levels, reused to keep coding consistent across folds
catinfo <- levels(y)

# All-zero 2x2 confusion matrices with the outcome levels as dimnames;
# fold results are accumulated into these. factor(c(0, 0), ...) yields
# only NA entries (0 is not an outcome level), so every cell starts at 0.
cMatTrain <- cMatTest <- table(factor(c(0, 0), levels = catinfo),
                               factor(c(0, 0), levels = catinfo))
|
||
|
|
||
|
## ====================================================================

## Step 2: cross validation

## ====================================================================

set.seed(321)

# Z-score standardisation helper; sourced once here instead of once per
# fold -- the file defines the standardise() function (loop-invariant).
source("standardise.R")

## Iterate over partitions
for (idx1 in seq_len(K)) {

  # Status
  cat('Processing fold', idx1, 'of', K,'\n')

  # Get training- and test sets (TRUE = row is in the training fold)
  I_train <- c != idx1
  I_test <- !I_train

  Xtrain <- X[I_train, ]
  ytrain <- y[I_train]
  Xtest <- X[I_test, ]
  ytest <- y[I_test]

  # Standardise by centering and scaling ("cs"); emits XtrainSt and
  # XtestSt into the global environment.
  # NOTE(review): presumably the test set is scaled with training-set
  # parameters -- verify in standardise.R.
  list2env(standardise(Xtrain, Xtest, type = "cs"), .GlobalEnv)

  ## Model matrices for glmnet
  # Xmat.train<-model.matrix(~.-1,XtrainSt)
  # Xmat.test<-model.matrix(~.-1,XtestSt)

  # Inverse-frequency observation weights to counter class imbalance
  ytrain_weight <- as.vector(1 - (table(ytrain)[ytrain] / length(ytrain)))

  # Fit weighted logistic regression; quasibinomial avoids the
  # "non-integer #successes" warnings that binomial() raises with
  # non-integer weights
  mod <- glm(ytrain ~ .,
             data = XtrainSt,
             weights = ytrain_weight,
             family = stats::quasibinomial(link = "logit"))

  # Keep fitted model for later coefficient inference and plots
  B[[idx1]] <- mod

  # Predicted probabilities on training and test folds
  yhatTrainProb <- predict(mod,
                           newdata = XtrainSt,
                           type = "response")

  yhatTestProb <- predict(mod,
                          newdata = XtestSt,
                          type = "response")

  # Hard 0/1 predictions at the 0.5 threshold
  yhatTrain <- round(yhatTrainProb)
  yhatTest <- round(yhatTestProb)

  # Make predictions categorical again (instead of 0/1 coding)
  yhatTrainCat <- factor(yhatTrain, levels = c("0", "1"),
                         labels = catinfo, ordered = TRUE)
  yhatTestCat <- factor(yhatTest, levels = c("0", "1"),
                        labels = catinfo, ordered = TRUE)

  # Accumulate confusion matrices across folds
  cMatTrain <- cMatTrain + table(ytrain, yhatTrainCat)
  cMatTest <- cMatTest + table(ytest, yhatTestCat)

  # AUROC computed on the predicted probabilities, not the thresholded
  # classes: AUC of hard labels collapses the ROC curve to a single
  # operating point and understates discrimination.
  suppressMessages(
    auc_train[idx1] <- auc(ytrain, yhatTrainProb))
  suppressMessages(
    auc_test[idx1] <- auc(ytest, yhatTestProb))

}
|
||
|
|
||
|
## ====================================================================

# Step 3: creating table of coefficients for inference

## ====================================================================

# Per-fold 95% confidence limits on the odds-ratio scale: exp() of the
# profile-likelihood intervals from confint() on each fold's glm fit.
# seq_len(K) replaces 1:K (safe if K were ever 0).
confs <- lapply(seq_len(K), function(fold) {
  ci <- exp(confint(B[[fold]]))
  list(lo = ci[, 1], hi = ci[, 2])
})
|
||
|
|
||
|
# Mean coefficient (log-odds scale) across the K fold models;
# rowMeans() replaces the equivalent but slower apply(..., 1, mean)
coefs <- rowMeans(Reduce("cbind", lapply(B, "[[", "coefficients")))

# NOTE(review): leftover debug expression -- prints coefficient names
# split on the literal character "2" (strips a trailing factor-level
# suffix); the result is unused. Kept only to preserve console output.
unlist(strsplit(names(B[[1]]$coefficients), 2))

# Add a label for the intercept so every coefficient name can be
# matched to a human-readable label below
var.labels <- c(var.labels, '(Intercept)' = "Intercept")
|
||
|
|
||
|
# Coefficient summary table, averaged over the K folds:
# - name: readable label. strsplit(x, 2) splits on the literal
#   character "2", which strips a trailing "2" level suffix from
#   dummy-coded coefficient names so they match names(var.labels).
#   NOTE(review): a "2" elsewhere in a name would split it into two
#   pieces and misalign the match() -- confirm against actual columns.
# - coefs: mean log-odds across folds
# - OR: odds ratio, exp(mean log-odds), rounded to 3 decimals
# - CIs: mean lower/upper 95% limits across folds (confs entries are
#   already on the OR scale)
ds<-tibble(name=var.labels[match(unlist(strsplit(names(coefs),2)), names(var.labels))],
           coefs=coefs,
           OR=round(exp(coefs),3),
           CIs=paste0("(",
                      round(apply(Reduce('cbind', lapply(confs,"[[", "lo")),1,mean),3),
                      ",",
                      round(apply(Reduce('cbind', lapply(confs,"[[", "hi")),1,mean),3),
                      ")")
)
|
||
|
|
||
|
# Render the coefficient table with gt.
# Fix: gt()'s rowname_col expects a single column name, not a list of
# labelling formulas -- list(age ~ "Age") is not a valid value (that
# syntax belongs to cols_label()). Use the label column as the row stub.
full_coef_tbl <- ds %>%
  gt(rowname_col = "name")

full_coef_tbl
|
||
|
|
||
|
## ====================================================================

# Step 4: plotting classification performance

## ====================================================================

# Distribution of the per-fold test AUROCs
full_auc_sum <- summary(auc_test)

# Overall confusion-matrix statistics from the fold-summed test counts
full_cfm <- confusionMatrix(cMatTest)
|
||
|
|