## ItMLiHSmar2022
## assign_full.R, child script
## Full model building and analysis for the assignment
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
## ====================================================================
## Step 0: data import and wrangling
## ====================================================================
# source("data_format.R")
## ====================================================================
## Step 1: settings
## ====================================================================
K<-10
n<-nrow(X)
set.seed(321)
# Using caret's createFolds() to stratify, so both outcome levels appear in every fold;
# with list = FALSE it returns a fold-membership vector (returnTrain only applies to lists).
# Named "folds" to avoid masking base::c().
folds<-createFolds(y=y, k = K, list = FALSE)
B<-list()
auc_train<-auc_test<-c()
catinfo<-levels(y)
# Initialise zero-filled 2x2 confusion matrices carrying the outcome levels as dimnames
cMatTrain<-cMatTest<-table(factor(c(0,0),levels=catinfo),factor(c(0,0),levels=catinfo))
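# Optional sanity check: every column below should contain both outcome levels
# table(y, folds)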
## ====================================================================
## Step 2: cross validation
## ====================================================================
set.seed(321)
## Iterate over partitions
for (idx1 in 1:K){
# Status
cat('Processing fold', idx1, 'of', K,'\n')
# Get training and test sets for this fold
I_train = folds!=idx1 ## logical selection vector: TRUE marks training rows
I_test = !I_train
Xtrain = X[I_train,]
ytrain = y[I_train]
Xtest = X[I_test,]
ytest = y[I_test]
# Z-score standardisation
source("standardise.R")
list2env(standardise(Xtrain,Xtest,type="cs"),.GlobalEnv)
## Outputs XtrainSt and XtestSt
## Standardised by centering and scaling
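## standardise() comes from standardise.R (not shown here). A hypothetical
## sketch of the assumed interface -- centre/scale numeric columns using the
## *training* statistics only, so no information leaks from the test fold:
# standardise <- function(Xtrain, Xtest, type = "cs") {
#   num <- vapply(Xtrain, is.numeric, logical(1))
#   mu <- vapply(Xtrain[num], mean, numeric(1))
#   s <- vapply(Xtrain[num], sd, numeric(1))
#   Xtrain[num] <- Map(function(x, m, d) (x - m)/d, Xtrain[num], mu, s)
#   Xtest[num] <- Map(function(x, m, d) (x - m)/d, Xtest[num], mu, s)
#   list(XtrainSt = Xtrain, XtestSt = Xtest)
# }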
## Model matrices (only needed if switching to glmnet; unused with glm below)
# Xmat.train<-model.matrix(~.-1,XtrainSt)
# Xmat.test<-model.matrix(~.-1,XtestSt)
# Inverse-frequency class weights to counter outcome imbalance
ytrain_weight<-as.vector(1 - (table(ytrain)[ytrain] / length(ytrain)))
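# e.g. if one level holds 80 of 100 training rows, its observations get weight
# 0.2 and the minority level's get 0.8, so the rarer class counts more in the fit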
# Fit weighted logistic regression model
mod<-glm(ytrain~.,
data=XtrainSt,
weights = ytrain_weight,
family = stats::quasibinomial(link = "logit"))
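# Note: quasibinomial with non-integer weights avoids the "non-integer
# #successes" warning that family = binomial would emit; the coefficient
# estimates are identical, but no AIC/likelihood is reported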
# Keep the fitted model; coefficients and CIs are pooled in Step 3
B[[idx1]] <- mod
# Predict
yhatTrainProb<-predict(mod,
newdata = XtrainSt,
type = "response"
)
yhatTestProb<-predict(mod,
newdata = XtestSt,
type = "response"
)
# Threshold predicted probabilities at 0.5
yhatTrain = round(yhatTrainProb)
yhatTest = round(yhatTestProb)
# Make predictions categorical again (instead of 0/1 coding)
yhatTrainCat = factor(yhatTrain,levels=c("0","1"),labels=catinfo,ordered = TRUE)
yhatTestCat = factor(yhatTest,levels=c("0","1"),labels=catinfo,ordered = TRUE)
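# An explicit cutoff is equivalent here (round() is round-half-to-even at
# exactly 0.5) and makes the threshold easy to change, e.g.:
# yhatTestCat = factor(as.integer(yhatTestProb > 0.5),levels=c("0","1"),labels=catinfo,ordered = TRUE)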
# Accumulate confusion matrices across folds
cMatTrain = cMatTrain + table(ytrain,yhatTrainCat)
cMatTest = cMatTest + table(ytest,yhatTestCat)
# AUROC computed on the predicted probabilities rather than the thresholded
# classes, so the full ranking information is used
suppressMessages(
auc_train[idx1]<-auc(ytrain, yhatTrainProb))
suppressMessages(
auc_test[idx1]<-auc(ytest, yhatTestProb))
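# pROC guesses levels/direction each fold (hence suppressMessages); both can
# be pinned explicitly if preferred, e.g. (assuming the first level is the control):
# auc_test[idx1]<-auc(roc(ytest, yhatTestProb, levels=catinfo, direction="<"))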
}
## ====================================================================
## Step 3: creating table of coefficients for inference
## ====================================================================
# Profile-likelihood 95% CIs for each fold model, on the OR scale
confs<-lapply(1:K, function(x){
cs<-exp(confint(B[[x]]))
lo<-cs[,1]
hi<-cs[,2]
return(list(lo=lo,hi=hi))
})
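# Note: confint() on a glm profiles the likelihood (via MASS), and averaging
# the fold-wise bounds when building ds below is a simple heuristic rather
# than a formally pooled confidence interval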
# Average each coefficient across the K fold models
coefs<-apply(Reduce('cbind', lapply(B,"[[", "coefficients")),1,mean)
var.labels<-c(var.labels,'(Intercept)'="Intercept") ## var.labels presumably comes from data_format.R
## sub() strips the trailing "2" factor-level suffix so names match var.labels
ds<-tibble(name=var.labels[match(sub("2$","",names(coefs)), names(var.labels))],
coefs=coefs,
OR=round(exp(coefs),3),
CIs=paste0("(",
round(apply(Reduce('cbind', lapply(confs,"[[", "lo")),1,mean),3),
",",
round(apply(Reduce('cbind', lapply(confs,"[[", "hi")),1,mean),3),
")")
)
#
full_coef_tbl<-ds%>%
gt(rowname_col = "name") ## rowname_col takes a column name; "name" holds the labels
#
full_coef_tbl
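# Optional cosmetic labels via gt's cols_label(), e.g.:
# full_coef_tbl<-full_coef_tbl%>%
#   cols_label(coefs="log(OR)", OR="OR", CIs="95% CI")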
## ====================================================================
## Step 4: plotting classification performance
## ====================================================================
full_cfm<-confusionMatrix(cMatTest) ## caret summary of the pooled test-fold confusion matrix
full_auc_sum<-summary(auc_test) ## distribution of the per-fold test AUROC
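# Inspect the pooled results interactively:
# full_cfm     ## accuracy, sensitivity, specificity etc. from caret
# full_auc_sum ## min/median/mean/max of the per-fold test AUROC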