## ItMLiHSmar2022
## assign_full.R, child script
## Full model building and analysis for the assignment
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk

library(caret)  # createFolds(), confusionMatrix()
library(pROC)   # auc()
library(tibble) # tibble()

## ====================================================================
## Step 0: data import and wrangling
## ====================================================================

# source("data_format.R") ## Defines X, y and var.labels

## ====================================================================
## Step 1: settings
## ====================================================================

K <- 10
n <- nrow(X)

set.seed(321)
# Using caret to ensure both outcome levels are represented in all folds
folds <- createFolds(y = y, k = K, list = FALSE, returnTrain = TRUE)

B <- list()
auc_train <- auc_test <- c()
catinfo <- levels(y)

# Empty 2x2 tables with the outcome levels as dimnames, used to
# accumulate confusion matrices across folds
cMatTrain <- cMatTest <- table(factor(character(0), levels = catinfo),
                               factor(character(0), levels = catinfo))

## ====================================================================
## Step 2: cross-validation
## ====================================================================

set.seed(321)

## Iterate over partitions
for (idx1 in 1:K){
  # Status
  cat('Processing fold', idx1, 'of', K, '\n')

  # Get training and test sets
  I_train <- folds != idx1 ## Logical selection vector
  I_test <- !I_train
  Xtrain <- X[I_train, ]
  ytrain <- y[I_train]
  Xtest <- X[I_test, ]
  ytest <- y[I_test]

  # Z-score standardisation
  source("standardise.R")
  list2env(standardise(Xtrain, Xtest, type = "cs"), .GlobalEnv)
  ## Outputs XtrainSt and XtestSt, standardised by centering and scaling

  ## Model matrices for glmnet
  # Xmat.train <- model.matrix(~ . - 1, XtrainSt)
  # Xmat.test <- model.matrix(~ . - 1, XtestSt)

  # Observation weights: down-weight the majority class
  ytrain_weight <- as.vector(1 - (table(ytrain)[ytrain] / length(ytrain)))

  # Fit weighted logistic regression model; quasibinomial avoids the
  # non-integer "number of successes" warning that binomial would give
  # with these weights
  mod <- glm(ytrain ~ .,
             data = XtrainSt,
             weights = ytrain_weight,
             family = stats::quasibinomial(link = "logit"))

  # Keep the fitted model for the coefficient table in Step 3
  B[[idx1]] <- mod

  # Predicted probabilities
  yhatTrainProb <- predict(mod, newdata = XtrainSt, type = "response")
  yhatTestProb <- predict(mod, newdata = XtestSt, type = "response")

  # Threshold at 0.5 for class predictions
  yhatTrain <- round(yhatTrainProb)
  yhatTest <- round(yhatTestProb)

  # Make predictions categorical again (instead of 0/1 coding)
  yhatTrainCat <- factor(yhatTrain, levels = c("0", "1"),
                         labels = catinfo, ordered = TRUE)
  yhatTestCat <- factor(yhatTest, levels = c("0", "1"),
                        labels = catinfo, ordered = TRUE)

  # Accumulate confusion matrices
  cMatTrain <- cMatTrain + table(ytrain, yhatTrainCat)
  cMatTest <- cMatTest + table(ytest, yhatTestCat)

  # AUROC, computed from the predicted probabilities rather than the
  # thresholded class labels
  suppressMessages(auc_train[idx1] <- auc(ytrain, yhatTrainProb))
  suppressMessages(auc_test[idx1] <- auc(ytest, yhatTestProb))
}

## ====================================================================
## Step 3: creating table of coefficients for inference
## ====================================================================

# Profile-likelihood confidence intervals on the odds-ratio scale
confs <- lapply(1:K, function(x){
  cs <- exp(confint(B[[x]]))
  list(lo = cs[, 1], hi = cs[, 2])
})

# Coefficients averaged across the K folds
coefs <- rowMeans(Reduce('cbind', lapply(B, "[[", "coefficients")))

# Quick check of the coefficient names (result not used further):
# unlist(strsplit(names(B[[1]]$coefficients), 2))

# var.labels comes from data_format.R; add a label for the intercept
var.labels <- c(var.labels, '(Intercept)' = "Intercept")

# strsplit(..., 2) presumably strips the trailing "2" that glm() appends
# to coefficient names of binary factors coded with levels 1/2
ds <- tibble(name = var.labels[match(unlist(strsplit(names(coefs), 2)),
                                     names(var.labels))],
             coefs = coefs,
             OR = round(exp(coefs), 3),
             CIs = paste0("(",
                          round(rowMeans(Reduce('cbind', lapply(confs, "[[", "lo"))), 3),
                          ",",
                          round(rowMeans(Reduce('cbind', lapply(confs, "[[", "hi"))), 3),
                          ")"))

# full_coef_tbl <- ds %>% gt(rowname_col = list(age ~ "Age"))
# full_coef_tbl

## ====================================================================
## Step 4: plotting classification performance
## ====================================================================
full_cfm <- confusionMatrix(cMatTest)
full_auc_sum <- summary(auc_test)
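
# The step above stops at the pooled statistics. A minimal plotting
# sketch follows; the original does not specify a plot type, so a
# base-graphics fourfoldplot() of the pooled test confusion matrix is
# assumed to be one reasonable choice, not the author's actual figure.
fourfoldplot(full_cfm$table, main = "Pooled test-set confusion matrix")
print(full_cfm)     # accuracy, sensitivity, specificity, etc.
print(full_auc_sum) # spread of test AUROC across the K folds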
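
## ====================================================================
## Appendix: assumed shape of the standardise() helper
## ====================================================================

# standardise.R is sourced in Step 2 but is not part of this file. The
# sketch below is an assumption based on the comments there: for
# type = "cs" it centres and scales the (numeric) predictors using the
# training-set means and standard deviations, applies the same training
# parameters to the test set, and returns a list that list2env()
# unpacks as XtrainSt and XtestSt.
standardise <- function(Xtrain, Xtest, type = "cs") {
  mu <- colMeans(Xtrain)        # training-set means
  sdev <- apply(Xtrain, 2, sd)  # training-set standard deviations
  scale_by <- if (type == "cs") sdev else FALSE # "c" = centre only
  list(XtrainSt = as.data.frame(scale(Xtrain, center = mu, scale = scale_by)),
       XtestSt  = as.data.frame(scale(Xtest,  center = mu, scale = scale_by)))
}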