142 lines
4.0 KiB
R
142 lines
4.0 KiB
R
## ItMLiHSmar2022

## assign_full.R, child script

## Full model building and analysis for the assignment

## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk

## ====================================================================

## Step 0: data import and wrangling

## ====================================================================

# source("data_format.R")
|
||
|
|
||
|
## ====================================================================

## Step 1: settings

## ====================================================================

# Number of cross-validation folds
K <- 10

n <- nrow(X)

set.seed(321)

# Using caret function to ensure both levels represented in all folds.
# NOTE(review): `c` shadows base::c(); kept as-is because downstream code
# indexes the fold vector by this name -- consider renaming to `folds`.
c <- createFolds(y = y, k = K, list = FALSE, returnTrain = TRUE)

# Container for the fitted model of each fold
B <- list()

# Per-fold AUROC results; preallocated to length K (previously c(),
# which grew the vectors inside the loop)
auc_train <- auc_test <- numeric(K)

# Outcome factor levels, reused to keep coding consistent across folds
catinfo <- levels(y)

# All-zero 2x2 confusion matrices with the outcome levels as dimnames;
# fold results are accumulated into these. factor(c(0, 0), ...) yields
# only NA entries (0 is not an outcome level), so every cell starts at 0.
cMatTrain <- cMatTest <- table(factor(c(0, 0), levels = catinfo),
                               factor(c(0, 0), levels = catinfo))
|
||
|
|
||
|
## ====================================================================

## Step 2: cross validation

## ====================================================================

set.seed(321)

# Z-score standardisation helper; sourced once here instead of once per
# fold -- the file defines the standardise() function (loop-invariant).
source("standardise.R")

## Iterate over partitions
for (idx1 in seq_len(K)) {

  # Status
  cat('Processing fold', idx1, 'of', K,'\n')

  # Get training- and test sets (TRUE = row is in the training fold)
  I_train <- c != idx1
  I_test <- !I_train

  Xtrain <- X[I_train, ]
  ytrain <- y[I_train]
  Xtest <- X[I_test, ]
  ytest <- y[I_test]

  # Standardise by centering and scaling ("cs"); emits XtrainSt and
  # XtestSt into the global environment.
  # NOTE(review): presumably the test set is scaled with training-set
  # parameters -- verify in standardise.R.
  list2env(standardise(Xtrain, Xtest, type = "cs"), .GlobalEnv)

  ## Model matrices for glmnet
  # Xmat.train<-model.matrix(~.-1,XtrainSt)
  # Xmat.test<-model.matrix(~.-1,XtestSt)

  # Inverse-frequency observation weights to counter class imbalance
  ytrain_weight <- as.vector(1 - (table(ytrain)[ytrain] / length(ytrain)))

  # Fit weighted logistic regression; quasibinomial avoids the
  # "non-integer #successes" warnings that binomial() raises with
  # non-integer weights
  mod <- glm(ytrain ~ .,
             data = XtrainSt,
             weights = ytrain_weight,
             family = stats::quasibinomial(link = "logit"))

  # Keep fitted model for later coefficient inference and plots
  B[[idx1]] <- mod

  # Predicted probabilities on training and test folds
  yhatTrainProb <- predict(mod,
                           newdata = XtrainSt,
                           type = "response")

  yhatTestProb <- predict(mod,
                          newdata = XtestSt,
                          type = "response")

  # Hard 0/1 predictions at the 0.5 threshold
  yhatTrain <- round(yhatTrainProb)
  yhatTest <- round(yhatTestProb)

  # Make predictions categorical again (instead of 0/1 coding)
  yhatTrainCat <- factor(yhatTrain, levels = c("0", "1"),
                         labels = catinfo, ordered = TRUE)
  yhatTestCat <- factor(yhatTest, levels = c("0", "1"),
                        labels = catinfo, ordered = TRUE)

  # Accumulate confusion matrices across folds
  cMatTrain <- cMatTrain + table(ytrain, yhatTrainCat)
  cMatTest <- cMatTest + table(ytest, yhatTestCat)

  # AUROC computed on the predicted probabilities, not the thresholded
  # classes: AUC of hard labels collapses the ROC curve to a single
  # operating point and understates discrimination.
  suppressMessages(
    auc_train[idx1] <- auc(ytrain, yhatTrainProb))
  suppressMessages(
    auc_test[idx1] <- auc(ytest, yhatTestProb))

}
|
||
|
|
||
|
## ====================================================================

# Step 3: creating table of coefficients for inference

## ====================================================================

# Per-fold 95% confidence limits on the odds-ratio scale: exp() of the
# profile-likelihood intervals from confint() on each fold's glm fit.
# seq_len(K) replaces 1:K (safe if K were ever 0).
confs <- lapply(seq_len(K), function(fold) {
  ci <- exp(confint(B[[fold]]))
  list(lo = ci[, 1], hi = ci[, 2])
})
|
||
|
|
||
|
# Mean coefficient (log-odds scale) across the K fold models;
# rowMeans() replaces the equivalent but slower apply(..., 1, mean)
coefs <- rowMeans(Reduce("cbind", lapply(B, "[[", "coefficients")))

# NOTE(review): leftover debug expression -- prints coefficient names
# split on the literal character "2" (strips a trailing factor-level
# suffix); the result is unused. Kept only to preserve console output.
unlist(strsplit(names(B[[1]]$coefficients), 2))

# Add a label for the intercept so every coefficient name can be
# matched to a human-readable label below
var.labels <- c(var.labels, '(Intercept)' = "Intercept")
|
||
|
|
||
|
# Coefficient summary table, averaged over the K folds:
# - name: readable label. strsplit(x, 2) splits on the literal
#   character "2", which strips a trailing "2" level suffix from
#   dummy-coded coefficient names so they match names(var.labels).
#   NOTE(review): a "2" elsewhere in a name would split it into two
#   pieces and misalign the match() -- confirm against actual columns.
# - coefs: mean log-odds across folds
# - OR: odds ratio, exp(mean log-odds), rounded to 3 decimals
# - CIs: mean lower/upper 95% limits across folds (confs entries are
#   already on the OR scale)
ds<-tibble(name=var.labels[match(unlist(strsplit(names(coefs),2)), names(var.labels))],
           coefs=coefs,
           OR=round(exp(coefs),3),
           CIs=paste0("(",
                      round(apply(Reduce('cbind', lapply(confs,"[[", "lo")),1,mean),3),
                      ",",
                      round(apply(Reduce('cbind', lapply(confs,"[[", "hi")),1,mean),3),
                      ")")
)
|
||
|
|
||
|
# Render the coefficient table with gt.
# Fix: gt()'s rowname_col expects a single column name, not a list of
# labelling formulas -- list(age ~ "Age") is not a valid value (that
# syntax belongs to cols_label()). Use the label column as the row stub.
full_coef_tbl <- ds %>%
  gt(rowname_col = "name")

full_coef_tbl
|
||
|
|
||
|
## ====================================================================

# Step 4: plotting classification performance

## ====================================================================

# Distribution of the per-fold test AUROCs
full_auc_sum <- summary(auc_test)

# Overall confusion-matrix statistics from the fold-summed test counts
full_cfm <- confusionMatrix(cMatTest)
|
||
|
|