## ItMLiHSmar2022
## regular_fun.R, child script
## Regularisation model building function
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
##
## Now modified to use in publication
##

library(caret)  # createFolds()
library(glmnet) # glmnet()
library(pROC)   # auc()

regular_fun <- function(X, y, K, lambdas, alpha) {

  n <- nrow(X)
  set.seed(321)

  # Using caret function to ensure both levels are represented in all folds.
  # With list = FALSE, createFolds() returns an integer vector assigning each
  # observation to one of the K folds (returnTrain only applies when list = TRUE)
  folds <- createFolds(y = y, k = K, list = FALSE, returnTrain = TRUE)
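
  # For illustration (hypothetical values): with K = 5 and 100 observations,
  # folds might begin c(3, 1, 5, 2, 4, ...); the test set of fold 1 is then
  # exactly the rows where folds == 1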

  B <- yhatTestProbKeep <- list()
  accTrain <- accTest <- err_train <- err_test <- auc_train <- auc_test <-
    matrix(nrow = K, ncol = length(lambdas))

  catinfo <- levels(y)

  # Empty (all-zero) confusion matrices with both outcome levels present,
  # accumulated across folds below
  cMatTrain <- cMatTest <- table(true = factor(c(0, 0), levels = catinfo),
                                 pred = factor(c(0, 0), levels = catinfo))

  ## Iterate over partitions
  for (idx1 in 1:K) {

    # Status
    cat('Processing fold', idx1, 'of', K, '\n')

    # Get training and test sets
    I_train <- folds != idx1 ## Logical selection vector: TRUE marks training rows
    I_test <- !I_train

    Xtrain <- X[I_train, ]
    ytrain <- y[I_train]
    Xtest <- X[I_test, ]
    ytest <- y[I_test]

    ## Model matrices for glmnet
    ## Kept for reference: the complicated approach, so as not to include the
    ## first factor level.
    # Xmat.train <- model.matrix(~ . - 1, data = Xtrain,
    #                            contrasts.arg = lapply(Xtrain[, sapply(Xtrain, is.factor)],
    #                                                   contrasts, contrasts = TRUE))
    # Xmat.test <- model.matrix(~ . - 1, data = Xtest,
    #                           contrasts.arg = lapply(Xtest[, sapply(Xtest, is.factor)],
    #                                                  contrasts, contrasts = TRUE))

    # Xmat.train <- model.matrix(~ . - 1, Xtrain)
    # Xmat.test <- model.matrix(~ . - 1, Xtest)

    # Weights: one minus each observation's class frequency, so the minority
    # class is up-weighted
    ytrain_weight <- as.vector(1 - (table(ytrain)[ytrain] / length(ytrain)))
    # ytest_weight <- as.vector(1 / (table(ytest)[ytest] / length(ytest)))
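
    # Worked toy example (hypothetical values): for ytrain = c("a", "a", "a", "b"),
    # table(ytrain)[ytrain] / length(ytrain) gives 0.75, 0.75, 0.75, 0.25, so the
    # weights are 0.25, 0.25, 0.25, 0.75: the rare class counts three times as much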

    # Fit regularized logistic regression model; data.matrix() mirrors the
    # predict() calls below, so factor columns are coerced the same way
    mod <- glmnet(data.matrix(Xtrain), ytrain,
                  alpha = alpha,           ## alpha = 1 for lasso, 0 for ridge
                  lambda = lambdas,        ## Setting lambdas
                  standardize = TRUE,      ## Scales and centers
                  weights = ytrain_weight,
                  family = "binomial")

    # Keep coefficients for plotting
    B[[idx1]] <- as.matrix(coef(mod))

    # Iterate over regularization strengths to compute training and test
    # performance at each regularization strength
    for (idx2 in 1:length(lambdas)) {

      # Predicted probabilities
      yhatTrainProb <- predict(mod,
                               s = lambdas[idx2],
                               newx = data.matrix(Xtrain),
                               type = "response")

      yhatTestProb <- predict(mod,
                              s = lambdas[idx2],
                              newx = data.matrix(Xtest),
                              type = "response")

      # Threshold the probabilities at 0.5 to get 0/1 predictions
      yhatTrain <- round(yhatTrainProb)
      yhatTest <- round(yhatTestProb)

      # Make predictions categorical again (instead of 0/1 coding); ordered
      # factors are accepted as predictors by pROC::auc()
      yhatTrainCat <- factor(yhatTrain, levels = c("0", "1"),
                             labels = catinfo, ordered = TRUE)
      yhatTestCat <- factor(yhatTest, levels = c("0", "1"),
                            labels = catinfo, ordered = TRUE)

      # Evaluate classifier performance
      # Accuracy and error rate (kept for reference):
      # accTrain[idx1, idx2] <- sum(yhatTrainCat == ytrain) / length(ytrain)
      # accTest[idx1, idx2]  <- sum(yhatTestCat == ytest) / length(ytest)
      # err_train[idx1, idx2] <- 1 - accTrain[idx1, idx2]
      # err_test[idx1, idx2]  <- 1 - accTest[idx1, idx2]

      # AUROC, computed on the thresholded class predictions rather than on
      # the predicted probabilities
      suppressMessages(auc_train[idx1, idx2] <- auc(ytrain, yhatTrainCat))
      suppressMessages(auc_test[idx1, idx2] <- auc(ytest, yhatTestCat))
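
      # Because the thresholded predictor takes only two values, the ROC curve
      # has a single interior point, and this AUC equals
      # (sensitivity + specificity) / 2, i.e. balanced accuracy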

      # Accumulate confusion matrices across folds
      cMatTrain <- cMatTrain + table(true = ytrain, pred = yhatTrainCat)
      cMatTest <- cMatTest + table(true = ytest, pred = yhatTestCat)
    }
  }

  # Collect results
  out <- list(mod = mod, B = B,
              auc_train = auc_train, auc_test = auc_test,
              cMatTrain = cMatTrain, cMatTest = cMatTest)
  return(out)
}
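
## ---------------------------------------------------------------------------
## Minimal usage sketch (illustration only; the data, lambda grid and fold
## count below are hypothetical and not part of the original analysis):
##
## df <- data.frame(age = rnorm(200, 70, 10),
##                  nihss = rpois(200, 4))
## outcome <- factor(sample(c("good", "poor"), 200, replace = TRUE))
## lambda_grid <- 10^seq(1, -4, length.out = 50)
##
## res <- regular_fun(X = df, y = outcome, K = 5,
##                    lambdas = lambda_grid, alpha = 1)  # alpha = 1 -> lasso
## res$auc_test   # K x length(lambda_grid) matrix of test AUROCs
## res$cMatTest   # pooled test confusion matrix
## ---------------------------------------------------------------------------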