---
title: "Sensitivity analysis on imputed dataset"
author: "Andreas Gammelgaard Damsbo"
date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
```

# Import

```{r}
# "unknown" is read as NA in addition to "NA", extending the definition of
# missing values that the imputation below will fill in.
dta_all <- read.csv(
  "/Volumes/Data/depression/dep_dataset.csv",
  na.strings = c("NA", "unknown")
)
```

# Defining patients to include for analysis

Only including cases with complete pase_0. The additional requirement of
complete MDI at 1 & 6 months is currently disabled (kept as a comment in the
code below).

```{r}
dta <- dta_all[!is.na(dta_all$pase_0), ] # &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
```

## Formatting

```{r echo=FALSE}
# Categorical variables are stored as factors.
factor_vars <- c(
  "diabetes", "pad", "civil", "hypertension", "afli", "smoke_ever", "ami",
  "tci", "thrombolysis", "thrombechtomy", "rep_any", "rtreat", "sex"
)
dta[factor_vars] <- lapply(dta[factor_vars], factor)

# Continuous variables are stored as numeric.
numeric_vars <- c("nihss_0", "age", "pase_0", "pase_6", "bmi", "mdi_6")
dta[numeric_vars] <- lapply(dta[numeric_vars], as.numeric)

# Explicit level order makes "lower" the reference level in the models.
dta$pase_0_bin <- factor(dta$pase_0_bin, levels = c("lower", "higher"))
```

```{r}
# Backup
dta_b <- dta
```

# Libraries

```{r}
library(daDoctoR)
library(mice)
```

## Variables to include in imputation

```{r}
# Possible variables to include
coval <- c(
  "pase_0_bin", "rtreat", "age", "sex", "smoke_ever", "civil", "bmi",
  "diabetes", "hypertension", "afli", "pad", "nihss_0", "rep_any"
)
```

# Imputation

```{r}
# Output variables added to include in model. Excluded from predicting.
outc <- c("mdi_1_enr", "mdi_6_newobs_enr")

# Adding all
covar <- c(coval, outc)

# Selecting dataset
r <- dta[, c("rnumb", covar)]
```

```{r}
# Iterations
mxt <- 20

# Imputations
mis <- 5
```

https://datascienceplus.com/imputing-missing-data-with-r-mice-package/

```{r}
md.pattern(r) # Missing pattern

# library(VIM)
# aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
```

```{r}
# Creating initial imputation list (zero iterations) to assess methods
init <- mice(r, maxit = 0)

meth <- init$method
meth

predM <- init$predictorMatrix

# Defining variables not to be used for predicting imputed values
predM[, c("rnumb")] <- 0

# meth[outc]="" # Defining variables not to be imputed.
# Commented out as all included variables will be imputed
```

```{r echo=FALSE}
imputed <- mice(
  r,
  method = meth,
  predictorMatrix = predM,
  m = mis,
  maxit = mxt,
  seed = 103,
  printFlag = FALSE
)
```

```{r}
# summary(imputed)
```

```{r}
library(dplyr)

# Leaving out imputed covariates from the original dataset, then joining the
# remaining original columns onto the first imputed dataset for a full dataset.
# NOTE(review): the outcome columns in `outc` are present on both sides of the
# join, so they presumably end up duplicated with .x/.y suffixes - confirm
# whether that is intended.
export <- dta %>%
  select(-all_of(coval)) %>%
  left_join(mice::complete(imputed, 1), ., by = "rnumb")

md.pattern(export[coval]) # Ensuring complete data

write.csv(export, "/Volumes/Data/depression/imputed.csv", row.names = FALSE) # Export
```

# Regression analyses

```{r}
# NOTE(review): `coval` already contains "pase_0_bin" and "rtreat", so both are
# passed twice in `string` - confirm that rep_lm() tolerates the duplicates.
print(
  adjs_10 <- rep_lm(
    meas = "mdi_6",
    string = c("pase_0_bin", "rtreat", coval),
    data = dta,
    cut.p = .1
  )[[2]]
)
```

## Bivariable

Functions to format collected data from pool function

```{r}
#' Format p-values with significance markers
#'
#' Values below 0.001 are shown as "<0.001", all others rounded to three
#' decimals. The label is prefixed with "*" when the rounded value is <= 0.05
#' (or the value is < 0.001) and with "." when the rounded value is in
#' (0.05, 0.1]. All comparisons are made on numbers, never on the formatted
#' strings.
#'
#' @param p Numeric vector of p-values.
#' @return Character vector the same length as `p`; `NA` stays `NA`.
format_p <- function(p) {
  p_round <- round(p, 3)
  label <- ifelse(p < 0.001, "<0.001", as.character(p_round))
  marker <- ifelse(
    p < 0.001 | p_round <= 0.05,
    "*",
    ifelse(p_round <= 0.1, ".", "")
  )
  out <- paste0(marker, label)
  out[is.na(p)] <- NA_character_
  out
}

#' Format pooled regression terms as a printable table
#'
#' @param clls Data frame with the columns `term`, `estimate`, `std.error`
#'   and `p.value`.
#' @return Data frame with `id` (the term), `diff` ("estimate (lo to hi)",
#'   where lo/hi are estimate -/+ 1.96 * std.error, rounded to two decimals)
#'   and `p` (see `format_p()`).
pool_table <- function(clls) {
  stopifnot(
    all(c("term", "estimate", "std.error", "p.value") %in% names(clls))
  )
  lo <- round(clls$estimate - clls$std.error * 1.96, 2)
  hi <- round(clls$estimate + clls$std.error * 1.96, 2)
  data.frame(
    id = clls$term,
    diff = paste0(round(clls$estimate, 2), " (", lo, " to ", hi, ")"),
    p = format_p(clls$p.value),
    stringsAsFactors = FALSE
  )
}

#' Fit a linear model on every imputed dataset and pool the results
#'
#' @param formula_text Model formula as a single string, e.g. "y~x".
#' @param mids A `mids` object.
#' @return The data frame returned by `summary(pool(...))`.
pool_lm <- function(formula_text, mids) {
  summary(pool(with(mids, lm(as.formula(formula_text)))))
}

#' Build a model formula string
#'
#' @param outcome Name of the outcome variable.
#' @param main Name of the variable of interest; always placed first so its
#'   coefficient directly follows the intercept.
#' @param adjust Optional character vector of adjustment variables. May be
#'   empty, in which case the formula contains `main` only.
#' @return A string of the form "outcome~main+adj1+adj2".
model_formula <- function(outcome, main, adjust = character(0)) {
  paste0(outcome, "~", paste(c(main, adjust), collapse = "+"))
}

keeps <- c("term", "estimate", "std.error", "p.value")
```

### Repeated bivariable analysis

Not necessary for this, but an interesting addition

```{r}
## Bivariable linear regression analysis of all
## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
coll <- do.call(
  rbind,
  lapply(c("rtreat", adjs_10), function(v) {
    pool_lm(model_formula("mdi_1_enr", v), imputed)[-1, keeps]
  })
)

## Collecting
# The estimates come from linear models, so they are reported on the same
# (untransformed) scale as their confidence limits.
coll_bi <- pool_table(coll)[, c("diff", "p", "id")]
```

## Unadjusted analyses

```{r}
adjs_10m <- adjs_10[adjs_10 != "pase_0_bin"]
adj_m <- c("rtreat", "pase_0_bin")

# Picking "the other" exposure below is only defined for exactly two exposures.
stopifnot(length(adj_m) == 2)

# Long format incl. the original data; needed to split the imputed data.
# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
d.long <- mice::complete(imputed, "long", include = TRUE)

rows <- list()
nms <- c()

for (l in outc) {
  for (i in adj_m) {
    # Overall effect of exposure i on outcome l
    coeffs <- pool_lm(model_formula(l, i), imputed)[-1, keeps]
    rows[[length(rows) + 1]] <- coeffs
    nms <- c(nms, paste(l, i, sep = "_"))

    # Effect of the other exposure within each level of i
    other <- setdiff(adj_m, i)
    for (j in levels(d.long[[i]])) {
      # Subsetting long and convert to "mids" format for pooling
      s.imp <- mice::as.mids(d.long[which(d.long[[i]] == j), ])
      coeffs <- pool_lm(model_formula(l, other), s.imp)[-1, keeps]
      rows[[length(rows) + 1]] <- coeffs
      nms <- c(nms, paste(l, j, sep = "_"))
    }
    ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
    ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
  }
}

coll <- do.call(rbind, rows)
coll$term <- nms
```

### Collecting

```{r}
biv_coll <- pool_table(coll)
```

## Adjusted analyses

```{r}
adjs_10m <- adjs_10[adjs_10 != "pase_0_bin"]
adj_m <- c("rtreat", "pase_0_bin")

# Picking "the other" exposure below is only defined for exactly two exposures.
stopifnot(length(adj_m) == 2)

# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
d.long <- mice::complete(imputed, "long", include = TRUE)

rows <- list()
nms <- c()

for (l in outc) {
  # l="mdi_1_enr"
  for (i in adj_m) {
    # Row 2 is the first term after the intercept, i.e. the exposure, because
    # model_formula() always places it first.
    coeffs <- pool_lm(model_formula(l, i, adjs_10m), imputed)[2, keeps]
    rows[[length(rows) + 1]] <- coeffs
    nms <- c(nms, paste(l, i, sep = "_"))

    # Adjusted effect of the other exposure within each level of i
    other <- setdiff(adj_m, i)
    for (j in levels(d.long[[i]])) {
      # Subsetting long and convert to "mids" format for pooling
      s.imp <- mice::as.mids(d.long[which(d.long[[i]] == j), ])
      coeffs <- pool_lm(model_formula(l, other, adjs_10m), s.imp)[2, keeps]
      rows[[length(rows) + 1]] <- coeffs
      nms <- c(nms, paste(l, j, sep = "_"))
    }
    ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
    ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
  }
}

coll <- do.call(rbind, rows)
coll$term <- nms
```

### Collecting

```{r}
mul_coll <- pool_table(coll)
colnames(mul_coll)[-1] <- paste0("adj_", colnames(mul_coll)[-1])
```

```{r}
library(lubridate)

# "id" is the only column the two tables share, so it is the join key.
write.csv(
  full_join(biv_coll, mul_coll, by = "id"),
  paste0("/Volumes/Data/depression/imp_regression_", today(), ".csv")
)
```