talos-pa-depression/Archive/dep_imputation.Rmd

300 lines
7.4 KiB
Plaintext

---
title: "Sensitivity analysis on imputed dataset"
author: "Andreas Gammelgaard Damsbo"
date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
```
# Import
```{r}
dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown")) ## Extending definition of NA's for imputation
```
# Defining patients to include for analysis
Only including cases with complete pase_0 and MDI at 1 & 6 months
```{r}
dta<-dta_all[!is.na(dta_all$pase_0),]
# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
```
## Formatting
```{r echo=FALSE}
dta$diabetes<-factor(dta$diabetes)
dta$pad<-factor(dta$pad)
dta$civil<-factor(dta$civil)
dta$hypertension<-factor(dta$hypertension)
dta$afli<-factor(dta$afli)
dta$smoke_ever<-factor(dta$smoke_ever)
dta$ami<-factor(dta$ami)
dta$tci<-factor(dta$tci)
dta$thrombolysis<-factor(dta$thrombolysis)
dta$thrombechtomy<-factor(dta$thrombechtomy)
dta$rep_any<-factor(dta$rep_any)
dta$pad<-factor(dta$pad)
dta$nihss_0<-as.numeric(dta$nihss_0)
dta$age<-as.numeric(dta$age)
dta$rtreat<-factor(dta$rtreat)
dta$sex<-factor(dta$sex)
dta$pase_0<-as.numeric(dta$pase_0)
dta$pase_6<-as.numeric(dta$pase_6)
dta$bmi<-as.numeric(dta$bmi)
dta$mdi_6<-as.numeric(dta$mdi_6)
dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))
```
```{r}
# Backup
dta_b<-dta
```
# Libraries
```{r}
library(daDoctoR)
library(mice)
```
## Variables to include in imputation
```{r}
# Possible variables to include
coval<-c("pase_0_bin","rtreat","age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
```
# Imputation
```{r}
# Output variables added to include in model. Excluded from predicting.
outc<-c("mdi_1_enr","mdi_6_newobs_enr")
# Adding all
covar<-c(coval,outc)
# Selecting dataset
r<-dta[,c("rnumb",covar)]
```
```{r}
# Iterations
mxt=20
# Imputations
mis=5
```
https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
```{r}
md.pattern(r) # Missing pattern
# library(VIM)
# aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
```
```{r}
init <- mice(r, maxit=0) # Creating initial imputation list to assess methods
meth <- init$method
meth
predM <- init$predictorMatrix
predM[, c("rnumb")] <- 0 # Defining variables not to be used for predicting imputed values
# meth[outc]=""
# Defining variables not to be imputed.
# Commented out as all included variables will be imputed
```
```{r echo=FALSE}
imputed <- mice(r, method=meth, predictorMatrix=predM, m=mis, maxit = mxt,seed = 103, printFlag=FALSE)
```
```{r}
# summary(imputed)
```
```{r}
library(dplyr)
export<-dta %>%
select(-all_of(coval)) %>% # Leaving out imputed variables from original dataset
left_join(mice::complete(imputed,1),.,by="rnumb") # Join with the first imputed dataset for a full dataset export
md.pattern(export[coval]) # Ensuring complete data
write.csv(export,"/Volumes/Data/depression/imputed.csv",row.names = FALSE) # Export
```
# Regression analyses
```{r}
print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",coval),data=dta,cut.p = .1)[[2]])
```
## Bivariabel
Function to format collected data from pool function
```{r}
pool_table<-function(clls){
## Variables needed: estimate, p.value, term
coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
coll$hi<-round(coll$estimate+coll$std.error*1.96,2)
pa<-coll$p.value
pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))
cl<-data.frame(id=coll$term,diff=paste0(round(coll$estimate,2)," (",coll$lo," to ",coll$hi,")"),p=pa,stringsAsFactors=FALSE)
return(cl)
}
keeps<-c("term","estimate","std.error","p.value")
```
### Repeated bivariabel analysis
Not necessary for this, but an interesting addition
```{r}
coll<-c()
for (i in c("rtreat",adjs_10)){
## Bivariable linear regression analysis of all
coeffs<-summary(pool(
with(imputed,lm(as.formula(paste0("mdi_1_enr~",i))))
))[-1,c("term","estimate","std.error","p.value")]
coll<-rbind(coll,coeffs)
## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}
## Collecting
coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
coll$hi<-round(coll$estimate+coll$std.error*1.96,2)
pa<-coll$p.value
pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))
coll_bi<-data.frame(diff=paste0(round(exp(coll$estimate),2)," (",coll$lo," to ",coll$hi,")"),p=pa,id=coll$term,stringsAsFactors=FALSE)
```
## Unadjusted analyses
```{r}
adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
adj_m<-c("rtreat","pase_0_bin")
coll<-c()
nms<-c()
for (l in outc){
for (i in adj_m){
coeffs<-summary(pool(
with(imputed,lm(as.formula(paste0(l,"~",i))))
))[-1,keeps]
coll<-rbind(coll,coeffs)
nms<-c(nms,paste(l,i,sep = "_"))
d.long <- mice::complete(imputed,"long",include = T)
# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
for (j in levels(d.long[[i]])){
k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2
s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
coeffs<-summary(pool(
with(s.imp,lm(as.formula(paste0(l,"~",adj_m[k]))))
))[-1,keeps]
coll<-rbind(coll,coeffs)
nms<-c(nms,paste(l,j,sep = "_"))
}
## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}
}
coll$term<-nms
```
### Collecting
```{r}
biv_coll<-pool_table(coll)
```
## Adjusted analyses
```{r}
adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
adj_m<-c("rtreat","pase_0_bin")
coll<-c()
nms<-c()
for (l in outc){
# l="mdi_1_enr"
for (i in adj_m){
coeffs<-summary(pool(
with(imputed,lm(as.formula(paste0(l,"~",paste(i,paste(adjs_10m,collapse="+"),sep="+")))))
))[2,keeps]
coll<-rbind(coll,coeffs)
nms<-c(nms,paste(l,i,sep = "_"))
d.long <- mice::complete(imputed,"long",include = T)
# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
for (j in levels(d.long[[i]])){
k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2
s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
coeffs<-summary(pool(
with(s.imp,lm(as.formula(paste0(l,"~",paste(adj_m[k],paste(adjs_10m,collapse="+"),sep="+")))))
))[2,keeps]
coll<-rbind(coll,coeffs)
nms<-c(nms,paste(l,j,sep = "_"))
}
## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}
}
coll$term<-nms
```
### Collecting
```{r}
mul_coll<-pool_table(coll)
colnames(mul_coll)[-1]<-paste0("adj_",colnames(mul_coll)[-1])
```
```{r}
library(lubridate)
write.csv(full_join(biv_coll,mul_coll),paste0("/Volumes/Data/depression/imp_regression_",today(),".csv"))
```