300 lines
7.4 KiB
Plaintext
300 lines
7.4 KiB
Plaintext
|
---
|
||
|
title: "Sensitivity analysis on imputed dataset"
|
||
|
author: "Andreas Gammelgaard Damsbo"
|
||
|
date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
|
||
|
output: pdf_document
|
||
|
---
|
||
|
|
||
|
```{r setup, include=FALSE}
|
||
|
# Document-wide chunk defaults: show the code, suppress messages.
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
|
||
|
```
|
||
|
|
||
|
|
||
|
# Import
|
||
|
```{r}
|
||
|
# Read the raw dataset. Both "NA" and "unknown" are parsed as missing values.
dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown")) ## Extending definition of NA's for imputation
|
||
|
```
|
||
|
|
||
|
# Defining patients to include for analysis
|
||
|
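Only including cases with complete pase_0. The additional requirement of complete MDI at 1 & 6 months is commented out in the chunk below, so cases with missing MDI are kept.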
|
||
|
```{r}
|
||
|
# Keep only the rows where pase_0 is not missing.
dta<-dta_all[!is.na(dta_all$pase_0),]
|
||
|
# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
|
||
|
```
|
||
|
|
||
|
## Formatting
|
||
|
```{r echo=FALSE}
|
||
|
# Columns stored as categorical variables (default level order)
factor_vars <- c(
  "diabetes", "pad", "civil", "hypertension", "afli", "smoke_ever", "ami",
  "tci", "thrombolysis", "thrombechtomy", "rep_any", "rtreat", "sex"
)
for (v in factor_vars) {
  dta[[v]] <- factor(dta[[v]])
}

# Columns stored as numeric variables
numeric_vars <- c("nihss_0", "age", "pase_0", "pase_6", "bmi", "mdi_6")
for (v in numeric_vars) {
  dta[[v]] <- as.numeric(dta[[v]])
}

# Explicit level order so that "lower" is the reference level
dta$pase_0_bin <- factor(dta$pase_0_bin, levels = c("lower", "higher"))
|
||
|
```
|
||
|
|
||
|
|
||
|
```{r}
|
||
|
# Backup
|
||
|
# Copy of the formatted data, kept as a backup.
dta_b<-dta
|
||
|
```
|
||
|
|
||
|
|
||
|
# Libraries
|
||
|
```{r}
|
||
|
library(daDoctoR)
|
||
|
library(mice)
|
||
|
```
|
||
|
|
||
|
## Variables to include in imputation
|
||
|
```{r}
|
||
|
# Possible variables to include
|
||
|
# Covariates for the imputation data set; the outcome variables are appended
# to this vector in the imputation section.
coval<-c("pase_0_bin","rtreat","age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
|
||
|
```
|
||
|
|
||
|
|
||
|
# Imputation
|
||
|
```{r}
|
||
|
# Outcome variables that are added to the imputation data set
outc <- c("mdi_1_enr", "mdi_6_newobs_enr")

# Complete variable list: covariates first, then the outcomes
covar <- append(coval, outc)

# Data set handed to mice: the identifier column plus every listed variable
r <- dta[, c("rnumb", covar), drop = FALSE]
|
||
|
```
|
||
|
|
||
|
```{r}
|
||
|
# Number of iterations (used as `maxit` in the mice call)
mxt <- 20

# Number of imputed data sets (used as `m` in the mice call)
mis <- 5
|
||
|
```
|
||
|
|
||
|
https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
|
||
|
```{r}
|
||
|
# Show which combinations of variables are missing in the imputation data set.
md.pattern(r) # Missing pattern
|
||
|
# library(VIM)
|
||
|
# aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
|
||
|
```
|
||
|
|
||
|
```{r}
|
||
|
# Dry run with zero iterations: gives access to the default method vector and
# predictor matrix so they can be inspected and edited before imputing.
init <- mice(r, maxit = 0)

# Default imputation method per variable, printed for inspection
meth <- init[["method"]]
meth

# Predictor matrix; a zeroed column means that variable is not used for
# predicting imputed values. Only the identifier is excluded here.
predM <- init[["predictorMatrix"]]
predM[, "rnumb"] <- 0
|
||
|
|
||
|
# meth[outc]=""
|
||
|
# Defining variables not to be imputed.
|
||
|
# Commented out as all included variables will be imputed
|
||
|
```
|
||
|
|
||
|
```{r echo=FALSE}
|
||
|
# Run the imputation with the edited methods and predictor matrix.
# The fixed seed makes the imputations reproducible.
imputed <- mice(
  r,
  method = meth,
  predictorMatrix = predM,
  m = mis,
  maxit = mxt,
  seed = 103,
  printFlag = FALSE
)
|
||
|
```
|
||
|
|
||
|
```{r}
|
||
|
# summary(imputed)
|
||
|
```
|
||
|
|
||
|
```{r}
|
||
|
library(dplyr)
|
||
|
# First completed (imputed) data set
first_imputation <- mice::complete(imputed, 1)

# Original data without the covariates that are taken from the imputation
original_rest <- select(dta, -all_of(coval))

# Full data set for export: imputed variables plus the remaining original columns.
# NOTE(review): the outcome variables in `outc` are still present in
# `original_rest`, so the join returns them twice with `.x` (imputed) and `.y`
# (original) suffixes - confirm this is intended.
export <- left_join(first_imputation, original_rest, by = "rnumb")

# Ensuring complete data
md.pattern(export[coval])

# Export
write.csv(export, "/Volumes/Data/depression/imputed.csv", row.names = FALSE)
|
||
|
```
|
||
|
|
||
|
|
||
|
# Regression analyses
|
||
|
```{r}
|
||
|
# Run rep_lm on the non-imputed data with "mdi_6" as the measure, store the
# second element of its result in adjs_10 and print it. adjs_10 is used below
# as a vector of variable names when building model formulas.
# NOTE(review): `coval` already contains "pase_0_bin" and "rtreat", so both are
# passed twice in `string` - confirm rep_lm tolerates the duplicates.
print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",coval),data=dta,cut.p = .1)[[2]])
|
||
|
```
|
||
|
|
||
|
## Bivariable
|
||
|
|
||
|
Function to format collected data from pool function
|
||
|
```{r}
|
||
|
#' Format pooled regression coefficients for reporting
#'
#' Builds a display table with the estimate, a normal-approximation 95 %
#' interval (estimate +/- 1.96 * standard error) and a flagged p-value.
#'
#' @param clls Data frame with one row per coefficient and the columns
#'   `term`, `estimate`, `std.error` and `p.value`.
#' @return Data frame with the columns `id` (the term), `diff`
#'   ("estimate (lo to hi)", rounded to 2 decimals) and `p` (p-value rounded to
#'   3 decimals or "<0.001"; prefixed with "*" when <= 0.05 and with "." when
#'   <= 0.1; `NA` when the p-value is missing).
pool_table <- function(clls) {
  # An empty collection gives an empty table instead of a length-mismatch error
  if (NROW(clls) == 0) {
    return(data.frame(
      id = character(0),
      diff = character(0),
      p = character(0),
      stringsAsFactors = FALSE
    ))
  }

  # Work on the argument itself; the global `coll` is no longer read
  est <- clls$estimate
  half_width <- clls$std.error * 1.96
  lo <- round(est - half_width, 2)
  hi <- round(est + half_width, 2)

  # Flags are decided on the rounded numeric value, never on its text form
  p_raw <- clls$p.value
  p_round <- round(p_raw, 3)
  p_text <- ifelse(p_raw < 0.001, "<0.001", as.character(p_round))
  p_flag <- ifelse(p_round <= 0.05, "*", ifelse(p_round <= 0.1, ".", ""))
  p_out <- ifelse(is.na(p_raw), NA_character_, paste0(p_flag, p_text))

  data.frame(
    id = clls$term,
    diff = paste0(round(est, 2), " (", lo, " to ", hi, ")"),
    p = p_out,
    stringsAsFactors = FALSE
  )
}
|
||
|
|
||
|
# Columns kept from each pooled model summary in the analyses below.
keeps<-c("term","estimate","std.error","p.value")
|
||
|
```
|
||
|
|
||
|
### Repeated bivariable analysis
|
||
|
Not necessary for this, but an interesting addition
|
||
|
|
||
|
```{r}
|
||
|
# Bivariable linear regression of mdi_1_enr on each candidate variable,
# fitted in every imputed data set and pooled.
## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
bi_vars <- c("rtreat", adjs_10)
bi_cols <- c("term", "estimate", "std.error", "p.value")
bi_rows <- vector("list", length(bi_vars))

for (idx in seq_along(bi_vars)) {
  i <- bi_vars[idx]
  # as.formula() is called inside with() so the formula is evaluated against
  # each completed data set; the intercept row is dropped.
  bi_rows[[idx]] <- summary(pool(
    with(imputed, lm(as.formula(paste0("mdi_1_enr~", i))))
  ))[-1, bi_cols]
}

coll <- do.call(rbind, bi_rows)

## Collecting
# These are linear models, so the estimate is reported on its own scale
# together with its interval. The previous version exponentiated the point
# estimate but not the interval limits.
coll_bi <- pool_table(coll)[, c("diff", "p", "id")]
|
||
|
|
||
|
```
|
||
|
|
||
|
|
||
|
## Unadjusted analyses
|
||
|
|
||
|
```{r}
|
||
|
adjs_10m <- adjs_10[adjs_10 != "pase_0_bin"]

# The two main variables; each is analysed overall and within the levels of
# the other one.
adj_m <- c("rtreat", "pase_0_bin")
stopifnot(length(adj_m) == 2) # the "counterpart" logic below assumes a pair

# Stacked original + imputed data. It does not depend on the loop variables,
# so it is built once.
# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
d.long <- mice::complete(imputed, "long", include = TRUE)

rows <- list()
nms <- c()

for (l in outc) {
  for (i in adj_m) {
    # Overall model: outcome ~ i, pooled across imputations (intercept dropped)
    ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
    ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
    rows[[length(rows) + 1]] <- summary(pool(
      with(imputed, lm(as.formula(paste0(l, "~", i))))
    ))[-1, keeps]
    nms <- c(nms, paste(l, i, sep = "_"))

    # The other member of the pair, found by exact name rather than by regex
    counterpart <- setdiff(adj_m, i)

    for (j in levels(d.long[[i]])) {
      # Subsetting long and convert to "mids" format for pooling
      s.imp <- mice::as.mids(d.long[which(d.long[[i]] == j), ])

      # Stratum model: outcome ~ counterpart within level j of i
      rows[[length(rows) + 1]] <- summary(pool(
        with(s.imp, lm(as.formula(paste0(l, "~", counterpart))))
      ))[-1, keeps]
      nms <- c(nms, paste(l, j, sep = "_"))
    }
  }
}

coll <- do.call(rbind, rows)
coll$term <- nms
|
||
|
|
||
|
```
|
||
|
|
||
|
### Collecting
|
||
|
```{r}
|
||
|
# Format the pooled unadjusted estimates collected in `coll`.
biv_coll<-pool_table(coll)
|
||
|
```
|
||
|
|
||
|
|
||
|
|
||
|
## Adjusted analyses
|
||
|
|
||
|
```{r}
|
||
|
# Adjustment set: the selected variables without pase_0_bin, which enters the
# models as a main variable instead.
adjs_10m <- adjs_10[adjs_10 != "pase_0_bin"]

# The two main variables; each is analysed overall and within the levels of
# the other one.
adj_m <- c("rtreat", "pase_0_bin")
stopifnot(length(adj_m) == 2) # the "counterpart" logic below assumes a pair

# Stacked original + imputed data. It does not depend on the loop variables,
# so it is built once.
# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
d.long <- mice::complete(imputed, "long", include = TRUE)

rows <- list()
nms <- c()

for (l in outc) {
  for (i in adj_m) {
    # Overall adjusted model. Collapsing one vector keeps the formula valid
    # even when the adjustment set is empty.
    ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
    ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
    fml_all <- paste0(l, "~", paste(c(i, adjs_10m), collapse = "+"))

    # Row 2 is the first coefficient after the intercept, i.e. the main variable
    rows[[length(rows) + 1]] <- summary(pool(
      with(imputed, lm(as.formula(fml_all)))
    ))[2, keeps]
    nms <- c(nms, paste(l, i, sep = "_"))

    # The other member of the pair, found by exact name rather than by regex
    counterpart <- setdiff(adj_m, i)
    fml_sub <- paste0(l, "~", paste(c(counterpart, adjs_10m), collapse = "+"))

    for (j in levels(d.long[[i]])) {
      # Subsetting long and convert to "mids" format for pooling
      s.imp <- mice::as.mids(d.long[which(d.long[[i]] == j), ])

      # Adjusted stratum model within level j of i
      rows[[length(rows) + 1]] <- summary(pool(
        with(s.imp, lm(as.formula(fml_sub)))
      ))[2, keeps]
      nms <- c(nms, paste(l, j, sep = "_"))
    }
  }
}

coll <- do.call(rbind, rows)
coll$term <- nms
|
||
|
|
||
|
```
|
||
|
|
||
|
### Collecting
|
||
|
```{r}
|
||
|
# Format the pooled adjusted estimates collected in `coll`
mul_coll <- pool_table(coll)

# Prefix every column except the first (the identifier) with "adj_"
value_cols <- colnames(mul_coll)[-1]
colnames(mul_coll)[-1] <- paste0("adj_", value_cols)
|
||
|
```
|
||
|
|
||
|
```{r}
|
||
|
library(lubridate)
|
||
|
# Combine the unadjusted and adjusted tables on their shared `id` column and
# write them to a file stamped with today's date. The join key is stated
# explicitly rather than left to the implicit natural join.
write.csv(
  full_join(biv_coll, mul_coll, by = "id"),
  paste0("/Volumes/Data/depression/imp_regression_", today(), ".csv")
)
|
||
|
```
|
||
|
|