talos-pa-depression/Archive/dep_imputation.Rmd

---
title: "Sensitivity analysis on imputed dataset"
author: "Andreas Gammelgaard Damsbo"
date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
```


# Import
```{r}
dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown")) ## Extending definition of NA's for imputation
```

# Defining patients to include for analysis
Only including cases with complete pase_0 and MDI at 1 & 6 months
```{r}
dta<-dta_all[!is.na(dta_all$pase_0),]
# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
```

## Formatting
```{r echo=FALSE}
dta$diabetes<-factor(dta$diabetes)
dta$pad<-factor(dta$pad)

dta$civil<-factor(dta$civil)

dta$hypertension<-factor(dta$hypertension)
dta$afli<-factor(dta$afli)
dta$smoke_ever<-factor(dta$smoke_ever)
dta$ami<-factor(dta$ami)
dta$tci<-factor(dta$tci)
dta$thrombolysis<-factor(dta$thrombolysis)
dta$thrombechtomy<-factor(dta$thrombechtomy)
dta$rep_any<-factor(dta$rep_any)
dta$pad<-factor(dta$pad)
dta$nihss_0<-as.numeric(dta$nihss_0)
dta$age<-as.numeric(dta$age)
dta$rtreat<-factor(dta$rtreat)
dta$sex<-factor(dta$sex)
dta$pase_0<-as.numeric(dta$pase_0)
dta$pase_6<-as.numeric(dta$pase_6)
dta$bmi<-as.numeric(dta$bmi)
dta$mdi_6<-as.numeric(dta$mdi_6)
dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))
```


```{r}
# Backup
dta_b<-dta
```


# Libraries
```{r}
library(daDoctoR)
library(mice)
```

## Variables to include in imputation
```{r}
# Possible variables to include
coval<-c("pase_0_bin","rtreat","age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
```


# Imputation
```{r}
# Output variables added to include in model. Excluded from predicting.
outc<-c("mdi_1_enr","mdi_6_newobs_enr")
# Adding all
covar<-c(coval,outc)
# Selecting dataset
r<-dta[,c("rnumb",covar)]
```

```{r}
# Iterations
mxt=20
# Imputations
mis=5
```

https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
```{r}
md.pattern(r)  # Missing pattern
# library(VIM)
# aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
```

```{r}
init <- mice(r, maxit=0)  # Creating initial imputation list to assess methods
meth <- init$method
meth
predM <- init$predictorMatrix

predM[, c("rnumb")] <- 0  # Defining variables not to be used for predicting imputed values

# meth[outc]=""  
# Defining variables not to be imputed. 
# Commented out as all included variables will be imputed
```

```{r echo=FALSE}
imputed <- mice(r, method=meth, predictorMatrix=predM, m=mis, maxit = mxt,seed = 103, printFlag=FALSE)
```

```{r}
# summary(imputed)
```

```{r}
library(dplyr)
export<-dta %>%
  select(-all_of(coval)) %>%  # Leaving out imputed variables from original dataset
  left_join(mice::complete(imputed,1),.,by="rnumb") # Join with the first imputed dataset for a full dataset export

md.pattern(export[coval]) # Ensuring complete data

write.csv(export,"/Volumes/Data/depression/imputed.csv",row.names = FALSE)  # Export
```


# Regression analyses
```{r}
print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",coval),data=dta,cut.p = .1)[[2]])
```

## Bivariabel

Function to format collected data from pool function
```{r}
pool_table<-function(clls){
  ## Variables needed: estimate, p.value, term
  
  coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
coll$hi<-round(coll$estimate+coll$std.error*1.96,2)

pa<-coll$p.value
pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))

cl<-data.frame(id=coll$term,diff=paste0(round(coll$estimate,2)," (",coll$lo," to ",coll$hi,")"),p=pa,stringsAsFactors=FALSE)
  return(cl)
}

keeps<-c("term","estimate","std.error","p.value")
```

### Repeated bivariabel analysis
Not necessary for this, but an interesting addition

```{r}
coll<-c()
for (i in c("rtreat",adjs_10)){
  ## Bivariable linear regression analysis of all 
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0("mdi_1_enr~",i))))
    ))[-1,c("term","estimate","std.error","p.value")]
  
  coll<-rbind(coll,coeffs)
  
  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}

## Collecting

coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
coll$hi<-round(coll$estimate+coll$std.error*1.96,2)

pa<-coll$p.value
pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))

coll_bi<-data.frame(diff=paste0(round(exp(coll$estimate),2)," (",coll$lo," to ",coll$hi,")"),p=pa,id=coll$term,stringsAsFactors=FALSE)

```


## Unadjusted analyses

```{r}
adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]

adj_m<-c("rtreat","pase_0_bin")

coll<-c()
nms<-c()
for (l in outc){

for (i in adj_m){
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0(l,"~",i))))
    ))[-1,keeps]
  coll<-rbind(coll,coeffs)
  
  nms<-c(nms,paste(l,i,sep = "_"))
  
  d.long <- mice::complete(imputed,"long",include = T)
  
  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
  
  for (j in levels(d.long[[i]])){
    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2

    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
    
    coeffs<-summary(pool(
    with(s.imp,lm(as.formula(paste0(l,"~",adj_m[k]))))
    ))[-1,keeps]
    
    coll<-rbind(coll,coeffs)
    
    nms<-c(nms,paste(l,j,sep = "_"))
  }
  
  
  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}


}
coll$term<-nms

```

### Collecting
```{r}
biv_coll<-pool_table(coll)
```


## Adjusted analyses

```{r}
adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]

adj_m<-c("rtreat","pase_0_bin")

coll<-c()
nms<-c()
for (l in outc){
# l="mdi_1_enr"
for (i in adj_m){
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0(l,"~",paste(i,paste(adjs_10m,collapse="+"),sep="+")))))
    ))[2,keeps]
  coll<-rbind(coll,coeffs)
  
  nms<-c(nms,paste(l,i,sep = "_"))
  
  d.long <- mice::complete(imputed,"long",include = T)
  
  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
  
  for (j in levels(d.long[[i]])){
    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2

    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
    
    coeffs<-summary(pool(
    with(s.imp,lm(as.formula(paste0(l,"~",paste(adj_m[k],paste(adjs_10m,collapse="+"),sep="+")))))
    ))[2,keeps]
    
    coll<-rbind(coll,coeffs)
    
    nms<-c(nms,paste(l,j,sep = "_"))
  }
  
  
  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}


}
coll$term<-nms

```

### Collecting
```{r}
mul_coll<-pool_table(coll)
colnames(mul_coll)[-1]<-paste0("adj_",colnames(mul_coll)[-1])
```

```{r}
library(lubridate)
write.csv(full_join(biv_coll,mul_coll),paste0("/Volumes/Data/depression/imp_regression_",today(),".csv"))
```
first commit of old data 2022-08-01 13:57:03 +02:00			`---`
			`title: "Sensitivity analysis on imputed dataset"`
			`author: "Andreas Gammelgaard Damsbo"`
			date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
			`output: pdf_document`
			`---`

			```{r setup, include=FALSE}
			`knitr::opts_chunk$set(echo = TRUE, message = FALSE)`
			```


			`# Import`
			```{r}
			`dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown")) ## Extending definition of NA's for imputation`
			```

			`# Defining patients to include for analysis`
			`Only including cases with complete pase_0 and MDI at 1 & 6 months`
			```{r}
			`dta<-dta_all[!is.na(dta_all$pase_0),]`
			`# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)`
			```

			`## Formatting`
			```{r echo=FALSE}
			`dta$diabetes<-factor(dta$diabetes)`
			`dta$pad<-factor(dta$pad)`

			`dta$civil<-factor(dta$civil)`

			`dta$hypertension<-factor(dta$hypertension)`
			`dta$afli<-factor(dta$afli)`
			`dta$smoke_ever<-factor(dta$smoke_ever)`
			`dta$ami<-factor(dta$ami)`
			`dta$tci<-factor(dta$tci)`
			`dta$thrombolysis<-factor(dta$thrombolysis)`
			`dta$thrombechtomy<-factor(dta$thrombechtomy)`
			`dta$rep_any<-factor(dta$rep_any)`
			`dta$pad<-factor(dta$pad)`
			`dta$nihss_0<-as.numeric(dta$nihss_0)`
			`dta$age<-as.numeric(dta$age)`
			`dta$rtreat<-factor(dta$rtreat)`
			`dta$sex<-factor(dta$sex)`
			`dta$pase_0<-as.numeric(dta$pase_0)`
			`dta$pase_6<-as.numeric(dta$pase_6)`
			`dta$bmi<-as.numeric(dta$bmi)`
			`dta$mdi_6<-as.numeric(dta$mdi_6)`
			`dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))`
			```


			```{r}
			`# Backup`
			`dta_b<-dta`
			```


			`# Libraries`
			```{r}
			`library(daDoctoR)`
			`library(mice)`
			```

			`## Variables to include in imputation`
			```{r}
			`# Possible variables to include`
			`coval<-c("pase_0_bin","rtreat","age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")`
			```


			`# Imputation`
			```{r}
			`# Output variables added to include in model. Excluded from predicting.`
			`outc<-c("mdi_1_enr","mdi_6_newobs_enr")`
			`# Adding all`
			`covar<-c(coval,outc)`
			`# Selecting dataset`
			`r<-dta[,c("rnumb",covar)]`
			```

			```{r}
			`# Iterations`
			`mxt=20`
			`# Imputations`
			`mis=5`
			```

			`https://datascienceplus.com/imputing-missing-data-with-r-mice-package/`
			```{r}
			`md.pattern(r) # Missing pattern`
			`# library(VIM)`
			`# aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))`
			```

			```{r}
			`init <- mice(r, maxit=0) # Creating initial imputation list to assess methods`
			`meth <- init$method`
			`meth`
			`predM <- init$predictorMatrix`

			`predM[, c("rnumb")] <- 0 # Defining variables not to be used for predicting imputed values`

			`# meth[outc]=""`
			`# Defining variables not to be imputed.`
			`# Commented out as all included variables will be imputed`
			```

			```{r echo=FALSE}
			`imputed <- mice(r, method=meth, predictorMatrix=predM, m=mis, maxit = mxt,seed = 103, printFlag=FALSE)`
			```

			```{r}
			`# summary(imputed)`
			```

			```{r}
			`library(dplyr)`
			`export<-dta %>%`
			`select(-all_of(coval)) %>% # Leaving out imputed variables from original dataset`
			`left_join(mice::complete(imputed,1),.,by="rnumb") # Join with the first imputed dataset for a full dataset export`

			`md.pattern(export[coval]) # Ensuring complete data`

			`write.csv(export,"/Volumes/Data/depression/imputed.csv",row.names = FALSE) # Export`
			```


			`# Regression analyses`
			```{r}
			`print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",coval),data=dta,cut.p = .1)[[2]])`
			```

			`## Bivariabel`

			`Function to format collected data from pool function`
			```{r}
			`pool_table<-function(clls){`
			`## Variables needed: estimate, p.value, term`

			`coll$lo<-round(coll$estimate-coll$std.error*1.96,2)`
			`coll$hi<-round(coll$estimate+coll$std.error*1.96,2)`

			`pa<-coll$p.value`
			`pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))`
			`pa <- ifelse(pa <= 0.05 \| pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))`

			`cl<-data.frame(id=coll$term,diff=paste0(round(coll$estimate,2)," (",coll$lo," to ",coll$hi,")"),p=pa,stringsAsFactors=FALSE)`
			`return(cl)`
			`}`

			`keeps<-c("term","estimate","std.error","p.value")`
			```

			`### Repeated bivariabel analysis`
			`Not necessary for this, but an interesting addition`

			```{r}
			`coll<-c()`
			`for (i in c("rtreat",adjs_10)){`
			`## Bivariable linear regression analysis of all`
			`coeffs<-summary(pool(`
			`with(imputed,lm(as.formula(paste0("mdi_1_enr~",i))))`
			`))[-1,c("term","estimate","std.error","p.value")]`

			`coll<-rbind(coll,coeffs)`

			`## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula`
			`## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3`
			`}`

			`## Collecting`

			`coll$lo<-round(coll$estimate-coll$std.error*1.96,2)`
			`coll$hi<-round(coll$estimate+coll$std.error*1.96,2)`

			`pa<-coll$p.value`
			`pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))`
			`pa <- ifelse(pa <= 0.05 \| pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))`

			`coll_bi<-data.frame(diff=paste0(round(exp(coll$estimate),2)," (",coll$lo," to ",coll$hi,")"),p=pa,id=coll$term,stringsAsFactors=FALSE)`

			```


			`## Unadjusted analyses`

			```{r}
			`adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]`

			`adj_m<-c("rtreat","pase_0_bin")`

			`coll<-c()`
			`nms<-c()`
			`for (l in outc){`

			`for (i in adj_m){`
			`coeffs<-summary(pool(`
			`with(imputed,lm(as.formula(paste0(l,"~",i))))`
			`))[-1,keeps]`
			`coll<-rbind(coll,coeffs)`

			`nms<-c(nms,paste(l,i,sep = "_"))`

			`d.long <- mice::complete(imputed,"long",include = T)`

			`# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis`

			`for (j in levels(d.long[[i]])){`
			`k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2`

			`s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling`

			`coeffs<-summary(pool(`
			`with(s.imp,lm(as.formula(paste0(l,"~",adj_m[k]))))`
			`))[-1,keeps]`

			`coll<-rbind(coll,coeffs)`

			`nms<-c(nms,paste(l,j,sep = "_"))`
			`}`


			`## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula`
			`## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3`
			`}`


			`}`
			`coll$term<-nms`

			```

			`### Collecting`
			```{r}
			`biv_coll<-pool_table(coll)`
			```



			`## Adjusted analyses`

			```{r}
			`adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]`

			`adj_m<-c("rtreat","pase_0_bin")`

			`coll<-c()`
			`nms<-c()`
			`for (l in outc){`
			`# l="mdi_1_enr"`
			`for (i in adj_m){`
			`coeffs<-summary(pool(`
			`with(imputed,lm(as.formula(paste0(l,"~",paste(i,paste(adjs_10m,collapse="+"),sep="+")))))`
			`))[2,keeps]`
			`coll<-rbind(coll,coeffs)`

			`nms<-c(nms,paste(l,i,sep = "_"))`

			`d.long <- mice::complete(imputed,"long",include = T)`

			`# Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis`

			`for (j in levels(d.long[[i]])){`
			`k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2`

			`s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling`

			`coeffs<-summary(pool(`
			`with(s.imp,lm(as.formula(paste0(l,"~",paste(adj_m[k],paste(adjs_10m,collapse="+"),sep="+")))))`
			`))[2,keeps]`

			`coll<-rbind(coll,coeffs)`

			`nms<-c(nms,paste(l,j,sep = "_"))`
			`}`


			`## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula`
			`## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3`
			`}`


			`}`
			`coll$term<-nms`

			```

			`### Collecting`
			```{r}
			`mul_coll<-pool_table(coll)`
			`colnames(mul_coll)[-1]<-paste0("adj_",colnames(mul_coll)[-1])`
			```

			```{r}
			`library(lubridate)`
			`write.csv(full_join(biv_coll,mul_coll),paste0("/Volumes/Data/depression/imp_regression_",today(),".csv"))`
			```