talos-pa-depression/Archive/dep_imputation.Rmd

---
title: "Sensitivity analysis on imputed dataset"
author: "Andreas Gammelgaard Damsbo"
date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
```


# Import
```{r}
dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown")) ## Extending definition of NA's for imputation
```

# Defining patients to include for analysis
Only including cases with complete pase_0 and MDI at 1 & 6 months
```{r}
dta<-dta_all[!is.na(dta_all$pase_0),]
# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
```

## Formatting
```{r echo=FALSE}
dta$diabetes<-factor(dta$diabetes)
dta$pad<-factor(dta$pad)

dta$civil<-factor(dta$civil)

dta$hypertension<-factor(dta$hypertension)
dta$afli<-factor(dta$afli)
dta$smoke_ever<-factor(dta$smoke_ever)
dta$ami<-factor(dta$ami)
dta$tci<-factor(dta$tci)
dta$thrombolysis<-factor(dta$thrombolysis)
dta$thrombechtomy<-factor(dta$thrombechtomy)
dta$rep_any<-factor(dta$rep_any)
dta$pad<-factor(dta$pad)
dta$nihss_0<-as.numeric(dta$nihss_0)
dta$age<-as.numeric(dta$age)
dta$rtreat<-factor(dta$rtreat)
dta$sex<-factor(dta$sex)
dta$pase_0<-as.numeric(dta$pase_0)
dta$pase_6<-as.numeric(dta$pase_6)
dta$bmi<-as.numeric(dta$bmi)
dta$mdi_6<-as.numeric(dta$mdi_6)
dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))
```


```{r}
# Backup
dta_b<-dta
```


# Libraries
```{r}
library(daDoctoR)
library(mice)
```

## Variables to include in imputation
```{r}
# Possible variables to include
coval<-c("pase_0_bin","rtreat","age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
```


# Imputation
```{r}
# Output variables added to include in model. Excluded from predicting.
outc<-c("mdi_1_enr","mdi_6_newobs_enr")
# Adding all
covar<-c(coval,outc)
# Selecting dataset
r<-dta[,c("rnumb",covar)]
```

```{r}
# Iterations
mxt=20
# Imputations
mis=5
```

https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
```{r}
md.pattern(r)  # Missing pattern
# library(VIM)
# aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
```

```{r}
init <- mice(r, maxit=0)  # Creating initial imputation list to assess methods
meth <- init$method
meth
predM <- init$predictorMatrix

predM[, c("rnumb")] <- 0  # Defining variables not to be used for predicting imputed values

# meth[outc]=""
# Defining variables not to be imputed.
# Commented out as all included variables will be imputed
```

```{r echo=FALSE}
imputed <- mice(r, method=meth, predictorMatrix=predM, m=mis, maxit = mxt,seed = 103, printFlag=FALSE)
```

```{r}
# summary(imputed)
```

```{r}
library(dplyr)
export<-dta %>%
  select(-all_of(coval)) %>%  # Leaving out imputed variables from original dataset
  left_join(mice::complete(imputed,1),.,by="rnumb") # Join with the first imputed dataset for a full dataset export

md.pattern(export[coval]) # Ensuring complete data

write.csv(export,"/Volumes/Data/depression/imputed.csv",row.names = FALSE)  # Export
```


# Regression analyses
```{r}
print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",coval),data=dta,cut.p = .1)[[2]])
```

## Bivariabel

Function to format collected data from pool function
```{r}
pool_table<-function(clls){
  ## Variables needed: estimate, p.value, term

  coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
coll$hi<-round(coll$estimate+coll$std.error*1.96,2)

pa<-coll$p.value
pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))

cl<-data.frame(id=coll$term,diff=paste0(round(coll$estimate,2)," (",coll$lo," to ",coll$hi,")"),p=pa,stringsAsFactors=FALSE)
  return(cl)
}

keeps<-c("term","estimate","std.error","p.value")
```

### Repeated bivariabel analysis
Not necessary for this, but an interesting addition

```{r}
coll<-c()
for (i in c("rtreat",adjs_10)){
  ## Bivariable linear regression analysis of all
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0("mdi_1_enr~",i))))
    ))[-1,c("term","estimate","std.error","p.value")]

  coll<-rbind(coll,coeffs)

  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}

## Collecting

coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
coll$hi<-round(coll$estimate+coll$std.error*1.96,2)

pa<-coll$p.value
pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))

coll_bi<-data.frame(diff=paste0(round(exp(coll$estimate),2)," (",coll$lo," to ",coll$hi,")"),p=pa,id=coll$term,stringsAsFactors=FALSE)

```


## Unadjusted analyses

```{r}
adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]

adj_m<-c("rtreat","pase_0_bin")

coll<-c()
nms<-c()
for (l in outc){

for (i in adj_m){
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0(l,"~",i))))
    ))[-1,keeps]
  coll<-rbind(coll,coeffs)

  nms<-c(nms,paste(l,i,sep = "_"))

  d.long <- mice::complete(imputed,"long",include = T)

  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis

  for (j in levels(d.long[[i]])){
    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2

    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling

    coeffs<-summary(pool(
    with(s.imp,lm(as.formula(paste0(l,"~",adj_m[k]))))
    ))[-1,keeps]

    coll<-rbind(coll,coeffs)

    nms<-c(nms,paste(l,j,sep = "_"))
  }


  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}


}
coll$term<-nms

```

### Collecting
```{r}
biv_coll<-pool_table(coll)
```


## Adjusted analyses

```{r}
adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]

adj_m<-c("rtreat","pase_0_bin")

coll<-c()
nms<-c()
for (l in outc){
# l="mdi_1_enr"
for (i in adj_m){
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0(l,"~",paste(i,paste(adjs_10m,collapse="+"),sep="+")))))
    ))[2,keeps]
  coll<-rbind(coll,coeffs)

  nms<-c(nms,paste(l,i,sep = "_"))

  d.long <- mice::complete(imputed,"long",include = T)

  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis

  for (j in levels(d.long[[i]])){
    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2

    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling

    coeffs<-summary(pool(
    with(s.imp,lm(as.formula(paste0(l,"~",paste(adj_m[k],paste(adjs_10m,collapse="+"),sep="+")))))
    ))[2,keeps]

    coll<-rbind(coll,coeffs)

    nms<-c(nms,paste(l,j,sep = "_"))
  }


  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
}


}
coll$term<-nms

```

### Collecting
```{r}
mul_coll<-pool_table(coll)
colnames(mul_coll)[-1]<-paste0("adj_",colnames(mul_coll)[-1])
```

```{r}
library(lubridate)
write.csv(full_join(biv_coll,mul_coll),paste0("/Volumes/Data/depression/imp_regression_",today(),".csv"))
```