first commit of old data

2022-08-01 13:57:03 +02:00 · 2022-08-01 13:57:03 +02:00 · 2d7f0fc186
commit 2d7f0fc186
parent edb0afd869
26 changed files with 3805 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/PASE:rand.docx
+++ b/PASE:rand.docx
--- a/dep_data.Rmd
+++ b/dep_data.Rmd
@ -0,0 +1,304 @@
+---
+title: "Data export and wrangling"
+author: "Andreas Gammelgaard Damsbo"
+date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
+output:
+  pdf_document: default
+  html_document: default
+toc: TRUE
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE)
+```
+
+```{r message=FALSE}
+library(haven)
+library(plyr)
+library(dplyr)
+library(reshape2)
+```
+
+```{r}
+dta<-read.csv("/Volumes/Data/exercise/source/background.csv",
+              na.strings = c("NA",""),colClasses = "character")
+# dta_b<-dta
+```
+
+# Variables
+List of variables included in dataset
+
+```{r}
+dput(names(dta))
+```
+
+## New additions and formatting
+
+```{r}
+# Days from rdate to the recorded death date (mors_d); NA when either date is missing.
+dta$mors_delay <- difftime(as.Date(dta$mors_d), as.Date(dta$rdate), units = "days")
+# Counted as dead if death occurred within 38 days and the death date and
+# EOS (inc_time) lie within 1 day of each other.
+# The levels are fixed explicitly so that "no" is always a valid level; with
+# the default levels the NA replacement below would produce NA (with a
+# warning) whenever ifelse() happened to return no non-missing "no".
+dta$mors_v1 <- factor(
+  ifelse(dta$mors_delay <= 38 &
+           (dta$mors_delay - as.numeric(dta$inc_time)) <= 1,
+         "yes", "no"),
+  levels = c("no", "yes")
+)
+# Rows where the condition could not be evaluated (NA) are counted as "no".
+dta$mors_v1[is.na(dta$mors_v1)] <- "no"
+```
+
+```{r}
+# Counted as dead between 1 and 6 months if not counted as dead within
+# 1 month (mors_v1 == "no") and the death date and EOS (inc_time) lie within
+# 1 day of each other.
+# The levels are fixed explicitly so that "no" is always a valid level for the
+# NA replacement below.
+dta$mors_v16 <- factor(
+  ifelse(dta$mors_v1 == "no" &
+           (dta$mors_delay - as.numeric(dta$inc_time)) <= 1,
+         "yes", "no"),
+  levels = c("no", "yes")
+)
+# Rows where the condition could not be evaluated (NA) are counted as "no".
+dta$mors_v16[is.na(dta$mors_v16)] <- "no"
+```
+
+PASE score dichotomisation at median score.
+```{r}
+dta$pase_0 <- as.numeric(dta$pase_0)
+# Dichotomise at the median: [min, median] is "lower", (median, max] is "higher".
+dta$pase_0_bin <- cut(
+  dta$pase_0,
+  breaks = c(
+    min(dta$pase_0, na.rm = TRUE),
+    median(dta$pase_0, na.rm = TRUE),
+    max(dta$pase_0, na.rm = TRUE)
+  ),
+  labels = c("lower", "higher"),
+  include.lowest = TRUE
+)
+quantile(dta$pase_0, na.rm = TRUE)
+```
+
+### Formatting
+
+```{r}
+dta$inc_time<-as.numeric(dta$inc_time)
+```
+
+# Cleaning MDI scores
+
+The following contains a serious bit of data wrangling. The reasons are the occasional recording of visit 1 data at 6 months due to the LOCF (last observation carried forward) approach. Additionally, some patients have data recorded at 6 months, but their end date was later defined as lying before visit 6.
+Finally, the definition of when to count an MDI recording as a 1-month or a 6-month recording has added a bit of extra work.
+
+This work should be applied for all endpoint data. If needed, a general script or function should be written.
+
+Steps used for the correction:
+
+1.   If the inc_time is 38 days or less MDI 6 scores are moved to MDI 1 and visit 6 is defined as visit 1.
+2.   If both visit 1 and 6 dates are NA, use enddate as visit 1 date. This is the case if patients were excluded early.
+3.   If visit 6 is recorded later than enddate, use enddate as the visit 6 date instead. If the difference is more than 1 day, the MDI 6 score is dropped.
+4.   If visit delay is 7 days or less, and inclusion time is more than 38 days, MDI 1 is moved to MDI 6 and dropped from MDI 1. If MDI 1 and 6 are different, both are kept. The visit 6 date is set to enddate.
+5.   Defining the visit 6 date as same as enddate if visit delay is <7.
+
+
+```{r}
+summary(inc196<-dta$inc_time>196)
+dt1<-dta[inc196,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6","mors_delay")]
+
+```
+
+
+
+## Step 1
+```{r}
+summary(inc38 <- dta$inc_time <= 38)
+# Rows with a missing inc_time give NA in inc38. They are treated as
+# "not <= 38" so the ifelse() calls below keep the recorded values; otherwise
+# NA & TRUE evaluates to NA and visit_1 would be blanked for those rows.
+# This is done after summary() so the NA count is still reported.
+inc38[is.na(inc38)] <- FALSE
+dt1 <- dta[inc38, c("rnumb", "rdate", "visit_1", "visit_6", "inc_time", "mdi_1", "mdi_6")]
+# Visit 6 date becomes the visit 1 date when available; MDI 6 fills a missing MDI 1.
+dta$visit_1 <- ifelse(inc38 & !is.na(dta$visit_6), dta$visit_6, dta$visit_1)
+dta$mdi_1 <- ifelse(inc38 & is.na(dta$mdi_1), dta$mdi_6, dta$mdi_1)
+# The 6-month fields are then cleared for these patients.
+dta$mdi_6[inc38] <- NA
+dta$visit_6[inc38] <- NA
+# If the inc_time is 38 days or less MDI 6 scores are moved to MDI 1 and visit 6 is defined as visit 1.
+# LOCF correction.
+```
+
+## Step 2
+```{r}
+summary(na16enddate<-is.na(dta$visit_1)&is.na(dta$visit_6))
+dt2<-dta[na16enddate,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6")]
+dta$visit_1<-ifelse(na16enddate,dta$enddate,dta$visit_1)
+# If both visit 1 and 6 dates are NA, use enddate as visit 1 date. This is the case if patients were excluded early.
+```
+
+## Step 3
+```{r}
+# Visit 6 dated after enddate, split by the size of the gap:
+# at most 1 day (late61) or more than 1 day (late62).
+summary(late61 <- as.Date(dta$visit_6) > as.Date(dta$enddate) &
+          difftime(as.Date(dta$visit_6), as.Date(dta$enddate), units = "days") <= 1)
+summary(late62 <- as.Date(dta$visit_6) > as.Date(dta$enddate) &
+          difftime(as.Date(dta$visit_6), as.Date(dta$enddate), units = "days") > 1)
+
+# Missing dates give NA flags; those rows are left untouched.
+late61[is.na(late61)] <- FALSE
+late62[is.na(late62)] <- FALSE
+
+# dt5<-dta[late61,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6")]
+# dt6<-dta[late62,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6")]
+
+# In both groups the visit 6 date is replaced by enddate.
+dta$visit_6 <- ifelse(late61 | late62, dta$enddate, dta$visit_6)
+# MDI 6 is only dropped when the gap is more than 1 day.
+dta$mdi_6[late62] <- NA
+# If visit 6 is recorded later than enddate, use enddate instead.
+# A group of patients have visit 6 and MDI 6 recorded, but enddate is before the visit 6 date.
+# After manual lookups, this is likely due to some patients coming for visit 6, but the
+# interviewer later realizing that the patients should have been excluded earlier on.
+# Because of this, MDI 6 is dropped for patients with an enddate more than 1 day
+# (leaving room for simple recording errors) prior to the visit 6 date.
+# For patients with at most 1 day difference, MDI 6 is kept and the visit 6 date is set to enddate.
+```
+
+## Step 4
+
+```{r}
+# Flag rows where visit 6 lies at most 7 days after visit 1 (or the visit 1
+# date is missing) and inc_time is above 38.
+# units = "days" must be given explicitly: with the default units = "auto",
+# difftime() picks the unit from the smallest absolute difference, so a single
+# pair of identical dates makes the whole result come back in seconds and
+# `<= 7` would then mean 7 seconds instead of 7 days.
+summary(locflate <- (difftime(as.Date(dta$visit_6), as.Date(dta$visit_1), units = "days") <= 7 |
+                       is.na(dta$visit_1)) & dta$inc_time > 38)
+locflate[is.na(locflate)] <- FALSE
+
+dt2 <- dta[locflate, c("rnumb", "rdate", "visit_1", "visit_6", "inc_time", "mdi_1", "mdi_6")]
+
+# A missing MDI 6 is filled from MDI 1.
+dta$mdi_6 <- ifelse(locflate & is.na(dta$mdi_6), dta$mdi_1, dta$mdi_6)
+
+# MDI 1 is dropped when it equals MDI 6; the visit 1 date is dropped with it.
+dta$mdi_1[locflate & dta$mdi_1 == dta$mdi_6] <- NA
+dta$visit_1[locflate & is.na(dta$mdi_1)] <- NA
+
+dta$visit_6 <- ifelse(locflate, dta$enddate, dta$visit_6)
+
+# If visit delay is 7 days or less, and inclusion time is more than 38, MDI 1 is moved to MDI 6 and dropped. If MDI 1 and 6 are different both are kept. The visit 6 date is set to enddate.
+```
+
+## Step 5
+```{r}
+summary(same1n6date<-difftime(as.Date(dta$visit_6),as.Date(dta$visit_1),units = "days")<7)
+same1n6date[is.na(same1n6date)]<-FALSE
+# dt5<-dta[same1n6date,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6",drops)]
+dta$visit_6<-ifelse(same1n6date,dta$enddate,dta$visit_6)
+# Defining the visit 6 date as same as enddate if visit delay is <7.
+```
+
+
+# Visit delay
+```{r}
+dta$visit_delay<-difftime(as.Date(dta$visit_6),as.Date(dta$visit_1),units = "days")
+# Final calculation of days between visits
+summary(as.numeric(dta$visit_delay))
+```
+
+# newobs definition - DEPRECATED
+
+The definition of a truly new observation is a recorded score at least 7 days after the first score. This was relevant prior to the work of redefining time points for scoring.
+```{r}
+dta$mdi_6_newobs<-dta$mdi_6
+# The newobs variable is later used, but is obsolete due to the previous change in definitions. The previous definition of newobs were a delay between visits of at least 7 days. No cases matched. Minimum is 13.
+```
+
+
+```{r}
+# dta$mdi_6_166<-ifelse(dta$inc_time>166,NA,dta$mdi_6)
+# dta$mdi_6_80<-ifelse(dta$inc_time>80,NA,dta$mdi_6)
+# dta$mdi_6_protocol<-ifelse(dta$protocol=="2",NA,ifelse(is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6))
+# dta$mdi_6_locf<-ifelse(is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6)
+```
+
+
+# Drops
+Streamlining drop out data to avoid NA's.
+```{r}
+drops <- c("side_effect2", "side_effect", "wants_out", "open_treat")
+# Recode each drop-out reason to "yes"/"no": only the answer "1. Ja" counts as
+# "yes"; every other answer and missing values become "no".
+for (reason in drops) {
+  recoded <- ifelse(dta[[reason]] == "1. Ja", "yes", "no")
+  recoded[is.na(recoded)] <- "no"
+  dta[[reason]] <- recoded
+}
+```
+
+Defining a common all cause drop variable
+```{r}
+dta$drop<-ifelse(dta$side_effect2=="yes"|dta$side_effect=="yes"|dta$wants_out=="yes"|dta$open_treat=="yes","yes","no")
+```
+
+Defining drop before or at day 38 (Following protocol design) as drop before 1 month and drop after day 38 as drop between 1 and 6 months
+```{r}
+cut_line<-38
+dta$inc_time<-as.numeric(dta$inc_time)
+dta$drop1<-ifelse(dta$drop=="yes"&dta$inc_time<=cut_line,"yes","no")
+summary(factor(dta$drop1))
+
+# dt3<-dta[,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6","mdi_6_newobs","drop1","drop16",drops)]
+
+dta$drop16<-ifelse(dta$drop=="yes"&dta$inc_time>cut_line,"yes","no")
+summary(factor(dta$drop16))
+summary(factor(dta$drop))
+# dtf<-dta[dta$drop1=="yes",c("mdi_6_newobs","inc_time")]
+# dtf<-dta[,c("mdi_1","mdi_6_newobs","inc_time","drop","drop1","drop16")]
+```
+
+
+# Enriching 
+Enriching the data with patients excluded due to need for open treatment, and defining the populations to include/exclude
+```{r}
+# Patients to enrich at 1 month: flagged for open treatment, no MDI 1 recorded,
+# and dropped out at or before the 1-month cut-off (drop1).
+summary(sel_enr_1<-dta$open_treat=="yes"&is.na(dta$mdi_1)&dta$drop1=="yes")
+dta$mdi_1_enr<-ifelse(sel_enr_1,21,dta$mdi_1)  # Per agreement, patients excluded due to open treatment need are given the fictitious MDI score "21".
+```
+
+Vectorising ex/inclusions at 1 month, to keep patients with data or with later data.
+```{r}
+# Classifies every patient's 1-month status; case_when() uses the first matching clause.
+# Note on the first clause: `&` binds tighter than `|`, so it reads
+#   mors_v1 == "yes"  OR  (enriched MDI 1 missing AND drop1 == "yes").
+# Codes: ex_1 excluded, ca_1 missing at 1 month but with MDI 6 data (carried
+# along), mi_1 missing, en_1 enriched (MDI 1 missing but enriched value present),
+# dt_1 data available.
+summary(dta$excluded_1<-factor(case_when(dta$mors_v1=="yes"|
+                                           is.na(dta$mdi_1_enr)&
+                                           dta$drop1=="yes"~"ex_1", # Excluded
+                         is.na(dta$mdi_1_enr)&!is.na(dta$mdi_6_newobs)~"ca_1", # Missing, but carried to 6 months
+                         is.na(dta$mdi_1_enr)~"mi_1", # Missing
+                         is.na(dta$mdi_1)&!is.na(dta$mdi_1_enr)~"en_1", # Enriched
+                         TRUE ~ "dt_1"))) # Data available
+```
+
+
+```{r}
+# Patients to enrich at 6 months: flagged for open treatment, dropped out after
+# the 1-month cut-off (drop16), no MDI 6 recorded, 1-month status ca_1 or dt_1,
+# and not counted as dead between 1 and 6 months.
+summary(sel_enr_6<-dta$open_treat=="yes"&dta$drop16=="yes"&is.na(dta$mdi_6_newobs)&dta$excluded_1%in%c("ca_1","dt_1")&dta$mors_v16=="no")
+
+# Entries to be enriched are entries with need for open treatment after 1 month, with missing mdi_6_newobs and with data at or "carried" from 1 month 
+
+dta$mdi_6_newobs_enr<-as.numeric(ifelse(sel_enr_6,21,dta$mdi_6_newobs))  # Per agreement, patients excluded due to open treatment need are given the fictitious MDI score "21".
+```
+
+```{r}
+# Classifies every patient's 6-month status; case_when() uses the first matching clause.
+# NOTE(review): there is no fallback (TRUE ~ ...) clause, so rows matching none
+# of the clauses (e.g. en_1 patients with a recorded MDI 6) get NA — confirm
+# that this is intended.
+summary(dta$excluded_6<-factor(case_when(is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1","en_1")~"ex_6", # No (enriched) MDI 6 although included at 1 month: excluded due to death or other dropout
+                                         is.na(dta$mdi_6_newobs_enr)~"mi_6", # Missing data due to exclusion at 1 month
+                                         is.na(dta$mdi_6_newobs)&!is.na(dta$mdi_6_newobs_enr)~"en_6", # Enriched entries
+                                         dta$excluded_1%in%c("ca_1","dt_1")~"dt_6" # Organic data available
+                                         )))
+```
+
+```{r}
+# dtf<-cbind(dta[,c("rnumb","mdi_1","mdi_6_newobs","inc_time","drop","drop1","drop16","mdi_1_enr","mdi_6_newobs_enr","excluded_1","excluded_6","mors_v16","mors_delay")],"excluded"=is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1","en_1"))
+# 
+# summary(dtf %>% filter(excluded==TRUE))
+```
+
+
+# Main Dataset export
+```{r}
+variable_namebits<-c("rnumb","rtreat","age","sex",
+               "bmi",
+               "smoke_ever",
+               "civil",
+               "diabetes", 
+               "hypertension",
+               "pad", 
+               "afli",  
+               "ami", 
+               "tci",
+               "nihss_0",
+               "thrombolysis", 
+               "thrombechtomy",
+               "rep_any",
+               "pase_0",
+               "pase_6",
+               "mrs_0","mrs_1","mrs_6",
+#               "who5_score",
+               "mdi",
+#               "ham_score_1","ham_score_6",
+               "mors",
+               "drop",
+               "wants_out",
+               "side_effect",
+               "open_treat",
+               "side_effect2",
+               "excluded",
+               "protocol","eos_early","inc_time",
+               "rdate","visit","enddate"
+               )
+```
+
+```{r}
+export<-dta %>% select(contains(variable_namebits))
+```
+
+```{r}
+write.csv(export,"/Volumes/Data/depression/dep_dataset.csv",row.names = FALSE)
+```
+
--- a/dep_data.pdf
+++ b/dep_data.pdf
--- a/dep_flow.Rmd
+++ b/dep_flow.Rmd
@ -0,0 +1,164 @@
+---
+title: "Patient flowchart and chi^2 tests"
+author: "Andreas Gammelgaard Damsbo"
+date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE)
+```
+
+# Import
+```{r}
+dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
+```
+
+# Defining patients to include for analysis
+Only including cases with complete pase_0. The additional requirement of complete MDI at 1 & 6 months is currently commented out in the chunk below.
+```{r}
+dta<-dta_all[!is.na(dta_all$pase_0),]
+# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
+```
+
+# Backup
+```{r}
+dta_b<-dta
+```
+
+
+# Sammentællinger
+```{r}
+# Missingness overview for all imported patients. Every column is taken from
+# dta_all so that all inputs to cbind() have the same number of rows; mixing in
+# the pase_0-filtered `dta` would recycle the shorter vectors and misalign rows.
+summary(cbind(is.na(dta_all[, c("pase_0", "mdi_1", "mdi_1_enr", "mdi_6_newobs", "mdi_6_newobs_enr")]),
+      both_missing = is.na(dta_all$mdi_1) & is.na(dta_all$mdi_6_newobs),
+      either_missing = is.na(dta_all$mdi_1) | is.na(dta_all$mdi_6_newobs)))
+```
+
+
+```{r}
+# Columns differ in length (dta_all vs. the filtered dta), so cbind() recycles
+# the shorter ones; suppressWarnings() hides the resulting warning.
+suppressWarnings(summary(cbind(all_particip=dta_all$mors_180=="yes",
+              all_pase0=dta$mors_180=="yes",
+              all_mdi_1=!is.na(dta$mdi_1)&dta$mors_180=="yes"))) # Number of patients who die
+
+table(dta$pase_0_bin,factor(dta$mors_180)) # Number of patients who die, stratified by PASE group
+```
+
+```{r}
+# summary(factor(dta$mors_v1))
+# summary(factor(dta$mors_v16)) # OBS medregnet er 2 dødsfald, der ikke har MDI 1.
+```
+
+# Flow
+
+## 1 month
+
+Shows counts of all patients with missing MDI 1 scores.
+```{r message=FALSE}
+source("/Volumes/Data/depression/function_flow.R") # Home made flow function
+show(flow_prog(df=dta[dta$excluded_1%in%c("ex_1","mi_1","ca_1"),],
+          sngl=c("mors_v1","drop1"),
+          sngl_keep=c("no","yes"),
+          mltp=c("open_treat","wants_out","side_effect","side_effect2")))
+# v1<-dta$rnumb[dta$excluded_1%in%c("ex_1","mi_1")]
+```
+
+Same overview, but vectorised
+```{r}
+summary(factor(dta$excluded_1))
+
+# dt_1 are organic data, en_1 are enriched, ex_1 are excluded, mi_1 were missing, 
+# ca_1 were missing at 1 month, but held data at 6 months, and thus carried along.
+```
+
+
+
+## 6 months
+
+Shows counts of all patients with missing MDI 6 scores.
+```{r}
+show(flow_prog(df=dta[is.na(dta$mdi_6_newobs_enr)&dta$excluded_6%in%c("ex_6"),], # 
+          sngl=c("mors_v16","drop16"),
+          sngl_keep=c("no","yes"),
+          mltp=c("open_treat","wants_out","side_effect","side_effect2")))
+# v2<-dta$rnumb[is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1")]
+```
+
+```{r}
+summary(factor(dta$excluded_6))
+
+# dt_6 are organic data, en_6 are enriched, ex_6 are excluded, mi_6 were excluded at 1 month.
+# At 6 month 118 are excluded due to any cause
+# Due to later inclusion of ca_1 patients, the sum of patients excluded at 6 months is 71+62-16=117
+```
+
+This flow counts all patients dying or dropping out early after 1 month. Some have a recorded MDI at dropout. This is just to give a perspective on data.
+```{r}
+show(flow_prog(df=dta, # 
+          sngl=c("mors_v16","drop16"),
+          sngl_keep=c("no","yes"),
+          mltp=c("open_treat","wants_out","side_effect","side_effect2")))
+# v2<-dta$rnumb[is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1")]
+```
+
+```{r}
+summary(as.numeric(dta$inc_time[dta$drop16=="yes"]))
+```
+
+
+# Chi^2 tests
+
+```{r}
+source("/Volumes/Data/depression/function_chi_test_sum.R")
+ex_lst<-list()
+ex_var<-c("mdi_1_enr","rtreat","pase_0_bin")
+for (i in 2:3){
+ ex_lst <- append(ex_lst,chi_test_sum(a=is.na(dta[,ex_var[1]]),
+                                      b=dta[,ex_var[i]],
+                                      aname=ex_var[1],
+                                      bname=ex_var[i]))
+}
+```
+
+```{r}
+# ex_var<-c("open_treat","rtreat","pase_0_bin")
+# for (i in 2:3){
+#  ex_lst <- append(ex_lst,
+#                   chi_test_sum(a=dta[,ex_var[1]],
+#                                b=dta[,ex_var[i]],
+#                                aname=ex_var[1],
+#                                bname=ex_var[i]))
+# }
+
+## Taget ud grundet for lave tal
+```
+
+```{r}
+ex_var<-c("mdi_1_enr","rtreat","pase_0_bin")
+for (i in 2:3){
+ ex_lst <- append(ex_lst,
+                  chi_test_sum(a=is.na(dta$mdi_6_newobs_enr)&!is.na(dta$mdi_1_enr),
+                                      b=dta[,ex_var[i]],
+                                      aname="Excluded at 6 months",
+                                      bname=ex_var[i]))
+}
+for (i in 2:3){
+ ex_lst <- append(ex_lst,
+                  chi_test_sum(a=is.na(dta$mdi_6_newobs_enr),
+                                      b=dta[,ex_var[i]],
+                                      aname="Total unavailable at 6 months",
+                                      bname=ex_var[i]))
+}
+```
+
+
+```{r}
+# ex_var<-c("open_treat","rtreat","pase_0_bin")
+# for (i in 2:3){
+#  ex_lst <- append(ex_lst,
+#                   chi_test_sum(a=dta[,ex_var[1]],
+#                                b=dta[,ex_var[i]],
+#                                aname=ex_var[1],
+#                                bname=ex_var[i]))
+# }
+show(ex_lst)
+```
--- a/dep_flow.pdf
+++ b/dep_flow.pdf
--- a/dep_imputation.Rmd
+++ b/dep_imputation.Rmd
@ -0,0 +1,299 @@
+---
+title: "Sensitivity analysis on imputed dataset"
+author: "Andreas Gammelgaard Damsbo"
+date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE)
+```
+
+
+# Import
+```{r}
+dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown")) ## Extending definition of NA's for imputation
+```
+
+# Defining patients to include for analysis
+Only including cases with complete pase_0. The additional requirement of complete MDI at 1 & 6 months is currently commented out in the chunk below.
+```{r}
+dta<-dta_all[!is.na(dta_all$pase_0),]
+# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
+```
+
+## Formatting
+```{r echo=FALSE}
+# Categorical variables are stored as factors with their default levels.
+for (v in c("diabetes", "pad", "civil", "hypertension", "afli", "smoke_ever",
+            "ami", "tci", "thrombolysis", "thrombechtomy", "rep_any",
+            "rtreat", "sex")) {
+  dta[[v]] <- factor(dta[[v]])
+}
+
+# Continuous measures are stored as numeric.
+for (v in c("nihss_0", "age", "pase_0", "pase_6", "bmi", "mdi_6")) {
+  dta[[v]] <- as.numeric(dta[[v]])
+}
+
+# Fixed level order so that "lower" is the reference level.
+dta$pase_0_bin <- factor(dta$pase_0_bin, levels = c("lower", "higher"))
+```
+
+
+```{r}
+# Backup
+dta_b<-dta
+```
+
+
+# Libraries
+```{r}
+library(daDoctoR)
+library(mice)
+```
+
+## Variables to include in imputation
+```{r}
+# Possible variables to include
+coval<-c("pase_0_bin","rtreat","age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
+```
+
+
+# Imputation
+```{r}
+# Output variables added to include in model. Excluded from predicting.
+outc<-c("mdi_1_enr","mdi_6_newobs_enr")
+# Adding all
+covar<-c(coval,outc)
+# Selecting dataset
+r<-dta[,c("rnumb",covar)]
+```
+
+```{r}
+# Number of iterations (passed to mice() as maxit)
+mxt <- 20
+# Number of imputed datasets (passed to mice() as m)
+mis <- 5
+```
+
+https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
+```{r}
+md.pattern(r)  # Missing pattern
+# library(VIM)
+# aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
+```
+
+```{r}
+init <- mice(r, maxit=0)  # Creating initial imputation list to assess methods
+meth <- init$method
+meth
+predM <- init$predictorMatrix
+
+predM[, c("rnumb")] <- 0  # Defining variables not to be used for predicting imputed values
+
+# meth[outc]=""  
+# Defining variables not to be imputed. 
+# Commented out as all included variables will be imputed
+```
+
+```{r echo=FALSE}
+imputed <- mice(r, method=meth, predictorMatrix=predM, m=mis, maxit = mxt,seed = 103, printFlag=FALSE)
+```
+
+```{r}
+# summary(imputed)
+```
+
+```{r}
+library(dplyr)
+export<-dta %>%
+  select(-all_of(coval)) %>%  # Leaving out imputed variables from original dataset
+  left_join(mice::complete(imputed,1),.,by="rnumb") # Join with the first imputed dataset for a full dataset export
+
+md.pattern(export[coval]) # Ensuring complete data
+
+write.csv(export,"/Volumes/Data/depression/imputed.csv",row.names = FALSE)  # Export
+```
+
+
+# Regression analyses
+```{r}
+print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",coval),data=dta,cut.p = .1)[[2]])
+```
+
+## Bivariable
+
+Function to format collected data from pool function
+```{r}
+#' Format pooled regression coefficients for table output
+#'
+#' @param clls A data.frame with the columns `term`, `estimate`, `std.error`
+#'   and `p.value`, one row per coefficient.
+#' @return A data.frame with `id` (the term), `diff` (the estimate with a
+#'   normal-approximation 95% interval, "est (lo to hi)") and `p` (the p-value
+#'   rounded to 3 decimals, "<0.001" below that; prefixed with "*" when
+#'   <= 0.05 and "." when <= 0.1).
+pool_table <- function(clls) {
+  ## Variables needed: estimate, std.error, p.value, term
+
+  # Use the argument, not a global object that happens to be called `coll`.
+  lo <- round(clls$estimate - clls$std.error * 1.96, 2)
+  hi <- round(clls$estimate + clls$std.error * 1.96, 2)
+
+  # Flags are decided on the rounded numeric p-value; comparing the formatted
+  # strings with `<=` would depend on string collation.
+  p_raw <- clls$p.value
+  p_rnd <- round(p_raw, 3)
+  p_txt <- ifelse(p_raw < 0.001, "<0.001", as.character(p_rnd))
+  p_flag <- ifelse(p_raw < 0.001 | p_rnd <= 0.05, "*",
+                   ifelse(p_rnd <= 0.1, ".", ""))
+  pa <- paste0(p_flag, p_txt)
+  pa[is.na(p_raw)] <- NA  # keep missing p-values missing
+
+  data.frame(
+    id = clls$term,
+    diff = paste0(round(clls$estimate, 2), " (", lo, " to ", hi, ")"),
+    p = pa,
+    stringsAsFactors = FALSE
+  )
+}
+
+keeps<-c("term","estimate","std.error","p.value")
+```
+
+### Repeated bivariable analysis
+Not necessary for this, but an interesting addition
+
+```{r}
+coll <- c()
+for (i in c("rtreat", adjs_10)) {
+  ## Bivariable linear regression of mdi_1_enr on each variable, pooled across
+  ## the imputed datasets; the first (intercept) row is dropped.
+  coeffs <- summary(pool(
+    with(imputed, lm(as.formula(paste0("mdi_1_enr~", i))))
+  ))[-1, c("term", "estimate", "std.error", "p.value")]
+
+  coll <- rbind(coll, coeffs)
+
+  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
+  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
+}
+
+## Collecting
+
+# Normal-approximation 95% interval limits.
+coll$lo <- round(coll$estimate - coll$std.error * 1.96, 2)
+coll$hi <- round(coll$estimate + coll$std.error * 1.96, 2)
+
+pa <- coll$p.value
+pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
+pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*", pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa), pa))
+
+# These are linear models, so the estimate is reported on the same
+# (untransformed) scale as its interval limits; exponentiating only the point
+# estimate made the two inconsistent.
+coll_bi <- data.frame(diff = paste0(round(coll$estimate, 2), " (", coll$lo, " to ", coll$hi, ")"), p = pa, id = coll$term, stringsAsFactors = FALSE)
+
+
+```
+
+
+## Unadjusted analyses
+
+```{r}
+# Adjustment set without pase_0_bin (not used by the unadjusted models in this chunk).
+adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
+
+# The two exposures. Each is analysed overall, and the other one is analysed
+# within each of its levels.
+adj_m<-c("rtreat","pase_0_bin")
+
+# coll collects the coefficient rows, nms the matching row labels.
+# Assumes every model contributes exactly one row (two-level predictors);
+# otherwise the final coll$term<-nms fails on a length mismatch.
+coll<-c()
+nms<-c()
+for (l in outc){
+
+for (i in adj_m){
+  # Overall model l ~ i pooled across imputations; the first (intercept) row is dropped.
+  coeffs<-summary(pool(
+    with(imputed,lm(as.formula(paste0(l,"~",i))))
+    ))[-1,keeps]
+  coll<-rbind(coll,coeffs)
+  
+  nms<-c(nms,paste(l,i,sep = "_"))
+  
+  # Stacked ("long") completed data; include = T presumably also keeps the
+  # original incomplete data needed by as.mids() below — confirm.
+  d.long <- mice::complete(imputed,"long",include = T)
+  
+  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
+  
+  # Within each level j of i: model of the outcome on the other exposure.
+  for (j in levels(d.long[[i]])){
+    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2
+
+    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
+    
+    coeffs<-summary(pool(
+    with(s.imp,lm(as.formula(paste0(l,"~",adj_m[k]))))
+    ))[-1,keeps]
+    
+    coll<-rbind(coll,coeffs)
+    
+    nms<-c(nms,paste(l,j,sep = "_"))
+  }
+  
+  
+  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
+  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
+}
+
+
+}
+# Replace the coefficient names with the outcome_exposure / outcome_level labels.
+coll$term<-nms
+
+
+```
+
+### Collecting
+```{r}
+biv_coll<-pool_table(coll)
+```
+
+
+
+## Adjusted analyses
+
+```{r}
+# Adjustment set without pase_0_bin; added to every model in this chunk.
+adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
+
+# The two exposures. Each is analysed overall, and the other one is analysed
+# within each of its levels.
+adj_m<-c("rtreat","pase_0_bin")
+
+# coll collects one coefficient row per model, nms the matching row labels.
+coll<-c()
+nms<-c()
+for (l in outc){
+# l="mdi_1_enr"
+for (i in adj_m){
+  # Overall model l ~ i + adjs_10m pooled across imputations. Row 2 is kept:
+  # the first coefficient after the intercept, i.e. the exposure i, which
+  # is the first term in the formula.
+  coeffs<-summary(pool(
+    with(imputed,lm(as.formula(paste0(l,"~",paste(i,paste(adjs_10m,collapse="+"),sep="+")))))
+    ))[2,keeps]
+  coll<-rbind(coll,coeffs)
+  
+  nms<-c(nms,paste(l,i,sep = "_"))
+  
+  # Stacked ("long") completed data; include = T presumably also keeps the
+  # original incomplete data needed by as.mids() below — confirm.
+  d.long <- mice::complete(imputed,"long",include = T)
+  
+  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
+  
+  # Within each level j of i: adjusted model of the outcome on the other exposure.
+  for (j in levels(d.long[[i]])){
+    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2
+
+    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
+    
+    coeffs<-summary(pool(
+    with(s.imp,lm(as.formula(paste0(l,"~",paste(adj_m[k],paste(adjs_10m,collapse="+"),sep="+")))))
+    ))[2,keeps]
+    
+    coll<-rbind(coll,coeffs)
+    
+    nms<-c(nms,paste(l,j,sep = "_"))
+  }
+  
+  
+  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
+  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
+}
+
+
+}
+# Replace the coefficient names with the outcome_exposure / outcome_level labels.
+coll$term<-nms
+
+
+```
+
+### Collecting
+```{r}
+mul_coll<-pool_table(coll)
+colnames(mul_coll)[-1]<-paste0("adj_",colnames(mul_coll)[-1])
+```
+
+```{r}
+library(lubridate)
+write.csv(full_join(biv_coll,mul_coll),paste0("/Volumes/Data/depression/imp_regression_",today(),".csv"))
+```
+
--- a/dep_imputation.pdf
+++ b/dep_imputation.pdf
--- a/dep_median.Rmd
+++ b/dep_median.Rmd
@ -0,0 +1,97 @@
+---
+title: "Additional data analysis"
+author: "Andreas Gammelgaard Damsbo"
+date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
+```
+
+# Import
+```{r}
+dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
+```
+
+# Defining patients to include for analysis
+Only including cases with complete pase_0. The additional requirement of complete MDI at 1 & 6 months is currently commented out in the chunk below.
+```{r}
+dta<-dta_all[!is.na(dta_all$pase_0),]
+# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
+quantile(dta$pase_0, probs = seq(0, 1, 0.25), names = TRUE)
+```
+
+## Formatting
+```{r echo=FALSE}
+dta$rtreat<-factor(dta$rtreat)
+dta$pase_6<-as.numeric(dta$pase_6)
+```
+
+# Summaries
+
+Fraction with inc_time of at least 166 days
+```{r}
+# Indicator of inc_time >= 166 days among patients whose 6-month status is
+# "dt_6" or "en_6".
+dt <- as.numeric(dta[dta$excluded_6 %in% c("dt_6", "en_6"), c("inc_time")]) >= 166
+summary(dt)
+# Percent with at least 166 days. sum(..., na.rm = TRUE) counts only TRUE
+# values; indexing with dt[dt == TRUE] also returns one NA element for every
+# missing inc_time and would inflate the numerator.
+sum(dt, na.rm = TRUE) / length(dt) * 100
+```
+
+5% percentiles
+```{r}
+# Percentiles in 5% steps of inc_time itself for patients whose 6-month status
+# is "dt_6" or "en_6". Quantiles of the logical ">= 166" indicator `dt` can
+# only be 0 or 1 and carry no information about the distribution.
+quantile(as.numeric(dta[dta$excluded_6 %in% c("dt_6", "en_6"), c("inc_time")]),
+         probs = seq(0, 1, 0.05), names = TRUE, na.rm = TRUE)
+```
+
+
+Base version
+```{r}
+aggregate(pase_6 ~ rtreat, data = dta, summary)
+```
+
+Fancy version
+```{r}
+psych::describeBy(dta$pase_6, dta$rtreat,mat=T)
+```
+
+# Mann-Whitney U test
+See: https://stat-methods.com/home/mann-whitney-u-r/
+
+```{r}
+# Perform the Mann-Whitney U test (unpaired two-sample Wilcoxon test).
+# `paired` must not be passed to the formula method (recent R versions signal
+# an error), and unpaired is the default anyway. `na.rm` is not an argument of
+# wilcox.test(); missing values are handled by the formula method's na.action.
+m1 <- wilcox.test(pase_6 ~ rtreat, data = dta,
+                  exact = FALSE, conf.int = TRUE)
+print(m1)
+```
+
+
+
+# Boxplot
+
+## Base function - simple
+```{r}
+boxplot(dta$pase_6 ~ dta$rtreat)
+```
+
+## ggplot2 - fancy version
+```{r}
+library(ggplot2)
+# Boxplot of pase_6 by rtreat with whisker caps and the group mean as a point.
+# NOTE(review): the title text looks copied from a tutorial — confirm.
+ggplot(dta, aes(x = rtreat, y = pase_6, fill = rtreat)) +
+  stat_boxplot(geom = "errorbar", width = 0.5) +
+  geom_boxplot(fill = "light blue") +
+  # Point symbol is the mean value. `fun` replaces the deprecated `fun.y`.
+  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5, color = "black") +
+  ggtitle("Boxplot of Treatments C and D") +
+  theme_bw() + theme(legend.position = "none")
+```
+
+# Bonus: QQ plots
+```{r}
+library(qqplotr)
+ggplot(data = dta, mapping = aes(sample = pase_6, color = rtreat, fill = rtreat)) +
+  stat_qq_band(alpha=0.5, conf=0.95, qtype=1, bandType = "boot") +
+  stat_qq_line(identity=TRUE) +
+  stat_qq_point(col="black") +
+  facet_wrap(~ rtreat, scales = "free") +
+  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") + theme_bw()
+```
+
--- a/dep_median.pdf
+++ b/dep_median.pdf
--- a/dep_regression.Rmd
+++ b/dep_regression.Rmd
@ -0,0 +1,220 @@
+---
+title: "Regression analyses"
+author: "Andreas Gammelgaard Damsbo"
+date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE)
+```
+
+
+# Import
+```{r}
+dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
+```
+
+# Defining patients to include for analysis
+Only including cases with complete pase_0. The additional requirement of complete MDI at 1 & 6 months is currently commented out in the chunk below.
+```{r}
+dta<-dta_all[!is.na(dta_all$pase_0),]
+# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
+```
+
+## Formatting
+```{r echo=FALSE}
+# Categorical variables are stored as factors with their default levels.
+for (v in c("diabetes", "pad", "civil", "hypertension", "afli", "smoke_ever",
+            "ami", "tci", "thrombolysis", "thrombechtomy", "rep_any",
+            "rtreat", "sex")) {
+  dta[[v]] <- factor(dta[[v]])
+}
+
+# Continuous measures are stored as numeric.
+for (v in c("nihss_0", "age", "pase_0", "pase_6", "bmi", "mdi_6")) {
+  dta[[v]] <- as.numeric(dta[[v]])
+}
+
+# Fixed level order so that "lower" is the reference level.
+dta$pase_0_bin <- factor(dta$pase_0_bin, levels = c("lower", "higher"))
+```
+
+
+```{r}
+# Backup
+dta_b<-dta
+```
+
+
+# Linear regression analysis
+
+```{r}
+library(broom)
+library(daDoctoR)
+library(lubridate)
+```
+
+## Tests of variables to adjust for
+```{r}
+# Possible variables to include
+adjs<-c("age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
+```
+
+```{r}
+# Variables with p<10% i bivariable linear regression analysis
+print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",adjs),data=dta,cut.p = .1)[[2]])
+```
+
+## True mean estimations (adjusted)
+
+```{r}
+# Adjusted estimates of mdi_6, stratified first by rtreat and then by
+# pase_0_bin; the two result lists are concatenated.
+strt <- append(
+  print_pred_stratum(meas = "mdi_6", adj = unique(c("pase_0_bin", adjs_10)),
+                     strat = "rtreat", data = dta, include.stratum = TRUE),
+  print_pred_stratum(meas = "mdi_6", adj = c(adjs_10[adjs_10 != "pase_0_bin"], "rtreat"),
+                     strat = "pase_0_bin", data = dta, include.stratum = TRUE)
+)
+
+# One CSV per stratum, holding the first element of each result.
+# seq_along() keeps the loop empty when strt is empty (1:length() would
+# iterate over 1 and 0).
+for (i in seq_along(strt)) {
+  write.csv(strt[[i]][[1]], paste0("tbl_md6_", substr(names(strt)[i], 1, 3), ".csv"))
+}
+
+# Label/value pairs taken from the fifth element of each result. Collected
+# without growing a vector in a loop and without naming a variable `c`, which
+# shadows base::c.
+true_means <- unlist(lapply(seq_along(strt), function(i) {
+  c(paste("Estimated true mean,", names(strt)[i]), strt[[i]][[5]])
+}), use.names = FALSE)
+mat_true <- matrix(
+  c(true_means, "Variables adjusted for:", paste(c("rtreat", adjs_10), collapse = ", ")),
+  ncol = 2, byrow = TRUE
+)
+```
+
+# MDI outcome 2x2 
+
+```{r}
+sts<-c("pase_0_bin","rtreat")
+# sts<-c("rtreat","pase_0_bin")
+adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
+```
+
+## One month
+
+### Enriched
+
+```{r echo=FALSE}
+outs<-c("mdi_1_enr")
+
+source("/Volumes/Data/depression/script_regression_frame.R")
+
+show(reg_frm)
+write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
+```
+
+
+### Raw
+```{r echo=FALSE}
+outs<-c("mdi_1")
+
+source("/Volumes/Data/depression/script_regression_frame.R")
+
+show(reg_frm)
+write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
+```
+
+
+## Six months
+
+### New Observations - enriched
+```{r echo=FALSE}
+outs<-c("mdi_6_newobs_enr")  # Outcome is based on new observations for 2nd (6 months) visit
+
+# mean(dta$inc_time[!is.na(dta[,outs])])
+# quantile(dta$inc_time[!is.na(dta[,outs])])
+
+source("/Volumes/Data/depression/script_regression_frame.R")
+
+show(reg_frm)
+write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
+```
+
+### New Observations
+```{r echo=FALSE}
+outs<-c("mdi_6_newobs")  # Outcome is based on new observations for 2nd (6 months) visit
+
+# mean(dta$inc_time[!is.na(dta[,outs])])
+# quantile(dta$inc_time[!is.na(dta[,outs])])
+
+source("/Volumes/Data/depression/script_regression_frame.R")
+
+show(reg_frm)
+write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
+```
+
+### New observations - adjusted for 6 months PASE
+```{r echo=FALSE}
+outs<-c("mdi_6_newobs")
+adjs_10m<-c(adjs_10[adjs_10!="pase_0_bin"],"pase_6")
+
+# mean(dta$inc_time[!is.na(dta[,outs])])
+# quantile(dta$inc_time[!is.na(dta[,outs])])
+
+source("/Volumes/Data/depression/script_regression_frame.R")
+
+show(reg_frm)
+write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
+```
+
+
+## Dichotomized sensitivity analysis
+
+```{r}
+# Composite outcome: "yes" if the patient needed open treatment or MDI rose by
+# more than 5 points from 1 to 6 months; missing if either MDI score is
+# missing (and the first clause did not apply); otherwise "no".
+# case_when() is called with its namespace because dplyr is not attached in
+# this document. NA_character_ is used directly instead of going through a
+# temporary "NA" string.
+dta$composite_out <- dplyr::case_when(
+  dta$open_treat == "yes" | (dta$mdi_6_newobs - dta$mdi_1) > 5 ~ "yes",
+  is.na(dta$mdi_6_newobs) ~ NA_character_,
+  is.na(dta$mdi_1) ~ NA_character_,
+  TRUE ~ "no"
+)
+summary(dta$composite_out <- factor(dta$composite_out))
+```
+
+## Enriching and cleaning variables
+
+```{r}
+# Enriching: missing pad and hypertension are counted as "no".
+# NOTE(review): both variables are factors at this point, so this only works
+# if "no" is already one of their levels — confirm.
+dta$pad[is.na(dta$pad)] <- "no"
+dta$hypertension[is.na(dta$hypertension)] <- "no"
+
+# Cleaning: "unknown" civil status is set to missing.
+# civil is a factor here, and ifelse() on a factor returns its integer codes,
+# so it is converted to character first to keep the original labels.
+dta$civil <- factor(ifelse(dta$civil == "unknown", NA, as.character(dta$civil)))
+```
+
+
+```{r}
+table(dta$rtreat,dta$pase_0_bin)
+
+outs<-"composite_out"
+sts<-c("pase_0_bin","rtreat")
+# sts<-c("rtreat","pase_0_bin")
+adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
+dta_frm<-dta[!is.na(dta$composite_out),c(outs,sts,adjs_10m)]
+
+summary(dta_frm)
+
+# colnames(dta_frm)[1]<-"outs"
+
+# print_log(meas="composite_out",var=sts[2],adj=c(sts[1],adjs_10m),data=dta_frm)
+
+# print_pred(meas="composite_out",adj=c(sts[2],adjs_10m),data=dta_frm[dta_frm$pase_0_bin=="lower",],n.by.adj = T)
+
+composite_out_lst<-list(print_pred_stratum(meas="composite_out",strat = sts[1],adj=c(sts[2],adjs_10m),
+                                           data=dta_frm,n.by.adj = T),
+                        print_pred_stratum(meas="composite_out",strat = sts[2],adj=c(sts[1],adjs_10m),
+                                           data=dta_frm,n.by.adj = T))
+
+# show(composite_out_lst)
+capture.output(show(composite_out_lst), 
+               file = paste0("composite_out_lst",today(),".txt"))
+
+```
+
+
+
--- a/dep_regression.pdf
+++ b/dep_regression.pdf
--- a/dep_regression_interaction.R
+++ b/dep_regression_interaction.R
@ -0,0 +1,162 @@
+## TALOS Depression regression analyses
+## Created: 07.apr.2022
+## New regression analyses incl anova in new tables based on gtsummary
+
+
+## =============================================================================
+## Data import and formatting
+## =============================================================================
+
+# Read the analysis dataset; both "NA" and "unknown" are read as missing.
+dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",
+                  na.strings=c("NA","unknown"))
+
+# Keep only rows with a baseline PASE score.
+dta<-dta_all[!is.na(dta_all$pase_0),]
+
+# Categorical variables with default factor coding.
+for (v in c("diabetes","pad","civil","hypertension","afli","smoke_ever",
+            "ami","tci","thrombolysis","thrombechtomy","sex")){
+  dta[[v]]<-factor(dta[[v]])
+}
+
+# Continuous variables.
+for (v in c("nihss_0","age","pase_0","pase_6","bmi","mdi_6")){
+  dta[[v]]<-as.numeric(dta[[v]])
+}
+
+# Factors that need explicit labels or an explicit level order.
+dta$rep_any<-factor(dta$rep_any,labels=c("no","yes"))
+dta$rtreat<-factor(dta$rtreat, levels=c("Placebo","Active"))
+dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))
+
+# Copy of the formatted data.
+dta_b<-dta
+
+
+library(broom)
+library(daDoctoR)
+library(lubridate)
+library(gtsummary)
+library(dplyr)
+library(gt)
+
+## =============================================================================
+## Regression setup
+## =============================================================================
+
+# Possible variables to adjust for
+# Candidate adjustment variables
+adjs<-c("age","sex","smoke_ever","civil","diabetes",
+        "hypertension","afli","pad","nihss_0","rep_any")
+
+# Predictors of interest
+preds<-c("pase_0_bin","rtreat")
+
+# Variable selection: the second element of the rep_lm() result, run against
+# mdi_6 with cut.p = .1, is kept as the adjustment set and printed.
+adjs_10<-rep_lm(meas="mdi_6",string=c(preds,adjs),data=dta,cut.p=.1)[[2]]
+print(adjs_10)
+
+## =============================================================================
+## One month regression
+## =============================================================================
+
+out<-"mdi_1_enr"
+
+# Model frame: outcome, predictors and selected adjustment variables.
+ds<-select(dta,all_of(c(out,preds,adjs_10)))
+
+# AOV
+# aov<-aov(as.formula(paste(out,".",sep = "~")), 
+#          data = ds) %>% 
+#   tidy() %>%
+#   gt()
+
+# Formulas: outcome on every column, with and without the predictor interaction.
+f_int<-as.formula(paste(out,paste0(paste(preds,collapse="*"),"+."),sep="~"))
+f_all<-as.formula(paste(out,".",sep="~"))
+
+# Multivariate model including the interaction between the predictors
+int<-lm(f_int,data=ds) %>%
+  tbl_regression() %>%
+  bold_labels() %>%
+  italicize_levels()
+
+# One single-covariate model per column
+biv<-tbl_uvregression(data=ds,y=out,method=lm) %>%
+  bold_labels() %>%
+  add_n() %>%
+  italicize_levels()
+
+# Multivariate model without interaction
+mul<-lm(f_all,data=ds) %>%
+  tbl_regression() %>%
+  bold_labels() %>%
+  add_n() %>%
+  italicize_levels()
+
+# Side-by-side table of the three models
+tbl_comb<-tbl_merge(
+  tbls=list(biv,mul,int),
+  tab_spanner=c("**Bivariate**",
+                "**Multivariate**",
+                "**Multivariate with interaction**")
+)
+
+tbl_comb
+
+# Export the merged table as RTF.
+tbl_reg_1_rtf<-file("tbl_reg_1.RTF","w")
+writeLines(as_rtf(as_gt(tbl_comb)),tbl_reg_1_rtf)
+close(tbl_reg_1_rtf)
+
+## =============================================================================
+## Six month regression
+## =============================================================================
+
+out<-"mdi_6_newobs_enr"
+
+# Model frame: outcome, predictors and selected adjustment variables.
+ds<-dta%>%select(all_of(c(out,preds,adjs_10)))
+
+# AOV
+# aov<-aov(as.formula(paste(out,".",sep = "~")), 
+#          data = ds) %>% 
+#   tidy() %>%
+#   gt()
+
+# Interaction: outcome on the predictor interaction plus all other columns
+int<-lm(as.formula(paste(out,paste0(paste(preds,collapse = "*"),"+."),sep = "~")), 
+        data = ds) %>% 
+  tbl_regression()%>%
+  # add_n%>%
+  bold_labels() %>%
+  italicize_levels()
+
+# Bivariate: one single-covariate model per column
+biv<-tbl_uvregression(data=ds,
+                      y=out,
+                      method=lm
+)  %>% 
+  bold_labels() %>%
+  add_n()%>%
+  italicize_levels()
+
+# Multivariate: outcome on all columns, no interaction
+mul<-lm(as.formula(paste(out,".",sep = "~")), 
+        data = ds) %>% 
+  tbl_regression()%>% 
+  bold_labels() %>%
+  add_n()%>%
+  italicize_levels()
+
+# Table merge
+tbl_comb<-tbl_merge(
+  tbls = list(biv, mul,int), 
+  tab_spanner = c("**Bivariate**", 
+                  "**Multivariate**",
+                  "**Multivariate with interaction**") 
+)
+
+tbl_comb
+
+# Export the merged table as RTF. The text must go to the connection opened
+# here (tbl_reg_6_rtf), not to the one-month connection, which is already
+# closed at this point.
+tbl_reg_6_rtf <- file("tbl_reg_6.RTF", "w")
+writeLines(tbl_comb%>%as_gt()%>%as_rtf(), tbl_reg_6_rtf)
+close(tbl_reg_6_rtf)
--- a/dep_tableone.Rmd
+++ b/dep_tableone.Rmd
@ -0,0 +1,118 @@
+---
+title: "Table One"
+author: "Andreas Gammelgaard Damsbo"
+date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
+output: pdf_document
+---
+# Import
+```{r}
+# Read the dataset with read.csv() defaults (no extra na.strings are set here).
+dta<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
+```
+
+## Formatting
+```{r}
+# Categorical variables are stored as factors.
+fct_vars<-c("diabetes","pad","civil","hypertension","afli","smoke_ever",
+            "ami","tci","thrombolysis","thrombechtomy","rep_any",
+            "rtreat","sex")
+dta[fct_vars]<-lapply(dta[fct_vars],factor)
+
+# Continuous variables are stored as numeric.
+num_vars<-c("nihss_0","age","pase_0","bmi","mdi_6","inc_time")
+dta[num_vars]<-lapply(dta[num_vars],as.numeric)
+```
+
+# Defining patients to include for analysis
+Only cases with a complete `pase_0` are included; the additional requirement of complete MDI at 1 & 6 months is currently commented out in the code below.
+```{r}
+# Keep rows with a non-missing baseline PASE score. The extra MDI conditions on
+# the next line are commented out and therefore not applied.
+dta<-dta[!is.na(dta$pase_0),]
+# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
+```
+
+## Defining table one stratification
+```{r}
+# Stratification variable for table one: rows without mdi_6_newobs are
+# "zExcluded"; the rest are split by pase_0_bin into "xLower" / "yHigher".
+# (The x/y/z prefixes presumably fix the alphabetical level order - confirm.)
+# dplyr is not attached until a later chunk of this document, so case_when()
+# is called through its namespace; otherwise this chunk fails when knitted.
+dta$strat_table_one<-factor(dplyr::case_when(
+  is.na(dta$mdi_6_newobs)~"zExcluded",
+  dta$pase_0_bin=="lower"~"xLower",
+  dta$pase_0_bin=="higher"~"yHigher"))
+
+# summary(dta$strat_table_one)
+```
+
+```{r}
+# plyr is attached for mapvalues(); dplyr is attached afterwards, so dplyr's
+# functions mask plyr's same-named ones from here on.
+library(plyr)
+# Included/excluded indicator: both PASE strata are mapped to "xIncluded";
+# the remaining level ("zExcluded") is left as it is.
+dta$in_ex<-mapvalues(dta$strat_table_one, from=c("xLower", "yHigher"), to=c("xIncluded","xIncluded"))
+# summary(dta$in_ex)
+library(dplyr)
+```
+
+
+# Basic analyses
+```{r}
+# Median baseline PASE, then histograms (100 breaks requested) of the raw and
+# the square-root transformed score.
+mdn<-median(dta$pase_0)
+show(mdn)
+hist(dta$pase_0,breaks=100)
+hist(sqrt(dta$pase_0),breaks=100)
+```
+
+# Table One
+```{r}
+library(tableone)
+```
+
+```{r}
+# All variables shown in the tables
+tbl_norm<-c("rtreat","age","sex","bmi","smoke_ever","civil","diabetes",
+            "hypertension", "afli", "ami", "tci","pad","nihss_0",
+            "thrombolysis", "thrombechtomy","rep_any","inc_time")
+
+# Variables passed as factorVars
+tbl_cat<-c("rtreat","sex","diabetes", "hypertension", "smoke_ever","civil",
+           "ami", "tci", "thrombolysis", "thrombechtomy","rep_any")
+
+# Variables passed as nonnormal when printing
+tbl_non<-c("age","nihss_0","inc_time")
+```
+
+```{r}
+# Overall table (no strata); the printed matrix is kept for export.
+tab1<-CreateTableOne(
+  vars=tbl_norm,
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+tbl1_1<-print(
+  tab1,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Table stratified by pase_0_bin, printed with tests.
+tab2<-CreateTableOne(
+  vars=tbl_norm,
+  strata="pase_0_bin",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+tbl1_2<-print(
+  tab2,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Table stratified by strat_table_one, printed with tests.
+tab3<-CreateTableOne(
+  vars=tbl_norm,
+  strata="strat_table_one",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+tbl1_3<-print(
+  tab3,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Table stratified by in_ex, printed with tests.
+tab4<-CreateTableOne(
+  vars=tbl_norm,
+  strata="in_ex",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+tbl1_4<-print(
+  tab4,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Counts of missing (TRUE) vs. present (FALSE) nihss_0 and bmi per stratum.
+table(is.na(dta$nihss_0),dta$strat_table_one)
+table(is.na(dta$bmi),dta$strat_table_one)
+```
+
+
+```{r}
+# Remove the excluded stratum, discard its now-empty level, and tabulate the
+# remaining strata against each other.
+dta<-dta[dta$strat_table_one!="zExcluded",]
+dta$strat_table_one<-droplevels(dta$strat_table_one)
+tab5<-CreateTableOne(
+  vars=tbl_norm,
+  strata="strat_table_one",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+tbl1_5<-print(
+  tab5,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+library(lubridate)
+# Write each printed table to tbl1_<i>_<date>.csv.
+tbl_list<-list(tbl1_1,tbl1_2,tbl1_3,tbl1_4,tbl1_5)
+# today() yields the same YYYY-MM-DD stamp as splitting now() on the space,
+# and is evaluated once so all files share one date.
+stamp<-today()
+for (i in seq_along(tbl_list)){
+  nm<-paste0("tbl1_",i)
+  write.csv(tbl_list[[i]],paste0("/Volumes/Data/depression/",nm,"_",stamp,".csv"))
+}
+```
--- a/dep_tableone.pdf
+++ b/dep_tableone.pdf
--- a/dep_tableone_enriched.Rmd
+++ b/dep_tableone_enriched.Rmd
@ -0,0 +1,153 @@
+---
+title: "Table One - enriched"
+author: "Andreas Gammelgaard Damsbo"
+date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE)
+```
+
+
+# Import
+```{r}
+# Read the dataset with read.csv() defaults (no extra na.strings are set here).
+dta<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
+```
+
+## Formatting
+```{r}
+# Categorical variables are stored as factors.
+fct_vars<-c("diabetes","pad","civil","hypertension","afli","smoke_ever",
+            "ami","tci","thrombolysis","thrombechtomy","rep_any",
+            "rtreat","sex")
+dta[fct_vars]<-lapply(dta[fct_vars],factor)
+
+# Continuous variables are stored as numeric.
+num_vars<-c("nihss_0","age","pase_0","bmi","mdi_6","inc_time")
+dta[num_vars]<-lapply(dta[num_vars],as.numeric)
+
+# Missingness indicators for bmi and nihss_0.
+dta$bmi_isna<-is.na(dta$bmi)
+dta$nihss_0_isna<-is.na(dta$nihss_0)
+```
+
+# Defining patients to include for analysis
+```{r  message=FALSE}
+library(dplyr)
+```
+
+
+Only cases with a complete `pase_0` are included; the additional requirement of complete MDI at 1 & 6 months is currently commented out in the code below.
+```{r}
+# Keep rows with a non-missing baseline PASE score. The extra MDI conditions on
+# the next line are commented out and therefore not applied.
+dta<-dta[!is.na(dta$pase_0),]
+# &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
+```
+
+## Defining table one stratification
+```{r}
+# Stratification variable for table one: rows whose excluded_6 is "mi_6" or
+# "ex_6" are "zExcluded"; the rest are split by pase_0_bin.
+is_excluded<-dta$excluded_6%in%c("mi_6","ex_6")
+strat_chr<-case_when(
+  is_excluded ~ "zExcluded",
+  dta$pase_0_bin=="lower" ~ "xLower",
+  dta$pase_0_bin=="higher" ~ "yHigher"
+)
+dta$strat_table_one<-factor(strat_chr)
+
+summary(dta$strat_table_one)
+```
+
+```{r}
+# Included/excluded indicator: both PASE strata are mapped to "xIncluded".
+pase_strata<-c("xLower", "yHigher")
+dta$in_ex<-plyr::mapvalues(dta$strat_table_one,
+                           from=pase_strata,
+                           to=rep("xIncluded",2))
+# summary(dta$in_ex)
+```
+
+
+# Basic analyses
+```{r}
+# Median baseline PASE, then histograms (100 breaks requested) of the raw and
+# the square-root transformed score.
+mdn<-median(dta$pase_0)
+show(mdn)
+hist(dta$pase_0,breaks=100)
+hist(sqrt(dta$pase_0),breaks=100)
+```
+
+# Table One
+```{r}
+library(tableone)
+```
+
+```{r}
+# All variables shown in the tables, including the missingness indicators
+tbl_norm<-c("rtreat","age","sex",
+            "bmi","bmi_isna",
+            "smoke_ever","civil","diabetes","hypertension",
+            "afli", "ami", "tci","pad",
+            "nihss_0","nihss_0_isna",
+            "thrombolysis", "thrombechtomy","rep_any","inc_time")
+
+# Variables passed as factorVars
+tbl_cat<-c("rtreat","sex","bmi_isna",
+           "diabetes", "hypertension", "smoke_ever","civil",
+           "ami", "tci","nihss_0_isna",
+           "thrombolysis", "thrombechtomy","rep_any")
+
+# Variables passed as nonnormal when printing
+tbl_non<-c("age","nihss_0","inc_time")
+```
+
+```{r}
+# Overall table (no strata); the printed matrix is kept for export.
+tab1<-CreateTableOne(
+  vars=tbl_norm,
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+
+tbl1_1<-print(
+  tab1,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Table stratified by pase_0_bin, printed with tests.
+tab2<-CreateTableOne(
+  vars=tbl_norm,
+  strata="pase_0_bin",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+
+tbl1_2<-print(
+  tab2,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Table stratified by strat_table_one, printed with tests.
+tab3<-CreateTableOne(
+  vars=tbl_norm,
+  strata="strat_table_one",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+tbl1_3<-print(
+  tab3,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Table stratified by in_ex, printed with tests.
+tab4<-CreateTableOne(
+  vars=tbl_norm,
+  strata="in_ex",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+
+tbl1_4<-print(
+  tab4,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Counts of missing (TRUE) vs. present (FALSE) nihss_0 and bmi per stratum.
+table(is.na(dta$nihss_0),dta$strat_table_one)
+table(is.na(dta$bmi),dta$strat_table_one)
+```
+
+
+```{r}
+# Remove the excluded stratum, discard its now-empty level, and tabulate the
+# remaining strata against each other.
+dta<-dta[dta$strat_table_one!="zExcluded",]
+dta$strat_table_one<-droplevels(dta$strat_table_one)
+tab5<-CreateTableOne(
+  vars=tbl_norm,
+  strata="strat_table_one",
+  data=dta,
+  factorVars=tbl_cat,
+  includeNA=TRUE
+)
+
+tbl1_5<-print(
+  tab5,
+  contDigits=1, missing=TRUE, showAllLevels=TRUE,
+  nonnormal=tbl_non, smd=FALSE, test=TRUE, quote=FALSE, noSpaces=TRUE
+)
+```
+
+```{r}
+# Write each printed table to tbl1_<i>_enr_<date>.csv.
+tbl_list<-list(tbl1_1,tbl1_2,tbl1_3,tbl1_4,tbl1_5)
+# The date is taken once so that all files of one run share the same stamp.
+stamp<-lubridate::today()
+for (i in seq_along(tbl_list)){
+  nm<-paste0("tbl1_",i)
+  write.csv(tbl_list[[i]],
+            paste0("/Volumes/Data/depression/",nm,"_enr_",
+                   stamp,".csv"))
+}
+```
--- a/dep_tableone_enriched.pdf
+++ b/dep_tableone_enriched.pdf
--- a/function_chi_test_sum.R
+++ b/function_chi_test_sum.R
@ -0,0 +1,10 @@
+# Chi-squared test of independence for two categorical vectors.
+#
+# a, b          vectors (or factors) of equal length to cross-tabulate.
+# aname, bname  optional labels used in the name of the first list element.
+#
+# Returns a list of two elements: the observed contingency table as a plain
+# matrix (rows = categories of a, columns = categories of b), named
+# "observed, <aname> vs <bname>", and the p-value of the test, named "pval^".
+chi_test_sum<-function(a,b,aname=NULL,bname=NULL){
+  chi<-chisq.test(table(a,b))
+  
+  # The matrix is sized and labelled from the table itself rather than assuming
+  # two rows, so variables with any number of categories are handled.
+  obs<-matrix(chi$observed,nrow=nrow(chi$observed),byrow = FALSE,
+              dimnames = unname(dimnames(chi$observed)))
+  
+  lst<-list(obs,chi$p.value)
+  names(lst)<-c(paste("observed,",aname,"vs",bname),"pval^")
+  return(lst)
+}
+
+
+       
--- a/function_flow.R
+++ b/function_flow.R
@ -0,0 +1,54 @@
+flow_prog<-function(df,sngl,sngl_keep,mltp=NULL){
+  # Counts for a patient flow chart.
+  #
+  # df is the data frame
+  # sngl is a vector of variable names to exclude from the data frame one by one, prioritised
+  # sngl_keep is a vector of the same length as sngl holding, for each variable, the value of the cases to keep for the next step
+  # mltp is a vector of variable names to count, can be one or more
+  # first sngl, then mltp
+  #
+  # all specified variables should be factors or "factorisable"
+  #
+  # Returns a named list: the number of rows of df, then one one-column data
+  # frame of level counts per sngl variable (counted before that variable's
+  # filter is applied), then one per mltp variable (counted on the rows left
+  # after all sngl filters).
+  #
+  # If categorical variables are decided, define a new variable. More elegant solution is desirable, but this works (!).
+  
+  # Only base R is used, so no package is attached as a side effect of the call.
+  stopifnot(length(sngl)==length(sngl_keep))
+  
+  # Level counts of a vector as a one-element list holding a one-column data
+  # frame. Counts come from table() so that missing values get their own row
+  # ("NA's") instead of breaking the matrix dimensions.
+  summary_list<-function(var){
+    counts<-table(factor(var),useNA = "ifany")
+    lbl<-names(counts)
+    lbl[is.na(lbl)]<-"NA's"
+    lst<-list(data.frame(matrix(as.vector(counts),ncol=1,dimnames = list(lbl,deparse(substitute(var))[1]))))
+    return(lst)
+  }
+  
+  slist<-list(nrow(df))
+  names(slist)[1]<-"nrow of provided data frame"
+
+  # Handling sngl first: count, then keep only the rows equal to sngl_keep[i]
+  # (rows where the variable is missing are dropped).
+  for (i in seq_along(sngl)){
+    z<-summary_list(df[[sngl[i]]])
+    colnames(z[[1]])<-sngl[i]
+    slist[length(slist)+1] <- z
+    names(slist)[length(slist)]<-sngl[i]
+    df <- df[which(df[[sngl[i]]]==sngl_keep[i]),,drop=FALSE]
+  }
+  
+  # Handling mltp: counts on the remaining rows, named after the last filter
+  
+  if (!is.null(mltp)){
+    for (i in seq_along(mltp)){
+      z<-summary_list(df[[mltp[i]]])
+      colnames(z[[1]])<-mltp[i]
+      slist[length(slist)+1] <- z
+      names(slist)[length(slist)]<-paste0(mltp[i],"_for_",sngl[length(sngl)],"==",sngl_keep[length(sngl_keep)])
+    }
+  }
+  
+  return(slist)
+}
+
+
+# NOTE(review): these assignments run every time this file is sourced and
+# require an object `dta` to exist. They look like leftover interactive inputs
+# for trying out flow_prog() - confirm whether they should live here.
+df<-dta
+sngl<-c("mors_v1","drop")
+sngl_keep<-c("no","yes")
+mltp<-c("open_treat","wants_out","side_effect","side_effect2")
--- a/plot.png
+++ b/plot.png
--- a/script_distribution_plot.R
+++ b/script_distribution_plot.R
@ -0,0 +1,32 @@
+# Distribution plot
+library(ggplot2)
+library(cowplot)
+
+# Upper panel: density-scaled histogram of baseline PASE with a kernel density
+# curve and a vertical line at the median.
+pase_hist<-ggplot(data=dta, aes(pase_0)) + 
+  geom_histogram(binwidth = 10, aes(y =..density..), 
+                 col="orange", 
+                 fill="orange", 
+                 alpha=.2) + 
+  geom_density(col="tomato") + 
+  # na.rm so that missing scores do not turn the median line into NA
+  geom_vline(xintercept=median(dta$pase_0,na.rm=TRUE),col="navy") +
+  scale_x_continuous(limits=c(0,600) ,breaks=seq(0,600,by=50)) +
+  # the y axis shows density (see aes above), so it is labelled accordingly
+  labs(title="Baseline PASE distribution", x="PASE score", y="Density") +
+  theme(axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank()
+  )
+
+# Lower panel: horizontal box plot of the same score with jittered points,
+# sharing the x scale of the histogram.
+pase_box<-ggplot(data=dta, aes(pase_0,y=1)) + 
+  geom_boxplot(col="olivedrab", 
+               fill="olivedrab", 
+               alpha=.2) +
+  geom_jitter(col="maroon", alpha=0.3) +
+  scale_x_continuous(limits=c(0,600) , breaks=seq(0,600,by=50)) +
+  labs(title=NULL, x="PASE score", y="Count") +
+  theme(axis.text.y=element_blank(),
+        axis.ticks.y=element_blank()
+  )
+
+# Both panels stacked in one column, vertically aligned, 70/30 height split.
+pg<-cowplot::plot_grid(pase_hist, pase_box, align = "v", ncol = 1, rel_heights = c(0.7, 0.3))
--- a/script_flowchart.R
+++ b/script_flowchart.R
@ -0,0 +1,21 @@
+# Flowchart
+
+# Data set defined in dep_regression.Rmd
+# Level counts of a vector, returned as a one-element list holding a
+# one-column data frame (row names = levels, column name derived from the
+# expression passed as `var`).
+#
+# Counts are taken with table(useNA = "ifany"): missing values get their own
+# row labelled "NA's". Building the matrix from summary() with one row per
+# level fails when NAs are present, because summary() appends an extra count.
+summary_list<-function(var){
+  counts<-table(factor(var),useNA = "ifany")
+  lbl<-names(counts)
+  lbl[is.na(lbl)]<-"NA's"
+  # deparse and substitute is used to get the name of the vector for naming the matrix
+  lst<-list(data.frame(matrix(as.vector(counts),ncol=1,dimnames = list(lbl,deparse(substitute(var))[1]))))
+  return(lst)
+}
+
+# Counts of `drop` on the full data set.
+slist<-summary_list(dta$drop)
+
+# Restrict to drop == "yes" (this overwrites dta). which() leaves out rows
+# where `drop` is missing instead of turning them into all-NA rows.
+dta<-dta[which(dta$drop=="yes"),]
+
+# Counts of each listed variable among the remaining rows, appended to slist.
+vars_l<-c("open_treat","wants_out","side_effect","side_effect2")
+for (i in seq_along(vars_l)){
+  z<-summary_list(dta[,vars_l[i]])
+  colnames(z[[1]])<-vars_l[i]
+  slist[length(slist)+1] <- z
+}
--- a/script_regression_frame.R
+++ b/script_regression_frame.R
@ -0,0 +1,56 @@
+# Builds `reg_frm`, a summary table for one outcome, from objects that must
+# already exist in the calling environment: dta, outs, sts, adjs_10m and the
+# helpers print_diff_bygroup() and print_pred_stratum().
+# Snapshot of the object names present before the script runs; passed to
+# remove_all_but() at the end together with "reg_frm".
+envir_b<-ls()
+## Regression frame
+# Only the first column is renamed to "outs" and every call below uses
+# meas="outs", so the script effectively handles a single outcome.
+dta_frm<-dta[,c(outs,sts,adjs_10m)]
+colnames(dta_frm)[1]<-"outs"
+
+lst<-list()
+## 'rtreat' horisontal, 'pase_0_bin' vertical
+for (i in 1:length(outs)){
+  # One print_diff_bygroup() result per stratifier, with the other stratifier
+  # as `var`; the list element name records which one is vertical/horizontal.
+  for (j in 1:length(sts)){
+    ls_22<-list(print_diff_bygroup(meas="outs",group=sts[j],var=sts[length(sts)+1-j],adj=adjs_10m,data=dta_frm))
+    names(ls_22)<-paste0(outs[i],"_",sts[j],"_ver_",sts[length(sts)+1-j],"_hor")
+    lst<-append(lst,ls_22)
+  }
+  # First element of the print_pred_stratum() result, stratified by sts[2].
+  lst<-append(lst,print_pred_stratum(meas="outs",adj=c(sts[1],adjs_10m),strat=sts[2],data=dta_frm,include.stratum = T)[1])
+}
+
+# The cells below are picked by position from the helper output.
+# NOTE(review): the row/column meanings depend on the layout returned by
+# print_diff_bygroup() / print_pred_stratum() - confirm if those change.
+# outs (outcome), Active
+oadr<-lst[[2]][c(6,8)][2,1] # LvsH, Diff raw
+oada<-lst[[2]][c(6,8)][2,2] # LvsH, Diff adj
+#oada_1<-oada
+
+# outs, Placebo
+opdr<-lst[[2]][c(6,8)][3,1] # LvsH, Diff raw
+opda<-lst[[2]][c(6,8)][3,2] # LvsH, Diff adj
+#opda_1<-opda
+
+# Empty fillers for the two extra rows appended below.
+spc<-""
+spc4<-c("","","","")
+
+# First table plus two rows holding the unadjusted and adjusted differences.
+mdi1_tbl<-rbind(lst[[1]],matrix(c("Unadjusted mean diff.",spc,oadr,spc,opdr,spc4,
+                                  "Adjusted mean diff.",spc,oada,spc,opda,spc4),ncol=ncol(lst[[1]]),byrow=T,dimnames = list(c("a","b"),names(lst[[1]]))))
+
+# write.csv(mdi1_tbl,"mdi1_2x2.csv")
+
+# Final layout: column 1, an empty column, columns 2-6 and column 8 of mdi1_tbl.
+reg_frm<-cbind(mdi1_tbl[,1],"",mdi1_tbl[,2:6],mdi1_tbl[,8])
+names(reg_frm)<-c("By_PA", "Rand_Total", "N_Active", "Mean_Active", "N_Placebo", "Mean_Placebo","Unadjusted_mean_diff", "Adjusted_mean_diff")
+# Row 1: overall "mean (sd)" of the outcome, then N and "mean (sd)" per arm.
+reg_frm[1,1]<-"PASE_total"
+reg_frm[1,2]<-paste0(round(mean(dta_frm$outs, na.rm = TRUE), 1)," (",round(sd(dta_frm$outs, na.rm = TRUE), 1),")")
+reg_frm[1,3]<-nrow(dta_frm[dta_frm$rtreat=="Active"&!is.na(dta_frm$outs),])
+reg_frm[1,4]<-paste0(round(mean(dta_frm$outs[dta_frm$rtreat=="Active"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$rtreat=="Active"], na.rm = TRUE), 1),")")
+reg_frm[1,5]<-nrow(dta_frm[dta_frm$rtreat=="Placebo"&!is.na(dta_frm$outs),])
+reg_frm[1,6]<-paste0(round(mean(dta_frm$outs[dta_frm$rtreat=="Placebo"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$rtreat=="Placebo"], na.rm = TRUE), 1),")")
+
+## A universal solution would be great here, so that the rows are oriented correctly relative to the vector
+# Rows 2 and 3, column 2: "mean (sd)" of the outcome for lower / higher PASE.
+reg_frm[2,2]<-paste0(round(mean(dta_frm$outs[dta_frm$pase_0_bin=="lower"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$pase_0_bin=="lower"], na.rm = TRUE), 1),")")
+reg_frm[3,2]<-paste0(round(mean(dta_frm$outs[dta_frm$pase_0_bin=="higher"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$pase_0_bin=="higher"], na.rm = TRUE), 1),")")
+
+# Remaining cells are copied by position from the print_pred_stratum() output.
+reg_frm[4,2]<-lst[[3]][[1]][7,3]
+reg_frm[5,2]<-lst[[3]][[1]][7,4]
+
+reg_frm[1,7]<-lst[[3]][[1]][4,3]
+reg_frm[1,8]<-lst[[3]][[1]][4,4]
+
+source("/Volumes/Data/func/remove_all_but.R")
+
+# Presumably removes every object except reg_frm and those listed in envir_b -
+# verify against remove_all_but.R.
+remove_all_but(c("reg_frm",envir_b))
--- a/talos-pa-depression.Rproj
+++ b/talos-pa-depression.Rproj
@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
--- a/tbl_reg_1.RTF
+++ b/tbl_reg_1.RTF
--- a/tbl_reg_6.RTF
+++ b/tbl_reg_6.RTF