first commit of old data

2022-08-01 13:57:03 +02:00 · 2022-08-01 13:57:03 +02:00 · 2d7f0fc186
commit 2d7f0fc186
parent edb0afd869
26 changed files with 3805 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/PASE:rand.docx
+++ b/PASE:rand.docx
--- a/dep_data.Rmd
+++ b/dep_data.Rmd
@ -0,0 +1,304 @@
 ---
 title: "Data export and wrangling"
 author: "Andreas Gammelgaard Damsbo"
 date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
 output:
  pdf_document: default
  html_document: default
 toc: TRUE
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE, message = FALSE)
 ```
 ```{r message=FALSE}
 library(haven)
 library(plyr)
 library(dplyr)
 library(reshape2)
 ```
 ```{r}
 dta<-read.csv("/Volumes/Data/exercise/source/background.csv",
              na.strings = c("NA",""),colClasses = "character")
 # dta_b<-dta
 ```
 # Variables
 List of variables included in dataset
 ```{r}
 dput(names(dta))
 ```
 ## New additions and formatting
 ```{r}
 dta$mors_delay<-difftime(as.Date(dta$mors_d),as.Date(dta$rdate),units = "days")
 dta$mors_v1<-factor(ifelse(dta$mors_delay<=38&
                             (dta$mors_delay-as.numeric(dta$inc_time))<=1,
                           "yes","no")) 
 # Tæller som død hvis død inden 38 dage og dødsdato og EOS ligger indenfor 1 døgn.
 dta$mors_v1[is.na(dta$mors_v1)]<-"no"
 ```
 ```{r}
 dta$mors_v16<-factor(ifelse(dta$mors_v1=="no"&
                              (dta$mors_delay-as.numeric(dta$inc_time))<=1,
                            "yes","no"))
 # Tæller som død mellem 1 til 6 mdr, hvis ikke død inden 1 mdr, 
 # og dødsdato og EOS ligger indenfor 1 døgn.
 dta$mors_v16[is.na(dta$mors_v16)]<-"no"
 ```
 PASE score dichotomisation at median score.
 ```{r}
 dta$pase_0<-as.numeric(dta$pase_0)
 dta$pase_0_bin<-cut(dta$pase_0,
                    c(min(dta$pase_0,na.rm = T),median(dta$pase_0,na.rm = T),
                      max(dta$pase_0,na.rm = T)),include.lowest = T,
                    labels = c("lower","higher"))
 quantile(dta$pase_0,na.rm = T)
 ```
 ### Formatting
 ```{r}
 dta$inc_time<-as.numeric(dta$inc_time)
 ```
 # Cleaning MDI scores
 The following contains a serious bit of data wrangling. Reasons are the occasional recording of visit 1 data at 6 months due to LOCF approach. Additionally some patients have data recorded at 6 months, but later end date has been defined as prior to the visit 6.
 Additionally the definition of when to define a MDI recording as 1 month or 6 months have added a bit of extra work..
 This work should be applied for all endpoint data. If needed, a general script or function should be written.
 Steps used for the correction:
 1.   If the inc_time is 38 days or less MDI 6 scores are moved to MDI 1 and visit 6 is defined as visit 1.
 2.   If both visit 1 and 6 dates are NA, use enddate as visit 1 date. This is the case if patients were excluded early.
 3.   If visit 6 is recorded later than enddate, use enddate instead.  MDI 6 score is dropped.
 4.   If visit delay is 7 days or less, and inclusion time is more than 38, MDI 1 is moved to MDI 6 and dropped. If MDI 1 and 6 are different both are kept. Enddate is moved to visit 6 date.
 5.   Defining the visit 6 date as same as enddate if visit delay is <7.
 ```{r}
 summary(inc196<-dta$inc_time>196)
 dt1<-dta[inc196,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6","mors_delay")]
 ```
 ## Step 1
 ```{r}
 summary(inc38<-dta$inc_time<=38)
 dt1<-dta[inc38,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6")]
 dta$visit_1<-ifelse(inc38&!is.na(dta$visit_6),dta$visit_6,dta$visit_1)
 dta$mdi_1<-ifelse(inc38&is.na(dta$mdi_1),dta$mdi_6,dta$mdi_1)
 dta$mdi_6[inc38]<-NA
 dta$visit_6[inc38]<-NA
 # If the inc_time is 38 days or less MDI 6 scores are moved to MDI 1 and visit 6 is defined as visit 1.
 # LOCF correction.
 ```
 ## Step 2
 ```{r}
 summary(na16enddate<-is.na(dta$visit_1)&is.na(dta$visit_6))
 dt2<-dta[na16enddate,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6")]
 dta$visit_1<-ifelse(na16enddate,dta$enddate,dta$visit_1)
 # If both visit 1 and 6 dates are NA, use enddate as visit 1 date. This is the case if patients were excluded early.
 ```
 ## Step 3
 ```{r}
 summary(late61<-as.Date(dta$visit_6)>as.Date(dta$enddate)&difftime(as.Date(dta$visit_6),as.Date(dta$enddate),units = "days")<=1)
 summary(late62<-as.Date(dta$visit_6)>as.Date(dta$enddate)&difftime(as.Date(dta$visit_6),as.Date(dta$enddate),units = "days")>1)
 late61[is.na(late61)]<-FALSE
 late62[is.na(late62)]<-FALSE
 # dt5<-dta[late61,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6")]
 # dt6<-dta[late62,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6")]
 dta$visit_6<-ifelse(late61,dta$enddate,dta$visit_6)
 dta$visit_6<-ifelse(late62,dta$enddate,dta$visit_6)
 dta$mdi_6[late62]<-NA
 # If visit 6 is recorded later than enddate, use enddate instead
 # A group of patients have visit 6 and MDI 6 recorded, but enddate is before visit 6 data. 
 # After manual lookups, this is likely due to some patients coming for visit 6, but the 
 # interviewer later realizing, that the patients should have been excluded earlier on.
 # Due to this, patients with enddate more than 1 day (leaving room for simple recording errors) prior to visit 6 date, MDI 6 will be dropped.
 # Patients with 1 day difference the enddate is moved to visit 6 date.
 ```
 ## Step 4
 ```{r}
 summary(locflate<-(difftime(as.Date(dta$visit_6),as.Date(dta$visit_1))<=7|is.na(dta$visit_1))&dta$inc_time>38)
 locflate[is.na(locflate)]<-FALSE
 dt2<-dta[locflate,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6")]
 dta$mdi_6<-ifelse(locflate&is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6)
 dta$mdi_1[locflate&dta$mdi_1==dta$mdi_6]<-NA
 dta$visit_1[locflate&is.na(dta$mdi_1)]<-NA
 dta$visit_6<-ifelse(locflate,dta$enddate,dta$visit_6)
 # If visit delay is 7 days or less, and inclusion time is more than 38, MDI 1 is moved to MDI 6 and dropped. If MDI 1 and 6 are different both are kept. Enddate is moved to visit 6 date.
 ```
 ## Step 5
 ```{r}
 summary(same1n6date<-difftime(as.Date(dta$visit_6),as.Date(dta$visit_1),units = "days")<7)
 same1n6date[is.na(same1n6date)]<-FALSE
 # dt5<-dta[same1n6date,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6",drops)]
 dta$visit_6<-ifelse(same1n6date,dta$enddate,dta$visit_6)
 # Defining the visit 6 date as same as enddate if visit delay is <7.
 ```
 # Visit delay
 ```{r}
 dta$visit_delay<-difftime(as.Date(dta$visit_6),as.Date(dta$visit_1),units = "days")
 # Final calculation of days between visits
 summary(as.numeric(dta$visit_delay))
 ```
 # newobs definition - DEPRECATED
 The definition of a truly new observation is a recorded score at least 7 days after the first score. This was relevant prior to the work of redefining time points for scoring.
 ```{r}
 dta$mdi_6_newobs<-dta$mdi_6
 # The newobs variable is later used, but is obsolete due to the previous change in definitions. The previous definition of newobs were a delay between visits of at least 7 days. No cases matched. Minimum is 13.
 ```
 ```{r}
 # dta$mdi_6_166<-ifelse(dta$inc_time>166,NA,dta$mdi_6)
 # dta$mdi_6_80<-ifelse(dta$inc_time>80,NA,dta$mdi_6)
 # dta$mdi_6_protocol<-ifelse(dta$protocol=="2",NA,ifelse(is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6))
 # dta$mdi_6_locf<-ifelse(is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6)
 ```
 # Drops
 Streamlining drop out data to avoid NA's.
 ```{r}
 drops<-c("side_effect2","side_effect","wants_out","open_treat")
 for (i in drops) {
  dta[i]<-ifelse(dta[i]=="1. Ja","yes","no")
  dta[i][is.na(dta[i])]<-"no"
 }
 ```
 Defining a common all cause drop variable
 ```{r}
 dta$drop<-ifelse(dta$side_effect2=="yes"|dta$side_effect=="yes"|dta$wants_out=="yes"|dta$open_treat=="yes","yes","no")
 ```
 Defining drop before or at day 38 (Following protocol design) as drop before 1 month and drop after day 38 as drop between 1 and 6 months
 ```{r}
 cut_line<-38
 dta$inc_time<-as.numeric(dta$inc_time)
 dta$drop1<-ifelse(dta$drop=="yes"&dta$inc_time<=cut_line,"yes","no")
 summary(factor(dta$drop1))
 # dt3<-dta[,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6","mdi_6_newobs","drop1","drop16",drops)]
 dta$drop16<-ifelse(dta$drop=="yes"&dta$inc_time>cut_line,"yes","no")
 summary(factor(dta$drop16))
 summary(factor(dta$drop))
 # dtf<-dta[dta$drop1=="yes",c("mdi_6_newobs","inc_time")]
 # dtf<-dta[,c("mdi_1","mdi_6_newobs","inc_time","drop","drop1","drop16")]
 ```
 # Enriching 
 With patients excluded due to open treatment need and defining populations to include/exclude
 ```{r}
 summary(sel_enr_1<-dta$open_treat=="yes"&is.na(dta$mdi_1)&dta$drop1=="yes")
 dta$mdi_1_enr<-ifelse(sel_enr_1,21,dta$mdi_1)  # Per agreement, patients excluded due to open treatment need a given the fictive MDI score "21".
 ```
 Vectorising ex/inclusions at 1 month, to keep patients with data or with later data.
 ```{r}
 summary(dta$excluded_1<-factor(case_when(dta$mors_v1=="yes"|
                                           is.na(dta$mdi_1_enr)&
                                           dta$drop1=="yes"~"ex_1", # Excluded
                         is.na(dta$mdi_1_enr)&!is.na(dta$mdi_6_newobs)~"ca_1", # Missing, but carried to 6 months
                         is.na(dta$mdi_1_enr)~"mi_1", # Missing,
                         is.na(dta$mdi_1)&!is.na(dta$mdi_1_enr)~"en_1",
                         TRUE ~ "dt_1"))) # Data available
 ```
 ```{r}
 summary(sel_enr_6<-dta$open_treat=="yes"&dta$drop16=="yes"&is.na(dta$mdi_6_newobs)&dta$excluded_1%in%c("ca_1","dt_1")&dta$mors_v16=="no")
 # Entries to be enriched are entries with need for open treatment after 1 month, with missing mdi_6_newobs and with data at or "carried" from 1 month 
 dta$mdi_6_newobs_enr<-as.numeric(ifelse(sel_enr_6,21,dta$mdi_6_newobs))  # Per agreement, patients excluded due to open treatment need a given the fictive MDI score "21".
 ```
 ```{r}
 summary(dta$excluded_6<-factor(case_when(is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1","en_1")~"ex_6", # Excluded due to death or other dropout
                                         is.na(dta$mdi_6_newobs_enr)~"mi_6", # Missing data due to exclusion at 1 month
                                         is.na(dta$mdi_6_newobs)&!is.na(dta$mdi_6_newobs_enr)~"en_6", # Enriched entries
                                         dta$excluded_1%in%c("ca_1","dt_1")~"dt_6" # Organic data available
                                         ))) # Data available
 ```
 ```{r}
 # dtf<-cbind(dta[,c("rnumb","mdi_1","mdi_6_newobs","inc_time","drop","drop1","drop16","mdi_1_enr","mdi_6_newobs_enr","excluded_1","excluded_6","mors_v16","mors_delay")],"excluded"=is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1","en_1"))
 # 
 # summary(dtf %>% filter(excluded==TRUE))
 ```
 # Main Dataset export
 ```{r}
 variable_namebits<-c("rnumb","rtreat","age","sex",
               "bmi",
               "smoke_ever",
               "civil",
               "diabetes", 
               "hypertension",
               "pad", 
               "afli",  
               "ami", 
               "tci",
               "nihss_0",
               "thrombolysis", 
               "thrombechtomy",
               "rep_any",
               "pase_0",
               "pase_6",
               "mrs_0","mrs_1","mrs_6",
 #               "who5_score",
               "mdi",
 #               "ham_score_1","ham_score_6",
               "mors",
               "drop",
               "wants_out",
               "side_effect",
               "open_treat",
               "side_effect2",
               "excluded",
               "protocol","eos_early","inc_time",
               "rdate","visit","enddate"
               )
 ```
 ```{r}
 export<-dta %>% select(contains(variable_namebits))
 ```
 ```{r}
 write.csv(export,"/Volumes/Data/depression/dep_dataset.csv",row.names = FALSE)
 ```
--- a/dep_data.pdf
+++ b/dep_data.pdf
--- a/dep_flow.Rmd
+++ b/dep_flow.Rmd
@ -0,0 +1,164 @@
 ---
 title: "Patient flowchart and chi^2 tests"
 author: "Andreas Gammelgaard Damsbo"
 date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
 output: pdf_document
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE, message = FALSE)
 ```
 # Import
 ```{r}
 dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
 ```
 # Defining patients to include for analysis
 Only including cases with complete pase_0 and MDI at 1 & 6 months
 ```{r}
 dta<-dta_all[!is.na(dta_all$pase_0),]
 # &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
 ```
 # Backup
 ```{r}
 dta_b<-dta
 ```
 # Sammentællinger
 ```{r}
 summary(cbind(is.na(dta_all[,c("pase_0","mdi_1","mdi_1_enr","mdi_6_newobs","mdi_6_newobs_enr")]),
      both_missing=is.na(dta$mdi_1)&is.na(dta$mdi_6_newobs),
      either_missing=is.na(dta$mdi_1)|is.na(dta$mdi_6_newobs)))
 ```
 ```{r}
 suppressWarnings(summary(cbind(all_particip=dta_all$mors_180=="yes",
              all_pase0=dta$mors_180=="yes",
              all_mdi_1=!is.na(dta$mdi_1)&dta$mors_180=="yes"))) # Antal der dør
 table(dta$pase_0_bin,factor(dta$mors_180)) # Antal der dør, stratificeret efter PASE gruppe
 ```
 ```{r}
 # summary(factor(dta$mors_v1))
 # summary(factor(dta$mors_v16)) # OBS medregnet er 2 dødsfald, der ikke har MDI 1.
 ```
 # Flow
 ## 1 month
 Shows counts of all patients withs missing MDI 1 scores.
 ```{r message=FALSE}
 source("/Volumes/Data/depression/function_flow.R") # Home made flow function
 show(flow_prog(df=dta[dta$excluded_1%in%c("ex_1","mi_1","ca_1"),],
          sngl=c("mors_v1","drop1"),
          sngl_keep=c("no","yes"),
          mltp=c("open_treat","wants_out","side_effect","side_effect2")))
 # v1<-dta$rnumb[dta$excluded_1%in%c("ex_1","mi_1")]
 ```
 Same overview, but vectorised
 ```{r}
 summary(factor(dta$excluded_1))
 # dt_1 are organic data, en_1 are enriched, ex_1 are excluded, mi_1 were missing, 
 # ca_1 were missing at 1 month, but held data at 6 months, and thus carried along.
 ```
 ## 6 months
 Shows counts of all patients withs missing MDI 6 scores.
 ```{r}
 show(flow_prog(df=dta[is.na(dta$mdi_6_newobs_enr)&dta$excluded_6%in%c("ex_6"),], # 
          sngl=c("mors_v16","drop16"),
          sngl_keep=c("no","yes"),
          mltp=c("open_treat","wants_out","side_effect","side_effect2")))
 # v2<-dta$rnumb[is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1")]
 ```
 ```{r}
 summary(factor(dta$excluded_6))
 # dt_6 are organic data, en_6 are enriched, ex_6 are excluded, mi_6 were excluded at 1 month.
 # At 6 month 118 are excluded due to any cause
 # Due to later inclusion of ca_1 patients, the sum of patients excluded at 6 months is 71+62-16=117
 ```
 This flow counts all patients dying or dropping out early after 1 month. Some have a recorded MDI at dropout. This is just to give a perspective on data.
 ```{r}
 show(flow_prog(df=dta, # 
          sngl=c("mors_v16","drop16"),
          sngl_keep=c("no","yes"),
          mltp=c("open_treat","wants_out","side_effect","side_effect2")))
 # v2<-dta$rnumb[is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1")]
 ```
 ```{r}
 summary(as.numeric(dta$inc_time[dta$drop16=="yes"]))
 ```
 # Chi^2 tests
 ```{r}
 source("/Volumes/Data/depression/function_chi_test_sum.R")
 ex_lst<-list()
 ex_var<-c("mdi_1_enr","rtreat","pase_0_bin")
 for (i in 2:3){
 ex_lst <- append(ex_lst,chi_test_sum(a=is.na(dta[,ex_var[1]]),
                                      b=dta[,ex_var[i]],
                                      aname=ex_var[1],
                                      bname=ex_var[i]))
 }
 ```
 ```{r}
 # ex_var<-c("open_treat","rtreat","pase_0_bin")
 # for (i in 2:3){
 #  ex_lst <- append(ex_lst,
 #                   chi_test_sum(a=dta[,ex_var[1]],
 #                                b=dta[,ex_var[i]],
 #                                aname=ex_var[1],
 #                                bname=ex_var[i]))
 # }
 ## Taget ud grundet for lave tal
 ```
 ```{r}
 ex_var<-c("mdi_1_enr","rtreat","pase_0_bin")
 for (i in 2:3){
 ex_lst <- append(ex_lst,
                  chi_test_sum(a=is.na(dta$mdi_6_newobs_enr)&!is.na(dta$mdi_1_enr),
                                      b=dta[,ex_var[i]],
                                      aname="Excluded at 6 months",
                                      bname=ex_var[i]))
 }
 for (i in 2:3){
 ex_lst <- append(ex_lst,
                  chi_test_sum(a=is.na(dta$mdi_6_newobs_enr),
                                      b=dta[,ex_var[i]],
                                      aname="Total unavailable at 6 months",
                                      bname=ex_var[i]))
 }
 ```
 ```{r}
 # ex_var<-c("open_treat","rtreat","pase_0_bin")
 # for (i in 2:3){
 #  ex_lst <- append(ex_lst,
 #                   chi_test_sum(a=dta[,ex_var[1]],
 #                                b=dta[,ex_var[i]],
 #                                aname=ex_var[1],
 #                                bname=ex_var[i]))
 # }
 show(ex_lst)
 ```
--- a/dep_flow.pdf
+++ b/dep_flow.pdf
--- a/dep_imputation.Rmd
+++ b/dep_imputation.Rmd
@ -0,0 +1,299 @@
 ---
 title: "Sensitivity analysis on imputed dataset"
 author: "Andreas Gammelgaard Damsbo"
 date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
 output: pdf_document
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE, message = FALSE)
 ```
 # Import
 ```{r}
 dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown")) ## Extending definition of NA's for imputation
 ```
 # Defining patients to include for analysis
 Only including cases with complete pase_0 and MDI at 1 & 6 months
 ```{r}
 dta<-dta_all[!is.na(dta_all$pase_0),]
 # &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
 ```
 ## Formatting
 ```{r echo=FALSE}
 dta$diabetes<-factor(dta$diabetes)
 dta$pad<-factor(dta$pad)
 dta$civil<-factor(dta$civil)
 dta$hypertension<-factor(dta$hypertension)
 dta$afli<-factor(dta$afli)
 dta$smoke_ever<-factor(dta$smoke_ever)
 dta$ami<-factor(dta$ami)
 dta$tci<-factor(dta$tci)
 dta$thrombolysis<-factor(dta$thrombolysis)
 dta$thrombechtomy<-factor(dta$thrombechtomy)
 dta$rep_any<-factor(dta$rep_any)
 dta$pad<-factor(dta$pad)
 dta$nihss_0<-as.numeric(dta$nihss_0)
 dta$age<-as.numeric(dta$age)
 dta$rtreat<-factor(dta$rtreat)
 dta$sex<-factor(dta$sex)
 dta$pase_0<-as.numeric(dta$pase_0)
 dta$pase_6<-as.numeric(dta$pase_6)
 dta$bmi<-as.numeric(dta$bmi)
 dta$mdi_6<-as.numeric(dta$mdi_6)
 dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))
 ```
 ```{r}
 # Backup
 dta_b<-dta
 ```
 # Libraries
 ```{r}
 library(daDoctoR)
 library(mice)
 ```
 ## Variables to include in imputation
 ```{r}
 # Possible variables to include
 coval<-c("pase_0_bin","rtreat","age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
 ```
 # Imputation
 ```{r}
 # Output variables added to include in model. Excluded from predicting.
 outc<-c("mdi_1_enr","mdi_6_newobs_enr")
 # Adding all
 covar<-c(coval,outc)
 # Selecting dataset
 r<-dta[,c("rnumb",covar)]
 ```
 ```{r}
 # Iterations
 mxt=20
 # Imputations
 mis=5
 ```
 https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
 ```{r}
 md.pattern(r)  # Missing pattern
 # library(VIM)
 # aggr_plot <- aggr(r, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
 ```
 ```{r}
 init <- mice(r, maxit=0)  # Creating initial imputation list to assess methods
 meth <- init$method
 meth
 predM <- init$predictorMatrix
 predM[, c("rnumb")] <- 0  # Defining variables not to be used for predicting imputed values
 # meth[outc]=""  
 # Defining variables not to be imputed. 
 # Commented out as all included variables will be imputed
 ```
 ```{r echo=FALSE}
 imputed <- mice(r, method=meth, predictorMatrix=predM, m=mis, maxit = mxt,seed = 103, printFlag=FALSE)
 ```
 ```{r}
 # summary(imputed)
 ```
 ```{r}
 library(dplyr)
 export<-dta %>%
  select(-all_of(coval)) %>%  # Leaving out imputed variables from original dataset
  left_join(mice::complete(imputed,1),.,by="rnumb") # Join with the first imputed dataset for a full dataset export
 md.pattern(export[coval]) # Ensuring complete data
 write.csv(export,"/Volumes/Data/depression/imputed.csv",row.names = FALSE)  # Export
 ```
 # Regression analyses
 ```{r}
 print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",coval),data=dta,cut.p = .1)[[2]])
 ```
 ## Bivariabel
 Function to format collected data from pool function
 ```{r}
 pool_table<-function(clls){
  ## Variables needed: estimate, p.value, term
  coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
 coll$hi<-round(coll$estimate+coll$std.error*1.96,2)
 pa<-coll$p.value
 pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
 pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))
 cl<-data.frame(id=coll$term,diff=paste0(round(coll$estimate,2)," (",coll$lo," to ",coll$hi,")"),p=pa,stringsAsFactors=FALSE)
  return(cl)
 }
 keeps<-c("term","estimate","std.error","p.value")
 ```
 ### Repeated bivariabel analysis
 Not necessary for this, but an interesting addition
 ```{r}
 coll<-c()
 for (i in c("rtreat",adjs_10)){
  ## Bivariable linear regression analysis of all 
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0("mdi_1_enr~",i))))
    ))[-1,c("term","estimate","std.error","p.value")]
  coll<-rbind(coll,coeffs)
  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
 }
 ## Collecting
 coll$lo<-round(coll$estimate-coll$std.error*1.96,2)
 coll$hi<-round(coll$estimate+coll$std.error*1.96,2)
 pa<-coll$p.value
 pa <- ifelse(pa < 0.001, "<0.001", round(pa, 3))
 pa <- ifelse(pa <= 0.05 | pa == "<0.001", paste0("*",pa), ifelse(pa > 0.05 & pa <= 0.1, paste0(".", pa),pa))
 coll_bi<-data.frame(diff=paste0(round(exp(coll$estimate),2)," (",coll$lo," to ",coll$hi,")"),p=pa,id=coll$term,stringsAsFactors=FALSE)
 ```
 ## Unadjusted analyses
 ```{r}
 adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
 adj_m<-c("rtreat","pase_0_bin")
 coll<-c()
 nms<-c()
 for (l in outc){
 for (i in adj_m){
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0(l,"~",i))))
    ))[-1,keeps]
  coll<-rbind(coll,coeffs)
  nms<-c(nms,paste(l,i,sep = "_"))
  d.long <- mice::complete(imputed,"long",include = T)
  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
  for (j in levels(d.long[[i]])){
    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2
    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
    coeffs<-summary(pool(
    with(s.imp,lm(as.formula(paste0(l,"~",adj_m[k]))))
    ))[-1,keeps]
    coll<-rbind(coll,coeffs)
    nms<-c(nms,paste(l,j,sep = "_"))
  }
  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
 }
 }
 coll$term<-nms
 ```
 ### Collecting
 ```{r}
 biv_coll<-pool_table(coll)
 ```
 ## Adjusted analyses
 ```{r}
 adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
 adj_m<-c("rtreat","pase_0_bin")
 coll<-c()
 nms<-c()
 for (l in outc){
 # l="mdi_1_enr"
 for (i in adj_m){
  coeffs<-summary(pool(
    with(imputed,lm(as.formula(paste0(l,"~",paste(i,paste(adjs_10m,collapse="+"),sep="+")))))
    ))[2,keeps]
  coll<-rbind(coll,coeffs)
  nms<-c(nms,paste(l,i,sep = "_"))
  d.long <- mice::complete(imputed,"long",include = T)
  # Inspiration: https://stackoverflow.com/questions/53014141/mice-splitting-imputed-data-for-further-analysis
  for (j in levels(d.long[[i]])){
    k<-length(adj_m)-grep(i,adj_m)+1 ## This only works to select the "opposite" of i for length(adj_m)==2
    s.imp<-mice::as.mids(d.long[which(d.long[[i]] == j),]) # Subsetting long and convert to "mids" format for pooling
    coeffs<-summary(pool(
    with(s.imp,lm(as.formula(paste0(l,"~",paste(adj_m[k],paste(adjs_10m,collapse="+"),sep="+")))))
    ))[2,keeps]
    coll<-rbind(coll,coeffs)
    nms<-c(nms,paste(l,j,sep = "_"))
  }
  ## Inspiration: https://stackoverflow.com/questions/40132829/r-for-loop-in-a-formula
  ## Also: https://gist.github.com/AaronGullickson/3ccb3fdd1778b32fc46df40d78faf5d3
 }
 }
 coll$term<-nms
 ```
 ### Collecting
 ```{r}
 mul_coll<-pool_table(coll)
 colnames(mul_coll)[-1]<-paste0("adj_",colnames(mul_coll)[-1])
 ```
 ```{r}
 library(lubridate)
 write.csv(full_join(biv_coll,mul_coll),paste0("/Volumes/Data/depression/imp_regression_",today(),".csv"))
 ```
--- a/dep_imputation.pdf
+++ b/dep_imputation.pdf
--- a/dep_median.Rmd
+++ b/dep_median.Rmd
@ -0,0 +1,97 @@
 ---
 title: "Additional data analysis"
 author: "Andreas Gammelgaard Damsbo"
 date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
 output: pdf_document
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
 ```
 # Import
 ```{r}
 dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
 ```
 # Defining patients to include for analysis
 Only including cases with complete pase_0 and MDI at 1 & 6 months
 ```{r}
 dta<-dta_all[!is.na(dta_all$pase_0),]
 # &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
 quantile(dta$pase_0, probs = seq(0, 1, 0.25), names = TRUE)
 ```
 ## Formatting
 ```{r echo=FALSE}
 dta$rtreat<-factor(dta$rtreat)
 dta$pase_6<-as.numeric(dta$pase_6)
 ```
 # Summaries
 Fraction with inc_time of at least 166 days
 ```{r}
 dt<-as.numeric(dta[dta$excluded_6%in%c("dt_6","en_6"),c("inc_time")])>=166
 summary(dt)
 length(dt[dt==TRUE])/length(dt)*100 # Percent after 166 days
 ```
 5% percentiler
 ```{r}
 quantile(dt, probs = seq(0, 1, 0.05), names = TRUE)
 ```
 Base version
 ```{r}
 aggregate(pase_6 ~ rtreat, data = dta, summary)
 ```
 Fancy version
 ```{r}
 psych::describeBy(dta$pase_6, dta$rtreat,mat=T)
 ```
 # Mann-Whitney U test
 See: https://stat-methods.com/home/mann-whitney-u-r/
 ```{r}
 #Perform the Mann-Whitney U test
 m1<-wilcox.test(pase_6 ~ rtreat, data=dta, na.rm=TRUE, 
                paired=FALSE, exact=FALSE, conf.int=TRUE)
 print(m1)
 ```
 # Boxplot
 ## Base function - simple
 ```{r}
 boxplot(dta$pase_6 ~ dta$rtreat)
 ```
 ## ggplot2 - fancy version
 ```{r}
 library(ggplot2)
 ggplot(dta, aes(x = rtreat, y = pase_6, fill = rtreat)) +
  stat_boxplot(geom ="errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") + 
  stat_summary(fun.y=mean, geom="point", shape=10, size=3.5, color="black") +   
   # Point symbol is mean value
  ggtitle("Boxplot of Treatments C and D") + 
  theme_bw() + theme(legend.position="none")
 ```
 # Bonus: QQ plots
 ```{r}
 library(qqplotr)
 ggplot(data = dta, mapping = aes(sample = pase_6, color = rtreat, fill = rtreat)) +
  stat_qq_band(alpha=0.5, conf=0.95, qtype=1, bandType = "boot") +
  stat_qq_line(identity=TRUE) +
  stat_qq_point(col="black") +
  facet_wrap(~ rtreat, scales = "free") +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") + theme_bw()
 ```
--- a/dep_median.pdf
+++ b/dep_median.pdf
--- a/dep_regression.Rmd
+++ b/dep_regression.Rmd
@ -0,0 +1,220 @@
 ---
 title: "Regression analyses"
 author: "Andreas Gammelgaard Damsbo"
 date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
 output: pdf_document
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE, message = FALSE)
 ```
 # Import
 ```{r}
 dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
 ```
 # Defining patients to include for analysis
 Only including cases with complete pase_0 and MDI at 1 & 6 months
 ```{r}
 dta<-dta_all[!is.na(dta_all$pase_0),]
 # &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
 ```
 ## Formatting
 ```{r echo=FALSE}
 dta$diabetes<-factor(dta$diabetes)
 dta$pad<-factor(dta$pad)
 dta$civil<-factor(dta$civil)
 dta$hypertension<-factor(dta$hypertension)
 dta$afli<-factor(dta$afli)
 dta$smoke_ever<-factor(dta$smoke_ever)
 dta$ami<-factor(dta$ami)
 dta$tci<-factor(dta$tci)
 dta$thrombolysis<-factor(dta$thrombolysis)
 dta$thrombechtomy<-factor(dta$thrombechtomy)
 dta$rep_any<-factor(dta$rep_any)
 dta$pad<-factor(dta$pad)
 dta$nihss_0<-as.numeric(dta$nihss_0)
 dta$age<-as.numeric(dta$age)
 dta$rtreat<-factor(dta$rtreat)
 dta$sex<-factor(dta$sex)
 dta$pase_0<-as.numeric(dta$pase_0)
 dta$pase_6<-as.numeric(dta$pase_6)
 dta$bmi<-as.numeric(dta$bmi)
 dta$mdi_6<-as.numeric(dta$mdi_6)
 dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))
 ```
 ```{r}
 # Backup
 dta_b<-dta
 ```
 # Linear regression analysis
 ```{r}
 library(broom)
 library(daDoctoR)
 library(lubridate)
 ```
 ## Tests of variables to adjust for
 ```{r}
 # Possible variables to include
 adjs<-c("age","sex","smoke_ever","civil","bmi","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
 ```
 ```{r}
 # Variables with p<10% i bivariable linear regression analysis
 print(adjs_10<-rep_lm(meas = "mdi_6",string=c("pase_0_bin","rtreat",adjs),data=dta,cut.p = .1)[[2]])
 ```
 ## True mean estimations (adjusted)
 ```{r}
 strt<-append(print_pred_stratum(meas = "mdi_6",adj=unique(c("pase_0_bin",adjs_10)),strat="rtreat",data=dta,include.stratum = T),print_pred_stratum(meas = "mdi_6",adj=c(adjs_10[adjs_10!="pase_0_bin"],"rtreat"),strat="pase_0_bin",data=dta,include.stratum = T))
 for (i in 1:length(strt)){
  write.csv(strt[[i]][[1]],paste0("tbl_md6_",substr(names(strt)[i],1,3),".csv"))
 }
 c<-c()
 for (i in 1:length(strt)){
  c<-c(c,paste("Estimated true mean,",names(strt)[i]),strt[[i]][[5]])
 }
 mat_true<-matrix(c(c,c("Variables adjusted for:",paste(c("rtreat",adjs_10), collapse=', '))), ncol=2, byrow = T)
 ```
 # MDI outcome 2x2 
 ```{r}
 sts<-c("pase_0_bin","rtreat")
 # sts<-c("rtreat","pase_0_bin")
 adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
 ```
 ## One month
 ### Enriched
 ```{r echo=FALSE}
 outs<-c("mdi_1_enr")
 source("/Volumes/Data/depression/script_regression_frame.R")
 show(reg_frm)
 write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
 ```
 ### Raw
 ```{r echo=FALSE}
 outs<-c("mdi_1")
 source("/Volumes/Data/depression/script_regression_frame.R")
 show(reg_frm)
 write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
 ```
 ## Six months
 ### New Observations - enriched
 ```{r echo=FALSE}
 outs<-c("mdi_6_newobs_enr")  # Outcome is based on new observations for 2nd (6 months) visit
 # mean(dta$inc_time[!is.na(dta[,outs])])
 # quantile(dta$inc_time[!is.na(dta[,outs])])
 source("/Volumes/Data/depression/script_regression_frame.R")
 show(reg_frm)
 write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
 ```
 ### New Observations
 ```{r echo=FALSE}
 outs<-c("mdi_6_newobs")  # Outcome is based on new observations for 2nd (6 months) visit
 # mean(dta$inc_time[!is.na(dta[,outs])])
 # quantile(dta$inc_time[!is.na(dta[,outs])])
 source("/Volumes/Data/depression/script_regression_frame.R")
 show(reg_frm)
 write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
 ```
 ### New observations - adjusted for 6 months PASE
 ```{r echo=FALSE}
 outs<-c("mdi_6_newobs")
 adjs_10m<-c(adjs_10[adjs_10!="pase_0_bin"],"pase_6")
 # mean(dta$inc_time[!is.na(dta[,outs])])
 # quantile(dta$inc_time[!is.na(dta[,outs])])
 source("/Volumes/Data/depression/script_regression_frame.R")
 show(reg_frm)
 write.csv(reg_frm,paste0(outs,"_matrix_",today(),".csv"))
 ```
 ## Dichotomized sensitivity analysis
 ```{r}
 dta$composite_out<-case_when(dta$open_treat=="yes"|(dta$mdi_6_newobs-dta$mdi_1)>5~"yes",
          is.na(dta$mdi_6_newobs)~"NA",
          is.na(dta$mdi_1)~"NA",
          TRUE~"no")
 dta$composite_out[dta$composite_out=="NA"]<-NA
 summary(dta$composite_out<-factor(dta$composite_out))
 ```
 ## Enriching and cleaning variables
 ```{r}
 # Enriching
 dta$pad[is.na(dta$pad)]<-"no"
 dta$hypertension[is.na(dta$hypertension)]<-"no"
 # Cleaning
 dta$civil<-factor(ifelse(dta$civil=="unknown",NA,dta$civil))
 ```
 ```{r}
 table(dta$rtreat,dta$pase_0_bin)
 outs<-"composite_out"
 sts<-c("pase_0_bin","rtreat")
 # sts<-c("rtreat","pase_0_bin")
 adjs_10m<-adjs_10[adjs_10!="pase_0_bin"]
 dta_frm<-dta[!is.na(dta$composite_out),c(outs,sts,adjs_10m)]
 summary(dta_frm)
 # colnames(dta_frm)[1]<-"outs"
 # print_log(meas="composite_out",var=sts[2],adj=c(sts[1],adjs_10m),data=dta_frm)
 # print_pred(meas="composite_out",adj=c(sts[2],adjs_10m),data=dta_frm[dta_frm$pase_0_bin=="lower",],n.by.adj = T)
 composite_out_lst<-list(print_pred_stratum(meas="composite_out",strat = sts[1],adj=c(sts[2],adjs_10m),
                                           data=dta_frm,n.by.adj = T),
                        print_pred_stratum(meas="composite_out",strat = sts[2],adj=c(sts[1],adjs_10m),
                                           data=dta_frm,n.by.adj = T))
 # show(composite_out_lst)
 capture.output(show(composite_out_lst), 
               file = paste0("composite_out_lst",today(),".txt"))
 ```
--- a/dep_regression.pdf
+++ b/dep_regression.pdf
--- a/dep_regression_interaction.R
+++ b/dep_regression_interaction.R
@ -0,0 +1,162 @@
 ## TALOS Depression regression analyses
 ## Created: 07.apr.2022
 ## New regression analyses incl anova in new tables based on gtsummary
 ## =============================================================================
 ## Data import and formatting
 ## =============================================================================
 dta_all<-read.csv("/Volumes/Data/depression/dep_dataset.csv",na.strings = c("NA","unknown"))
 dta<-dta_all[!is.na(dta_all$pase_0),]
 dta$diabetes<-factor(dta$diabetes)
 dta$pad<-factor(dta$pad)
 dta$civil<-factor(dta$civil)
 dta$hypertension<-factor(dta$hypertension)
 dta$afli<-factor(dta$afli)
 dta$smoke_ever<-factor(dta$smoke_ever)
 dta$ami<-factor(dta$ami)
 dta$tci<-factor(dta$tci)
 dta$thrombolysis<-factor(dta$thrombolysis)
 dta$thrombechtomy<-factor(dta$thrombechtomy)
 dta$rep_any<-factor(dta$rep_any,labels=c("no","yes"))
 dta$pad<-factor(dta$pad)
 dta$nihss_0<-as.numeric(dta$nihss_0)
 dta$age<-as.numeric(dta$age)
 dta$rtreat<-factor(dta$rtreat, levels=c("Placebo","Active"))
 dta$sex<-factor(dta$sex)
 dta$pase_0<-as.numeric(dta$pase_0)
 dta$pase_6<-as.numeric(dta$pase_6)
 dta$bmi<-as.numeric(dta$bmi)
 dta$mdi_6<-as.numeric(dta$mdi_6)
 dta$pase_0_bin<-factor(dta$pase_0_bin,levels=c("lower","higher"))
 dta_b<-dta
 library(broom)
 library(daDoctoR)
 library(lubridate)
 library(gtsummary)
 library(dplyr)
 library(gt)
 ## =============================================================================
 ## Regression setup
 ## =============================================================================
 # Possible variables to adjust for
 adjs<-c("age","sex","smoke_ever","civil","diabetes", "hypertension", "afli","pad","nihss_0","rep_any")
 preds<-c("pase_0_bin","rtreat")
 # Selection of variables
 print(adjs_10<-rep_lm(meas = "mdi_6",string=c(preds,adjs),data=dta,cut.p = .1)[[2]])
 ## =============================================================================
 ## One month regression
 ## =============================================================================
 out="mdi_1_enr"
 ds<-dta%>%select(all_of(c(out,preds,adjs_10)))
 # AOV
 # aov<-aov(as.formula(paste(out,".",sep = "~")), 
 #          data = ds) %>% 
 #   tidy() %>%
 #   gt()
 # Interaction
 int<-lm(as.formula(paste(out,paste0(paste(preds,collapse = "*"),"+."),sep = "~")), 
        data = ds) %>% 
  tbl_regression()%>%
  # add_n%>%
  bold_labels() %>%
  italicize_levels()
 # Bivariate
 biv<-tbl_uvregression(data=ds,
                      y=out,
                      method=lm
 )  %>% 
  bold_labels() %>%
  add_n%>%
  italicize_levels()
 # Multivariate
 mul<-lm(as.formula(paste(out,".",sep = "~")), 
        data = ds) %>% 
  tbl_regression()%>% 
  bold_labels() %>%
  add_n()%>%
  italicize_levels()
 # Table merge
 tbl_comb<-tbl_merge(
  tbls = list(biv, mul,int), 
  tab_spanner = c("**Bivariate**", 
                  "**Multivariate**",
                  "**Multivariate with interaction**") 
 )
 tbl_comb
 tbl_reg_1_rtf <- file("tbl_reg_1.RTF", "w")
 writeLines(tbl_comb%>%as_gt()%>%as_rtf(), tbl_reg_1_rtf)
 close(tbl_reg_1_rtf)
 ## =============================================================================
 ## Six month regression
 ## =============================================================================
 out="mdi_6_newobs_enr"
 ds<-dta%>%select(all_of(c(out,preds,adjs_10)))
 # AOV
 # aov<-aov(as.formula(paste(out,".",sep = "~")), 
 #          data = ds) %>% 
 #   tidy() %>%
 #   gt()
 # Interaction
 int<-lm(as.formula(paste(out,paste0(paste(preds,collapse = "*"),"+."),sep = "~")), 
        data = ds) %>% 
  tbl_regression()%>%
  # add_n%>%
  bold_labels() %>%
  italicize_levels()
 # Bivariate
 biv<-tbl_uvregression(data=ds,
                      y=out,
                      method=lm
 )  %>% 
  bold_labels() %>%
  add_n%>%
  italicize_levels()
 # Multivariate
 mul<-lm(as.formula(paste(out,".",sep = "~")), 
        data = ds) %>% 
  tbl_regression()%>% 
  bold_labels() %>%
  add_n()%>%
  italicize_levels()
 # Table merge
 tbl_comb<-tbl_merge(
  tbls = list(biv, mul,int), 
  tab_spanner = c("**Bivariate**", 
                  "**Multivariate**",
                  "**Multivariate with interaction**") 
 )
 tbl_comb
 tbl_reg_6_rtf <- file("tbl_reg_6.RTF", "w")
 writeLines(tbl_comb%>%as_gt()%>%as_rtf(), tbl_reg_1_rtf)
 close(tbl_reg_6_rtf)
--- a/dep_tableone.Rmd
+++ b/dep_tableone.Rmd
@ -0,0 +1,118 @@
 ---
 title: "Table One"
 author: "Andreas Gammelgaard Damsbo"
 date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
 output: pdf_document
 ---
 # Import
 ```{r}
 dta<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
 ```
 ## Formatting
 ```{r}
 dta$diabetes<-factor(dta$diabetes)
 dta$pad<-factor(dta$pad)
 dta$civil<-factor(dta$civil)
 dta$hypertension<-factor(dta$hypertension)
 dta$afli<-factor(dta$afli)
 dta$smoke_ever<-factor(dta$smoke_ever)
 dta$ami<-factor(dta$ami)
 dta$tci<-factor(dta$tci)
 dta$thrombolysis<-factor(dta$thrombolysis)
 dta$thrombechtomy<-factor(dta$thrombechtomy)
 dta$rep_any<-factor(dta$rep_any)
 dta$pad<-factor(dta$pad)
 dta$nihss_0<-as.numeric(dta$nihss_0)
 dta$age<-as.numeric(dta$age)
 dta$rtreat<-factor(dta$rtreat)
 dta$sex<-factor(dta$sex)
 dta$pase_0<-as.numeric(dta$pase_0)
 dta$bmi<-as.numeric(dta$bmi)
 dta$mdi_6<-as.numeric(dta$mdi_6)
 dta$inc_time<-as.numeric(dta$inc_time)
 ```
 # Defining patients to include for analysis
 Only including cases with complete pase_0 and MDI at 1 & 6 months
 ```{r}
 dta<-dta[!is.na(dta$pase_0),]
 # &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
 ```
 ## Defining table one stratification
 ```{r}
 dta$strat_table_one<-factor(case_when(is.na(dta$mdi_6_newobs)~"zExcluded",
                         dta$pase_0_bin=="lower"~"xLower",
                         dta$pase_0_bin=="higher"~"yHigher"))
 # summary(dta$strat_table_one)
 ```
 ```{r}
 library(plyr)
 dta$in_ex<-mapvalues(dta$strat_table_one, from=c("xLower", "yHigher"), to=c("xIncluded","xIncluded"))
 # summary(dta$in_ex)
 library(dplyr)
 ```
 # Basic analyses
 ```{r}
 show(mdn<-median(dta$pase_0))
 hist(dta$pase_0,100)
 hist(sqrt(dta$pase_0),100)
 ```
 # Table One
 ```{r}
 library(tableone)
 ```
 ```{r}
 tbl_norm<-c("rtreat","age","sex","bmi","smoke_ever","civil","diabetes", "hypertension", "afli", "ami", "tci","pad","nihss_0", "thrombolysis", "thrombechtomy","rep_any","inc_time")
 tbl_cat<-c("rtreat","sex","diabetes", "hypertension", "smoke_ever","civil", "ami", "tci", "thrombolysis", "thrombechtomy","rep_any")
 tbl_non<-c("age","nihss_0","inc_time")
 ```
 ```{r}
 tab1 <- CreateTableOne(vars = tbl_norm, data = dta, factorVars = tbl_cat,includeNA = TRUE)
 tbl1_1<-print(tab1, contDigits = 1, missing=T,showAllLevels=T ,nonnormal = tbl_non, smd = FALSE, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 tab2 <- CreateTableOne(vars = tbl_norm, strata="pase_0_bin",data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_2<-print(tab2, contDigits = 1, missing=T,showAllLevels=T ,nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 tab3 <- CreateTableOne(vars = tbl_norm, strata="strat_table_one",data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_3<-print(tab3, contDigits = 1, missing=T,showAllLevels=T ,nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 tab4 <- CreateTableOne(vars = tbl_norm, strata="in_ex",data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_4<-print(tab4, contDigits = 1, missing=T,showAllLevels=T ,nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 table(is.na(dta$nihss_0),dta$strat_table_one)
 table(is.na(dta$bmi),dta$strat_table_one)
 ```
 ```{r}
 dta<-dta[dta$strat_table_one!="zExcluded",]
 dta$strat_table_one<-factor(dta$strat_table_one)
 tab5 <- CreateTableOne(vars = tbl_norm, strata="strat_table_one",data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_5<-print(tab5, contDigits = 1, missing=T,showAllLevels=T ,nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 library(lubridate)
 tbl_list<-list(tbl1_1,tbl1_2,tbl1_3,tbl1_4,tbl1_5)
 for (i in 1:length(tbl_list)){
  nm<-paste0("tbl1_",i)
  write.csv(tbl_list[[i]],paste0("/Volumes/Data/depression/",nm,"_",unlist(strsplit(as.character(now()),"[ ]"))[1],".csv"))
 }
 ```
--- a/dep_tableone.pdf
+++ b/dep_tableone.pdf
--- a/dep_tableone_enriched.Rmd
+++ b/dep_tableone_enriched.Rmd
@ -0,0 +1,153 @@
 ---
 title: "Table One - enriched"
 author: "Andreas Gammelgaard Damsbo"
 date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
 output: pdf_document
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE, message = FALSE)
 ```
 # Import
 ```{r}
 dta<-read.csv("/Volumes/Data/depression/dep_dataset.csv")
 ```
 ## Formatting
 ```{r}
 dta$diabetes<-factor(dta$diabetes)
 dta$pad<-factor(dta$pad)
 dta$civil<-factor(dta$civil)
 dta$hypertension<-factor(dta$hypertension)
 dta$afli<-factor(dta$afli)
 dta$smoke_ever<-factor(dta$smoke_ever)
 dta$ami<-factor(dta$ami)
 dta$tci<-factor(dta$tci)
 dta$thrombolysis<-factor(dta$thrombolysis)
 dta$thrombechtomy<-factor(dta$thrombechtomy)
 dta$rep_any<-factor(dta$rep_any)
 dta$pad<-factor(dta$pad)
 dta$nihss_0<-as.numeric(dta$nihss_0)
 dta$age<-as.numeric(dta$age)
 dta$rtreat<-factor(dta$rtreat)
 dta$sex<-factor(dta$sex)
 dta$pase_0<-as.numeric(dta$pase_0)
 dta$bmi<-as.numeric(dta$bmi)
 dta$mdi_6<-as.numeric(dta$mdi_6)
 dta$inc_time<-as.numeric(dta$inc_time)
 dta$bmi_isna<-is.na(dta$bmi)
 dta$nihss_0_isna<-is.na(dta$nihss_0)
 ```
 # Defining patients to include for analysis
 ```{r  message=FALSE}
 library(dplyr)
 ```
 Only including cases with complete pase_0 and MDI at 1 & 6 months
 ```{r}
 dta<-dta[!is.na(dta$pase_0),]
 # &!is.na(dta$mdi_1)&!is.na(dta$mdi_6)
 ```
 ## Defining table one stratification
 ```{r}
 dta$strat_table_one<-factor(case_when(dta$excluded_6%in%c("mi_6","ex_6")~"zExcluded",
                         dta$pase_0_bin=="lower"~"xLower",
                         dta$pase_0_bin=="higher"~"yHigher"))
 summary(dta$strat_table_one)
 ```
 ```{r}
 dta$in_ex<-plyr::mapvalues(dta$strat_table_one, 
                           from=c("xLower", "yHigher"), 
                           to=c("xIncluded","xIncluded"))
 # summary(dta$in_ex)
 ```
 # Basic analyses
 ```{r}
 show(mdn<-median(dta$pase_0))
 hist(dta$pase_0,100)
 hist(sqrt(dta$pase_0),100)
 ```
 # Table One
 ```{r}
 library(tableone)
 ```
 ```{r}
 tbl_norm<-c("rtreat","age","sex","bmi","bmi_isna","smoke_ever","civil","diabetes", 
            "hypertension", "afli", "ami", "tci","pad","nihss_0","nihss_0_isna", 
            "thrombolysis", "thrombechtomy","rep_any","inc_time")
 tbl_cat<-c("rtreat","sex","bmi_isna","diabetes", "hypertension", "smoke_ever","civil", 
           "ami", "tci","nihss_0_isna", "thrombolysis", 
           "thrombechtomy","rep_any")
 tbl_non<-c("age","nihss_0","inc_time")
 ```
 ```{r}
 tab1 <- CreateTableOne(vars = tbl_norm, data = dta, 
                       factorVars = tbl_cat,includeNA = TRUE)
 tbl1_1<-print(tab1, contDigits = 1, missing=T,showAllLevels=T ,
              nonnormal = tbl_non, smd = FALSE, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 tab2 <- CreateTableOne(vars = tbl_norm, strata="pase_0_bin",
                       data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_2<-print(tab2, contDigits = 1, missing=T,showAllLevels=T,
              nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 tab3 <- CreateTableOne(vars = tbl_norm, strata="strat_table_one",
                       data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_3<-print(tab3, contDigits = 1, missing=T,showAllLevels=T,
              nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 tab4 <- CreateTableOne(vars = tbl_norm, strata="in_ex",
                       data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_4<-print(tab4, contDigits = 1, missing=T,showAllLevels=T,
              nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 table(is.na(dta$nihss_0),dta$strat_table_one)
 table(is.na(dta$bmi),dta$strat_table_one)
 ```
 ```{r}
 dta<-dta[dta$strat_table_one!="zExcluded",]
 dta$strat_table_one<-factor(dta$strat_table_one)
 tab5 <- CreateTableOne(vars = tbl_norm, strata="strat_table_one",
                       data = dta, factorVars = tbl_cat,includeNA = T)
 tbl1_5<-print(tab5, contDigits = 1, missing=T,showAllLevels=T,
              nonnormal = tbl_non, smd = F,test = T, quote = F, noSpaces = TRUE)
 ```
 ```{r}
 tbl_list<-list(tbl1_1,tbl1_2,tbl1_3,tbl1_4,tbl1_5)
 for (i in 1:length(tbl_list)){
  nm<-paste0("tbl1_",i)
  write.csv(tbl_list[[i]],
            paste0("/Volumes/Data/depression/",nm,"_enr_",
                   lubridate::today(),".csv"))
 }
 ```
--- a/dep_tableone_enriched.pdf
+++ b/dep_tableone_enriched.pdf
--- a/function_chi_test_sum.R
+++ b/function_chi_test_sum.R
@ -0,0 +1,10 @@
 chi_test_sum<-function(a,b,aname=NULL,bname=NULL){
  chi<-chisq.test(table(a,b))
  lst<-list(matrix(chi$observed,nrow=2,byrow = FALSE,dimnames = list(levels(factor(a)),levels(factor(b)))),chi$p.value)
  names(lst)<-c(paste("observed,",aname,"vs",bname),"pval^")
  return(lst)
 }
--- a/function_flow.R
+++ b/function_flow.R
@ -0,0 +1,54 @@
 flow_prog<-function(df,sngl,sngl_keep,mltp=NULL){
  # df is the data frame
  # sngl is a vector of variable names to exclude from the data frame one by one, prioritised
  # sngl_keep is a variable of names of the same length as sngl containing the vector names of cases to keep for flow analysis
  # mltp is a vector of variable names to count, can be one or more
  # first sngl, then mltp
  # all specified variables should be factors or "factorisable"
  # If categorical variables are decided, define a new variable. More elegant solution is desirable, but this works (!).
  # Dependencies
  library(dplyr)
  # Small, home made, general summary function with naming
  summary_list<-function(var){
    v<-factor(var)
    x<-levels(v)
    y<-summary(v)
    lst<-list(data.frame(matrix(y,nrow=length(x),byrow = TRUE,dimnames = list(x,deparse(substitute(var))))))  # deparse and substitue is used to get the name of the vector for naming the matrix. Nice!
    return(lst)
  }
  slist<-list(nrow(df))
  names(slist)[1]<-"nrow of provided data frame"
  # Handling sngl first
  for (i in 1:length(sngl)){
    z<-summary_list(df[,sngl[i]])
    colnames(z[[1]])<-sngl[i]
    slist[length(slist)+1] <- z
    names(slist)[length(slist)]<-sngl[i]
    df <- df %>% filter(df[,sngl[i]]==sngl_keep[i])
  }
  # Handling mltp
  if (!is.null(mltp)){
    for (i in 1:length(mltp)){
      z<-summary_list(df[,mltp[i]])
      colnames(z[[1]])<-mltp[i]
      slist[length(slist)+1] <- z
      names(slist)[length(slist)]<-paste0(mltp[i],"_for_",last(sngl),"==",last(sngl_keep))
    }
  }
  return(slist)
 }
 df<-dta
 sngl<-c("mors_v1","drop")
 sngl_keep<-c("no","yes")
 mltp<-c("open_treat","wants_out","side_effect","side_effect2")
--- a/plot.png
+++ b/plot.png
--- a/script_distribution_plot.R
+++ b/script_distribution_plot.R
@ -0,0 +1,32 @@
 # Distribution plot
 library(ggplot2)
 library(cowplot)
 pase_hist<-ggplot(data=dta, aes(pase_0)) + 
  geom_histogram(binwidth = 10, aes(y =..density..), 
                 col="orange", 
                 fill="orange", 
                 alpha=.2) + 
  geom_density(col="tomato") + 
  geom_vline(xintercept=median(dta$pase_0),col="navy") +
  scale_x_continuous(limits=c(0,600) ,breaks=seq(0,600,by=50)) +
  labs(title="Baseline PASE distribution", x="PASE score", y="Count") +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()
  )
 pase_box<-ggplot(data=dta, aes(pase_0,y=1)) + 
  geom_boxplot(col="olivedrab", 
               fill="olivedrab", 
               alpha=.2) +
  geom_jitter(col="maroon", alpha=0.3) +
  scale_x_continuous(limits=c(0,600) , breaks=seq(0,600,by=50)) +
  labs(title=NULL, x="PASE score", y="Count") +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()
  )
 pg<-cowplot::plot_grid(pase_hist, pase_box, align = "v", ncol = 1, rel_heights = c(0.7, 0.3))
--- a/script_flowchart.R
+++ b/script_flowchart.R
@ -0,0 +1,21 @@
 # Flowchart
 # Data set defined in dep_regression.Rmd
 summary_list<-function(var){
  v<-factor(var)
  x<-levels(v)
  y<-summary(v)
  lst<-list(data.frame(matrix(y,nrow=length(x),byrow = TRUE,dimnames = list(x,deparse(substitute(var))))))  # deparse and substitue is used to get the name of the vector for naming the matrix. Nice!
  return(lst)
 }
 slist<-summary_list(dta$drop)
 dta<-dta[dta$drop=="yes",]
 vars_l<-c("open_treat","wants_out","side_effect","side_effect2")
 for (i in 1:length(vars_l)){
  z<-summary_list(dta[,vars_l[i]])
  colnames(z[[1]])<-vars_l[i]
  slist[length(slist)+1] <- z
 }
--- a/script_regression_frame.R
+++ b/script_regression_frame.R
@ -0,0 +1,56 @@
 envir_b<-ls()
 ## Regression frame
 dta_frm<-dta[,c(outs,sts,adjs_10m)]
 colnames(dta_frm)[1]<-"outs"
 lst<-list()
 ## 'rtreat' horisontal, 'pase_0_bin' vertical
 for (i in 1:length(outs)){
  for (j in 1:length(sts)){
    ls_22<-list(print_diff_bygroup(meas="outs",group=sts[j],var=sts[length(sts)+1-j],adj=adjs_10m,data=dta_frm))
    names(ls_22)<-paste0(outs[i],"_",sts[j],"_ver_",sts[length(sts)+1-j],"_hor")
    lst<-append(lst,ls_22)
  }
  lst<-append(lst,print_pred_stratum(meas="outs",adj=c(sts[1],adjs_10m),strat=sts[2],data=dta_frm,include.stratum = T)[1])
 }
 # outs (outcome), Active
 oadr<-lst[[2]][c(6,8)][2,1] # LvsH, Diff raw
 oada<-lst[[2]][c(6,8)][2,2] # LvsH, Diff adj
 #oada_1<-oada
 # outs, Placebo
 opdr<-lst[[2]][c(6,8)][3,1] # LvsH, Diff raw
 opda<-lst[[2]][c(6,8)][3,2] # LvsH, Diff adj
 #opda_1<-opda
 spc<-""
 spc4<-c("","","","")
 mdi1_tbl<-rbind(lst[[1]],matrix(c("Unadjusted mean diff.",spc,oadr,spc,opdr,spc4,
                                  "Adjusted mean diff.",spc,oada,spc,opda,spc4),ncol=ncol(lst[[1]]),byrow=T,dimnames = list(c("a","b"),names(lst[[1]]))))
 # write.csv(mdi1_tbl,"mdi1_2x2.csv")
 reg_frm<-cbind(mdi1_tbl[,1],"",mdi1_tbl[,2:6],mdi1_tbl[,8])
 names(reg_frm)<-c("By_PA", "Rand_Total", "N_Active", "Mean_Active", "N_Placebo", "Mean_Placebo","Unadjusted_mean_diff", "Adjusted_mean_diff")
 reg_frm[1,1]<-"PASE_total"
 reg_frm[1,2]<-paste0(round(mean(dta_frm$outs, na.rm = TRUE), 1)," (",round(sd(dta_frm$outs, na.rm = TRUE), 1),")")
 reg_frm[1,3]<-nrow(dta_frm[dta_frm$rtreat=="Active"&!is.na(dta_frm$outs),])
 reg_frm[1,4]<-paste0(round(mean(dta_frm$outs[dta_frm$rtreat=="Active"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$rtreat=="Active"], na.rm = TRUE), 1),")")
 reg_frm[1,5]<-nrow(dta_frm[dta_frm$rtreat=="Placebo"&!is.na(dta_frm$outs),])
 reg_frm[1,6]<-paste0(round(mean(dta_frm$outs[dta_frm$rtreat=="Placebo"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$rtreat=="Placebo"], na.rm = TRUE), 1),")")
 ## Det var fedt med en universel løsning her, så den vender rigtigt i forhold til vektoren
 reg_frm[2,2]<-paste0(round(mean(dta_frm$outs[dta_frm$pase_0_bin=="lower"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$pase_0_bin=="lower"], na.rm = TRUE), 1),")")
 reg_frm[3,2]<-paste0(round(mean(dta_frm$outs[dta_frm$pase_0_bin=="higher"], na.rm = TRUE), 1)," (",round(sd(dta_frm$outs[dta_frm$pase_0_bin=="higher"], na.rm = TRUE), 1),")")
 reg_frm[4,2]<-lst[[3]][[1]][7,3]
 reg_frm[5,2]<-lst[[3]][[1]][7,4]
 reg_frm[1,7]<-lst[[3]][[1]][4,3]
 reg_frm[1,8]<-lst[[3]][[1]][4,4]
 source("/Volumes/Data/func/remove_all_but.R")
 remove_all_but(c("reg_frm",envir_b))
--- a/talos-pa-depression.Rproj
+++ b/talos-pa-depression.Rproj
@ -0,0 +1,13 @@
 Version: 1.0
 RestoreWorkspace: Default
 SaveWorkspace: Default
 AlwaysSaveHistory: Default
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
 NumSpacesForTab: 2
 Encoding: UTF-8
 RnwWeave: Sweave
 LaTeX: pdfLaTeX
--- a/tbl_reg_1.RTF
+++ b/tbl_reg_1.RTF
--- a/tbl_reg_6.RTF
+++ b/tbl_reg_6.RTF