## ItMLiHSmar2022
## data_format.R, child script
## Data formatting and handling
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
## 
## Now modified to use in publication
## 

## ====================================================================
# Step 1: Loading libraries
## ====================================================================

library(Hmisc)
library(dplyr)
library(daDoctoR)
library(tidyselect)

## ====================================================================
# Step 2: Loading data
## ====================================================================
 
# rm(list = ls()) # Clear
# setwd("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/") 
# dta<-read.csv("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/assigndata.csv")

## ====================================================================
# Step 3: Formatting variables
## ====================================================================

dta <- export %>% 
  # as_tibble()%>%
  mutate(any_rep=factor(ifelse(thrombolysis=="yes"|thrombechtomy=="yes","yes","no")), # If not noted, no therapy was received
         male_sex= factor(ifelse(sex=="female","no","yes")),
         # smoke_ever=factor(ifelse(smoke_ever=="never","no","yes")),
         civil=factor(ifelse(civil=="partner","no","yes")), # Sets "yes" for not-cohabiting
         rtreat=factor(ifelse(rtreat=="Placebo","no","yes")), # "Yes" receives active treatment
         alc=factor(ifelse(alc=="more","yes","no")), # Yes for more than guideline
         pase_0=as.numeric(pase_0),
         pase_6=as.numeric(pase_6),
         across(c("diabetes",
                  "hypertension",
                  "smoker",
                  # "smoker_prev",
                  "afli",
                  "pad",
                  "ami",
                  "tci",
                  "mrs_0",
                  "mrs_1"),as.factor),
         across(c("nihss_c",
                  "age",
                  "mdi_1", # For "enriched" analysis
                  "who5_score_1",
                  "mfi_gen_1",
                  "mfi_phys_1",
                  "mfi_act_1",
                  "mfi_mot_1",
                  "mfi_men_1"),as.numeric )
         )%>% 
  select(-c(sex))
  
  
## ====================================================================
# Step 4: Defining outcome
## ====================================================================

## Changed to step 7
## This is to perform proper quantile split based on actually included.

## ====================================================================
# Step 5: Ordering variables
## ====================================================================


dta<-select(dta,c(age,
                  male_sex,
                  civil,
                  pase_0,
                  smoker,
                  # smoker_prev,
                  alc,
                  afli,
                  hypertension,
                  diabetes,
                  pad,
                  ami,
                  tci,
                  mrs_0,
                  nihss_c,
                  # thrombolysis,
                  # thrombechtomy,
                  any_rep,
                  rtreat,
                  pase_6, 
                  mrs_1,
                  mfi_gen_1,
                  mfi_phys_1,
                  mfi_act_1,
                  mfi_mot_1,
                  mfi_men_1,
                  mdi_1, 
                  who5_score_1
                  ))

## ====================================================================
# Step 6: Labeling
## ====================================================================

var.labels = c(age="Age",
               male_sex="Male",
               civil="Living alone",
               pase_0="Pre-stroke PASE score",
               pase_6="Six month PASE score",
               smoker="Daily or occasinally smoking",
               # smoker_prev="Previous habbit of smoking",
               alc="More alcohol than recommendation",
               afli="Known AFIB",
               hypertension="Known hypertension",
               diabetes="Known diabetes",
               pad="Known PAD",
               ami="Previous MI",
               tci="Previous TIA",
               mrs_0="Pre-stroke mRS  [-1]",
               nihss_c="Acute NIHSS score",
               thrombolysis="Acute thrombolysis",
               thrombechtomy="Acute thrombechtomy",
               any_rep="Any reperfusion therapy",
               rtreat="Active trial treatment",
               mrs_1="One month mRS [-1]",
               mfi_gen_1="One month MFI (General fatigue)",
               mfi_phys_1="One month MFI (Physical fatigue)",
               mfi_act_1="One month MFI (Reduced activity)",
               mfi_mot_1="One month MFI (Reduced motivation)",
               mfi_men_1="One month MFI (Mental fatigue)",
               mdi_1="One month MDI",
               who5_score_1="One month WHO5",
               pase_decl_rel_fac="PASE score difference, relative F",
               pase_decl_abs_fac="PASE score difference, absolute F",
               pase_drop_fac="PASE first quartile drop F",
               pase_diff="PASE absolute decline",
               pase_decl_rel="PASE relative decline",
               pase_0_cut="PASE 0 quartiles",
               pase_6_cut="PASE 6 quartiles")

## Labelling based on outcome flag
if (pout=="decl_rel"|pout=="decl_abs"){
  var.labels = c(var.labels,group="PASE decline")}
if (pout=="drop"){
  var.labels = c(var.labels,group="PASE drop")}

## ====================================================================
# Step 7: final data export
## ====================================================================

data_summary<-summary(dta)

# Saving "old" factorised variables
sel<-sapply(dta,is.factor)
# Reformatting factors as 1/2 for analysis
dta<-dta |> 
  mutate(across(where(is.factor), as.numeric))|> # Turning factors into 1(no) or 2(yes) for model. Numbered alphabetically.
  mutate(across(matches(colnames(dta)[sel]), as.factor),
         across(starts_with("pase_"), as.numeric))

# Filtering out non-PASE
X_tbl<-dta |>
  filter(!is.na(pase_0),!is.na(pase_6))

nrow(X_tbl)

# Defining possible outcome meassures. Keeping in df for characterisation
X_tbl <- X_tbl|>
  mutate(## Relative decline
         pase_diff=(pase_0-pase_6),
         pase_decl_rel = pase_diff/pase_0*100,
         pase_decl_rel_fac=factor(ifelse(pase_decl_rel>=rel_dif,"yes","no")),
         ## Absolute decline
         pase_decl_abs_fac=factor(ifelse(pase_diff>=abs_dif,"yes","no")),
         ## Drop
         pase_0_cut=quantile_cut(as.numeric(pase_0),
                                 groups=4,
                                 group.names = c(as.character(1:4)),
                                 y=as.numeric(pase_0),
                                 ordered.f = TRUE,
                                 inc.outs = TRUE, 
                                 detail.lst=FALSE),
         pase_6_cut=quantile_cut(as.numeric(pase_6),
                                 groups=4,
                                 group.names = c(as.character(1:4)),
                                 y=as.numeric(pase_0),
                                 ordered.f = TRUE,
                                 inc.outs = TRUE, 
                                 detail.lst=FALSE),
         pase_drop_fac=factor(ifelse(pase_6_cut==1&pase_0_cut!=1,"yes","no")))

label(X_tbl) = as.list(var.labels[match(names(X_tbl), names(var.labels))])

# Setting final primary output from "pout"
if (pout=="decl_rel"){
  X_tbl <- X_tbl|>
    mutate(group=pase_decl_rel_fac)
  
  X_tbl_f <- X_tbl|>
    filter(pase_0!=0)|>
    select(-starts_with("pase_"))
}

if (pout=="decl_abs"){
  X_tbl <- X_tbl|>
    mutate(group=pase_decl_rel_fac)
  
  X_tbl_f <- X_tbl|>
    filter(pase_0>=abs_dif)|>
    select(-starts_with("pase_"))
}

if (pout=="drop"){
  X_tbl <- X_tbl|>
    mutate(group=pase_drop_fac)
  
  print(quantile(as.numeric(X_tbl$pase_0),na.rm=T))
  print(quantile(as.numeric(X_tbl$pase_6),na.rm=T))
  print(summary(X_tbl$pase_0_cut))
  
  X_tbl_f <- X_tbl|>
    filter(pase_0_cut!=1)|>
    select(-starts_with("pase_"))
}

# Excluding one month measures for primary analysis and setting df for table one
X_tbl_f <- X_tbl_f |> 
  select(-c(who5_score_1,
            mdi_1,
            mrs_1,
            starts_with("mfi_"))) # Left out of model as no present in drop-group

# Dropping non-complete for analysis
Xy <- X_tbl_f|> 
  na.omit()|> # Keeping only complete observations
  select(-c(tci) # Left out of model as no present in drop-group
  )|>
  mutate(mrs_0=factor(ifelse(mrs_0==1,1,2))) # Sets binary mRS 0 to include in glmnet, 0 or above

label(Xy) = as.list(var.labels[match(names(Xy), names(var.labels))])

X<-dplyr::select(Xy,-c(group, -starts_with("pase_")) # Exclude primary outcome
                 )
y<-Xy$group


## ====================================================================
# Secondary analysis
## ====================================================================


dta_s<-X_tbl|>
  select(-c(tci),
         -starts_with("pase_"))|> 
  na.omit()|>
  mutate(mrs_0=factor(ifelse(mrs_0==1,1,2)),# Sets binary mRS 0 to include in glmnet, 0 or above
         mrs_1=factor(ifelse(mrs_1==1,1,2)))# Sets binary mRS 1 to include in glmnet, 0 or above

label(dta_s) = as.list(var.labels[match(names(dta_s), names(var.labels))])

## ====================================================================
# Step 8: Loading rest of libraries
## ====================================================================

library(tidyverse)
library(patchwork)
library(caret)
library(glmnet)
library(leaps)
library(pROC)
library(gt)
library(gtsummary)
library(dplyr)