PhysicalActivityandStrokeOu.../1 PA Decline/data_format.R

## ItMLiHSmar2022
## data_format.R, child script
## Data formatting and handling
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
## 
## Now modified to use in publication
## 

## ====================================================================
# Step 1: Loading libraries
## ====================================================================

library(Hmisc)
library(dplyr)
library(daDoctoR)
library(tidyselect)

## ====================================================================
# Step 2: Loading data
## ====================================================================
 
# rm(list = ls()) # Clear
# setwd("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/") 
# dta<-read.csv("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/assigndata.csv")

## ====================================================================
# Step 3: Formatting variables
## ====================================================================

dta <- export %>% 
  # as_tibble()%>%
  mutate(any_rep=factor(ifelse(thrombolysis=="yes"|thrombechtomy=="yes","yes","no")), # If not noted, no therapy was received
         male_sex= factor(ifelse(sex=="female","no","yes")),
         # smoke_ever=factor(ifelse(smoke_ever=="never","no","yes")),
         civil=factor(ifelse(civil=="partner","no","yes")), # Sets "yes" for not-cohabiting
         rtreat=factor(ifelse(rtreat=="Placebo","no","yes")), # "Yes" receives active treatment
         alc=factor(ifelse(alc=="more","yes","no")), # Yes for more than guideline
         pase_0=as.numeric(pase_0),
         pase_6=as.numeric(pase_6),
         across(c("diabetes",
                  "hypertension",
                  "smoker",
                  # "smoker_prev",
                  "afli",
                  "pad",
                  "ami",
                  "tci",
                  "mrs_0",
                  "mrs_1"),as.factor),
         across(c("nihss_c",
                  "age",
                  "mdi_1", # For "enriched" analysis
                  "who5_score_1",
                  "mfi_gen_1",
                  "mfi_phys_1",
                  "mfi_act_1",
                  "mfi_mot_1",
                  "mfi_men_1"),as.numeric )
         )%>% 
  select(-c(sex))
  
  
## ====================================================================
# Step 4: Defining outcome
## ====================================================================

## Changed to step 7
## This is to perform proper quantile split based on actually included.

## ====================================================================
# Step 5: Ordering variables
## ====================================================================


dta<-select(dta,c(age,
                  male_sex,
                  civil,
                  pase_0,
                  smoker,
                  # smoker_prev,
                  alc,
                  afli,
                  hypertension,
                  diabetes,
                  pad,
                  ami,
                  tci,
                  mrs_0,
                  nihss_c,
                  # thrombolysis,
                  # thrombechtomy,
                  any_rep,
                  rtreat,
                  pase_6, 
                  mrs_1,
                  mfi_gen_1,
                  mfi_phys_1,
                  mfi_act_1,
                  mfi_mot_1,
                  mfi_men_1,
                  mdi_1, 
                  who5_score_1
                  ))

## ====================================================================
# Step 6: Labeling
## ====================================================================

var.labels = c(age="Age",
               male_sex="Male",
               civil="Living alone",
               pase_0="Pre-stroke PASE score",
               pase_6="Six month PASE score",
               smoker="Daily or occasinally smoking",
               # smoker_prev="Previous habbit of smoking",
               alc="More alcohol than recommendation",
               afli="Known AFIB",
               hypertension="Known hypertension",
               diabetes="Known diabetes",
               pad="Known PAD",
               ami="Previous MI",
               tci="Previous TIA",
               mrs_0="Pre-stroke mRS  [-1]",
               nihss_c="Acute NIHSS score",
               thrombolysis="Acute thrombolysis",
               thrombechtomy="Acute thrombechtomy",
               any_rep="Any reperfusion therapy",
               rtreat="Active trial treatment",
               mrs_1="One month mRS [-1]",
               mfi_gen_1="One month MFI (General fatigue)",
               mfi_phys_1="One month MFI (Physical fatigue)",
               mfi_act_1="One month MFI (Reduced activity)",
               mfi_mot_1="One month MFI (Reduced motivation)",
               mfi_men_1="One month MFI (Mental fatigue)",
               mdi_1="One month MDI",
               who5_score_1="One month WHO5",
               pase_decl_rel_fac="PASE score difference, relative F",
               pase_decl_abs_fac="PASE score difference, absolute F",
               pase_drop_fac="PASE first quartile drop F",
               pase_diff="PASE absolute decline",
               pase_decl_rel="PASE relative decline",
               pase_0_cut="PASE 0 quartiles",
               pase_6_cut="PASE 6 quartiles")

## Labelling based on outcome flag
if (pout=="decl_rel"|pout=="decl_abs"){
  var.labels = c(var.labels,group="PASE decline")}
if (pout=="drop"){
  var.labels = c(var.labels,group="PASE drop")}

## ====================================================================
# Step 7: final data export
## ====================================================================

data_summary<-summary(dta)

# Saving "old" factorised variables
sel<-sapply(dta,is.factor)
# Reformatting factors as 1/2 for analysis
dta<-dta |> 
  mutate(across(where(is.factor), as.numeric))|> # Turning factors into 1(no) or 2(yes) for model. Numbered alphabetically.
  mutate(across(matches(colnames(dta)[sel]), as.factor),
         across(starts_with("pase_"), as.numeric))

# Filtering out non-PASE
X_tbl<-dta |>
  filter(!is.na(pase_0),!is.na(pase_6))

nrow(X_tbl)

# Defining possible outcome meassures. Keeping in df for characterisation
X_tbl <- X_tbl|>
  mutate(## Relative decline
         pase_diff=(pase_0-pase_6),
         pase_decl_rel = pase_diff/pase_0*100,
         pase_decl_rel_fac=factor(ifelse(pase_decl_rel>=rel_dif,"yes","no")),
         ## Absolute decline
         pase_decl_abs_fac=factor(ifelse(pase_diff>=abs_dif,"yes","no")),
         ## Drop
         pase_0_cut=quantile_cut(as.numeric(pase_0),
                                 groups=4,
                                 group.names = c(as.character(1:4)),
                                 y=as.numeric(pase_0),
                                 ordered.f = TRUE,
                                 inc.outs = TRUE, 
                                 detail.lst=FALSE),
         pase_6_cut=quantile_cut(as.numeric(pase_6),
                                 groups=4,
                                 group.names = c(as.character(1:4)),
                                 y=as.numeric(pase_0),
                                 ordered.f = TRUE,
                                 inc.outs = TRUE, 
                                 detail.lst=FALSE),
         pase_drop_fac=factor(ifelse(pase_6_cut==1&pase_0_cut!=1,"yes","no")))

label(X_tbl) = as.list(var.labels[match(names(X_tbl), names(var.labels))])

# Setting final primary output from "pout"
if (pout=="decl_rel"){
  X_tbl <- X_tbl|>
    mutate(group=pase_decl_rel_fac)
  
  X_tbl_f <- X_tbl|>
    filter(pase_0!=0)|>
    select(-starts_with("pase_"))
}

if (pout=="decl_abs"){
  X_tbl <- X_tbl|>
    mutate(group=pase_decl_rel_fac)
  
  X_tbl_f <- X_tbl|>
    filter(pase_0>=abs_dif)|>
    select(-starts_with("pase_"))
}

if (pout=="drop"){
  X_tbl <- X_tbl|>
    mutate(group=pase_drop_fac)
  
  print(quantile(as.numeric(X_tbl$pase_0),na.rm=T))
  print(quantile(as.numeric(X_tbl$pase_6),na.rm=T))
  print(summary(X_tbl$pase_0_cut))
  
  X_tbl_f <- X_tbl|>
    filter(pase_0_cut!=1)|>
    select(-starts_with("pase_"))
}

# Excluding one month measures for primary analysis and setting df for table one
X_tbl_f <- X_tbl_f |> 
  select(-c(who5_score_1,
            mdi_1,
            mrs_1,
            starts_with("mfi_"))) # Left out of model as no present in drop-group

# Dropping non-complete for analysis
Xy <- X_tbl_f|> 
  na.omit()|> # Keeping only complete observations
  select(-c(tci) # Left out of model as no present in drop-group
  )|>
  mutate(mrs_0=factor(ifelse(mrs_0==1,1,2))) # Sets binary mRS 0 to include in glmnet, 0 or above

label(Xy) = as.list(var.labels[match(names(Xy), names(var.labels))])

X<-dplyr::select(Xy,-c(group, -starts_with("pase_")) # Exclude primary outcome
                 )
y<-Xy$group


## ====================================================================
# Secondary analysis
## ====================================================================


dta_s<-X_tbl|>
  select(-c(tci),
         -starts_with("pase_"))|> 
  na.omit()|>
  mutate(mrs_0=factor(ifelse(mrs_0==1,1,2)),# Sets binary mRS 0 to include in glmnet, 0 or above
         mrs_1=factor(ifelse(mrs_1==1,1,2)))# Sets binary mRS 1 to include in glmnet, 0 or above

label(dta_s) = as.list(var.labels[match(names(dta_s), names(var.labels))])

## ====================================================================
# Step 8: Loading rest of libraries
## ====================================================================

library(tidyverse)
library(patchwork)
library(caret)
library(glmnet)
library(leaps)
library(pROC)
library(gt)
library(gtsummary)
library(dplyr)
updated analysis and much more 2022-09-28 16:03:58 +02:00			`## ItMLiHSmar2022`
			`## data_format.R, child script`
			`## Data formatting and handling`
			`## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk`
			`##`
			`## Now modified to use in publication`
			`##`

			`## ====================================================================`
			`# Step 1: Loading libraries`
			`## ====================================================================`

			`library(Hmisc)`
			`library(dplyr)`
			`library(daDoctoR)`
			`library(tidyselect)`

			`## ====================================================================`
			`# Step 2: Loading data`
			`## ====================================================================`

			`# rm(list = ls()) # Clear`
			`# setwd("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/")`
			`# dta<-read.csv("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/assigndata.csv")`

			`## ====================================================================`
			`# Step 3: Formatting variables`
			`## ====================================================================`

			`dta <- export %>%`
			`# as_tibble()%>%`
			`mutate(any_rep=factor(ifelse(thrombolysis=="yes"\|thrombechtomy=="yes","yes","no")), # If not noted, no therapy was received`
			`male_sex= factor(ifelse(sex=="female","no","yes")),`
			`# smoke_ever=factor(ifelse(smoke_ever=="never","no","yes")),`
			`civil=factor(ifelse(civil=="partner","no","yes")), # Sets "yes" for not-cohabiting`
			`rtreat=factor(ifelse(rtreat=="Placebo","no","yes")), # "Yes" receives active treatment`
			`alc=factor(ifelse(alc=="more","yes","no")), # Yes for more than guideline`
			`pase_0=as.numeric(pase_0),`
			`pase_6=as.numeric(pase_6),`
			`across(c("diabetes",`
			`"hypertension",`
			`"smoker",`
			`# "smoker_prev",`
			`"afli",`
			`"pad",`
			`"ami",`
			`"tci",`
			`"mrs_0",`
			`"mrs_1"),as.factor),`
			`across(c("nihss_c",`
			`"age",`
			`"mdi_1", # For "enriched" analysis`
			`"who5_score_1",`
			`"mfi_gen_1",`
			`"mfi_phys_1",`
			`"mfi_act_1",`
			`"mfi_mot_1",`
			`"mfi_men_1"),as.numeric )`
			`)%>%`
			`select(-c(sex))`


			`## ====================================================================`
			`# Step 4: Defining outcome`
			`## ====================================================================`

			`## Changed to step 7`
			`## This is to perform proper quantile split based on actually included.`

			`## ====================================================================`
			`# Step 5: Ordering variables`
			`## ====================================================================`


			`dta<-select(dta,c(age,`
			`male_sex,`
			`civil,`
			`pase_0,`
			`smoker,`
			`# smoker_prev,`
			`alc,`
			`afli,`
			`hypertension,`
			`diabetes,`
			`pad,`
			`ami,`
			`tci,`
			`mrs_0,`
			`nihss_c,`
			`# thrombolysis,`
			`# thrombechtomy,`
			`any_rep,`
			`rtreat,`
			`pase_6,`
			`mrs_1,`
			`mfi_gen_1,`
			`mfi_phys_1,`
			`mfi_act_1,`
			`mfi_mot_1,`
			`mfi_men_1,`
			`mdi_1,`
			`who5_score_1`
			`))`

			`## ====================================================================`
			`# Step 6: Labeling`
			`## ====================================================================`

			`var.labels = c(age="Age",`
			`male_sex="Male",`
			`civil="Living alone",`
			`pase_0="Pre-stroke PASE score",`
			`pase_6="Six month PASE score",`
			`smoker="Daily or occasinally smoking",`
			`# smoker_prev="Previous habbit of smoking",`
			`alc="More alcohol than recommendation",`
			`afli="Known AFIB",`
			`hypertension="Known hypertension",`
			`diabetes="Known diabetes",`
			`pad="Known PAD",`
			`ami="Previous MI",`
			`tci="Previous TIA",`
			`mrs_0="Pre-stroke mRS [-1]",`
			`nihss_c="Acute NIHSS score",`
			`thrombolysis="Acute thrombolysis",`
			`thrombechtomy="Acute thrombechtomy",`
			`any_rep="Any reperfusion therapy",`
			`rtreat="Active trial treatment",`
			`mrs_1="One month mRS [-1]",`
			`mfi_gen_1="One month MFI (General fatigue)",`
			`mfi_phys_1="One month MFI (Physical fatigue)",`
			`mfi_act_1="One month MFI (Reduced activity)",`
			`mfi_mot_1="One month MFI (Reduced motivation)",`
			`mfi_men_1="One month MFI (Mental fatigue)",`
			`mdi_1="One month MDI",`
			`who5_score_1="One month WHO5",`
			`pase_decl_rel_fac="PASE score difference, relative F",`
			`pase_decl_abs_fac="PASE score difference, absolute F",`
			`pase_drop_fac="PASE first quartile drop F",`
			`pase_diff="PASE absolute decline",`
			`pase_decl_rel="PASE relative decline",`
			`pase_0_cut="PASE 0 quartiles",`
			`pase_6_cut="PASE 6 quartiles")`

			`## Labelling based on outcome flag`
			`if (pout=="decl_rel"\|pout=="decl_abs"){`
			`var.labels = c(var.labels,group="PASE decline")}`
			`if (pout=="drop"){`
			`var.labels = c(var.labels,group="PASE drop")}`

			`## ====================================================================`
			`# Step 7: final data export`
			`## ====================================================================`

			`data_summary<-summary(dta)`

			`# Saving "old" factorised variables`
			`sel<-sapply(dta,is.factor)`
			`# Reformatting factors as 1/2 for analysis`
			`dta<-dta \|>`
			`mutate(across(where(is.factor), as.numeric))\|> # Turning factors into 1(no) or 2(yes) for model. Numbered alphabetically.`
			`mutate(across(matches(colnames(dta)[sel]), as.factor),`
			`across(starts_with("pase_"), as.numeric))`

			`# Filtering out non-PASE`
			`X_tbl<-dta \|>`
			`filter(!is.na(pase_0),!is.na(pase_6))`

			`nrow(X_tbl)`

			`# Defining possible outcome meassures. Keeping in df for characterisation`
			`X_tbl <- X_tbl\|>`
			`mutate(## Relative decline`
			`pase_diff=(pase_0-pase_6),`
			`pase_decl_rel = pase_diff/pase_0*100,`
			`pase_decl_rel_fac=factor(ifelse(pase_decl_rel>=rel_dif,"yes","no")),`
			`## Absolute decline`
			`pase_decl_abs_fac=factor(ifelse(pase_diff>=abs_dif,"yes","no")),`
			`## Drop`
			`pase_0_cut=quantile_cut(as.numeric(pase_0),`
			`groups=4,`
			`group.names = c(as.character(1:4)),`
			`y=as.numeric(pase_0),`
			`ordered.f = TRUE,`
			`inc.outs = TRUE,`
			`detail.lst=FALSE),`
			`pase_6_cut=quantile_cut(as.numeric(pase_6),`
			`groups=4,`
			`group.names = c(as.character(1:4)),`
			`y=as.numeric(pase_0),`
			`ordered.f = TRUE,`
			`inc.outs = TRUE,`
			`detail.lst=FALSE),`
			`pase_drop_fac=factor(ifelse(pase_6_cut==1&pase_0_cut!=1,"yes","no")))`

			`label(X_tbl) = as.list(var.labels[match(names(X_tbl), names(var.labels))])`

			`# Setting final primary output from "pout"`
			`if (pout=="decl_rel"){`
			`X_tbl <- X_tbl\|>`
			`mutate(group=pase_decl_rel_fac)`

			`X_tbl_f <- X_tbl\|>`
			`filter(pase_0!=0)\|>`
			`select(-starts_with("pase_"))`
			`}`

			`if (pout=="decl_abs"){`
			`X_tbl <- X_tbl\|>`
			`mutate(group=pase_decl_rel_fac)`

			`X_tbl_f <- X_tbl\|>`
			`filter(pase_0>=abs_dif)\|>`
			`select(-starts_with("pase_"))`
			`}`

			`if (pout=="drop"){`
			`X_tbl <- X_tbl\|>`
			`mutate(group=pase_drop_fac)`

			`print(quantile(as.numeric(X_tbl$pase_0),na.rm=T))`
			`print(quantile(as.numeric(X_tbl$pase_6),na.rm=T))`
			`print(summary(X_tbl$pase_0_cut))`

			`X_tbl_f <- X_tbl\|>`
			`filter(pase_0_cut!=1)\|>`
			`select(-starts_with("pase_"))`
			`}`

			`# Excluding one month measures for primary analysis and setting df for table one`
			`X_tbl_f <- X_tbl_f \|>`
			`select(-c(who5_score_1,`
			`mdi_1,`
			`mrs_1,`
			`starts_with("mfi_"))) # Left out of model as no present in drop-group`

			`# Dropping non-complete for analysis`
			`Xy <- X_tbl_f\|>`
			`na.omit()\|> # Keeping only complete observations`
			`select(-c(tci) # Left out of model as no present in drop-group`
			`)\|>`
			`mutate(mrs_0=factor(ifelse(mrs_0==1,1,2))) # Sets binary mRS 0 to include in glmnet, 0 or above`

			`label(Xy) = as.list(var.labels[match(names(Xy), names(var.labels))])`

			`X<-dplyr::select(Xy,-c(group, -starts_with("pase_")) # Exclude primary outcome`
			`)`
			`y<-Xy$group`


			`## ====================================================================`
			`# Secondary analysis`
			`## ====================================================================`


			`dta_s<-X_tbl\|>`
			`select(-c(tci),`
			`-starts_with("pase_"))\|>`
			`na.omit()\|>`
			`mutate(mrs_0=factor(ifelse(mrs_0==1,1,2)),# Sets binary mRS 0 to include in glmnet, 0 or above`
			`mrs_1=factor(ifelse(mrs_1==1,1,2)))# Sets binary mRS 1 to include in glmnet, 0 or above`

			`label(dta_s) = as.list(var.labels[match(names(dta_s), names(var.labels))])`

			`## ====================================================================`
			`# Step 8: Loading rest of libraries`
			`## ====================================================================`

			`library(tidyverse)`
			`library(patchwork)`
			`library(caret)`
			`library(glmnet)`
			`library(leaps)`
			`library(pROC)`
			`library(gt)`
			`library(gtsummary)`
			`library(dplyr)`