## ItMLiHSmar2022 ## data_format.R, child script ## Data formatting and handling ## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk ## ## Now modified to use in publication ## ## ==================================================================== # Step 1: Loading libraries ## ==================================================================== library(Hmisc) library(dplyr) library(daDoctoR) library(tidyselect) ## ==================================================================== # Step 2: Loading data ## ==================================================================== # rm(list = ls()) # Clear # setwd("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/") # dta<-read.csv("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/assigndata.csv") ## ==================================================================== # Step 3: Formatting variables ## ==================================================================== dta <- export %>% # as_tibble()%>% mutate(any_rep=factor(ifelse(thrombolysis=="yes"|thrombechtomy=="yes","yes","no")), # If not noted, no therapy was received male_sex= factor(ifelse(sex=="female","no","yes")), # smoke_ever=factor(ifelse(smoke_ever=="never","no","yes")), civil=factor(ifelse(civil=="partner","no","yes")), # Sets "yes" for not-cohabiting rtreat=factor(ifelse(rtreat=="Placebo","no","yes")), # "Yes" receives active treatment alc=factor(ifelse(alc=="more","yes","no")), # Yes for more than guideline pase_0=as.numeric(pase_0), pase_6=as.numeric(pase_6), across(c("diabetes", "hypertension", "smoker", # "smoker_prev", "afli", "pad", "ami", "tci", "mrs_0", "mrs_1"),as.factor), across(c("nihss_c", "age", "mdi_1", # For "enriched" analysis "who5_score_1", "mfi_gen_1", "mfi_phys_1", "mfi_act_1", "mfi_mot_1", "mfi_men_1"),as.numeric ) )%>% select(-c(sex)) ## ==================================================================== # Step 4: Defining outcome ## ==================================================================== ## Changed to step 7 ## This is to perform proper quantile split based on actually included. ## ==================================================================== # Step 5: Ordering variables ## ==================================================================== dta<-select(dta,c(age, male_sex, civil, pase_0, smoker, # smoker_prev, alc, afli, hypertension, diabetes, pad, ami, tci, mrs_0, nihss_c, # thrombolysis, # thrombechtomy, any_rep, rtreat, pase_6, mrs_1, mfi_gen_1, mfi_phys_1, mfi_act_1, mfi_mot_1, mfi_men_1, mdi_1, who5_score_1 )) ## ==================================================================== # Step 6: Labeling ## ==================================================================== var.labels = c(age="Age", male_sex="Male", civil="Living alone", pase_0="Pre-stroke PASE score", pase_6="Six month PASE score", smoker="Daily or occasinally smoking", # smoker_prev="Previous habbit of smoking", alc="More alcohol than recommendation", afli="Known AFIB", hypertension="Known hypertension", diabetes="Known diabetes", pad="Known PAD", ami="Previous MI", tci="Previous TIA", mrs_0="Pre-stroke mRS [-1]", nihss_c="Acute NIHSS score", thrombolysis="Acute thrombolysis", thrombechtomy="Acute thrombechtomy", any_rep="Any reperfusion therapy", rtreat="Active trial treatment", mrs_1="One month mRS [-1]", mfi_gen_1="One month MFI (General fatigue)", mfi_phys_1="One month MFI (Physical fatigue)", mfi_act_1="One month MFI (Reduced activity)", mfi_mot_1="One month MFI (Reduced motivation)", mfi_men_1="One month MFI (Mental fatigue)", mdi_1="One month MDI", who5_score_1="One month WHO5", pase_decl_rel_fac="PASE score difference, relative F", pase_decl_abs_fac="PASE score difference, absolute F", pase_drop_fac="PASE first quartile drop F", pase_diff="PASE absolute decline", pase_decl_rel="PASE relative decline", pase_0_cut="PASE 0 quartiles", pase_6_cut="PASE 6 quartiles") ## Labelling based on outcome flag if (pout=="decl_rel"|pout=="decl_abs"){ var.labels = c(var.labels,group="PASE decline")} if (pout=="drop"){ var.labels = c(var.labels,group="PASE drop")} ## ==================================================================== # Step 7: final data export ## ==================================================================== data_summary<-summary(dta) # Saving "old" factorised variables sel<-sapply(dta,is.factor) # Reformatting factors as 1/2 for analysis dta<-dta |> mutate(across(where(is.factor), as.numeric))|> # Turning factors into 1(no) or 2(yes) for model. Numbered alphabetically. mutate(across(matches(colnames(dta)[sel]), as.factor), across(starts_with("pase_"), as.numeric)) # Filtering out non-PASE X_tbl<-dta |> filter(!is.na(pase_0),!is.na(pase_6)) nrow(X_tbl) # Defining possible outcome meassures. Keeping in df for characterisation X_tbl <- X_tbl|> mutate(## Relative decline pase_diff=(pase_0-pase_6), pase_decl_rel = pase_diff/pase_0*100, pase_decl_rel_fac=factor(ifelse(pase_decl_rel>=rel_dif,"yes","no")), ## Absolute decline pase_decl_abs_fac=factor(ifelse(pase_diff>=abs_dif,"yes","no")), ## Drop pase_0_cut=quantile_cut(as.numeric(pase_0), groups=4, group.names = c(as.character(1:4)), y=as.numeric(pase_0), ordered.f = TRUE, inc.outs = TRUE, detail.lst=FALSE), pase_6_cut=quantile_cut(as.numeric(pase_6), groups=4, group.names = c(as.character(1:4)), y=as.numeric(pase_0), ordered.f = TRUE, inc.outs = TRUE, detail.lst=FALSE), pase_drop_fac=factor(ifelse(pase_6_cut==1&pase_0_cut!=1,"yes","no"))) label(X_tbl) = as.list(var.labels[match(names(X_tbl), names(var.labels))]) # Setting final primary output from "pout" if (pout=="decl_rel"){ X_tbl <- X_tbl|> mutate(group=pase_decl_rel_fac) X_tbl_f <- X_tbl|> filter(pase_0!=0)|> select(-starts_with("pase_")) } if (pout=="decl_abs"){ X_tbl <- X_tbl|> mutate(group=pase_decl_rel_fac) X_tbl_f <- X_tbl|> filter(pase_0>=abs_dif)|> select(-starts_with("pase_")) } if (pout=="drop"){ X_tbl <- X_tbl|> mutate(group=pase_drop_fac) print(quantile(as.numeric(X_tbl$pase_0),na.rm=T)) print(quantile(as.numeric(X_tbl$pase_6),na.rm=T)) print(summary(X_tbl$pase_0_cut)) X_tbl_f <- X_tbl|> filter(pase_0_cut!=1)|> select(-starts_with("pase_")) } # Excluding one month measures for primary analysis and setting df for table one X_tbl_f <- X_tbl_f |> select(-c(who5_score_1, mdi_1, mrs_1, starts_with("mfi_"))) # Left out of model as no present in drop-group # Dropping non-complete for analysis Xy <- X_tbl_f|> na.omit()|> # Keeping only complete observations select(-c(tci) # Left out of model as no present in drop-group )|> mutate(mrs_0=factor(ifelse(mrs_0==1,1,2))) # Sets binary mRS 0 to include in glmnet, 0 or above label(Xy) = as.list(var.labels[match(names(Xy), names(var.labels))]) X<-dplyr::select(Xy,-c(group, -starts_with("pase_")) # Exclude primary outcome ) y<-Xy$group ## ==================================================================== # Secondary analysis ## ==================================================================== dta_s<-X_tbl|> select(-c(tci), -starts_with("pase_"))|> na.omit()|> mutate(mrs_0=factor(ifelse(mrs_0==1,1,2)),# Sets binary mRS 0 to include in glmnet, 0 or above mrs_1=factor(ifelse(mrs_1==1,1,2)))# Sets binary mRS 1 to include in glmnet, 0 or above label(dta_s) = as.list(var.labels[match(names(dta_s), names(var.labels))]) ## ==================================================================== # Step 8: Loading rest of libraries ## ==================================================================== library(tidyverse) library(patchwork) library(caret) library(glmnet) library(leaps) library(pROC) library(gt) library(gtsummary) library(dplyr)