291 lines
9.7 KiB
R
291 lines
9.7 KiB
R
## ItMLiHSmar2022
## data_format.R, child script
## Data formatting and handling
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
##
## Now modified for use in publication
##

## ====================================================================
# Step 1: Loading libraries
## ====================================================================

|
|
library(Hmisc)
|
|
library(dplyr)
|
|
library(daDoctoR)
|
|
library(tidyselect)
|
|
|
|
## ====================================================================
# Step 2: Loading data
## ====================================================================

# NOTE(review): no data are loaded here; step 3 reads an object named
# `export`, presumably created by the parent script - confirm. The lines
# below are kept for reference only.
# rm(list = ls()) # Clear
# setwd("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/")
# dta<-read.csv("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/assigndata.csv")
|
|
|
|
## ====================================================================
# Step 3: Formatting variables
## ====================================================================

# Builds `dta` from `export`:
#   1. derives/recodes the binary "no"/"yes" factors,
#   2. coerces the listed categorical columns to factor and the listed
#      score columns to numeric,
#   3. drops the raw `sex` column (superseded by `male_sex`).
dta <- export |>
  # as_tibble() |>
  mutate(
    # If not noted, no therapy was received.
    # NOTE(review): when a therapy column is NA and the other is not "yes",
    # the comparison yields NA, so `any_rep` becomes NA rather than "no" -
    # confirm that missing values are recoded before this script runs.
    any_rep = factor(
      ifelse(thrombolysis == "yes" | thrombechtomy == "yes", "yes", "no")
    ),
    male_sex = factor(ifelse(sex == "female", "no", "yes")),
    # smoke_ever = factor(ifelse(smoke_ever == "never", "no", "yes")),
    # Sets "yes" for not-cohabiting
    civil = factor(ifelse(civil == "partner", "no", "yes")),
    # "yes" receives active treatment
    rtreat = factor(ifelse(rtreat == "Placebo", "no", "yes")),
    # "yes" for more than guideline
    alc = factor(ifelse(alc == "more", "yes", "no")),
    pase_0 = as.numeric(pase_0),
    pase_6 = as.numeric(pase_6)
  ) |>
  mutate(
    # Categorical variables
    across(
      c(
        "diabetes",
        "hypertension",
        "smoker",
        # "smoker_prev",
        "afli",
        "pad",
        "ami",
        "tci",
        "mrs_0",
        "mrs_1"
      ),
      as.factor
    ),
    # Continuous scores
    across(
      c(
        "nihss_c",
        "age",
        "mdi_1", # For "enriched" analysis
        "who5_score_1",
        "mfi_gen_1",
        "mfi_phys_1",
        "mfi_act_1",
        "mfi_mot_1",
        "mfi_men_1"
      ),
      as.numeric
    )
  ) |>
  select(-sex)
|
|
|
|
|
|
## ====================================================================
# Step 4: Defining outcome
## ====================================================================

## Moved to step 7, so that the quantile split is based on the
## observations that are actually included.
|
|
|
|
## ====================================================================
# Step 5: Ordering variables
## ====================================================================

# Baseline characteristics, treatment variables and the six-month PASE
# score, in the order they are kept in `dta`.
vars <- c(
  "age",
  "male_sex",
  "civil",
  "pase_0",
  "smoker",
  "alc",
  "afli",
  "hypertension",
  "diabetes",
  "pad",
  "ami",
  "tci",
  "mrs_0",
  "nihss_c",
  "any_rep",
  "rtreat",
  "pase_6"
)

# Keep `vars` followed by the one-month measures, in that order.
# `all_of()` marks the character vector as external column names: it avoids
# the tidyselect deprecation warning for bare external vectors and cannot be
# confused with a data column that happens to be called `vars`.
dta <- select(
  dta,
  all_of(c(
    vars,
    "mrs_1",
    "mfi_gen_1",
    "mfi_phys_1",
    "mfi_act_1",
    "mfi_mot_1",
    "mfi_men_1",
    "mdi_1",
    "who5_score_1"
  ))
)
|
|
|
|
## ====================================================================
# Step 6: Labeling
## ====================================================================

# Named character vector mapping column names to display labels. It is
# looked up by column name (via `match()`) when labels are attached to the
# data frames in later steps, so it may contain names that are not present
# in every data frame.
var.labels <- c(
  age = "Age",
  male_sex = "Male",
  civil = "Living alone",
  pase_0 = "Pre-stroke PASE score",
  pase_6 = "Six month PASE score",
  smoker = "Daily or occasinally smoking",
  # smoker_prev = "Previous habbit of smoking",
  alc = "More alcohol than recommendation",
  afli = "AFIB",
  hypertension = "Hypertension",
  diabetes = "Diabetes",
  pad = "PAD",
  ami = "Previous MI",
  tci = "Previous TIA",
  mrs_0 = "Pre-stroke mRS [-1]",
  nihss_c = "Acute NIHSS score",
  thrombolysis = "Acute thrombolysis",
  thrombechtomy = "Acute thrombechtomy",
  any_rep = "Any reperfusion therapy",
  rtreat = "Active trial treatment",
  mrs_1 = "One month mRS [-1]",
  mfi_gen_1 = "One month MFI (General fatigue)",
  mfi_phys_1 = "One month MFI (Physical fatigue)",
  mfi_act_1 = "One month MFI (Reduced activity)",
  mfi_mot_1 = "One month MFI (Reduced motivation)",
  mfi_men_1 = "One month MFI (Mental fatigue)",
  mdi_1 = "One month MDI",
  who5_score_1 = "One month WHO5",
  pase_decl_rel_fac = "PASE score difference, relative F",
  pase_decl_abs_fac = "PASE score difference, absolute F",
  pase_drop_fac = "PASE first quartile drop F",
  pase_hop_fac = "PASE first quartile hop F",
  pase_diff = "PASE absolute decline",
  pase_decl_rel = "PASE relative decline",
  pase_0_cut = "PASE 0 quartiles",
  pase_6_cut = "PASE 6 quartiles"
)

## Labelling of the outcome column `group`, based on the outcome flag `pout`.
## `pout` is a single string, so the scalar operator `||` is used.
if (pout == "decl_rel" || pout == "decl_abs") {
  var.labels <- c(var.labels, group = "PASE decline")
}
if (pout == "drop") {
  var.labels <- c(var.labels, group = "PASE drop")
}
# "hop" is a valid outcome flag in step 7 but previously had no label here,
# which left `group` with an NA label in the labelled data frames.
# NOTE(review): label text chosen by analogy with "PASE drop" - confirm.
if (pout == "hop") {
  var.labels <- c(var.labels, group = "PASE hop")
}
|
|
|
|
## ====================================================================
# Step 7: final data export
## ====================================================================

# Summary of the formatted data before any recoding
data_summary <- summary(dta)

# Remembering which columns are factors before they are recoded.
# `vapply()` always returns a named logical vector (one entry per column),
# also for a data frame without columns.
sel <- vapply(dta, is.factor, logical(1))

# Reformatting factors as 1/2 for analysis
dta <- dta |>
  # Factors become their integer level codes: 1 (no) or 2 (yes) for the
  # binary factors, as levels are numbered alphabetically.
  mutate(across(where(is.factor), as.numeric)) |>
  mutate(
    # Back to factor, now with the numeric codes as levels. The columns are
    # selected by exact name with `all_of()`; `matches()` would treat each
    # name as a case-insensitive regular expression and could pick up other
    # columns that merely contain one of the names.
    across(all_of(names(sel)[sel]), as.factor),
    across(starts_with("pase_"), as.numeric)
  )
|
|
|
|
# Filtering out non-PASE: keep rows where both PASE scores are recorded
X_tbl <- dta |>
  filter(!(is.na(pase_0) | is.na(pase_6)))

nrow(X_tbl)

# Defining possible outcome measures. Keeping in df for characterisation.
# `rel_dif` and `abs_dif` are thresholds defined outside this script.
X_tbl <- X_tbl |>
  mutate(
    ## Relative decline (percent of the pre-stroke score)
    pase_diff = pase_0 - pase_6,
    pase_decl_rel = pase_diff / pase_0 * 100,
    pase_decl_rel_fac = factor(ifelse(pase_decl_rel >= rel_dif, "yes", "no")),
    ## Absolute decline
    pase_decl_abs_fac = factor(ifelse(pase_diff >= abs_dif, "yes", "no")),
    ## Drop
    # NOTE(review): both cuts pass `pase_0` as `y`; presumably the group
    # boundaries for `pase_6` are therefore taken from the pre-stroke
    # scores - confirm against the daDoctoR::quantile_cut documentation.
    pase_0_cut = quantile_cut(
      as.numeric(pase_0),
      groups = 4,
      group.names = as.character(1:4),
      y = as.numeric(pase_0),
      ordered.f = TRUE,
      inc.outs = TRUE,
      detail.lst = FALSE
    ),
    pase_6_cut = quantile_cut(
      as.numeric(pase_6),
      groups = 4,
      group.names = as.character(1:4),
      y = as.numeric(pase_0),
      ordered.f = TRUE,
      inc.outs = TRUE,
      detail.lst = FALSE
    ),
    # "yes" when in group 1 at six months but not at baseline
    pase_drop_fac = factor(
      ifelse(pase_6_cut == 1 & pase_0_cut != 1, "yes", "no")
    ),
    # "yes" when in group 1 at baseline but not at six months
    pase_hop_fac = factor(
      ifelse(pase_6_cut != 1 & pase_0_cut == 1, "yes", "no")
    )
  )

# Attach the display labels, looked up by column name
Hmisc::label(X_tbl) <- as.list(
  var.labels[match(names(X_tbl), names(var.labels))]
)
|
|
|
|
# Setting final primary output from "pout".
# For the selected outcome, `group` is added to `X_tbl`, and `X_tbl_f` is
# created as the filtered analysis set with all `pase_*` columns removed.
if (pout == "decl_rel") {
  X_tbl <- X_tbl |>
    mutate(group = pase_decl_rel_fac)

  # A pre-stroke score of 0 is excluded: the relative decline divides by it
  X_tbl_f <- X_tbl |>
    filter(pase_0 != 0) |>
    select(-starts_with("pase_"))
} else if (pout == "decl_abs") {
  # Uses the absolute-decline factor; this branch previously assigned
  # `pase_decl_rel_fac`, a copy-paste error from the branch above.
  X_tbl <- X_tbl |>
    mutate(group = pase_decl_abs_fac)

  # Keeps only pre-stroke scores of at least `abs_dif`
  X_tbl_f <- X_tbl |>
    filter(pase_0 >= abs_dif) |>
    select(-starts_with("pase_"))
} else if (pout == "drop") {
  X_tbl <- X_tbl |>
    mutate(group = pase_drop_fac)

  # print(quantile(as.numeric(X_tbl$pase_0)))
  # print(quantile(as.numeric(X_tbl$pase_6)))
  # print(summary(X_tbl$pase_0_cut))

  # Excludes those already in group 1 before the stroke
  X_tbl_f <- X_tbl |>
    filter(pase_0_cut != 1) |>
    select(-starts_with("pase_"))
} else if (pout == "hop") {
  X_tbl <- X_tbl |>
    mutate(group = pase_hop_fac)

  # print(quantile(as.numeric(X_tbl$pase_0)))
  # print(quantile(as.numeric(X_tbl$pase_6)))
  # print(summary(X_tbl$pase_0_cut))

  # NOTE(review): this filters on the six-month cut, which is part of the
  # outcome definition itself - confirm that `pase_0_cut == 1` was not the
  # intended restriction.
  X_tbl_f <- X_tbl |>
    filter(pase_6_cut != 1) |>
    select(-starts_with("pase_"))
} else {
  # Without this, an unknown flag leaves `X_tbl_f` undefined (or stale from
  # an earlier run) and the script fails or misbehaves further down.
  stop(
    "Unknown value of `pout`: '", pout,
    "'. Expected one of 'decl_rel', 'decl_abs', 'drop' or 'hop'.",
    call. = FALSE
  )
}
|
|
|
|
# Excluding one month measures for primary analysis and setting df for table one
X_tbl_f <- X_tbl_f |>
  select(
    -who5_score_1,
    -mdi_1,
    -mrs_1,
    -starts_with("mfi_")
  )

# Dropping non-complete for analysis.
# Rows are dropped before `tci` is removed, so a missing `tci` still
# excludes the observation.
Xy <- X_tbl_f |>
  na.omit() |> # Keeping only complete observations
  select(-tci) |> # Left out of model as not present in drop-group
  # Sets binary mRS 0 to include in glmnet, 0 or above
  mutate(mrs_0 = factor(ifelse(mrs_0 == 1, 1, 2)))

label(Xy) <- as.list(var.labels[match(names(Xy), names(var.labels))])

# Predictors: everything except the primary outcome. The `pase_*` columns
# were already removed when `X_tbl_f` was created.
X <- dplyr::select(Xy, -group)
# Primary outcome
y <- Xy$group
|
|
|
|
|
|
## ====================================================================
# Secondary analysis
## ====================================================================

# Analysis set that keeps the one-month measures: built from the unfiltered
# `X_tbl` (including `group`), without `tci` and the `pase_*` columns, and
# restricted to complete observations.
dta_s <- X_tbl |>
  select(-tci, -starts_with("pase_")) |>
  na.omit() |>
  mutate(
    # Sets binary mRS 0 to include in glmnet, 0 or above
    mrs_0 = factor(ifelse(mrs_0 == 1, 1, 2)),
    # Sets binary mRS 1 to include in glmnet, 0 or above
    mrs_1 = factor(ifelse(mrs_1 == 1, 1, 2))
  )

label(dta_s) <- as.list(var.labels[match(names(dta_s), names(var.labels))])
|
|
|
|
## ====================================================================
|
|
# Step 8: Loading rest of libraries
|
|
## ====================================================================
|
|
|
|
library(tidyverse)
|
|
library(patchwork)
|
|
library(caret)
|
|
library(glmnet)
|
|
library(leaps)
|
|
library(pROC)
|
|
library(gt)
|
|
library(gtsummary)
|
|
library(dplyr)
|