PhysicalActivityandStrokeOu.../1 PA Decline/data_format.R

277 lines
9.5 KiB
R

## ItMLiHSmar2022
## data_format.R, child script
## Data formatting and handling
## Andreas Gammelgaard Damsbo, agdamsbo@clin.au.dk
##
## Now modified to use in publication
##
## ====================================================================
# Step 1: Loading libraries
## ====================================================================
library(Hmisc)
library(dplyr)
library(daDoctoR)
library(tidyselect)
## ====================================================================
# Step 2: Loading data
## ====================================================================
# rm(list = ls()) # Clear
# setwd("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/")
# dta<-read.csv("/Users/au301842/Library/CloudStorage/OneDrive-Personligt/Research/ISLcourse/assigndata.csv")
## ====================================================================
# Step 3: Formatting variables
## ====================================================================
dta <- export %>%
# as_tibble()%>%
mutate(any_rep=factor(ifelse(thrombolysis=="yes"|thrombechtomy=="yes","yes","no")), # If not noted, no therapy was received
male_sex= factor(ifelse(sex=="female","no","yes")),
# smoke_ever=factor(ifelse(smoke_ever=="never","no","yes")),
civil=factor(ifelse(civil=="partner","no","yes")), # Sets "yes" for not-cohabiting
rtreat=factor(ifelse(rtreat=="Placebo","no","yes")), # "Yes" receives active treatment
alc=factor(ifelse(alc=="more","yes","no")), # Yes for more than guideline
pase_0=as.numeric(pase_0),
pase_6=as.numeric(pase_6),
across(c("diabetes",
"hypertension",
"smoker",
# "smoker_prev",
"afli",
"pad",
"ami",
"tci",
"mrs_0",
"mrs_1"),as.factor),
across(c("nihss_c",
"age",
"mdi_1", # For "enriched" analysis
"who5_score_1",
"mfi_gen_1",
"mfi_phys_1",
"mfi_act_1",
"mfi_mot_1",
"mfi_men_1"),as.numeric )
)%>%
select(-c(sex))
## ====================================================================
# Step 4: Defining outcome
## ====================================================================
## Changed to step 7
## This is to perform proper quantile split based on actually included.
## ====================================================================
# Step 5: Ordering variables
## ====================================================================
vars <- c("age",
"male_sex",
"civil",
"pase_0",
"smoker",
"alc",
"afli",
"hypertension",
"diabetes",
"pad",
"ami",
"tci",
"mrs_0",
"nihss_c",
"any_rep",
"rtreat",
"pase_6")
dta<-select(dta,c(vars,
"mrs_1",
"mfi_gen_1",
"mfi_phys_1",
"mfi_act_1",
"mfi_mot_1",
"mfi_men_1",
"mdi_1",
"who5_score_1"
))
## ====================================================================
# Step 6: Labeling
## ====================================================================
var.labels = c(age="Age",
male_sex="Male",
civil="Living alone",
pase_0="Pre-stroke PASE score",
pase_6="Six month PASE score",
smoker="Daily or occasinally smoking",
# smoker_prev="Previous habbit of smoking",
alc="More alcohol than recommendation",
afli="AFIB",
hypertension="Hypertension",
diabetes="Diabetes",
pad="PAD",
ami="Previous MI",
tci="Previous TIA",
mrs_0="Pre-stroke mRS [-1]",
nihss_c="Acute NIHSS score",
thrombolysis="Acute thrombolysis",
thrombechtomy="Acute thrombechtomy",
any_rep="Any reperfusion therapy",
rtreat="Active trial treatment",
mrs_1="One month mRS [-1]",
mfi_gen_1="One month MFI (General fatigue)",
mfi_phys_1="One month MFI (Physical fatigue)",
mfi_act_1="One month MFI (Reduced activity)",
mfi_mot_1="One month MFI (Reduced motivation)",
mfi_men_1="One month MFI (Mental fatigue)",
mdi_1="One month MDI",
who5_score_1="One month WHO5",
pase_decl_rel_fac="PASE score difference, relative F",
pase_decl_abs_fac="PASE score difference, absolute F",
pase_drop_fac="PASE first quartile drop F",
pase_hop_fac="PASE first quartile hop F",
pase_diff="PASE absolute decline",
pase_decl_rel="PASE relative decline",
pase_0_cut="PASE 0 quartiles",
pase_6_cut="PASE 6 quartiles")
## Labelling based on outcome flag
if (pout=="decl_rel"|pout=="decl_abs"){
var.labels = c(var.labels,group="PASE decline")}
if (pout=="drop"){
var.labels = c(var.labels,group="PASE drop")}
## ====================================================================
# Step 7: final data export
## ====================================================================
data_summary<-summary(dta)
# Saving "old" factorised variables
sel<-sapply(dta,is.factor)
# Reformatting factors as 1/2 for analysis
dta<-dta |>
mutate(across(where(is.factor), as.numeric))|> # Turning factors into 1(no) or 2(yes) for model. Numbered alphabetically.
mutate(across(matches(colnames(dta)[sel]), as.factor),
across(starts_with("pase_"), as.numeric))
# Filtering out non-PASE
X_tbl<-dta |>
filter(!is.na(pase_0),!is.na(pase_6))
nrow(X_tbl)
# Defining possible outcome meassures. Keeping in df for characterisation
X_tbl <- X_tbl|>
mutate(## Relative decline
pase_diff=(pase_0-pase_6),
pase_decl_rel = pase_diff/pase_0*100,
# pase_decl_rel_fac=factor(ifelse(pase_decl_rel>=rel_dif,"yes","no")),
## Absolute decline
# pase_decl_abs_fac=factor(ifelse(pase_diff>=abs_dif,"yes","no")),
## Drop
pase_0_cut=quantile_cut(as.numeric(pase_0),
groups=4,
group.names = c(as.character(1:4)),
y=as.numeric(pase_0),
ordered.f = TRUE,
inc.outs = TRUE,
detail.lst=FALSE),
pase_6_cut=quantile_cut(as.numeric(pase_6),
groups=4,
group.names = c(as.character(1:4)),
y=as.numeric(pase_0),
ordered.f = TRUE,
inc.outs = TRUE,
detail.lst=FALSE),
pase_drop_fac=factor(ifelse(pase_6_cut==1&pase_0_cut!=1,"yes","no")),
pase_hop_fac=factor(ifelse(pase_6_cut!=1&pase_0_cut==1,"yes","no")))
Hmisc::label(X_tbl) = as.list(var.labels[match(names(X_tbl), names(var.labels))])
# Setting final primary output from "pout"
if (pout=="drop"){
X_tbl <- X_tbl|>
mutate(group=pase_drop_fac)
# print(quantile(as.numeric(X_tbl$pase_0)))
# print(quantile(as.numeric(X_tbl$pase_6)))
# print(summary(X_tbl$pase_0_cut))
X_tbl_f <- X_tbl|>
filter(pase_0_cut!=1)|>
select(-starts_with("pase_"))
}
if (pout=="hop"){
X_tbl <- X_tbl|>
mutate(group=pase_hop_fac)
# print(quantile(as.numeric(X_tbl$pase_0)))
# print(quantile(as.numeric(X_tbl$pase_6)))
# print(summary(X_tbl$pase_0_cut))
X_tbl_f <- X_tbl|>
filter(pase_6_cut!=1)|>
select(-starts_with("pase_"))
}
# Excluding one month measures for primary analysis and setting df for table one
X_tbl_f <- X_tbl_f |>
select(-c(who5_score_1,
mdi_1,
mrs_1,
starts_with("mfi_"))) # Left out of model as no present in drop-group
# Dropping non-complete for analysis
Xy <- X_tbl_f|>
na.omit()|> # Keeping only complete observations
select(-c(tci) # Left out of model as no present in drop-group
)|>
mutate(mrs_0=factor(ifelse(mrs_0==1,1,2))) # Sets binary mRS 0 to include in glmnet, 0 or above
label(Xy) = as.list(var.labels[match(names(Xy), names(var.labels))])
X<-dplyr::select(Xy,-c(group, -starts_with("pase_")) # Exclude primary outcome
)
y<-Xy$group
## ====================================================================
# Secondary analysis
## ====================================================================
dta_s<-X_tbl|>
select(-c(tci),
-starts_with("pase_"))|>
na.omit()|>
mutate(mrs_0=factor(ifelse(mrs_0==1,1,2)),# Sets binary mRS 0 to include in glmnet, 0 or above
mrs_1=factor(ifelse(mrs_1==1,1,2)))# Sets binary mRS 1 to include in glmnet, 0 or above
label(dta_s) = as.list(var.labels[match(names(dta_s), names(var.labels))])
## ====================================================================
# Step 8: Loading rest of libraries
## ====================================================================
library(tidyverse)
library(patchwork)
library(caret)
library(glmnet)
library(leaps)
library(pROC)
library(gt)
library(gtsummary)
library(dplyr)
library(tidyverse)
library(glue)
# library(ggdendro)
library(corrplot)