305 lines
11 KiB
Plaintext
305 lines
11 KiB
Plaintext
---
|
|
title: "Data export and wrangling"
|
|
author: "Andreas Gammelgaard Damsbo"
|
|
date: "Knitted: `r format(Sys.time(), '%d %B, %Y')`"
|
|
output:
|
|
pdf_document: default
|
|
html_document: default
|
|
toc: TRUE
|
|
---
|
|
|
|
```{r setup, include=FALSE}
|
|
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
|
|
```
|
|
|
|
```{r message=FALSE}
|
|
library(haven)
|
|
library(plyr)
|
|
library(dplyr)
|
|
library(reshape2)
|
|
```
|
|
|
|
```{r}
|
|
dta<-read.csv("/Volumes/Data/exercise/source/background.csv",
|
|
na.strings = c("NA",""),colClasses = "character")
|
|
# dta_b<-dta
|
|
```
|
|
|
|
# Variables
|
|
List of variables included in dataset
|
|
|
|
```{r}
|
|
dput(names(dta))
|
|
```
|
|
|
|
## New additions and formatting
|
|
|
|
```{r}
|
|
dta$mors_delay<-difftime(as.Date(dta$mors_d),as.Date(dta$rdate),units = "days")
|
|
dta$mors_v1<-factor(ifelse(dta$mors_delay<=38&
|
|
(dta$mors_delay-as.numeric(dta$inc_time))<=1,
|
|
"yes","no"))
|
|
# Tæller som død hvis død inden 38 dage og dødsdato og EOS ligger indenfor 1 døgn.
|
|
dta$mors_v1[is.na(dta$mors_v1)]<-"no"
|
|
```
|
|
|
|
```{r}
|
|
dta$mors_v16<-factor(ifelse(dta$mors_v1=="no"&
|
|
(dta$mors_delay-as.numeric(dta$inc_time))<=1,
|
|
"yes","no"))
|
|
# Tæller som død mellem 1 til 6 mdr, hvis ikke død inden 1 mdr,
|
|
# og dødsdato og EOS ligger indenfor 1 døgn.
|
|
dta$mors_v16[is.na(dta$mors_v16)]<-"no"
|
|
```
|
|
|
|
PASE score dichotomisation at median score.
|
|
```{r}
|
|
dta$pase_0<-as.numeric(dta$pase_0)
|
|
dta$pase_0_bin<-cut(dta$pase_0,
|
|
c(min(dta$pase_0,na.rm = T),median(dta$pase_0,na.rm = T),
|
|
max(dta$pase_0,na.rm = T)),include.lowest = T,
|
|
labels = c("lower","higher"))
|
|
quantile(dta$pase_0,na.rm = T)
|
|
```
|
|
|
|
### Formatting
|
|
|
|
```{r}
|
|
dta$inc_time<-as.numeric(dta$inc_time)
|
|
```
|
|
|
|
# Cleaning MDI scores
|
|
|
|
The following contains a serious bit of data wrangling. Reasons are the occasional recording of visit 1 data at 6 months due to LOCF approach. Additionally some patients have data recorded at 6 months, but later end date has been defined as prior to the visit 6.
|
|
Additionally the definition of when to define a MDI recording as 1 month or 6 months have added a bit of extra work..
|
|
|
|
This work should be applied for all endpoint data. If needed, a general script or function should be written.
|
|
|
|
Steps used for the correction:
|
|
|
|
1. If the inc_time is 38 days or less MDI 6 scores are moved to MDI 1 and visit 6 is defined as visit 1.
|
|
2. If both visit 1 and 6 dates are NA, use enddate as visit 1 date. This is the case if patients were excluded early.
|
|
3. If visit 6 is recorded later than enddate, use enddate instead. MDI 6 score is dropped.
|
|
4. If visit delay is 7 days or less, and inclusion time is more than 38, MDI 1 is moved to MDI 6 and dropped. If MDI 1 and 6 are different both are kept. Enddate is moved to visit 6 date.
|
|
5. Defining the visit 6 date as same as enddate if visit delay is <7.
|
|
|
|
|
|
```{r}
|
|
summary(inc196<-dta$inc_time>196)
|
|
dt1<-dta[inc196,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6","mors_delay")]
|
|
|
|
```
|
|
|
|
|
|
|
|
## Step 1
|
|
```{r}
|
|
summary(inc38<-dta$inc_time<=38)
|
|
dt1<-dta[inc38,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6")]
|
|
dta$visit_1<-ifelse(inc38&!is.na(dta$visit_6),dta$visit_6,dta$visit_1)
|
|
dta$mdi_1<-ifelse(inc38&is.na(dta$mdi_1),dta$mdi_6,dta$mdi_1)
|
|
dta$mdi_6[inc38]<-NA
|
|
dta$visit_6[inc38]<-NA
|
|
# If the inc_time is 38 days or less MDI 6 scores are moved to MDI 1 and visit 6 is defined as visit 1.
|
|
# LOCF correction.
|
|
```
|
|
|
|
## Step 2
|
|
```{r}
|
|
summary(na16enddate<-is.na(dta$visit_1)&is.na(dta$visit_6))
|
|
dt2<-dta[na16enddate,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6")]
|
|
dta$visit_1<-ifelse(na16enddate,dta$enddate,dta$visit_1)
|
|
# If both visit 1 and 6 dates are NA, use enddate as visit 1 date. This is the case if patients were excluded early.
|
|
```
|
|
|
|
## Step 3
|
|
```{r}
|
|
summary(late61<-as.Date(dta$visit_6)>as.Date(dta$enddate)&difftime(as.Date(dta$visit_6),as.Date(dta$enddate),units = "days")<=1)
|
|
summary(late62<-as.Date(dta$visit_6)>as.Date(dta$enddate)&difftime(as.Date(dta$visit_6),as.Date(dta$enddate),units = "days")>1)
|
|
|
|
late61[is.na(late61)]<-FALSE
|
|
late62[is.na(late62)]<-FALSE
|
|
|
|
# dt5<-dta[late61,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6")]
|
|
# dt6<-dta[late62,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6")]
|
|
|
|
dta$visit_6<-ifelse(late61,dta$enddate,dta$visit_6)
|
|
dta$visit_6<-ifelse(late62,dta$enddate,dta$visit_6)
|
|
dta$mdi_6[late62]<-NA
|
|
# If visit 6 is recorded later than enddate, use enddate instead
|
|
# A group of patients have visit 6 and MDI 6 recorded, but enddate is before visit 6 data.
|
|
# After manual lookups, this is likely due to some patients coming for visit 6, but the
|
|
# interviewer later realizing, that the patients should have been excluded earlier on.
|
|
# Due to this, patients with enddate more than 1 day (leaving room for simple recording errors) prior to visit 6 date, MDI 6 will be dropped.
|
|
# Patients with 1 day difference the enddate is moved to visit 6 date.
|
|
```
|
|
|
|
## Step 4
|
|
|
|
```{r}
|
|
summary(locflate<-(difftime(as.Date(dta$visit_6),as.Date(dta$visit_1))<=7|is.na(dta$visit_1))&dta$inc_time>38)
|
|
locflate[is.na(locflate)]<-FALSE
|
|
|
|
dt2<-dta[locflate,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6")]
|
|
|
|
dta$mdi_6<-ifelse(locflate&is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6)
|
|
|
|
dta$mdi_1[locflate&dta$mdi_1==dta$mdi_6]<-NA
|
|
dta$visit_1[locflate&is.na(dta$mdi_1)]<-NA
|
|
|
|
dta$visit_6<-ifelse(locflate,dta$enddate,dta$visit_6)
|
|
|
|
# If visit delay is 7 days or less, and inclusion time is more than 38, MDI 1 is moved to MDI 6 and dropped. If MDI 1 and 6 are different both are kept. Enddate is moved to visit 6 date.
|
|
```
|
|
|
|
## Step 5
|
|
```{r}
|
|
summary(same1n6date<-difftime(as.Date(dta$visit_6),as.Date(dta$visit_1),units = "days")<7)
|
|
same1n6date[is.na(same1n6date)]<-FALSE
|
|
# dt5<-dta[same1n6date,c("rnumb","rdate","visit_1","visit_6","enddate","inc_time","mdi_1","mdi_6",drops)]
|
|
dta$visit_6<-ifelse(same1n6date,dta$enddate,dta$visit_6)
|
|
# Defining the visit 6 date as same as enddate if visit delay is <7.
|
|
```
|
|
|
|
|
|
# Visit delay
|
|
```{r}
|
|
dta$visit_delay<-difftime(as.Date(dta$visit_6),as.Date(dta$visit_1),units = "days")
|
|
# Final calculation of days between visits
|
|
summary(as.numeric(dta$visit_delay))
|
|
```
|
|
|
|
# newobs definition - DEPRECATED
|
|
|
|
The definition of a truly new observation is a recorded score at least 7 days after the first score. This was relevant prior to the work of redefining time points for scoring.
|
|
```{r}
|
|
dta$mdi_6_newobs<-dta$mdi_6
|
|
# The newobs variable is later used, but is obsolete due to the previous change in definitions. The previous definition of newobs were a delay between visits of at least 7 days. No cases matched. Minimum is 13.
|
|
```
|
|
|
|
|
|
```{r}
|
|
# dta$mdi_6_166<-ifelse(dta$inc_time>166,NA,dta$mdi_6)
|
|
# dta$mdi_6_80<-ifelse(dta$inc_time>80,NA,dta$mdi_6)
|
|
# dta$mdi_6_protocol<-ifelse(dta$protocol=="2",NA,ifelse(is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6))
|
|
# dta$mdi_6_locf<-ifelse(is.na(dta$mdi_6),dta$mdi_1,dta$mdi_6)
|
|
```
|
|
|
|
|
|
# Drops
|
|
Streamlining drop out data to avoid NA's.
|
|
```{r}
|
|
drops<-c("side_effect2","side_effect","wants_out","open_treat")
|
|
for (i in drops) {
|
|
dta[i]<-ifelse(dta[i]=="1. Ja","yes","no")
|
|
dta[i][is.na(dta[i])]<-"no"
|
|
}
|
|
```
|
|
|
|
Defining a common all cause drop variable
|
|
```{r}
|
|
dta$drop<-ifelse(dta$side_effect2=="yes"|dta$side_effect=="yes"|dta$wants_out=="yes"|dta$open_treat=="yes","yes","no")
|
|
```
|
|
|
|
Defining drop before or at day 38 (Following protocol design) as drop before 1 month and drop after day 38 as drop between 1 and 6 months
|
|
```{r}
|
|
cut_line<-38
|
|
dta$inc_time<-as.numeric(dta$inc_time)
|
|
dta$drop1<-ifelse(dta$drop=="yes"&dta$inc_time<=cut_line,"yes","no")
|
|
summary(factor(dta$drop1))
|
|
|
|
# dt3<-dta[,c("rnumb","rdate","visit_1","visit_6","inc_time","mdi_1","mdi_6","mdi_6_newobs","drop1","drop16",drops)]
|
|
|
|
dta$drop16<-ifelse(dta$drop=="yes"&dta$inc_time>cut_line,"yes","no")
|
|
summary(factor(dta$drop16))
|
|
summary(factor(dta$drop))
|
|
# dtf<-dta[dta$drop1=="yes",c("mdi_6_newobs","inc_time")]
|
|
# dtf<-dta[,c("mdi_1","mdi_6_newobs","inc_time","drop","drop1","drop16")]
|
|
```
|
|
|
|
|
|
# Enriching
|
|
With patients excluded due to open treatment need and defining populations to include/exclude
|
|
```{r}
|
|
summary(sel_enr_1<-dta$open_treat=="yes"&is.na(dta$mdi_1)&dta$drop1=="yes")
|
|
dta$mdi_1_enr<-ifelse(sel_enr_1,21,dta$mdi_1) # Per agreement, patients excluded due to open treatment need a given the fictive MDI score "21".
|
|
```
|
|
|
|
Vectorising ex/inclusions at 1 month, to keep patients with data or with later data.
|
|
```{r}
|
|
summary(dta$excluded_1<-factor(case_when(dta$mors_v1=="yes"|
|
|
is.na(dta$mdi_1_enr)&
|
|
dta$drop1=="yes"~"ex_1", # Excluded
|
|
is.na(dta$mdi_1_enr)&!is.na(dta$mdi_6_newobs)~"ca_1", # Missing, but carried to 6 months
|
|
is.na(dta$mdi_1_enr)~"mi_1", # Missing,
|
|
is.na(dta$mdi_1)&!is.na(dta$mdi_1_enr)~"en_1",
|
|
TRUE ~ "dt_1"))) # Data available
|
|
```
|
|
|
|
|
|
```{r}
|
|
summary(sel_enr_6<-dta$open_treat=="yes"&dta$drop16=="yes"&is.na(dta$mdi_6_newobs)&dta$excluded_1%in%c("ca_1","dt_1")&dta$mors_v16=="no")
|
|
|
|
# Entries to be enriched are entries with need for open treatment after 1 month, with missing mdi_6_newobs and with data at or "carried" from 1 month
|
|
|
|
dta$mdi_6_newobs_enr<-as.numeric(ifelse(sel_enr_6,21,dta$mdi_6_newobs)) # Per agreement, patients excluded due to open treatment need a given the fictive MDI score "21".
|
|
```
|
|
|
|
```{r}
|
|
summary(dta$excluded_6<-factor(case_when(is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1","en_1")~"ex_6", # Excluded due to death or other dropout
|
|
is.na(dta$mdi_6_newobs_enr)~"mi_6", # Missing data due to exclusion at 1 month
|
|
is.na(dta$mdi_6_newobs)&!is.na(dta$mdi_6_newobs_enr)~"en_6", # Enriched entries
|
|
dta$excluded_1%in%c("ca_1","dt_1")~"dt_6" # Organic data available
|
|
))) # Data available
|
|
```
|
|
|
|
```{r}
|
|
# dtf<-cbind(dta[,c("rnumb","mdi_1","mdi_6_newobs","inc_time","drop","drop1","drop16","mdi_1_enr","mdi_6_newobs_enr","excluded_1","excluded_6","mors_v16","mors_delay")],"excluded"=is.na(dta$mdi_6_newobs_enr)&dta$excluded_1%in%c("ca_1","dt_1","en_1"))
|
|
#
|
|
# summary(dtf %>% filter(excluded==TRUE))
|
|
```
|
|
|
|
|
|
# Main Dataset export
|
|
```{r}
|
|
variable_namebits<-c("rnumb","rtreat","age","sex",
|
|
"bmi",
|
|
"smoke_ever",
|
|
"civil",
|
|
"diabetes",
|
|
"hypertension",
|
|
"pad",
|
|
"afli",
|
|
"ami",
|
|
"tci",
|
|
"nihss_0",
|
|
"thrombolysis",
|
|
"thrombechtomy",
|
|
"rep_any",
|
|
"pase_0",
|
|
"pase_6",
|
|
"mrs_0","mrs_1","mrs_6",
|
|
# "who5_score",
|
|
"mdi",
|
|
# "ham_score_1","ham_score_6",
|
|
"mors",
|
|
"drop",
|
|
"wants_out",
|
|
"side_effect",
|
|
"open_treat",
|
|
"side_effect2",
|
|
"excluded",
|
|
"protocol","eos_early","inc_time",
|
|
"rdate","visit","enddate"
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
export<-dta %>% select(contains(variable_namebits))
|
|
```
|
|
|
|
```{r}
|
|
write.csv(export,"/Volumes/Data/depression/dep_dataset.csv",row.names = FALSE)
|
|
```
|
|
|