PhysicalActivityandStrokeOu.../REDCap/convert_redcap.R

261 lines
8.2 KiB
R
Raw Normal View History

2022-08-19 11:06:48 +02:00
## This approach is based on a folder with several data files to upload to an empty data base
## Bulk conversion of stata files to CSV
library(haven)
library(dplyr)
## Export output files??
exp_out=TRUE
id_data=c("record_id","cpr","rtreat") # Data from keyfile is all marked as identifier
auto_upload=FALSE
## Loading filenames
setwd("/Volumes/Data/")
files<-list.files("STATA13/.", pattern="*.dta", full.names=FALSE)
## Loading randomisation key file, keeping only necessary variables,
## renaming rnumb to record_id
key<-as_factor(read_dta("/Volumes/Data/STATA13/inkl_rev_v13.dta")) %>%
filter(.,rnumb!="999") %>%
select(.,c("rnumb","cpr","rtreat")) %>%
rename(.,record_id=rnumb)
## Excluding undesired datasets
## DAP, logfile, randomisation log, randomisation key
## These will be kept and uploaded seperately
files<-files[!grepl("dap",files)&
!grepl("transactions",files)&
!grepl("rand",files)]
## Get "first name" from all files
fnames<-sapply(strsplit(files, "[_]"),"[[",1) # Just discovered the subset functions "[" and "[[". Wow!
## Loading data sets to list
ls<-list()
for (i in 1:length(files)){
## Get dataset, factorise labels,
## join with key-file, drop cpr and remove NA's
d<-left_join(key,
as_factor(read_dta(paste0("STATA13/",files[i]))),by="cpr") %>%
# select(.,!"cpr") %>%
filter(.,!is.na(SYS_SITE)) ## SYS_DATA is in all sets, but fmed, uses SYS_SITE
d<-data.frame(d)
## Append dataframe to list
ls[[i]]<-d
names(ls)[i]<-fnames[i]
## Export original data frame
if (exp_out){
write.csv(d,paste0("REDCap/orig/",fnames[i],".csv"),row.names = FALSE)
}
}
## DAP variables to include, incl record_id
vars<-c("record_id","hojde","vaegt","vaegt_anslaaet","rygning","alkohol","civil",
"bolig","diabetes","hyperten","perifer_arteriel","atriefli","ami",
"tidl_tci","trombolyse","trombektomi","nhiss_foer")
# ## Importing and subsetting DAP-data- All in one style
dap_d<-read_dta("/Volumes/Data/STATA13/dap_rapport_2017.dta") %>% ## Loading file with correct name
filter(.,!duplicated(cpr))%>% ## Limiting to first event
left_join(key,.,by="cpr")%>% ## Join with key for record_id's, to only keep entries also in key
select(.,all_of(vars)) ## Select only specified variables, leaving out cpr's
ls[[length(ls)+1]]<-dap_d
names(ls)[length(ls)]<-"reg"
## Exporting attributes
for (i in 1:length(ls)) {
## Lists all attributes for export
l <- lapply(ls[[i]], attr, "label")
for (j in 1:length(l)){
l[[j]]<-ifelse(is.null(l[[j]]),"No attr",l[[j]])
}
la<-data.frame(names=names(l),attr=unlist(l))
## Export individual attribute files for data overview
if (exp_out){
write.csv(la,paste0("REDCap/attr/attr_",fnames[i],".csv"),row.names = FALSE)
}
}
## Naming and splitting in unique instruments for DataDictionary creation and data upload
## Leave inklusion file, adverse events and other medication.
## These are not needed for upload or will be uploaded seperately
r_sel<-!(grepl("inkl",files)|grepl("ae_",files)|grepl("medicin",files))
# Limiting name variables
# r_files<-files[r_sel]
r_fnames<-c(fnames[r_sel],"reg")
# Selecting elements in list
r_ls<-"["(ls,names(ls)%in%r_fnames)
# New list for wrangling to rename variables and splitting by instance
r_lup<-list()
for (i in 1:length(r_ls)){
## Suffix generic variable names
d<-r_ls[[i]]
name<-names(r_ls)[[i]]
dn<-!(grepl(name,colnames(d))|colnames(d)%in%id_data)## Test colnames that does not contain instrument name
colnames(d)[dn]<-paste0(name,"_",colnames(d)[dn]) ## Adds suffix to colnames to ensure unique names
# First entry is omitted, as this is the record_id
## Test for if conditions
## Uses redundant double if, as else wasn't consistent
test<-grepl("INSTANCE",colnames(d),ignore.case = T )
if (any(test)){
d[,test]<-factor(d[,test]) # Factorise to secure ordering
ins<-levels(d[,test]) # Instance numbers drawn from factor levels
ds<-split(d[,!test] ,d[,test]) # Splits by instance and drops instance variable
names(ds)<-paste0(name,"_",ins) # Names frames by instrument, appends instance
for (j in 1:length(ds)){
colnames(ds[[j]])[!colnames(ds[[j]])%in%id_data]<-paste0(colnames(ds[[j]])[!colnames(ds[[j]])%in%id_data],"_",ins[j])
## Append instance number
}
r_lup<-append(r_lup,ds)
}
if (!any(test)) {
r_lup[[length(r_lup)+1]]<-d
names(r_lup)[length(r_lup)]<-name
}
}
## Load REDCap instrument example file for variable names
icname<-colnames(read.csv("/Volumes/Data/REDCap/examples/examlpe instrument.csv"))
dd<-data.frame(matrix(ncol = length(icname))) ## Data frame to collect all
colnames(dd)<-icname
## Instrument for DataDictionary
## Format dataset for REDCap upload
for (i in 1:length(r_lup)){
dd_i<-data.frame(matrix(ncol = length(icname),nrow = ncol("[["(r_lup,i)))) ## Data frame to collect all
colnames(dd_i)<-icname ## for easier reading
## Variable names
dd_i[1]<-colnames("[["(r_lup,i))
## Form Name
dd_i[2]<-names(r_lup)[i]
## Field Type
# dd_i[4]<-ifelse(sapply(r_lup[[i]], class)=="factor","radio","text")
dd_i[4]<-"text"
## Field Label
## Using original attributes as field labels
fl<-lapply(r_lup[[i]], attr, "label")
for (j in 1:length(fl)){
fl[[j]]<-ifelse(is.null(fl[[j]]),
names(fl)[[j]],
fl[[j]])
## If no attributes, variable name is used as "placeholder"
}
dd_i[5]<-unlist(fl)
## Choices
# for (j in 1:ncol(r_lup[[i]])){
# if (is.factor(r_lup[[i]][[j]])){
# lvl<-levels(r_lup[[i]][[j]])
# lvl_ch<-paste("1,",lvl[1])
# for (k in 2:length(lvl)){
# lvl_ch<-c(paste0(lvl_ch," | ",k,", ",lvl[k]))
# }
# dd_i[j,6]<-lvl_ch
# }
# }
## Text Validation
## Only used for date and time data
# for (j in 1:ncol(r_lup[[i]])){
# dd_i[j,8]<-case_when(class(r_lup[[i]][[j]])[1]%in%c("POSIXct","POSIXt") ~"datetime_seconds_ymd",
# class(r_lup[[i]][[j]])[1]%in%c("Date") ~"date_ymd")
# }
## Merge all
dd<-rbind(dd,dd_i)
if (exp_out){
# dir.create(file.path("/Volumes/Data/REDCap/data",names(r_lup)[[i]]))
write.csv(r_lup[[i]],paste0("/Volumes/Data/REDCap/data/",names(r_lup)[[i]],".csv"),row.names = FALSE)
}
}
# Readies the DataDictionary for export by limiting to unique identifier (leaving out multiple record ids) and omitting NAs
dd_exp<-dd %>% filter(.,!(duplicated(Variable...Field.Name)|is.na(Variable...Field.Name)))
## Marking identifier variables
dd_exp$Identifier.[dd_exp$Variable...Field.Name%in%id_data]<-"y"
write.csv(dd_exp,"/Volumes/Data/REDCap/data_dictionary.csv",row.names = FALSE,na="")
## ONE DATA SET TO RULE THEM ALL
ds_all<-key
for (i in names(r_lup)){
ds_all<-left_join(ds_all,"[["(r_lup,i))
## All non-identifier variable names are unique, joining variable is not specified.
}
colnames(ds_all)<-colnames(ds_all)%>%tolower() ## All names in REDCap are lower case.
if (exp_out){
write.csv(ds_all,"/Volumes/Data/REDCap/complete_dataset.csv",row.names = FALSE,na="")
}
## =============================================================================
## REDCap upload
## - worked, but headers should be lower case
##
## - 02aug22 not allowed to export or import.
## Try manual data upload after attribute merge.
## =============================================================================
if (auto_upload==TRUE){
## Trying out native piping
token_talos<-read.csv("/Users/au301842/talos_redcap_token.csv",colClasses = "character")|>
names()|>
# (\(x){ ## Shorthand for "anonymous lambda function". New "_" placeholder does not work.
# substr(x,2,33)})()|>
suppressWarnings()
## See https://towardsdatascience.com/understanding-the-native-r-pipe-98dea6d8b61b"
## OBS: new placeholder "_" in >4.2.
stts<-REDCapR::redcap_write(ds=ds_all,
redcap_uri = "https://redcap.rm.dk/api/",
token = token_talos
)
records_mod <- REDCapR::redcap_read_oneshot(
redcap_uri = "https://redcap.rm.dk/api/",
token = token_talos
)
}
## Notes:
## - Validation and data selection options are not applied, as these are not necessary for upload.