## This approach is based on a folder with several data files to be uploaded to an empty database

## Bulk conversion of Stata files to CSV
library(haven)
library(dplyr)
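## haven reads the Stata .dta files and keeps their value labels, which
## as_factor() later converts to R factors; dplyr supplies the wrangling verbs
## (filter, select, rename, left_join) used throughout.
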
## Export output files?
exp_out=TRUE
id_data=c("record_id","cpr","rtreat") # Data from the key file is all marked as identifier
auto_upload=FALSE

## Loading filenames
setwd("/Volumes/Data/")
files<-list.files("STATA13/.", pattern="\\.dta$", full.names=FALSE)

## Loading randomisation key file, keeping only necessary variables,
## renaming rnumb to record_id
key<-as_factor(read_dta("/Volumes/Data/STATA13/inkl_rev_v13.dta")) %>%
  filter(.,rnumb!="999") %>%
  select(.,c("rnumb","cpr","rtreat")) %>%
  rename(.,record_id=rnumb)

## Excluding undesired datasets:
## DAP, logfile, randomisation log, randomisation key.
## These will be kept and uploaded separately.
files<-files[!grepl("dap",files)&
             !grepl("transactions",files)&
             !grepl("rand",files)]

## Get "first name" from all files
|
|
fnames<-sapply(strsplit(files, "[_]"),"[[",1) # Just discovered the subset functions "[" and "[[". Wow!
|
|
|
|
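## Illustration (not part of the pipeline): "[" and "[[" are ordinary R
## functions, so "[["(x, i) is the same call as x[[i]]. The toy list below is
## made up purely for this example.
local({
  demo_list <- list(a = 1, b = 2)
  identical("[["(demo_list, "a"), demo_list[["a"]]) # TRUE
})
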
## Loading data sets to list
ls<-list()
for (i in 1:length(files)){
  ## Get dataset, factorise labels,
  ## join with key file and remove NA's (the cpr drop is commented out below)
  d<-left_join(key,
               as_factor(read_dta(paste0("STATA13/",files[i]))),by="cpr") %>%
    # select(.,!"cpr") %>%
    filter(.,!is.na(SYS_SITE)) ## SYS_DATA is in all sets but fmed, so SYS_SITE is used instead
  d<-data.frame(d)

  ## Append dataframe to list
  ls[[i]]<-d
  names(ls)[i]<-fnames[i]

  ## Export original data frame
  if (exp_out){
    write.csv(d,paste0("REDCap/orig/",fnames[i],".csv"),row.names = FALSE)
  }
}

## DAP variables to include, incl. record_id
vars<-c("record_id","hojde","vaegt","vaegt_anslaaet","rygning","alkohol","civil",
        "bolig","diabetes","hyperten","perifer_arteriel","atriefli","ami",
        "tidl_tci","trombolyse","trombektomi","nhiss_foer")

## Importing and subsetting DAP data, all in one pipe
dap_d<-read_dta("/Volumes/Data/STATA13/dap_rapport_2017.dta") %>% ## Loading file with correct name
  filter(.,!duplicated(cpr))%>% ## Limiting to first event
  left_join(key,.,by="cpr")%>% ## Join with key for record_id's, keeping only entries also in key
  select(.,all_of(vars)) ## Select only the specified variables, leaving out cpr's

ls[[length(ls)+1]]<-dap_d
names(ls)[length(ls)]<-"reg"

## Exporting attributes
for (i in 1:length(ls)) {
  ## Lists all attributes for export
  l <- lapply(ls[[i]], attr, "label")

  for (j in 1:length(l)){
    l[[j]]<-ifelse(is.null(l[[j]]),"No attr",l[[j]])
  }

  la<-data.frame(names=names(l),attr=unlist(l))

  ## Export individual attribute files for data overview
  if (exp_out){
    write.csv(la,paste0("REDCap/attr/attr_",fnames[i],".csv"),row.names = FALSE)
  }
}

## Naming and splitting into unique instruments for DataDictionary creation and data upload
## (see the naming illustration after the loop below)

## Leave out the inclusion ("inkl") file, adverse events and other medication.
## These are either not needed for upload or will be uploaded separately.

r_sel<-!(grepl("inkl",files)|grepl("ae_",files)|grepl("medicin",files))

# Limiting name variables
# r_files<-files[r_sel]
r_fnames<-c(fnames[r_sel],"reg")

# Selecting elements in list
r_ls<-"["(ls,names(ls)%in%r_fnames)

# New list for wrangling: rename variables and split by instance
r_lup<-list()

for (i in 1:length(r_ls)){
  ## Prefix generic variable names with the instrument name
  d<-r_ls[[i]]
  name<-names(r_ls)[[i]]
  dn<-!(grepl(name,colnames(d))|colnames(d)%in%id_data) ## Flag colnames that neither contain the instrument name nor are identifiers

  colnames(d)[dn]<-paste0(name,"_",colnames(d)[dn]) ## Prefix those colnames to ensure unique names
  # Identifier columns (record_id, cpr, rtreat) are left untouched

  ## Test for an instance variable
  ## Uses a redundant second if instead of else, as else did not behave consistently
  test<-grepl("INSTANCE",colnames(d),ignore.case = T )

  if (any(test)){
    d[,test]<-factor(d[,test]) # Factorise to secure ordering
    ins<-levels(d[,test]) # Instance numbers drawn from factor levels

    ds<-split(d[,!test] ,d[,test]) # Split by instance and drop the instance variable
    names(ds)<-paste0(name,"_",ins) # Name frames by instrument and append instance number
    for (j in 1:length(ds)){
      ## Append instance number to non-identifier colnames
      colnames(ds[[j]])[!colnames(ds[[j]])%in%id_data]<-
        paste0(colnames(ds[[j]])[!colnames(ds[[j]])%in%id_data],"_",ins[j])
    }

    r_lup<-append(r_lup,ds)
  }

  if (!any(test)) {
    r_lup[[length(r_lup)+1]]<-d
    names(r_lup)[length(r_lup)]<-name
  }
}

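## Illustration of the naming scheme above (made-up variable "dose" in the
## instrument "fmed", recorded in instances 1 and 2; not part of the pipeline):
## the instrument is split into frames "fmed_1" and "fmed_2", the variable
## becomes "fmed_dose_1" in the first and "fmed_dose_2" in the second, while
## record_id, cpr and rtreat keep their names in every frame.
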
## Load REDCap instrument example file for variable names
icname<-colnames(read.csv("/Volumes/Data/REDCap/examples/examlpe instrument.csv"))
dd<-data.frame(matrix(ncol = length(icname))) ## Data frame to collect all instruments
colnames(dd)<-icname

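## For reference, assuming the example instrument follows the standard REDCap
## data dictionary layout, the column indices used below correspond to:
## 1 = Variable / Field Name, 2 = Form Name, 4 = Field Type, 5 = Field Label,
## 6 = Choices, Calculations, OR Slider Labels,
## 8 = Text Validation Type OR Show Slider Number, plus the Identifier? column
## marked near the end of the script.
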
## Instruments for the DataDictionary
## Format each dataset for REDCap upload

for (i in 1:length(r_lup)){
  dd_i<-data.frame(matrix(ncol = length(icname),nrow = ncol("[["(r_lup,i)))) ## Data frame for this instrument, one row per variable

  colnames(dd_i)<-icname ## for easier reading

  ## Variable names
  dd_i[1]<-colnames("[["(r_lup,i))

  ## Form Name
  dd_i[2]<-names(r_lup)[i]

  ## Field Type
  # dd_i[4]<-ifelse(sapply(r_lup[[i]], class)=="factor","radio","text")
  dd_i[4]<-"text"

  ## Field Label
  ## Using original attributes as field labels
  fl<-lapply(r_lup[[i]], attr, "label")
  for (j in 1:length(fl)){
    fl[[j]]<-ifelse(is.null(fl[[j]]),
                    names(fl)[[j]],
                    fl[[j]])
    ## If no label attribute, the variable name is used as "placeholder"
  }
  dd_i[5]<-unlist(fl)

  ## Choices
  ## (kept for reference; currently disabled, as all fields are uploaded as text)
  # for (j in 1:ncol(r_lup[[i]])){
  #   if (is.factor(r_lup[[i]][[j]])){
  #     lvl<-levels(r_lup[[i]][[j]])
  #     lvl_ch<-paste("1,",lvl[1])
  #     for (k in 2:length(lvl)){
  #       lvl_ch<-c(paste0(lvl_ch," | ",k,", ",lvl[k]))
  #     }
  #     dd_i[j,6]<-lvl_ch
  #   }
  # }

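  ## A more compact way to build the same REDCap choices string
  ## ("1, level1 | 2, level2 | ..."), kept commented out like the block above;
  ## sketch only, assuming the factor levels should simply be numbered in order:
  # lvl<-levels(r_lup[[i]][[j]])
  # dd_i[j,6]<-paste0(seq_along(lvl),", ",lvl,collapse=" | ")
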
  ## Text Validation
  ## Only used for date and time data
  # for (j in 1:ncol(r_lup[[i]])){
  #   dd_i[j,8]<-case_when(class(r_lup[[i]][[j]])[1]%in%c("POSIXct","POSIXt") ~"datetime_seconds_ymd",
  #                        class(r_lup[[i]][[j]])[1]%in%c("Date") ~"date_ymd")
  # }

  ## Merge all instruments into one DataDictionary
  dd<-rbind(dd,dd_i)

  if (exp_out){
    # dir.create(file.path("/Volumes/Data/REDCap/data",names(r_lup)[[i]]))
    write.csv(r_lup[[i]],paste0("/Volumes/Data/REDCap/data/",names(r_lup)[[i]],".csv"),row.names = FALSE)
  }
}

# Ready the DataDictionary for export by limiting to unique variable names
# (dropping the repeated record_id rows) and omitting NAs
dd_exp<-dd %>% filter(.,!(duplicated(Variable...Field.Name)|is.na(Variable...Field.Name)))

## Marking identifier variables
dd_exp$Identifier.[dd_exp$Variable...Field.Name%in%id_data]<-"y"

write.csv(dd_exp,"/Volumes/Data/REDCap/data_dictionary.csv",row.names = FALSE,na="")

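## If the project API allows metadata import, the dictionary could presumably
## also be pushed directly; a hedged sketch, assuming a current REDCapR
## installation that provides redcap_metadata_write() (token_talos is only
## read further down, inside the auto-upload section):
# REDCapR::redcap_metadata_write(ds = dd_exp,
#                                redcap_uri = "https://redcap.rm.dk/api/",
#                                token = token_talos)
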
## ONE DATA SET TO RULE THEM ALL
ds_all<-key
for (i in names(r_lup)){
  ds_all<-left_join(ds_all,"[["(r_lup,i))
  ## The joining variables are not specified, so left_join() joins on all
  ## shared columns (the identifiers); all other variable names are unique.
}

colnames(ds_all)<-colnames(ds_all)%>%tolower() ## All field names in REDCap are lower case

if (exp_out){
  write.csv(ds_all,"/Volumes/Data/REDCap/complete_dataset.csv",row.names = FALSE,na="")
}

## =============================================================================
## REDCap upload
## - worked, but headers should be lower case
##
## - 02aug22: not allowed to export or import.
##   Try manual data upload after attribute merge.
## =============================================================================

if (auto_upload==TRUE){

  ## Trying out the native pipe
  token_talos<-read.csv("/Users/au301842/talos_redcap_token.csv",colClasses = "character")|>
    names()|>
    # (\(x){ ## Shorthand for an anonymous lambda function; the new "_" placeholder did not work here.
    #   substr(x,2,33)})()|>
    suppressWarnings()

  ## See https://towardsdatascience.com/understanding-the-native-r-pipe-98dea6d8b61b
  ## Note: the "_" placeholder requires R >= 4.2.

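  ## Example of the "_" placeholder in the native pipe (R >= 4.2), illustrative
  ## only and not part of this pipeline:
  # mtcars |> lm(mpg ~ wt, data = _)
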
  stts<-REDCapR::redcap_write(ds=ds_all,
                              redcap_uri = "https://redcap.rm.dk/api/",
                              token = token_talos
                              )

  records_mod <- REDCapR::redcap_read_oneshot(
    redcap_uri = "https://redcap.rm.dk/api/",
    token = token_talos
  )
}

## Notes:
## - Validation and data selection options are not applied, as these are not necessary for upload.