## Bulk conversion of Stata files to CSV

library(haven)
library(dplyr)

## Export output files?
exp_out <- TRUE
id_data <- c("record_id", "cpr", "rtreat") # Data from the key file is all marked as identifiers
auto_upload <- FALSE

## Loading filenames
setwd("/Volumes/Data/")
files <- list.files("STATA13/.", pattern = "\\.dta$", full.names = FALSE)

## Loading the randomisation key file, keeping only necessary variables,
## renaming rnumb to record_id
key <- as_factor(read_dta("/Volumes/Data/STATA13/inkl_rev_v13.dta")) %>%
  filter(., rnumb != "999") %>%
  select(., c("rnumb", "cpr", "rtreat")) %>%
  rename(., record_id = rnumb)

## Excluding undesired datasets:
## DAP, logfile, randomisation log, randomisation key.
## These will be kept and uploaded separately.
files <- files[!grepl("dap", files) &
                 !grepl("transactions", files) &
                 !grepl("rand", files)]

## Get "first name" (instrument prefix) from all files
fnames <- sapply(strsplit(files, "[_]"), "[[", 1)
# Just discovered the subset functions "[" and "[[". Wow!

## Loading data sets into a list
ls <- list()
for (i in seq_along(files)) {
  ## Get dataset, factorise labels,
  ## join with the key file, drop cpr and remove NAs
  d <- left_join(key, as_factor(read_dta(paste0("STATA13/", files[i]))), by = "cpr") %>%
    # select(., !"cpr") %>%
    filter(., !is.na(SYS_SITE)) ## SYS_DATA is in all sets but fmed, so SYS_SITE is used
  d <- data.frame(d)

  ## Append data frame to list
  ls[[i]] <- d
  names(ls)[i] <- fnames[i]

  ## Export original data frame
  if (exp_out) {
    write.csv(d, paste0("REDCap/orig/", fnames[i], ".csv"), row.names = FALSE)
  }
}

## DAP variables to include, incl. record_id
vars <- c("record_id", "hojde", "vaegt", "vaegt_anslaaet", "rygning", "alkohol", "civil",
          "bolig", "diabetes", "hyperten", "perifer_arteriel", "atriefli", "ami",
          "tidl_tci", "trombolyse", "trombektomi", "nhiss_foer")

## Importing and subsetting DAP data, all-in-one style
dap_d <- read_dta("/Volumes/Data/STATA13/dap_rapport_2017.dta") %>% ## Loading file with correct name
  filter(., !duplicated(cpr)) %>%   ## Limiting to first event
  left_join(key, ., by = "cpr") %>% ## Join with key for record_ids, keeping only entries also in key
  select(., all_of(vars))           ## Select only specified variables, leaving out cprs

ls[[length(ls) + 1]] <- dap_d
names(ls)[length(ls)] <- "reg"

## Exporting attributes
for (i in seq_along(ls)) {
  ## List all attributes for export
  l <- lapply(ls[[i]], attr, "label")
  for (j in seq_along(l)) {
    l[[j]] <- ifelse(is.null(l[[j]]), "No attr", l[[j]])
  }
  la <- data.frame(names = names(l), attr = unlist(l))

  ## Export individual attribute files for data overview
  ## (indexed by names(ls) rather than fnames, as "reg" has been appended)
  if (exp_out) {
    write.csv(la, paste0("REDCap/attr/attr_", names(ls)[i], ".csv"), row.names = FALSE)
  }
}
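## Aside (illustration only; toy filenames, not from the study data):
## "[" and "[[" are ordinary functions, so they can be passed to sapply().
## strsplit() returns a list of character vectors, and sapply(x, "[[", 1)
## pulls the first element of each, i.e. the prefix before the first "_".
demo_split <- sapply(strsplit(c("fmed_2017_v13.dta", "vitals_v13.dta"), "_"), "[[", 1)
# demo_split == c("fmed", "vitals")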
## Naming and splitting into unique instruments for DataDictionary creation and data upload.
## Leave out the inclusion file, adverse events and other medication:
## these are not needed for upload or will be uploaded separately.
r_sel <- !(grepl("inkl", files) | grepl("ae_", files) | grepl("medicin", files))

# Limiting name variables
# r_files <- files[r_sel]
r_fnames <- c(fnames[r_sel], "reg")

# Selecting elements in the list
r_ls <- ls[names(ls) %in% r_fnames]

# New list for wrangling: renaming variables and splitting by instance
r_lup <- list()
for (i in seq_along(r_ls)) {
  ## Suffix generic variable names
  d <- r_ls[[i]]
  name <- names(r_ls)[[i]]
  dn <- !(grepl(name, colnames(d)) | colnames(d) %in% id_data) ## Flag colnames that do not contain the instrument name
  colnames(d)[dn] <- paste0(name, "_", colnames(d)[dn])        ## Prefix the instrument name to ensure unique names
  # Identifier variables (record_id etc.) are left untouched

  ## Test for the if-conditions below.
  ## Uses a redundant double if, as else wasn't consistent.
  test <- grepl("INSTANCE", colnames(d), ignore.case = TRUE)
  if (any(test)) {
    d[, test] <- factor(d[, test])      # Factorise to secure ordering
    ins <- levels(d[, test])            # Instance numbers drawn from factor levels
    ds <- split(d[, !test], d[, test])  # Splits by instance and drops the instance variable
    names(ds) <- paste0(name, "_", ins) # Names frames by instrument, appending the instance
    for (j in seq_along(ds)) {
      ## Append instance number to non-identifier column names
      colnames(ds[[j]])[!colnames(ds[[j]]) %in% id_data] <-
        paste0(colnames(ds[[j]])[!colnames(ds[[j]]) %in% id_data], "_", ins[j])
    }
    r_lup <- append(r_lup, ds)
  }
  if (!any(test)) {
    r_lup[[length(r_lup) + 1]] <- d
    names(r_lup)[length(r_lup)] <- name
  }
}

## Load REDCap instrument example file for variable names
icname <- colnames(read.csv("/Volumes/Data/REDCap/examples/examlpe instrument.csv"))
dd <- data.frame(matrix(ncol = length(icname))) ## Data frame to collect all instruments
colnames(dd) <- icname                          ## for the DataDictionary

## Format datasets for REDCap upload
for (i in seq_along(r_lup)) {
  dd_i <- data.frame(matrix(ncol = length(icname), nrow = ncol(r_lup[[i]]))) ## One row per variable
  colnames(dd_i) <- icname ## for easier reading

  ## Variable names
  dd_i[1] <- colnames(r_lup[[i]])
  ## Form Name
  dd_i[2] <- names(r_lup)[i]
  ## Field Type
  # dd_i[4] <- ifelse(sapply(r_lup[[i]], class) == "factor", "radio", "text")
  dd_i[4] <- "text"

  ## Field Label
  ## Using original attributes as field labels
  fl <- lapply(r_lup[[i]], attr, "label")
  for (j in seq_along(fl)) {
    fl[[j]] <- ifelse(is.null(fl[[j]]), names(fl)[[j]], fl[[j]]) ## If no attribute, the variable name is used as placeholder
  }
  dd_i[5] <- unlist(fl)

  ## Choices
  # for (j in 1:ncol(r_lup[[i]])) {
  #   if (is.factor(r_lup[[i]][[j]])) {
  #     lvl <- levels(r_lup[[i]][[j]])
  #     lvl_ch <- paste("1,", lvl[1])
  #     for (k in 2:length(lvl)) {
  #       lvl_ch <- c(paste0(lvl_ch, " | ", k, ", ", lvl[k]))
  #     }
  #     dd_i[j, 6] <- lvl_ch
  #   }
  # }

  ## Text Validation
  ## Only used for date and time data
  # for (j in 1:ncol(r_lup[[i]])) {
  #   dd_i[j, 8] <- case_when(class(r_lup[[i]][[j]])[1] %in% c("POSIXct", "POSIXt") ~ "datetime_seconds_ymd",
  #                           class(r_lup[[i]][[j]])[1] %in% c("Date") ~ "date_ymd")
  # }

  ## Merge all
  dd <- rbind(dd, dd_i)

  if (exp_out) {
    # dir.create(file.path("/Volumes/Data/REDCap/data", names(r_lup)[[i]]))
    write.csv(r_lup[[i]], paste0("/Volumes/Data/REDCap/data/", names(r_lup)[[i]], ".csv"), row.names = FALSE)
  }
}

## Ready the DataDictionary for export by limiting to unique variable names
## (leaving out the repeated record_ids) and omitting NAs
dd_exp <- dd %>%
  filter(., !(duplicated(Variable...Field.Name) | is.na(Variable...Field.Name)))

## Marking identifier variables
dd_exp$Identifier.[dd_exp$Variable...Field.Name %in% id_data] <- "y"

write.csv(dd_exp, "/Volumes/Data/REDCap/data_dictionary.csv", row.names = FALSE, na = "")
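## Aside: a minimal sketch of the de-duplication above (toy values, not study data).
## duplicated() flags every occurrence after the first, so the record_id rows
## repeated across per-instrument dictionaries collapse to one entry, and the
## NA row from the empty starter frame is dropped:
demo_vars <- c("record_id", "fmed_dose_1", "record_id", NA)
demo_vars[!(duplicated(demo_vars) | is.na(demo_vars))]
# #> [1] "record_id"   "fmed_dose_1"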
## ONE DATA SET TO RULE THEM ALL
ds_all <- key
for (i in names(r_lup)) {
  ## All non-identifier variable names are unique, so the joining variables are not specified.
  ds_all <- left_join(ds_all, r_lup[[i]])
}
colnames(ds_all) <- colnames(ds_all) %>% tolower() ## All names in REDCap are lower case.

if (exp_out) {
  write.csv(ds_all, "/Volumes/Data/REDCap/complete_dataset.csv", row.names = FALSE, na = "")
}

## =============================================================================
## REDCap upload
## - worked, but headers should be lower case
##
## - 02aug22: not allowed to export or import.
##   Try manual data upload after attribute merge.
## =============================================================================

if (auto_upload == TRUE) {
  ## Trying out native piping
  token_talos <- read.csv("/Users/au301842/talos_redcap_token.csv", colClasses = "character") |>
    names() |>
    # (\(x) {            ## Shorthand for an anonymous lambda function.
    #   substr(x, 2, 33) ## The new "_" placeholder does not work here.
    # })() |>
    suppressWarnings()
  ## See https://towardsdatascience.com/understanding-the-native-r-pipe-98dea6d8b61b
  ## NB: the "_" placeholder is new in R >= 4.2.

  stts <- REDCapR::redcap_write(ds = ds_all,
                                redcap_uri = "https://redcap.rm.dk/api/",
                                token = token_talos)

  records_mod <- REDCapR::redcap_read_oneshot(redcap_uri = "https://redcap.rm.dk/api/",
                                              token = token_talos)
}

## Notes:
## - Validation and data-selection options are not applied, as they are not necessary for upload.
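## Aside: a possible sanity check (a sketch, not part of the original pipeline).
## The unspecified joins above rely on every non-identifier name being unique,
## and tolower() could make two names differing only in case collide:
stopifnot(anyDuplicated(colnames(ds_all)) == 0)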