## PhysicalActivityandStrokeOu.../REDCap/convert_redcap.R

## This approach assumes a folder with several data files that are to be uploaded to an empty database

## Bulk conversion of stata files to CSV
library(haven)
library(dplyr)


## Script settings
## exp_out: when TRUE, the export steps guarded by this flag write their CSV files
exp_out <- TRUE
## id_data: data from the keyfile, all marked as identifier
id_data <- c("record_id", "cpr", "rtreat")
## auto_upload: when TRUE, the REDCap upload section of the script is run
auto_upload <- FALSE

## Loading filenames
## Relative paths used further down ("STATA13/...", "REDCap/...") are resolved
## against this folder.
setwd("/Volumes/Data/")
## `pattern` is a regular expression, not a glob: anchor on the ".dta" extension
## so that only file names ending in ".dta" are listed.
files <- list.files("STATA13/.", pattern = "\\.dta$", full.names = FALSE)

## Randomisation key file: read it, turn labelled values into factors, leave
## out rows where rnumb is "999" and keep only the three necessary variables.
## rnumb is renamed to record_id as part of the selection.
key <- read_dta("/Volumes/Data/STATA13/inkl_rev_v13.dta") |>
  as_factor() |>
  filter(rnumb != "999") |>
  select(record_id = rnumb, cpr, rtreat)

## Excluding undesired datasets
## DAP, logfile, randomisation log, randomisation key
## These will be kept and uploaded separately
files <- files[!grepl("dap|transactions|rand", files)]

## Get "first name" from all files: the part of the file name before the first "_".
## vapply() always returns a character vector, also when `files` is empty
## (sapply() would return an empty list in that case).
fnames <- vapply(strsplit(files, "_", fixed = TRUE), `[[`, character(1), 1)

## Loading data sets to list
## One data frame per entry in `files`, named by the matching entry in `fnames`.
## NOTE: the name `ls` masks base::ls() for the rest of the script.
ls <- list()
for (i in seq_along(files)) { # seq_along(): no iterations when `files` is empty
  ## Get dataset, factorise labels, join onto the key file by cpr and keep
  ## only rows with a non-missing SYS_SITE.
  ## cpr is NOT dropped here: the select() below is disabled.
  ## NOTE(review): the original comment said "SYS_DATA is in all sets, but fmed,
  ## uses SYS_SITE"; the filter relies on SYS_SITE being present in every
  ## data set - confirm.
  d <- left_join(
    key,
    as_factor(read_dta(paste0("STATA13/", files[i]))),
    by = "cpr"
  ) %>%
    # select(.,!"cpr") %>%
    filter(!is.na(SYS_SITE))
  d <- data.frame(d)

  ## Append dataframe to list
  ls[[i]] <- d
  names(ls)[i] <- fnames[i]

  ## Export original data frame
  if (exp_out) {
    write.csv(d, paste0("REDCap/orig/", fnames[i], ".csv"), row.names = FALSE)
  }
}

## Variables to keep from the DAP data, plus record_id
vars <- c(
  "record_id",
  "hojde", "vaegt", "vaegt_anslaaet",
  "rygning", "alkohol", "civil", "bolig",
  "diabetes", "hyperten", "perifer_arteriel", "atriefli", "ami",
  "tidl_tci", "trombolyse", "trombektomi", "nhiss_foer"
)

## Import and subset the DAP data in one chain
dap_d <- read_dta("/Volumes/Data/STATA13/dap_rapport_2017.dta") |>
  filter(!duplicated(cpr)) |> ## First row per cpr only
  (\(dap) left_join(key, dap, by = "cpr"))() |> ## Key on the left side, matched on cpr
  select(all_of(vars)) ## Only the listed variables; cpr is not among them

## Add the DAP data as the last list element, named "reg"
ls <- c(ls, list(reg = dap_d))


## Exporting attributes
## For every data frame in `ls`, collect the "label" attribute of each column
## and (if exp_out is set) write them to a CSV overview file.
for (i in seq_along(ls)) {
  ## Label of every column; "No attr" where a column has none.
  ## exact = TRUE prevents attr() from partially matching another attribute
  ## whose name merely starts with "label" (e.g. "labels").
  l <- vapply(
    ls[[i]],
    function(col) {
      lab <- attr(col, "label", exact = TRUE)
      if (is.null(lab)) "No attr" else as.character(lab)[1]
    },
    character(1)
  )

  la <- data.frame(names = names(l), attr = l)

  ## Export individual attribute files for data overview
  if (exp_out) {
    ## The file is named after the list element itself. `ls` holds one more
    ## element ("reg") than `fnames`, so indexing `fnames` here would name the
    ## last file "attr_NA.csv".
    write.csv(la, paste0("REDCap/attr/attr_", names(ls)[i], ".csv"), row.names = FALSE)
  }
}

## Naming and splitting in unique instruments for DataDictionary creation and data upload

## The inclusion file, adverse events and other medication are left out.
## These are not needed for upload or will be uploaded separately.
r_sel <- !grepl("inkl|ae_|medicin", files)

# Names of the instruments to keep, with "reg" added at the end
# r_files<-files[r_sel]
r_fnames <- c(fnames[r_sel], "reg")

# Keep the list elements whose name is among the selected instruments
r_ls <- ls[names(ls) %in% r_fnames]

# Empty list, filled when variables are renamed and frames are split by instance
r_lup <- list()

## Rename variables to be unique per instrument and split repeated
## instruments into one data frame per instance. Results are collected in r_lup.
for (i in seq_along(r_ls)) {
  d <- r_ls[[i]]
  name <- names(r_ls)[[i]]

  ## Columns that neither contain the instrument name nor are identifiers get
  ## the instrument name as a prefix, to ensure unique names across instruments.
  ## Note: `name` is used as a regular expression by grepl().
  dn <- !(grepl(name, colnames(d)) | colnames(d) %in% id_data)
  colnames(d)[dn] <- paste0(name, "_", colnames(d)[dn])

  ## Column holding the instance number, if any (matched case-insensitively)
  test <- grepl("INSTANCE", colnames(d), ignore.case = TRUE)

  if (any(test)) {
    ## Factorise to secure ordering; instance numbers are the factor levels.
    ## Assumes exactly one column matches "INSTANCE".
    instance <- factor(d[, test])
    ins <- levels(instance)

    ## Split by instance and drop the instance variable
    ds <- split(d[, !test], instance)
    ## Name frames by instrument with the instance appended
    names(ds) <- paste0(name, "_", ins)

    for (j in seq_along(ds)) {
      ## Append instance number to every non-identifier column
      rn <- !colnames(ds[[j]]) %in% id_data
      colnames(ds[[j]])[rn] <- paste0(colnames(ds[[j]])[rn], "_", ins[j])
    }

    r_lup <- append(r_lup, ds)
  } else {
    ## No instance column: keep the data frame as a single instrument
    r_lup[[length(r_lup) + 1]] <- d
    names(r_lup)[length(r_lup)] <- name
  }
}

## Variable names are taken from the header of the REDCap instrument example file
icname <- names(read.csv("/Volumes/Data/REDCap/examples/examlpe instrument.csv"))
## Collector data frame: a single all-NA row with one column per name in icname
dd <- setNames(
  data.frame(matrix(NA, nrow = 1, ncol = length(icname))),
  icname
)

## Instrument for DataDictionary
## Format dataset for REDCap upload
## For every data frame in r_lup: build one dictionary row per column, append
## the rows to dd and (if exp_out is set) export the data frame as CSV.

for (i in seq_along(r_lup)) {
  ## One row per column of the instrument, one column per name in icname
  dd_i <- data.frame(matrix(ncol = length(icname), nrow = ncol(r_lup[[i]])))

  colnames(dd_i) <- icname ## for easier reading

  ## Variable names
  dd_i[1] <- colnames(r_lup[[i]])

  ## Form Name
  dd_i[2] <- names(r_lup)[i]

  ## Field Type
  # dd_i[4]<-ifelse(sapply(r_lup[[i]], class)=="factor","radio","text")
  dd_i[4] <- "text"

  ## Field Label
  ## Using original attributes as field labels.
  ## If a column has no "label" attribute, the variable name is used as
  ## placeholder. exact = TRUE prevents attr() from partially matching another
  ## attribute whose name merely starts with "label" (e.g. "labels").
  fl <- vapply(
    seq_along(r_lup[[i]]),
    function(j) {
      lab <- attr(r_lup[[i]][[j]], "label", exact = TRUE)
      if (is.null(lab)) names(r_lup[[i]])[j] else as.character(lab)[1]
    },
    character(1)
  )
  dd_i[5] <- fl

  ## Choices
  # for (j in 1:ncol(r_lup[[i]])){
  #   if (is.factor(r_lup[[i]][[j]])){
  #     lvl<-levels(r_lup[[i]][[j]])
  #     lvl_ch<-paste("1,",lvl[1])
  #     for (k in 2:length(lvl)){
  #       lvl_ch<-c(paste0(lvl_ch," | ",k,", ",lvl[k]))
  #     }
  #     dd_i[j,6]<-lvl_ch
  #   }
  # }

  ## Text Validation
  ## Only used for date and time data
  # for (j in 1:ncol(r_lup[[i]])){
  #   dd_i[j,8]<-case_when(class(r_lup[[i]][[j]])[1]%in%c("POSIXct","POSIXt") ~"datetime_seconds_ymd",
  #                        class(r_lup[[i]][[j]])[1]%in%c("Date") ~"date_ymd")
  # }

  ## Merge all
  dd <- rbind(dd, dd_i)

  if (exp_out) {
    # dir.create(file.path("/Volumes/Data/REDCap/data",names(r_lup)[[i]]))
    write.csv(
      r_lup[[i]],
      paste0("/Volumes/Data/REDCap/data/", names(r_lup)[[i]], ".csv"),
      row.names = FALSE
    )
  }
}

# Prepare the DataDictionary for export: keep only the first row for every
# field name (leaving out repeated record ids) and drop rows without a name
dd_exp <- filter(
  dd,
  !duplicated(Variable...Field.Name),
  !is.na(Variable...Field.Name)
)

## Mark the variables listed in id_data as identifiers
dd_exp$Identifier. <- replace(
  dd_exp$Identifier.,
  dd_exp$Variable...Field.Name %in% id_data,
  "y"
)

## Written regardless of exp_out; missing values become empty cells
write.csv(
  dd_exp,
  "/Volumes/Data/REDCap/data_dictionary.csv",
  row.names = FALSE,
  na = ""
)


## ONE DATA SET TO RULE THEM ALL
## Starting from the key, every data frame in r_lup is left-joined on in turn.
## The joining variable is not specified, as all non-identifier variable names
## are unique.
ds_all <- Reduce(
  function(joined, nm) left_join(joined, r_lup[[nm]]),
  names(r_lup),
  key
)

## All names in REDCap are lower case.
names(ds_all) <- tolower(names(ds_all))

if (exp_out) {
  write.csv(
    ds_all,
    "/Volumes/Data/REDCap/complete_dataset.csv",
    row.names = FALSE,
    na = ""
  )
}


## =============================================================================
## REDCap upload
## - worked, but headers should be lower case
##
## - 02aug22 not allowed to export or import.
##   Try manual data upload after attribute merge.
## =============================================================================

## Only runs when auto_upload is exactly TRUE
if (isTRUE(auto_upload)) {
  ## Trying out native piping
  ## The token is read as the header (column name) of the token file.
  ## check.names = FALSE keeps the header exactly as written; with the default
  ## a header starting with a digit gets an "X" put in front of it, which is
  ## what the earlier substr(x, 2, 33) workaround removed.
  ## suppressWarnings() wraps the whole chain, so warnings raised while reading
  ## the file are silenced.
  token_talos <- read.csv(
    "/Users/au301842/talos_redcap_token.csv",
    colClasses = "character",
    check.names = FALSE
  ) |>
    names() |>
    suppressWarnings()

  ## See https://towardsdatascience.com/understanding-the-native-r-pipe-98dea6d8b61b"
  ## OBS: new placeholder "_" in >4.2.

  ## Write the complete data set to REDCap; the returned status is kept in stts
  stts <- REDCapR::redcap_write(
    ds = ds_all,
    redcap_uri = "https://redcap.rm.dk/api/",
    token = token_talos
  )

  ## Read the records back from REDCap
  records_mod <- REDCapR::redcap_read_oneshot(
    redcap_uri = "https://redcap.rm.dk/api/",
    token = token_talos
  )
}

## Notes:
## - Validation and data selection options are not applied, as these are not necessary for upload.