## This approach is based on a folder with several data files to upload to an empty database

## Bulk conversion of Stata files to CSV

library(haven)
library(dplyr)

## Export output files?
exp_out <- TRUE
id_data <- c("record_id", "cpr", "rtreat") # Data from the key file is all marked as identifier
auto_upload <- FALSE

## Loading filenames
setwd("/Volumes/Data/")
files <- list.files("STATA13/.", pattern = "\\.dta$", full.names = FALSE)

## Loading randomisation key file, keeping only necessary variables,
## renaming rnumb to record_id
key <- as_factor(read_dta("/Volumes/Data/STATA13/inkl_rev_v13.dta")) %>%
  filter(rnumb != "999") %>%
  select(rnumb, cpr, rtreat) %>%
  rename(record_id = rnumb)

## Excluding undesired datasets:
## DAP, logfile, randomisation log, randomisation key.
## These will be kept and uploaded separately.
files <- files[!grepl("dap", files) &
                 !grepl("transactions", files) &
                 !grepl("rand", files)]

## Get "first name" from all files
fnames <- sapply(strsplit(files, "[_]"), "[[", 1) # Just discovered the subset functions "[" and "[[". Wow!
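
## A commented toy illustration of "[" and "[[" used as ordinary functions
## (made-up file names, not the trial data); uncomment to try it out:
# x <- strsplit(c("fmed_v13.dta", "vis_v13.dta"), "[_]")
# "[["(x, 1)         # same as x[[1]]: c("fmed", "v13.dta")
# sapply(x, "[[", 1) # first element of each split: c("fmed", "vis")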

## Loading data sets to list
ls <- list()

for (i in seq_along(files)) {

  ## Get dataset, factorise labels,
  ## join with key file, drop cpr and remove NAs
  d <- left_join(key,
                 as_factor(read_dta(paste0("STATA13/", files[i]))), by = "cpr") %>%
    # select(., !"cpr") %>%
    filter(!is.na(SYS_SITE)) ## SYS_DATA is in all sets but fmed, so SYS_SITE is used
  d <- data.frame(d)

  ## Append dataframe to list
  ls[[i]] <- d
  names(ls)[i] <- fnames[i]

  ## Export original data frame
  if (exp_out) {
    write.csv(d, paste0("REDCap/orig/", fnames[i], ".csv"), row.names = FALSE)
  }
}

## DAP variables to include, incl. record_id
vars <- c("record_id", "hojde", "vaegt", "vaegt_anslaaet", "rygning", "alkohol", "civil",
          "bolig", "diabetes", "hyperten", "perifer_arteriel", "atriefli", "ami",
          "tidl_tci", "trombolyse", "trombektomi", "nhiss_foer")

## Importing and subsetting DAP data, all in one pipe
dap_d <- read_dta("/Volumes/Data/STATA13/dap_rapport_2017.dta") %>% ## Loading file with correct name
  filter(!duplicated(cpr)) %>%      ## Limiting to first event
  left_join(key, ., by = "cpr") %>% ## Join with key for record_ids, keeping only entries also in key
  select(all_of(vars))              ## Select only the specified variables, leaving out cpr

ls[[length(ls) + 1]] <- dap_d
names(ls)[length(ls)] <- "reg"

## Exporting attributes
for (i in seq_along(ls)) {

  ## List all attributes (variable labels) for export
  l <- lapply(ls[[i]], attr, "label")

  for (j in seq_along(l)) {
    l[[j]] <- ifelse(is.null(l[[j]]), "No attr", l[[j]])
  }

  la <- data.frame(names = names(l), attr = unlist(l))

  ## Export individual attribute files for data overview
  if (exp_out) {
    write.csv(la, paste0("REDCap/attr/attr_", names(ls)[i], ".csv"), row.names = FALSE)
  }
}
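
## For reference: haven::read_dta() stores each Stata variable label in the
## "label" attribute of the column, which is what the loop above collects.
## A commented toy illustration (hypothetical variable, not from the trial):
# v <- structure(c(170, 182), label = "Height (cm)")
# attr(v, "label") # "Height (cm)"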

## Naming and splitting in unique instruments for DataDictionary creation and data upload

## Leave out the inklusion file, adverse events and other medication.
## These are not needed for upload or will be uploaded separately.
r_sel <- !(grepl("inkl", files) | grepl("ae_", files) | grepl("medicin", files))

## Limiting name variables
# r_files<-files[r_sel]
r_fnames <- c(fnames[r_sel], "reg")

## Selecting elements in list
r_ls <- ls[names(ls) %in% r_fnames]

## New list for wrangling: rename variables and split by instance
r_lup <- list()

for (i in seq_along(r_ls)) {

  ## Suffix generic variable names
  d <- r_ls[[i]]
  name <- names(r_ls)[[i]]
  dn <- !(grepl(name, colnames(d)) | colnames(d) %in% id_data) ## Flags colnames that do not contain the instrument name
  colnames(d)[dn] <- paste0(name, "_", colnames(d)[dn]) ## Adds suffix to colnames to ensure unique names
  ## Identifier columns (record_id etc.) are left untouched

  ## Test for if conditions
  ## Uses a redundant double if, as else wasn't behaving consistently
  test <- grepl("INSTANCE", colnames(d), ignore.case = TRUE)

  if (any(test)) {
    d[, test] <- factor(d[, test]) # Factorise to secure ordering
    ins <- levels(d[, test])       # Instance numbers drawn from factor levels

    ds <- split(d[, !test], d[, test])  # Splits by instance and drops the instance variable
    names(ds) <- paste0(name, "_", ins) # Names frames by instrument, appends instance

    for (j in seq_along(ds)) {
      ## Append instance number to all non-identifier column names
      colnames(ds[[j]])[!colnames(ds[[j]]) %in% id_data] <-
        paste0(colnames(ds[[j]])[!colnames(ds[[j]]) %in% id_data], "_", ins[j])
    }

    r_lup <- append(r_lup, ds)
  }

  if (!any(test)) {
    r_lup[[length(r_lup) + 1]] <- d
    names(r_lup)[length(r_lup)] <- name
  }
}
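
## A commented toy illustration of the split-and-suffix step above
## (made-up instrument "vis" and values, not trial data):
# toy <- data.frame(record_id = c(1, 1, 2),
#                   REDCAP_REPEAT_INSTANCE = c("1", "2", "1"),
#                   vis_bp = c(120, 130, 140))
# split(toy[, -2], toy[, 2])
## This yields one data frame per instance ("1" and "2"); the loop then names
## them vis_1/vis_2 and appends "_1"/"_2" to the non-identifier columns.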

## Load REDCap instrument example file for variable names
icname <- colnames(read.csv("/Volumes/Data/REDCap/examples/examlpe instrument.csv"))

dd <- data.frame(matrix(ncol = length(icname))) ## Data frame to collect all
colnames(dd) <- icname

## Instrument for DataDictionary
## Format dataset for REDCap upload

for (i in seq_along(r_lup)) {

  dd_i <- data.frame(matrix(ncol = length(icname), nrow = ncol(r_lup[[i]]))) ## Data frame for this instrument
  colnames(dd_i) <- icname ## for easier reading

  ## Variable names
  dd_i[1] <- colnames(r_lup[[i]])

  ## Form Name
  dd_i[2] <- names(r_lup)[i]

  ## Field Type
  # dd_i[4]<-ifelse(sapply(r_lup[[i]], class)=="factor","radio","text")
  dd_i[4] <- "text"

  ## Field Label
  ## Using original attributes as field labels
  fl <- lapply(r_lup[[i]], attr, "label")
  for (j in seq_along(fl)) {
    ## If there is no label attribute, the variable name is used as a placeholder
    fl[[j]] <- ifelse(is.null(fl[[j]]),
                      names(fl)[[j]],
                      fl[[j]])
  }
  dd_i[5] <- unlist(fl)

  ## Choices
  ## (kept commented out; all fields are uploaded as free text for now)
  # for (j in 1:ncol(r_lup[[i]])){
  #   if (is.factor(r_lup[[i]][[j]])){
  #     lvl<-levels(r_lup[[i]][[j]])
  #     lvl_ch<-paste("1,",lvl[1])
  #     for (k in 2:length(lvl)){
  #       lvl_ch<-c(paste0(lvl_ch," | ",k,", ",lvl[k]))
  #     }
  #     dd_i[j,6]<-lvl_ch
  #   }
  # }
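
  ## A more compact sketch of the same choice-string construction, also kept
  ## commented out (REDCap choice format: "1, level1 | 2, level2 | ..."):
  # for (j in 1:ncol(r_lup[[i]])){
  #   if (is.factor(r_lup[[i]][[j]])){
  #     lvl <- levels(r_lup[[i]][[j]])
  #     dd_i[j, 6] <- paste(seq_along(lvl), lvl, sep = ", ", collapse = " | ")
  #   }
  # }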

  ## Text Validation
  ## Only used for date and time data
  # for (j in 1:ncol(r_lup[[i]])){
  #   dd_i[j,8]<-case_when(class(r_lup[[i]][[j]])[1]%in%c("POSIXct","POSIXt") ~"datetime_seconds_ymd",
  #                        class(r_lup[[i]][[j]])[1]%in%c("Date") ~"date_ymd")
  # }

  ## Merge all
  dd <- rbind(dd, dd_i)

  if (exp_out) {
    # dir.create(file.path("/Volumes/Data/REDCap/data",names(r_lup)[[i]]))
    write.csv(r_lup[[i]], paste0("/Volumes/Data/REDCap/data/", names(r_lup)[[i]], ".csv"), row.names = FALSE)
  }
}

## Ready the DataDictionary for export: keep only unique field names
## (record_id etc. appear in every instrument) and drop NA rows
dd_exp <- dd %>% filter(!(duplicated(Variable...Field.Name) | is.na(Variable...Field.Name)))
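
## duplicated() flags repeats after the first occurrence, so shared fields
## such as record_id keep their first row only; a commented toy check:
# duplicated(c("record_id", "age", "record_id")) # FALSE FALSE TRUE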

## Marking identifier variables
dd_exp$Identifier.[dd_exp$Variable...Field.Name %in% id_data] <- "y"

write.csv(dd_exp, "/Volumes/Data/REDCap/data_dictionary.csv", row.names = FALSE, na = "")
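
## The dictionary could presumably also be pushed through the API instead of
## uploaded manually; a commented, untested sketch, assuming REDCapR's
## redcap_metadata_write() accepts this layout (token_talos is read further down):
# REDCapR::redcap_metadata_write(dd_exp,
#                                redcap_uri = "https://redcap.rm.dk/api/",
#                                token      = token_talos)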

## ONE DATA SET TO RULE THEM ALL
ds_all <- key

for (i in names(r_lup)) {
  ## All non-identifier variable names are unique, so the joining variables are
  ## not specified and left_join() joins on the shared identifier columns.
  ds_all <- left_join(ds_all, r_lup[[i]])
}

colnames(ds_all) <- colnames(ds_all) %>% tolower() ## All names in REDCap are lower case

if (exp_out) {
  write.csv(ds_all, "/Volumes/Data/REDCap/complete_dataset.csv", row.names = FALSE, na = "")
}

## =============================================================================
## REDCap upload
## - Worked, but headers should be lower case.
##
## - 02aug22: not allowed to export or import.
##   Try manual data upload after attribute merge.
## =============================================================================

if (auto_upload) {

  ## Trying out native piping
  token_talos <- read.csv("/Users/au301842/talos_redcap_token.csv", colClasses = "character") |>
    names() |>
    # (\(x){ ## Shorthand for an anonymous lambda function. The new "_" placeholder does not work here.
    #   substr(x,2,33)})()|>
    suppressWarnings()

  ## See https://towardsdatascience.com/understanding-the-native-r-pipe-98dea6d8b61b
  ## Note: the "_" placeholder is only available from R 4.2.
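
  ## A commented toy comparison of the three ways to pipe into a call with the
  ## native pipe (toy string only, not the real token):
  # "xTOKENxxx" |> substr(2, 6)                        # pipes into the first argument
  # "xTOKENxxx" |> (\(x) substr(x, 2, 6))()            # anonymous lambda wrapper
  # "xTOKENxxx" |> substr(x = _, start = 2, stop = 6)  # "_" placeholder, R >= 4.2 only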

  stts <- REDCapR::redcap_write(ds = ds_all,
                                redcap_uri = "https://redcap.rm.dk/api/",
                                token = token_talos)

  records_mod <- REDCapR::redcap_read_oneshot(
    redcap_uri = "https://redcap.rm.dk/api/",
    token = token_talos
  )
}

## Notes:
## - Validation and data selection options are not applied, as these are not necessary for upload.