diff --git a/R/.Rbuildignore b/R/.Rbuildignore index e148253..7f78a9d 100644 --- a/R/.Rbuildignore +++ b/R/.Rbuildignore @@ -2,3 +2,4 @@ ^\.Rproj\.user$ ^\.travis\.yml$ ^appveyor\.yml$ +^data-raw$ diff --git a/R/DESCRIPTION b/R/DESCRIPTION index 5372597..3163f9a 100644 --- a/R/DESCRIPTION +++ b/R/DESCRIPTION @@ -1,6 +1,6 @@ Package: REDCapRITS Title: REDCap Repeating Instrument Table Splitter -Version: 0.0.0 +Version: 0.2.0 Authors@R: c( person("Paul", "Egeler", email = "paul.egeler@spectrumhealth.org", role = c("aut", "cre")), person("Spectrum Health, Grand Rapids, MI", role = "cph")) @@ -18,9 +18,10 @@ Suggests: License: GPL-3 Encoding: UTF-8 LazyData: true -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.1 URL: https://github.com/SpectrumHealthResearch/REDCapRITS BugReports: https://github.com/SpectrumHealthResearch/REDCapRITS/issues Collate: + 'utils.r' 'process_user_input.r' 'REDCap_split.r' diff --git a/R/NEWS.md b/R/NEWS.md new file mode 100644 index 0000000..e79fe06 --- /dev/null +++ b/R/NEWS.md @@ -0,0 +1,15 @@ +# REDCapRITS 0.2.0 (Release date: 2019-??-??) + + * [feature] User can now separate each form into its own data.frame, regardless if it is a repeating instrument or not. (#10) + * [bug] Handles auto-generated form timestamp fields. + +# REDCapRITS 0.1.0 (Release date: 2019-07-01) + + * [feature] User can now specify the name of the 'primary' table, which previously was left blank. (#9) + * [bug] Keeps REDCap-generated fields in non-repeating data.frame that are not present in metadata file. (#7) + * [enhancement] Unit tests created. (#6) + * [bug] Checkbox data now supported. (#1) + +# REDCapRITS 0.0.0 (Release date: 2018-06-03) + + * Initial Release diff --git a/R/R/REDCap_split.r b/R/R/REDCap_split.r index 4e45901..d28a26a 100644 --- a/R/R/REDCap_split.r +++ b/R/R/REDCap_split.r @@ -10,6 +10,11 @@ #' @param metadata Project metadata (the data dictionary). May be a #' \code{data.frame}, \code{response}, or \code{character} vector containing #' JSON from an API call. +#' @param primary_table_name Name given to the list element for the primary +#' output table (as described in \emph{README.md}). Ignored if +#' \code{forms = 'all'}. +#' @param forms Indicate whether to create separate tables for repeating +#' instruments only or for all forms. #' @author Paul W. Egeler, M.S., GStat #' @examples #' \dontrun{ @@ -64,11 +69,21 @@ #' # Split the tables #' REDCapRITS::REDCap_split(data, metadata) #' } -#' @return A list of \code{"data.frame"}s: one base table and zero or more -#' tables for each repeating instrument. -#' @include process_user_input.r +#' @return A list of \code{"data.frame"}s. The number of tables will differ +#' depending on the \code{forms} option selected. +#' \itemize{ +#' \item \code{'repeating'}: one base table and one or more +#' tables for each repeating instrument. +#' \item \code{'all'}: a data.frame for each instrument, regardless of +#' whether it is a repeating instrument or not. +#' } +#' @include process_user_input.r utils.r #' @export -REDCap_split <- function(records, metadata) { +REDCap_split <- function(records, + metadata, + primary_table_name = "", + forms = c("repeating", "all") +) { # Process user input records <- process_user_input(records) @@ -77,103 +92,22 @@ REDCap_split <- function(records, metadata) { # Get the variable names in the dataset vars_in_data <- names(records) + # Match arg for forms + forms <- match.arg(forms) + # Check to see if there were any repeating instruments - if (!any(vars_in_data == "redcap_repeat_instrument")) { - - message("There are no repeating instruments in this data.") - - return(list(records)) - + if (forms == "repeating" && !"redcap_repeat_instrument" %in% vars_in_data) { + stop("There are no repeating instruments in this dataset.") } # Standardize variable names for metadata - names(metadata) <- c( - "field_name", "form_name", "section_header", "field_type", - "field_label", "select_choices_or_calculations", "field_note", - "text_validation_type_or_show_slider_number", "text_validation_min", - "text_validation_max", "identifier", "branching_logic", "required_field", - "custom_alignment", "question_number", "matrix_group_name", "matrix_ranking", - "field_annotation" - ) + names(metadata) <- metadata_names # Make sure that no metadata columns are factors metadata <- rapply(metadata, as.character, classes = "factor", how = "replace") # Find the fields and associated form - fields <- metadata[ - !metadata$field_type %in% c("descriptive", "checkbox"), - c("field_name", "form_name") - ] - - # Process instrument status fields - form_names <- unique(metadata$form_name) - form_complete_fields <- data.frame( - field_name = paste0(form_names, "_complete"), - form_name = form_names, - stringsAsFactors = FALSE - ) - - fields <- rbind(fields, form_complete_fields) - - # Process checkbox fields - if (any(metadata$field_type == "checkbox")) { - - checkbox_basenames <- metadata[ - metadata$field_type == "checkbox", - c("field_name", "form_name") - ] - - checkbox_fields <- - do.call( - "rbind", - apply( - checkbox_basenames, - 1, - function(x, y) - data.frame( - field_name = y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"), y, perl = TRUE)], - form_name = x[2], - stringsAsFactors = FALSE, - row.names = NULL - ), - y = vars_in_data - ) - ) - - fields <- rbind(fields, checkbox_fields) - - } - - # Process ".*\\.factor" fields supplied by REDCap's export data R script - if (any(grepl("\\.factor$", vars_in_data))) { - - factor_fields <- - do.call( - "rbind", - apply( - fields, - 1, - function(x, y) { - field_indices <- grepl(paste0("^", x[1], "\\.factor$"), y) - if (any(field_indices)) - data.frame( - field_name = y[field_indices], - form_name = x[2], - stringsAsFactors = FALSE, - row.names = NULL - ) - }, - y = vars_in_data - ) - ) - - fields <- rbind(fields, factor_fields) - - } - - # Identify the subtables in the data - subtables <- unique(records$redcap_repeat_instrument) - subtables <- subtables[subtables != ""] + fields <- match_fields_to_form(metadata, vars_in_data) # Variables to be present in each output table universal_fields <- c( @@ -186,43 +120,74 @@ REDCap_split <- function(records, metadata) { ) ) - # Variables to be at the beginning of each repeating instrument - repeat_instrument_fields <- grep( - "^redcap_repeat.*", - vars_in_data, - value = TRUE - ) + if ("redcap_repeat_instrument" %in% vars_in_data) { + # Variables to be at the beginning of each repeating instrument + repeat_instrument_fields <- grep( + "^redcap_repeat.*", + vars_in_data, + value = TRUE + ) + # Identify the subtables in the data + subtables <- unique(records$redcap_repeat_instrument) + subtables <- subtables[subtables != ""] - # Split the table based on instrument - out <- split.data.frame(records, records$redcap_repeat_instrument) + # Split the table based on instrument + out <- split.data.frame(records, records$redcap_repeat_instrument) + primary_table_index <- which(names(out) == "") - # Delete the variables that are not relevant - for (i in names(out)) { + if (forms == "repeating" && primary_table_name %in% subtables) { + warning("The label given to the primary table is already used by a repeating instrument. The primary table label will be left blank.") + primary_table_name <- "" + } else if (primary_table_name > "") { + names(out)[[primary_table_index]] <- primary_table_name + } - if (i == "") { + # Delete the variables that are not relevant + for (i in names(out)) { - out_fields <- which( - vars_in_data %in% c( - universal_fields, - fields[!fields[,2] %in% subtables, 1] + if (i == primary_table_name) { + + out_fields <- which( + vars_in_data %in% c( + universal_fields, + fields[!fields[,2] %in% subtables, 1] + ) ) - ) - out[[which(names(out) == "")]] <- out[[which(names(out) == "")]][out_fields] + out[[primary_table_index]] <- out[[primary_table_index]][out_fields] - } else { + } else { - out_fields <- which( - vars_in_data %in% c( - universal_fields, - repeat_instrument_fields, - fields[fields[,2] == i, 1] + out_fields <- which( + vars_in_data %in% c( + universal_fields, + repeat_instrument_fields, + fields[fields[,2] == i, 1] + ) ) - ) - out[[i]] <- out[[i]][out_fields] + out[[i]] <- out[[i]][out_fields] + + } } + if (forms == "all") { + + out <- c( + split_non_repeating_forms( + out[[primary_table_index]], + universal_fields, + fields[!fields[,2] %in% subtables,] + ), + out[-primary_table_index] + ) + + } + + } else { + + out <- split_non_repeating_forms(records, universal_fields, fields) + } out diff --git a/R/R/sysdata.rda b/R/R/sysdata.rda new file mode 100644 index 0000000..0299f29 Binary files /dev/null and b/R/R/sysdata.rda differ diff --git a/R/R/utils.r b/R/R/utils.r new file mode 100644 index 0000000..87d1509 --- /dev/null +++ b/R/R/utils.r @@ -0,0 +1,106 @@ +match_fields_to_form <- function(metadata, vars_in_data) { + + fields <- metadata[ + !metadata$field_type %in% c("descriptive", "checkbox"), + c("field_name", "form_name") + ] + + # Process instrument status fields + form_names <- unique(metadata$form_name) + form_complete_fields <- data.frame( + field_name = paste0(form_names, "_complete"), + form_name = form_names, + stringsAsFactors = FALSE + ) + + fields <- rbind(fields, form_complete_fields) + + # Process survey timestamps + timestamps <- intersect(vars_in_data, paste0(form_names, "_timestamp")) + if (length(timestamps)) { + + timestamp_fields <- data.frame( + field_name = timestamps, + form_name = sub("_timestamp$", "", timestamps), + stringsAsFactors = FALSE + ) + + fields <- rbind(fields, timestamp_fields) + + } + + # Process checkbox fields + if (any(metadata$field_type == "checkbox")) { + + checkbox_basenames <- metadata[ + metadata$field_type == "checkbox", + c("field_name", "form_name") + ] + + checkbox_fields <- + do.call( + "rbind", + apply( + checkbox_basenames, + 1, + function(x, y) + data.frame( + field_name = y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"), y, perl = TRUE)], + form_name = x[2], + stringsAsFactors = FALSE, + row.names = NULL + ), + y = vars_in_data + ) + ) + + fields <- rbind(fields, checkbox_fields) + + } + + # Process ".*\\.factor" fields supplied by REDCap's export data R script + if (any(grepl("\\.factor$", vars_in_data))) { + + factor_fields <- + do.call( + "rbind", + apply( + fields, + 1, + function(x, y) { + field_indices <- grepl(paste0("^", x[1], "\\.factor$"), y) + if (any(field_indices)) + data.frame( + field_name = y[field_indices], + form_name = x[2], + stringsAsFactors = FALSE, + row.names = NULL + ) + }, + y = vars_in_data + ) + ) + + fields <- rbind(fields, factor_fields) + + } + + fields + + } + + +split_non_repeating_forms <- function(table, universal_fields, fields) { + + forms <- unique(fields[[2]]) + + x <- lapply( + forms, + function (x) { + table[names(table) %in% union(universal_fields, fields[fields[,2] == x,1])] + }) + + structure(x, names = forms) + +} + diff --git a/R/data-raw/metadata_names.R b/R/data-raw/metadata_names.R new file mode 100644 index 0000000..30d8c8c --- /dev/null +++ b/R/data-raw/metadata_names.R @@ -0,0 +1,10 @@ +metadata_names <- c( + "field_name", "form_name", "section_header", "field_type", + "field_label", "select_choices_or_calculations", "field_note", + "text_validation_type_or_show_slider_number", "text_validation_min", + "text_validation_max", "identifier", "branching_logic", "required_field", + "custom_alignment", "question_number", "matrix_group_name", "matrix_ranking", + "field_annotation" +) + +usethis::use_data(metadata_names, overwrite = TRUE, internal = TRUE) diff --git a/R/man/REDCap_split.Rd b/R/man/REDCap_split.Rd index 8bd4793..c5a99b0 100644 --- a/R/man/REDCap_split.Rd +++ b/R/man/REDCap_split.Rd @@ -4,7 +4,8 @@ \alias{REDCap_split} \title{Split REDCap repeating instruments table into multiple tables} \usage{ -REDCap_split(records, metadata) +REDCap_split(records, metadata, primary_table_name = "", + forms = c("repeating", "all")) } \arguments{ \item{records}{Exported project records. May be a \code{data.frame}, @@ -14,10 +15,23 @@ call.} \item{metadata}{Project metadata (the data dictionary). May be a \code{data.frame}, \code{response}, or \code{character} vector containing JSON from an API call.} + +\item{primary_table_name}{Name given to the list element for the primary +output table (as described in \emph{README.md}). Ignored if +\code{forms = 'all'}.} + +\item{forms}{Indicate whether to create separate tables for repeating +instruments only or for all forms.} } \value{ -A list of \code{"data.frame"}s: one base table and zero or more - tables for each repeating instrument. +A list of \code{"data.frame"}s. The number of tables will differ + depending on the \code{forms} option selected. + \itemize{ + \item \code{'repeating'}: one base table and one or more + tables for each repeating instrument. + \item \code{'all'}: a data.frame for each instrument, regardless of + whether it is a repeating instrument or not. + } } \description{ This will take output from a REDCap export and split it into a base table diff --git a/R/tests/testthat/test-forms-arg.R b/R/tests/testthat/test-forms-arg.R new file mode 100644 index 0000000..74c3507 --- /dev/null +++ b/R/tests/testthat/test-forms-arg.R @@ -0,0 +1,63 @@ +context("Using the `forms = 'all'` argument") + +# Global variables -------------------------------------------------------- + +# Cars +metadata <- jsonlite::fromJSON( + get_data_location( + "ExampleProject_metadata.json" + ) +) + +records <- jsonlite::fromJSON( + get_data_location( + "ExampleProject_records.json" + ) +) + +redcap_output_json <- REDCap_split(records, metadata, forms = "all") + +# Longitudinal +file_paths <- sapply( + c( + records = "WARRIORtestForSoftwa_DATA_2018-06-21_1431.csv", + metadata = "WARRIORtestForSoftwareUpgrades_DataDictionary_2018-06-21.csv" + ), get_data_location +) + +redcap <- lapply(file_paths, read.csv, stringsAsFactors = FALSE) +redcap[["metadata"]] <- with(redcap, metadata[metadata[,1] > "",]) +redcap_output_long <- with(redcap, REDCap_split(records, metadata, forms = "all")) +redcap_long_names <- names(redcap[[1]]) + +# Tests ------------------------------------------------------------------- + +test_that("Each form is an element in the list", { + + expect_length(redcap_output_json, 3L) + expect_identical(names(redcap_output_json), c("motor_trend_cars", "grouping", "sale")) + +}) + +test_that("All variables land somewhere", { + + expect_true(setequal(names(records), Reduce("union", sapply(redcap_output_json, names)))) + +}) + + +test_that("Primary table name is ignored", { + expect_identical( + REDCap_split(records, metadata, "HELLO", "all"), + redcap_output_json + ) +}) + +test_that("Supports longitudinal data", { + + # setdiff(redcap_long_names, Reduce("union", sapply(redcap_output_long, names))) + ## [1] "informed_consent_and_addendum_timestamp" + + expect_true(setequal(redcap_long_names, Reduce("union", sapply(redcap_output_long, names)))) + +}) diff --git a/R/tests/testthat/test-longitudinal.R b/R/tests/testthat/test-longitudinal.R index c64cc14..3b955c3 100644 --- a/R/tests/testthat/test-longitudinal.R +++ b/R/tests/testthat/test-longitudinal.R @@ -13,5 +13,5 @@ test_that("CSV export matches reference", { redcap_output <- with(redcap, REDCap_split(records, metadata)) - expect_known_hash(redcap_output, "dff3a52955") + expect_known_hash(redcap_output, "0934bcb292") }) diff --git a/R/tests/testthat/test-primary-table-name.R b/R/tests/testthat/test-primary-table-name.R new file mode 100644 index 0000000..0c3a31b --- /dev/null +++ b/R/tests/testthat/test-primary-table-name.R @@ -0,0 +1,38 @@ +context("Primary table name processing") + + +# Global variables ------------------------------------------------------- +metadata <- jsonlite::fromJSON( + get_data_location( + "ExampleProject_metadata.json" + ) +) + +records <- jsonlite::fromJSON( + get_data_location( + "ExampleProject_records.json" + ) +) + +ref_hash <- "2c8b6531597182af1248f92124161e0c" + +# Tests ------------------------------------------------------------------- +test_that("Will not use a repeating instrument name for primary table", { + + redcap_output_json1 <- expect_warning( + REDCap_split(records, metadata, "sale"), + "primary table" + ) + + expect_known_hash(redcap_output_json1, ref_hash) + +}) + +test_that("Names are set correctly and output is identical", { + redcap_output_json2 <- REDCap_split(records, metadata, "main") + + + expect_identical(names(redcap_output_json2), c("main", "sale")) + expect_known_hash(setNames(redcap_output_json2, c("", "sale")), ref_hash) + +})