Merge pull request #11 from SpectrumHealthResearch/dev

Merging changes for #9 and #10
2025-04-26 07:50:46 +02:00 · 2019-08-14 08:55:27 -04:00 · 2019-08-14 08:55:27 -04:00 · 8685e6df4a
commit 8685e6df4a
parent 7a83e4bc45 d15c2d35ad
11 changed files with 336 additions and 123 deletions
--- a/R/.Rbuildignore
+++ b/R/.Rbuildignore
@ -2,3 +2,4 @@
 ^\.Rproj\.user$
 ^\.travis\.yml$
 ^appveyor\.yml$
+^data-raw$
--- a/R/DESCRIPTION
+++ b/R/DESCRIPTION
@ -1,6 +1,6 @@
 Package: REDCapRITS
 Title: REDCap Repeating Instrument Table Splitter
-Version: 0.0.0
+Version: 0.2.0
 Authors@R: c(
    person("Paul", "Egeler", email = "paul.egeler@spectrumhealth.org", role = c("aut", "cre")),
    person("Spectrum Health, Grand Rapids, MI", role = "cph"))
@ -18,9 +18,10 @@ Suggests:
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.1
 URL: https://github.com/SpectrumHealthResearch/REDCapRITS
 BugReports: https://github.com/SpectrumHealthResearch/REDCapRITS/issues
 Collate: 
+    'utils.r'
    'process_user_input.r'
    'REDCap_split.r'
--- a/R/NEWS.md
+++ b/R/NEWS.md
@ -0,0 +1,15 @@
+# REDCapRITS 0.2.0 (Release date: 2019-??-??)
+
+  * [feature] User can now separate each form into its own data.frame, regardless if it is a repeating instrument or not. (#10)
+  * [bug] Handles auto-generated form timestamp fields.
+
+# REDCapRITS 0.1.0 (Release date: 2019-07-01)
+
+  * [feature] User can now specify the name of the 'primary' table, which previously was left blank. (#9)
+  * [bug] Keeps REDCap-generated fields in non-repeating data.frame that are not present in metadata file. (#7)
+  * [enhancement] Unit tests created. (#6)
+  * [bug] Checkbox data now supported. (#1)
+
+# REDCapRITS 0.0.0 (Release date: 2018-06-03)
+
+  * Initial Release
--- a/R/R/REDCap_split.r
+++ b/R/R/REDCap_split.r
@ -10,6 +10,11 @@
 #' @param metadata Project metadata (the data dictionary). May be a
 #'   \code{data.frame}, \code{response}, or \code{character} vector containing
 #'   JSON from an API call.
+#' @param primary_table_name Name given to the list element for the primary
+#'   output table (as described in \emph{README.md}). Ignored if
+#'   \code{forms = 'all'}.
+#' @param forms Indicate whether to create separate tables for repeating
+#'   instruments only or for all forms.
 #' @author Paul W. Egeler, M.S., GStat
 #' @examples
 #' \dontrun{
@ -64,11 +69,21 @@
 #' # Split the tables
 #' REDCapRITS::REDCap_split(data, metadata)
 #' }
-#' @return A list of \code{"data.frame"}s: one base table and zero or more
+#' @return A list of \code{"data.frame"}s. The number of tables will differ
+#'   depending on the \code{forms} option selected.
+#'   \itemize{
+#'     \item \code{'repeating'}: one base table and one or more
 #'     tables for each repeating instrument.
-#' @include process_user_input.r
+#'     \item \code{'all'}: a data.frame for each instrument, regardless of
+#'     whether it is a repeating instrument or not.
+#'   }
+#' @include process_user_input.r utils.r
 #' @export
-REDCap_split <- function(records, metadata) {
+REDCap_split <- function(records,
+                         metadata,
+                         primary_table_name = "",
+                         forms = c("repeating", "all")
+) {

  # Process user input
  records  <- process_user_input(records)
@ -77,103 +92,22 @@ REDCap_split <- function(records, metadata) {
  # Get the variable names in the dataset
  vars_in_data <- names(records)

+  # Match arg for forms
+  forms <- match.arg(forms)
+
  # Check to see if there were any repeating instruments
-  if (!any(vars_in_data == "redcap_repeat_instrument")) {
-
-    message("There are no repeating instruments in this data.")
-
-    return(list(records))
-
+  if (forms == "repeating" && !"redcap_repeat_instrument" %in% vars_in_data) {
+    stop("There are no repeating instruments in this dataset.")
  }

  # Standardize variable names for metadata
-  names(metadata) <- c(
-    "field_name", "form_name", "section_header", "field_type",
-    "field_label", "select_choices_or_calculations", "field_note",
-    "text_validation_type_or_show_slider_number", "text_validation_min",
-    "text_validation_max", "identifier", "branching_logic", "required_field",
-    "custom_alignment", "question_number", "matrix_group_name", "matrix_ranking",
-    "field_annotation"
-  )
+  names(metadata) <- metadata_names

  # Make sure that no metadata columns are factors
  metadata <- rapply(metadata, as.character, classes = "factor", how = "replace")

  # Find the fields and associated form
-  fields <- metadata[
-    !metadata$field_type %in% c("descriptive", "checkbox"),
-    c("field_name", "form_name")
-  ]
-
-  # Process instrument status fields
-  form_names <- unique(metadata$form_name)
-  form_complete_fields <- data.frame(
-    field_name = paste0(form_names, "_complete"),
-    form_name = form_names,
-    stringsAsFactors = FALSE
-  )
-
-  fields <- rbind(fields, form_complete_fields)
-
-  # Process checkbox fields
-  if (any(metadata$field_type == "checkbox")) {
-
-    checkbox_basenames <- metadata[
-      metadata$field_type == "checkbox",
-      c("field_name", "form_name")
-    ]
-
-    checkbox_fields <-
-      do.call(
-        "rbind",
-        apply(
-          checkbox_basenames,
-          1,
-          function(x, y)
-            data.frame(
-              field_name = y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"), y, perl = TRUE)],
-              form_name = x[2],
-              stringsAsFactors = FALSE,
-              row.names = NULL
-            ),
-          y = vars_in_data
-        )
-      )
-
-    fields <- rbind(fields, checkbox_fields)
-
-  }
-
-  # Process ".*\\.factor" fields supplied by REDCap's export data R script
-  if (any(grepl("\\.factor$", vars_in_data))) {
-
-    factor_fields <-
-      do.call(
-        "rbind",
-        apply(
-          fields,
-          1,
-          function(x, y) {
-            field_indices <- grepl(paste0("^", x[1], "\\.factor$"), y)
-            if (any(field_indices))
-              data.frame(
-                field_name = y[field_indices],
-                form_name = x[2],
-                stringsAsFactors = FALSE,
-                row.names = NULL
-              )
-          },
-          y = vars_in_data
-        )
-      )
-
-    fields <- rbind(fields, factor_fields)
-
-  }
-
-  # Identify the subtables in the data
-  subtables <- unique(records$redcap_repeat_instrument)
-  subtables <- subtables[subtables != ""]
+  fields <- match_fields_to_form(metadata, vars_in_data)

  # Variables to be present in each output table
  universal_fields <- c(
@ -186,6 +120,7 @@ REDCap_split <- function(records, metadata) {
    )
  )

+  if ("redcap_repeat_instrument" %in% vars_in_data) {
    # Variables to be at the beginning of each repeating instrument
    repeat_instrument_fields <- grep(
      "^redcap_repeat.*",
@ -193,14 +128,25 @@ REDCap_split <- function(records, metadata) {
      value = TRUE
    )

+    # Identify the subtables in the data
+    subtables <- unique(records$redcap_repeat_instrument)
+    subtables <- subtables[subtables != ""]

    # Split the table based on instrument
    out <- split.data.frame(records, records$redcap_repeat_instrument)
+    primary_table_index <- which(names(out) == "")
+
+    if (forms == "repeating" && primary_table_name %in% subtables) {
+      warning("The label given to the primary table is already used by a repeating instrument. The primary table label will be left blank.")
+      primary_table_name <- ""
+    } else if (primary_table_name > "") {
+      names(out)[[primary_table_index]] <- primary_table_name
+    }

    # Delete the variables that are not relevant
    for (i in names(out)) {

-    if (i == "") {
+      if (i == primary_table_name) {

        out_fields <- which(
          vars_in_data %in% c(
@ -208,7 +154,7 @@ REDCap_split <- function(records, metadata) {
            fields[!fields[,2] %in% subtables, 1]
          )
        )
-      out[[which(names(out) == "")]] <- out[[which(names(out) == "")]][out_fields]
+        out[[primary_table_index]] <- out[[primary_table_index]][out_fields]

      } else {

@ -225,6 +171,25 @@ REDCap_split <- function(records, metadata) {

    }

+    if (forms == "all") {
+
+      out <- c(
+        split_non_repeating_forms(
+          out[[primary_table_index]],
+          universal_fields,
+          fields[!fields[,2] %in% subtables,]
+        ),
+        out[-primary_table_index]
+      )
+
+    }
+
+  } else {
+
+    out <- split_non_repeating_forms(records, universal_fields, fields)
+
+  }
+
  out

 }
--- a/R/R/sysdata.rda
+++ b/R/R/sysdata.rda
--- a/R/R/utils.r
+++ b/R/R/utils.r
@ -0,0 +1,106 @@
+match_fields_to_form <- function(metadata, vars_in_data) {
+
+  fields <- metadata[
+    !metadata$field_type %in% c("descriptive", "checkbox"),
+    c("field_name", "form_name")
+  ]
+
+  # Process instrument status fields
+  form_names <- unique(metadata$form_name)
+  form_complete_fields <- data.frame(
+    field_name = paste0(form_names, "_complete"),
+    form_name = form_names,
+    stringsAsFactors = FALSE
+  )
+
+  fields <- rbind(fields, form_complete_fields)
+
+  # Process survey timestamps
+  timestamps <- intersect(vars_in_data, paste0(form_names, "_timestamp"))
+  if (length(timestamps)) {
+
+    timestamp_fields <- data.frame(
+      field_name = timestamps,
+      form_name = sub("_timestamp$", "", timestamps),
+      stringsAsFactors = FALSE
+    )
+
+    fields <- rbind(fields, timestamp_fields)
+
+  }
+
+  # Process checkbox fields
+  if (any(metadata$field_type == "checkbox")) {
+
+    checkbox_basenames <- metadata[
+      metadata$field_type == "checkbox",
+      c("field_name", "form_name")
+    ]
+
+    checkbox_fields <-
+      do.call(
+        "rbind",
+        apply(
+          checkbox_basenames,
+          1,
+          function(x, y)
+            data.frame(
+              field_name = y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"), y, perl = TRUE)],
+              form_name = x[2],
+              stringsAsFactors = FALSE,
+              row.names = NULL
+            ),
+          y = vars_in_data
+        )
+      )
+
+    fields <- rbind(fields, checkbox_fields)
+
+  }
+
+  # Process ".*\\.factor" fields supplied by REDCap's export data R script
+  if (any(grepl("\\.factor$", vars_in_data))) {
+
+    factor_fields <-
+      do.call(
+        "rbind",
+        apply(
+          fields,
+          1,
+          function(x, y) {
+            field_indices <- grepl(paste0("^", x[1], "\\.factor$"), y)
+            if (any(field_indices))
+              data.frame(
+                field_name = y[field_indices],
+                form_name = x[2],
+                stringsAsFactors = FALSE,
+                row.names = NULL
+              )
+          },
+          y = vars_in_data
+        )
+      )
+
+    fields <- rbind(fields, factor_fields)
+
+  }
+
+  fields
+
+  }
+
+
+split_non_repeating_forms <- function(table, universal_fields, fields) {
+
+  forms <- unique(fields[[2]])
+
+  x <- lapply(
+    forms,
+    function (x) {
+      table[names(table) %in% union(universal_fields, fields[fields[,2] == x,1])]
+  })
+
+  structure(x, names = forms)
+
+}
+
--- a/R/data-raw/metadata_names.R
+++ b/R/data-raw/metadata_names.R
@ -0,0 +1,10 @@
+metadata_names <- c(
+  "field_name", "form_name", "section_header", "field_type",
+  "field_label", "select_choices_or_calculations", "field_note",
+  "text_validation_type_or_show_slider_number", "text_validation_min",
+  "text_validation_max", "identifier", "branching_logic", "required_field",
+  "custom_alignment", "question_number", "matrix_group_name", "matrix_ranking",
+  "field_annotation"
+)
+
+usethis::use_data(metadata_names, overwrite = TRUE, internal = TRUE)
--- a/R/man/REDCap_split.Rd
+++ b/R/man/REDCap_split.Rd
@ -4,7 +4,8 @@
 \alias{REDCap_split}
 \title{Split REDCap repeating instruments table into multiple tables}
 \usage{
-REDCap_split(records, metadata)
+REDCap_split(records, metadata, primary_table_name = "",
+  forms = c("repeating", "all"))
 }
 \arguments{
 \item{records}{Exported project records. May be a \code{data.frame},
@ -14,10 +15,23 @@ call.}
 \item{metadata}{Project metadata (the data dictionary). May be a
 \code{data.frame}, \code{response}, or \code{character} vector containing
 JSON from an API call.}
+
+\item{primary_table_name}{Name given to the list element for the primary
+output table (as described in \emph{README.md}). Ignored if
+\code{forms = 'all'}.}
+
+\item{forms}{Indicate whether to create separate tables for repeating
+instruments only or for all forms.}
 }
 \value{
-A list of \code{"data.frame"}s: one base table and zero or more
+A list of \code{"data.frame"}s. The number of tables will differ
+  depending on the \code{forms} option selected.
+  \itemize{
+    \item \code{'repeating'}: one base table and one or more
    tables for each repeating instrument.
+    \item \code{'all'}: a data.frame for each instrument, regardless of
+    whether it is a repeating instrument or not.
+  }
 }
 \description{
 This will take output from a REDCap export and split it into a base table
--- a/R/tests/testthat/test-forms-arg.R
+++ b/R/tests/testthat/test-forms-arg.R
@ -0,0 +1,63 @@
+context("Using the `forms = 'all'` argument")
+
+# Global variables --------------------------------------------------------
+
+# Cars
+metadata <- jsonlite::fromJSON(
+  get_data_location(
+    "ExampleProject_metadata.json"
+  )
+)
+
+records <- jsonlite::fromJSON(
+  get_data_location(
+    "ExampleProject_records.json"
+  )
+)
+
+redcap_output_json <- REDCap_split(records, metadata, forms = "all")
+
+# Longitudinal
+file_paths <- sapply(
+  c(
+    records = "WARRIORtestForSoftwa_DATA_2018-06-21_1431.csv",
+    metadata = "WARRIORtestForSoftwareUpgrades_DataDictionary_2018-06-21.csv"
+  ), get_data_location
+)
+
+redcap <- lapply(file_paths, read.csv, stringsAsFactors = FALSE)
+redcap[["metadata"]] <- with(redcap, metadata[metadata[,1] > "",])
+redcap_output_long <- with(redcap, REDCap_split(records, metadata, forms = "all"))
+redcap_long_names <- names(redcap[[1]])
+
+# Tests -------------------------------------------------------------------
+
+test_that("Each form is an element in the list", {
+
+  expect_length(redcap_output_json, 3L)
+  expect_identical(names(redcap_output_json), c("motor_trend_cars", "grouping", "sale"))
+
+})
+
+test_that("All variables land somewhere", {
+
+  expect_true(setequal(names(records), Reduce("union", sapply(redcap_output_json, names))))
+
+})
+
+
+test_that("Primary table name is ignored", {
+  expect_identical(
+    REDCap_split(records, metadata, "HELLO", "all"),
+    redcap_output_json
+  )
+})
+
+test_that("Supports longitudinal data", {
+
+  # setdiff(redcap_long_names, Reduce("union", sapply(redcap_output_long, names)))
+  ##  [1] "informed_consent_and_addendum_timestamp"
+
+  expect_true(setequal(redcap_long_names, Reduce("union", sapply(redcap_output_long, names))))
+
+})
--- a/R/tests/testthat/test-longitudinal.R
+++ b/R/tests/testthat/test-longitudinal.R
@ -13,5 +13,5 @@ test_that("CSV export matches reference", {
  redcap_output <- with(redcap, REDCap_split(records, metadata))


-  expect_known_hash(redcap_output, "dff3a52955")
+  expect_known_hash(redcap_output, "0934bcb292")
 })
--- a/R/tests/testthat/test-primary-table-name.R
+++ b/R/tests/testthat/test-primary-table-name.R
@ -0,0 +1,38 @@
+context("Primary table name processing")
+
+
+# Global  variables -------------------------------------------------------
+metadata <- jsonlite::fromJSON(
+  get_data_location(
+    "ExampleProject_metadata.json"
+  )
+)
+
+records <- jsonlite::fromJSON(
+  get_data_location(
+    "ExampleProject_records.json"
+  )
+)
+
+ref_hash <- "2c8b6531597182af1248f92124161e0c"
+
+# Tests -------------------------------------------------------------------
+test_that("Will not use a repeating instrument name for primary table", {
+
+  redcap_output_json1 <- expect_warning(
+    REDCap_split(records, metadata, "sale"),
+    "primary table"
+  )
+
+  expect_known_hash(redcap_output_json1, ref_hash)
+
+})
+
+test_that("Names are set correctly and output is identical", {
+  redcap_output_json2 <- REDCap_split(records, metadata, "main")
+
+
+  expect_identical(names(redcap_output_json2), c("main", "sale"))
+  expect_known_hash(setNames(redcap_output_json2, c("", "sale")), ref_hash)
+
+})