New function ds2dd_detailed(), which includes more details than the old ds2dd().

2025-02-18 18:12:36 +01:00 · 2024-01-18 14:57:12 +01:00 · 2024-01-18 14:57:12 +01:00 · 21e635d775
commit 21e635d775
parent f488dde528
8 changed files with 534 additions and 4 deletions
--- a/R/ds2dd_detailed.R
+++ b/R/ds2dd_detailed.R
@ -0,0 +1,343 @@
 ## Names referenced through non-standard evaluation inside dplyr verbs in this
 ## file are declared here so they are registered as known global variables.
 utils::globalVariables(
   c(
     "stats::setNames",
     "field_name",
     "field_type",
     "select_choices_or_calculations"
   )
 )
 #' Guess which date-time columns hold time-of-day values only
 #'
 #' @description
 #' Columns of class "POSIXct" or "hms" are treated as candidates. A candidate
 #' is flagged as time-only when its name matches `sel.pos` and does not match
 #' `sel.neg`. The guess relies on data class and column names only, hoping for
 #' a tiny bit of naming consistency, so the result should be inspected.
 #' R does not include a time-only data format natively, so the "hms" class
 #' from `readr` is used. This has to be converted to character class before
 #' REDCap upload.
 #'
 #' @param data data set
 #' @param validate flag to output validation data. Will output list.
 #' @param sel.pos Positive selection regex string
 #' @param sel.neg Negative selection regex string
 #'
 #' @return character vector of column names, or, if `validate = TRUE`, a list
 #' with the elements `is.POSIX`, `is.datetime` and `is.time_only`.
 #' @export
 #'
 #' @examples
 #' data <- redcapcast_data
 #' data |> guess_time_only_filter()
 #' data |> guess_time_only_filter(validate = TRUE) |> lapply(head)
 guess_time_only_filter <- function(data, validate = FALSE, sel.pos = "[Tt]i[d(me)]", sel.neg = "[Dd]at[eo]") {
   ## Candidate columns: anything carrying the POSIXct or hms class
   is_datetime <- vapply(
     data,
     \(column) any(class(column) %in% c("POSIXct", "hms")),
     logical(1)
   )
   candidates <- names(data)[is_datetime]
   ## Name based guess; inspection of the result is necessary.
   ## NOTE(review): in the default `sel.pos`, "[d(me)]" is a character class
   ## (one of d, (, m, e, )), not the alternation "(d|me)" - confirm intent.
   hit_pos <- grepl(pattern = sel.pos, x = candidates)
   hit_neg <- grepl(pattern = sel.neg, x = candidates)
   time_only <- hit_pos & !hit_neg
   if (!validate) {
     return(candidates[time_only])
   }
   list(
     "is.POSIX" = data[candidates],
     "is.datetime" = data[candidates[!time_only]],
     "is.time_only" = data[candidates[time_only]]
   )
 }
 #' Correction based on time_only_filter function. Introduces new class for easier
 #' validation labelling.
 #'
 #' @description
 #' Depends on `guess_time_only_filter()` to select the time-only columns.
 #' Each selected column is formatted as "%H:%M:%S" and parsed with
 #' `readr::parse_time()`, which gives it the "hms" data class.
 #'
 #' @param data data set
 #' @param ... arguments passed on to `guess_time_only_filter()`
 #'
 #' @return `data` with the selected columns replaced by their parsed
 #' time-only version. All other columns are left untouched.
 #' @importFrom readr parse_time
 #'
 #' @examples
 #' data <- redcapcast_data
 #' ## data |> time_only_correction()
 time_only_correction <- function(data, ...) {
   nms <- guess_time_only_filter(data, ...)
   ## Nothing was guessed to be time-only: hand the data back unchanged
   ## instead of assigning an empty list into zero columns.
   if (length(nms) == 0) {
     return(data)
   }
   ## A named list of parsed columns is all that is needed for the column-wise
   ## assignment below. Piping into `suppressMessages(dplyr::bind_cols())`
   ## would pass the list as `expr` and the `bind_cols()` call as `classes`,
   ## so no binding step is used here.
   parsed <- lapply(nms, \(y) {
     readr::parse_time(format(data[[y]], format = "%H:%M:%S"))
   })
   data[nms] <- stats::setNames(parsed, nm = nms)
   data
 }
 #' Change "hms" to "character" for REDCap upload.
 #'
 #' @description
 #' Every column carrying the "hms" class is converted with `as.character()`;
 #' all other columns are passed through as they are. The columns are then
 #' combined again with `dplyr::bind_cols()`.
 #'
 #' @param data data set
 #'
 #' @return data.frame or tibble
 #'
 #' @examples
 #' data <- redcapcast_data
 #' ## data |> time_only_correction() |> hms2character()
 hms2character <- function(data) {
   ## Column-level conversion: only "hms" columns are touched
   as_upload_column <- \(column) {
     if (!("hms" %in% class(column))) {
       return(column)
     }
     as.character(column)
   }
   converted <- lapply(data, as_upload_column)
   dplyr::bind_cols(converted)
 }
 #' Extract data from stata file for data dictionary
 #'
 #' @details
 #' This function is a natural development of the ds2dd() function. It assumes
 #' that the first column is the ID-column. No checks.
 #' Please, do always inspect the data dictionary before upload.
 #'
 #' Ensure, that the data set is formatted with as much information as possible.
 #'
 #' `field.type` can be supplied
 #'
 #' @param data data frame
 #' @param date.format date format, character string. ymd/dmy/mdy. Default is
 #' dmy.
 #' @param add.auto.id flag to add id column
 #' @param form.name manually specify form name(s). Vector of length 1 or
 #' ncol(data). Default is NULL and "data" is used.
 #' @param field.type manually specify field type(s). Vector of length 1 or
 #' ncol(data). Default is NULL and "text" is used for everything but factors,
 #' which will get "radio".
 #' @param field.label manually specify field label(s). Vector of length 1 or
 #' ncol(data). Default is NULL and colnames(data) is used or attribute
 #' `field.label.attr` for haven_labelled data set (imported .dta file with
 #' `haven::read_dta()`).
 #' @param field.label.attr attribute name for named labels for haven_labelled
 #' data set (imported .dta file with `haven::read_dta()`). Default is "label"
 #' @param field.validation manually specify field validation(s). Vector of
 #' length 1 or ncol(data). Default is NULL and the validation is derived from
 #' the column class: "Date" gives `date_<date.format>`, "hms" gives
 #' `time_hh_mm_ss`, "POSIXct" gives `datetime_<date.format>` and "numeric"
 #' gives `number`. Other classes get no validation.
 #' @param metadata redcap metadata headings. Default is
 #' REDCapCAST:::metadata_names.
 #' @param validate.time Flag to validate guessed time columns
 #' @param time.var.sel.pos Positive selection regex string passed to
 #' `guess_time_only_filter()` as sel.pos.
 #' @param time.var.sel.neg Negative selection regex string passed to
 #' `guess_time_only_filter()` as sel.neg.
 #'
 #' @return list of length 2: `data` (the data set prepared for upload) and
 #' `meta` (the data dictionary). If `validate.time = TRUE`, the validation
 #' list from `guess_time_only_filter()` is returned instead.
 #' @export
 #'
 #' @examples
 #' data <- redcapcast_data
 #' data |> ds2dd_detailed(validate.time = TRUE)
 #' data |> ds2dd_detailed()
 #' iris |> ds2dd_detailed(add.auto.id = TRUE)
 #' mtcars |> ds2dd_detailed(add.auto.id = TRUE)
 ds2dd_detailed <- function(data,
                            add.auto.id = FALSE,
                            date.format = "dmy",
                            form.name = NULL,
                            field.type = NULL,
                            field.label = NULL,
                            field.label.attr = "label",
                            field.validation = NULL,
                            metadata = metadata_names,
                            validate.time = FALSE,
                            time.var.sel.pos = "[Tt]i[d(me)]",
                            time.var.sel.neg = "[Dd]at[eo]") {
   ## Handles the odd case of no id column present
   if (add.auto.id) {
     data <- dplyr::tibble(
       default_trial_id = seq_len(nrow(data)),
       data
     )
     message("A default id column has been added")
   }
   ## Validation output uses the same selection patterns as the actual
   ## conversion further down, so what is inspected is what will be applied.
   if (validate.time) {
     return(
       guess_time_only_filter(
         data,
         validate = TRUE,
         sel.pos = time.var.sel.pos,
         sel.neg = time.var.sel.neg
       )
     )
   }
   if (any(vapply(data, haven::is.labelled, logical(1)))) {
     message("Data seems to be imported with haven from a Stata (.dta) file and will be treated as such.")
     data.source <- "dta"
   } else {
     data.source <- ""
   }
   ## data classes
   ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two classes
   ### Labelled (dta) columns are converted to factors before classes are read
   class_source <- if (data.source == "dta") haven::as_factor(data) else data
   data_classes <- class_source |>
     time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |>
     vapply(\(x) class(x)[1], character(1))
   ## ---------------------------------------
   ## Building the data dictionary
   ## ---------------------------------------
   ## skeleton: one row per column in data, one column per metadata heading
   dd <- data.frame(matrix(ncol = length(metadata), nrow = ncol(data))) |>
     stats::setNames(metadata) |>
     dplyr::tibble()
   dd$field_name <- gsub(" ", "_", tolower(colnames(data)))
   ## form_name
   if (is.null(form.name)) {
     dd$form_name <- "data"
   } else {
     if (length(form.name) == 1 || length(form.name) == nrow(dd)) {
       dd$form_name <- form.name
     } else {
       stop("Length of supplied 'form.name' has to be one (1) or ncol(data).")
     }
   }
   ## field_label
   if (is.null(field.label)) {
     if (data.source == "dta") {
       ## Always yields exactly one character value per column: a labelled
       ## column lacking the attribute gives NA instead of NULL, which would
       ## otherwise be dropped and shorten the vector.
       label <- vapply(data, function(x) {
         lab <- if (haven::is.labelled(x)) attributes(x)[[field.label.attr]] else NULL
         if (is.null(lab)) NA_character_ else as.character(lab)[1]
       }, character(1))
     } else {
       label <- data |> colnames()
     }
     ## Falls back to the field name where no label is available
     dd <-
       dd |> dplyr::mutate(field_label = dplyr::if_else(is.na(label), field_name, label))
   } else {
     if (length(field.label) == 1 || length(field.label) == nrow(dd)) {
       dd$field_label <- field.label
     } else {
       stop("Length of supplied 'field.label' has to be one (1) or ncol(data).")
     }
   }
   ## field_type
   if (is.null(field.type)) {
     dd$field_type <- "text"
     dd <-
       dd |> dplyr::mutate(field_type = dplyr::if_else(data_classes == "factor", "radio", field_type))
   } else {
     if (length(field.type) == 1 || length(field.type) == nrow(dd)) {
       dd$field_type <- field.type
     } else {
       stop("Length of supplied 'field.type' has to be one (1) or ncol(data).")
     }
   }
   ## validation
   if (is.null(field.validation)) {
     dd <-
       dd |> dplyr::mutate(
         text_validation_type_or_show_slider_number = dplyr::case_when(
           data_classes == "Date" ~ paste0("date_", date.format),
           ## "hms" is only present after the time-only correction
           data_classes == "hms" ~ "time_hh_mm_ss",
           data_classes == "POSIXct" ~ paste0("datetime_", date.format),
           data_classes == "numeric" ~ "number"
         )
       )
   } else {
     if (length(field.validation) == 1 || length(field.validation) == nrow(dd)) {
       dd$text_validation_type_or_show_slider_number <- field.validation
     } else {
       stop("Length of supplied 'field.validation' has to be one (1) or ncol(data).")
     }
   }
   ## choices
   if (data.source == "dta") {
     ## Value labels stored by haven: "<value>, <label>" pairs
     choices_for <- function(x) {
       if (haven::is.labelled(x)) {
         att <- attributes(x)$labels
         paste(paste(att, names(att), sep = ", "), collapse = " | ")
       } else {
         NA
       }
     }
   } else {
     choices_for <- function(x) {
       if (is.factor(x)) {
         ## Re-factors to avoid confusion with missing levels
         ## Assumes all relevant levels are represented in the data
         lvls <- levels(factor(x))
         ## Codes follow level order, so each code is paired with its own
         ## level regardless of the order values appear in the data.
         paste(paste(seq_along(lvls), lvls, sep = ", "), collapse = " | ")
       } else {
         NA
       }
     }
   }
   factor_levels <- data |>
     lapply(choices_for) |>
     (\(x) do.call(c, x))()
   dd <-
     dd |> dplyr::mutate(
       select_choices_or_calculations = dplyr::if_else(
         is.na(factor_levels),
         select_choices_or_calculations,
         factor_levels
       )
     )
   ## Data prepared for upload: time-only columns as character and column
   ## names transformed the same way as `field_name` in the dictionary
   ## (lower case, spaces replaced by underscores).
   upload_data <- data |>
     time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |>
     hms2character() |>
     (\(x) stats::setNames(x, gsub(" ", "_", tolower(names(x)))))()
   list(
     data = upload_data,
     meta = dd
   )
 }
 ### Completion
 #' Completion marking based on completed upload
 #'
 #' @param upload output list from `REDCapR::redcap_write()`. The element
 #' `affected_ids` is used to select the uploaded records.
 #' @param ls output list from `ds2dd_detailed()`
 #'
 #' @return data.frame with the ID column (first column of `ls$data`, limited
 #' to the IDs found in `upload$affected_ids`) and one `<form>_complete`
 #' column per form name in `ls$meta`, all set to 2.
 mark_complete <- function(upload, ls) {
   data <- ls$data
   meta <- ls$meta
   forms <- unique(meta$form_name)
   ## IDs from the data set that were part of the upload
   ids <- data[[1]][data[[1]] %in% upload$affected_ids]
   ## The number of rows follows the matched IDs, so the ID column and the
   ## completion columns always have the same length.
   completed <- data.frame(matrix(2, ncol = length(forms), nrow = length(ids)))
   cbind(ids, completed) |>
     stats::setNames(c(names(data)[1], paste0(forms, "_complete")))
 }
--- a/data/redcapcast_data.rda
+++ b/data/redcapcast_data.rda
--- a/man/ds2dd.Rd
+++ b/man/ds2dd.Rd
@ -2,7 +2,7 @@
 % Please edit documentation in R/ds2dd.R
 \name{ds2dd}
 \alias{ds2dd}
-\title{Data set to data dictionary function}
+\title{(DEPRECATED) Data set to data dictionary function}
 \usage{
 ds2dd(
  ds,
@ -11,7 +11,7 @@ ds2dd(
  field.type = "text",
  field.label = NULL,
  include.column.names = FALSE,
-  metadata = names(redcapcast_meta)
+  metadata = metadata_names
 )
 }
 \arguments{
@ -34,14 +34,18 @@ names.}
 column names for original data set for upload.}
 \item{metadata}{Metadata column names. Default is the included
-REDCapCAST::redcapcast_data.}
+REDCapCAST::metadata_names.}
 }
 \value{
 data.frame or list of data.frame and vector
 }
 \description{
 Creates a very basic data dictionary skeleton. Please see `ds2dd_detailed()`
 for a more advanced function.
 }
 \details{
 Migrated from stRoke ds2dd(). Fits better with the functionality of
-'REDCapCAST'
+'REDCapCAST'.
 }
 \examples{
 redcapcast_data$record_id <- seq_len(nrow(redcapcast_data))
--- a/man/ds2dd_detailed.Rd
+++ b/man/ds2dd_detailed.Rd
@ -0,0 +1,82 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ds2dd_detailed.R
 \name{ds2dd_detailed}
 \alias{ds2dd_detailed}
 \title{Extract data from stata file for data dictionary}
 \usage{
 ds2dd_detailed(
  data,
  add.auto.id = FALSE,
  date.format = "dmy",
  form.name = NULL,
  field.type = NULL,
  field.label = NULL,
  field.label.attr = "label",
  field.validation = NULL,
  metadata = metadata_names,
  validate.time = FALSE,
  time.var.sel.pos = "[Tt]i[d(me)]",
  time.var.sel.neg = "[Dd]at[eo]"
 )
 }
 \arguments{
 \item{data}{data frame}
 \item{add.auto.id}{flag to add id column}
 \item{date.format}{date format, character string. ymd/dmy/mdy. Default is
 dmy.}
 \item{form.name}{manually specify form name(s). Vector of length 1 or
 ncol(data). Default is NULL and "data" is used.}
 \item{field.type}{manually specify field type(s). Vector of length 1 or
 ncol(data). Default is NULL and "text" is used for everything but factors,
 which will get "radio".}
 \item{field.label}{manually specify field label(s). Vector of length 1 or
 ncol(data). Default is NULL and colnames(data) is used or attribute
 `field.label.attr` for haven_labelled data set (imported .dta file with
 `haven::read_dta()`).}
 \item{field.label.attr}{attribute name for named labels for haven_labelled
 data set (imported .dta file with `haven::read_dta()`). Default is "label"}
 \item{field.validation}{manually specify field validation(s). Vector of
 length 1 or ncol(data). Default is NULL and `levels()` are used for factors
 or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with
 `haven::read_dta()`).}
 \item{metadata}{redcap metadata headings. Default is
 REDCapCAST:::metadata_names.}
 \item{validate.time}{Flag to validate guessed time columns}
 \item{time.var.sel.pos}{Positive selection regex string passed to
 `guess_time_only_filter()` as sel.pos.}
 \item{time.var.sel.neg}{Negative selection regex string passed to
 `guess_time_only_filter()` as sel.neg.}
 }
 \value{
 list of length 2
 }
 \description{
 Extract data from stata file for data dictionary
 }
 \details{
 This function is a natural development of the ds2dd() function. It assumes
 that the first column is the ID-column. No checks.
 Please, do always inspect the data dictionary before upload.
 Ensure, that the data set is formatted with as much information as possible.
 `field.type` can be supplied
 }
 \examples{
 data <- redcapcast_data
 data |> ds2dd_detailed(validate.time = TRUE)
 data |> ds2dd_detailed()
 iris |> ds2dd_detailed(add.auto.id = TRUE)
 mtcars |> ds2dd_detailed(add.auto.id = TRUE)
 }
--- a/man/guess_time_only_filter.Rd
+++ b/man/guess_time_only_filter.Rd
@ -0,0 +1,36 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ds2dd_detailed.R
 \name{guess_time_only_filter}
 \alias{guess_time_only_filter}
 \title{Try at determining which are true time only variables}
 \usage{
 guess_time_only_filter(
  data,
  validate = FALSE,
  sel.pos = "[Tt]i[d(me)]",
  sel.neg = "[Dd]at[eo]"
 )
 }
 \arguments{
 \item{data}{data set}
 \item{validate}{flag to output validation data. Will output list.}
 \item{sel.pos}{Positive selection regex string}
 \item{sel.neg}{Negative selection regex string}
 }
 \value{
 character vector or list depending on `validate` flag.
 }
 \description{
 This is just a try at guessing data type based on data class and column names
 hoping for a tiny bit of naming consistency. R does not include a time-only
 data format natively, so the "hms" class from `readr` is used. This
 has to be converted to character class before REDCap upload.
 }
 \examples{
 data <- redcapcast_data
 data |> guess_time_only_filter()
 data |> guess_time_only_filter(validate = TRUE) |> lapply(head)
 }
--- a/man/hms2character.Rd
+++ b/man/hms2character.Rd
@ -0,0 +1,21 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ds2dd_detailed.R
 \name{hms2character}
 \alias{hms2character}
 \title{Change "hms" to "character" for REDCap upload.}
 \usage{
 hms2character(data)
 }
 \arguments{
 \item{data}{data set}
 }
 \value{
 data.frame or tibble
 }
 \description{
 Change "hms" to "character" for REDCap upload.
 }
 \examples{
 data <- redcapcast_data
 ## data |> time_only_correction() |> hms2character()
 }
--- a/man/mark_complete.Rd
+++ b/man/mark_complete.Rd
@ -0,0 +1,19 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ds2dd_detailed.R
 \name{mark_complete}
 \alias{mark_complete}
 \title{Completion marking based on completed upload}
 \usage{
 mark_complete(upload, ls)
 }
 \arguments{
 \item{upload}{output list from `REDCapR::redcap_write()`}
 \item{ls}{output list from `ds2dd_detailed()`}
 }
 \value{
 data.frame with the ID column for the records in `upload$affected_ids` and
 one `<form>_complete` column per form, all set to 2
 }
 \description{
 Completion marking based on completed upload
 }
--- a/man/time_only_correction.Rd
+++ b/man/time_only_correction.Rd
@ -0,0 +1,25 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ds2dd_detailed.R
 \name{time_only_correction}
 \alias{time_only_correction}
 \title{Correction based on time_only_filter function. Introduces new class for easier
 validation labelling.}
 \usage{
 time_only_correction(data, ...)
 }
 \arguments{
 \item{data}{data set}
 \item{...}{arguments passed on to `guess_time_only_filter()`}
 }
 \value{
 tibble
 }
 \description{
 Depends on `guess_time_only_filter()` to select the time-only columns and
 converts these to the data class "hms".
 }
 \examples{
 data <- redcapcast_data
 ## data |> time_only_correction()
 }