new functionds2dd_detailed()which includes more details than the old ds2dd().

2025-04-03 06:32:33 +02:00 · 2024-01-18 14:57:12 +01:00 · 2024-01-18 14:57:12 +01:00 · 21e635d775
commit 21e635d775
parent f488dde528
8 changed files with 534 additions and 4 deletions
--- a/R/ds2dd_detailed.R
+++ b/R/ds2dd_detailed.R
@ -0,0 +1,343 @@
+utils::globalVariables(c(  "stats::setNames",  "field_name",  "field_type",  "select_choices_or_calculations"))
+#' Try at determining which are true time only variables
+#'
+#' @description
+#' This is just a try at guessing data type based on data class and column names
+#' hoping for a tiny bit of naming consistency. R does not include a time-only
+#' data format natively, so the "hms" class from `readr` is used. This
+#' has to be converted to character class before REDCap upload.
+#'
+#' @param data data set
+#' @param validate flag to output validation data. Will output list.
+#' @param sel.pos Positive selection regex string
+#' @param sel.neg Negative selection regex string
+#'
+#' @return character vector or list depending on `validate` flag.
+#' @export
+#'
+#' @examples
+#' data <- redcapcast_data
+#' data |> guess_time_only_filter()
+#' data |> guess_time_only_filter(validate = TRUE) |> lapply(head)
+guess_time_only_filter <- function(data, validate = FALSE, sel.pos = "[Tt]i[d(me)]", sel.neg = "[Dd]at[eo]") {
+  datetime_nms <- data |>
+    lapply(\(x)any(c("POSIXct","hms") %in% class(x))) |>
+    (\(x) names(data)[do.call(c, x)])()
+
+  time_only_log <- datetime_nms |> (\(x) {
+    ## Detects which are determined true Time only variables
+    ## Inspection is necessary
+    grepl(pattern = sel.pos, x = x) &
+      !grepl(pattern = sel.neg, x = x)
+  })()
+
+  if (validate) {
+    list(
+      "is.POSIX" = data[datetime_nms],
+      "is.datetime" = data[datetime_nms[!time_only_log]],
+      "is.time_only" = data[datetime_nms[time_only_log]]
+    )
+  } else {
+    datetime_nms[time_only_log]
+  }
+}
+
+#' Correction based on time_only_filter function. Introduces new class for easier
+#' validation labelling.
+#'
+#' @description
+#' Dependens on the data class "hms" introduced with
+#' `guess_time_only_filter()` and converts these
+#'
+#' @param data data set
+#' @param ... arguments passed on to `guess_time_only_filter()`
+#'
+#' @return tibble
+#' @importFrom readr parse_time
+#'
+#' @examples
+#' data <- redcapcast_data
+#' ## data |> time_only_correction()
+time_only_correction <- function(data, ...) {
+  nms <- guess_time_only_filter(data, ...)
+  z <- nms |>
+    lapply(\(y) {
+      readr::parse_time(format(data[[y]], format = "%H:%M:%S"))
+    }) |>
+    suppressMessages(dplyr::bind_cols()) |>
+    stats::setNames(nm = nms)
+  data[nms] <- z
+  data
+}
+
+#' Change "hms" to "character" for REDCap upload.
+#'
+#' @param data data set
+#'
+#' @return data.frame or tibble
+#'
+#' @examples
+#' data <- redcapcast_data
+#' ## data |> time_only_correction() |> hms2character()
+hms2character <- function(data) {
+  data |>
+    lapply(function(x) {
+      if ("hms" %in% class(x)) {
+        as.character(x)
+      } else {
+        x
+      }
+    }) |>
+    dplyr::bind_cols()
+}
+
+#' Extract data from stata file for data dictionary
+#'
+#' @details
+#' This function is a natural development of the ds2dd() function. It assumes
+#' that the first column is the ID-column. No checks.
+#' Please, do always inspect the data dictionary before upload.
+#'
+#' Ensure, that the data set is formatted with as much information as possible.
+#'
+#' `field.type` can be supplied
+#'
+#' @param data data frame
+#' @param date.format date format, character string. ymd/dmy/mdy. dafault is
+#' dmy.
+#' @param add.auto.id flag to add id column
+#' @param form.name manually specify form name(s). Vector of length 1 or
+#' ncol(data). Default is NULL and "data" is used.
+#' @param field.type manually specify field type(s). Vector of length 1 or
+#' ncol(data). Default is NULL and "text" is used for everything but factors,
+#' which wil get "radio".
+#' @param field.label manually specify field label(s). Vector of length 1 or
+#' ncol(data). Default is NULL and colnames(data) is used or attribute
+#' `field.label.attr` for haven_labelled data set (imported .dta file with
+#' `haven::read_dta()`).
+#' @param field.label.attr attribute name for named labels for haven_labelled
+#' data set (imported .dta file with `haven::read_dta()`. Default is "label"
+#' @param field.validation manually specify field validation(s). Vector of
+#' length 1 or ncol(data). Default is NULL and `levels()` are used for factors
+#' or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with
+#' `haven::read_dta()`).
+#' @param metadata redcap metadata headings. Default is
+#' REDCapCAST:::metadata_names.
+#' @param validate.time Flag to validate guessed time columns
+#' @param time.var.sel.pos Positive selection regex string passed to
+#' `gues_time_only_filter()` as sel.pos.
+#' @param time.var.sel.neg Negative selection regex string passed to
+#' `gues_time_only_filter()` as sel.neg.
+#'
+#' @return list of length 2
+#' @export
+#'
+#' @examples
+#' data <- redcapcast_data
+#' data |> ds2dd_detailed(validate.time = TRUE)
+#' data |> ds2dd_detailed()
+#' iris |> ds2dd_detailed(add.auto.id = TRUE)
+#' mtcars |> ds2dd_detailed(add.auto.id = TRUE)
+ds2dd_detailed <- function(data,
+                           add.auto.id = FALSE,
+                           date.format = "dmy",
+                           form.name = NULL,
+                           field.type = NULL,
+                           field.label = NULL,
+                           field.label.attr ="label",
+                           field.validation = NULL,
+                           metadata = metadata_names,
+                           validate.time = FALSE,
+                           time.var.sel.pos = "[Tt]i[d(me)]",
+                           time.var.sel.neg = "[Dd]at[eo]") {
+  ## Handles the odd case of no id column present
+  if (add.auto.id) {
+    data <- dplyr::tibble(
+      default_trial_id = seq_len(nrow(data)),
+      data
+    )
+    message("A default id column has been added")
+  }
+
+  if (validate.time) {
+    return(data |> guess_time_only_filter(validate = TRUE))
+  }
+
+  if (lapply(data, haven::is.labelled) |> (\(x)do.call(c, x))() |> any()) {
+    message("Data seems to be imported with haven from a Stata (.dta) file and will be treated as such.")
+    data.source <- "dta"
+  } else {
+    data.source <- ""
+  }
+
+  ## data classes
+
+  ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two classes
+  if (data.source == "dta") {
+    data_classes <-
+      data |>
+      haven::as_factor() |>
+      time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |>
+      lapply(\(x)class(x)[1]) |>
+      (\(x)do.call(c, x))()
+  } else {
+    data_classes <-
+      data |>
+      time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |>
+      lapply(\(x)class(x)[1]) |>
+      (\(x)do.call(c, x))()
+  }
+
+  ## ---------------------------------------
+  ## Building the data dictionary
+  ## ---------------------------------------
+
+  ## skeleton
+
+  dd <- data.frame(matrix(ncol = length(metadata), nrow = ncol(data))) |>
+    stats::setNames(metadata) |>
+    dplyr::tibble()
+
+  dd$field_name <- gsub(" ", "_", tolower(colnames(data)))
+
+  ## form_name
+  if (is.null(form.name)) {
+    dd$form_name <- "data"
+  } else {
+    if (length(form.name) == 1 | length(form.name) == nrow(dd)) {
+      dd$form_name <- form.name
+    } else {
+      stop("Length of supplied 'form.name' has to be one (1) or ncol(data).")
+    }
+  }
+
+  ## field_label
+
+  if (is.null(field.label)) {
+    if (data.source == "dta") {
+      label <- data |>
+        lapply(function(x) {
+          if (haven::is.labelled(x)) {
+            attributes(x)[[field.label.attr]]
+          } else {
+            NA
+          }
+        }) |>
+        (\(x)do.call(c, x))()
+    } else {
+      label <- data |> colnames()
+    }
+
+    dd <-
+      dd |> dplyr::mutate(field_label = dplyr::if_else(is.na(label), field_name, label))
+  } else {
+    if (length(field.label) == 1 | length(field.label) == nrow(dd)) {
+      dd$field_label <- field.label
+    } else {
+      stop("Length of supplied 'field.label' has to be one (1) or ncol(data).")
+    }
+  }
+
+
+  ## field_type
+
+  if (is.null(field.type)) {
+    dd$field_type <- "text"
+
+    dd <-
+      dd |> dplyr::mutate(field_type = dplyr::if_else(data_classes == "factor", "radio", field_type))
+  } else {
+    if (length(field.type) == 1 | length(field.type) == nrow(dd)) {
+      dd$field_type <- field.type
+    } else {
+      stop("Length of supplied 'field.type' has to be one (1) or ncol(data).")
+    }
+  }
+
+  ## validation
+
+  if (is.null(field.validation)) {
+    dd <-
+      dd |> dplyr::mutate(
+        text_validation_type_or_show_slider_number = dplyr::case_when(
+          data_classes == "Date" ~ paste0("date_", date.format),
+          data_classes ==
+            "hms" ~ "time_hh_mm_ss",
+          ## Self invented format after filtering
+          data_classes ==
+            "POSIXct" ~ paste0("datetime_", date.format),
+          data_classes ==
+            "numeric" ~ "number"
+        )
+      )
+  } else {
+    if (length(field.validation) == 1 | length(field.validation) == nrow(dd)) {
+      dd$text_validation_type_or_show_slider_number <- field.validation
+    } else {
+      stop("Length of supplied 'field.validation' has to be one (1) or ncol(data).")
+    }
+  }
+
+
+
+  ## choices
+
+  if (data.source == "dta") {
+    factor_levels <- data |>
+      lapply(function(x) {
+        if (haven::is.labelled(x)) {
+          att <- attributes(x)$labels
+          paste(paste(att, names(att), sep = ", "), collapse = " | ")
+        } else {
+          NA
+        }
+      }) |>
+      (\(x)do.call(c, x))()
+  } else {
+    factor_levels <- data |>
+      lapply(function(x) {
+        if (is.factor(x)) {
+          ## Re-factors to avoid confusion with missing levels
+          ## Assumes alle relevant levels are represented in the data
+          re_fac <- factor(x)
+          paste(paste(unique(as.numeric(re_fac)), levels(re_fac), sep = ", "), collapse = " | ")
+        } else {
+          NA
+        }
+      }) |>
+      (\(x)do.call(c, x))()
+  }
+
+  dd <-
+    dd |> dplyr::mutate(
+      select_choices_or_calculations = dplyr::if_else(
+        is.na(factor_levels),
+        select_choices_or_calculations,
+        factor_levels
+      )
+    )
+
+  list(
+    data = data |>
+      time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |>
+      hms2character() |>
+      (\(x)stats::setNames(x, tolower(names(x))))(),
+    meta = dd
+  )
+}
+
+### Completion
+#' Completion marking based on completed upload
+#'
+#' @param upload output list from `REDCapR::redcap_write()`
+#' @param ls output list from `ds2dd_detailed()`
+#'
+#' @return list with `REDCapR::redcap_write()` results
+mark_complete <- function(upload, ls){
+  data <- ls$data
+  meta <- ls$meta
+  forms <- unique(meta$form_name)
+  cbind(data[[1]][data[[1]] %in% upload$affected_ids],
+        data.frame(matrix(2,ncol=length(forms),nrow=upload$records_affected_count))) |>
+    stats::setNames(c(names(data)[1],paste0(forms,"_complete")))
+}
--- a/data/redcapcast_data.rda
+++ b/data/redcapcast_data.rda
--- a/man/ds2dd.Rd
+++ b/man/ds2dd.Rd
@ -2,7 +2,7 @@
 % Please edit documentation in R/ds2dd.R
 \name{ds2dd}
 \alias{ds2dd}
-\title{Data set to data dictionary function}
+\title{(DEPRECATED) Data set to data dictionary function}
 \usage{
 ds2dd(
  ds,
@ -11,7 +11,7 @@ ds2dd(
  field.type = "text",
  field.label = NULL,
  include.column.names = FALSE,
-  metadata = names(redcapcast_meta)
+  metadata = metadata_names
 )
 }
 \arguments{
@ -34,14 +34,18 @@ names.}
 column names for original data set for upload.}

 \item{metadata}{Metadata column names. Default is the included
-REDCapCAST::redcapcast_data.}
+REDCapCAST::metadata_names.}
 }
 \value{
 data.frame or list of data.frame and vector
 }
 \description{
+Creates a very basic data dictionary skeleton. Please see `ds2dd_detailed()`
+for a more advanced function.
+}
+\details{
 Migrated from stRoke ds2dd(). Fits better with the functionality of
-'REDCapCAST'
+'REDCapCAST'.
 }
 \examples{
 redcapcast_data$record_id <- seq_len(nrow(redcapcast_data))
--- a/man/ds2dd_detailed.Rd
+++ b/man/ds2dd_detailed.Rd
@ -0,0 +1,82 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{ds2dd_detailed}
+\alias{ds2dd_detailed}
+\title{Extract data from stata file for data dictionary}
+\usage{
+ds2dd_detailed(
+  data,
+  add.auto.id = FALSE,
+  date.format = "dmy",
+  form.name = NULL,
+  field.type = NULL,
+  field.label = NULL,
+  field.label.attr = "label",
+  field.validation = NULL,
+  metadata = metadata_names,
+  validate.time = FALSE,
+  time.var.sel.pos = "[Tt]i[d(me)]",
+  time.var.sel.neg = "[Dd]at[eo]"
+)
+}
+\arguments{
+\item{data}{data frame}
+
+\item{add.auto.id}{flag to add id column}
+
+\item{date.format}{date format, character string. ymd/dmy/mdy. dafault is
+dmy.}
+
+\item{form.name}{manually specify form name(s). Vector of length 1 or
+ncol(data). Default is NULL and "data" is used.}
+
+\item{field.type}{manually specify field type(s). Vector of length 1 or
+ncol(data). Default is NULL and "text" is used for everything but factors,
+which wil get "radio".}
+
+\item{field.label}{manually specify field label(s). Vector of length 1 or
+ncol(data). Default is NULL and colnames(data) is used or attribute
+`field.label.attr` for haven_labelled data set (imported .dta file with
+`haven::read_dta()`).}
+
+\item{field.label.attr}{attribute name for named labels for haven_labelled
+data set (imported .dta file with `haven::read_dta()`. Default is "label"}
+
+\item{field.validation}{manually specify field validation(s). Vector of
+length 1 or ncol(data). Default is NULL and `levels()` are used for factors
+or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with
+`haven::read_dta()`).}
+
+\item{metadata}{redcap metadata headings. Default is
+REDCapCAST:::metadata_names.}
+
+\item{validate.time}{Flag to validate guessed time columns}
+
+\item{time.var.sel.pos}{Positive selection regex string passed to
+`gues_time_only_filter()` as sel.pos.}
+
+\item{time.var.sel.neg}{Negative selection regex string passed to
+`gues_time_only_filter()` as sel.neg.}
+}
+\value{
+list of length 2
+}
+\description{
+Extract data from stata file for data dictionary
+}
+\details{
+This function is a natural development of the ds2dd() function. It assumes
+that the first column is the ID-column. No checks.
+Please, do always inspect the data dictionary before upload.
+
+Ensure, that the data set is formatted with as much information as possible.
+
+`field.type` can be supplied
+}
+\examples{
+data <- redcapcast_data
+data |> ds2dd_detailed(validate.time = TRUE)
+data |> ds2dd_detailed()
+iris |> ds2dd_detailed(add.auto.id = TRUE)
+mtcars |> ds2dd_detailed(add.auto.id = TRUE)
+}
--- a/man/guess_time_only_filter.Rd
+++ b/man/guess_time_only_filter.Rd
@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{guess_time_only_filter}
+\alias{guess_time_only_filter}
+\title{Try at determining which are true time only variables}
+\usage{
+guess_time_only_filter(
+  data,
+  validate = FALSE,
+  sel.pos = "[Tt]i[d(me)]",
+  sel.neg = "[Dd]at[eo]"
+)
+}
+\arguments{
+\item{data}{data set}
+
+\item{validate}{flag to output validation data. Will output list.}
+
+\item{sel.pos}{Positive selection regex string}
+
+\item{sel.neg}{Negative selection regex string}
+}
+\value{
+character vector or list depending on `validate` flag.
+}
+\description{
+This is just a try at guessing data type based on data class and column names
+hoping for a tiny bit of naming consistency. R does not include a time-only
+data format natively, so the "hms" class from `readr` is used. This
+has to be converted to character class before REDCap upload.
+}
+\examples{
+data <- redcapcast_data
+data |> guess_time_only_filter()
+data |> guess_time_only_filter(validate = TRUE) |> lapply(head)
+}
--- a/man/hms2character.Rd
+++ b/man/hms2character.Rd
@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{hms2character}
+\alias{hms2character}
+\title{Change "hms" to "character" for REDCap upload.}
+\usage{
+hms2character(data)
+}
+\arguments{
+\item{data}{data set}
+}
+\value{
+data.frame or tibble
+}
+\description{
+Change "hms" to "character" for REDCap upload.
+}
+\examples{
+data <- redcapcast_data
+## data |> time_only_correction() |> hms2character()
+}
--- a/man/mark_complete.Rd
+++ b/man/mark_complete.Rd
@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{mark_complete}
+\alias{mark_complete}
+\title{Completion marking based on completed upload}
+\usage{
+mark_complete(upload, ls)
+}
+\arguments{
+\item{upload}{output list from `REDCapR::redcap_write()`}
+
+\item{ls}{output list from `ds2dd_detailed()`}
+}
+\value{
+list with `REDCapR::redcap_write()` results
+}
+\description{
+Completion marking based on completed upload
+}
--- a/man/time_only_correction.Rd
+++ b/man/time_only_correction.Rd
@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{time_only_correction}
+\alias{time_only_correction}
+\title{Correction based on time_only_filter function. Introduces new class for easier
+validation labelling.}
+\usage{
+time_only_correction(data, ...)
+}
+\arguments{
+\item{data}{data set}
+
+\item{...}{arguments passed on to `guess_time_only_filter()`}
+}
+\value{
+tibble
+}
+\description{
+Dependens on the data class "hms" introduced with
+`guess_time_only_filter()` and converts these
+}
+\examples{
+data <- redcapcast_data
+## data |> time_only_correction()
+}