diff --git a/R/ds2dd_detailed.R b/R/ds2dd_detailed.R new file mode 100644 index 0000000..909db62 --- /dev/null +++ b/R/ds2dd_detailed.R @@ -0,0 +1,343 @@ +utils::globalVariables(c( "stats::setNames", "field_name", "field_type", "select_choices_or_calculations")) +#' Try at determining which are true time only variables +#' +#' @description +#' This is just a try at guessing data type based on data class and column names +#' hoping for a tiny bit of naming consistency. R does not include a time-only +#' data format natively, so the "hms" class from `readr` is used. This +#' has to be converted to character class before REDCap upload. +#' +#' @param data data set +#' @param validate flag to output validation data. Will output list. +#' @param sel.pos Positive selection regex string +#' @param sel.neg Negative selection regex string +#' +#' @return character vector or list depending on `validate` flag. +#' @export +#' +#' @examples +#' data <- redcapcast_data +#' data |> guess_time_only_filter() +#' data |> guess_time_only_filter(validate = TRUE) |> lapply(head) +guess_time_only_filter <- function(data, validate = FALSE, sel.pos = "[Tt]i[d(me)]", sel.neg = "[Dd]at[eo]") { + datetime_nms <- data |> + lapply(\(x)any(c("POSIXct","hms") %in% class(x))) |> + (\(x) names(data)[do.call(c, x)])() + + time_only_log <- datetime_nms |> (\(x) { + ## Detects which are determined true Time only variables + ## Inspection is necessary + grepl(pattern = sel.pos, x = x) & + !grepl(pattern = sel.neg, x = x) + })() + + if (validate) { + list( + "is.POSIX" = data[datetime_nms], + "is.datetime" = data[datetime_nms[!time_only_log]], + "is.time_only" = data[datetime_nms[time_only_log]] + ) + } else { + datetime_nms[time_only_log] + } +} + +#' Correction based on time_only_filter function. Introduces new class for easier +#' validation labelling. +#' +#' @description +#' Dependens on the data class "hms" introduced with +#' `guess_time_only_filter()` and converts these +#' +#' @param data data set +#' @param ... arguments passed on to `guess_time_only_filter()` +#' +#' @return tibble +#' @importFrom readr parse_time +#' +#' @examples +#' data <- redcapcast_data +#' ## data |> time_only_correction() +time_only_correction <- function(data, ...) { + nms <- guess_time_only_filter(data, ...) + z <- nms |> + lapply(\(y) { + readr::parse_time(format(data[[y]], format = "%H:%M:%S")) + }) |> + suppressMessages(dplyr::bind_cols()) |> + stats::setNames(nm = nms) + data[nms] <- z + data +} + +#' Change "hms" to "character" for REDCap upload. +#' +#' @param data data set +#' +#' @return data.frame or tibble +#' +#' @examples +#' data <- redcapcast_data +#' ## data |> time_only_correction() |> hms2character() +hms2character <- function(data) { + data |> + lapply(function(x) { + if ("hms" %in% class(x)) { + as.character(x) + } else { + x + } + }) |> + dplyr::bind_cols() +} + +#' Extract data from stata file for data dictionary +#' +#' @details +#' This function is a natural development of the ds2dd() function. It assumes +#' that the first column is the ID-column. No checks. +#' Please, do always inspect the data dictionary before upload. +#' +#' Ensure, that the data set is formatted with as much information as possible. +#' +#' `field.type` can be supplied +#' +#' @param data data frame +#' @param date.format date format, character string. ymd/dmy/mdy. dafault is +#' dmy. +#' @param add.auto.id flag to add id column +#' @param form.name manually specify form name(s). Vector of length 1 or +#' ncol(data). Default is NULL and "data" is used. +#' @param field.type manually specify field type(s). Vector of length 1 or +#' ncol(data). Default is NULL and "text" is used for everything but factors, +#' which wil get "radio". +#' @param field.label manually specify field label(s). Vector of length 1 or +#' ncol(data). Default is NULL and colnames(data) is used or attribute +#' `field.label.attr` for haven_labelled data set (imported .dta file with +#' `haven::read_dta()`). +#' @param field.label.attr attribute name for named labels for haven_labelled +#' data set (imported .dta file with `haven::read_dta()`. Default is "label" +#' @param field.validation manually specify field validation(s). Vector of +#' length 1 or ncol(data). Default is NULL and `levels()` are used for factors +#' or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with +#' `haven::read_dta()`). +#' @param metadata redcap metadata headings. Default is +#' REDCapCAST:::metadata_names. +#' @param validate.time Flag to validate guessed time columns +#' @param time.var.sel.pos Positive selection regex string passed to +#' `gues_time_only_filter()` as sel.pos. +#' @param time.var.sel.neg Negative selection regex string passed to +#' `gues_time_only_filter()` as sel.neg. +#' +#' @return list of length 2 +#' @export +#' +#' @examples +#' data <- redcapcast_data +#' data |> ds2dd_detailed(validate.time = TRUE) +#' data |> ds2dd_detailed() +#' iris |> ds2dd_detailed(add.auto.id = TRUE) +#' mtcars |> ds2dd_detailed(add.auto.id = TRUE) +ds2dd_detailed <- function(data, + add.auto.id = FALSE, + date.format = "dmy", + form.name = NULL, + field.type = NULL, + field.label = NULL, + field.label.attr ="label", + field.validation = NULL, + metadata = metadata_names, + validate.time = FALSE, + time.var.sel.pos = "[Tt]i[d(me)]", + time.var.sel.neg = "[Dd]at[eo]") { + ## Handles the odd case of no id column present + if (add.auto.id) { + data <- dplyr::tibble( + default_trial_id = seq_len(nrow(data)), + data + ) + message("A default id column has been added") + } + + if (validate.time) { + return(data |> guess_time_only_filter(validate = TRUE)) + } + + if (lapply(data, haven::is.labelled) |> (\(x)do.call(c, x))() |> any()) { + message("Data seems to be imported with haven from a Stata (.dta) file and will be treated as such.") + data.source <- "dta" + } else { + data.source <- "" + } + + ## data classes + + ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two classes + if (data.source == "dta") { + data_classes <- + data |> + haven::as_factor() |> + time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |> + lapply(\(x)class(x)[1]) |> + (\(x)do.call(c, x))() + } else { + data_classes <- + data |> + time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |> + lapply(\(x)class(x)[1]) |> + (\(x)do.call(c, x))() + } + + ## --------------------------------------- + ## Building the data dictionary + ## --------------------------------------- + + ## skeleton + + dd <- data.frame(matrix(ncol = length(metadata), nrow = ncol(data))) |> + stats::setNames(metadata) |> + dplyr::tibble() + + dd$field_name <- gsub(" ", "_", tolower(colnames(data))) + + ## form_name + if (is.null(form.name)) { + dd$form_name <- "data" + } else { + if (length(form.name) == 1 | length(form.name) == nrow(dd)) { + dd$form_name <- form.name + } else { + stop("Length of supplied 'form.name' has to be one (1) or ncol(data).") + } + } + + ## field_label + + if (is.null(field.label)) { + if (data.source == "dta") { + label <- data |> + lapply(function(x) { + if (haven::is.labelled(x)) { + attributes(x)[[field.label.attr]] + } else { + NA + } + }) |> + (\(x)do.call(c, x))() + } else { + label <- data |> colnames() + } + + dd <- + dd |> dplyr::mutate(field_label = dplyr::if_else(is.na(label), field_name, label)) + } else { + if (length(field.label) == 1 | length(field.label) == nrow(dd)) { + dd$field_label <- field.label + } else { + stop("Length of supplied 'field.label' has to be one (1) or ncol(data).") + } + } + + + ## field_type + + if (is.null(field.type)) { + dd$field_type <- "text" + + dd <- + dd |> dplyr::mutate(field_type = dplyr::if_else(data_classes == "factor", "radio", field_type)) + } else { + if (length(field.type) == 1 | length(field.type) == nrow(dd)) { + dd$field_type <- field.type + } else { + stop("Length of supplied 'field.type' has to be one (1) or ncol(data).") + } + } + + ## validation + + if (is.null(field.validation)) { + dd <- + dd |> dplyr::mutate( + text_validation_type_or_show_slider_number = dplyr::case_when( + data_classes == "Date" ~ paste0("date_", date.format), + data_classes == + "hms" ~ "time_hh_mm_ss", + ## Self invented format after filtering + data_classes == + "POSIXct" ~ paste0("datetime_", date.format), + data_classes == + "numeric" ~ "number" + ) + ) + } else { + if (length(field.validation) == 1 | length(field.validation) == nrow(dd)) { + dd$text_validation_type_or_show_slider_number <- field.validation + } else { + stop("Length of supplied 'field.validation' has to be one (1) or ncol(data).") + } + } + + + + ## choices + + if (data.source == "dta") { + factor_levels <- data |> + lapply(function(x) { + if (haven::is.labelled(x)) { + att <- attributes(x)$labels + paste(paste(att, names(att), sep = ", "), collapse = " | ") + } else { + NA + } + }) |> + (\(x)do.call(c, x))() + } else { + factor_levels <- data |> + lapply(function(x) { + if (is.factor(x)) { + ## Re-factors to avoid confusion with missing levels + ## Assumes alle relevant levels are represented in the data + re_fac <- factor(x) + paste(paste(unique(as.numeric(re_fac)), levels(re_fac), sep = ", "), collapse = " | ") + } else { + NA + } + }) |> + (\(x)do.call(c, x))() + } + + dd <- + dd |> dplyr::mutate( + select_choices_or_calculations = dplyr::if_else( + is.na(factor_levels), + select_choices_or_calculations, + factor_levels + ) + ) + + list( + data = data |> + time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |> + hms2character() |> + (\(x)stats::setNames(x, tolower(names(x))))(), + meta = dd + ) +} + +### Completion +#' Completion marking based on completed upload +#' +#' @param upload output list from `REDCapR::redcap_write()` +#' @param ls output list from `ds2dd_detailed()` +#' +#' @return list with `REDCapR::redcap_write()` results +mark_complete <- function(upload, ls){ + data <- ls$data + meta <- ls$meta + forms <- unique(meta$form_name) + cbind(data[[1]][data[[1]] %in% upload$affected_ids], + data.frame(matrix(2,ncol=length(forms),nrow=upload$records_affected_count))) |> + stats::setNames(c(names(data)[1],paste0(forms,"_complete"))) +} diff --git a/data/redcapcast_data.rda b/data/redcapcast_data.rda index 5095059..907c2c9 100644 Binary files a/data/redcapcast_data.rda and b/data/redcapcast_data.rda differ diff --git a/man/ds2dd.Rd b/man/ds2dd.Rd index c824fb5..ba607b8 100644 --- a/man/ds2dd.Rd +++ b/man/ds2dd.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ds2dd.R \name{ds2dd} \alias{ds2dd} -\title{Data set to data dictionary function} +\title{(DEPRECATED) Data set to data dictionary function} \usage{ ds2dd( ds, @@ -11,7 +11,7 @@ ds2dd( field.type = "text", field.label = NULL, include.column.names = FALSE, - metadata = names(redcapcast_meta) + metadata = metadata_names ) } \arguments{ @@ -34,14 +34,18 @@ names.} column names for original data set for upload.} \item{metadata}{Metadata column names. Default is the included -REDCapCAST::redcapcast_data.} +REDCapCAST::metadata_names.} } \value{ data.frame or list of data.frame and vector } \description{ +Creates a very basic data dictionary skeleton. Please see `ds2dd_detailed()` +for a more advanced function. +} +\details{ Migrated from stRoke ds2dd(). Fits better with the functionality of -'REDCapCAST' +'REDCapCAST'. } \examples{ redcapcast_data$record_id <- seq_len(nrow(redcapcast_data)) diff --git a/man/ds2dd_detailed.Rd b/man/ds2dd_detailed.Rd new file mode 100644 index 0000000..89cf42b --- /dev/null +++ b/man/ds2dd_detailed.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{ds2dd_detailed} +\alias{ds2dd_detailed} +\title{Extract data from stata file for data dictionary} +\usage{ +ds2dd_detailed( + data, + add.auto.id = FALSE, + date.format = "dmy", + form.name = NULL, + field.type = NULL, + field.label = NULL, + field.label.attr = "label", + field.validation = NULL, + metadata = metadata_names, + validate.time = FALSE, + time.var.sel.pos = "[Tt]i[d(me)]", + time.var.sel.neg = "[Dd]at[eo]" +) +} +\arguments{ +\item{data}{data frame} + +\item{add.auto.id}{flag to add id column} + +\item{date.format}{date format, character string. ymd/dmy/mdy. dafault is +dmy.} + +\item{form.name}{manually specify form name(s). Vector of length 1 or +ncol(data). Default is NULL and "data" is used.} + +\item{field.type}{manually specify field type(s). Vector of length 1 or +ncol(data). Default is NULL and "text" is used for everything but factors, +which wil get "radio".} + +\item{field.label}{manually specify field label(s). Vector of length 1 or +ncol(data). Default is NULL and colnames(data) is used or attribute +`field.label.attr` for haven_labelled data set (imported .dta file with +`haven::read_dta()`).} + +\item{field.label.attr}{attribute name for named labels for haven_labelled +data set (imported .dta file with `haven::read_dta()`. Default is "label"} + +\item{field.validation}{manually specify field validation(s). Vector of +length 1 or ncol(data). Default is NULL and `levels()` are used for factors +or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with +`haven::read_dta()`).} + +\item{metadata}{redcap metadata headings. Default is +REDCapCAST:::metadata_names.} + +\item{validate.time}{Flag to validate guessed time columns} + +\item{time.var.sel.pos}{Positive selection regex string passed to +`gues_time_only_filter()` as sel.pos.} + +\item{time.var.sel.neg}{Negative selection regex string passed to +`gues_time_only_filter()` as sel.neg.} +} +\value{ +list of length 2 +} +\description{ +Extract data from stata file for data dictionary +} +\details{ +This function is a natural development of the ds2dd() function. It assumes +that the first column is the ID-column. No checks. +Please, do always inspect the data dictionary before upload. + +Ensure, that the data set is formatted with as much information as possible. + +`field.type` can be supplied +} +\examples{ +data <- redcapcast_data +data |> ds2dd_detailed(validate.time = TRUE) +data |> ds2dd_detailed() +iris |> ds2dd_detailed(add.auto.id = TRUE) +mtcars |> ds2dd_detailed(add.auto.id = TRUE) +} diff --git a/man/guess_time_only_filter.Rd b/man/guess_time_only_filter.Rd new file mode 100644 index 0000000..58e6913 --- /dev/null +++ b/man/guess_time_only_filter.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{guess_time_only_filter} +\alias{guess_time_only_filter} +\title{Try at determining which are true time only variables} +\usage{ +guess_time_only_filter( + data, + validate = FALSE, + sel.pos = "[Tt]i[d(me)]", + sel.neg = "[Dd]at[eo]" +) +} +\arguments{ +\item{data}{data set} + +\item{validate}{flag to output validation data. Will output list.} + +\item{sel.pos}{Positive selection regex string} + +\item{sel.neg}{Negative selection regex string} +} +\value{ +character vector or list depending on `validate` flag. +} +\description{ +This is just a try at guessing data type based on data class and column names +hoping for a tiny bit of naming consistency. R does not include a time-only +data format natively, so the "hms" class from `readr` is used. This +has to be converted to character class before REDCap upload. +} +\examples{ +data <- redcapcast_data +data |> guess_time_only_filter() +data |> guess_time_only_filter(validate = TRUE) |> lapply(head) +} diff --git a/man/hms2character.Rd b/man/hms2character.Rd new file mode 100644 index 0000000..113b66a --- /dev/null +++ b/man/hms2character.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{hms2character} +\alias{hms2character} +\title{Change "hms" to "character" for REDCap upload.} +\usage{ +hms2character(data) +} +\arguments{ +\item{data}{data set} +} +\value{ +data.frame or tibble +} +\description{ +Change "hms" to "character" for REDCap upload. +} +\examples{ +data <- redcapcast_data +## data |> time_only_correction() |> hms2character() +} diff --git a/man/mark_complete.Rd b/man/mark_complete.Rd new file mode 100644 index 0000000..05aee6e --- /dev/null +++ b/man/mark_complete.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{mark_complete} +\alias{mark_complete} +\title{Completion marking based on completed upload} +\usage{ +mark_complete(upload, ls) +} +\arguments{ +\item{upload}{output list from `REDCapR::redcap_write()`} + +\item{ls}{output list from `ds2dd_detailed()`} +} +\value{ +list with `REDCapR::redcap_write()` results +} +\description{ +Completion marking based on completed upload +} diff --git a/man/time_only_correction.Rd b/man/time_only_correction.Rd new file mode 100644 index 0000000..75d3821 --- /dev/null +++ b/man/time_only_correction.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{time_only_correction} +\alias{time_only_correction} +\title{Correction based on time_only_filter function. Introduces new class for easier +validation labelling.} +\usage{ +time_only_correction(data, ...) +} +\arguments{ +\item{data}{data set} + +\item{...}{arguments passed on to `guess_time_only_filter()`} +} +\value{ +tibble +} +\description{ +Dependens on the data class "hms" introduced with +`guess_time_only_filter()` and converts these +} +\examples{ +data <- redcapcast_data +## data |> time_only_correction() +}