utils::globalVariables(c( "stats::setNames", "field_name", "field_type", "select_choices_or_calculations", "field_label" )) #' Try at determining which are true time only variables #' #' @description #' This is just a try at guessing data type based on data class and column names #' hoping for a tiny bit of naming consistency. R does not include a time-only #' data format natively, so the "hms" class from `readr` is used. This #' has to be converted to character class before REDCap upload. #' #' @param data data set #' @param validate flag to output validation data. Will output list. #' @param sel.pos Positive selection regex string #' @param sel.neg Negative selection regex string #' #' @return character vector or list depending on `validate` flag. #' @export #' #' @examples #' data <- redcapcast_data #' data |> guess_time_only_filter() #' data |> #' guess_time_only_filter(validate = TRUE) |> #' lapply(head) guess_time_only_filter <- function(data, validate = FALSE, sel.pos = "[Tt]i[d(me)]", sel.neg = "[Dd]at[eo]") { datetime_nms <- data |> lapply(\(x) any(c("POSIXct", "hms") %in% class(x))) |> (\(x) names(data)[do.call(c, x)])() time_only_log <- datetime_nms |> (\(x) { ## Detects which are determined true Time only variables ## Inspection is necessary grepl(pattern = sel.pos, x = x) & !grepl(pattern = sel.neg, x = x) })() if (validate) { list( "is.POSIX" = data[datetime_nms], "is.datetime" = data[datetime_nms[!time_only_log]], "is.time_only" = data[datetime_nms[time_only_log]] ) } else { datetime_nms[time_only_log] } } #' Correction based on time_only_filter function #' #' #' @param data data set #' @param ... arguments passed on to `guess_time_only_filter()` #' #' @return tibble #' @importFrom readr parse_time #' #' @examples #' data <- redcapcast_data #' ## data |> time_only_correction() time_only_correction <- function(data, ...) { nms <- guess_time_only_filter(data, ...) z <- nms |> lapply(\(y) { readr::parse_time(format(data[[y]], format = "%H:%M:%S")) }) |> suppressMessages(dplyr::bind_cols()) |> stats::setNames(nm = nms) data[nms] <- z data } #' Change "hms" to "character" for REDCap upload. #' #' @param data data set #' #' @return data.frame or tibble #' #' @examples #' data <- redcapcast_data #' ## data |> time_only_correction() |> hms2character() hms2character <- function(data) { data |> lapply(function(x) { if ("hms" %in% class(x)) { as.character(x) } else { x } }) |> dplyr::bind_cols() } #' Extract data from stata file for data dictionary #' #' @details #' This function is a natural development of the ds2dd() function. It assumes #' that the first column is the ID-column. No checks. #' Please, do always inspect the data dictionary before upload. #' #' Ensure, that the data set is formatted with as much information as possible. #' #' `field.type` can be supplied #' #' @param data data frame #' @param date.format date format, character string. ymd/dmy/mdy. dafault is #' dmy. #' @param add.auto.id flag to add id column #' @param form.name manually specify form name(s). Vector of length 1 or #' ncol(data). Default is NULL and "data" is used. #' @param form.sep If supplied dataset has form names as suffix or prefix to the #' column/variable names, the seperator can be specified. If supplied, the #' form.name is ignored. Default is NULL. #' @param form.prefix Flag to set if form is prefix (TRUE) or suffix (FALSE) to #' the column names. Assumes all columns have pre- or suffix if specified. #' @param field.type manually specify field type(s). Vector of length 1 or #' ncol(data). Default is NULL and "text" is used for everything but factors, #' which wil get "radio". #' @param field.label manually specify field label(s). Vector of length 1 or #' ncol(data). Default is NULL and colnames(data) is used or attribute #' `field.label.attr` for haven_labelled data set (imported .dta file with #' `haven::read_dta()`). #' @param field.label.attr attribute name for named labels for haven_labelled #' data set (imported .dta file with `haven::read_dta()`. Default is "label" #' @param field.validation manually specify field validation(s). Vector of #' length 1 or ncol(data). Default is NULL and `levels()` are used for factors #' or attribute `factor.labels.attr` for haven_labelled data set (imported .dta #' file with `haven::read_dta()`). #' @param metadata redcap metadata headings. Default is #' REDCapCAST:::metadata_names. #' @param convert.logicals convert logicals to factor. Default is TRUE. #' #' @return list of length 2 #' @export #' #' @examples #' data <- REDCapCAST::redcapcast_data #' data |> ds2dd_detailed() #' iris |> ds2dd_detailed(add.auto.id = TRUE) #' iris |> #' ds2dd_detailed( #' add.auto.id = TRUE, #' form.name = sample(c("b", "c"), size = 6, replace = TRUE, prob = rep(.5, 2)) #' ) |> #' purrr::pluck("meta") #' mtcars |> ds2dd_detailed(add.auto.id = TRUE) #' data <- iris |> #' ds2dd_detailed(add.auto.id = TRUE) |> #' purrr::pluck("data") #' names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)), #' replace=TRUE,prob = rep(x=.5,2))}__{names(data)}") #' data |> ds2dd_detailed(form.sep = "__") ds2dd_detailed <- function(data, add.auto.id = FALSE, date.format = "dmy", form.name = NULL, form.sep = NULL, form.prefix = TRUE, field.type = NULL, field.label = NULL, field.label.attr = "label", field.validation = NULL, metadata = names(REDCapCAST::redcapcast_meta), convert.logicals = TRUE) { if (convert.logicals) { # Labels/attributes are saved # labels <- lapply(data, \(.x){ # get_attr(.x, attr = NULL) # }) data <- data |> ## Converts logical to factor, which overwrites attributes dplyr::mutate(dplyr::across(dplyr::where(is.logical), as_factor)) # Old attributes are appended # data <- purrr::imap(no_attr,\(.x,.i){ # attributes(.x) <- c(attributes(.x),labels[[.i]]) # .x # }) |> # dplyr::bind_cols() } ## Handles the odd case of no id column present if (add.auto.id) { data <- dplyr::tibble( record_id = seq_len(nrow(data)), data ) message("A default id column has been added") } ## --------------------------------------- ## Building the data dictionary ## --------------------------------------- ## skeleton dd <- data.frame(matrix(ncol = length(metadata), nrow = ncol(data))) |> stats::setNames(metadata) |> dplyr::tibble() ## form_name and field_name if (!is.null(form.sep)) { if (form.sep != "") { parts <- strsplit(names(data), split = form.sep) ## form.sep should be unique, but handles re-occuring pattern (by only considering first or last) and form.prefix defines if form is prefix or suffix ## The other split part is used as field names if (form.prefix) { dd$form_name <- clean_redcap_name(Reduce(c, lapply(parts, \(.x) .x[[1]]))) dd$field_name <- Reduce(c, lapply(parts, \(.x) paste(.x[seq_len(length(.x))[-1]], collapse = form.sep))) } else { dd$form_name <- clean_redcap_name(Reduce(c, lapply(parts, \(.x) .x[[length(.x)]]))) dd$field_name <- Reduce(c, lapply(parts, \(.x) paste(.x[seq_len(length(.x) - 1)], collapse = form.sep))) } } else { dd$form_name <- "data" dd$field_name <- gsub(" ", "_", tolower(colnames(data))) } } else { ## if no form name prefix, the colnames are used as field_names dd$field_name <- gsub(" ", "_", tolower(colnames(data))) if (is.null(form.name)) { dd$form_name <- "data" } else { if (length(form.name) == 1 || length(form.name) == nrow(dd)) { dd$form_name <- form.name } else { stop("Length of supplied 'form.name' has to be one (1) or ncol(data).") } } } ## field_label if (is.null(field.label)) { dd$field_label <- data |> sapply(function(x) { get_attr(x, attr = field.label.attr) }) dd <- dd |> dplyr::mutate(field_label = dplyr::if_else(is.na(field_label), field_name, field_label )) } else { if (length(field.label) == 1 || length(field.label) == nrow(dd)) { dd$field_label <- field.label } else { stop("Length of supplied 'field.label' has to be one (1) or ncol(data).") } } data_classes <- do.call(c, lapply(data, \(.x)class(.x)[1])) ## field_type if (is.null(field.type)) { dd$field_type <- "text" dd <- dd |> dplyr::mutate(field_type = dplyr::if_else(data_classes == "factor", "radio", field_type )) } else { if (length(field.type) == 1 || length(field.type) == nrow(dd)) { dd$field_type <- field.type } else { stop("Length of supplied 'field.type' has to be one (1) or ncol(data).") } } ## validation if (is.null(field.validation)) { dd <- dd |> dplyr::mutate( text_validation_type_or_show_slider_number = dplyr::case_when( data_classes == "Date" ~ paste0("date_", date.format), data_classes == "hms" ~ "time_hh_mm_ss", ## Self invented format after filtering data_classes == "POSIXct" ~ paste0("datetime_", date.format), data_classes == "numeric" ~ "number" ) ) } else { if (length(field.validation) == 1 || length(field.validation) == nrow(dd)) { dd$text_validation_type_or_show_slider_number <- field.validation } else { stop("Length of supplied 'field.validation' has to be one (1) or ncol(data).") } } ## choices factor_levels <- data |> lapply(function(x) { if (is.factor(x)) { ## Custom function to ensure factor order and keep original values ## Avoiding refactoring to keep as much information as possible lvls <- sort(named_levels(x)) paste( paste(lvls, names(lvls), sep = ", " ), collapse = " | " ) } else { NA } }) |> (\(x)do.call(c, x))() dd <- dd |> dplyr::mutate( select_choices_or_calculations = dplyr::if_else( is.na(factor_levels), select_choices_or_calculations, factor_levels ) ) list( data = data |> hms2character() |> stats::setNames(dd$field_name), meta = dd ) } #' Guess time variables based on naming pattern #' #' @description #' This is for repairing data with time variables with appended "1970-01-01" #' #' #' @param data data.frame or tibble #' @param validate.time Flag to validate guessed time columns #' @param time.var.sel.pos Positive selection regex string passed to #' `gues_time_only_filter()` as sel.pos. #' @param time.var.sel.neg Negative selection regex string passed to #' `gues_time_only_filter()` as sel.neg. #' #' @return data.frame or tibble #' @export #' #' @examples #' redcapcast_data |> guess_time_only(validate.time = TRUE) guess_time_only <- function(data, validate.time = FALSE, time.var.sel.pos = "[Tt]i[d(me)]", time.var.sel.neg = "[Dd]at[eo]") { if (validate.time) { return(data |> guess_time_only_filter(validate = TRUE)) } ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two ### classes data |> time_only_correction( sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg ) } ### Completion #' Completion marking based on completed upload #' #' @param upload output list from `REDCapR::redcap_write()` #' @param ls output list from `ds2dd_detailed()` #' #' @return list with `REDCapR::redcap_write()` results mark_complete <- function(upload, ls) { data <- ls$data meta <- ls$meta forms <- unique(meta$form_name) cbind( data[[1]][data[[1]] %in% upload$affected_ids], data.frame(matrix(2, ncol = length(forms), nrow = upload$records_affected_count )) ) |> stats::setNames(c(names(data)[1], paste0(forms, "_complete"))) } #' Helper to auto-parse un-formatted data with haven and readr #' #' @param data data.frame or tibble #' @param guess_type logical to guess type with readr #' @param col_types specify col_types using readr semantics. Ignored if guess_type is TRUE #' @param locale option to specify locale. Defaults to readr::default_locale(). #' @param ignore.vars specify column names of columns to ignore when parsing #' @param ... ignored #' #' @return data.frame or tibble #' @export #' #' @examples #' mtcars |> #' parse_data() |> #' str() parse_data <- function(data, guess_type = TRUE, col_types = NULL, locale = readr::default_locale(), ignore.vars = "cpr", ...) { if (any(ignore.vars %in% names(data))) { ignored <- data[ignore.vars] } else { ignored <- NULL } ## Parses haven data by applying labels as factors in case of any if (do.call(c, lapply(data, (\(x)inherits(x, "haven_labelled")))) |> any()) { data <- data |> haven::as_factor() } ## Applying readr cols if (is.null(col_types) && guess_type) { if (do.call(c, lapply(data, is.character)) |> any()) { data <- data |> readr::type_convert( locale = locale, col_types = readr::cols(.default = readr::col_guess()) ) } } else { data <- data |> readr::type_convert( locale = locale, col_types = readr::cols(col_types) ) } if (!is.null(ignored)) { data[ignore.vars] <- ignored } data } #' Convert vector to factor based on threshold of number of unique levels #' #' @description #' This is a wrapper of forcats::as_factor, which sorts numeric vectors before #' factoring, but levels character vectors in order of appearance. #' #' #' @param data vector or data.frame column #' @param unique.n threshold to convert class to factor #' #' @return vector #' @export #' @importFrom forcats as_factor #' #' @examples #' sample(seq_len(4), 20, TRUE) |> #' var2fct(6) |> #' summary() #' sample(letters, 20) |> #' var2fct(6) |> #' summary() #' sample(letters[1:4], 20, TRUE) |> var2fct(6) var2fct <- function(data, unique.n) { if (length(unique(data)) <= unique.n) { forcats::as_factor(data) } else { data } } #' Applying var2fct across data set #' #' @description #' Individual thresholds for character and numeric columns #' #' @param data dataset. data.frame or tibble #' @param numeric.threshold threshold for var2fct for numeric columns. Default #' is 6. #' @param character.throshold threshold for var2fct for character columns. #' Default is 6. #' #' @return data.frame or tibble #' @export #' #' @examples #' mtcars |> str() #' mtcars |> #' numchar2fct(numeric.threshold = 6) |> #' str() numchar2fct <- function(data, numeric.threshold = 6, character.throshold = 6) { data |> dplyr::mutate( dplyr::across( dplyr::where(is.numeric), \(.x){ var2fct(data = .x, unique.n = numeric.threshold) } ), dplyr::across( dplyr::where(is.character), \(.x){ var2fct(data = .x, unique.n = character.throshold) } ) ) }