From ea08a2066fc4e630bd8398aa22b7c51d8fef5b3b Mon Sep 17 00:00:00 2001 From: Andreas Gammelgaard Damsbo Date: Mon, 18 Nov 2024 14:40:32 +0100 Subject: [PATCH] all data parsing and formatting has been seperated in individual functions --- R/ds2dd_detailed.R | 279 +++++++++++++++++++++++++++++----------- man/ds2dd_detailed.Rd | 14 +- man/guess_time_only.Rd | 33 +++++ man/haven_all_levels.Rd | 24 ++++ man/numchar2fct.Rd | 29 +++++ man/parse_data.Rd | 39 ++++++ man/var2fct.Rd | 29 +++++ 7 files changed, 362 insertions(+), 85 deletions(-) create mode 100644 man/guess_time_only.Rd create mode 100644 man/haven_all_levels.Rd create mode 100644 man/numchar2fct.Rd create mode 100644 man/parse_data.Rd create mode 100644 man/var2fct.Rd diff --git a/R/ds2dd_detailed.R b/R/ds2dd_detailed.R index a39e681..e47e415 100644 --- a/R/ds2dd_detailed.R +++ b/R/ds2dd_detailed.R @@ -135,18 +135,12 @@ hms2character <- function(data) { #' file with `haven::read_dta()`). #' @param metadata redcap metadata headings. Default is #' REDCapCAST:::metadata_names. -#' @param validate.time Flag to validate guessed time columns -#' @param time.var.sel.pos Positive selection regex string passed to -#' `gues_time_only_filter()` as sel.pos. -#' @param time.var.sel.neg Negative selection regex string passed to -#' `gues_time_only_filter()` as sel.neg. #' #' @return list of length 2 #' @export #' #' @examples #' data <- REDCapCAST::redcapcast_data -#' data |> ds2dd_detailed(validate.time = TRUE) #' data |> ds2dd_detailed() #' iris |> ds2dd_detailed(add.auto.id = TRUE) #' iris |> @@ -172,10 +166,7 @@ ds2dd_detailed <- function(data, field.label = NULL, field.label.attr = "label", field.validation = NULL, - metadata = names(REDCapCAST::redcapcast_meta), - validate.time = FALSE, - time.var.sel.pos = "[Tt]i[d(me)]", - time.var.sel.neg = "[Dd]at[eo]") { + metadata = names(REDCapCAST::redcapcast_meta)) { ## Handles the odd case of no id column present if (add.auto.id) { data <- dplyr::tibble( @@ -185,43 +176,6 @@ ds2dd_detailed <- function(data, message("A default id column has been added") } - if (validate.time) { - return(data |> guess_time_only_filter(validate = TRUE)) - } - - if (lapply(data, haven::is.labelled) |> (\(x)do.call(c, x))() |> any()) { - message("Data seems to be imported with haven from a Stata (.dta) file and - will be treated as such.") - data.source <- "dta" - } else { - data.source <- "" - } - - ## data classes - - ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two - ### classes - if (data.source == "dta") { - data_classes <- - data |> - haven::as_factor() |> - time_only_correction( - sel.pos = time.var.sel.pos, - sel.neg = time.var.sel.neg - ) |> - lapply(\(x)class(x)[1]) |> - (\(x)do.call(c, x))() - } else { - data_classes <- - data |> - time_only_correction( - sel.pos = time.var.sel.pos, - sel.neg = time.var.sel.neg - ) |> - lapply(\(x)class(x)[1]) |> - (\(x)do.call(c, x))() - } - ## --------------------------------------- ## Building the data dictionary ## --------------------------------------- @@ -240,12 +194,12 @@ ds2dd_detailed <- function(data, ## form.sep should be unique, but handles re-occuring pattern (by only considering first or last) and form.prefix defines if form is prefix or suffix ## The other split part is used as field names - if (form.prefix){ - dd$form_name <- clean_redcap_name(Reduce(c,lapply(parts,\(.x) .x[[1]]))) - dd$field_name <- Reduce(c,lapply(parts,\(.x) paste(.x[seq_len(length(.x))[-1]],collapse=form.sep))) + if (form.prefix) { + dd$form_name <- clean_redcap_name(Reduce(c, lapply(parts, \(.x) .x[[1]]))) + dd$field_name <- Reduce(c, lapply(parts, \(.x) paste(.x[seq_len(length(.x))[-1]], collapse = form.sep))) } else { - dd$form_name <- clean_redcap_name(Reduce(c,lapply(parts,\(.x) .x[[length(.x)]]))) - dd$field_name <- Reduce(c,lapply(parts,\(.x) paste(.x[seq_len(length(.x)-1)],collapse=form.sep))) + dd$form_name <- clean_redcap_name(Reduce(c, lapply(parts, \(.x) .x[[length(.x)]]))) + dd$field_name <- Reduce(c, lapply(parts, \(.x) paste(.x[seq_len(length(.x) - 1)], collapse = form.sep))) } } else { dd$form_name <- "data" @@ -269,17 +223,16 @@ ds2dd_detailed <- function(data, ## field_label if (is.null(field.label)) { - if (data.source == "dta") { - dd$field_label <- data |> - lapply(function(x) { - if (haven::is.labelled(x)) { - attributes(x)[[field.label.attr]] - } else { - NA - } - }) |> - (\(x)do.call(c, x))() - } + dd$field_label <- data |> + lapply(function(x) { + if (haven::is.labelled(x)) { + att <- haven_all_levels(x) + names(att) + } else { + NA + } + }) |> + (\(x)do.call(c, x))() dd <- dd |> dplyr::mutate(field_label = dplyr::if_else(is.na(field_label), @@ -294,6 +247,8 @@ ds2dd_detailed <- function(data, } + data_classes <- do.call(c, lapply(data, \(.x)class(.x)[1])) + ## field_type if (is.null(field.type)) { @@ -312,7 +267,6 @@ ds2dd_detailed <- function(data, } ## validation - if (is.null(field.validation)) { dd <- dd |> dplyr::mutate( @@ -336,15 +290,13 @@ ds2dd_detailed <- function(data, } } - - ## choices - if (data.source == "dta") { + if (any(do.call(c, lapply(data, haven::is.labelled)))) { factor_levels <- data |> lapply(function(x) { if (haven::is.labelled(x)) { - att <- attributes(x)$labels + att <- haven_all_levels(x) paste(paste(att, names(att), sep = ", "), collapse = " | ") } else { NA @@ -383,16 +335,75 @@ ds2dd_detailed <- function(data, list( data = data |> - time_only_correction( - sel.pos = time.var.sel.pos, - sel.neg = time.var.sel.neg - ) |> hms2character() |> stats::setNames(dd$field_name), meta = dd ) } +#' Finish incomplete haven attributes substituting missings with values +#' +#' @param data haven labelled variable +#' +#' @return named vector +#' @export +#' +#' @examples +#' ds <- structure(c(1, 2, 3, 2, 10, 9), +#' labels = c(Unknown = 9, Refused = 10), +#' class = "haven_labelled" +#' ) +#' ds |> haven_all_levels() +haven_all_levels <- function(data) { + stopifnot(haven::is.labelled(data)) + if (length(attributes(data)$labels) == length(unique(data))) { + out <- attributes(data)$labels + } else { + att <- attributes(data)$labels + out <- c(unique(data[!data %in% att]), att) |> + stats::setNames(c(unique(data[!data %in% att]), names(att))) + } + out +} + + +#' Guess time variables based on naming pattern +#' +#' @description +#' This is for repairing data with time variables with appended "1970-01-01" +#' +#' +#' @param data data.frame or tibble +#' @param validate.time Flag to validate guessed time columns +#' @param time.var.sel.pos Positive selection regex string passed to +#' `gues_time_only_filter()` as sel.pos. +#' @param time.var.sel.neg Negative selection regex string passed to +#' `gues_time_only_filter()` as sel.neg. +#' +#' @return data.frame or tibble +#' @export +#' +#' @examples +#' redcapcast_data |> guess_time_only(validate.time = TRUE) +guess_time_only <- function(data, + validate.time = FALSE, + time.var.sel.pos = "[Tt]i[d(me)]", + time.var.sel.neg = "[Dd]at[eo]") { + if (validate.time) { + return(data |> guess_time_only_filter(validate = TRUE)) + } + + ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two + ### classes + data |> time_only_correction( + sel.pos = time.var.sel.pos, + sel.neg = time.var.sel.neg + ) +} + + + + ### Completion #' Completion marking based on completed upload #' @@ -413,3 +424,127 @@ mark_complete <- function(upload, ls) { ) |> stats::setNames(c(names(data)[1], paste0(forms, "_complete"))) } + + +#' Helper to auto-parse un-formatted data with haven and readr +#' +#' @param data data.frame or tibble +#' @param guess_type logical to guess type with readr +#' @param col_types specify col_types using readr semantics. Ignored if guess_type is TRUE +#' @param locale option to specify locale. Defaults to readr::default_locale(). +#' @param ignore.vars specify column names of columns to ignore when parsing +#' @param ... ignored +#' +#' @return data.frame or tibble +#' @export +#' +#' @examples +#' mtcars |> +#' parse_data() |> +#' str() +parse_data <- function(data, + guess_type = TRUE, + col_types = NULL, + locale = readr::default_locale(), + ignore.vars = "cpr", + ...) { + if (any(ignore.vars %in% names(data))) { + ignored <- data[ignore.vars] + } else { + ignored <- NULL + } + + ## Parses haven data by applying labels as factors in case of any + if (do.call(c, lapply(data, (\(x)inherits(x, "haven_labelled")))) |> any()) { + data <- data |> + haven::as_factor() + } + + ## Applying readr cols + if (is.null(col_types) && guess_type) { + if (do.call(c, lapply(data, is.character)) |> any()) { + data <- data |> readr::type_convert( + locale = locale, + col_types = readr::cols(.default = readr::col_guess()) + ) + } + } else { + data <- data |> readr::type_convert( + locale = locale, + col_types = readr::cols(col_types) + ) + } + + if (!is.null(ignored)) { + data[ignore.vars] <- ignored + } + + data +} + +#' Convert vector to factor based on threshold of number of unique levels +#' +#' @description +#' This is a wrapper of forcats::as_factor, which sorts numeric vectors before +#' factoring, but levels character vectors in order of appearance. +#' +#' +#' @param data vector or data.frame column +#' @param unique.n threshold to convert class to factor +#' +#' @return vector +#' @export +#' @importFrom forcats as_factor +#' +#' @examples +#' sample(seq_len(4), 20, TRUE) |> +#' var2fct(6) |> +#' summary() +#' sample(letters, 20) |> +#' var2fct(6) |> +#' summary() +#' sample(letters[1:4], 20, TRUE) |> var2fct(6) +var2fct <- function(data, unique.n) { + if (length(unique(data)) <= unique.n) { + forcats::as_factor(data) + } else { + data + } +} + +#' Applying var2fct across data set +#' +#' @description +#' Individual thresholds for character and numeric columns +#' +#' @param data dataset. data.frame or tibble +#' @param numeric.threshold threshold for var2fct for numeric columns. Default +#' is 6. +#' @param character.throshold threshold for var2fct for character columns. +#' Default is 6. +#' +#' @return data.frame or tibble +#' @export +#' +#' @examples +#' mtcars |> str() +#' mtcars |> +#' numchar2fct(numeric.threshold = 6) |> +#' str() +numchar2fct <- function(data, numeric.threshold = 6, character.throshold = 6) { + data |> + dplyr::mutate( + dplyr::across( + dplyr::where(is.numeric), + \(.x){ + var2fct(data = .x, unique.n = numeric.threshold) + } + ), + dplyr::across( + dplyr::where(is.character), + \(.x){ + var2fct(data = .x, unique.n = character.throshold) + } + ) + ) +} diff --git a/man/ds2dd_detailed.Rd b/man/ds2dd_detailed.Rd index f8a062a..c7cbb2e 100644 --- a/man/ds2dd_detailed.Rd +++ b/man/ds2dd_detailed.Rd @@ -15,10 +15,7 @@ ds2dd_detailed( field.label = NULL, field.label.attr = "label", field.validation = NULL, - metadata = names(REDCapCAST::redcapcast_meta), - validate.time = FALSE, - time.var.sel.pos = "[Tt]i[d(me)]", - time.var.sel.neg = "[Dd]at[eo]" + metadata = names(REDCapCAST::redcapcast_meta) ) } \arguments{ @@ -58,14 +55,6 @@ file with `haven::read_dta()`).} \item{metadata}{redcap metadata headings. Default is REDCapCAST:::metadata_names.} - -\item{validate.time}{Flag to validate guessed time columns} - -\item{time.var.sel.pos}{Positive selection regex string passed to -`gues_time_only_filter()` as sel.pos.} - -\item{time.var.sel.neg}{Negative selection regex string passed to -`gues_time_only_filter()` as sel.neg.} } \value{ list of length 2 @@ -84,7 +73,6 @@ Ensure, that the data set is formatted with as much information as possible. } \examples{ data <- REDCapCAST::redcapcast_data -data |> ds2dd_detailed(validate.time = TRUE) data |> ds2dd_detailed() iris |> ds2dd_detailed(add.auto.id = TRUE) iris |> diff --git a/man/guess_time_only.Rd b/man/guess_time_only.Rd new file mode 100644 index 0000000..af45630 --- /dev/null +++ b/man/guess_time_only.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{guess_time_only} +\alias{guess_time_only} +\title{Guess time variables based on naming pattern} +\usage{ +guess_time_only( + data, + validate.time = FALSE, + time.var.sel.pos = "[Tt]i[d(me)]", + time.var.sel.neg = "[Dd]at[eo]" +) +} +\arguments{ +\item{data}{data.frame or tibble} + +\item{validate.time}{Flag to validate guessed time columns} + +\item{time.var.sel.pos}{Positive selection regex string passed to +`gues_time_only_filter()` as sel.pos.} + +\item{time.var.sel.neg}{Negative selection regex string passed to +`gues_time_only_filter()` as sel.neg.} +} +\value{ +data.frame or tibble +} +\description{ +This is for repairing data with time variables with appended "1970-01-01" +} +\examples{ +redcapcast_data |> guess_time_only(validate.time = TRUE) +} diff --git a/man/haven_all_levels.Rd b/man/haven_all_levels.Rd new file mode 100644 index 0000000..f15ff23 --- /dev/null +++ b/man/haven_all_levels.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{haven_all_levels} +\alias{haven_all_levels} +\title{Finish incomplete haven attributes substituting missings with values} +\usage{ +haven_all_levels(data) +} +\arguments{ +\item{data}{haven labelled variable} +} +\value{ +named vector +} +\description{ +Finish incomplete haven attributes substituting missings with values +} +\examples{ +ds <- structure(c(1, 2, 3, 2, 10, 9), + labels = c(Unknown = 9, Refused = 10), + class = "haven_labelled" +) +ds |> haven_all_levels() +} diff --git a/man/numchar2fct.Rd b/man/numchar2fct.Rd new file mode 100644 index 0000000..bb9da29 --- /dev/null +++ b/man/numchar2fct.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{numchar2fct} +\alias{numchar2fct} +\title{Applying var2fct across data set} +\usage{ +numchar2fct(data, numeric.threshold = 6, character.throshold = 6) +} +\arguments{ +\item{data}{dataset. data.frame or tibble} + +\item{numeric.threshold}{threshold for var2fct for numeric columns. Default +is 6.} + +\item{character.throshold}{threshold for var2fct for character columns. +Default is 6.} +} +\value{ +data.frame or tibble +} +\description{ +Individual thresholds for character and numeric columns +} +\examples{ +mtcars |> str() +mtcars |> + numchar2fct(numeric.threshold = 6) |> + str() +} diff --git a/man/parse_data.Rd b/man/parse_data.Rd new file mode 100644 index 0000000..db54782 --- /dev/null +++ b/man/parse_data.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{parse_data} +\alias{parse_data} +\title{Helper to auto-parse un-formatted data with haven and readr} +\usage{ +parse_data( + data, + guess_type = TRUE, + col_types = NULL, + locale = readr::default_locale(), + ignore.vars = "cpr", + ... +) +} +\arguments{ +\item{data}{data.frame or tibble} + +\item{guess_type}{logical to guess type with readr} + +\item{col_types}{specify col_types using readr semantics. Ignored if guess_type is TRUE} + +\item{locale}{option to specify locale. Defaults to readr::default_locale().} + +\item{ignore.vars}{specify column names of columns to ignore when parsing} + +\item{...}{ignored} +} +\value{ +data.frame or tibble +} +\description{ +Helper to auto-parse un-formatted data with haven and readr +} +\examples{ +mtcars |> + parse_data() |> + str() +} diff --git a/man/var2fct.Rd b/man/var2fct.Rd new file mode 100644 index 0000000..5b2265f --- /dev/null +++ b/man/var2fct.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{var2fct} +\alias{var2fct} +\title{Convert vector to factor based on threshold of number of unique levels} +\usage{ +var2fct(data, unique.n) +} +\arguments{ +\item{data}{vector or data.frame column} + +\item{unique.n}{threshold to convert class to factor} +} +\value{ +vector +} +\description{ +This is a wrapper of forcats::as_factor, which sorts numeric vectors before +factoring, but levels character vectors in order of appearance. +} +\examples{ +sample(seq_len(4), 20, TRUE) |> + var2fct(6) |> + summary() +sample(letters, 20) |> + var2fct(6) |> + summary() +sample(letters[1:4], 20, TRUE) |> var2fct(6) +}