From ea08a2066fc4e630bd8398aa22b7c51d8fef5b3b Mon Sep 17 00:00:00 2001
From: Andreas Gammelgaard Damsbo <andreas@gdamsbo.dk>
Date: Mon, 18 Nov 2024 14:40:32 +0100
Subject: [PATCH] all data parsing and formatting has been seperated in
 individual functions

---
 R/ds2dd_detailed.R      | 279 +++++++++++++++++++++++++++++-----------
 man/ds2dd_detailed.Rd   |  14 +-
 man/guess_time_only.Rd  |  33 +++++
 man/haven_all_levels.Rd |  24 ++++
 man/numchar2fct.Rd      |  29 +++++
 man/parse_data.Rd       |  39 ++++++
 man/var2fct.Rd          |  29 +++++
 7 files changed, 362 insertions(+), 85 deletions(-)
 create mode 100644 man/guess_time_only.Rd
 create mode 100644 man/haven_all_levels.Rd
 create mode 100644 man/numchar2fct.Rd
 create mode 100644 man/parse_data.Rd
 create mode 100644 man/var2fct.Rd

diff --git a/R/ds2dd_detailed.R b/R/ds2dd_detailed.R
index a39e681..e47e415 100644
--- a/R/ds2dd_detailed.R
+++ b/R/ds2dd_detailed.R
@@ -135,18 +135,12 @@ hms2character <- function(data) {
 #' file with `haven::read_dta()`).
 #' @param metadata redcap metadata headings. Default is
 #' REDCapCAST:::metadata_names.
-#' @param validate.time Flag to validate guessed time columns
-#' @param time.var.sel.pos Positive selection regex string passed to
-#' `gues_time_only_filter()` as sel.pos.
-#' @param time.var.sel.neg Negative selection regex string passed to
-#' `gues_time_only_filter()` as sel.neg.
 #'
 #' @return list of length 2
 #' @export
 #'
 #' @examples
 #' data <- REDCapCAST::redcapcast_data
-#' data |> ds2dd_detailed(validate.time = TRUE)
 #' data |> ds2dd_detailed()
 #' iris |> ds2dd_detailed(add.auto.id = TRUE)
 #' iris |>
@@ -172,10 +166,7 @@ ds2dd_detailed <- function(data,
                            field.label = NULL,
                            field.label.attr = "label",
                            field.validation = NULL,
-                           metadata = names(REDCapCAST::redcapcast_meta),
-                           validate.time = FALSE,
-                           time.var.sel.pos = "[Tt]i[d(me)]",
-                           time.var.sel.neg = "[Dd]at[eo]") {
+                           metadata = names(REDCapCAST::redcapcast_meta)) {
   ## Handles the odd case of no id column present
   if (add.auto.id) {
     data <- dplyr::tibble(
@@ -185,43 +176,6 @@ ds2dd_detailed <- function(data,
     message("A default id column has been added")
   }
 
-  if (validate.time) {
-    return(data |> guess_time_only_filter(validate = TRUE))
-  }
-
-  if (lapply(data, haven::is.labelled) |> (\(x)do.call(c, x))() |> any()) {
-    message("Data seems to be imported with haven from a Stata (.dta) file and
-            will be treated as such.")
-    data.source <- "dta"
-  } else {
-    data.source <- ""
-  }
-
-  ## data classes
-
-  ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two
-  ### classes
-  if (data.source == "dta") {
-    data_classes <-
-      data |>
-      haven::as_factor() |>
-      time_only_correction(
-        sel.pos = time.var.sel.pos,
-        sel.neg = time.var.sel.neg
-      ) |>
-      lapply(\(x)class(x)[1]) |>
-      (\(x)do.call(c, x))()
-  } else {
-    data_classes <-
-      data |>
-      time_only_correction(
-        sel.pos = time.var.sel.pos,
-        sel.neg = time.var.sel.neg
-      ) |>
-      lapply(\(x)class(x)[1]) |>
-      (\(x)do.call(c, x))()
-  }
-
   ## ---------------------------------------
   ## Building the data dictionary
   ## ---------------------------------------
@@ -240,12 +194,12 @@ ds2dd_detailed <- function(data,
 
       ## form.sep should be unique, but handles re-occuring pattern (by only considering first or last) and form.prefix defines if form is prefix or suffix
       ## The other split part is used as field names
-      if (form.prefix){
-        dd$form_name <- clean_redcap_name(Reduce(c,lapply(parts,\(.x) .x[[1]])))
-        dd$field_name <- Reduce(c,lapply(parts,\(.x) paste(.x[seq_len(length(.x))[-1]],collapse=form.sep)))
+      if (form.prefix) {
+        dd$form_name <- clean_redcap_name(Reduce(c, lapply(parts, \(.x) .x[[1]])))
+        dd$field_name <- Reduce(c, lapply(parts, \(.x) paste(.x[seq_len(length(.x))[-1]], collapse = form.sep)))
       } else {
-        dd$form_name <- clean_redcap_name(Reduce(c,lapply(parts,\(.x) .x[[length(.x)]])))
-        dd$field_name <- Reduce(c,lapply(parts,\(.x) paste(.x[seq_len(length(.x)-1)],collapse=form.sep)))
+        dd$form_name <- clean_redcap_name(Reduce(c, lapply(parts, \(.x) .x[[length(.x)]])))
+        dd$field_name <- Reduce(c, lapply(parts, \(.x) paste(.x[seq_len(length(.x) - 1)], collapse = form.sep)))
       }
     } else {
       dd$form_name <- "data"
@@ -269,17 +223,16 @@ ds2dd_detailed <- function(data,
   ## field_label
 
   if (is.null(field.label)) {
-    if (data.source == "dta") {
-      dd$field_label <- data |>
-        lapply(function(x) {
-          if (haven::is.labelled(x)) {
-            attributes(x)[[field.label.attr]]
-          } else {
-            NA
-          }
-        }) |>
-        (\(x)do.call(c, x))()
-    }
+    dd$field_label <- data |>
+      lapply(function(x) {
+        if (haven::is.labelled(x)) {
+          att <- haven_all_levels(x)
+          names(att)
+        } else {
+          NA
+        }
+      }) |>
+      (\(x)do.call(c, x))()
 
     dd <-
       dd |> dplyr::mutate(field_label = dplyr::if_else(is.na(field_label),
@@ -294,6 +247,8 @@ ds2dd_detailed <- function(data,
   }
 
 
+  data_classes <- do.call(c, lapply(data, \(.x)class(.x)[1]))
+
   ## field_type
 
   if (is.null(field.type)) {
@@ -312,7 +267,6 @@ ds2dd_detailed <- function(data,
   }
 
   ## validation
-
   if (is.null(field.validation)) {
     dd <-
       dd |> dplyr::mutate(
@@ -336,15 +290,13 @@ ds2dd_detailed <- function(data,
     }
   }
 
-
-
   ## choices
 
-  if (data.source == "dta") {
+  if (any(do.call(c, lapply(data, haven::is.labelled)))) {
     factor_levels <- data |>
       lapply(function(x) {
         if (haven::is.labelled(x)) {
-          att <- attributes(x)$labels
+          att <- haven_all_levels(x)
           paste(paste(att, names(att), sep = ", "), collapse = " | ")
         } else {
           NA
@@ -383,16 +335,75 @@ ds2dd_detailed <- function(data,
 
   list(
     data = data |>
-      time_only_correction(
-        sel.pos = time.var.sel.pos,
-        sel.neg = time.var.sel.neg
-      ) |>
       hms2character() |>
       stats::setNames(dd$field_name),
     meta = dd
   )
 }
 
+#' Finish incomplete haven attributes substituting missings with values
+#'
+#' @param data haven labelled variable
+#'
+#' @return named vector
+#' @export
+#'
+#' @examples
+#' ds <- structure(c(1, 2, 3, 2, 10, 9),
+#'   labels = c(Unknown = 9, Refused = 10),
+#'   class = "haven_labelled"
+#' )
+#' ds |> haven_all_levels()
+haven_all_levels <- function(data) {
+  stopifnot(haven::is.labelled(data))
+  if (length(attributes(data)$labels) == length(unique(data))) {
+    out <- attributes(data)$labels
+  } else {
+    att <- attributes(data)$labels
+    out <- c(unique(data[!data %in% att]), att) |>
+      stats::setNames(c(unique(data[!data %in% att]), names(att)))
+  }
+  out
+}
+
+
+#' Guess time variables based on naming pattern
+#'
+#' @description
+#' This is for repairing data with time variables with appended "1970-01-01"
+#'
+#'
+#' @param data data.frame or tibble
+#' @param validate.time Flag to validate guessed time columns
+#' @param time.var.sel.pos Positive selection regex string passed to
+#' `gues_time_only_filter()` as sel.pos.
+#' @param time.var.sel.neg Negative selection regex string passed to
+#' `gues_time_only_filter()` as sel.neg.
+#'
+#' @return data.frame or tibble
+#' @export
+#'
+#' @examples
+#' redcapcast_data |> guess_time_only(validate.time = TRUE)
+guess_time_only <- function(data,
+                            validate.time = FALSE,
+                            time.var.sel.pos = "[Tt]i[d(me)]",
+                            time.var.sel.neg = "[Dd]at[eo]") {
+  if (validate.time) {
+    return(data |> guess_time_only_filter(validate = TRUE))
+  }
+
+  ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two
+  ### classes
+  data |> time_only_correction(
+    sel.pos = time.var.sel.pos,
+    sel.neg = time.var.sel.neg
+  )
+}
+
+
+
+
 ### Completion
 #' Completion marking based on completed upload
 #'
@@ -413,3 +424,127 @@ mark_complete <- function(upload, ls) {
   ) |>
     stats::setNames(c(names(data)[1], paste0(forms, "_complete")))
 }
+
+
+#' Helper to auto-parse un-formatted data with haven and readr
+#'
+#' @param data data.frame or tibble
+#' @param guess_type logical to guess type with readr
+#' @param col_types specify col_types using readr semantics. Ignored if guess_type is TRUE
+#' @param locale option to specify locale. Defaults to readr::default_locale().
+#' @param ignore.vars specify column names of columns to ignore when parsing
+#' @param ... ignored
+#'
+#' @return data.frame or tibble
+#' @export
+#'
+#' @examples
+#' mtcars |>
+#'   parse_data() |>
+#'   str()
+parse_data <- function(data,
+                       guess_type = TRUE,
+                       col_types = NULL,
+                       locale = readr::default_locale(),
+                       ignore.vars = "cpr",
+                       ...) {
+  if (any(ignore.vars %in% names(data))) {
+    ignored <- data[ignore.vars]
+  } else {
+    ignored <- NULL
+  }
+
+  ## Parses haven data by applying labels as factors in case of any
+  if (do.call(c, lapply(data, (\(x)inherits(x, "haven_labelled")))) |> any()) {
+    data <- data |>
+      haven::as_factor()
+  }
+
+  ## Applying readr cols
+  if (is.null(col_types) && guess_type) {
+    if (do.call(c, lapply(data, is.character)) |> any()) {
+      data <- data |> readr::type_convert(
+        locale = locale,
+        col_types = readr::cols(.default = readr::col_guess())
+      )
+    }
+  } else {
+    data <- data |> readr::type_convert(
+      locale = locale,
+      col_types = readr::cols(col_types)
+    )
+  }
+
+  if (!is.null(ignored)) {
+    data[ignore.vars] <- ignored
+  }
+
+  data
+}
+
+#' Convert vector to factor based on threshold of number of unique levels
+#'
+#' @description
+#' This is a wrapper of forcats::as_factor, which sorts numeric vectors before
+#' factoring, but levels character vectors in order of appearance.
+#'
+#'
+#' @param data vector or data.frame column
+#' @param unique.n threshold to convert class to factor
+#'
+#' @return vector
+#' @export
+#' @importFrom forcats as_factor
+#'
+#' @examples
+#' sample(seq_len(4), 20, TRUE) |>
+#'   var2fct(6) |>
+#'   summary()
+#' sample(letters, 20) |>
+#'   var2fct(6) |>
+#'   summary()
+#' sample(letters[1:4], 20, TRUE) |> var2fct(6)
+var2fct <- function(data, unique.n) {
+  if (length(unique(data)) <= unique.n) {
+    forcats::as_factor(data)
+  } else {
+    data
+  }
+}
+
+#' Applying var2fct across data set
+#'
+#' @description
+#' Individual thresholds for character and numeric columns
+#'
+#' @param data dataset. data.frame or tibble
+#' @param numeric.threshold threshold for var2fct for numeric columns. Default
+#' is 6.
+#' @param character.throshold threshold for var2fct for character columns.
+#' Default is 6.
+#'
+#' @return data.frame or tibble
+#' @export
+#'
+#' @examples
+#' mtcars |> str()
+#' mtcars |>
+#'   numchar2fct(numeric.threshold = 6) |>
+#'   str()
+numchar2fct <- function(data, numeric.threshold = 6, character.throshold = 6) {
+  data |>
+    dplyr::mutate(
+      dplyr::across(
+        dplyr::where(is.numeric),
+        \(.x){
+          var2fct(data = .x, unique.n = numeric.threshold)
+        }
+      ),
+      dplyr::across(
+        dplyr::where(is.character),
+        \(.x){
+          var2fct(data = .x, unique.n = character.throshold)
+        }
+      )
+    )
+}
diff --git a/man/ds2dd_detailed.Rd b/man/ds2dd_detailed.Rd
index f8a062a..c7cbb2e 100644
--- a/man/ds2dd_detailed.Rd
+++ b/man/ds2dd_detailed.Rd
@@ -15,10 +15,7 @@ ds2dd_detailed(
   field.label = NULL,
   field.label.attr = "label",
   field.validation = NULL,
-  metadata = names(REDCapCAST::redcapcast_meta),
-  validate.time = FALSE,
-  time.var.sel.pos = "[Tt]i[d(me)]",
-  time.var.sel.neg = "[Dd]at[eo]"
+  metadata = names(REDCapCAST::redcapcast_meta)
 )
 }
 \arguments{
@@ -58,14 +55,6 @@ file with `haven::read_dta()`).}
 
 \item{metadata}{redcap metadata headings. Default is
 REDCapCAST:::metadata_names.}
-
-\item{validate.time}{Flag to validate guessed time columns}
-
-\item{time.var.sel.pos}{Positive selection regex string passed to
-`gues_time_only_filter()` as sel.pos.}
-
-\item{time.var.sel.neg}{Negative selection regex string passed to
-`gues_time_only_filter()` as sel.neg.}
 }
 \value{
 list of length 2
@@ -84,7 +73,6 @@ Ensure, that the data set is formatted with as much information as possible.
 }
 \examples{
 data <- REDCapCAST::redcapcast_data
-data |> ds2dd_detailed(validate.time = TRUE)
 data |> ds2dd_detailed()
 iris |> ds2dd_detailed(add.auto.id = TRUE)
 iris |>
diff --git a/man/guess_time_only.Rd b/man/guess_time_only.Rd
new file mode 100644
index 0000000..af45630
--- /dev/null
+++ b/man/guess_time_only.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{guess_time_only}
+\alias{guess_time_only}
+\title{Guess time variables based on naming pattern}
+\usage{
+guess_time_only(
+  data,
+  validate.time = FALSE,
+  time.var.sel.pos = "[Tt]i[d(me)]",
+  time.var.sel.neg = "[Dd]at[eo]"
+)
+}
+\arguments{
+\item{data}{data.frame or tibble}
+
+\item{validate.time}{Flag to validate guessed time columns}
+
+\item{time.var.sel.pos}{Positive selection regex string passed to
+`gues_time_only_filter()` as sel.pos.}
+
+\item{time.var.sel.neg}{Negative selection regex string passed to
+`gues_time_only_filter()` as sel.neg.}
+}
+\value{
+data.frame or tibble
+}
+\description{
+This is for repairing data with time variables with appended "1970-01-01"
+}
+\examples{
+redcapcast_data |> guess_time_only(validate.time = TRUE)
+}
diff --git a/man/haven_all_levels.Rd b/man/haven_all_levels.Rd
new file mode 100644
index 0000000..f15ff23
--- /dev/null
+++ b/man/haven_all_levels.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{haven_all_levels}
+\alias{haven_all_levels}
+\title{Finish incomplete haven attributes substituting missings with values}
+\usage{
+haven_all_levels(data)
+}
+\arguments{
+\item{data}{haven labelled variable}
+}
+\value{
+named vector
+}
+\description{
+Finish incomplete haven attributes substituting missings with values
+}
+\examples{
+ds <- structure(c(1, 2, 3, 2, 10, 9),
+  labels = c(Unknown = 9, Refused = 10),
+  class = "haven_labelled"
+)
+ds |> haven_all_levels()
+}
diff --git a/man/numchar2fct.Rd b/man/numchar2fct.Rd
new file mode 100644
index 0000000..bb9da29
--- /dev/null
+++ b/man/numchar2fct.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{numchar2fct}
+\alias{numchar2fct}
+\title{Applying var2fct across data set}
+\usage{
+numchar2fct(data, numeric.threshold = 6, character.throshold = 6)
+}
+\arguments{
+\item{data}{dataset. data.frame or tibble}
+
+\item{numeric.threshold}{threshold for var2fct for numeric columns. Default
+is 6.}
+
+\item{character.throshold}{threshold for var2fct for character columns.
+Default is 6.}
+}
+\value{
+data.frame or tibble
+}
+\description{
+Individual thresholds for character and numeric columns
+}
+\examples{
+mtcars |> str()
+mtcars |>
+  numchar2fct(numeric.threshold = 6) |>
+  str()
+}
diff --git a/man/parse_data.Rd b/man/parse_data.Rd
new file mode 100644
index 0000000..db54782
--- /dev/null
+++ b/man/parse_data.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{parse_data}
+\alias{parse_data}
+\title{Helper to auto-parse un-formatted data with haven and readr}
+\usage{
+parse_data(
+  data,
+  guess_type = TRUE,
+  col_types = NULL,
+  locale = readr::default_locale(),
+  ignore.vars = "cpr",
+  ...
+)
+}
+\arguments{
+\item{data}{data.frame or tibble}
+
+\item{guess_type}{logical to guess type with readr}
+
+\item{col_types}{specify col_types using readr semantics. Ignored if guess_type is TRUE}
+
+\item{locale}{option to specify locale. Defaults to readr::default_locale().}
+
+\item{ignore.vars}{specify column names of columns to ignore when parsing}
+
+\item{...}{ignored}
+}
+\value{
+data.frame or tibble
+}
+\description{
+Helper to auto-parse un-formatted data with haven and readr
+}
+\examples{
+mtcars |>
+  parse_data() |>
+  str()
+}
diff --git a/man/var2fct.Rd b/man/var2fct.Rd
new file mode 100644
index 0000000..5b2265f
--- /dev/null
+++ b/man/var2fct.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds2dd_detailed.R
+\name{var2fct}
+\alias{var2fct}
+\title{Convert vector to factor based on threshold of number of unique levels}
+\usage{
+var2fct(data, unique.n)
+}
+\arguments{
+\item{data}{vector or data.frame column}
+
+\item{unique.n}{threshold to convert class to factor}
+}
+\value{
+vector
+}
+\description{
+This is a wrapper of forcats::as_factor, which sorts numeric vectors before
+factoring, but levels character vectors in order of appearance.
+}
+\examples{
+sample(seq_len(4), 20, TRUE) |>
+  var2fct(6) |>
+  summary()
+sample(letters, 20) |>
+  var2fct(6) |>
+  summary()
+sample(letters[1:4], 20, TRUE) |> var2fct(6)
+}