From c3b54b08606cc0f9df6dca750b457775608cb44f Mon Sep 17 00:00:00 2001 From: Andreas Gammelgaard Damsbo Date: Wed, 20 Nov 2024 12:09:13 +0100 Subject: [PATCH] as_factor functions to preserve attributes --- R/as_factor.R | 249 ++++++++++++++++++++++++++++++++++++++++ R/ds2dd_detailed.R | 117 +++---------------- man/as_factor.Rd | 51 ++++++++ man/fct2num.Rd | 34 ++++++ man/get_attr.Rd | 2 +- man/haven_all_levels.Rd | 2 +- man/named_levels.Rd | 25 ++++ man/set_attr.Rd | 2 +- 8 files changed, 378 insertions(+), 104 deletions(-) create mode 100644 R/as_factor.R create mode 100644 man/as_factor.Rd create mode 100644 man/fct2num.Rd create mode 100644 man/named_levels.Rd diff --git a/R/as_factor.R b/R/as_factor.R new file mode 100644 index 0000000..5e4e912 --- /dev/null +++ b/R/as_factor.R @@ -0,0 +1,249 @@ +#' Convert labelled vectors to factors while preserving attributes +#' +#' This extends [forcats::as_factor()] as well as [haven::as_factor()], by appending +#' original attributes except for "class" after converting to factor to avoid +#' ta loss in case of rich formatted and labelled data. +#' +#' Please refer to parent functions for extended documentation. +#' +#' @param x Object to coerce to a factor. +#' @param ... Other arguments passed down to method. +#' @export +#' @examples +#' # will preserve all attributes but class +#' c(1, 4, 3, "A", 7, 8, 1) |> as_factor() +#' structure(c(1, 2, 3, 2, 10, 9), +#' labels = c(Unknown = 9, Refused = 10) +#' ) |> +#' as_factor() +#' +#' structure(c(1, 2, 3, 2, 10, 9), +#' labels = c(Unknown = 9, Refused = 10), +#' class = "haven_labelled" +#' ) |> +#' as_factor() +#' +#' @importFrom forcats as_factor +#' @importFrom rlang check_dots_used +#' @export +#' @name as_factor +as_factor <- function(x, ...) { + rlang::check_dots_used() + UseMethod("as_factor") +} + +#' @rdname as_factor +#' @export +as_factor.logical <- function(x, ...) { + labels <- get_attr(x) + x <- forcats::as_factor(x, ...) + set_attr(x, labels[-match("class", names(labels))]) +} + +#' @rdname as_factor +#' @export +as_factor.numeric <- function(x, ...) { + labels <- get_attr(x) + x <- forcats::as_factor(x, ...) + set_attr(x, labels[-match("class", names(labels))]) +} + +#' @rdname as_factor +#' @export +as_factor.character <- function(x, ...) { + labels <- get_attr(x) + x <- forcats::as_factor(x, ...) + set_attr(x, labels[-match("class", names(labels))]) +} + +#' @rdname as_factor +#' @export +as_factor.haven_labelled <- function(x, ...) { + labels <- get_attr(x) + x <- haven::as_factor(x, ...) + set_attr(x, labels[-match("class", names(labels))]) +} + +#' @export +#' @rdname as_factor +as_factor.labelled <- as_factor.haven_labelled + + + +#' Get named vector of factor levels and values +#' +#' @param data factor +#' @param label character string of attribute with named vector of factor labels +#' +#' @return named vector +#' @export +#' +#' @examples +#' structure(c(1, 2, 3, 2, 10, 9), +#' labels = c(Unknown = 9, Refused = 10), +#' class = "haven_labelled" +#' ) |> as_factor() |> named_levels() +named_levels <- function(data, label = "labels",na.label=NULL, na.value=99) { + stopifnot(is.factor(data)) + if (!is.null(na.label)){ + attrs <- attributes(data) + lvls <- as.character(data) + lvls[is.na(lvls)] <- na.label + vals <- as.numeric(data) + vals[is.na(vals)] <- na.value + + lbls <- data.frame( + name = lvls, + value = vals + ) |> unique() |> + (\(d){ + stats::setNames(d$value, d$name) + })() |> + sort() + + data <- do.call(structure, + c(list(.Data=match(vals,lbls)), + attrs[-match("levels", names(attrs))], + list(levels=names(lbls), + labels=lbls))) + } + + d <- data.frame( + name = levels(data)[data], + value = as.numeric(data) + ) |> + unique() + + ## Applying labels + attr_l <- attr(x = data, which = label, exact = TRUE) + if (length(attr_l) != 0) { + d$value[match(names(attr_l), d$name)] <- unname(attr_l) + } + + out <- stats::setNames(d$value, d$name) + ## Sort if levels are numeric + ## Else, they appear in order of appearance + if (identical( + levels(data), + suppressWarnings(as.character(as.numeric(levels(data)))) + )) { + out <- out |> sort() + } + out +} + + +#' Allows conversion of factor to numeric values preserving original levels +#' +#' @param data vector +#' +#' @return numeric vector +#' @export +#' +#' @examples +#' c(1, 4, 3, "A", 7, 8, 1) |> +#' as_factor() |> fct2num() +#' +#' structure(c(1, 2, 3, 2, 10, 9), +#' labels = c(Unknown = 9, Refused = 10), +#' class = "haven_labelled" +#' ) |> +#' as_factor() |> +#' fct2num() +#' +#' structure(c(1, 2, 3, 2, 10, 9), +#' labels = c(Unknown = 9, Refused = 10) +#' ) |> +#' as_factor() |> +#' fct2num() +fct2num <- function(data) { + stopifnot(is.factor(data)) + as.numeric(named_levels(data))[match(data, names(named_levels(data)))] +} + +#' Extract attribute. Returns NA if none +#' +#' @param data vector +#' @param attr attribute name +#' +#' @return character vector +#' @export +#' +#' @examples +#' attr(mtcars$mpg, "label") <- "testing" +#' sapply(mtcars, get_attr) +#' lapply(mtcars, \(.x)get_attr(.x, NULL)) +#' mtcars |> +#' numchar2fct(numeric.threshold = 6) |> +#' ds2dd_detailed() +get_attr <- function(data, attr = NULL) { + if (is.null(attr)) { + attributes(data) + } else { + a <- attr(data, attr, exact = TRUE) + if (is.null(a)) { + NA + } else { + a + } + } +} + + +#' Set attributes for named attribute. Appends if attr is NULL +#' +#' @param data vector +#' @param label label +#' @param attr attribute name +#' +#' @return vector with attribute +#' @export +#' +set_attr <- function(data, label, attr = NULL) { + if (is.null(attr)) { + ## Has to be list... + stopifnot(is.list(label)) + ## ... with names + stopifnot(length(label)==length(names(label))) + attributes(data) <- c(attributes(data),label) + } else { + attr(data, attr) <- label + } + data +} + +#' Finish incomplete haven attributes substituting missings with values +#' +#' @param data haven labelled variable +#' +#' @return named vector +#' @export +#' +#' @examples +#' ds <- structure(c(1, 2, 3, 2, 10, 9), +#' labels = c(Unknown = 9, Refused = 10), +#' class = "haven_labelled" +#' ) +#' haven::is.labelled(ds) +#' attributes(ds) +#' ds |> haven_all_levels() +haven_all_levels <- function(data) { + stopifnot(haven::is.labelled(data)) + if (length(attributes(data)$labels) == length(unique(data))) { + out <- attributes(data)$labels + } else { + att <- attributes(data)$labels + out <- c(unique(data[!data %in% att]), att) |> + stats::setNames(c(unique(data[!data %in% att]), names(att))) + } + out +} + +# readr::read_rds("/Users/au301842/PAaSO/labelled_test.rds") |> ds2dd_detailed() +#' sample(c(TRUE,FALSE,NA),20,TRUE) |> set_attr("hidden","status") |> trial_fct() |> named_levels(na.label = "Missing") |> sort() +# trial_fct <- function(x){ +# labels <- get_attr(x) +# x <- factor(x, levels = c("FALSE", "TRUE")) +# set_attr(x, labels[-match("class", names(labels))]) +# } + diff --git a/R/ds2dd_detailed.R b/R/ds2dd_detailed.R index e01c9cd..13a515d 100644 --- a/R/ds2dd_detailed.R +++ b/R/ds2dd_detailed.R @@ -172,20 +172,20 @@ ds2dd_detailed <- function(data, if (convert.logicals) { # Labels/attributes are saved - labels <- lapply(data, \(.x){ - get_attr(.x, attr = NULL) - }) + # labels <- lapply(data, \(.x){ + # get_attr(.x, attr = NULL) + # }) - no_attr <- data |> + data <- data |> ## Converts logical to factor, which overwrites attributes - dplyr::mutate(dplyr::across(dplyr::where(is.logical), forcats::as_factor)) + dplyr::mutate(dplyr::across(dplyr::where(is.logical), as_factor)) # Old attributes are appended - data <- purrr::imap(no_attr,\(.x,.i){ - attributes(.x) <- c(attributes(.x),labels[[.i]]) - .x - }) |> - dplyr::bind_cols() + # data <- purrr::imap(no_attr,\(.x,.i){ + # attributes(.x) <- c(attributes(.x),labels[[.i]]) + # .x + # }) |> + # dplyr::bind_cols() } @@ -262,7 +262,6 @@ ds2dd_detailed <- function(data, } } - data_classes <- do.call(c, lapply(data, \(.x)class(.x)[1])) ## field_type @@ -308,27 +307,15 @@ ds2dd_detailed <- function(data, ## choices - if (any(do.call(c, lapply(data, haven::is.labelled)))) { - factor_levels <- data |> - lapply(function(x) { - if (haven::is.labelled(x)) { - att <- haven_all_levels(x) - paste(paste(att, names(att), sep = ", "), collapse = " | ") - } else { - NA - } - }) |> - (\(x)do.call(c, x))() - } else { - factor_levels <- data |> + factor_levels <- data |> lapply(function(x) { if (is.factor(x)) { - ## Re-factors to avoid confusion with missing levels - ## Assumes all relevant levels are represented in the data - re_fac <- factor(x) + ## Custom function to ensure factor order and keep original values + ## Avoiding refactoring to keep as much information as possible + lvls <- sort(named_levels(x)) paste( - paste(seq_along(levels(re_fac)), - levels(re_fac), + paste(lvls, + names(lvls), sep = ", " ), collapse = " | " @@ -338,7 +325,6 @@ ds2dd_detailed <- function(data, } }) |> (\(x)do.call(c, x))() - } dd <- dd |> dplyr::mutate( @@ -357,33 +343,6 @@ ds2dd_detailed <- function(data, ) } -#' Finish incomplete haven attributes substituting missings with values -#' -#' @param data haven labelled variable -#' -#' @return named vector -#' @export -#' -#' @examples -#' ds <- structure(c(1, 2, 3, 2, 10, 9), -#' labels = c(Unknown = 9, Refused = 10), -#' class = "haven_labelled" -#' ) -#' haven::is.labelled(ds) -#' attributes(ds) -#' ds |> haven_all_levels() -haven_all_levels <- function(data) { - stopifnot(haven::is.labelled(data)) - if (length(attributes(data)$labels) == length(unique(data))) { - out <- attributes(data)$labels - } else { - att <- attributes(data)$labels - out <- c(unique(data[!data %in% att]), att) |> - stats::setNames(c(unique(data[!data %in% att]), names(att))) - } - out -} - #' Guess time variables based on naming pattern #' @@ -567,50 +526,6 @@ numchar2fct <- function(data, numeric.threshold = 6, character.throshold = 6) { ) } -#' Extract attribute. Returns NA if none -#' -#' @param data vector -#' @param attr attribute name -#' -#' @return character vector -#' @export -#' -#' @examples -#' attr(mtcars$mpg, "label") <- "testing" -#' sapply(mtcars, get_attr) -#' lapply(mtcars, \(.x)get_attr(.x, NULL)) -#' mtcars |> -#' numchar2fct(numeric.threshold = 6) |> -#' ds2dd_detailed() -get_attr <- function(data, attr = NULL) { - if (is.null(attr)) { - attributes(data) - } else { - a <- attr(data, attr, exact = TRUE) - if (is.null(a)) { - NA - } else { - a - } - } -} -#' Set attributes for named attribute. Appends if attr is NULL -#' -#' @param data vector -#' @param label label -#' @param attr attribute name -#' -#' @return vector with attribute -#' @export -#' -set_attr <- function(data, label, attr = NULL) { - if (is.null(attr)) { - attributes(data) <- c(attributes(data),label) - } else { - attr(data, attr) <- label - } - data -} diff --git a/man/as_factor.Rd b/man/as_factor.Rd new file mode 100644 index 0000000..a681fc9 --- /dev/null +++ b/man/as_factor.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_factor.R +\name{as_factor} +\alias{as_factor} +\alias{as_factor.logical} +\alias{as_factor.numeric} +\alias{as_factor.character} +\alias{as_factor.haven_labelled} +\alias{as_factor.labelled} +\title{Convert labelled vectors to factors while preserving attributes} +\usage{ +as_factor(x, ...) + +\method{as_factor}{logical}(x, ...) + +\method{as_factor}{numeric}(x, ...) + +\method{as_factor}{character}(x, ...) + +\method{as_factor}{haven_labelled}(x, ...) + +\method{as_factor}{labelled}(x, ...) +} +\arguments{ +\item{x}{Object to coerce to a factor.} + +\item{...}{Other arguments passed down to method.} +} +\description{ +This extends [forcats::as_factor()] as well as [haven::as_factor()], by appending +original attributes except for "class" after converting to factor to avoid +ta loss in case of rich formatted and labelled data. +} +\details{ +Please refer to parent functions for extended documentation. +} +\examples{ +# will preserve all attributes but class +c(1, 4, 3, "A", 7, 8, 1) |> as_factor() +structure(c(1, 2, 3, 2, 10, 9), + labels = c(Unknown = 9, Refused = 10) +) |> + as_factor() + +structure(c(1, 2, 3, 2, 10, 9), + labels = c(Unknown = 9, Refused = 10), + class = "haven_labelled" +) |> + as_factor() + +} diff --git a/man/fct2num.Rd b/man/fct2num.Rd new file mode 100644 index 0000000..2d6c99e --- /dev/null +++ b/man/fct2num.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_factor.R +\name{fct2num} +\alias{fct2num} +\title{Allows conversion of factor to numeric values preserving original levels} +\usage{ +fct2num(data) +} +\arguments{ +\item{data}{vector} +} +\value{ +numeric vector +} +\description{ +Allows conversion of factor to numeric values preserving original levels +} +\examples{ +c(1, 4, 3, "A", 7, 8, 1) |> +as_factor() |> fct2num() + +structure(c(1, 2, 3, 2, 10, 9), + labels = c(Unknown = 9, Refused = 10), + class = "haven_labelled" +) |> + as_factor() |> + fct2num() + +structure(c(1, 2, 3, 2, 10, 9), + labels = c(Unknown = 9, Refused = 10) +) |> + as_factor() |> + fct2num() +} diff --git a/man/get_attr.Rd b/man/get_attr.Rd index a0a5f5e..b6757f6 100644 --- a/man/get_attr.Rd +++ b/man/get_attr.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ds2dd_detailed.R +% Please edit documentation in R/as_factor.R \name{get_attr} \alias{get_attr} \title{Extract attribute. Returns NA if none} diff --git a/man/haven_all_levels.Rd b/man/haven_all_levels.Rd index 6805752..7eccfde 100644 --- a/man/haven_all_levels.Rd +++ b/man/haven_all_levels.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ds2dd_detailed.R +% Please edit documentation in R/as_factor.R \name{haven_all_levels} \alias{haven_all_levels} \title{Finish incomplete haven attributes substituting missings with values} diff --git a/man/named_levels.Rd b/man/named_levels.Rd new file mode 100644 index 0000000..e1481f6 --- /dev/null +++ b/man/named_levels.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_factor.R +\name{named_levels} +\alias{named_levels} +\title{Get named vector of factor levels and values} +\usage{ +named_levels(data, label = "labels", na.label = NULL, na.value = 99) +} +\arguments{ +\item{data}{factor} + +\item{label}{character string of attribute with named vector of factor labels} +} +\value{ +named vector +} +\description{ +Get named vector of factor levels and values +} +\examples{ +structure(c(1, 2, 3, 2, 10, 9), + labels = c(Unknown = 9, Refused = 10), + class = "haven_labelled" +) |> as_factor() |> named_levels() +} diff --git a/man/set_attr.Rd b/man/set_attr.Rd index 8fcf43c..a7b3884 100644 --- a/man/set_attr.Rd +++ b/man/set_attr.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ds2dd_detailed.R +% Please edit documentation in R/as_factor.R \name{set_attr} \alias{set_attr} \title{Set attributes for named attribute. Appends if attr is NULL}