2024-11-20 12:09:13 +01:00
|
|
|
#' Convert labelled vectors to factors while preserving attributes
|
|
|
|
#'
|
|
|
|
#' This extends [forcats::as_factor()] as well as [haven::as_factor()], by appending
|
|
|
|
#' original attributes except for "class" after converting to factor to avoid
|
|
|
|
#' data loss in case of rich formatted and labelled data.
|
|
|
|
#'
|
|
|
|
#' Please refer to parent functions for extended documentation.
|
2024-11-20 14:31:01 +01:00
|
|
|
#' To avoid redundant calls and errors, the parent functions are copy-pasted here.
|
2024-11-20 12:09:13 +01:00
|
|
|
#'
|
|
|
|
#' @param x Object to coerce to a factor.
|
|
|
|
#' @param ... Other arguments passed down to method.
|
|
|
|
#' @export
|
|
|
|
#' @examples
|
2024-11-20 14:31:01 +01:00
|
|
|
#' # will preserve all attributes
|
2024-11-20 12:09:13 +01:00
|
|
|
#' c(1, 4, 3, "A", 7, 8, 1) |> as_factor()
|
|
|
|
#' structure(c(1, 2, 3, 2, 10, 9),
|
|
|
|
#' labels = c(Unknown = 9, Refused = 10)
|
|
|
|
#' ) |>
|
2024-11-26 14:46:22 +01:00
|
|
|
#' as_factor() |>
|
|
|
|
#' dput()
|
2024-11-20 12:09:13 +01:00
|
|
|
#'
|
|
|
|
#' structure(c(1, 2, 3, 2, 10, 9),
|
|
|
|
#' labels = c(Unknown = 9, Refused = 10),
|
|
|
|
#' class = "haven_labelled"
|
|
|
|
#' ) |>
|
|
|
|
#' as_factor()
|
|
|
|
#' @importFrom forcats as_factor
|
|
|
|
#' @export
|
|
|
|
#' @name as_factor
|
|
|
|
as_factor <- function(x, ...) {
  # S3 generic: the method is selected from the class of `x`.
  UseMethod("as_factor")
}
|
|
|
|
|
2024-11-20 14:31:01 +01:00
|
|
|
#' @rdname as_factor
|
|
|
|
#' @export
|
|
|
|
as_factor.factor <- function(x, ...) {
  # Input is already a factor: hand it back untouched, attributes included.
  return(x)
}
|
|
|
|
|
2024-11-20 12:09:13 +01:00
|
|
|
#' @rdname as_factor
|
|
|
|
#' @export
|
|
|
|
as_factor.logical <- function(x, ...) {
|
|
|
|
labels <- get_attr(x)
|
2024-11-20 14:31:01 +01:00
|
|
|
x <- factor(x, levels = c("FALSE", "TRUE"))
|
|
|
|
set_attr(x, labels, overwrite = FALSE)
|
2024-11-20 12:09:13 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#' @rdname as_factor
|
|
|
|
#' @export
|
|
|
|
as_factor.numeric <- function(x, ...) {
  # Snapshot all attributes before the conversion.
  original_attrs <- get_attr(x)

  fct <- factor(x)

  # Re-attach the snapshot; attributes already present on the factor win.
  set_attr(fct, original_attrs, overwrite = FALSE)
}
|
|
|
|
|
|
|
|
#' @rdname as_factor
|
|
|
|
#' @export
|
|
|
|
as_factor.character <- function(x, ...) {
  # Snapshot all attributes before the conversion.
  original_attrs <- get_attr(x)

  fct <- if (possibly_roman(x)) {
    # Roman numerals: plain factor() conversion.
    factor(x)
  } else {
    # Anything else goes through forcats::fct_inorder(), carrying the
    # variable label along explicitly.
    structure(
      forcats::fct_inorder(x),
      label = attr(x, "label", exact = TRUE)
    )
  }

  # Re-attach the snapshot; attributes already present on the factor win.
  set_attr(fct, original_attrs, overwrite = FALSE)
}
|
|
|
|
|
2024-11-20 14:31:01 +01:00
|
|
|
#' @param ordered If `TRUE` create an ordered (ordinal) factor, if
|
|
|
|
#' `FALSE` (the default) create a regular (nominal) factor.
|
|
|
|
#' @param levels How to create the levels of the generated factor:
|
|
|
|
#'
|
|
|
|
#' * "default": uses labels where available, otherwise the values.
|
|
|
|
#' Labels are sorted by value.
|
|
|
|
#' * "both": like "default", but pastes together the level and value
|
|
|
|
#' * "labels": use only the labels; unlabelled values become `NA`
|
|
|
|
#' * "values": use only the values
|
2024-11-20 12:09:13 +01:00
|
|
|
#' @rdname as_factor
|
|
|
|
#' @export
|
2024-11-20 14:31:01 +01:00
|
|
|
as_factor.haven_labelled <- function(x, levels = c("default", "labels", "values", "both"),
                                     ordered = FALSE, ...) {
  # Snapshot all attributes so they can be re-attached after conversion.
  original_attrs <- get_attr(x)

  levels <- match.arg(levels)
  var_label <- attr(x, "label", exact = TRUE)
  value_labels <- attr(x, "labels")
  # Underlying data, stripped of the labelled class.
  raw <- vctrs::vec_data(x)

  if (levels == "values") {
    # Levels are the raw values: the labelled values when every observation
    # is labelled, otherwise the sorted distinct observations.
    value_levels <- if (all(x %in% value_labels)) {
      unname(value_labels)
    } else {
      sort(unique(raw))
    }
    fct <- factor(raw, value_levels, ordered = ordered)
  } else if (levels == "labels") {
    # Levels are the label names only; values are recoded to their label.
    label_names <- names(value_labels)
    recoded <- replace_with(raw, unname(value_labels), label_names)
    fct <- factor(recoded, unique(label_names), ordered = ordered)
  } else {
    # "default" or "both"
    if (levels == "both") {
      # Prefix every label with its value, e.g. "[9] Unknown".
      names(value_labels) <- paste0("[", value_labels, "] ", names(value_labels))
    }
    label_values <- unname(value_labels)
    label_names <- names(value_labels)

    # Replace each distinct value with its label (where one exists)
    distinct_vals <- unique(raw)
    level_names <- replace_with(distinct_vals, label_values, label_names)

    # Merge with the full label set so unused labels are kept, ordered by value
    ordered_all <- sort(
      c(stats::setNames(distinct_vals, level_names), value_labels),
      na.last = TRUE
    )

    fct <- factor(
      replace_with(raw, label_values, label_names),
      levels = unique(names(ordered_all)),
      ordered = ordered
    )
  }

  fct <- structure(fct, label = var_label)

  # Re-attach the snapshot; attributes already present on the factor win.
  set_attr(fct, original_attrs, overwrite = FALSE)
}

#' @export
#' @rdname as_factor
as_factor.labelled <- as_factor.haven_labelled
|
|
|
|
|
2024-11-20 14:31:01 +01:00
|
|
|
replace_with <- function(x, from, to) {
  stopifnot(length(from) == length(to))

  # Regular values first: position of each element of `x` within `from`.
  idx <- match(x, from, incomparables = NA)
  hit <- !is.na(idx)
  if (all(hit)) {
    # Every element is mapped: the result is taken entirely from `to`.
    result <- to[idx]
  } else {
    # Partial mapping: start from `x` and overwrite the mapped positions.
    result <- x
    result[hit] <- to[idx[hit]]
  }

  # Then tagged missing values, matched on their tag.
  if (any(haven::is_tagged_na(x))) {
    tag_idx <- match(haven::na_tag(x), haven::na_tag(from), incomparables = NA)
    tag_hit <- !is.na(tag_idx)
    result[tag_hit] <- to[tag_idx[tag_hit]]
  }

  result
}
|
2024-11-20 12:09:13 +01:00
|
|
|
|
|
|
|
|
|
|
|
#' Get named vector of factor levels and values
|
|
|
|
#'
|
|
|
|
#' @param data factor
|
|
|
|
#' @param label character string of attribute with named vector of factor labels
|
2024-11-20 12:40:29 +01:00
|
|
|
#' @param na.label character string to refactor NA values. Default is NULL.
|
|
|
|
#' @param na.value new value for NA strings. Ignored if na.label is NULL.
|
|
|
|
#' Default is 99.
|
2024-11-20 12:09:13 +01:00
|
|
|
#'
|
|
|
|
#' @return named vector
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @examples
|
2024-11-20 12:40:29 +01:00
|
|
|
#' \dontrun{
|
2024-11-20 12:09:13 +01:00
|
|
|
#' structure(c(1, 2, 3, 2, 10, 9),
|
|
|
|
#' labels = c(Unknown = 9, Refused = 10),
|
|
|
|
#' class = "haven_labelled"
|
2024-11-20 14:31:01 +01:00
|
|
|
#' ) |>
|
|
|
|
#' as_factor() |>
|
|
|
|
#' named_levels()
|
2024-11-20 12:40:29 +01:00
|
|
|
#' }
|
2024-11-20 14:31:01 +01:00
|
|
|
named_levels <- function(data, label = "labels", na.label = NULL, na.value = 99) {
  stopifnot(is.factor(data))

  if (!is.null(na.label)) {
    # Turn missing observations into an explicit level named `na.label`
    # with value `na.value`, then rebuild the factor around it.
    original_attrs <- attributes(data)

    level_text <- as.character(data)
    level_text[is.na(level_text)] <- na.label
    codes <- as.numeric(data)
    codes[is.na(codes)] <- na.value

    na_pairs <- unique(data.frame(name = level_text, value = codes))
    lookup <- sort(stats::setNames(na_pairs$value, na_pairs$name))

    data <- do.call(
      structure,
      c(
        list(.Data = match(codes, lookup)),
        original_attrs[-match("levels", names(original_attrs))],
        list(levels = names(lookup), labels = lookup)
      )
    )
  }

  lvl <- levels(data)

  # Handle empty factors
  pairs <- if (all_na(data)) {
    # No observed values: enumerate the declared levels.
    data.frame(name = lvl, value = seq_along(lvl))
  } else {
    # One row per observed level/code combination, missing rows dropped.
    stats::na.omit(
      unique(data.frame(name = lvl[data], value = as.numeric(data)))
    )
  }

  ## Applying labels
  label_map <- attr(x = data, which = label, exact = TRUE)
  if (length(label_map) != 0) {
    label_names <- names(label_map)
    if (all(label_names %in% pairs$name)) {
      # Every label names a known level: take the values from the labels.
      pairs$value[match(label_names, pairs$name)] <- unname(label_map)
    } else if (all(pairs$name %in% label_names) && nrow(pairs) < length(label_map)) {
      # The labels are a strict superset of the levels: use the labels as is.
      pairs <- data.frame(name = label_names, value = unname(label_map))
    } else {
      # Levels are stored as values: rename them to the label names first,
      # then set the values. The second match relies on the renamed names.
      pairs$name[match(label_map, pairs$name)] <- label_names
      pairs$value[match(label_names, pairs$name)] <- unname(label_map)
    }
  }

  out <- stats::setNames(pairs$value, pairs$name)

  ## Sort if levels are numeric
  ## Else, they appear in order of appearance
  # NOTE(review): possibly_numeric() inspects names(), and levels(data) is an
  # unnamed character vector -- confirm this condition fires as intended.
  if (possibly_numeric(lvl)) {
    out <- sort(out)
  }
  out
}
|
|
|
|
|
2024-11-21 11:18:38 +01:00
|
|
|
#' Test if vector can be interpreted as roman numerals
|
|
|
|
#'
|
|
|
|
#' @param data character vector
|
|
|
|
#'
|
|
|
|
#' @return logical
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @examples
|
2024-11-26 14:46:22 +01:00
|
|
|
#' sample(1:100, 10) |>
|
|
|
|
#' as.roman() |>
|
|
|
|
#' possibly_roman()
|
|
|
|
#' sample(c(TRUE, FALSE), 10, TRUE) |> possibly_roman()
|
|
|
|
#' rep(NA, 10) |> possibly_roman()
|
|
|
|
possibly_roman <- function(data) {
  # A vector holding nothing but missing values is never considered roman.
  if (all(is.na(data))) {
    return(FALSE)
  }

  # Roman only if a round trip through utils::as.roman() reproduces the
  # exact same character representation.
  as_text <- as.character(data)
  via_roman <- as.character(utils::as.roman(data))
  identical(as_text, via_roman)
}
|
|
|
|
|
2024-11-20 12:09:13 +01:00
|
|
|
|
|
|
|
#' Allows conversion of factor to numeric values preserving original levels
|
|
|
|
#'
|
|
|
|
#' @param data vector
|
|
|
|
#'
|
|
|
|
#' @return numeric vector
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @examples
|
|
|
|
#' c(1, 4, 3, "A", 7, 8, 1) |>
|
2024-11-20 14:31:01 +01:00
|
|
|
#' as_factor() |>
|
|
|
|
#' fct2num()
|
2024-11-20 12:09:13 +01:00
|
|
|
#'
|
|
|
|
#' structure(c(1, 2, 3, 2, 10, 9),
|
|
|
|
#' labels = c(Unknown = 9, Refused = 10),
|
|
|
|
#' class = "haven_labelled"
|
|
|
|
#' ) |>
|
|
|
|
#' as_factor() |>
|
|
|
|
#' fct2num()
|
|
|
|
#'
|
|
|
|
#' structure(c(1, 2, 3, 2, 10, 9),
|
2024-11-20 14:31:01 +01:00
|
|
|
#' labels = c(Unknown = 9, Refused = 10),
|
|
|
|
#' class = "labelled"
|
2024-11-20 12:09:13 +01:00
|
|
|
#' ) |>
|
|
|
|
#' as_factor() |>
|
|
|
|
#' fct2num()
|
2024-11-20 14:31:01 +01:00
|
|
|
#'
|
|
|
|
#' # Outlier with labels, but no class of origin, handled like numeric vector
|
|
|
|
#' # structure(c(1, 2, 3, 2, 10, 9),
|
|
|
|
#' # labels = c(Unknown = 9, Refused = 10)
|
|
|
|
#' # ) |>
|
|
|
|
#' # as_factor() |>
|
|
|
|
#' # fct2num()
|
|
|
|
#'
|
2024-11-26 14:46:22 +01:00
|
|
|
#' v <- sample(6:19, 20, TRUE) |> factor()
|
2024-11-20 14:31:01 +01:00
|
|
|
#' dput(v)
|
|
|
|
#' named_levels(v)
|
|
|
|
#' fct2num(v)
|
2024-11-20 12:09:13 +01:00
|
|
|
fct2num <- function(data) {
  stopifnot(is.factor(data))

  # Build the level-name -> value lookup once; it is needed both for the
  # values and for matching observations on level names.
  lookup <- named_levels(data)

  if (is.character(lookup)) {
    # Character values are coerced to numeric. as.numeric() drops the names,
    # so `out` below is unnamed in this branch.
    values <- as.numeric(lookup)
  } else {
    values <- lookup
  }

  # One value per observation, looked up by the observation's level name.
  out <- values[match(data, names(lookup))]

  ## If no NA on numeric coercion, of original names, then return
  ## original numeric names, else values
  if (possibly_numeric(out)) {
    out <- as.numeric(names(out))
  }
  unname(out)
}
|
|
|
|
|
2024-11-26 14:46:22 +01:00
|
|
|
possibly_numeric <- function(data) {
  # Try to read the *names* of `data` as numbers; failures become NA and the
  # coercion warning is silenced.
  parsed <- suppressWarnings(as.numeric(names(data)))

  # TRUE only when every element has a name that parsed to a number.
  sum(!is.na(parsed)) == length(data)
}
|
|
|
|
|
|
|
|
#' Extract attribute. Returns NA if none
|
|
|
|
#'
|
|
|
|
#' @param data vector
|
|
|
|
#' @param attr attribute name
|
|
|
|
#'
|
|
|
|
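#' @return the requested attribute, or `NA` if it is not set. If `attr` is `NULL`, all attributes as returned by `attributes()`.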
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @examples
|
|
|
|
#' attr(mtcars$mpg, "label") <- "testing"
|
2024-11-20 14:31:01 +01:00
|
|
|
#' do.call(c, sapply(mtcars, get_attr))
|
2024-11-20 12:40:29 +01:00
|
|
|
#' \dontrun{
|
2024-11-20 12:09:13 +01:00
|
|
|
#' mtcars |>
|
|
|
|
#' numchar2fct(numeric.threshold = 6) |>
|
|
|
|
#' ds2dd_detailed()
|
2024-11-20 12:40:29 +01:00
|
|
|
#' }
|
2024-11-20 12:09:13 +01:00
|
|
|
get_attr <- function(data, attr = NULL) {
  # No attribute name given: return everything attributes() reports.
  if (is.null(attr)) {
    return(attributes(data))
  }

  # Exact-name lookup; a missing attribute is reported as NA, not NULL.
  found <- base::attr(data, attr, exact = TRUE)
  if (is.null(found)) NA else found
}
|
|
|
|
|
|
|
|
|
|
|
|
#' Set attributes for named attribute. Appends if attr is NULL
|
|
|
|
#'
|
|
|
|
#' @param data vector
|
|
|
|
#' @param label label
|
|
|
|
#' @param attr attribute name
|
2024-11-20 12:40:29 +01:00
|
|
|
#' @param overwrite overwrite existing attributes. Default is FALSE.
|
2024-11-20 12:09:13 +01:00
|
|
|
#'
|
|
|
|
#' @return vector with attribute
|
|
|
|
#' @export
|
|
|
|
#'
|
2024-11-20 14:31:01 +01:00
|
|
|
set_attr <- function(data, label, attr = NULL, overwrite = FALSE) {
  if (is.null(attr)) {
    ## Has to be a named list
    ## Will not fail, but just return original data
    if (!is.list(label) || length(label) != length(names(label))) {
      return(data)
    }

    ## Only include named labels: entries with a missing or empty name
    ## cannot be stored as attributes and would make `attributes<-` fail.
    label_names <- names(label)
    label <- label[!is.na(label_names) & nzchar(label_names)]

    if (!overwrite) {
      ## Keep whatever `data` already carries; only add what is new.
      label <- label[!names(label) %in% names(attributes(data))]
    }
    attributes(data) <- c(attributes(data), label)
  } else {
    ## Single named attribute: set it directly.
    attr(data, attr) <- label
  }
  data
}
|
|
|
|
|
|
|
|
#' Finish incomplete haven attributes substituting missings with values
|
|
|
|
#'
|
|
|
|
#' @param data haven labelled variable
|
|
|
|
#'
|
|
|
|
#' @return named vector
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @examples
|
|
|
|
#' ds <- structure(c(1, 2, 3, 2, 10, 9),
|
|
|
|
#' labels = c(Unknown = 9, Refused = 10),
|
|
|
|
#' class = "haven_labelled"
|
|
|
|
#' )
|
|
|
|
#' haven::is.labelled(ds)
|
|
|
|
#' attributes(ds)
|
|
|
|
#' ds |> haven_all_levels()
|
|
|
|
haven_all_levels <- function(data) {
  stopifnot(haven::is.labelled(data))

  value_labels <- attributes(data)$labels

  # As many labels as distinct values: the labels are returned unchanged.
  # NOTE(review): this compares counts only, not the values themselves --
  # confirm that equal counts always mean every value is labelled.
  if (length(value_labels) == length(unique(data))) {
    return(value_labels)
  }

  # Otherwise prepend the distinct unlabelled values, each named by itself.
  unlabelled <- unique(data[!data %in% value_labels])
  stats::setNames(
    c(unlabelled, value_labels),
    c(unlabelled, names(value_labels))
  )
}
|