From 21e635d7758bb382a5f6d048fbb9e95ede37a029 Mon Sep 17 00:00:00 2001 From: Andreas Gammelgaard Damsbo Date: Thu, 18 Jan 2024 14:57:12 +0100 Subject: [PATCH] new function`ds2dd_detailed()`which includes more details than the old `ds2dd()`. --- R/ds2dd_detailed.R | 343 ++++++++++++++++++++++++++++++++++ data/redcapcast_data.rda | Bin 940 -> 1096 bytes man/ds2dd.Rd | 12 +- man/ds2dd_detailed.Rd | 82 ++++++++ man/guess_time_only_filter.Rd | 36 ++++ man/hms2character.Rd | 21 +++ man/mark_complete.Rd | 19 ++ man/time_only_correction.Rd | 25 +++ 8 files changed, 534 insertions(+), 4 deletions(-) create mode 100644 R/ds2dd_detailed.R create mode 100644 man/ds2dd_detailed.Rd create mode 100644 man/guess_time_only_filter.Rd create mode 100644 man/hms2character.Rd create mode 100644 man/mark_complete.Rd create mode 100644 man/time_only_correction.Rd diff --git a/R/ds2dd_detailed.R b/R/ds2dd_detailed.R new file mode 100644 index 0000000..909db62 --- /dev/null +++ b/R/ds2dd_detailed.R @@ -0,0 +1,343 @@ +utils::globalVariables(c( "stats::setNames", "field_name", "field_type", "select_choices_or_calculations")) +#' Try at determining which are true time only variables +#' +#' @description +#' This is just a try at guessing data type based on data class and column names +#' hoping for a tiny bit of naming consistency. R does not include a time-only +#' data format natively, so the "hms" class from `readr` is used. This +#' has to be converted to character class before REDCap upload. +#' +#' @param data data set +#' @param validate flag to output validation data. Will output list. +#' @param sel.pos Positive selection regex string +#' @param sel.neg Negative selection regex string +#' +#' @return character vector or list depending on `validate` flag. +#' @export +#' +#' @examples +#' data <- redcapcast_data +#' data |> guess_time_only_filter() +#' data |> guess_time_only_filter(validate = TRUE) |> lapply(head) +guess_time_only_filter <- function(data, validate = FALSE, sel.pos = "[Tt]i[d(me)]", sel.neg = "[Dd]at[eo]") { + datetime_nms <- data |> + lapply(\(x)any(c("POSIXct","hms") %in% class(x))) |> + (\(x) names(data)[do.call(c, x)])() + + time_only_log <- datetime_nms |> (\(x) { + ## Detects which are determined true Time only variables + ## Inspection is necessary + grepl(pattern = sel.pos, x = x) & + !grepl(pattern = sel.neg, x = x) + })() + + if (validate) { + list( + "is.POSIX" = data[datetime_nms], + "is.datetime" = data[datetime_nms[!time_only_log]], + "is.time_only" = data[datetime_nms[time_only_log]] + ) + } else { + datetime_nms[time_only_log] + } +} + +#' Correction based on time_only_filter function. Introduces new class for easier +#' validation labelling. +#' +#' @description +#' Dependens on the data class "hms" introduced with +#' `guess_time_only_filter()` and converts these +#' +#' @param data data set +#' @param ... arguments passed on to `guess_time_only_filter()` +#' +#' @return tibble +#' @importFrom readr parse_time +#' +#' @examples +#' data <- redcapcast_data +#' ## data |> time_only_correction() +time_only_correction <- function(data, ...) { + nms <- guess_time_only_filter(data, ...) + z <- nms |> + lapply(\(y) { + readr::parse_time(format(data[[y]], format = "%H:%M:%S")) + }) |> + suppressMessages(dplyr::bind_cols()) |> + stats::setNames(nm = nms) + data[nms] <- z + data +} + +#' Change "hms" to "character" for REDCap upload. +#' +#' @param data data set +#' +#' @return data.frame or tibble +#' +#' @examples +#' data <- redcapcast_data +#' ## data |> time_only_correction() |> hms2character() +hms2character <- function(data) { + data |> + lapply(function(x) { + if ("hms" %in% class(x)) { + as.character(x) + } else { + x + } + }) |> + dplyr::bind_cols() +} + +#' Extract data from stata file for data dictionary +#' +#' @details +#' This function is a natural development of the ds2dd() function. It assumes +#' that the first column is the ID-column. No checks. +#' Please, do always inspect the data dictionary before upload. +#' +#' Ensure, that the data set is formatted with as much information as possible. +#' +#' `field.type` can be supplied +#' +#' @param data data frame +#' @param date.format date format, character string. ymd/dmy/mdy. dafault is +#' dmy. +#' @param add.auto.id flag to add id column +#' @param form.name manually specify form name(s). Vector of length 1 or +#' ncol(data). Default is NULL and "data" is used. +#' @param field.type manually specify field type(s). Vector of length 1 or +#' ncol(data). Default is NULL and "text" is used for everything but factors, +#' which wil get "radio". +#' @param field.label manually specify field label(s). Vector of length 1 or +#' ncol(data). Default is NULL and colnames(data) is used or attribute +#' `field.label.attr` for haven_labelled data set (imported .dta file with +#' `haven::read_dta()`). +#' @param field.label.attr attribute name for named labels for haven_labelled +#' data set (imported .dta file with `haven::read_dta()`. Default is "label" +#' @param field.validation manually specify field validation(s). Vector of +#' length 1 or ncol(data). Default is NULL and `levels()` are used for factors +#' or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with +#' `haven::read_dta()`). +#' @param metadata redcap metadata headings. Default is +#' REDCapCAST:::metadata_names. +#' @param validate.time Flag to validate guessed time columns +#' @param time.var.sel.pos Positive selection regex string passed to +#' `gues_time_only_filter()` as sel.pos. +#' @param time.var.sel.neg Negative selection regex string passed to +#' `gues_time_only_filter()` as sel.neg. +#' +#' @return list of length 2 +#' @export +#' +#' @examples +#' data <- redcapcast_data +#' data |> ds2dd_detailed(validate.time = TRUE) +#' data |> ds2dd_detailed() +#' iris |> ds2dd_detailed(add.auto.id = TRUE) +#' mtcars |> ds2dd_detailed(add.auto.id = TRUE) +ds2dd_detailed <- function(data, + add.auto.id = FALSE, + date.format = "dmy", + form.name = NULL, + field.type = NULL, + field.label = NULL, + field.label.attr ="label", + field.validation = NULL, + metadata = metadata_names, + validate.time = FALSE, + time.var.sel.pos = "[Tt]i[d(me)]", + time.var.sel.neg = "[Dd]at[eo]") { + ## Handles the odd case of no id column present + if (add.auto.id) { + data <- dplyr::tibble( + default_trial_id = seq_len(nrow(data)), + data + ) + message("A default id column has been added") + } + + if (validate.time) { + return(data |> guess_time_only_filter(validate = TRUE)) + } + + if (lapply(data, haven::is.labelled) |> (\(x)do.call(c, x))() |> any()) { + message("Data seems to be imported with haven from a Stata (.dta) file and will be treated as such.") + data.source <- "dta" + } else { + data.source <- "" + } + + ## data classes + + ### Only keeps the first class, as time fields (POSIXct/POSIXt) has two classes + if (data.source == "dta") { + data_classes <- + data |> + haven::as_factor() |> + time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |> + lapply(\(x)class(x)[1]) |> + (\(x)do.call(c, x))() + } else { + data_classes <- + data |> + time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |> + lapply(\(x)class(x)[1]) |> + (\(x)do.call(c, x))() + } + + ## --------------------------------------- + ## Building the data dictionary + ## --------------------------------------- + + ## skeleton + + dd <- data.frame(matrix(ncol = length(metadata), nrow = ncol(data))) |> + stats::setNames(metadata) |> + dplyr::tibble() + + dd$field_name <- gsub(" ", "_", tolower(colnames(data))) + + ## form_name + if (is.null(form.name)) { + dd$form_name <- "data" + } else { + if (length(form.name) == 1 | length(form.name) == nrow(dd)) { + dd$form_name <- form.name + } else { + stop("Length of supplied 'form.name' has to be one (1) or ncol(data).") + } + } + + ## field_label + + if (is.null(field.label)) { + if (data.source == "dta") { + label <- data |> + lapply(function(x) { + if (haven::is.labelled(x)) { + attributes(x)[[field.label.attr]] + } else { + NA + } + }) |> + (\(x)do.call(c, x))() + } else { + label <- data |> colnames() + } + + dd <- + dd |> dplyr::mutate(field_label = dplyr::if_else(is.na(label), field_name, label)) + } else { + if (length(field.label) == 1 | length(field.label) == nrow(dd)) { + dd$field_label <- field.label + } else { + stop("Length of supplied 'field.label' has to be one (1) or ncol(data).") + } + } + + + ## field_type + + if (is.null(field.type)) { + dd$field_type <- "text" + + dd <- + dd |> dplyr::mutate(field_type = dplyr::if_else(data_classes == "factor", "radio", field_type)) + } else { + if (length(field.type) == 1 | length(field.type) == nrow(dd)) { + dd$field_type <- field.type + } else { + stop("Length of supplied 'field.type' has to be one (1) or ncol(data).") + } + } + + ## validation + + if (is.null(field.validation)) { + dd <- + dd |> dplyr::mutate( + text_validation_type_or_show_slider_number = dplyr::case_when( + data_classes == "Date" ~ paste0("date_", date.format), + data_classes == + "hms" ~ "time_hh_mm_ss", + ## Self invented format after filtering + data_classes == + "POSIXct" ~ paste0("datetime_", date.format), + data_classes == + "numeric" ~ "number" + ) + ) + } else { + if (length(field.validation) == 1 | length(field.validation) == nrow(dd)) { + dd$text_validation_type_or_show_slider_number <- field.validation + } else { + stop("Length of supplied 'field.validation' has to be one (1) or ncol(data).") + } + } + + + + ## choices + + if (data.source == "dta") { + factor_levels <- data |> + lapply(function(x) { + if (haven::is.labelled(x)) { + att <- attributes(x)$labels + paste(paste(att, names(att), sep = ", "), collapse = " | ") + } else { + NA + } + }) |> + (\(x)do.call(c, x))() + } else { + factor_levels <- data |> + lapply(function(x) { + if (is.factor(x)) { + ## Re-factors to avoid confusion with missing levels + ## Assumes alle relevant levels are represented in the data + re_fac <- factor(x) + paste(paste(unique(as.numeric(re_fac)), levels(re_fac), sep = ", "), collapse = " | ") + } else { + NA + } + }) |> + (\(x)do.call(c, x))() + } + + dd <- + dd |> dplyr::mutate( + select_choices_or_calculations = dplyr::if_else( + is.na(factor_levels), + select_choices_or_calculations, + factor_levels + ) + ) + + list( + data = data |> + time_only_correction(sel.pos = time.var.sel.pos, sel.neg = time.var.sel.neg) |> + hms2character() |> + (\(x)stats::setNames(x, tolower(names(x))))(), + meta = dd + ) +} + +### Completion +#' Completion marking based on completed upload +#' +#' @param upload output list from `REDCapR::redcap_write()` +#' @param ls output list from `ds2dd_detailed()` +#' +#' @return list with `REDCapR::redcap_write()` results +mark_complete <- function(upload, ls){ + data <- ls$data + meta <- ls$meta + forms <- unique(meta$form_name) + cbind(data[[1]][data[[1]] %in% upload$affected_ids], + data.frame(matrix(2,ncol=length(forms),nrow=upload$records_affected_count))) |> + stats::setNames(c(names(data)[1],paste0(forms,"_complete"))) +} diff --git a/data/redcapcast_data.rda b/data/redcapcast_data.rda index 5095059e643f3adb2178cd777f4ab90abad1ea46..907c2c9b9a883d51e932a99c23527716d9c04705 100644 GIT binary patch literal 1096 zcmV-O1h@M_T4*^jL0KkKS$vKtw*UsGfB*mg|HMe?f9n3>-(vs&-}K^w04Ll4h)%*F z5wMpDQTfmV`^#oV)|S}r*I<%Kk*b<`Pe}D1h{PH_LqpU6G&D2=O)@lkjSo-&dW`|2 z)B);Y8hV(7P?3|=+5|LY!&4JY3_}FK36N-HXa*qD047YC8e(aJ8eszo6I9tm&@=OafpDf;7_r0$>Et(8Rz1001Khz#~jdh5(63NuoVR zqa@QNK|KMX7??v$LrpLM0BT?(2x!wLYGD9qZ6+ZEgMvL4*jQI3y_|%4U`NZrIV(hC zNlPd~EJ&JZ?jn!00bsZZbtIzD#-OG$M98&8s79CSsR0E}WQU2u5@z8{OY$X(=Jt zts_0!s9^`=1i?OdTe+2Dqmq~DU8&Eh0K^sk@aU6xss2p(=Cd6+2gVSNEHX1xQUKia{ot zX~~YwY_C?5N>yfttFpTQyA=@IAOQhTHPJ$fK@zG^1WG}YKz_)m20D>Y585E1(}*{s zCALB~BmlR$qAhj)8E;x<%4}b&&}1OZ!l#|cK~)8sk-y#2y^yB&n9!9N(_gbG05Z{O za%ogrO@}vf$soB3e5<}#j8*U765#_&u1h(5=Vz9)-R7^$j9McfoJ%iyOfpa>92}_w z5sHcgBvMMJaJJB_BtE|Vk1^+W{ucZ;Xn2t1I3>r1@Xe4VR1}fNr|BzJ!8){+!5oSN zqeaB}@gNE2FC=EdbGW_SIS4d_H<)jUcH+}y_b2!)BCWEU-VKjqC)z@ZsaFYz*86cv z!y4vPUhG2TQW^|<^y5rx_Ohc`Z43-tLPkI=Y~)Z%`NR-j8?j?0Nx+@xKvbR>ih77M z6s3eghGa_{VG8O?dhMvNf`o64R?Q3HM(nvKn%kSJ%XYE(M-u&w(Xwe~E${USlXWr# zaPZO4qhjuBgteq*bT9;O8jIT8%LKoVmxG{Ph$X6TE^PCz*g+o6Ju1(IIZ&OY~oaAKt- z;cqGm$fQydz+eRo3H%%cNsnv@2B#=328iQkus1i+%goH!)tYNL)|^^pf*}c+G4l0v zf7!OQG(LFxKi>`G+V@`KjRP!B^*X3Zzv-V0|LZ*zbAYc2_Q1`WCBP6}azFq|e832Q O;_gVN3K9>I#TMXn>FGWI literal 940 zcmV;d15^A$T4*^jL0KkKS)gpI9RLP6fB*mg{l-Y?f8hRM-(tV--{iuA00fW#hz`Oa z5s(~-LHW=Fo6rW8fSU$VdQ`~B4FC-T6GJ9OKmZ1efEbufGH3uaXnKtY0000000000 z000000000o1c?YCLm*)q007fRr~qk@0B9Nkpay^d8V3LX00000000000000005SxM zgGo%AT{)ciWWzN;r{ zG^sa=hMayTO#*B$APU+K_UDNdo(ieXc3!2U_GdV^2@7&4 zYaUWp04zn_RDlZ=G_gu{WRh3{OZ5hfssKV3*`Ps#ff#s@VsqR>D49m#_U*#)-3sb- zW*o?!mg8Dvrx3#0$@gb=RZ80NuH{@P$~M|;ZX7sqI9_{Tp1BRWUY?QFL1MHZ41@_L zl0XpB2m!W)5Yg*kBoM*0fDN)k5=n1b1cNo8SS(1|jgE+DR$qW6Yw$x{s%oZ$4$=n9 zErJ7W@&zZYVPiOQg8gd|s+tiRyYQzlFy(9N*Ho=>1X43D&5x+**1ecZN5;VGAlvVo z)a+t37xR~)BRk2+w%2IJ(ql>shV$tpq3t1xWTc@eH8U773|@#JSSURpN;89Fs$Kv@ zQU#e1@1c#WBRff*c3a_?DH(hQ&OAo@+sTnfBAZ!OGxqcIF6R4@&*v$<;QE+89>QH& zX=Q^ec6~IG1-a^C(r71X49UApR_#kuQU=l#*&9uRVQs2oi6yA=D20h*B(TB`EkZf4 zp-N3Q1WLxy3fTe`R9SCL1&Df#)Ga!TWpehjTRw=WiC0Q$Y~@{@8DbTS2w5mni*q9! z3@m_!L3)h|5LvZ;wef6$4} zwq4WG!KtBmg>bQ7BR(N)S_?s1I6ba%DRNB0I5-ar5oDr+8iNwC5(b+D-02R<)nxF6 zdmROcP$b*YWM%MjOr{8i^14z9j!hHVKu#c2=tS5Q^F#%s3qrz!3M0HkbBF0`S-Oez z%Q?UR4JqHh4AiL~yN4UOdZ>ygI>`oPQNQe1o-e*20bl||v9WgC;(&oG;J^V9Xn-L9 Oi@744C`eE?RgQo{0E&D7 diff --git a/man/ds2dd.Rd b/man/ds2dd.Rd index c824fb5..ba607b8 100644 --- a/man/ds2dd.Rd +++ b/man/ds2dd.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ds2dd.R \name{ds2dd} \alias{ds2dd} -\title{Data set to data dictionary function} +\title{(DEPRECATED) Data set to data dictionary function} \usage{ ds2dd( ds, @@ -11,7 +11,7 @@ ds2dd( field.type = "text", field.label = NULL, include.column.names = FALSE, - metadata = names(redcapcast_meta) + metadata = metadata_names ) } \arguments{ @@ -34,14 +34,18 @@ names.} column names for original data set for upload.} \item{metadata}{Metadata column names. Default is the included -REDCapCAST::redcapcast_data.} +REDCapCAST::metadata_names.} } \value{ data.frame or list of data.frame and vector } \description{ +Creates a very basic data dictionary skeleton. Please see `ds2dd_detailed()` +for a more advanced function. +} +\details{ Migrated from stRoke ds2dd(). Fits better with the functionality of -'REDCapCAST' +'REDCapCAST'. } \examples{ redcapcast_data$record_id <- seq_len(nrow(redcapcast_data)) diff --git a/man/ds2dd_detailed.Rd b/man/ds2dd_detailed.Rd new file mode 100644 index 0000000..89cf42b --- /dev/null +++ b/man/ds2dd_detailed.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{ds2dd_detailed} +\alias{ds2dd_detailed} +\title{Extract data from stata file for data dictionary} +\usage{ +ds2dd_detailed( + data, + add.auto.id = FALSE, + date.format = "dmy", + form.name = NULL, + field.type = NULL, + field.label = NULL, + field.label.attr = "label", + field.validation = NULL, + metadata = metadata_names, + validate.time = FALSE, + time.var.sel.pos = "[Tt]i[d(me)]", + time.var.sel.neg = "[Dd]at[eo]" +) +} +\arguments{ +\item{data}{data frame} + +\item{add.auto.id}{flag to add id column} + +\item{date.format}{date format, character string. ymd/dmy/mdy. dafault is +dmy.} + +\item{form.name}{manually specify form name(s). Vector of length 1 or +ncol(data). Default is NULL and "data" is used.} + +\item{field.type}{manually specify field type(s). Vector of length 1 or +ncol(data). Default is NULL and "text" is used for everything but factors, +which wil get "radio".} + +\item{field.label}{manually specify field label(s). Vector of length 1 or +ncol(data). Default is NULL and colnames(data) is used or attribute +`field.label.attr` for haven_labelled data set (imported .dta file with +`haven::read_dta()`).} + +\item{field.label.attr}{attribute name for named labels for haven_labelled +data set (imported .dta file with `haven::read_dta()`. Default is "label"} + +\item{field.validation}{manually specify field validation(s). Vector of +length 1 or ncol(data). Default is NULL and `levels()` are used for factors +or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with +`haven::read_dta()`).} + +\item{metadata}{redcap metadata headings. Default is +REDCapCAST:::metadata_names.} + +\item{validate.time}{Flag to validate guessed time columns} + +\item{time.var.sel.pos}{Positive selection regex string passed to +`gues_time_only_filter()` as sel.pos.} + +\item{time.var.sel.neg}{Negative selection regex string passed to +`gues_time_only_filter()` as sel.neg.} +} +\value{ +list of length 2 +} +\description{ +Extract data from stata file for data dictionary +} +\details{ +This function is a natural development of the ds2dd() function. It assumes +that the first column is the ID-column. No checks. +Please, do always inspect the data dictionary before upload. + +Ensure, that the data set is formatted with as much information as possible. + +`field.type` can be supplied +} +\examples{ +data <- redcapcast_data +data |> ds2dd_detailed(validate.time = TRUE) +data |> ds2dd_detailed() +iris |> ds2dd_detailed(add.auto.id = TRUE) +mtcars |> ds2dd_detailed(add.auto.id = TRUE) +} diff --git a/man/guess_time_only_filter.Rd b/man/guess_time_only_filter.Rd new file mode 100644 index 0000000..58e6913 --- /dev/null +++ b/man/guess_time_only_filter.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{guess_time_only_filter} +\alias{guess_time_only_filter} +\title{Try at determining which are true time only variables} +\usage{ +guess_time_only_filter( + data, + validate = FALSE, + sel.pos = "[Tt]i[d(me)]", + sel.neg = "[Dd]at[eo]" +) +} +\arguments{ +\item{data}{data set} + +\item{validate}{flag to output validation data. Will output list.} + +\item{sel.pos}{Positive selection regex string} + +\item{sel.neg}{Negative selection regex string} +} +\value{ +character vector or list depending on `validate` flag. +} +\description{ +This is just a try at guessing data type based on data class and column names +hoping for a tiny bit of naming consistency. R does not include a time-only +data format natively, so the "hms" class from `readr` is used. This +has to be converted to character class before REDCap upload. +} +\examples{ +data <- redcapcast_data +data |> guess_time_only_filter() +data |> guess_time_only_filter(validate = TRUE) |> lapply(head) +} diff --git a/man/hms2character.Rd b/man/hms2character.Rd new file mode 100644 index 0000000..113b66a --- /dev/null +++ b/man/hms2character.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{hms2character} +\alias{hms2character} +\title{Change "hms" to "character" for REDCap upload.} +\usage{ +hms2character(data) +} +\arguments{ +\item{data}{data set} +} +\value{ +data.frame or tibble +} +\description{ +Change "hms" to "character" for REDCap upload. +} +\examples{ +data <- redcapcast_data +## data |> time_only_correction() |> hms2character() +} diff --git a/man/mark_complete.Rd b/man/mark_complete.Rd new file mode 100644 index 0000000..05aee6e --- /dev/null +++ b/man/mark_complete.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{mark_complete} +\alias{mark_complete} +\title{Completion marking based on completed upload} +\usage{ +mark_complete(upload, ls) +} +\arguments{ +\item{upload}{output list from `REDCapR::redcap_write()`} + +\item{ls}{output list from `ds2dd_detailed()`} +} +\value{ +list with `REDCapR::redcap_write()` results +} +\description{ +Completion marking based on completed upload +} diff --git a/man/time_only_correction.Rd b/man/time_only_correction.Rd new file mode 100644 index 0000000..75d3821 --- /dev/null +++ b/man/time_only_correction.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ds2dd_detailed.R +\name{time_only_correction} +\alias{time_only_correction} +\title{Correction based on time_only_filter function. Introduces new class for easier +validation labelling.} +\usage{ +time_only_correction(data, ...) +} +\arguments{ +\item{data}{data set} + +\item{...}{arguments passed on to `guess_time_only_filter()`} +} +\value{ +tibble +} +\description{ +Dependens on the data class "hms" introduced with +`guess_time_only_filter()` and converts these +} +\examples{ +data <- redcapcast_data +## data |> time_only_correction() +}