Extract data from Stata file for data dictionary
Usage
ds2dd_detailed(
data,
add.auto.id = FALSE,
date.format = "dmy",
form.name = NULL,
form.sep = NULL,
form.prefix = TRUE,
field.type = NULL,
field.label = NULL,
field.label.attr = "label",
field.validation = NULL,
metadata = names(REDCapCAST::redcapcast_meta),
validate.time = FALSE,
time.var.sel.pos = "[Tt]i[d(me)]",
time.var.sel.neg = "[Dd]at[eo]"
)
Arguments
- data
data frame
- add.auto.id
flag to add id column
- date.format
date format, character string. ymd/dmy/mdy. Default is dmy.
- form.name
manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.
- form.sep
If the supplied dataset has form names as suffix or prefix to the column/variable names, the separator can be specified. If supplied, form.name is ignored. Default is NULL.
- form.prefix
Flag to set if form is prefix (TRUE) or suffix (FALSE) to the column names. Assumes all columns have pre- or suffix if specified.
- field.type
manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".
- field.label
manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
- field.label.attr
attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".
- field.validation
manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
- metadata
redcap metadata headings. Default is `names(REDCapCAST::redcapcast_meta)`.
- validate.time
Flag to validate guessed time columns
- time.var.sel.pos
Positive selection regex string passed to `guess_time_only_filter()` as sel.pos.
- time.var.sel.neg
Negative selection regex string passed to `guess_time_only_filter()` as sel.neg.
Details
This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column; no checks are performed. Please always inspect the data dictionary before upload.
Ensure that the data set is formatted with as much information as possible.
`field.type` can be supplied manually; see the argument description above.
Examples
data <- REDCapCAST::redcapcast_data
data |> ds2dd_detailed(validate.time = TRUE)
#> $is.POSIX
#> # A tibble: 19 × 2
#> inclusion_time event_datetime
#> <time> <dttm>
#> 1 12:38:49 NA
#> 2 10:38:57 NA
#> 3 NA NA
#> 4 NA 2024-01-18 12:49:42
#> 5 12:01:07 NA
#> 6 NA NA
#> 7 NA NA
#> 8 NA 2024-01-18 12:49:58
#> 9 NA 2024-01-18 12:50:01
#> 10 NA 2024-01-18 12:50:05
#> 11 NA 2024-01-18 12:50:07
#> 12 NA 2024-01-18 12:50:09
#> 13 20:39:19 NA
#> 14 NA NA
#> 15 NA 2024-01-18 12:50:19
#> 16 NA 2024-01-18 12:50:22
#> 17 NA 2024-01-18 12:50:24
#> 18 08:50:31 NA
#> 19 08:49:28 NA
#>
#> $is.datetime
#> # A tibble: 19 × 1
#> event_datetime
#> <dttm>
#> 1 NA
#> 2 NA
#> 3 NA
#> 4 2024-01-18 12:49:42
#> 5 NA
#> 6 NA
#> 7 NA
#> 8 2024-01-18 12:49:58
#> 9 2024-01-18 12:50:01
#> 10 2024-01-18 12:50:05
#> 11 2024-01-18 12:50:07
#> 12 2024-01-18 12:50:09
#> 13 NA
#> 14 NA
#> 15 2024-01-18 12:50:19
#> 16 2024-01-18 12:50:22
#> 17 2024-01-18 12:50:24
#> 18 NA
#> 19 NA
#>
#> $is.time_only
#> # A tibble: 19 × 1
#> inclusion_time
#> <time>
#> 1 12:38:49
#> 2 10:38:57
#> 3 NA
#> 4 NA
#> 5 12:01:07
#> 6 NA
#> 7 NA
#> 8 NA
#> 9 NA
#> 10 NA
#> 11 NA
#> 12 NA
#> 13 20:39:19
#> 14 NA
#> 15 NA
#> 16 NA
#> 17 NA
#> 18 08:50:31
#> 19 08:49:28
#>
data |> ds2dd_detailed()
#> $data
#> # A tibble: 19 × 24
#> record_id redcap_event_name redcap_repeat_instrument redcap_repeat_instance
#> <dbl> <chr> <chr> <dbl>
#> 1 1 inclusion NA NA
#> 2 2 inclusion NA NA
#> 3 2 follow1 NA NA
#> 4 2 follow1 New Event (?) 1
#> 5 3 inclusion NA NA
#> 6 3 follow1 NA NA
#> 7 3 follow2 NA NA
#> 8 3 follow1 New Event (?) 1
#> 9 3 follow1 New Event (?) 2
#> 10 3 follow2 New Event (?) 1
#> 11 3 follow2 New Event (?) 2
#> 12 3 follow2 New Event (?) 3
#> 13 4 inclusion NA NA
#> 14 4 follow2 NA NA
#> 15 4 follow2 New Event (?) 1
#> 16 4 follow2 New Event (?) 2
#> 17 4 follow2 New Event (?) 3
#> 18 5 inclusion NA NA
#> 19 6 inclusion NA NA
#> # ℹ 20 more variables: cpr <chr>, inclusion <date>, inclusion_time <chr>,
#> # dob <date>, age <dbl>, age_integer <dbl>, sex <chr>, cohabitation <chr>,
#> # hypertension <chr>, diabetes <chr>, region <chr>,
#> # baseline_data_start_complete <chr>, mrs_assessed <chr>, mrs_date <date>,
#> # mrs_score <dbl>, mrs_complete <chr>, event_datetime <dttm>,
#> # event_age <lgl>, event_type <chr>, new_event_complete <chr>
#>
#> $meta
#> # A tibble: 24 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <lgl> <lgl> <chr> <chr>
#> 1 record_id NA NA text record_id
#> 2 redcap_event_name NA NA text redcap_event_na…
#> 3 redcap_repeat_instrument NA NA text redcap_repeat_i…
#> 4 redcap_repeat_instance NA NA text redcap_repeat_i…
#> 5 cpr NA NA text cpr
#> 6 inclusion NA NA text inclusion
#> 7 inclusion_time NA NA text inclusion_time
#> 8 dob NA NA text dob
#> 9 age NA NA text age
#> 10 age_integer NA NA text age_integer
#> # ℹ 14 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 150 × 6
#> record_id sepal.length sepal.width petal.length petal.width species
#> <int> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 1 5.1 3.5 1.4 0.2 setosa
#> 2 2 4.9 3 1.4 0.2 setosa
#> 3 3 4.7 3.2 1.3 0.2 setosa
#> 4 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 5 3.6 1.4 0.2 setosa
#> 6 6 5.4 3.9 1.7 0.4 setosa
#> 7 7 4.6 3.4 1.4 0.3 setosa
#> 8 8 5 3.4 1.5 0.2 setosa
#> 9 9 4.4 2.9 1.4 0.2 setosa
#> 10 10 4.9 3.1 1.5 0.1 setosa
#> # ℹ 140 more rows
#>
#> $meta
#> # A tibble: 6 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <lgl> <lgl> <chr> <chr>
#> 1 record_id NA NA text record_id
#> 2 sepal.length NA NA text sepal.length
#> 3 sepal.width NA NA text sepal.width
#> 4 petal.length NA NA text petal.length
#> 5 petal.width NA NA text petal.width
#> 6 species NA NA radio species
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 32 × 12
#> record_id mpg cyl disp hp drat wt qsec vs am gear carb
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> # ℹ 22 more rows
#>
#> $meta
#> # A tibble: 12 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <lgl> <lgl> <chr> <chr>
#> 1 record_id NA NA text record_id
#> 2 mpg NA NA text mpg
#> 3 cyl NA NA text cyl
#> 4 disp NA NA text disp
#> 5 hp NA NA text hp
#> 6 drat NA NA text drat
#> 7 wt NA NA text wt
#> 8 qsec NA NA text qsec
#> 9 vs NA NA text vs
#> 10 am NA NA text am
#> 11 gear NA NA text gear
#> 12 carb NA NA text carb
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
data <- iris |>
ds2dd_detailed(add.auto.id = TRUE) |>
purrr::pluck("data")
#> A default id column has been added
names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)),
replace=TRUE,prob = rep(x=.5,2))}__{names(data)}")
data |> ds2dd_detailed(form.sep="__")
#> $data
#> # A tibble: 150 × 6
#> record_id sepal.length sepal.width petal.length petal.width species
#> <int> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 1 5.1 3.5 1.4 0.2 setosa
#> 2 2 4.9 3 1.4 0.2 setosa
#> 3 3 4.7 3.2 1.3 0.2 setosa
#> 4 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 5 3.6 1.4 0.2 setosa
#> 6 6 5.4 3.9 1.7 0.4 setosa
#> 7 7 4.6 3.4 1.4 0.3 setosa
#> 8 8 5 3.4 1.5 0.2 setosa
#> 9 9 4.4 2.9 1.4 0.2 setosa
#> 10 10 4.9 3.1 1.5 0.1 setosa
#> # ℹ 140 more rows
#>
#> $meta
#> # A tibble: 6 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id b NA text record_id
#> 2 sepal.length b NA text sepal.length
#> 3 sepal.width b NA text sepal.width
#> 4 petal.length a NA text petal.length
#> 5 petal.width a NA text petal.width
#> 6 species a NA radio species
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>