Extract data from Stata file for data dictionary
Usage
ds2dd_detailed(
data,
add.auto.id = FALSE,
date.format = "dmy",
form.name = NULL,
field.type = NULL,
field.label = NULL,
field.label.attr = "label",
field.validation = NULL,
metadata = metadata_names,
validate.time = FALSE,
time.var.sel.pos = "[Tt]i[d(me)]",
time.var.sel.neg = "[Dd]at[eo]"
)
Arguments
- data
data frame
- add.auto.id
flag to add id column
- date.format
date format, character string. ymd/dmy/mdy. Default is dmy.
- form.name
manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.
- field.type
manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".
- field.label
manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
- field.label.attr
attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".
- field.validation
manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
- metadata
redcap metadata headings. Default is REDCapCAST:::metadata_names.
- validate.time
Flag to validate guessed time columns
- time.var.sel.pos
Positive selection regex string passed to `guess_time_only_filter()` as sel.pos.
- time.var.sel.neg
Negative selection regex string passed to `guess_time_only_filter()` as sel.neg.
Details
This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column and performs no checks to verify this. Always inspect the data dictionary before uploading it.
Ensure that the data set is formatted with as much information as possible.
`field.type` can be supplied manually, as a vector of length 1 or ncol(data).
Examples
data <- redcapcast_data
data |> ds2dd_detailed(validate.time = TRUE)
#> $is.POSIX
#> inclusion_time event_datetime
#> 1 12:38:49 <NA>
#> 2 10:38:57 <NA>
#> 3 NA <NA>
#> 4 NA 2024-01-18 12:49:42
#> 5 12:01:07 <NA>
#> 6 NA <NA>
#> 7 NA <NA>
#> 8 NA 2024-01-18 12:49:58
#> 9 NA 2024-01-18 12:50:01
#> 10 NA 2024-01-18 12:50:05
#> 11 NA 2024-01-18 12:50:07
#> 12 NA 2024-01-18 12:50:09
#> 13 20:39:19 <NA>
#> 14 NA <NA>
#> 15 NA 2024-01-18 12:50:19
#> 16 NA 2024-01-18 12:50:22
#> 17 NA 2024-01-18 12:50:24
#> 18 08:50:31 <NA>
#>
#> $is.datetime
#> event_datetime
#> 1 <NA>
#> 2 <NA>
#> 3 <NA>
#> 4 2024-01-18 12:49:42
#> 5 <NA>
#> 6 <NA>
#> 7 <NA>
#> 8 2024-01-18 12:49:58
#> 9 2024-01-18 12:50:01
#> 10 2024-01-18 12:50:05
#> 11 2024-01-18 12:50:07
#> 12 2024-01-18 12:50:09
#> 13 <NA>
#> 14 <NA>
#> 15 2024-01-18 12:50:19
#> 16 2024-01-18 12:50:22
#> 17 2024-01-18 12:50:24
#> 18 <NA>
#>
#> $is.time_only
#> inclusion_time
#> 1 12:38:49
#> 2 10:38:57
#> 3 NA
#> 4 NA
#> 5 12:01:07
#> 6 NA
#> 7 NA
#> 8 NA
#> 9 NA
#> 10 NA
#> 11 NA
#> 12 NA
#> 13 20:39:19
#> 14 NA
#> 15 NA
#> 16 NA
#> 17 NA
#> 18 08:50:31
#>
data |> ds2dd_detailed()
#> $data
#> # A tibble: 18 × 23
#> record_id redcap_event_name redcap_repeat_instrument redcap_repeat_instance
#> <dbl> <chr> <chr> <dbl>
#> 1 1 inclusion NA NA
#> 2 2 inclusion NA NA
#> 3 2 follow1 NA NA
#> 4 2 follow1 New Event (?) 1
#> 5 3 inclusion NA NA
#> 6 3 follow1 NA NA
#> 7 3 follow2 NA NA
#> 8 3 follow1 New Event (?) 1
#> 9 3 follow1 New Event (?) 2
#> 10 3 follow2 New Event (?) 1
#> 11 3 follow2 New Event (?) 2
#> 12 3 follow2 New Event (?) 3
#> 13 4 inclusion NA NA
#> 14 4 follow2 NA NA
#> 15 4 follow2 New Event (?) 1
#> 16 4 follow2 New Event (?) 2
#> 17 4 follow2 New Event (?) 3
#> 18 5 inclusion NA NA
#> # ℹ 19 more variables: cpr <chr>, inclusion <date>, inclusion_time <chr>,
#> # dob <date>, age <dbl>, age_integer <dbl>, sex <chr>, cohabitation <chr>,
#> # hypertension <chr>, diabetes <chr>, region <chr>,
#> # baseline_data_start_complete <chr>, mrs_assessed <chr>, mrs_date <date>,
#> # mrs_score <dbl>, mrs_complete <chr>, event_datetime <dttm>,
#> # event_type <chr>, new_event_complete <chr>
#>
#> $meta
#> # A tibble: 23 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id data NA text record_id
#> 2 redcap_event_name data NA text redcap_event_na…
#> 3 redcap_repeat_instrument data NA text redcap_repeat_i…
#> 4 redcap_repeat_instance data NA text redcap_repeat_i…
#> 5 cpr data NA text cpr
#> 6 inclusion data NA text inclusion
#> 7 inclusion_time data NA text inclusion_time
#> 8 dob data NA text dob
#> 9 age data NA text age
#> 10 age_integer data NA text age_integer
#> # ℹ 13 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 150 × 6
#> default_trial_id sepal.length sepal.width petal.length petal.width species
#> <int> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 1 5.1 3.5 1.4 0.2 setosa
#> 2 2 4.9 3 1.4 0.2 setosa
#> 3 3 4.7 3.2 1.3 0.2 setosa
#> 4 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 5 3.6 1.4 0.2 setosa
#> 6 6 5.4 3.9 1.7 0.4 setosa
#> 7 7 4.6 3.4 1.4 0.3 setosa
#> 8 8 5 3.4 1.5 0.2 setosa
#> 9 9 4.4 2.9 1.4 0.2 setosa
#> 10 10 4.9 3.1 1.5 0.1 setosa
#> # ℹ 140 more rows
#>
#> $meta
#> # A tibble: 6 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 default_trial_id data NA text default_trial_id
#> 2 sepal.length data NA text Sepal.Length
#> 3 sepal.width data NA text Sepal.Width
#> 4 petal.length data NA text Petal.Length
#> 5 petal.width data NA text Petal.Width
#> 6 species data NA radio Species
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 32 × 12
#> default_trial_id mpg cyl disp hp drat wt qsec vs am gear
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 21 6 160 110 3.9 2.62 16.5 0 1 4
#> 2 2 21 6 160 110 3.9 2.88 17.0 0 1 4
#> 3 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4
#> 4 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3
#> 5 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3
#> 6 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3
#> 7 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3
#> 8 8 24.4 4 147. 62 3.69 3.19 20 1 0 4
#> 9 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4
#> 10 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4
#> # ℹ 22 more rows
#> # ℹ 1 more variable: carb <dbl>
#>
#> $meta
#> # A tibble: 12 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 default_trial_id data NA text default_trial_id
#> 2 mpg data NA text mpg
#> 3 cyl data NA text cyl
#> 4 disp data NA text disp
#> 5 hp data NA text hp
#> 6 drat data NA text drat
#> 7 wt data NA text wt
#> 8 qsec data NA text qsec
#> 9 vs data NA text vs
#> 10 am data NA text am
#> 11 gear data NA text gear
#> 12 carb data NA text carb
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>