Extract data from Stata file for data dictionary
Usage
ds2dd_detailed(
data,
add.auto.id = FALSE,
date.format = "dmy",
form.name = NULL,
form.sep = NULL,
form.prefix = TRUE,
field.type = NULL,
field.label = NULL,
field.label.attr = "label",
field.validation = NULL,
metadata = names(REDCapCAST::redcapcast_meta),
convert.logicals = TRUE
)
Arguments
- data
data frame
- add.auto.id
flag to add id column
- date.format
date format, character string. ymd/dmy/mdy. Default is dmy.
- form.name
manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.
- form.sep
If the supplied dataset has form names as suffix or prefix to the column/variable names, the separator can be specified. If supplied, the form.name is ignored. Default is NULL.
- form.prefix
Flag to set if form is prefix (TRUE) or suffix (FALSE) to the column names. Assumes all columns have pre- or suffix if specified.
- field.type
manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".
- field.label
manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
- field.label.attr
attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".
- field.validation
manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
- metadata
REDCap metadata headings. Default is `names(REDCapCAST::redcapcast_meta)`.
- convert.logicals
convert logicals to factor. Default is TRUE.
Details
This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column; no checks are performed. Please always inspect the data dictionary before upload.
Ensure that the data set is formatted with as much information as possible.
`field.type` can be supplied manually; otherwise "text" is used for everything but factors, which get "radio".
Examples
data <- REDCapCAST::redcapcast_data
data |> ds2dd_detailed()
#> $data
#> # A tibble: 25 × 27
#> record_id redcap_event_name redcap_repeat_instrument redcap_repeat_instance
#> <dbl> <chr> <chr> <dbl>
#> 1 1 inclusion NA NA
#> 2 2 inclusion NA NA
#> 3 2 follow1 NA NA
#> 4 2 follow1 New Event (?) 1
#> 5 3 inclusion NA NA
#> 6 3 follow1 NA NA
#> 7 3 follow2 NA NA
#> 8 3 follow1 New Event (?) 1
#> 9 3 follow1 New Event (?) 2
#> 10 3 follow2 New Event (?) 1
#> # ℹ 15 more rows
#> # ℹ 23 more variables: cpr <chr>, inclusion <date>, inclusion_time <chr>,
#> # dob <date>, age <dbl>, age_integer <dbl>, sex <chr>, cohabitation <chr>,
#> # hypertension <chr>, diabetes <chr>, region <chr>,
#> # baseline_data_start_complete <chr>, mrs_assessed <chr>, mrs_date <date>,
#> # mrs_score <dbl>, mrs_complete <chr>, con_mrs <fct>, con_calc <fct>,
#> # consensus_complete <chr>, event_datetime <dttm>, event_age <dbl>, …
#>
#> $meta
#> # A tibble: 27 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id data NA text record_id
#> 2 redcap_event_name data NA text redcap_event_na…
#> 3 redcap_repeat_instrument data NA text redcap_repeat_i…
#> 4 redcap_repeat_instance data NA text redcap_repeat_i…
#> 5 cpr data NA text cpr
#> 6 inclusion data NA text inclusion
#> 7 inclusion_time data NA text inclusion_time
#> 8 dob data NA text dob
#> 9 age data NA text age
#> 10 age_integer data NA text age_integer
#> # ℹ 17 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 150 × 6
#> record_id sepal.length sepal.width petal.length petal.width species
#> <int> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 1 5.1 3.5 1.4 0.2 setosa
#> 2 2 4.9 3 1.4 0.2 setosa
#> 3 3 4.7 3.2 1.3 0.2 setosa
#> 4 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 5 3.6 1.4 0.2 setosa
#> 6 6 5.4 3.9 1.7 0.4 setosa
#> 7 7 4.6 3.4 1.4 0.3 setosa
#> 8 8 5 3.4 1.5 0.2 setosa
#> 9 9 4.4 2.9 1.4 0.2 setosa
#> 10 10 4.9 3.1 1.5 0.1 setosa
#> # ℹ 140 more rows
#>
#> $meta
#> # A tibble: 6 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id data NA text record_id
#> 2 sepal.length data NA text sepal.length
#> 3 sepal.width data NA text sepal.width
#> 4 petal.length data NA text petal.length
#> 5 petal.width data NA text petal.width
#> 6 species data NA radio species
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
iris |>
ds2dd_detailed(
add.auto.id = TRUE,
form.name = sample(c("b", "c"), size = 6, replace = TRUE, prob = rep(.5, 2))
) |>
purrr::pluck("meta")
#> A default id column has been added
#> # A tibble: 6 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id c NA text record_id
#> 2 sepal.length b NA text sepal.length
#> 3 sepal.width b NA text sepal.width
#> 4 petal.length b NA text petal.length
#> 5 petal.width c NA text petal.width
#> 6 species c NA radio species
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 32 × 12
#> record_id mpg cyl disp hp drat wt qsec vs am gear carb
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> # ℹ 22 more rows
#>
#> $meta
#> # A tibble: 12 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id data NA text record_id
#> 2 mpg data NA text mpg
#> 3 cyl data NA text cyl
#> 4 disp data NA text disp
#> 5 hp data NA text hp
#> 6 drat data NA text drat
#> 7 wt data NA text wt
#> 8 qsec data NA text qsec
#> 9 vs data NA text vs
#> 10 am data NA text am
#> 11 gear data NA text gear
#> 12 carb data NA text carb
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>
data <- iris |>
ds2dd_detailed(add.auto.id = TRUE) |>
purrr::pluck("data")
#> A default id column has been added
names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)),
replace=TRUE,prob = rep(x=.5,2))}__{names(data)}")
data |> ds2dd_detailed(form.sep = "__")
#> $data
#> # A tibble: 150 × 6
#> record_id sepal.length sepal.width petal.length petal.width species
#> <int> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 1 5.1 3.5 1.4 0.2 setosa
#> 2 2 4.9 3 1.4 0.2 setosa
#> 3 3 4.7 3.2 1.3 0.2 setosa
#> 4 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 5 3.6 1.4 0.2 setosa
#> 6 6 5.4 3.9 1.7 0.4 setosa
#> 7 7 4.6 3.4 1.4 0.3 setosa
#> 8 8 5 3.4 1.5 0.2 setosa
#> 9 9 4.4 2.9 1.4 0.2 setosa
#> 10 10 4.9 3.1 1.5 0.1 setosa
#> # ℹ 140 more rows
#>
#> $meta
#> # A tibble: 6 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id b NA text record_id
#> 2 sepal.length b NA text sepal.length
#> 3 sepal.width b NA text sepal.width
#> 4 petal.length b NA text petal.length
#> 5 petal.width b NA text petal.width
#> 6 species b NA radio species
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
#>