Extract data from Stata file for data dictionary

Usage

ds2dd_detailed(
  data,
  add.auto.id = FALSE,
  date.format = "dmy",
  form.name = NULL,
  form.sep = NULL,
  form.prefix = TRUE,
  field.type = NULL,
  field.label = NULL,
  field.label.attr = "label",
  field.validation = NULL,
  metadata = names(REDCapCAST::redcapcast_meta),
  convert.logicals = TRUE
)

Arguments

data: data frame
add.auto.id: flag to add id column
date.format: date format, character string. ymd/dmy/mdy. Default is dmy.
form.name: manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.
form.sep: If the supplied dataset has form names as suffix or prefix to the column/variable names, the separator can be specified. If supplied, the form.name is ignored. Default is NULL.
form.prefix: Flag to set if form is prefix (TRUE) or suffix (FALSE) to the column names. Assumes all columns have pre- or suffix if specified.
field.type: manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".
field.label: manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
field.label.attr: attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".
field.validation: manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
metadata: redcap metadata headings. Default is names(REDCapCAST::redcapcast_meta).
convert.logicals: convert logicals to factor. Default is TRUE.

Value

list of length 2 (elements `data` and `meta`) with class "REDCapCAST"

Details

This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column; no checks are performed. Always inspect the data dictionary before upload.

Ensure that the data set is formatted with as much information as possible.

`field.type` can be supplied manually; see the Arguments section.

Examples

## Basic parsing with default options
requireNamespace("REDCapCAST")
redcapcast_data |>
  dplyr::select(-dplyr::starts_with("redcap_")) |>
  ds2dd_detailed()
#> $data
#> # A tibble: 25 × 24
#>    record_id cpr    inclusion  inclusion_time dob          age age_integer sex  
#>        <dbl> <chr>  <date>     <chr>          <date>     <dbl>       <dbl> <chr>
#>  1         1 12034… 2023-03-13 12:38:49       1940-03-12  83.0          83 fema…
#>  2         2 01023… 2023-03-01 10:38:57       1934-02-01  89.1          89 male 
#>  3         2 NA     NA         NA             NA          NA            NA NA   
#>  4         2 NA     NA         NA             NA          NA            NA NA   
#>  5         3 23015… 2022-03-08 12:01:07       1956-01-23  66.1          66 male 
#>  6         3 NA     NA         NA             NA          NA            NA NA   
#>  7         3 NA     NA         NA             NA          NA            NA NA   
#>  8         3 NA     NA         NA             NA          NA            NA NA   
#>  9         3 NA     NA         NA             NA          NA            NA NA   
#> 10         3 NA     NA         NA             NA          NA            NA NA   
#> # ℹ 15 more rows
#> # ℹ 16 more variables: cohabitation <chr>, hypertension <chr>, diabetes <chr>,
#> #   region <chr>, baseline_data_start_complete <chr>, mrs_assessed <chr>,
#> #   mrs_date <date>, mrs_score <dbl>, mrs_complete <chr>, con_mrs <fct>,
#> #   con_calc <fct>, consensus_complete <chr>, event_datetime <dttm>,
#> #   event_age <dbl>, event_type <chr>, new_event_complete <chr>
#> 
#> $meta
#> # A tibble: 24 × 18
#>    field_name     form_name section_header field_type field_label   
#>    <chr>          <chr>     <lgl>          <chr>      <chr>         
#>  1 record_id      data      NA             text       record_id     
#>  2 cpr            data      NA             text       cpr           
#>  3 inclusion      data      NA             text       inclusion     
#>  4 inclusion_time data      NA             text       inclusion_time
#>  5 dob            data      NA             text       dob           
#>  6 age            data      NA             text       age           
#>  7 age_integer    data      NA             text       age_integer   
#>  8 sex            data      NA             text       sex           
#>  9 cohabitation   data      NA             text       cohabitation  
#> 10 hypertension   data      NA             text       hypertension  
#> # ℹ 14 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"      

## Adding a record_id field
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    data      NA             text       record_id   
#> 2 sepal.length data      NA             text       Sepal.Length
#> 3 sepal.width  data      NA             text       Sepal.Width 
#> 4 petal.length data      NA             text       Petal.Length
#> 5 petal.width  data      NA             text       Petal.Width 
#> 6 species      data      NA             radio      Species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"      

## Passing form name information to function
iris |>
  ds2dd_detailed(
    add.auto.id = TRUE,
    form.name = sample(c("b", "c"), size = 6, replace = TRUE, prob = rep(.5, 2))
  ) |>
  purrr::pluck("meta")
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    b         NA             text       record_id   
#> 2 sepal.length c         NA             text       Sepal.Length
#> 3 sepal.width  b         NA             text       Sepal.Width 
#> 4 petal.length c         NA             text       Petal.Length
#> 5 petal.width  b         NA             text       Petal.Width 
#> 6 species      c         NA             radio      Species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> $data
#> # A tibble: 32 × 12
#>    record_id   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>        <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1         1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2         2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3         3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4         4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5         5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6         6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7         7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8         8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9         9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10        10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> # ℹ 22 more rows
#> 
#> $meta
#> # A tibble: 12 × 18
#>    field_name form_name section_header field_type field_label
#>    <chr>      <chr>     <lgl>          <chr>      <chr>      
#>  1 record_id  data      NA             text       record_id  
#>  2 mpg        data      NA             text       mpg        
#>  3 cyl        data      NA             text       cyl        
#>  4 disp       data      NA             text       disp       
#>  5 hp         data      NA             text       hp         
#>  6 drat       data      NA             text       drat       
#>  7 wt         data      NA             text       wt         
#>  8 qsec       data      NA             text       qsec       
#>  9 vs         data      NA             text       vs         
#> 10 am         data      NA             text       am         
#> 11 gear       data      NA             text       gear       
#> 12 carb       data      NA             text       carb       
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"      

## Using column name prefix to carry form name
data <- iris |>
  ds2dd_detailed(add.auto.id = TRUE) |>
  purrr::pluck("data")
names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)),
replace=TRUE,prob = rep(x=.5,2))}__{names(data)}")
data |> ds2dd_detailed(form.sep = "__")
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    b         NA             text       record_id   
#> 2 sepal.length a         NA             text       sepal.length
#> 3 sepal.width  b         NA             text       sepal.width 
#> 4 petal.length a         NA             text       petal.length
#> 5 petal.width  b         NA             text       petal.width 
#> 6 species      a         NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"