Skip to contents

Extract data from Stata file for data dictionary

Usage

ds2dd_detailed(
  data,
  add.auto.id = FALSE,
  date.format = "dmy",
  form.name = NULL,
  form.sep = NULL,
  form.prefix = TRUE,
  field.type = NULL,
  field.label = NULL,
  field.label.attr = "label",
  field.validation = NULL,
  metadata = names(REDCapCAST::redcapcast_meta),
  convert.logicals = TRUE
)

Arguments

data

data frame

add.auto.id

flag to add id column

date.format

date format, character string. ymd/dmy/mdy. Default is dmy.

form.name

manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.

form.sep

If supplied dataset has form names as suffix or prefix to the column/variable names, the separator can be specified. If supplied, the form.name is ignored. Default is NULL.

form.prefix

Flag to set if form is prefix (TRUE) or suffix (FALSE) to the column names. Assumes all columns have pre- or suffix if specified.

field.type

manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".

field.label

manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).

field.label.attr

attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".

field.validation

manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).

metadata

redcap metadata headings. Default is names(REDCapCAST::redcapcast_meta).

convert.logicals

convert logicals to factor. Default is TRUE.

Value

list of length 2

Details

This function is a natural development of the ds2dd() function. It assumes that the first column is the ID-column. No checks are performed. Please always inspect the data dictionary before upload.

Ensure that the data set is formatted with as much information as possible.

`field.type` can be supplied

Examples

## Basic parsing with default options
REDCapCAST::redcapcast_data |>
  dplyr::select(-dplyr::starts_with("redcap_")) |>
  ds2dd_detailed()
#> $data
#> # A tibble: 25 × 24
#>    record_id cpr    inclusion  inclusion_time dob          age age_integer sex  
#>        <dbl> <chr>  <date>     <chr>          <date>     <dbl>       <dbl> <chr>
#>  1         1 12034… 2023-03-13 12:38:49       1940-03-12  83.0          83 fema…
#>  2         2 01023… 2023-03-01 10:38:57       1934-02-01  89.1          89 male 
#>  3         2 NA     NA         NA             NA          NA            NA NA   
#>  4         2 NA     NA         NA             NA          NA            NA NA   
#>  5         3 23015… 2022-03-08 12:01:07       1956-01-23  66.1          66 male 
#>  6         3 NA     NA         NA             NA          NA            NA NA   
#>  7         3 NA     NA         NA             NA          NA            NA NA   
#>  8         3 NA     NA         NA             NA          NA            NA NA   
#>  9         3 NA     NA         NA             NA          NA            NA NA   
#> 10         3 NA     NA         NA             NA          NA            NA NA   
#> # ℹ 15 more rows
#> # ℹ 16 more variables: cohabitation <chr>, hypertension <chr>, diabetes <chr>,
#> #   region <chr>, baseline_data_start_complete <chr>, mrs_assessed <chr>,
#> #   mrs_date <date>, mrs_score <dbl>, mrs_complete <chr>, con_mrs <fct>,
#> #   con_calc <fct>, consensus_complete <chr>, event_datetime <dttm>,
#> #   event_age <dbl>, event_type <chr>, new_event_complete <chr>
#> 
#> $meta
#> # A tibble: 24 × 18
#>    field_name     form_name section_header field_type field_label   
#>    <chr>          <chr>     <lgl>          <chr>      <chr>         
#>  1 record_id      data      NA             text       record_id     
#>  2 cpr            data      NA             text       cpr           
#>  3 inclusion      data      NA             text       inclusion     
#>  4 inclusion_time data      NA             text       inclusion_time
#>  5 dob            data      NA             text       dob           
#>  6 age            data      NA             text       age           
#>  7 age_integer    data      NA             text       age_integer   
#>  8 sex            data      NA             text       sex           
#>  9 cohabitation   data      NA             text       cohabitation  
#> 10 hypertension   data      NA             text       hypertension  
#> # ℹ 14 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"      

## Adding a record_id field
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    data      NA             text       record_id   
#> 2 sepal.length data      NA             text       Sepal.Length
#> 3 sepal.width  data      NA             text       Sepal.Width 
#> 4 petal.length data      NA             text       Petal.Length
#> 5 petal.width  data      NA             text       Petal.Width 
#> 6 species      data      NA             radio      Species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"      

## Passing form name information to function
iris |>
  ds2dd_detailed(
    add.auto.id = TRUE,
    form.name = sample(c("b", "c"), size = 6, replace = TRUE, prob = rep(.5, 2))
  ) |>
  purrr::pluck("meta")
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    b         NA             text       record_id   
#> 2 sepal.length c         NA             text       Sepal.Length
#> 3 sepal.width  b         NA             text       Sepal.Width 
#> 4 petal.length c         NA             text       Petal.Length
#> 5 petal.width  b         NA             text       Petal.Width 
#> 6 species      c         NA             radio      Species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> $data
#> # A tibble: 32 × 12
#>    record_id   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>        <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1         1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2         2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3         3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4         4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5         5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6         6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7         7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8         8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9         9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10        10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> # ℹ 22 more rows
#> 
#> $meta
#> # A tibble: 12 × 18
#>    field_name form_name section_header field_type field_label
#>    <chr>      <chr>     <lgl>          <chr>      <chr>      
#>  1 record_id  data      NA             text       record_id  
#>  2 mpg        data      NA             text       mpg        
#>  3 cyl        data      NA             text       cyl        
#>  4 disp       data      NA             text       disp       
#>  5 hp         data      NA             text       hp         
#>  6 drat       data      NA             text       drat       
#>  7 wt         data      NA             text       wt         
#>  8 qsec       data      NA             text       qsec       
#>  9 vs         data      NA             text       vs         
#> 10 am         data      NA             text       am         
#> 11 gear       data      NA             text       gear       
#> 12 carb       data      NA             text       carb       
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"      

## Using column name prefix to carry form name
data <- iris |>
  ds2dd_detailed(add.auto.id = TRUE) |>
  purrr::pluck("data")
names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)),
replace=TRUE,prob = rep(x=.5,2))}__{names(data)}")
data |> ds2dd_detailed(form.sep = "__")
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    a         NA             text       record_id   
#> 2 sepal.length a         NA             text       sepal.length
#> 3 sepal.width  a         NA             text       sepal.width 
#> 4 petal.length b         NA             text       petal.length
#> 5 petal.width  b         NA             text       petal.width 
#> 6 species      b         NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
#> attr(,"class")
#> [1] "REDCapCAST" "list"