Extract data from Stata file for data dictionary

Usage

ds2dd_detailed(
  data,
  add.auto.id = FALSE,
  date.format = "dmy",
  form.name = NULL,
  field.type = NULL,
  field.label = NULL,
  field.label.attr = "label",
  field.validation = NULL,
  metadata = metadata_names,
  validate.time = FALSE,
  time.var.sel.pos = "[Tt]i[d(me)]",
  time.var.sel.neg = "[Dd]at[eo]"
)

Arguments

data: data frame
add.auto.id: flag to add id column
date.format: date format, character string. ymd/dmy/mdy. Default is dmy.
form.name: manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.
field.type: manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".
field.label: manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
field.label.attr: attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".
field.validation: manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
metadata: redcap metadata headings. Default is REDCapCAST:::metadata_names.
validate.time: Flag to validate guessed time columns
time.var.sel.pos: Positive selection regex string passed to `guess_time_only_filter()` as sel.pos.
time.var.sel.neg: Negative selection regex string passed to `guess_time_only_filter()` as sel.neg.

Value

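List of length 2 with the elements `data` and `meta` (the data dictionary). When `validate.time = TRUE`, a list describing the guessed time columns (`is.POSIX`, `is.datetime`, `is.time_only`) is returned instead; see Examples.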

Details

This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column; no checks are performed. Please always inspect the data dictionary before upload.

Ensure that the data set is formatted with as much information as possible.

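`field.type` can be supplied manually as a vector of length 1 or ncol(data); otherwise "text" is used for everything but factors, which get "radio".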

Examples

data <- redcapcast_data
data |> ds2dd_detailed(validate.time = TRUE)
#> $is.POSIX
#>    inclusion_time      event_datetime
#> 1        12:38:49                <NA>
#> 2        10:38:57                <NA>
#> 3              NA                <NA>
#> 4              NA 2024-01-18 12:49:42
#> 5        12:01:07                <NA>
#> 6              NA                <NA>
#> 7              NA                <NA>
#> 8              NA 2024-01-18 12:49:58
#> 9              NA 2024-01-18 12:50:01
#> 10             NA 2024-01-18 12:50:05
#> 11             NA 2024-01-18 12:50:07
#> 12             NA 2024-01-18 12:50:09
#> 13       20:39:19                <NA>
#> 14             NA                <NA>
#> 15             NA 2024-01-18 12:50:19
#> 16             NA 2024-01-18 12:50:22
#> 17             NA 2024-01-18 12:50:24
#> 18       08:50:31                <NA>
#> 
#> $is.datetime
#>         event_datetime
#> 1                 <NA>
#> 2                 <NA>
#> 3                 <NA>
#> 4  2024-01-18 12:49:42
#> 5                 <NA>
#> 6                 <NA>
#> 7                 <NA>
#> 8  2024-01-18 12:49:58
#> 9  2024-01-18 12:50:01
#> 10 2024-01-18 12:50:05
#> 11 2024-01-18 12:50:07
#> 12 2024-01-18 12:50:09
#> 13                <NA>
#> 14                <NA>
#> 15 2024-01-18 12:50:19
#> 16 2024-01-18 12:50:22
#> 17 2024-01-18 12:50:24
#> 18                <NA>
#> 
#> $is.time_only
#>    inclusion_time
#> 1        12:38:49
#> 2        10:38:57
#> 3              NA
#> 4              NA
#> 5        12:01:07
#> 6              NA
#> 7              NA
#> 8              NA
#> 9              NA
#> 10             NA
#> 11             NA
#> 12             NA
#> 13       20:39:19
#> 14             NA
#> 15             NA
#> 16             NA
#> 17             NA
#> 18       08:50:31
#> 
data |> ds2dd_detailed()
#> $data
#> # A tibble: 18 × 23
#>    record_id redcap_event_name redcap_repeat_instrument redcap_repeat_instance
#>        <dbl> <chr>             <chr>                                     <dbl>
#>  1         1 inclusion         NA                                           NA
#>  2         2 inclusion         NA                                           NA
#>  3         2 follow1           NA                                           NA
#>  4         2 follow1           New Event (?)                                 1
#>  5         3 inclusion         NA                                           NA
#>  6         3 follow1           NA                                           NA
#>  7         3 follow2           NA                                           NA
#>  8         3 follow1           New Event (?)                                 1
#>  9         3 follow1           New Event (?)                                 2
#> 10         3 follow2           New Event (?)                                 1
#> 11         3 follow2           New Event (?)                                 2
#> 12         3 follow2           New Event (?)                                 3
#> 13         4 inclusion         NA                                           NA
#> 14         4 follow2           NA                                           NA
#> 15         4 follow2           New Event (?)                                 1
#> 16         4 follow2           New Event (?)                                 2
#> 17         4 follow2           New Event (?)                                 3
#> 18         5 inclusion         NA                                           NA
#> # ℹ 19 more variables: cpr <chr>, inclusion <date>, inclusion_time <chr>,
#> #   dob <date>, age <dbl>, age_integer <dbl>, sex <chr>, cohabitation <chr>,
#> #   hypertension <chr>, diabetes <chr>, region <chr>,
#> #   baseline_data_start_complete <chr>, mrs_assessed <chr>, mrs_date <date>,
#> #   mrs_score <dbl>, mrs_complete <chr>, event_datetime <dttm>,
#> #   event_type <chr>, new_event_complete <chr>
#> 
#> $meta
#> # A tibble: 23 × 18
#>    field_name               form_name section_header field_type field_label     
#>    <chr>                    <chr>     <lgl>          <chr>      <chr>           
#>  1 record_id                data      NA             text       record_id       
#>  2 redcap_event_name        data      NA             text       redcap_event_na…
#>  3 redcap_repeat_instrument data      NA             text       redcap_repeat_i…
#>  4 redcap_repeat_instance   data      NA             text       redcap_repeat_i…
#>  5 cpr                      data      NA             text       cpr             
#>  6 inclusion                data      NA             text       inclusion       
#>  7 inclusion_time           data      NA             text       inclusion_time  
#>  8 dob                      data      NA             text       dob             
#>  9 age                      data      NA             text       age             
#> 10 age_integer              data      NA             text       age_integer     
#> # ℹ 13 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 150 × 6
#>    default_trial_id sepal.length sepal.width petal.length petal.width species
#>               <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1                1          5.1         3.5          1.4         0.2 setosa 
#>  2                2          4.9         3            1.4         0.2 setosa 
#>  3                3          4.7         3.2          1.3         0.2 setosa 
#>  4                4          4.6         3.1          1.5         0.2 setosa 
#>  5                5          5           3.6          1.4         0.2 setosa 
#>  6                6          5.4         3.9          1.7         0.4 setosa 
#>  7                7          4.6         3.4          1.4         0.3 setosa 
#>  8                8          5           3.4          1.5         0.2 setosa 
#>  9                9          4.4         2.9          1.4         0.2 setosa 
#> 10               10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name       form_name section_header field_type field_label     
#>   <chr>            <chr>     <lgl>          <chr>      <chr>           
#> 1 default_trial_id data      NA             text       default_trial_id
#> 2 sepal.length     data      NA             text       Sepal.Length    
#> 3 sepal.width      data      NA             text       Sepal.Width     
#> 4 petal.length     data      NA             text       Petal.Length    
#> 5 petal.width      data      NA             text       Petal.Width     
#> 6 species          data      NA             radio      Species         
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 32 × 12
#>    default_trial_id   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear
#>               <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1                1  21       6  160    110  3.9   2.62  16.5     0     1     4
#>  2                2  21       6  160    110  3.9   2.88  17.0     0     1     4
#>  3                3  22.8     4  108     93  3.85  2.32  18.6     1     1     4
#>  4                4  21.4     6  258    110  3.08  3.22  19.4     1     0     3
#>  5                5  18.7     8  360    175  3.15  3.44  17.0     0     0     3
#>  6                6  18.1     6  225    105  2.76  3.46  20.2     1     0     3
#>  7                7  14.3     8  360    245  3.21  3.57  15.8     0     0     3
#>  8                8  24.4     4  147.    62  3.69  3.19  20       1     0     4
#>  9                9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4
#> 10               10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4
#> # ℹ 22 more rows
#> # ℹ 1 more variable: carb <dbl>
#> 
#> $meta
#> # A tibble: 12 × 18
#>    field_name       form_name section_header field_type field_label     
#>    <chr>            <chr>     <lgl>          <chr>      <chr>           
#>  1 default_trial_id data      NA             text       default_trial_id
#>  2 mpg              data      NA             text       mpg             
#>  3 cyl              data      NA             text       cyl             
#>  4 disp             data      NA             text       disp            
#>  5 hp               data      NA             text       hp              
#>  6 drat             data      NA             text       drat            
#>  7 wt               data      NA             text       wt              
#>  8 qsec             data      NA             text       qsec            
#>  9 vs               data      NA             text       vs              
#> 10 am               data      NA             text       am              
#> 11 gear             data      NA             text       gear            
#> 12 carb             data      NA             text       carb            
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#>