Extract data from Stata file for data dictionary

Usage

ds2dd_detailed(
  data,
  add.auto.id = FALSE,
  date.format = "dmy",
  form.name = NULL,
  form.sep = NULL,
  form.prefix = TRUE,
  field.type = NULL,
  field.label = NULL,
  field.label.attr = "label",
  field.validation = NULL,
  metadata = names(REDCapCAST::redcapcast_meta),
  validate.time = FALSE,
  time.var.sel.pos = "[Tt]i[d(me)]",
  time.var.sel.neg = "[Dd]at[eo]"
)

Arguments

data: data frame
add.auto.id: flag to add id column
date.format: date format, character string. ymd/dmy/mdy. Default is dmy.
form.name: manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.
form.sep: If supplied dataset has form names as suffix or prefix to the column/variable names, the separator can be specified. If supplied, the form.name is ignored. Default is NULL.
form.prefix: Flag to set if form is prefix (TRUE) or suffix (FALSE) to the column names. Assumes all columns have pre- or suffix if specified.
field.type: manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".
field.label: manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
field.label.attr: attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".
field.validation: manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).
metadata: REDCap metadata headings. Default is names(REDCapCAST::redcapcast_meta).
validate.time: Flag to validate guessed time columns
time.var.sel.pos: Positive selection regex string passed to `guess_time_only_filter()` as sel.pos.
time.var.sel.neg: Negative selection regex string passed to `guess_time_only_filter()` as sel.neg.

Value

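list of length 2 (elements `data` and `meta`). When `validate.time = TRUE`, the output of the time column validation is returned instead, as shown in the first example.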

Details

This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column; no checks are performed. Please always inspect the data dictionary before upload.

Ensure that the data set is formatted with as much information as possible.

`field.type` can be supplied manually to override the default field types (see Arguments).

Examples

data <- REDCapCAST::redcapcast_data
data |> ds2dd_detailed(validate.time = TRUE)
#> $is.POSIX
#> # A tibble: 19 × 2
#>    inclusion_time event_datetime     
#>    <time>         <dttm>             
#>  1 12:38:49       NA                 
#>  2 10:38:57       NA                 
#>  3       NA       NA                 
#>  4       NA       2024-01-18 12:49:42
#>  5 12:01:07       NA                 
#>  6       NA       NA                 
#>  7       NA       NA                 
#>  8       NA       2024-01-18 12:49:58
#>  9       NA       2024-01-18 12:50:01
#> 10       NA       2024-01-18 12:50:05
#> 11       NA       2024-01-18 12:50:07
#> 12       NA       2024-01-18 12:50:09
#> 13 20:39:19       NA                 
#> 14       NA       NA                 
#> 15       NA       2024-01-18 12:50:19
#> 16       NA       2024-01-18 12:50:22
#> 17       NA       2024-01-18 12:50:24
#> 18 08:50:31       NA                 
#> 19 08:49:28       NA                 
#> 
#> $is.datetime
#> # A tibble: 19 × 1
#>    event_datetime     
#>    <dttm>             
#>  1 NA                 
#>  2 NA                 
#>  3 NA                 
#>  4 2024-01-18 12:49:42
#>  5 NA                 
#>  6 NA                 
#>  7 NA                 
#>  8 2024-01-18 12:49:58
#>  9 2024-01-18 12:50:01
#> 10 2024-01-18 12:50:05
#> 11 2024-01-18 12:50:07
#> 12 2024-01-18 12:50:09
#> 13 NA                 
#> 14 NA                 
#> 15 2024-01-18 12:50:19
#> 16 2024-01-18 12:50:22
#> 17 2024-01-18 12:50:24
#> 18 NA                 
#> 19 NA                 
#> 
#> $is.time_only
#> # A tibble: 19 × 1
#>    inclusion_time
#>    <time>        
#>  1 12:38:49      
#>  2 10:38:57      
#>  3       NA      
#>  4       NA      
#>  5 12:01:07      
#>  6       NA      
#>  7       NA      
#>  8       NA      
#>  9       NA      
#> 10       NA      
#> 11       NA      
#> 12       NA      
#> 13 20:39:19      
#> 14       NA      
#> 15       NA      
#> 16       NA      
#> 17       NA      
#> 18 08:50:31      
#> 19 08:49:28      
#> 
data |> ds2dd_detailed()
#> $data
#> # A tibble: 19 × 24
#>    record_id redcap_event_name redcap_repeat_instrument redcap_repeat_instance
#>        <dbl> <chr>             <chr>                                     <dbl>
#>  1         1 inclusion         NA                                           NA
#>  2         2 inclusion         NA                                           NA
#>  3         2 follow1           NA                                           NA
#>  4         2 follow1           New Event (?)                                 1
#>  5         3 inclusion         NA                                           NA
#>  6         3 follow1           NA                                           NA
#>  7         3 follow2           NA                                           NA
#>  8         3 follow1           New Event (?)                                 1
#>  9         3 follow1           New Event (?)                                 2
#> 10         3 follow2           New Event (?)                                 1
#> 11         3 follow2           New Event (?)                                 2
#> 12         3 follow2           New Event (?)                                 3
#> 13         4 inclusion         NA                                           NA
#> 14         4 follow2           NA                                           NA
#> 15         4 follow2           New Event (?)                                 1
#> 16         4 follow2           New Event (?)                                 2
#> 17         4 follow2           New Event (?)                                 3
#> 18         5 inclusion         NA                                           NA
#> 19         6 inclusion         NA                                           NA
#> # ℹ 20 more variables: cpr <chr>, inclusion <date>, inclusion_time <chr>,
#> #   dob <date>, age <dbl>, age_integer <dbl>, sex <chr>, cohabitation <chr>,
#> #   hypertension <chr>, diabetes <chr>, region <chr>,
#> #   baseline_data_start_complete <chr>, mrs_assessed <chr>, mrs_date <date>,
#> #   mrs_score <dbl>, mrs_complete <chr>, event_datetime <dttm>,
#> #   event_age <lgl>, event_type <chr>, new_event_complete <chr>
#> 
#> $meta
#> # A tibble: 24 × 18
#>    field_name               form_name section_header field_type field_label     
#>    <chr>                    <chr>     <lgl>          <chr>      <chr>           
#>  1 record_id                data      NA             text       record_id       
#>  2 redcap_event_name        data      NA             text       redcap_event_na…
#>  3 redcap_repeat_instrument data      NA             text       redcap_repeat_i…
#>  4 redcap_repeat_instance   data      NA             text       redcap_repeat_i…
#>  5 cpr                      data      NA             text       cpr             
#>  6 inclusion                data      NA             text       inclusion       
#>  7 inclusion_time           data      NA             text       inclusion_time  
#>  8 dob                      data      NA             text       dob             
#>  9 age                      data      NA             text       age             
#> 10 age_integer              data      NA             text       age_integer     
#> # ℹ 14 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    data      NA             text       record_id   
#> 2 sepal.length data      NA             text       sepal.length
#> 3 sepal.width  data      NA             text       sepal.width 
#> 4 petal.length data      NA             text       petal.length
#> 5 petal.width  data      NA             text       petal.width 
#> 6 species      data      NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
iris |>
  ds2dd_detailed(
    add.auto.id = TRUE,
    form.name = sample(c("b", "c"), size = 6, replace = TRUE, prob = rep(.5, 2))
  ) |>
  purrr::pluck("meta")
#> A default id column has been added
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    c         NA             text       record_id   
#> 2 sepal.length b         NA             text       sepal.length
#> 3 sepal.width  b         NA             text       sepal.width 
#> 4 petal.length b         NA             text       petal.length
#> 5 petal.width  c         NA             text       petal.width 
#> 6 species      c         NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 32 × 12
#>    record_id   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>        <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1         1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2         2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3         3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4         4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5         5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6         6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7         7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8         8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9         9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10        10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> # ℹ 22 more rows
#> 
#> $meta
#> # A tibble: 12 × 18
#>    field_name form_name section_header field_type field_label
#>    <chr>      <chr>     <lgl>          <chr>      <chr>      
#>  1 record_id  data      NA             text       record_id  
#>  2 mpg        data      NA             text       mpg        
#>  3 cyl        data      NA             text       cyl        
#>  4 disp       data      NA             text       disp       
#>  5 hp         data      NA             text       hp         
#>  6 drat       data      NA             text       drat       
#>  7 wt         data      NA             text       wt         
#>  8 qsec       data      NA             text       qsec       
#>  9 vs         data      NA             text       vs         
#> 10 am         data      NA             text       am         
#> 11 gear       data      NA             text       gear       
#> 12 carb       data      NA             text       carb       
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
data <- iris |>
  ds2dd_detailed(add.auto.id = TRUE) |>
  purrr::pluck("data")
#> A default id column has been added
names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)),
replace=TRUE,prob = rep(x=.5,2))}__{names(data)}")
data |> ds2dd_detailed(form.sep = "__")
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    b         NA             text       record_id   
#> 2 sepal.length b         NA             text       sepal.length
#> 3 sepal.width  b         NA             text       sepal.width 
#> 4 petal.length b         NA             text       petal.length
#> 5 petal.width  b         NA             text       petal.width 
#> 6 species      b         NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#>