Skip to contents

Extract data from Stata file for data dictionary

Usage

ds2dd_detailed(
  data,
  add.auto.id = FALSE,
  date.format = "dmy",
  form.name = NULL,
  form.sep = NULL,
  form.prefix = TRUE,
  field.type = NULL,
  field.label = NULL,
  field.label.attr = "label",
  field.validation = NULL,
  metadata = names(REDCapCAST::redcapcast_meta),
  validate.time = FALSE,
  time.var.sel.pos = "[Tt]i[d(me)]",
  time.var.sel.neg = "[Dd]at[eo]"
)

Arguments

data

data frame

add.auto.id

flag to add id column

date.format

date format, character string. ymd/dmy/mdy. Default is dmy.

form.name

manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.

form.sep

If the supplied dataset has form names as suffix or prefix to the column/variable names, the separator can be specified. If supplied, form.name is ignored. Default is NULL.

form.prefix

Flag to set if form is prefix (TRUE) or suffix (FALSE) to the column names. Assumes all columns have pre- or suffix if specified.

field.type

manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".

field.label

manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).

field.label.attr

attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".

field.validation

manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).

metadata

REDCap metadata headings. Default is `names(REDCapCAST::redcapcast_meta)`.

validate.time

Flag to validate guessed time columns

time.var.sel.pos

Positive selection regex string passed to `guess_time_only_filter()` as sel.pos.

time.var.sel.neg

Negative selection regex string passed to `guess_time_only_filter()` as sel.neg.

Value

list of length 2

Details

This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column; this is not checked. Always inspect the data dictionary before upload.

Ensure that the data set is formatted with as much information as possible.

`field.type` can be supplied manually; otherwise "text" is used for everything but factors, which get "radio".

Examples

data <- REDCapCAST::redcapcast_data
data |> ds2dd_detailed(validate.time = TRUE)
#> $is.POSIX
#> # A tibble: 19 × 2
#>    inclusion_time event_datetime     
#>    <time>         <dttm>             
#>  1 12:38:49       NA                 
#>  2 10:38:57       NA                 
#>  3       NA       NA                 
#>  4       NA       2024-01-18 12:49:42
#>  5 12:01:07       NA                 
#>  6       NA       NA                 
#>  7       NA       NA                 
#>  8       NA       2024-01-18 12:49:58
#>  9       NA       2024-01-18 12:50:01
#> 10       NA       2024-01-18 12:50:05
#> 11       NA       2024-01-18 12:50:07
#> 12       NA       2024-01-18 12:50:09
#> 13 20:39:19       NA                 
#> 14       NA       NA                 
#> 15       NA       2024-01-18 12:50:19
#> 16       NA       2024-01-18 12:50:22
#> 17       NA       2024-01-18 12:50:24
#> 18 08:50:31       NA                 
#> 19 08:49:28       NA                 
#> 
#> $is.datetime
#> # A tibble: 19 × 1
#>    event_datetime     
#>    <dttm>             
#>  1 NA                 
#>  2 NA                 
#>  3 NA                 
#>  4 2024-01-18 12:49:42
#>  5 NA                 
#>  6 NA                 
#>  7 NA                 
#>  8 2024-01-18 12:49:58
#>  9 2024-01-18 12:50:01
#> 10 2024-01-18 12:50:05
#> 11 2024-01-18 12:50:07
#> 12 2024-01-18 12:50:09
#> 13 NA                 
#> 14 NA                 
#> 15 2024-01-18 12:50:19
#> 16 2024-01-18 12:50:22
#> 17 2024-01-18 12:50:24
#> 18 NA                 
#> 19 NA                 
#> 
#> $is.time_only
#> # A tibble: 19 × 1
#>    inclusion_time
#>    <time>        
#>  1 12:38:49      
#>  2 10:38:57      
#>  3       NA      
#>  4       NA      
#>  5 12:01:07      
#>  6       NA      
#>  7       NA      
#>  8       NA      
#>  9       NA      
#> 10       NA      
#> 11       NA      
#> 12       NA      
#> 13 20:39:19      
#> 14       NA      
#> 15       NA      
#> 16       NA      
#> 17       NA      
#> 18 08:50:31      
#> 19 08:49:28      
#> 
data |> ds2dd_detailed()
#> $data
#> # A tibble: 19 × 24
#>    record_id redcap_event_name redcap_repeat_instrument redcap_repeat_instance
#>        <dbl> <chr>             <chr>                                     <dbl>
#>  1         1 inclusion         NA                                           NA
#>  2         2 inclusion         NA                                           NA
#>  3         2 follow1           NA                                           NA
#>  4         2 follow1           New Event (?)                                 1
#>  5         3 inclusion         NA                                           NA
#>  6         3 follow1           NA                                           NA
#>  7         3 follow2           NA                                           NA
#>  8         3 follow1           New Event (?)                                 1
#>  9         3 follow1           New Event (?)                                 2
#> 10         3 follow2           New Event (?)                                 1
#> 11         3 follow2           New Event (?)                                 2
#> 12         3 follow2           New Event (?)                                 3
#> 13         4 inclusion         NA                                           NA
#> 14         4 follow2           NA                                           NA
#> 15         4 follow2           New Event (?)                                 1
#> 16         4 follow2           New Event (?)                                 2
#> 17         4 follow2           New Event (?)                                 3
#> 18         5 inclusion         NA                                           NA
#> 19         6 inclusion         NA                                           NA
#> # ℹ 20 more variables: cpr <chr>, inclusion <date>, inclusion_time <chr>,
#> #   dob <date>, age <dbl>, age_integer <dbl>, sex <chr>, cohabitation <chr>,
#> #   hypertension <chr>, diabetes <chr>, region <chr>,
#> #   baseline_data_start_complete <chr>, mrs_assessed <chr>, mrs_date <date>,
#> #   mrs_score <dbl>, mrs_complete <chr>, event_datetime <dttm>,
#> #   event_age <lgl>, event_type <chr>, new_event_complete <chr>
#> 
#> $meta
#> # A tibble: 24 × 18
#>    field_name               form_name section_header field_type field_label     
#>    <chr>                    <lgl>     <lgl>          <chr>      <chr>           
#>  1 record_id                NA        NA             text       record_id       
#>  2 redcap_event_name        NA        NA             text       redcap_event_na…
#>  3 redcap_repeat_instrument NA        NA             text       redcap_repeat_i…
#>  4 redcap_repeat_instance   NA        NA             text       redcap_repeat_i…
#>  5 cpr                      NA        NA             text       cpr             
#>  6 inclusion                NA        NA             text       inclusion       
#>  7 inclusion_time           NA        NA             text       inclusion_time  
#>  8 dob                      NA        NA             text       dob             
#>  9 age                      NA        NA             text       age             
#> 10 age_integer              NA        NA             text       age_integer     
#> # ℹ 14 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <lgl>     <lgl>          <chr>      <chr>       
#> 1 record_id    NA        NA             text       record_id   
#> 2 sepal.length NA        NA             text       sepal.length
#> 3 sepal.width  NA        NA             text       sepal.width 
#> 4 petal.length NA        NA             text       petal.length
#> 5 petal.width  NA        NA             text       petal.width 
#> 6 species      NA        NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 32 × 12
#>    record_id   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>        <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1         1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2         2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3         3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4         4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5         5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6         6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7         7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8         8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9         9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10        10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> # ℹ 22 more rows
#> 
#> $meta
#> # A tibble: 12 × 18
#>    field_name form_name section_header field_type field_label
#>    <chr>      <lgl>     <lgl>          <chr>      <chr>      
#>  1 record_id  NA        NA             text       record_id  
#>  2 mpg        NA        NA             text       mpg        
#>  3 cyl        NA        NA             text       cyl        
#>  4 disp       NA        NA             text       disp       
#>  5 hp         NA        NA             text       hp         
#>  6 drat       NA        NA             text       drat       
#>  7 wt         NA        NA             text       wt         
#>  8 qsec       NA        NA             text       qsec       
#>  9 vs         NA        NA             text       vs         
#> 10 am         NA        NA             text       am         
#> 11 gear       NA        NA             text       gear       
#> 12 carb       NA        NA             text       carb       
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
data <- iris |>
  ds2dd_detailed(add.auto.id = TRUE) |>
  purrr::pluck("data")
#> A default id column has been added
names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)),
replace=TRUE,prob = rep(x=.5,2))}__{names(data)}")
data |> ds2dd_detailed(form.sep="__")
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    b         NA             text       record_id   
#> 2 sepal.length b         NA             text       sepal.length
#> 3 sepal.width  b         NA             text       sepal.width 
#> 4 petal.length a         NA             text       petal.length
#> 5 petal.width  a         NA             text       petal.width 
#> 6 species      a         NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#>