Skip to contents

Extract data from Stata file for data dictionary

Usage

ds2dd_detailed(
  data,
  add.auto.id = FALSE,
  date.format = "dmy",
  form.name = NULL,
  form.sep = NULL,
  form.prefix = TRUE,
  field.type = NULL,
  field.label = NULL,
  field.label.attr = "label",
  field.validation = NULL,
  metadata = names(REDCapCAST::redcapcast_meta),
  validate.time = FALSE,
  time.var.sel.pos = "[Tt]i[d(me)]",
  time.var.sel.neg = "[Dd]at[eo]"
)

Arguments

data

data frame

add.auto.id

flag to add id column

date.format

date format, character string. ymd/dmy/mdy. Default is dmy.

form.name

manually specify form name(s). Vector of length 1 or ncol(data). Default is NULL and "data" is used.

form.sep

If the supplied dataset has form names as suffix or prefix to the column/variable names, the separator can be specified. If supplied, the form.name is ignored. Default is NULL.

form.prefix

Flag to set if form is prefix (TRUE) or suffix (FALSE) to the column names. Assumes all columns have pre- or suffix if specified.

field.type

manually specify field type(s). Vector of length 1 or ncol(data). Default is NULL and "text" is used for everything but factors, which will get "radio".

field.label

manually specify field label(s). Vector of length 1 or ncol(data). Default is NULL and colnames(data) is used or attribute `field.label.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).

field.label.attr

attribute name for named labels for haven_labelled data set (imported .dta file with `haven::read_dta()`). Default is "label".

field.validation

manually specify field validation(s). Vector of length 1 or ncol(data). Default is NULL and `levels()` are used for factors or attribute `factor.labels.attr` for haven_labelled data set (imported .dta file with `haven::read_dta()`).

metadata

REDCap metadata headings. Default is `names(REDCapCAST::redcapcast_meta)`.

validate.time

Flag to validate guessed time columns

time.var.sel.pos

Positive selection regex string passed to `guess_time_only_filter()` as sel.pos.

time.var.sel.neg

Negative selection regex string passed to `guess_time_only_filter()` as sel.neg.

Value

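list of length 2 (`data` and `meta`). If `validate.time = TRUE`, a list of the guessed time columns (`is.POSIX`, `is.datetime` and `is.time_only`) is returned instead, as shown in the first example.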

Details

This function is a natural development of the ds2dd() function. It assumes that the first column is the ID column; no checks are performed. Please always inspect the data dictionary before upload.

Ensure that the data set is formatted with as much information as possible.

`field.type` can be supplied

Examples

data <- REDCapCAST::redcapcast_data
data |> ds2dd_detailed(validate.time = TRUE)
#> $is.POSIX
#> # A tibble: 19 × 2
#>    inclusion_time event_datetime     
#>    <time>         <dttm>             
#>  1 12:38:49       NA                 
#>  2 10:38:57       NA                 
#>  3       NA       NA                 
#>  4       NA       2024-01-18 12:49:42
#>  5 12:01:07       NA                 
#>  6       NA       NA                 
#>  7       NA       NA                 
#>  8       NA       2024-01-18 12:49:58
#>  9       NA       2024-01-18 12:50:01
#> 10       NA       2024-01-18 12:50:05
#> 11       NA       2024-01-18 12:50:07
#> 12       NA       2024-01-18 12:50:09
#> 13 20:39:19       NA                 
#> 14       NA       NA                 
#> 15       NA       2024-01-18 12:50:19
#> 16       NA       2024-01-18 12:50:22
#> 17       NA       2024-01-18 12:50:24
#> 18 08:50:31       NA                 
#> 19 08:49:28       NA                 
#> 
#> $is.datetime
#> # A tibble: 19 × 1
#>    event_datetime     
#>    <dttm>             
#>  1 NA                 
#>  2 NA                 
#>  3 NA                 
#>  4 2024-01-18 12:49:42
#>  5 NA                 
#>  6 NA                 
#>  7 NA                 
#>  8 2024-01-18 12:49:58
#>  9 2024-01-18 12:50:01
#> 10 2024-01-18 12:50:05
#> 11 2024-01-18 12:50:07
#> 12 2024-01-18 12:50:09
#> 13 NA                 
#> 14 NA                 
#> 15 2024-01-18 12:50:19
#> 16 2024-01-18 12:50:22
#> 17 2024-01-18 12:50:24
#> 18 NA                 
#> 19 NA                 
#> 
#> $is.time_only
#> # A tibble: 19 × 1
#>    inclusion_time
#>    <time>        
#>  1 12:38:49      
#>  2 10:38:57      
#>  3       NA      
#>  4       NA      
#>  5 12:01:07      
#>  6       NA      
#>  7       NA      
#>  8       NA      
#>  9       NA      
#> 10       NA      
#> 11       NA      
#> 12       NA      
#> 13 20:39:19      
#> 14       NA      
#> 15       NA      
#> 16       NA      
#> 17       NA      
#> 18 08:50:31      
#> 19 08:49:28      
#> 
data |> ds2dd_detailed()
#> $data
#> # A tibble: 19 × 24
#>    record_id redcap_event_name redcap_repeat_instrument redcap_repeat_instance
#>        <dbl> <chr>             <chr>                                     <dbl>
#>  1         1 inclusion         NA                                           NA
#>  2         2 inclusion         NA                                           NA
#>  3         2 follow1           NA                                           NA
#>  4         2 follow1           New Event (?)                                 1
#>  5         3 inclusion         NA                                           NA
#>  6         3 follow1           NA                                           NA
#>  7         3 follow2           NA                                           NA
#>  8         3 follow1           New Event (?)                                 1
#>  9         3 follow1           New Event (?)                                 2
#> 10         3 follow2           New Event (?)                                 1
#> 11         3 follow2           New Event (?)                                 2
#> 12         3 follow2           New Event (?)                                 3
#> 13         4 inclusion         NA                                           NA
#> 14         4 follow2           NA                                           NA
#> 15         4 follow2           New Event (?)                                 1
#> 16         4 follow2           New Event (?)                                 2
#> 17         4 follow2           New Event (?)                                 3
#> 18         5 inclusion         NA                                           NA
#> 19         6 inclusion         NA                                           NA
#> # ℹ 20 more variables: cpr <chr>, inclusion <date>, inclusion_time <chr>,
#> #   dob <date>, age <dbl>, age_integer <dbl>, sex <chr>, cohabitation <chr>,
#> #   hypertension <chr>, diabetes <chr>, region <chr>,
#> #   baseline_data_start_complete <chr>, mrs_assessed <chr>, mrs_date <date>,
#> #   mrs_score <dbl>, mrs_complete <chr>, event_datetime <dttm>,
#> #   event_age <lgl>, event_type <chr>, new_event_complete <chr>
#> 
#> $meta
#> # A tibble: 24 × 18
#>    field_name               form_name section_header field_type field_label     
#>    <chr>                    <chr>     <lgl>          <chr>      <chr>           
#>  1 record_id                data      NA             text       record_id       
#>  2 redcap_event_name        data      NA             text       redcap_event_na…
#>  3 redcap_repeat_instrument data      NA             text       redcap_repeat_i…
#>  4 redcap_repeat_instance   data      NA             text       redcap_repeat_i…
#>  5 cpr                      data      NA             text       cpr             
#>  6 inclusion                data      NA             text       inclusion       
#>  7 inclusion_time           data      NA             text       inclusion_time  
#>  8 dob                      data      NA             text       dob             
#>  9 age                      data      NA             text       age             
#> 10 age_integer              data      NA             text       age_integer     
#> # ℹ 14 more rows
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
iris |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    data      NA             text       record_id   
#> 2 sepal.length data      NA             text       sepal.length
#> 3 sepal.width  data      NA             text       sepal.width 
#> 4 petal.length data      NA             text       petal.length
#> 5 petal.width  data      NA             text       petal.width 
#> 6 species      data      NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
iris |>
  ds2dd_detailed(
    add.auto.id = TRUE,
    form.name = sample(c("b", "c"), size = 6, replace = TRUE, prob = rep(.5, 2))
  ) |>
  purrr::pluck("meta")
#> A default id column has been added
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    c         NA             text       record_id   
#> 2 sepal.length b         NA             text       sepal.length
#> 3 sepal.width  b         NA             text       sepal.width 
#> 4 petal.length b         NA             text       petal.length
#> 5 petal.width  c         NA             text       petal.width 
#> 6 species      c         NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
mtcars |> ds2dd_detailed(add.auto.id = TRUE)
#> A default id column has been added
#> $data
#> # A tibble: 32 × 12
#>    record_id   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>        <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1         1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2         2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3         3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4         4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5         5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6         6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7         7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8         8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9         9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10        10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> # ℹ 22 more rows
#> 
#> $meta
#> # A tibble: 12 × 18
#>    field_name form_name section_header field_type field_label
#>    <chr>      <chr>     <lgl>          <chr>      <chr>      
#>  1 record_id  data      NA             text       record_id  
#>  2 mpg        data      NA             text       mpg        
#>  3 cyl        data      NA             text       cyl        
#>  4 disp       data      NA             text       disp       
#>  5 hp         data      NA             text       hp         
#>  6 drat       data      NA             text       drat       
#>  7 wt         data      NA             text       wt         
#>  8 qsec       data      NA             text       qsec       
#>  9 vs         data      NA             text       vs         
#> 10 am         data      NA             text       am         
#> 11 gear       data      NA             text       gear       
#> 12 carb       data      NA             text       carb       
#> # ℹ 13 more variables: select_choices_or_calculations <lgl>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#> 
data <- iris |>
  ds2dd_detailed(add.auto.id = TRUE) |>
  purrr::pluck("data")
#> A default id column has been added
names(data) <- glue::glue("{sample(x = c('a','b'),size = length(names(data)),
replace=TRUE,prob = rep(x=.5,2))}__{names(data)}")
data |> ds2dd_detailed(form.sep = "__")
#> $data
#> # A tibble: 150 × 6
#>    record_id sepal.length sepal.width petal.length petal.width species
#>        <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1         1          5.1         3.5          1.4         0.2 setosa 
#>  2         2          4.9         3            1.4         0.2 setosa 
#>  3         3          4.7         3.2          1.3         0.2 setosa 
#>  4         4          4.6         3.1          1.5         0.2 setosa 
#>  5         5          5           3.6          1.4         0.2 setosa 
#>  6         6          5.4         3.9          1.7         0.4 setosa 
#>  7         7          4.6         3.4          1.4         0.3 setosa 
#>  8         8          5           3.4          1.5         0.2 setosa 
#>  9         9          4.4         2.9          1.4         0.2 setosa 
#> 10        10          4.9         3.1          1.5         0.1 setosa 
#> # ℹ 140 more rows
#> 
#> $meta
#> # A tibble: 6 × 18
#>   field_name   form_name section_header field_type field_label 
#>   <chr>        <chr>     <lgl>          <chr>      <chr>       
#> 1 record_id    b         NA             text       record_id   
#> 2 sepal.length b         NA             text       sepal.length
#> 3 sepal.width  b         NA             text       sepal.width 
#> 4 petal.length b         NA             text       petal.length
#> 5 petal.width  b         NA             text       petal.width 
#> 6 species      b         NA             radio      species     
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> #   text_validation_type_or_show_slider_number <chr>,
#> #   text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> #   branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> #   question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> #   field_annotation <lgl>
#>