To make the easiest possible transition from spreadsheet/dataset to REDCap, I have created a small app, which adds a graphical interface to the casting of a data dictionary and data upload. Install the package and launch the app as follows:
REDCapCAST::shiny_cast()
The app primarily wraps one function:
ds2dd_detailed()
.
library(REDCapCAST)
ds <- REDCap_split(
records = redcapcast_data,
metadata = redcapcast_meta,
forms = "all"
) |>
sanitize_split() |>
redcap_wider()
#> Joining with `by = join_by(record_id)`
#> Joining with `by = join_by(record_id)`
#> Joining with `by = join_by(record_id)`
str(ds)
#> 'data.frame': 6 obs. of 52 variables:
#> $ record_id : num 1 2 3 4 5 6
#> $ cpr : chr "1203401OB4" "0102342303" "2301569823" "0204051342" ...
#> $ inclusion : Date, format: "2023-03-13" "2023-03-01" ...
#> $ inclusion_time : 'hms' num 12:38:49 10:38:57 12:01:07 20:39:19 ...
#> ..- attr(*, "units")= chr "secs"
#> $ dob : Date, format: "1940-03-12" "1934-02-01" ...
#> $ age : num 83 89.1 66.1 117.9 126.2 ...
#> $ age_integer : num 83 89 66 117 126 91
#> $ sex : chr "female" "male" "male" "female" ...
#> $ cohabitation : chr "Yes" "Yes" "No" NA ...
#> $ hypertension : chr "No" "No" "Yes" NA ...
#> $ diabetes : chr "Yes" "No" "Yes" NA ...
#> $ region : chr "East" "South" "North" NA ...
#> $ baseline_data_start_complete: chr "Incomplete" "Incomplete" "Incomplete" "Incomplete" ...
#> $ mrs_assessed_inclusion : chr "Yes" "Yes" NA NA ...
#> $ mrs_assessed_follow1 : chr NA "Yes" "Yes" NA ...
#> $ mrs_assessed_follow2 : chr NA NA "Yes" NA ...
#> $ mrs_date_inclusion : Date, format: "2023-03-13" "2023-03-07" ...
#> $ mrs_date_follow1 : Date, format: NA "2023-03-09" ...
#> $ mrs_date_follow2 : Date, format: NA NA ...
#> $ mrs_score_inclusion : num 1 1 NA NA NA NA
#> $ mrs_score_follow1 : num NA 3 2 NA NA NA
#> $ mrs_score_follow2 : num NA NA 1 NA NA NA
#> $ mrs_complete_inclusion : chr "Incomplete" "Incomplete" NA NA ...
#> $ mrs_complete_follow1 : chr NA "Incomplete" "Incomplete" NA ...
#> $ mrs_complete_follow2 : chr NA NA "Incomplete" NA ...
#> $ con_mrs : logi NA NA NA NA NA NA
#> $ con_calc : logi NA NA NA NA NA NA
#> $ consensus_complete : chr NA NA NA NA ...
#> $ event_datetime_1_follow1 : POSIXct, format: NA "2024-01-18 12:49:42" ...
#> $ event_datetime_1_follow2 : POSIXct, format: NA NA ...
#> $ event_age_1_follow1 : num NA NA NA 96 127 NA
#> $ event_age_1_follow2 : num NA NA NA 118 NA NA
#> $ event_type_1_follow1 : chr NA "TIA" "AIS" "TIA" ...
#> $ event_type_1_follow2 : chr NA NA "ICH" "AIS" ...
#> $ new_event_complete_1_follow1: chr NA "Incomplete" "Incomplete" "Complete" ...
#> $ new_event_complete_1_follow2: chr NA NA "Incomplete" "Complete" ...
#> $ event_datetime_2_follow1 : POSIXct, format: NA NA ...
#> $ event_datetime_2_follow2 : POSIXct, format: NA NA ...
#> $ event_datetime_3_follow1 : POSIXct, format: NA NA ...
#> $ event_datetime_3_follow2 : POSIXct, format: NA NA ...
#> $ event_age_2_follow1 : num NA NA NA 105 127 NA
#> $ event_age_2_follow2 : num NA NA NA 118 NA NA
#> $ event_age_3_follow1 : num NA NA NA NA NA NA
#> $ event_age_3_follow2 : num NA NA NA 118 NA NA
#> $ event_type_2_follow1 : chr NA NA "ICH" "TIA" ...
#> $ event_type_2_follow2 : chr NA NA "TIA" "ICH" ...
#> $ event_type_3_follow1 : chr NA NA NA NA ...
#> $ event_type_3_follow2 : chr NA NA "AIS" "Unknown" ...
#> $ new_event_complete_2_follow1: chr NA NA "Incomplete" "Complete" ...
#> $ new_event_complete_2_follow2: chr NA NA "Incomplete" "Incomplete" ...
#> $ new_event_complete_3_follow1: chr NA NA NA NA ...
#> $ new_event_complete_3_follow2: chr NA NA "Incomplete" "Complete" ...
ds|>
ds2dd_detailed()|>
purrr::pluck("data") |>
str()
#> tibble [6 × 52] (S3: tbl_df/tbl/data.frame)
#> $ record_id : num [1:6] 1 2 3 4 5 6
#> $ cpr : chr [1:6] "1203401OB4" "0102342303" "2301569823" "0204051342" ...
#> $ inclusion : Date[1:6], format: "2023-03-13" "2023-03-01" ...
#> $ inclusion_time : chr [1:6] "12:38:49" "10:38:57" "12:01:07" "20:39:19" ...
#> $ dob : Date[1:6], format: "1940-03-12" "1934-02-01" ...
#> $ age : num [1:6] 83 89.1 66.1 117.9 126.2 ...
#> $ age_integer : num [1:6] 83 89 66 117 126 91
#> $ sex : chr [1:6] "female" "male" "male" "female" ...
#> $ cohabitation : chr [1:6] "Yes" "Yes" "No" NA ...
#> $ hypertension : chr [1:6] "No" "No" "Yes" NA ...
#> $ diabetes : chr [1:6] "Yes" "No" "Yes" NA ...
#> $ region : chr [1:6] "East" "South" "North" NA ...
#> $ baseline_data_start_complete: chr [1:6] "Incomplete" "Incomplete" "Incomplete" "Incomplete" ...
#> $ mrs_assessed_inclusion : chr [1:6] "Yes" "Yes" NA NA ...
#> $ mrs_assessed_follow1 : chr [1:6] NA "Yes" "Yes" NA ...
#> $ mrs_assessed_follow2 : chr [1:6] NA NA "Yes" NA ...
#> $ mrs_date_inclusion : Date[1:6], format: "2023-03-13" "2023-03-07" ...
#> $ mrs_date_follow1 : Date[1:6], format: NA "2023-03-09" ...
#> $ mrs_date_follow2 : Date[1:6], format: NA NA ...
#> $ mrs_score_inclusion : num [1:6] 1 1 NA NA NA NA
#> $ mrs_score_follow1 : num [1:6] NA 3 2 NA NA NA
#> $ mrs_score_follow2 : num [1:6] NA NA 1 NA NA NA
#> $ mrs_complete_inclusion : chr [1:6] "Incomplete" "Incomplete" NA NA ...
#> $ mrs_complete_follow1 : chr [1:6] NA "Incomplete" "Incomplete" NA ...
#> $ mrs_complete_follow2 : chr [1:6] NA NA "Incomplete" NA ...
#> $ con_mrs : Factor w/ 2 levels "FALSE","TRUE": NA NA NA NA NA NA
#> $ con_calc : Factor w/ 2 levels "FALSE","TRUE": NA NA NA NA NA NA
#> $ consensus_complete : chr [1:6] NA NA NA NA ...
#> $ event_datetime_1_follow1 : POSIXct[1:6], format: NA "2024-01-18 12:49:42" ...
#> $ event_datetime_1_follow2 : POSIXct[1:6], format: NA NA ...
#> $ event_age_1_follow1 : num [1:6] NA NA NA 96 127 NA
#> $ event_age_1_follow2 : num [1:6] NA NA NA 118 NA NA
#> $ event_type_1_follow1 : chr [1:6] NA "TIA" "AIS" "TIA" ...
#> $ event_type_1_follow2 : chr [1:6] NA NA "ICH" "AIS" ...
#> $ new_event_complete_1_follow1: chr [1:6] NA "Incomplete" "Incomplete" "Complete" ...
#> $ new_event_complete_1_follow2: chr [1:6] NA NA "Incomplete" "Complete" ...
#> $ event_datetime_2_follow1 : POSIXct[1:6], format: NA NA ...
#> $ event_datetime_2_follow2 : POSIXct[1:6], format: NA NA ...
#> $ event_datetime_3_follow1 : POSIXct[1:6], format: NA NA ...
#> $ event_datetime_3_follow2 : POSIXct[1:6], format: NA NA ...
#> $ event_age_2_follow1 : num [1:6] NA NA NA 105 127 NA
#> $ event_age_2_follow2 : num [1:6] NA NA NA 118 NA NA
#> $ event_age_3_follow1 : num [1:6] NA NA NA NA NA NA
#> $ event_age_3_follow2 : num [1:6] NA NA NA 118 NA NA
#> $ event_type_2_follow1 : chr [1:6] NA NA "ICH" "TIA" ...
#> $ event_type_2_follow2 : chr [1:6] NA NA "TIA" "ICH" ...
#> $ event_type_3_follow1 : chr [1:6] NA NA NA NA ...
#> $ event_type_3_follow2 : chr [1:6] NA NA "AIS" "Unknown" ...
#> $ new_event_complete_2_follow1: chr [1:6] NA NA "Incomplete" "Complete" ...
#> $ new_event_complete_2_follow2: chr [1:6] NA NA "Incomplete" "Incomplete" ...
#> $ new_event_complete_3_follow1: chr [1:6] NA NA NA NA ...
#> $ new_event_complete_3_follow2: chr [1:6] NA NA "Incomplete" "Complete" ...
ds|>
ds2dd_detailed()|>
purrr::pluck("meta") |>
head(10)
#> # A tibble: 10 × 18
#> field_name form_name section_header field_type field_label
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 record_id data NA text record_id
#> 2 cpr data NA text cpr
#> 3 inclusion data NA text inclusion
#> 4 inclusion_time data NA text inclusion_time
#> 5 dob data NA text dob
#> 6 age data NA text age
#> 7 age_integer data NA text age_integer
#> 8 sex data NA text sex
#> 9 cohabitation data NA text cohabitation
#> 10 hypertension data NA text hypertension
#> # ℹ 13 more variables: select_choices_or_calculations <chr>, field_note <lgl>,
#> # text_validation_type_or_show_slider_number <chr>,
#> # text_validation_min <lgl>, text_validation_max <lgl>, identifier <lgl>,
#> # branching_logic <lgl>, required_field <lgl>, custom_alignment <lgl>,
#> # question_number <lgl>, matrix_group_name <lgl>, matrix_ranking <lgl>,
#> # field_annotation <lgl>
Different data formats are accepted, all of which mostly implement the
readr::col_guess()
functionality to parse column
classes.
To ensure uniformity in data import, this parsing has been implemented
on its own to use with ds2dd_detailed()
or any other data
set for that matter:
ds_parsed <- redcapcast_data |>
dplyr::mutate(dplyr::across(dplyr::everything(),as.character)) |>
parse_data()
str(ds_parsed)
#> tibble [25 × 27] (S3: tbl_df/tbl/data.frame)
#> $ record_id : num [1:25] 1 2 2 2 3 3 3 3 3 3 ...
#> $ redcap_event_name : chr [1:25] "inclusion" "inclusion" "follow1" "follow1" ...
#> $ redcap_repeat_instrument : chr [1:25] NA NA NA "New Event (?)" ...
#> $ redcap_repeat_instance : num [1:25] NA NA NA 1 NA NA NA 1 2 1 ...
#> $ cpr : chr [1:25] "1203401OB4" "0102342303" NA NA ...
#> $ inclusion : Date[1:25], format: "2023-03-13" "2023-03-01" ...
#> $ inclusion_time : 'hms' num [1:25] 12:38:49 10:38:57 NA NA ...
#> ..- attr(*, "units")= chr "secs"
#> $ dob : Date[1:25], format: "1940-03-12" "1934-02-01" ...
#> $ age : num [1:25] 83 89.1 NA NA 66.1 ...
#> $ age_integer : num [1:25] 83 89 NA NA 66 NA NA NA NA NA ...
#> $ sex : chr [1:25] "female" "male" NA NA ...
#> $ cohabitation : chr [1:25] "Yes" "Yes" NA NA ...
#> $ hypertension : chr [1:25] "No" "No" NA NA ...
#> $ diabetes : chr [1:25] "Yes" "No" NA NA ...
#> $ region : chr [1:25] "East" "South" NA NA ...
#> $ baseline_data_start_complete: chr [1:25] "Incomplete" "Incomplete" NA NA ...
#> $ mrs_assessed : chr [1:25] "Yes" "Yes" "Yes" NA ...
#> $ mrs_date : Date[1:25], format: "2023-03-13" "2023-03-07" ...
#> $ mrs_score : num [1:25] 1 1 3 NA NA 2 1 NA NA NA ...
#> $ mrs_complete : chr [1:25] "Incomplete" "Incomplete" "Incomplete" NA ...
#> $ con_mrs : logi [1:25] NA NA NA NA NA NA ...
#> $ con_calc : logi [1:25] NA NA NA NA NA NA ...
#> $ consensus_complete : chr [1:25] NA NA "Incomplete" NA ...
#> $ event_datetime : POSIXct[1:25], format: NA NA ...
#> $ event_age : num [1:25] NA NA NA NA NA NA NA NA NA NA ...
#> $ event_type : chr [1:25] NA NA NA "TIA" ...
#> $ new_event_complete : chr [1:25] NA NA NA "Incomplete" ...
#> - attr(*, "problems")=<externalptr>
It will ignore specified columns, which is neat for numeric-looking strings like cpr with a leading 0:
redcapcast_data |>
dplyr::mutate(dplyr::across(dplyr::everything(),as.character)) |>
parse_data(ignore.vars = c("record_id","cpr")) |>
str()
#> tibble [25 × 27] (S3: tbl_df/tbl/data.frame)
#> $ record_id : chr [1:25] "1" "2" "2" "2" ...
#> $ redcap_event_name : chr [1:25] "inclusion" "inclusion" "follow1" "follow1" ...
#> $ redcap_repeat_instrument : chr [1:25] NA NA NA "New Event (?)" ...
#> $ redcap_repeat_instance : num [1:25] NA NA NA 1 NA NA NA 1 2 1 ...
#> $ cpr : chr [1:25] "1203401OB4" "0102342303" NA NA ...
#> $ inclusion : Date[1:25], format: "2023-03-13" "2023-03-01" ...
#> $ inclusion_time : 'hms' num [1:25] 12:38:49 10:38:57 NA NA ...
#> ..- attr(*, "units")= chr "secs"
#> $ dob : Date[1:25], format: "1940-03-12" "1934-02-01" ...
#> $ age : num [1:25] 83 89.1 NA NA 66.1 ...
#> $ age_integer : num [1:25] 83 89 NA NA 66 NA NA NA NA NA ...
#> $ sex : chr [1:25] "female" "male" NA NA ...
#> $ cohabitation : chr [1:25] "Yes" "Yes" NA NA ...
#> $ hypertension : chr [1:25] "No" "No" NA NA ...
#> $ diabetes : chr [1:25] "Yes" "No" NA NA ...
#> $ region : chr [1:25] "East" "South" NA NA ...
#> $ baseline_data_start_complete: chr [1:25] "Incomplete" "Incomplete" NA NA ...
#> $ mrs_assessed : chr [1:25] "Yes" "Yes" "Yes" NA ...
#> $ mrs_date : Date[1:25], format: "2023-03-13" "2023-03-07" ...
#> $ mrs_score : num [1:25] 1 1 3 NA NA 2 1 NA NA NA ...
#> $ mrs_complete : chr [1:25] "Incomplete" "Incomplete" "Incomplete" NA ...
#> $ con_mrs : logi [1:25] NA NA NA NA NA NA ...
#> $ con_calc : logi [1:25] NA NA NA NA NA NA ...
#> $ consensus_complete : chr [1:25] NA NA "Incomplete" NA ...
#> $ event_datetime : POSIXct[1:25], format: NA NA ...
#> $ event_age : num [1:25] NA NA NA NA NA NA NA NA NA NA ...
#> $ event_type : chr [1:25] NA NA NA "TIA" ...
#> $ new_event_complete : chr [1:25] NA NA NA "Incomplete" ...
#> - attr(*, "problems")=<externalptr>
Column classes can be passed to parse_data()
.
Making a few crude assumptions for factorising data,
numchar2fct()
factorises numerical and character vectors
based on a set threshold for unique values:
mtcars |> str()
#> 'data.frame': 32 obs. of 11 variables:
#> $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
#> $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
#> $ disp: num 160 160 108 258 360 ...
#> $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
#> $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
#> $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
#> $ qsec: num 16.5 17 18.6 19.4 17 ...
#> $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
#> $ am : num 1 1 1 0 0 0 0 0 0 0 ...
#> $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
#> $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
mtcars |>
numchar2fct(numeric.threshold = 6) |>
str()
#> 'data.frame': 32 obs. of 11 variables:
#> $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
#> $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
#> $ disp: num 160 160 108 258 360 ...
#> $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
#> $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
#> $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
#> $ qsec: num 16.5 17 18.6 19.4 17 ...
#> $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#> $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
#> $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
#> $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...
ds_parsed|>
numchar2fct(numeric.threshold = 2) |>
str()
#> Warning: There were 13 warnings in `dplyr::mutate()`.
#> The first warning was:
#> ℹ In argument: `dplyr::across(...)`.
#> Caused by warning in `.roman2numeric()`:
#> ! invalid roman numerals: inclusion inclusion follow1 follow1 inclusion follow1 follow2 follow1 follow1 follow2 follow2 follow2 inclusion follow1 follow2 follow1 follow1 follow2 follow2 follow2 inclusion follow1 follow1 follow1 inclusion
#> ℹ Run `dplyr::last_dplyr_warnings()` to see the 12 remaining warnings.
#> tibble [25 × 27] (S3: tbl_df/tbl/data.frame)
#> $ record_id : num [1:25] 1 2 2 2 3 3 3 3 3 3 ...
#> $ redcap_event_name : Factor w/ 3 levels "inclusion","follow1",..: 1 1 2 2 1 2 3 2 2 3 ...
#> $ redcap_repeat_instrument : Factor w/ 1 level "New Event (?)": NA NA NA 1 NA NA NA 1 1 1 ...
#> $ redcap_repeat_instance : num [1:25] NA NA NA 1 NA NA NA 1 2 1 ...
#> $ cpr : chr [1:25] "1203401OB4" "0102342303" NA NA ...
#> $ inclusion : Date[1:25], format: "2023-03-13" "2023-03-01" ...
#> $ inclusion_time : 'hms' num [1:25] 12:38:49 10:38:57 NA NA ...
#> ..- attr(*, "units")= chr "secs"
#> $ dob : Date[1:25], format: "1940-03-12" "1934-02-01" ...
#> $ age : num [1:25] 83 89.1 NA NA 66.1 ...
#> $ age_integer : num [1:25] 83 89 NA NA 66 NA NA NA NA NA ...
#> $ sex : Factor w/ 2 levels "female","male": 1 2 NA NA 2 NA NA NA NA NA ...
#> $ cohabitation : Factor w/ 2 levels "Yes","No": 1 1 NA NA 2 NA NA NA NA NA ...
#> $ hypertension : Factor w/ 2 levels "No","Yes": 1 1 NA NA 2 NA NA NA NA NA ...
#> $ diabetes : Factor w/ 2 levels "Yes","No": 1 2 NA NA 1 NA NA NA NA NA ...
#> $ region : Factor w/ 3 levels "East","South",..: 1 2 NA NA 3 NA NA NA NA NA ...
#> $ baseline_data_start_complete: Factor w/ 2 levels "Incomplete","Complete": 1 1 NA NA 1 NA NA NA NA NA ...
#> $ mrs_assessed : Factor w/ 1 level "Yes": 1 1 1 NA NA 1 1 NA NA NA ...
#> $ mrs_date : Date[1:25], format: "2023-03-13" "2023-03-07" ...
#> $ mrs_score : num [1:25] 1 1 3 NA NA 2 1 NA NA NA ...
#> $ mrs_complete : Factor w/ 1 level "Incomplete": 1 1 1 NA 1 1 1 NA NA NA ...
#> $ con_mrs : logi [1:25] NA NA NA NA NA NA ...
#> $ con_calc : logi [1:25] NA NA NA NA NA NA ...
#> $ consensus_complete : Factor w/ 1 level "Incomplete": NA NA 1 NA NA 1 1 NA NA NA ...
#> $ event_datetime : POSIXct[1:25], format: NA NA ...
#> $ event_age : num [1:25] NA NA NA NA NA NA NA NA NA NA ...
#> $ event_type : Factor w/ 4 levels "TIA","AIS","ICH",..: NA NA NA 1 NA NA NA 2 3 3 ...
#> $ new_event_complete : Factor w/ 2 levels "Incomplete","Complete": NA NA NA 1 NA NA NA 1 1 1 ...
#> - attr(*, "problems")=<externalptr>