REDCapCAST/SAS/REDCap_split.sas
Paul W. Egeler, MS, GStat a2dcdc73a9 Partial solve to issue #13
2019-12-27 23:37:12 -05:00

230 lines
6.6 KiB
SAS

/********************************************************************************
*
* FILE: REDCap_split.sas
*
* VERSION: 0.1.0
*
* PURPOSE: Take a REDCap dataset with multiple events and make into several
* tables with primary and foreign keys
*
* AUTHOR: Paul W. Egeler, M.S., GStat
*
* DATE: 22JUN2017
*
*******************************************************************************
*
* INSTRUCTIONS:
*
* 1. Run the SAS code provided by REDCap to load the records into your SAS session.
*
* 2. Download the data dictionary for your project.
*
* 3. Run the macro definitions REDCAP_READ_DATA_DICT and REDCAP_SPLIT
*
* 4. Run the macro call for REDCAP_READ_DATA_DICT to load in the data dictionry.
* This is necessary to split the tables correctly.
*
* 5. Run the macro call for REDCAP_SPLIT. You will have an output dataset for
* your main table as well as for each repeating instrument.
*
********************************************************************************/
%MACRO REDCAP_READ_DATA_DICT(
DATA_DICTIONARY /* The file path for the data dictionary */
);
DATA REDCAP_DATA_DICTIONARY;
LENGTH VAR_NAME $ 200 FORM_NAME $ 200 SECTION_HEADER $ 200 FIELD_TYPE $ 200 X1-X14 $ 2250;
INFILE "&DATA_DICTIONARY" FIRSTOBS = 2 DSD DLM = "," LRECL=32767;
INPUT VAR_NAME $ FORM_NAME $ SECTION_HEADER $ FIELD_TYPE $ X1-X14 $;
IF FIELD_TYPE EQ "descriptive" THEN DELETE;
DROP SECTION_HEADER X1-X14;
RUN;
%MEND REDCAP_READ_DATA_DICT;
%MACRO REDCAP_SPLIT(
DATA_SET = REDCAP /* The name of the SAS dataset created by REDCap */,
DATA_DICTIONARY = REDCAP_DATA_DICTIONARY /* The name of the SAS dataset of the data dictionary */,
NUMERIC_SUBTABLES = N /* Y/N: Should the subtables be numbered (Y) or should the be based on the name of the repeating instrument (N)? */
);
/* Find the key that links the base table to child tables */
/* Also check that REDCAP_REPEAT_INSTRUMENT is present in the data */
%LET DSID = %SYSFUNC(OPEN(&DATA_SET));
%LET KEY = %SYSFUNC(VARNAME(&DSID, 1));
%LET CHECK = %SYSFUNC(VARNUM(&DSID, REDCAP_REPEAT_INSTRUMENT));
%LET RC = %SYSFUNC(CLOSE(&DSID));
%IF &CHECK EQ 0 %THEN %DO;
%PUT ERROR: The dataset &DATA_SET does not contain repeating instruments.;
%PUT ERROR: Stopping MACRO.;
%GOTO FINISH;
%END;
/* Remove formatting from repeat instrument field */
%PUT NOTE: Removing formatting from REDCAP_REPEAT_INSTRUMENT in &DATA_SET..;
DATA &DATA_SET.;
SET &DATA_SET.;
FORMAT REDCAP_REPEAT_INSTRUMENT;
RUN;
/* Find the subtable names and number of subtables */
PROC SQL NOPRINT;
SELECT DISTINCT
REDCAP_REPEAT_INSTRUMENT,
"'"!!trim(REDCAP_REPEAT_INSTRUMENT)!!"'" AS INSTRUMENT_QUOTED
INTO
:INSTRUMENTS SEPARATED BY ' ',
:INSTRUMENT_LIST SEPARATED BY ','
FROM &DATA_SET AS A
WHERE REDCAP_REPEAT_INSTRUMENT GT '';
%LET N_SUBTABLES = &SQLOBS;
QUIT;
%IF &N_SUBTABLES EQ 0 %THEN %DO;
%PUT WARNING: There were no records containing repeating instruments in the dataset %LEFT(&DATA_SET).;
%PUT WARNING: No action was taken.;
%GOTO FINISH;
%END;
%PUT N SUBTABLES: %LEFT(&N_SUBTABLES);
%PUT INSTRUMENTS: %LEFT(&INSTRUMENTS);
%PUT INSTRUMENT LIST: %LEFT(%BQUOTE(&INSTRUMENT_LIST));
/* Get information on the variables in the dataset */
PROC CONTENTS
DATA = &DATA_SET.
OUT = REDCAP_VARNAMES(KEEP=NAME VARNUM)
NOPRINT;
RUN;
/* Make a list of fields and their associated forms based on data dictionary */
DATA REDCAP_FIELDS(KEEP=VAR_NAME FORM_NAME);
SET &DATA_DICTIONARY.;
IF FIELD_TYPE EQ "checkbox" THEN DO;
BASENAME = VAR_NAME;
DO I = 1 TO N;
SET REDCAP_VARNAMES POINT=I NOBS=N;
IF PRXMATCH("/^"!!trim(BASENAME)!!"___.+$/", NAME) THEN DO;
VAR_NAME = NAME;
OUTPUT;
END;
END;
END;
ELSE OUTPUT;
RUN;
/* Add instrument status fields to list of fields */
PROC SQL;
CREATE TABLE REDCAP_INSTRUMENT_STATUS_FIELDS AS
SELECT DISTINCT TRIM(FORM_NAME)!!"_complete" AS VAR_NAME LENGTH=200, FORM_NAME
FROM &DATA_DICTIONARY.;
QUIT;
PROC APPEND
BASE=REDCAP_FIELDS
DATA=REDCAP_INSTRUMENT_STATUS_FIELDS;
RUN;
DATA REDCAP_FIELDS;
SET REDCAP_FIELDS;
IF LENGTH(VAR_NAME) GT 32 THEN DO;
PUT "WARNING: The variable " VAR_NAME "is too long (MAX 32 CHARACTERS).";
PUT "WARNING: " VAR_NAME "will not be included in the output dataset.";
DELETE;
END;
RUN;
/* Sort out the field names */
PROC SQL NOPRINT;
SELECT VAR_NAME
INTO :VARS_BASE SEPARATED BY ' '
FROM REDCAP_FIELDS AS A
WHERE FORM_NAME NOT IN (&INSTRUMENT_LIST);
%put Base vars: &VARS_BASE;
%DO I = 1 %TO &N_SUBTABLES;
%LET INSTRUMENT_I = %SCAN(&INSTRUMENTS,&I,%STR( ));
SELECT VAR_NAME
INTO :VARS_&I. SEPARATED BY ' '
FROM REDCAP_FIELDS AS A
WHERE FORM_NAME EQ "&INSTRUMENT_I.";
%PUT &INSTRUMENT_I. vars: &&VARS_&I;
%END;
QUIT;
/* Make new data sets based on field names above */
DATA &DATA_SET._BASE (KEEP = &VARS_BASE);
SET &DATA_SET;
IF MISSING(REDCAP_REPEAT_INSTRUMENT);
RUN;
%DO I = 1 %TO &N_SUBTABLES;
%LET INSTRUMENT_I = %SCAN(&INSTRUMENTS,&I,%STR( ));
/* Create either numeric table name or name based on repeating instrument name (see issue #12) */
%IF &NUMERIC_SUBTABLES EQ N %THEN %DO;
%LET SUBTABLE_NAME = &DATA_SET._&INSTRUMENT_I.;
%IF %LENGTH(&SUBTABLE_NAME) GT 32 %THEN %DO;
%PUT ERROR: The table name &SUBTABLE_NAME is %LENGTH(&SUBTABLE_NAME) characters long (MAX 32 CHARACTERS).;
%PUT ERROR: &SUBTABLE_NAME will be skipped. Consider setting NUMERIC_SUBTABLES = Y.;
%GOTO SKIP;
%END;
%END;
%ELSE %DO;
%LET SUBTABLE_NAME = &DATA_SET._&I.;
%END;
DATA &SUBTABLE_NAME (KEEP = &KEY redcap_repeat_instance &&VARS_&I);
SET &DATA_SET;
IF REDCAP_REPEAT_INSTRUMENT EQ "&INSTRUMENT_I.";
RUN;
%PUT NOTE: Records from instrument &INSTRUMENT_I have been placed in &SUBTABLE_NAME;
%SKIP:
%END;
/* Clean up temporary datasets */
PROC DATASETS MEMTYPE=DATA LIBRARY=WORK NOLIST;
DELETE REDCAP_FIELDS REDCAP_VARNAMES REDCAP_INSTRUMENT_STATUS_FIELDS;
RUN;
%FINISH:
%MEND REDCAP_SPLIT;