Complete workflow. Comments are lacking a bit

2022-08-17 11:50:37 +02:00 · 2022-08-17 11:50:37 +02:00 · fd920d2027
commit fd920d2027
parent 897d391add
1 changed files with 158 additions and 38 deletions
--- a/conversion_upload.R
+++ b/conversion_upload.R
@ -3,7 +3,8 @@
 ## -----------------------------------------------------------------------------
 ## 
 ## This script is to create a datadictionary for REDCap, to upload data.
-## 
+## The data frame consits of severral excel sheets, nad the database already contains data
+## Instead of a complete datadictionary, individual sheets will be converted to instrument files.
 ## 

 ## -----------------------------------------------------------------------------
@ -20,22 +21,65 @@ library(dplyr)

 setwd("/Volumes/Data/toorisky/")

-dta<-read_dta("Alle apo Aarhus 2018.dta") %>% # Defined dataset
-  filter(VaskDiag==1) %>% 
-  mutate(treatment=factor(case_when(trombolyse!=2&trombektomi!=2 ~ 0,
-                                    trombolyse==2|trombektomi==2 ~ 1)),
-         sex.n=factor(ifelse(as.integer(substr(cpr, start = 10, stop = 10)) %%2 == 0, 
-                             "female", "male"))) %>%
-  left_join(.,read_excel("2022-02-08_ddsc_dataexport.xlsx", sheet = "Patienter"),
-            by=c("ForloebID"="forloebid"))%>%
-  left_join(.,read_excel("2022-02-08_ddsc_dataexport.xlsx", sheet = "3 mdr. opf."),
-            by=c("ForloebID"="ForloebID"))%>%
-  mutate(cpr=cpr.x,
-         ID=ID.x)%>%
-  dplyr::select(-starts_with("cpr."))%>%
-  dplyr::select(-starts_with("ID."))
+dput(excel_sheets("2022-02-08_ddsc_dataexport.xlsx")) # Getting sheet names

-colnames(dta)<-tolower(colnames(dta))
+shts_sel<-c("Patienter", # Selecting relevant sheets
+            "Basis-skema",
+            "Trombolyse",
+            "Trombektomi", 
+            "3 mdr. opf.")
+
+inst_names<-c("Patienter", # Renaming to avoid trailing numbers and spaces, as per REDCap
+              "Basisskema",
+              "Trombolyse",
+              "Trombektomi", 
+              "Followup") 
+
+ids<-read_dta("Alle apo Aarhus 2018.dta") |> # Defined dataset
+  filter(VaskDiag==1) |>
+  select("ForloebID") |>
+  data.frame()
+
+colnames(ids)<-tolower(colnames(ids))
+
+## Importing xls sheets and naming list from sheet names
+r_lup<-list()
+
+for (i in 1:length(shts_sel)){
+  xcl<-read_excel("2022-02-08_ddsc_dataexport.xlsx", sheet = shts_sel[i])|>
+    dplyr::select(-starts_with("cpr"))|>
+    dplyr::select(-starts_with("ID"))|>
+    data.frame()
+  
+  colnames(xcl)<-tolower(colnames(xcl))
+  
+  r_lup[[i]]<-xcl|>
+    filter(forloebid %in% ids$forloebid)
+  names(r_lup)[i]<-inst_names[i]
+  }
+
+cnames<-c()
+for (i in 1:length(r_lup)){
+  cnames<-c(cnames,colnames(r_lup[[i]]))
+}
+
+
+## Appending suffix for duplicated colnames from different sheets
+dup_names<-unique(cnames[duplicated(cnames)])[-1] # omit the first, "forloebid"
+
+for (i in 1:length(r_lup)){
+  # Getting sheetname bit
+  sht_name<-gsub('[ .]', '', names(r_lup)[i])|>
+    substr(1,6)|>
+    tolower()|>
+    (\(x) paste0("_",x))() # Lambda function in lack of better
+  
+  # Pasting
+  sht_dup<-paste0(colnames(r_lup[[i]])[colnames(r_lup[[i]]) %in% dup_names],sht_name)
+  
+  # Renaming
+  colnames(r_lup[[i]])[colnames(r_lup[[i]]) %in% dup_names] <- sht_dup
+}

 ## -----------------------------------------------------------------------------
 ## Fix missing record_id's for upload
@ -53,14 +97,17 @@ redcap <- redcap_read_oneshot(
  fields = c("record_id","forloebid")
 )$data

+## Adding and creating record_id's
+ids_r<-full_join(ids,redcap)

-## Joining and adding record_id's
-dta<-full_join(dta,redcap)
-
-n_na<-length(dta$record_id[is.na(dta$record_id)])
-n_id<-max(dta$record_id,na.rm=T)
+n_na<-length(ids_r$record_id[is.na(ids_r$record_id)])
+n_id<-max(ids_r$record_id,na.rm=T)
 # filter(!is.na(akut_ind))%>%
-dta$record_id[is.na(dta$record_id)]<-(n_id+1):(n_id+n_na) # Simple math
+ids_r$record_id[is.na(ids_r$record_id)]<-(n_id+1):(n_id+n_na) # Simple math
+
+for (i in 1:length(r_lup)){
+  r_lup[[i]]<-left_join(r_lup[[i]],ids_r)
+}

 ## -----------------------------------------------------------------------------
 ## Data dictionary
@ -68,32 +115,105 @@ dta$record_id[is.na(dta$record_id)]<-(n_id+1):(n_id+n_na) # Simple math

 setwd("/Users/au301842/nottreated/")

-icname<-colnames(read.csv("examlpe instrument.csv"))
-dd<-data.frame(matrix(ncol = length(icname),
-                      nrow=ncol(dta))) ## Data frame to collect all
+icname<-colnames(read.csv("example instrument.csv"))
+
+dd<-data.frame(matrix(ncol = length(icname))) ## Data frame to collect all
 colnames(dd)<-icname

+## Instrument for DataDictionary
+## Format dataset for REDCap upload
+
+for (i in 1:length(r_lup)){
+  dd_i<-data.frame(matrix(ncol = length(icname),nrow = ncol("[["(r_lup,i)))) ## Data frame to collect all
+  
+  colnames(dd_i)<-icname ## for easier reading
+  
  ## Variable names
-dd[1]<-colnames(dta)
+  dd_i[1]<-colnames("[["(r_lup,i))
  
  ## Form Name
-dd[2]<-"ddsc"
+  dd_i[2]<-names(r_lup)[i]
  
  ## Field Type
  # dd_i[4]<-ifelse(sapply(r_lup[[i]], class)=="factor","radio","text")
-dd[4]<-"text"
+  dd_i[4]<-"text"
  
-dd[5]<-colnames(dta)
+  ## Field Label
+  ## Using original attributes as field labels
+  fl<-lapply(r_lup[[i]], attr, "label")
+  for (j in 1:length(fl)){
+    fl[[j]]<-ifelse(is.null(fl[[j]]),
+                    names(fl)[[j]],
+                    fl[[j]])
+    ##  If no attributes, variable name is used as "placeholder"
+  }
+  dd_i[5]<-unlist(fl)

+  ## Merge all
+  dd<-rbind(dd,dd_i)
+}
+
+dd_exp<-dd %>% 
+  filter(!Variable...Field.Name %in% c("forloebid","record_id")) # These fields already exists in the database, and would cause conflicts

 ## -----------------------------------------------------------------------------
-## Instrument file
+## Instrument files
 ## -----------------------------------------------------------------------------

-write.csv(dd,"ddsc_instrument.csv",row.names = FALSE,na="")
+# Loop for doing it all - nice!
+# Splitting individual instruments from Data Dictionary file
+for (i in 1:length(inst_names)){
+  
+  if (!file.exists("instruments/")){ # Creating "instrument" subdirectory if it does not already exist
+  dir.create("instruments/", showWarnings = TRUE, recursive = FALSE, mode = "0777")}
+  
+  new_path<-paste0("instruments/",inst_names[i]) # Path for instrument folder
+  dir.create(new_path, showWarnings = FALSE, recursive = FALSE, mode = "0777") # Creating the folder path
+  
+  file.copy("OriginID.txt", # Copying origin file as per REDCap
+            new_path)
+  
+  dd_exp|>
+    filter(Form.Name==inst_names[i])|> # Filtering instrument entries
+    write.csv(paste0(new_path,"/instrument.csv"),row.names = FALSE,na="") # Exporting to folder 
+  
+  setwd(paste0("/Users/au301842/nottreated/",new_path)) # Setting wd in subfolder for zipping
+  zip(zipfile = inst_names[i], # Naming the zipfile according to instrument
+      files = dir(full.names = TRUE))
+  setwd("/Users/au301842/nottreated/")
+  
+  zip_file<-list.files(paste0("instruments/",inst_names[i]),
+                       pattern = ".zip", 
+                       full.names = TRUE)
+  
+  file.copy(zip_file, # Copy and remove zip-file for easy upload
+            "instruments/")
+  file.remove(zip_file)
+}

 ## -----------------------------------------------------------------------------
 ## Dataset export
 ## -----------------------------------------------------------------------------

-write.csv(dta,"ddsc_dataset.csv",row.names = FALSE)
+exp<-r_lup|>
+  purrr::reduce(full_join)|>
+  select(-matches("forloebid"))|>
+  select(record_id,everything()) # To arrange "record_id" as first column
+
+write.csv(exp,"ddsc_dataset.csv",row.names = FALSE,na="")
+
+
+## -----------------------------------------------------------------------------
+## Dataset export of remaing
+## -----------------------------------------------------------------------------
+
+## REDCap faces troubles importing too large data sets. Apparently.
+
+ups <- redcap_read_oneshot(
+  redcap_uri   = uri,
+  token        = token,
+  fields = "record_id"
+)$data
+
+write.csv(exp|>
+            filter(!record_id %in% ups$record_id),"ddsc_dataset_remain.csv",row.names = FALSE,na="")