New str_extract() function to extract substrings by regex pattern.

2025-04-04 23:02:33 +02:00 · 2023-09-05 10:08:26 -07:00 · 2023-09-05 10:08:26 -07:00 · bd647a9acf
commit bd647a9acf
parent 193844c212
3 changed files with 78 additions and 0 deletions
--- a/R/str_extract.R
+++ b/R/str_extract.R
@ -0,0 +1,37 @@
 #' Extract string based on regex pattern
 #'
 #' Use base::strsplit to 
 #' @param d vector of character strings
 #' @param pattern regex pattern to match
 #'
 #' @return vector of character strings
 #' @export
 #'
 #' @examples
 #' ls <- do.call(c,lapply(sample(4:8,20,TRUE),function(i){
 #' paste(sample(letters,i,TRUE),collapse = "")}))
 #' ds <- do.call(c,lapply(1:20,function(i){
 #' paste(sample(ls,1),i,sample(ls,1),"23",sep = "_")}))
 #' str_extract(ds,"([0-9]+)")
 str_extract <- function(d,pattern){
  if (!is.vector(d)) stop("Please provide a vector")
  ## Drawing on the solution in REDCapCAST::strsplitx to split around pattern
  nl <- strsplit(gsub("~~", "~", # Removes double ~
                      gsub("^~", "", # Removes leading ~
                           gsub(
                             # Splits and inserts ~ at all delimiters
                             paste0("(", pattern, ")"), "~\\1~", d
                           ))), "~")
  ## Reusing the pattern, to sub with "" and match on length 0 to index the
  ## element containing the pattern. Only first occurance included.
  indx <- lapply(nl,function(i){
    match(0,nchar(sub(pattern,"",i)))
    })
  ## Using lapply to subsset the given index for each element in list
  do.call(c,lapply(seq_along(nl), function(i){
    nl[[i]][indx[[i]]]
  } ))
 }
--- a/man/str_extract.Rd
+++ b/man/str_extract.Rd
@ -0,0 +1,26 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/str_extract.R
 \name{str_extract}
 \alias{str_extract}
 \title{Extract string based on regex pattern}
 \usage{
 str_extract(d, pattern)
 }
 \arguments{
 \item{d}{vector of character strings}
 \item{pattern}{regex pattern to match}
 }
 \value{
 vector of character strings
 }
 \description{
 Use base::strsplit to
 }
 \examples{
 ls <- do.call(c,lapply(sample(4:8,20,TRUE),function(i){
 paste(sample(letters,i,TRUE),collapse = "")}))
 ds <- do.call(c,lapply(1:20,function(i){
 paste(sample(ls,1),i,sample(ls,1),"23",sep = "_")}))
 str_extract(ds,"([0-9]+)")
 }
--- a/tests/testthat/test-str_extract.R
+++ b/tests/testthat/test-str_extract.R
@ -0,0 +1,15 @@
 # library(testthat)
 test_that("str_extract returns correct", {
  ls <- do.call(c, lapply(sample(4:8, 20, T), function(i) {
    paste(sample(letters, i, T), collapse = "")
  }))
  ds <- do.call(c, lapply(1:20, function(i) {
    paste(sample(ls, 1), i, sample(ls, 1), "23", sep = "_")
  }))
  expect_equal(nchar(str_extract(ds, "([0-9]+)")),c(rep(1,9),rep(2,11)))
  expect_error(str_extract(data.frame(ds), "([0-9]+)"))
 })