New str_extract() function to extract substrings by regex pattern.

2025-04-25 23:20:46 +02:00 · 2023-09-05 10:08:26 -07:00 · 2023-09-05 10:08:26 -07:00 · bd647a9acf
commit bd647a9acf
parent 193844c212
3 changed files with 78 additions and 0 deletions
--- a/R/str_extract.R
+++ b/R/str_extract.R
@ -0,0 +1,37 @@
+#' Extract string based on regex pattern
+#'
+#' Uses base R to return the first match of the pattern in each element.
+#' @param d vector of character strings
+#' @param pattern regex pattern to match
+#'
+#' @return vector of character strings; NA where the pattern does not match
+#' @export
+#'
+#' @examples
+#' ls <- do.call(c,lapply(sample(4:8,20,TRUE),function(i){
+#' paste(sample(letters,i,TRUE),collapse = "")}))
+#' ds <- do.call(c,lapply(1:20,function(i){
+#' paste(sample(ls,1),i,sample(ls,1),"23",sep = "_")}))
+#' str_extract(ds,"([0-9]+)")
+str_extract <- function(d, pattern) {
+  # Data frames, factors and other non-vector objects are rejected up front.
+  if (!is.vector(d)) stop("Please provide a vector")
+
+  # regmatches() needs character input, so lists and numbers are coerced
+  # explicitly before matching.
+  txt <- as.character(d)
+
+  # regexpr() reports only the first match per element (-1 when absent, NA
+  # for missing input). Working from match positions means no delimiter has
+  # to be injected into the data, so strings that already contain special
+  # characters such as "~" are extracted correctly.
+  loc <- regexpr(pattern, txt)
+  found <- !is.na(loc) & loc != -1L
+
+  # Start from all-NA so unmatched elements stay NA and empty input yields
+  # character(0) rather than NULL.
+  out <- rep(NA_character_, length(txt))
+  # regmatches() returns one string per located match, in element order.
+  out[found] <- regmatches(txt, loc)
+  out
+}
--- a/man/str_extract.Rd
+++ b/man/str_extract.Rd
@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/str_extract.R
+\name{str_extract}
+\alias{str_extract}
+\title{Extract string based on regex pattern}
+\usage{
+str_extract(d, pattern)
+}
+\arguments{
+\item{d}{vector of character strings}
+
+\item{pattern}{regex pattern to match}
+}
+\value{
+vector of character strings; NA where the pattern does not match
+}
+\description{
+Uses base R to return the first match of the pattern in each element.
+}
+\examples{
+ls <- do.call(c,lapply(sample(4:8,20,TRUE),function(i){
+paste(sample(letters,i,TRUE),collapse = "")}))
+ds <- do.call(c,lapply(1:20,function(i){
+paste(sample(ls,1),i,sample(ls,1),"23",sep = "_")}))
+str_extract(ds,"([0-9]+)")
+}
--- a/tests/testthat/test-str_extract.R
+++ b/tests/testthat/test-str_extract.R
@ -0,0 +1,15 @@
+# library(testthat)
+test_that("str_extract returns correct", {
+  # Random letter chunks of length 4-8; letters only, so they hold no digits.
+  ls <- do.call(c, lapply(sample(4:8, 20, TRUE), function(i) {
+    paste(sample(letters, i, TRUE), collapse = "")
+  }))
+  # Build "<letters>_<i>_<letters>_23"; the first number is i, not the 23.
+  ds <- do.call(c, lapply(1:20, function(i) {
+    paste(sample(ls, 1), i, sample(ls, 1), "23", sep = "_")
+  }))
+  # Only the first match is returned: 1-9 have one digit, 10-20 have two.
+  expect_equal(nchar(str_extract(ds, "([0-9]+)")), c(rep(1, 9), rep(2, 11)))
+  expect_equal(str_extract(ds, "([0-9]+)"), as.character(1:20))
+  expect_error(str_extract(data.frame(ds), "([0-9]+)"))
+})