New chunks_of_n() and n_chunks() functions

2025-04-23 22:36:38 +02:00 · 2023-09-05 10:07:33 -07:00 · 2023-09-05 10:07:33 -07:00 · 193844c212
commit 193844c212
parent f810a7af53
4 changed files with 202 additions and 0 deletions
--- a/R/chunks_of_n.R
+++ b/R/chunks_of_n.R
@ -0,0 +1,84 @@
+#' Split to chunks of size n
+#'
+#' @param d data. Can be vector or data frame.
+#' @param n number of chunks
+#' @param label naming prefix for chunk names
+#' @param even boolean to set if size of chunks should be evenly distributed.
+#' @param pattern regex pattern to extract names from provided vector. If data
+#' frame, will assume first column is name.
+#'
+#' @return List of length n
+#' @export
+#'
+#' @examples
+#' tail(chunks_of_n(seq_len(100),7),3)
+#' tail(chunks_of_n(seq_len(100),7,even=TRUE),3)
+#' ds <- data.frame(nm=paste0("Sub",
+#' sprintf("%03s", rownames(stRoke::talos))),stRoke::talos)
+#' head(chunks_of_n(ds,7,pattern="Sub[0-9]{3}",label="grp"),2)
+#' ## Please notice that no sorting is performed. This is on purpose to preserve
+#' ## original sorting. If sorting is intended, try something like this:
+#' ds[order(ds$nm),] |> chunks_of_n(7,pattern="Sub[0-9]{3}",label="grp") |> 
+#' head(2)
+
+chunks_of_n <- function(d,n,label=NULL, even=FALSE, pattern=NULL){
+  
+  if (!(is.vector(d) |
+        is.data.frame(d)) |
+      inherits(d,"list")) {
+    stop("Provided data is not vector or data.frame.")
+  }
+  
+  if (is.data.frame(d)) ns <- nrow(d) else ns <- length(d) 
+  
+  if (even) {
+    g <- sort(rep_len(seq_len(ceiling(ns / n)), ns))
+  } else {
+    g <- ceiling(seq_len(ns) / n)
+  }
+  
+  ls <- split(d, g)
+  
+  if (!is.null(pattern)) {
+    if(is.data.frame(d)) {
+      ns <- str_extract(d=d[[1]],pattern=pattern)
+      } else ns <- str_extract(d=d,pattern=pattern)
+    
+    
+    suffix <- do.call(c, lapply(split(ns, g), function(i) {
+      paste0(i[[1]], "-", i[[length(i)]])
+    }))
+  } else suffix <- names(ls)
+  
+  if (is.character(label)){
+    names(ls) <- paste0(label,"-",suffix)
+    } else names(ls) <- suffix
+  
+  ls
+}
+
+#' Splits in n chunks
+#'
+#' @param d data
+#' @param n number of chunks
+#' @param ... arguments passed to internal `chunks_of_n()`
+#'
+#' @return List of chunks
+#' @export
+#'
+#' @examples
+#' lengths(n_chunks(d=seq_len(100),n=7,even=TRUE))
+#' lengths(n_chunks(d=seq_len(100),n=7,even=FALSE))
+n_chunks <- function(d,n,...){
+  if (!(is.vector(d) |
+        is.data.frame(d)) |
+      inherits(d,"list")) {
+    stop("Provided data is not vector or data.frame.")
+  }
+  
+  if (is.data.frame(d)) ns <- nrow(d) else ns <- length(d) 
+  
+  nn <- ceiling(ns/n)
+  
+  chunks_of_n(d=d,n=nn,...)
+}
--- a/man/chunks_of_n.Rd
+++ b/man/chunks_of_n.Rd
@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chunks_of_n.R
+\name{chunks_of_n}
+\alias{chunks_of_n}
+\title{Split to chunks of size n}
+\usage{
+chunks_of_n(d, n, label = NULL, even = FALSE, pattern = NULL)
+}
+\arguments{
+\item{d}{data. Can be vector or data frame.}
+
+\item{n}{number of chunks}
+
+\item{label}{naming prefix for chunk names}
+
+\item{even}{boolean to set if size of chunks should be evenly distributed.}
+
+\item{pattern}{regex pattern to extract names from provided vector. If data
+frame, will assume first column is name.}
+}
+\value{
+List of length n
+}
+\description{
+Split to chunks of size n
+}
+\examples{
+tail(chunks_of_n(seq_len(100),7),3)
+tail(chunks_of_n(seq_len(100),7,even=TRUE),3)
+ds <- data.frame(nm=paste0("Sub",
+sprintf("\%03s", rownames(stRoke::talos))),stRoke::talos)
+head(chunks_of_n(ds,7,pattern="Sub[0-9]{3}",label="grp"),2)
+## Please notice that no sorting is performed. This is on purpose to preserve
+## original sorting. If sorting is intended, try something like this:
+ds[order(ds$nm),] |> chunks_of_n(7,pattern="Sub[0-9]{3}",label="grp") |> 
+head(2)
+}
--- a/man/n_chunks.Rd
+++ b/man/n_chunks.Rd
@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chunks_of_n.R
+\name{n_chunks}
+\alias{n_chunks}
+\title{Splits in n chunks}
+\usage{
+n_chunks(d, n, ...)
+}
+\arguments{
+\item{d}{data}
+
+\item{n}{number of chunks}
+
+\item{...}{arguments passed to internal \code{chunks_of_n()}}
+}
+\value{
+List of chunks
+}
+\description{
+Splits in n chunks
+}
+\examples{
+lengths(n_chunks(d=seq_len(100),n=7,even=TRUE))
+lengths(n_chunks(d=seq_len(100),n=7,even=FALSE))
+}
--- a/tests/testthat/test-chunks.R
+++ b/tests/testthat/test-chunks.R
@ -0,0 +1,56 @@
+# library(testthat)
+test_that("chunks_of_n returns correct", {
+  expect_length(chunks_of_n(seq_len(100), 7),15)
+  
+  expect_equal(lengths(chunks_of_n(seq_len(30), 7, even = TRUE), 
+                       use.names = FALSE), c(6, 6, 6, 6, 6))
+  
+  ds <- data.frame(nm = paste0("Sub",
+                               sprintf("%03s", rownames(stRoke::talos))), 
+                   stRoke::talos)
+  
+  is.list(ds)
+  
+  class(list(ds))
+  
+  expect_equal(head(names(chunks_of_n(ds, 7, 
+                                      pattern = "Sub[0-9]{3}", label = "grp")),
+                    1),"grp-Sub038-Sub011")
+  
+  expect_equal(
+    ds[order(ds$nm),] |> chunks_of_n(7, pattern = "Sub[0-9]{3}", label =
+                                       "grp") |>  head(1) |> names(),
+    "grp-Sub001-Sub020"
+  )
+  
+  ## Errors
+  expect_error(chunks_of_n(list(ds), 7, pattern = "Sub[0-9]{3}", label = "grp"))
+  
+  
+})
+
+test_that("n_chunks returns correct", {
+  expect_length(n_chunks(seq_len(100), 7),7)
+  
+  expect_equal(lengths(n_chunks(seq_len(30), 7, even = TRUE), 
+                       use.names = FALSE), rep(5,6))
+  
+  ds <- data.frame(nm = paste0("Sub",
+                               sprintf("%03s", rownames(stRoke::talos))), 
+                   stRoke::talos)
+  
+  expect_equal(head(names(n_chunks(ds, 7, 
+                                      pattern = "Sub[0-9]{3}", label = "grp")),
+                    1),"grp-Sub038-Sub603")
+  
+  expect_equal(
+    ds[order(ds$nm),] |> n_chunks(7, pattern = "Sub[0-9]{3}", label =
+                                       "grp") |>  head(1) |> names(),
+    "grp-Sub001-Sub072"
+  )
+  
+  ## Errors
+  expect_error(n_chunks(list(ds), 7, pattern = "Sub[0-9]{3}", label = "grp"))
+  
+  
+})