From 5c33453949e9d64a1e7a808a9bcad21885a2fb4f Mon Sep 17 00:00:00 2001 From: "J. Allen Baron" Date: Wed, 18 Sep 2024 15:20:13 -0400 Subject: [PATCH] Change output of suggest_regex() to tibble with stats Tests: PASS --- R/utils.R | 45 +++++++++++++++++++--- man/suggest_regex.Rd | 18 ++++++++- tests/testthat/test-utils.R | 75 +++++++++++++++++++++++++++++++++++-- 3 files changed, 127 insertions(+), 11 deletions(-) diff --git a/R/utils.R b/R/utils.R index 9bf3f6a..4b7e61b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -199,18 +199,35 @@ match_arg_several <- function(arg, choices) { #' position. #' #' @param x A character vector. +#' @param pivot Whether the resulting `tibble` should be in "wide" (default) or +#' "long" format. #' -#' @returns A string with a bracketed character set for each position. +#' @returns When `pivot = "long"`, a tidy `tibble` with 3 columns and as many +#' rows as the string length of the longest input: +#' 1. `position`: indicating the position of the character set in the input. +#' 2. `regex`: giving the character set (in brackets), +#' 3. `n`: the count of input strings that have a character at that `position`. +#' +#' When `pivot = "wide"` (default), a `tibble` with the same information +#' organized into rows (1 header and 2 normal rows) corresponding to the 3 +#' columns described. 
#' #' @examples #' x <- c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD") #' #' suggest_regex(x) +#' suggest_regex(x, "long") #' #' @export -suggest_regex <- function(x) { - x_len <- sort(unique(stringr::str_length(x))) - max_len <- max(x_len) +suggest_regex <- function(x, pivot = "wide") { + pivot <- match.arg(pivot, choices = c("wide", "long")) + out <- tibble::tibble(position = stringr::str_length(x)) |> + dplyr::count(.data$position) + max_len <- max(out$position) + missing_pos <- (1:max_len)[!1:max_len %in% out$position] + out <- out |> + tibble::add_row(position = missing_pos, n = max(out$n)) |> + dplyr::arrange(.data$position) xsplit <- stringr::str_split(x, "") @@ -232,7 +249,25 @@ suggest_regex <- function(x) { ) |> sandwich_text(c("[", "]")) - paste0(chr_at_pos, collapse = "") + out <- out |> + dplyr::mutate(regex = chr_at_pos, .after = "position") + + if (pivot == "wide") { + out <- out |> + dplyr::mutate(n = as.character(n)) |> + dplyr::rename(pos = position) |> + tidyr::pivot_longer( + cols = c("regex", "n"), + names_to = "position", + values_to = "value" + ) |> + tidyr::pivot_wider( + names_from = "pos", + values_from = "value" + ) + } + + out } diff --git a/man/suggest_regex.Rd b/man/suggest_regex.Rd index e8fa721..82625c2 100644 --- a/man/suggest_regex.Rd +++ b/man/suggest_regex.Rd @@ -4,13 +4,26 @@ \alias{suggest_regex} \title{Suggest a Regular Expression That Will Match All Input} \usage{ -suggest_regex(x) +suggest_regex(x, pivot = "wide") } \arguments{ \item{x}{A character vector.} + +\item{pivot}{Whether the resulting \code{tibble} should be in "wide" (default) or +"long" format.} } \value{ -A string with a bracketed character set for each position. +When \code{pivot = "long"}, a tidy \code{tibble} with 3 columns and as many +rows as the string length of the longest input: +\enumerate{ +\item \code{position}: indicating the position of the character set in the input. 
+\item \code{regex}: giving the character set (in brackets), +\item \code{n}: the count of input strings that have a character at that \code{position}. +} + +When \code{pivot = "wide"} (default), a \code{tibble} with the same information +organized into rows (1 header and 2 normal rows) corresponding to the 3 +columns described. } \description{ Collects the full set of characters found at each position across all strings @@ -22,5 +35,6 @@ position. x <- c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD") suggest_regex(x) +suggest_regex(x, "long") } diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index d2ba25a..ae3bdd9 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -163,17 +163,84 @@ test_that("match_arg_several() works for integer vectors", { # suggest_regex() tests --------------------------------------------------- -test_that("suggest_regex() works", { - expect_equal(suggest_regex(LETTERS), "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]") +test_that("suggest_regex(pivot = 'wide') works (default)", { + expected <- tibble::tibble( + position = c("regex", "n"), + `1` = c("[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "26") + ) + expect_equal(suggest_regex(LETTERS), expected) + + expected <- tibble::tibble( + position = c("regex", "n"), + `1` = c("[5ADMT]", "4"), + `2` = c("[2ACHN]", "4"), + `3` = c("[0ACDP]", "4"), + `4` = c("[12]", "2") + ) expect_equal( suggest_regex(c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD")), - "[5ADMT][2ACHN][0ACDP][12]" + expected + ) + + # testthat uses "C" for LC_COLLATE, so this order may differ from what is on + # a local machine; see https://github.com/r-lib/testthat/issues/1181 + expected <- tibble::tibble( + position = c("regex", "n"), + `1` = c("[Aa]", "2"), `2` = c("[Bb]", "2"), `3` = c("[Cc]", "2"), + `4` = c("[Dd]", "2"), `5` = c("[Ee]", "2"), `6` = c("[Ff]", "2"), + `7` = c("[Gg]", "2"), `8` = c("[Hh]", "2"), `9` = c("[Ii]", "2"), + `10` = c("[Jj]", "2"), `11` = c("[Kk]", "2"), `12` = c("[Ll]", "2"), + `13` = 
c("[Mm]", "2"), `14` = c("[Nn]", "2"), `15` = c("[Oo]", "2"), + `16` = c("[Pp]", "2"), `17` = c("[Qq]", "2"), `18` = c("[Rr]", "2"), + `19` = c("[Ss]", "2"), `20` = c("[Tt]", "2"), `21` = c("[Uu]", "2"), + `22` = c("[Vv]", "2"), `23` = c("[Ww]", "2"), `24` = c("[Xx]", "2"), + `25` = c("[Yy]", "2"), `26` = c("[Zz]", "2") ) expect_equal( suggest_regex( c(paste0(LETTERS, collapse = ""), paste0(letters, collapse = "")) ), - paste0("[", LETTERS, letters, "]", collapse = "") + expected + ) +}) + +test_that("suggest_regex(pivot = 'long') works", { + expected <- tibble::tibble( + position = 1L, + regex = "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", + n = 26 + ) + expect_equal(suggest_regex(LETTERS, "long"), expected) + + expected <- tibble::tibble( + position = 1:4, + regex = c("[5ADMT]", "[2ACHN]", "[0ACDP]", "[12]"), + n = c(4, 4, 4, 2) + ) + expect_equal( + suggest_regex(c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD"), "long"), + expected + ) + + # testthat uses "C" for LC_COLLATE, so this order may differ from what is on + # a local machine; see https://github.com/r-lib/testthat/issues/1181 + expected <- tibble::tibble( + position = 1:26, + regex = c( + "[Aa]", "[Bb]", "[Cc]", "[Dd]", "[Ee]", "[Ff]", "[Gg]", "[Hh]", + "[Ii]", "[Jj]", "[Kk]", "[Ll]", "[Mm]", "[Nn]", "[Oo]", "[Pp]", + "[Qq]", "[Rr]", "[Ss]", "[Tt]", "[Uu]", "[Vv]", "[Ww]", "[Xx]", + "[Yy]", "[Zz]" + ), + n = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2) + ) + expect_equal( + suggest_regex( + c(paste0(LETTERS, collapse = ""), paste0(letters, collapse = "")), + "long" + ), + expected ) })