From 5c33453949e9d64a1e7a808a9bcad21885a2fb4f Mon Sep 17 00:00:00 2001
From: "J. Allen Baron" <jabaron.phd@gmail.com>
Date: Wed, 18 Sep 2024 15:20:13 -0400
Subject: [PATCH] Change output of suggest_regex() to tibble with stats

Tests: PASS
---
 R/utils.R                   | 45 +++++++++++++++++++---
 man/suggest_regex.Rd        | 18 ++++++++-
 tests/testthat/test-utils.R | 75 +++++++++++++++++++++++++++++++++++--
 3 files changed, 127 insertions(+), 11 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 9bf3f6a..4b7e61b 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -199,18 +199,35 @@ match_arg_several <- function(arg, choices) {
 #' position.
 #'
 #' @param x A character vector.
+#' @param pivot Whether the resulting `tibble` should be in "wide" (default) or
+#' "long" format.
 #'
-#' @returns A string with a bracketed character set for each position.
+#' @returns When `pivot = "long"`, a tidy `tibble` with 3 columns and as many
+#' rows as the string length of the longest input:
+#' 1. `position`: indicating the position of the character set in the input.
+#' 2. `regex`: giving the character set (in brackets),
+#' 3. `n`: the count of input strings that have a character at that `position`.
+#'
+#' When `pivot = "wide"` (default), a `tibble` with the same information
+#' organized into rows (1 header and 2 normal rows) corresponding to the 3
+#' columns described.
 #'
 #' @examples
 #' x <- c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD")
 #'
 #' suggest_regex(x)
+#' suggest_regex(x, "long")
 #'
 #' @export
-suggest_regex <- function(x) {
-    x_len <- sort(unique(stringr::str_length(x)))
-    max_len <- max(x_len)
+suggest_regex <- function(x, pivot = "wide") {
+    pivot <- match.arg(pivot, choices = c("wide", "long"))
+    out <- tibble::tibble(position = stringr::str_length(x)) |>
+        dplyr::count(.data$position)
+    max_len <- max(out$position)
+    missing_pos <- (1:max_len)[!1:max_len %in% out$position]
+    out <- out |>
+        tibble::add_row(position = missing_pos, n = max(out$n)) |>
+        dplyr::arrange(.data$position)
 
     xsplit <- stringr::str_split(x, "")
 
@@ -232,7 +249,25 @@ suggest_regex <- function(x) {
         ) |>
         sandwich_text(c("[", "]"))
 
-    paste0(chr_at_pos, collapse = "")
+    out <- out |>
+        dplyr::mutate(regex = chr_at_pos, .after = "position")
+
+    if (pivot == "wide") {
+        out <- out |>
+            dplyr::mutate(n = as.character(n)) |>
+            dplyr::rename(pos = position) |>
+            tidyr::pivot_longer(
+                cols = c("regex", "n"),
+                names_to = "position",
+                values_to = "value"
+            ) |>
+            tidyr::pivot_wider(
+                names_from = "pos",
+                values_from = "value"
+            )
+    }
+
+    out
 }
 
 
diff --git a/man/suggest_regex.Rd b/man/suggest_regex.Rd
index e8fa721..82625c2 100644
--- a/man/suggest_regex.Rd
+++ b/man/suggest_regex.Rd
@@ -4,13 +4,26 @@
 \alias{suggest_regex}
 \title{Suggest a Regular Expression That Will Match All Input}
 \usage{
-suggest_regex(x)
+suggest_regex(x, pivot = "wide")
 }
 \arguments{
 \item{x}{A character vector.}
+
+\item{pivot}{Whether the resulting \code{tibble} should be in "wide" (default) or
+"long" format.}
 }
 \value{
-A string with a bracketed character set for each position.
+When \code{pivot = "long"}, a tidy \code{tibble} with 3 columns and as many
+rows as the string length of the longest input:
+\enumerate{
+\item \code{position}: indicating the position of the character set in the input.
+\item \code{regex}: giving the character set (in brackets),
+\item \code{n}: the count of input strings that have a character at that \code{position}.
+}
+
+When \code{pivot = "wide"} (default), a \code{tibble} with the same information
+organized into rows (1 header and 2 normal rows) corresponding to the 3
+columns described.
 }
 \description{
 Collects the full set of characters found at each position across all strings
@@ -22,5 +35,6 @@ position.
 x <- c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD")
 
 suggest_regex(x)
+suggest_regex(x, "long")
 
 }
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index d2ba25a..ae3bdd9 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -163,17 +163,84 @@ test_that("match_arg_several() works for integer vectors", {
 
 # suggest_regex() tests ---------------------------------------------------
 
-test_that("suggest_regex() works", {
-    expect_equal(suggest_regex(LETTERS), "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]")
+test_that("suggest_regex(pivot = 'wide') works (default)", {
+    expected <- tibble::tibble(
+        position = c("regex", "n"),
+        `1` = c("[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "26")
+    )
+    expect_equal(suggest_regex(LETTERS), expected)
+
+    expected <- tibble::tibble(
+        position = c("regex", "n"),
+        `1` = c("[5ADMT]", "4"),
+        `2` = c("[2ACHN]", "4"),
+        `3` = c("[0ACDP]", "4"),
+        `4` = c("[12]", "2")
+    )
     expect_equal(
         suggest_regex(c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD")),
-        "[5ADMT][2ACHN][0ACDP][12]"
+        expected
+    )
+
+    # testthat uses "C" for LC_COLLATE, so this order may differ from what is on
+    #   a local machine; see https://github.com/r-lib/testthat/issues/1181
+    expected <- tibble::tibble(
+        position = c("regex", "n"),
+        `1` = c("[Aa]", "2"), `2` = c("[Bb]", "2"), `3` = c("[Cc]", "2"),
+        `4` = c("[Dd]", "2"), `5` = c("[Ee]", "2"), `6` = c("[Ff]", "2"),
+        `7` = c("[Gg]", "2"), `8` = c("[Hh]", "2"), `9` = c("[Ii]", "2"),
+        `10` = c("[Jj]", "2"), `11` = c("[Kk]", "2"), `12` = c("[Ll]", "2"),
+        `13` = c("[Mm]", "2"), `14` = c("[Nn]", "2"), `15` = c("[Oo]", "2"),
+        `16` = c("[Pp]", "2"), `17` = c("[Qq]", "2"), `18` = c("[Rr]", "2"),
+        `19` = c("[Ss]", "2"), `20` = c("[Tt]", "2"), `21` = c("[Uu]", "2"),
+        `22` = c("[Vv]", "2"), `23` = c("[Ww]", "2"), `24` = c("[Xx]", "2"),
+        `25` = c("[Yy]", "2"), `26` = c("[Zz]", "2")
     )
     expect_equal(
         suggest_regex(
             c(paste0(LETTERS, collapse = ""), paste0(letters, collapse = ""))
         ),
-        paste0("[", LETTERS, letters, "]", collapse = "")
+        expected
+    )
+})
+
+test_that("suggest_regex(pivot = 'long') works", {
+    expected <- tibble::tibble(
+        position = 1L,
+        regex = "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]",
+        n = 26
+    )
+    expect_equal(suggest_regex(LETTERS, "long"), expected)
+
+    expected <- tibble::tibble(
+        position = 1:4,
+        regex = c("[5ADMT]", "[2ACHN]", "[0ACDP]", "[12]"),
+        n = c(4, 4, 4, 2)
+    )
+    expect_equal(
+        suggest_regex(c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD"), "long"),
+        expected
+    )
+
+    # testthat uses "C" for LC_COLLATE, so this order may differ from what is on
+    #   a local machine; see https://github.com/r-lib/testthat/issues/1181
+    expected <- tibble::tibble(
+        position = 1:26,
+        regex = c(
+            "[Aa]", "[Bb]", "[Cc]",   "[Dd]", "[Ee]", "[Ff]", "[Gg]", "[Hh]",
+            "[Ii]", "[Jj]", "[Kk]",  "[Ll]", "[Mm]", "[Nn]", "[Oo]", "[Pp]",
+            "[Qq]", "[Rr]", "[Ss]", "[Tt]", "[Uu]", "[Vv]", "[Ww]", "[Xx]",
+            "[Yy]", "[Zz]"
+        ),
+        n = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+              2, 2, 2, 2)
+    )
+    expect_equal(
+        suggest_regex(
+            c(paste0(LETTERS, collapse = ""), paste0(letters, collapse = "")),
+            "long"
+        ),
+        expected
     )
 })