Skip to content

Commit

Permalink
Change output of suggest_regex() to tibble with stats
Browse files Browse the repository at this point in the history
Tests: PASS
  • Loading branch information
allenbaron committed Nov 2, 2024
1 parent 4dbe518 commit 5c33453
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 11 deletions.
45 changes: 40 additions & 5 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -199,18 +199,35 @@ match_arg_several <- function(arg, choices) {
#' position.
#'
#' @param x A character vector.
#' @param pivot Whether the resulting `tibble` should be in "wide" (default) or
#' "long" format.
#'
#' @returns A string with a bracketed character set for each position.
#' @returns When `pivot = "long"`, a tidy `tibble` with 3 columns and as many
#' rows as the string length of the longest input:
#' 1. `position`: indicating the position of the character set in the input.
#' 2. `regex`: giving the character set (in brackets).
#' 3. `n`: the count of input strings whose length equals that `position`
#' (positions matching no input length are assigned the largest such count).
#'
#' When `pivot = "wide"` (default), a `tibble` with the same information
#' organized into rows (1 header and 2 normal rows) corresponding to the 3
#' columns described.
#'
#' @examples
#' x <- c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD")
#'
#' suggest_regex(x)
#' suggest_regex(x, "long")
#'
#' @export
suggest_regex <- function(x) {
x_len <- sort(unique(stringr::str_length(x)))
max_len <- max(x_len)
suggest_regex <- function(x, pivot = "wide") {
pivot <- match.arg(pivot, choices = c("wide", "long"))
out <- tibble::tibble(position = stringr::str_length(x)) |>
dplyr::count(.data$position)
max_len <- max(out$position)
missing_pos <- (1:max_len)[!1:max_len %in% out$position]
out <- out |>
tibble::add_row(position = missing_pos, n = max(out$n)) |>
dplyr::arrange(.data$position)

xsplit <- stringr::str_split(x, "")

Expand All @@ -232,7 +249,25 @@ suggest_regex <- function(x) {
) |>
sandwich_text(c("[", "]"))

paste0(chr_at_pos, collapse = "")
out <- out |>
dplyr::mutate(regex = chr_at_pos, .after = "position")

if (pivot == "wide") {
out <- out |>
dplyr::mutate(n = as.character(n)) |>
dplyr::rename(pos = position) |>
tidyr::pivot_longer(
cols = c("regex", "n"),
names_to = "position",
values_to = "value"
) |>
tidyr::pivot_wider(
names_from = "pos",
values_from = "value"
)
}

out
}


Expand Down
18 changes: 16 additions & 2 deletions man/suggest_regex.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

75 changes: 71 additions & 4 deletions tests/testthat/test-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -163,17 +163,84 @@ test_that("match_arg_several() works for integer vectors", {

# suggest_regex() tests ---------------------------------------------------

test_that("suggest_regex() works", {
expect_equal(suggest_regex(LETTERS), "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]")
test_that("suggest_regex(pivot = 'wide') works (default)", {
expected <- tibble::tibble(
position = c("regex", "n"),
`1` = c("[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "26")
)
expect_equal(suggest_regex(LETTERS), expected)

expected <- tibble::tibble(
position = c("regex", "n"),
`1` = c("[5ADMT]", "4"),
`2` = c("[2ACHN]", "4"),
`3` = c("[0ACDP]", "4"),
`4` = c("[12]", "2")
)
expect_equal(
suggest_regex(c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD")),
"[5ADMT][2ACHN][0ACDP][12]"
expected
)

# testthat uses "C" for LC_COLLATE, so this order may differ from the order on
# a local machine; see https://github.com/r-lib/testthat/issues/1181
expected <- tibble::tibble(
position = c("regex", "n"),
`1` = c("[Aa]", "2"), `2` = c("[Bb]", "2"), `3` = c("[Cc]", "2"),
`4` = c("[Dd]", "2"), `5` = c("[Ee]", "2"), `6` = c("[Ff]", "2"),
`7` = c("[Gg]", "2"), `8` = c("[Hh]", "2"), `9` = c("[Ii]", "2"),
`10` = c("[Jj]", "2"), `11` = c("[Kk]", "2"), `12` = c("[Ll]", "2"),
`13` = c("[Mm]", "2"), `14` = c("[Nn]", "2"), `15` = c("[Oo]", "2"),
`16` = c("[Pp]", "2"), `17` = c("[Qq]", "2"), `18` = c("[Rr]", "2"),
`19` = c("[Ss]", "2"), `20` = c("[Tt]", "2"), `21` = c("[Uu]", "2"),
`22` = c("[Vv]", "2"), `23` = c("[Ww]", "2"), `24` = c("[Xx]", "2"),
`25` = c("[Yy]", "2"), `26` = c("[Zz]", "2")
)
expect_equal(
suggest_regex(
c(paste0(LETTERS, collapse = ""), paste0(letters, collapse = ""))
),
paste0("[", LETTERS, letters, "]", collapse = "")
expected
)
})

test_that("suggest_regex(pivot = 'long') works", {
  # Single-character inputs: one row, one character set at position 1.
  single_pos <- tibble::tibble(
    position = 1L,
    regex = "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]",
    n = 26
  )
  expect_equal(suggest_regex(LETTERS, "long"), single_pos)

  # Inputs of mixed length (3 and 4 characters): one row per position.
  mixed_input <- c("DNA", "MHC", "TAP1", "TAP2", "520", "ACD")
  mixed_len <- tibble::tibble(
    position = 1:4,
    regex = c("[5ADMT]", "[2ACHN]", "[0ACDP]", "[12]"),
    n = c(4, 4, 4, 2)
  )
  expect_equal(suggest_regex(mixed_input, "long"), mixed_len)

  # testthat uses "C" for LC_COLLATE, so this order may differ from the order
  # on a local machine; see https://github.com/r-lib/testthat/issues/1181
  alphabet_input <- c(
    paste0(LETTERS, collapse = ""),
    paste0(letters, collapse = "")
  )
  both_cases <- tibble::tibble(
    position = 1:26,
    regex = c(
      "[Aa]", "[Bb]", "[Cc]", "[Dd]", "[Ee]", "[Ff]", "[Gg]", "[Hh]",
      "[Ii]", "[Jj]", "[Kk]", "[Ll]", "[Mm]", "[Nn]", "[Oo]", "[Pp]",
      "[Qq]", "[Rr]", "[Ss]", "[Tt]", "[Uu]", "[Vv]", "[Ww]", "[Xx]",
      "[Yy]", "[Zz]"
    ),
    n = rep(2, 26)
  )
  expect_equal(suggest_regex(alphabet_input, "long"), both_cases)
})

Expand Down

0 comments on commit 5c33453

Please sign in to comment.