From d48da092ed76a906d6f9ccdb0a958595a0e9f24d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 4 Dec 2024 12:24:59 +0400 Subject: [PATCH] Support Already Converted Llava Tokenizers (#333) --- src/regex_normalization.cpp | 3 ++- tests/layer_tests.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/regex_normalization.cpp b/src/regex_normalization.cpp index dd5b5b79..37b4ed87 100644 --- a/src/regex_normalization.cpp +++ b/src/regex_normalization.cpp @@ -31,7 +31,8 @@ std::string reformat_replace_pattern(std::string replace_pattern) { const std::map<std::string, std::string> search_pattern_rewrites = { {R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))", R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))"}, - {R"((^)(.))", R"((^)([\s\S]))"} + {R"((^)(.))", R"((^)([\s\S]))"}, + {R"((^)(.+))", R"((^)([\s\S]))"} }; /** diff --git a/tests/layer_tests.py b/tests/layer_tests.py index 17ed84f4..f46b1be9 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -154,6 +154,14 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled replace_term=r"▁\2", ) ), + ( # test backward compatibility with old regex + "\n", + "▁\n", + RegexNormalizationStep( + regex_search_pattern=r"(^)(.+)", + replace_term=r"▁$2", + ) + ), ] ) def test_regex_normalization(test_string, expected, layer):