diff --git a/README.md b/README.md
index 3e353d087..460c2a15f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
# OpenVINO Tokenizers
[![Downloads](https://static.pepy.tech/badge/openvino-tokenizers)](https://pepy.tech/project/openvino-tokenizers)
+[![Anaconda-Server Badge](https://anaconda.org/conda-forge/openvino-tokenizers/badges/downloads.svg)](https://anaconda.org/conda-forge/openvino-tokenizers)
OpenVINO Tokenizers adds text processing operations to OpenVINO.
@@ -335,8 +336,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
BPE |
- 96.20 |
- 4557 |
+ 96.25 |
+ 4774 |
SentencePiece |
@@ -494,6 +495,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
96.31 |
217 |
+
+ BPE |
+ tiiuae/falcon-7b |
+ 97.24 |
+ 217 |
+
SentencePiece |
NousResearch/Llama-2-13b-hf |
diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index 05f280bc6..8e5691b5c 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -40,7 +40,6 @@
NormalizeUnicode,
PaddingStep,
PreTokenizatinStep,
- PunctuationSplitStep,
RegexDecodingStep,
RegexNormalizationStep,
RegexSplitStep,
@@ -202,7 +201,7 @@ def normalization(self) -> None:
"Whitespace": lambda step_dict: RegexSplitStep.whitespace_splitter(),
"WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(),
"Split": parse_split_step,
- "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]),
+ "Punctuation": lambda step_dict: RegexSplitStep.punctuation_splitter(step_dict["behavior"]),
"ByteLevel": parse_byte_level_pretokenization_step,
"Digits": lambda step_dict: RegexSplitStep.digits_splitter(
"isolate" if step_dict["individual_digits"] else "contiguous"
diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
index bf5f8fb74..60830fce8 100644
--- a/python/openvino_tokenizers/tokenizer_pipeline.py
+++ b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -209,7 +209,7 @@ def __post_init__(self):
def vet_split_pattern(self) -> None:
if r"(?!\S)" in self.split_pattern:
# rewrite regex pattern to get results closer to qwen.cpp results
- logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
+            logger.warning(r"Replacing `(?!\S)` pattern with `(?:$|[^\S])` in RegexSplit operation")
self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")
if has_incompatible_re2_op(self.split_pattern):
@@ -282,6 +282,14 @@ def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
behaviour=behaviour,
)
+ @classmethod
+ def punctuation_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
+ return cls(
+ r"\p{P}",
+ invert=False,
+ behaviour=behaviour,
+ )
+
def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
return (
@@ -307,13 +315,6 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
return RegexSplitStep.whitespace_splitter().get_ov_subgraph(input_nodes).outputs()
-@dataclass
-class PunctuationSplitStep(PreTokenizatinStep):
- """Splits string on punctuation chars."""
-
- # behaviour: str = "Isolated"
-
-
@dataclass
class BytesToCharsStep(PreTokenizatinStep):
"""Maps chars to other chars for Byte-level BPE Tokenizer"""
diff --git a/tests/pass_rates.json b/tests/pass_rates.json
index 9ad83a001..d95944e32 100644
--- a/tests/pass_rates.json
+++ b/tests/pass_rates.json
@@ -1,3 +1,3 @@
{
- "tests/tokenizers_test.py::test_": 0.887905604719764
+ "tests/tokenizers_test.py::test_": 0.8896697795321075
}
\ No newline at end of file
diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py
index f677730af..efe129bf5 100644
--- a/tests/tokenizers_test.py
+++ b/tests/tokenizers_test.py
@@ -90,6 +90,7 @@ def unpack_strings(strings):
bpe_models = [
"NousResearch/Meta-Llama-3-8B-Instruct",
# "meta-llama/Meta-Llama-3-8B", # cannot be part of the CI
+ "tiiuae/falcon-7b",
"stabilityai/stablecode-completion-alpha-3b-4k",
"stabilityai/stablelm-tuned-alpha-7b",
"databricks/dolly-v2-3b",