From fe390da678d5ba79f81807006029270b968baf7b Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Wed, 15 May 2024 00:52:08 +0400 Subject: [PATCH] Support Punctuation Split (#145) * Support Punctuation Split * Update Pass Rate Add Anaconda Downloads Counter * Update Pass Rate Add Anaconda Downloads Counter --- README.md | 11 +++++++++-- python/openvino_tokenizers/hf_parser.py | 3 +-- .../openvino_tokenizers/tokenizer_pipeline.py | 17 +++++++++-------- tests/pass_rates.json | 2 +- tests/tokenizers_test.py | 1 + 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 3e353d087..460c2a15f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # OpenVINO Tokenizers [![Downloads](https://static.pepy.tech/badge/openvino-tokenizers)](https://pepy.tech/project/openvino-tokenizers) +[![Anaconda-Server Badge](https://anaconda.org/conda-forge/openvino-tokenizers/badges/downloads.svg)](https://anaconda.org/conda-forge/openvino-tokenizers) OpenVINO Tokenizers adds text processing operations to OpenVINO. @@ -335,8 +336,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The BPE - 96.20 - 4557 + 96.25 + 4774 SentencePiece @@ -494,6 +495,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The 96.31 217 + + BPE + tiiuae/falcon-7b + 97.24 + 217 + SentencePiece NousResearch/Llama-2-13b-hf diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py index 05f280bc6..8e5691b5c 100644 --- a/python/openvino_tokenizers/hf_parser.py +++ b/python/openvino_tokenizers/hf_parser.py @@ -40,7 +40,6 @@ NormalizeUnicode, PaddingStep, PreTokenizatinStep, - PunctuationSplitStep, RegexDecodingStep, RegexNormalizationStep, RegexSplitStep, @@ -202,7 +201,7 @@ def normalization(self) -> None: "Whitespace": lambda step_dict: RegexSplitStep.whitespace_splitter(), "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(), "Split": parse_split_step, - "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]), + "Punctuation": lambda step_dict: RegexSplitStep.punctuation_splitter(step_dict["behavior"]), "ByteLevel": parse_byte_level_pretokenization_step, "Digits": lambda step_dict: RegexSplitStep.digits_splitter( "isolate" if step_dict["individual_digits"] else "contiguous" diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index bf5f8fb74..60830fce8 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -209,7 +209,7 @@ def __post_init__(self): def vet_split_pattern(self) -> None: if r"(?!\S)" in self.split_pattern: # rewrite regex pattern to get results closer to qwen.cpp results - logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation") + logger.warning(r"Replacing `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation") self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])") if has_incompatible_re2_op(self.split_pattern): @@ -282,6 +282,14 @@ def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep": behaviour=behaviour, ) + @classmethod + def punctuation_splitter(cls, behaviour="isolate") -> "RegexSplitStep": + return cls( + r"\p{P}", + invert=False, + behaviour=behaviour, + ) + def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs()) return ( @@ -307,13 +315,6 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: return RegexSplitStep.whitespace_splitter().get_ov_subgraph(input_nodes).outputs() -@dataclass -class PunctuationSplitStep(PreTokenizatinStep): - """Splits string on punctuation chars.""" - - # behaviour: str = "Isolated" - - @dataclass class BytesToCharsStep(PreTokenizatinStep): """Maps chars to other chars for Byte-level BPE Tokenizer""" diff --git a/tests/pass_rates.json b/tests/pass_rates.json index 9ad83a001..d95944e32 100644 --- a/tests/pass_rates.json +++ b/tests/pass_rates.json @@ -1,3 +1,3 @@ { - "tests/tokenizers_test.py::test_": 0.887905604719764 + "tests/tokenizers_test.py::test_": 0.8896697795321075 } \ No newline at end of file diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py index f677730af..efe129bf5 100644 --- a/tests/tokenizers_test.py +++ b/tests/tokenizers_test.py @@ -90,6 +90,7 @@ def unpack_strings(strings): bpe_models = [ "NousResearch/Meta-Llama-3-8B-Instruct", # "meta-llama/Meta-Llama-3-8B", # cannot be part of the CI + "tiiuae/falcon-7b", "stabilityai/stablecode-completion-alpha-3b-4k", "stabilityai/stablelm-tuned-alpha-7b", "databricks/dolly-v2-3b",