Support Punctuation Split (#145)
* Support Punctuation Split

* Update Pass Rate

Add Anaconda Downloads Counter
apaniukov authored May 14, 2024
1 parent c58b0a5 commit fe390da
Showing 5 changed files with 21 additions and 13 deletions.
11 changes: 9 additions & 2 deletions README.md
@@ -1,6 +1,7 @@
 # OpenVINO Tokenizers

 [![Downloads](https://static.pepy.tech/badge/openvino-tokenizers)](https://pepy.tech/project/openvino-tokenizers)
+[![Anaconda-Server Badge](https://anaconda.org/conda-forge/openvino-tokenizers/badges/downloads.svg)](https://anaconda.org/conda-forge/openvino-tokenizers)

 OpenVINO Tokenizers adds text processing operations to OpenVINO.

@@ -335,8 +336,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <tbody>
 <tr>
 <td >BPE</td>
-<td >96.20</td>
-<td >4557</td>
+<td >96.25</td>
+<td >4774</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
@@ -494,6 +495,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <td >96.31</td>
 <td >217</td>
 </tr>
+<tr>
+<td >BPE</td>
+<td >tiiuae/falcon-7b</td>
+<td >97.24</td>
+<td >217</td>
+</tr>
 <tr>
 <td >SentencePiece</td>
 <td >NousResearch/Llama-2-13b-hf</td>
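The new `tiiuae/falcon-7b` row in the pass-rate table above is presumably tied to this change: that model's `tokenizer.json` includes a `Punctuation` pre-tokenizer, which the library can now convert. A brief usage sketch built on the conversion API this README documents (the model choice mirrors the test added below; exact output names may vary by version):

```python
# Converting a tokenizer whose pre-tokenization includes a "Punctuation" step.
# Requires: pip install transformers openvino openvino-tokenizers
from transformers import AutoTokenizer
from openvino import compile_model
from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
ov_tokenizer = convert_tokenizer(hf_tokenizer)    # OpenVINO model with a string input
compiled_tokenizer = compile_model(ov_tokenizer)
print(compiled_tokenizer(["Hello, world!"]))      # dict with input_ids / attention_mask
```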
3 changes: 1 addition & 2 deletions python/openvino_tokenizers/hf_parser.py
@@ -40,7 +40,6 @@
     NormalizeUnicode,
     PaddingStep,
     PreTokenizatinStep,
-    PunctuationSplitStep,
     RegexDecodingStep,
     RegexNormalizationStep,
     RegexSplitStep,
@@ -202,7 +201,7 @@ def normalization(self) -> None:
     "Whitespace": lambda step_dict: RegexSplitStep.whitespace_splitter(),
     "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(),
     "Split": parse_split_step,
-    "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]),
+    "Punctuation": lambda step_dict: RegexSplitStep.punctuation_splitter(step_dict["behavior"]),
     "ByteLevel": parse_byte_level_pretokenization_step,
     "Digits": lambda step_dict: RegexSplitStep.digits_splitter(
         "isolate" if step_dict["individual_digits"] else "contiguous"
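For context on the change above: the `Punctuation` entry in the parser's pre-tokenization dispatch table now reuses the generic regex splitter instead of the dedicated (and now removed) `PunctuationSplitStep`. A minimal self-contained sketch of that dispatch pattern — the table name and the stub class here are simplifications, not the library's exact code:

```python
# Simplified sketch of the hf_parser.py dispatch shown above. RegexSplitStep is
# stubbed out here, and PRE_TOKENIZER_STEPS is an assumed name for the table.
from dataclasses import dataclass
from typing import Any, Callable, Dict


@dataclass
class RegexSplitStep:
    split_pattern: str
    invert: bool = False
    behaviour: str = "remove"

    @classmethod
    def punctuation_splitter(cls, behaviour: str = "isolate") -> "RegexSplitStep":
        # \p{P} matches any Unicode punctuation character
        return cls(r"\p{P}", invert=False, behaviour=behaviour)


PRE_TOKENIZER_STEPS: Dict[str, Callable[[Dict[str, Any]], RegexSplitStep]] = {
    "Punctuation": lambda step_dict: RegexSplitStep.punctuation_splitter(step_dict["behavior"]),
}

# A tokenizer.json pre-tokenizer entry of this shape selects the new factory:
step = PRE_TOKENIZER_STEPS["Punctuation"]({"type": "Punctuation", "behavior": "isolated"})
print(step)  # RegexSplitStep(split_pattern='\\p{P}', invert=False, behaviour='isolated')
```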
17 changes: 9 additions & 8 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -209,7 +209,7 @@ def __post_init__(self):
     def vet_split_pattern(self) -> None:
         if r"(?!\S)" in self.split_pattern:
             # rewrite regex pattern to get results closer to qwen.cpp results
-            logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
+            logger.warning(r"Replacing `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
             self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")

         if has_incompatible_re2_op(self.split_pattern):
@@ -282,6 +282,14 @@ def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
             behaviour=behaviour,
         )

+    @classmethod
+    def punctuation_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
+        return cls(
+            r"\p{P}",
+            invert=False,
+            behaviour=behaviour,
+        )
+
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
         return (
@@ -307,13 +315,6 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         return RegexSplitStep.whitespace_splitter().get_ov_subgraph(input_nodes).outputs()


-@dataclass
-class PunctuationSplitStep(PreTokenizatinStep):
-    """Splits string on punctuation chars."""
-
-    # behaviour: str = "Isolated"
-
-
 @dataclass
 class BytesToCharsStep(PreTokenizatinStep):
     """Maps chars to other chars for Byte-level BPE Tokenizer"""
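To see what the new splitter does, here is a plain-Python approximation of the `\p{P}` pattern with "isolate" behaviour, using the third-party `regex` package. The real operation executes as an OpenVINO `RegexSplit` node, so this is only a behavioural sketch based on my reading of what "isolate" means:

```python
# Behavioural sketch of RegexSplitStep.punctuation_splitter(behaviour="isolate"):
# each punctuation match becomes its own segment, with surrounding text kept.
# Requires: pip install regex  (the stdlib `re` module does not support \p{P})
import regex


def isolate_split(text: str, pattern: str = r"\p{P}") -> list[str]:
    segments, last = [], 0
    for match in regex.finditer(pattern, text):
        if match.start() > last:
            segments.append(text[last:match.start()])  # text before the match
        segments.append(match.group())                 # the punctuation itself
        last = match.end()
    if last < len(text):
        segments.append(text[last:])                   # trailing text
    return segments


print(isolate_split("Hello, world!"))
# ['Hello', ',', ' world', '!']
```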
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-    "tests/tokenizers_test.py::test_": 0.887905604719764
+    "tests/tokenizers_test.py::test_": 0.8896697795321075
 }
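The updated value is the aggregate pass rate the suite must now meet (about 0.8897, up from 0.8879 with this commit's changes). A hypothetical sketch of how such a file can gate a CI run — this is not the repository's actual CI code, and the counts below are made up:

```python
# Hypothetical pass-rate gate: compare the current run against the recorded floor.
import json

with open("tests/pass_rates.json") as f:
    thresholds = json.load(f)

passed, total = 4450, 5000  # made-up counts for illustration
rate = passed / total
floor = thresholds["tests/tokenizers_test.py::test_"]
assert rate >= floor, f"pass rate regressed: {rate:.4f} < {floor:.4f}"
```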
1 change: 1 addition & 0 deletions tests/tokenizers_test.py
@@ -90,6 +90,7 @@ def unpack_strings(strings):
 bpe_models = [
     "NousResearch/Meta-Llama-3-8B-Instruct",
     # "meta-llama/Meta-Llama-3-8B",  # cannot be part of the CI
+    "tiiuae/falcon-7b",
     "stabilityai/stablecode-completion-alpha-3b-4k",
     "stabilityai/stablelm-tuned-alpha-7b",
     "databricks/dolly-v2-3b",
