Support Punctuation Split (#145)
* Support Punctuation Split

* Update Pass Rate

Add Anaconda Downloads Counter
apaniukov authored May 14, 2024
1 parent c58b0a5 commit fe390da
Showing 5 changed files with 21 additions and 13 deletions.
11 changes: 9 additions & 2 deletions README.md
@@ -1,6 +1,7 @@
 # OpenVINO Tokenizers

 [![Downloads](https://static.pepy.tech/badge/openvino-tokenizers)](https://pepy.tech/project/openvino-tokenizers)
+[![Anaconda-Server Badge](https://anaconda.org/conda-forge/openvino-tokenizers/badges/downloads.svg)](https://anaconda.org/conda-forge/openvino-tokenizers)

 OpenVINO Tokenizers adds text processing operations to OpenVINO.

@@ -335,8 +336,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <tbody>
 <tr>
 <td >BPE</td>
-<td >96.20</td>
-<td >4557</td>
+<td >96.25</td>
+<td >4774</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
@@ -494,6 +495,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <td >96.31</td>
 <td >217</td>
 </tr>
+<tr>
+<td >BPE</td>
+<td >tiiuae/falcon-7b</td>
+<td >97.24</td>
+<td >217</td>
+</tr>
 <tr>
 <td >SentencePiece</td>
 <td >NousResearch/Llama-2-13b-hf</td>
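The new `tiiuae/falcon-7b` row in the pass-rate table above is presumably tied to this change: that model's `tokenizer.json` includes a `Punctuation` pre-tokenizer, which the library can now convert. A brief usage sketch built on the conversion API this README documents (the model choice mirrors the test added below; exact output names may vary by version):

```python
# Converting a tokenizer whose pre-tokenization includes a "Punctuation" step.
# Requires: pip install transformers openvino openvino-tokenizers
from transformers import AutoTokenizer
from openvino import compile_model
from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
ov_tokenizer = convert_tokenizer(hf_tokenizer)    # OpenVINO model with a string input
compiled_tokenizer = compile_model(ov_tokenizer)
print(compiled_tokenizer(["Hello, world!"]))      # dict with input_ids / attention_mask
```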
3 changes: 1 addition & 2 deletions python/openvino_tokenizers/hf_parser.py
@@ -40,7 +40,6 @@
     NormalizeUnicode,
     PaddingStep,
     PreTokenizatinStep,
-    PunctuationSplitStep,
     RegexDecodingStep,
     RegexNormalizationStep,
     RegexSplitStep,
@@ -202,7 +201,7 @@ def normalization(self) -> None:
     "Whitespace": lambda step_dict: RegexSplitStep.whitespace_splitter(),
     "WhitespaceSplit": lambda step_dict: WhitespaceSplitStep(),
     "Split": parse_split_step,
-    "Punctuation": lambda step_dict: PunctuationSplitStep(step_dict["behavior"]),
+    "Punctuation": lambda step_dict: RegexSplitStep.punctuation_splitter(step_dict["behavior"]),
     "ByteLevel": parse_byte_level_pretokenization_step,
     "Digits": lambda step_dict: RegexSplitStep.digits_splitter(
         "isolate" if step_dict["individual_digits"] else "contiguous"
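For context on the change above: the `Punctuation` entry in the parser's pre-tokenization dispatch table now reuses the generic regex splitter instead of the dedicated (and now removed) `PunctuationSplitStep`. A minimal self-contained sketch of that dispatch pattern — the table name and the stub class here are simplifications, not the library's exact code:

```python
# Simplified sketch of the hf_parser.py dispatch shown above. RegexSplitStep is
# stubbed out here, and PRE_TOKENIZER_STEPS is an assumed name for the table.
from dataclasses import dataclass
from typing import Any, Callable, Dict


@dataclass
class RegexSplitStep:
    split_pattern: str
    invert: bool = False
    behaviour: str = "remove"

    @classmethod
    def punctuation_splitter(cls, behaviour: str = "isolate") -> "RegexSplitStep":
        # \p{P} matches any Unicode punctuation character
        return cls(r"\p{P}", invert=False, behaviour=behaviour)


PRE_TOKENIZER_STEPS: Dict[str, Callable[[Dict[str, Any]], RegexSplitStep]] = {
    "Punctuation": lambda step_dict: RegexSplitStep.punctuation_splitter(step_dict["behavior"]),
}

# A tokenizer.json pre-tokenizer entry of this shape selects the new factory:
step = PRE_TOKENIZER_STEPS["Punctuation"]({"type": "Punctuation", "behavior": "isolated"})
print(step)  # RegexSplitStep(split_pattern='\\p{P}', invert=False, behaviour='isolated')
```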
17 changes: 9 additions & 8 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -209,7 +209,7 @@ def __post_init__(self):
     def vet_split_pattern(self) -> None:
         if r"(?!\S)" in self.split_pattern:
             # rewrite regex pattern to get results closer to qwen.cpp results
-            logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
+            logger.warning(r"Replacing `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
             self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")

         if has_incompatible_re2_op(self.split_pattern):
@@ -282,6 +282,14 @@ def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
             behaviour=behaviour,
         )

+    @classmethod
+    def punctuation_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
+        return cls(
+            r"\p{P}",
+            invert=False,
+            behaviour=behaviour,
+        )
+
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
         return (
@@ -307,13 +315,6 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         return RegexSplitStep.whitespace_splitter().get_ov_subgraph(input_nodes).outputs()


-@dataclass
-class PunctuationSplitStep(PreTokenizatinStep):
-    """Splits string on punctuation chars."""
-
-    # behaviour: str = "Isolated"
-
-
 @dataclass
 class BytesToCharsStep(PreTokenizatinStep):
     """Maps chars to other chars for Byte-level BPE Tokenizer"""
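To see what the new splitter does, here is a plain-Python approximation of the `\p{P}` pattern with "isolate" behaviour, using the third-party `regex` package. The real operation executes as an OpenVINO `RegexSplit` node, so this is only a behavioural sketch based on my reading of what "isolate" means:

```python
# Behavioural sketch of RegexSplitStep.punctuation_splitter(behaviour="isolate"):
# each punctuation match becomes its own segment, with surrounding text kept.
# Requires: pip install regex  (the stdlib `re` module does not support \p{P})
import regex


def isolate_split(text: str, pattern: str = r"\p{P}") -> list[str]:
    segments, last = [], 0
    for match in regex.finditer(pattern, text):
        if match.start() > last:
            segments.append(text[last:match.start()])  # text before the match
        segments.append(match.group())                 # the punctuation itself
        last = match.end()
    if last < len(text):
        segments.append(text[last:])                   # trailing text
    return segments


print(isolate_split("Hello, world!"))
# ['Hello', ',', ' world', '!']
```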
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-    "tests/tokenizers_test.py::test_": 0.887905604719764
+    "tests/tokenizers_test.py::test_": 0.8896697795321075
 }
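The updated value is the aggregate pass rate the suite must now meet (about 0.8897, up from 0.8879 with this commit's changes). A hypothetical sketch of how such a file can gate a CI run — this is not the repository's actual CI code, and the counts below are made up:

```python
# Hypothetical pass-rate gate: compare the current run against the recorded floor.
import json

with open("tests/pass_rates.json") as f:
    thresholds = json.load(f)

passed, total = 4450, 5000  # made-up counts for illustration
rate = passed / total
floor = thresholds["tests/tokenizers_test.py::test_"]
assert rate >= floor, f"pass rate regressed: {rate:.4f} < {floor:.4f}"
```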
1 change: 1 addition & 0 deletions tests/tokenizers_test.py
@@ -90,6 +90,7 @@ def unpack_strings(strings):
 bpe_models = [
     "NousResearch/Meta-Llama-3-8B-Instruct",
     # "meta-llama/Meta-Llama-3-8B",  # cannot be part of the CI
+    "tiiuae/falcon-7b",
     "stabilityai/stablecode-completion-alpha-3b-4k",
     "stabilityai/stablelm-tuned-alpha-7b",
     "databricks/dolly-v2-3b",
