Add Tests For WordLevel Tokenizer (#360)
* Add Tests For WordLevel Tokenizer

* Update pass-rates
apaniukov authored Jan 8, 2025
1 parent bf3c15d commit d5f0abf
Showing 9 changed files with 296 additions and 20 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -525,6 +525,11 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >96.56</td>
<td >524</td>
</tr>
<tr>
<td >WordLevel</td>
<td >98.96</td>
<td >192</td>
</tr>
<tr>
<td >WordPiece</td>
<td >98.39</td>
@@ -827,6 +832,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >93.16</td>
<td >263</td>
</tr>
<tr>
<td >WordLevel</td>
<td >cisco-ai/mini-bart-g2p</td>
<td >98.96</td>
<td >192</td>
</tr>
<tr>
<td >WordPiece</td>
<td >ProsusAI/finbert</td>
@@ -875,6 +886,5 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
### Recreating Tokenizers From Tests
For some tokenizers, you need to select certain settings so that their output is closer to the Huggingface tokenizers:
- `THUDM/chatglm2-6b` detokenizer always skips special tokens. Use `skip_special_tokens=True` during conversion
- `THUDM/chatglm3-6b` detokenizer doesn't skip special tokens. Use `skip_special_tokens=False` during conversion
- All tested tiktoken-based detokenizers leave extra spaces. Use `clean_up_tokenization_spaces=False` during conversion
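As a reference, here is a minimal conversion sketch applying these settings. It assumes a recent `openvino_tokenizers` release where `convert_tokenizer` accepts these flags directly, and that `trust_remote_code=True` is needed to load the ChatGLM tokenizers; treat it as a sketch rather than the project's canonical usage.

```python
from transformers import AutoTokenizer
from openvino_tokenizers import convert_tokenizer

# `THUDM/chatglm3-6b`: keep special tokens in the detokenizer output.
hf_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
ov_tokenizer, ov_detokenizer = convert_tokenizer(
    hf_tokenizer,
    with_detokenizer=True,
    skip_special_tokens=False,  # chatglm3 does not skip special tokens
)

# For tiktoken-based models, pass clean_up_tokenization_spaces=False instead,
# to avoid the extra spaces noted above.
```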
11 changes: 9 additions & 2 deletions python/openvino_tokenizers/hf_parser.py
@@ -390,19 +390,26 @@ def add_padding(self, use_max_padding: bool = False) -> None:
}

def decoding(self) -> None:
skip_tokens = parse_special_tokens(self.original_tokenizer)

if self.tokenizer_json["model"]["type"] == "WordLevel":
self.pipeline.add_steps(
[
VocabDecoderStep(vocab=[f" {token}" for token in self.pipeline.vocab]),
VocabDecoderStep(
vocab=[f" {token}" for token in self.pipeline.vocab],
skip_tokens=list(skip_tokens),
do_skip_tokens=self.skip_special_tokens,
),
FuseStep(),
RegexDecodingStep.strip_forward_space(),
]
)
if self.clean_up_tokenization_spaces:
self.pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces())
return
elif self.tokenizer_json["decoder"] is None or self.tokenizer_json["model"]["type"] == "WordPiece":
return

skip_tokens = parse_special_tokens(self.original_tokenizer)
self.pipeline.add_steps(
VocabDecoderStep(skip_tokens=list(skip_tokens), do_skip_tokens=self.skip_special_tokens)
)
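To show what the updated WordLevel branch of `decoding` enables end to end, here is a hedged sketch that converts the WordLevel model listed in the report above and runs its detokenizer. The output tensor names `input_ids` and `string_output` follow typical `openvino_tokenizers` usage and are assumptions here; they may differ for this model.

```python
from openvino import compile_model
from transformers import AutoTokenizer
from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("cisco-ai/mini-bart-g2p")
# skip_special_tokens is now forwarded to the WordLevel VocabDecoderStep.
ov_tokenizer, ov_detokenizer = convert_tokenizer(
    hf_tokenizer, with_detokenizer=True, skip_special_tokens=True
)

tokenize = compile_model(ov_tokenizer)
detokenize = compile_model(ov_detokenizer)

token_ids = tokenize(["hello world"])["input_ids"]  # assumed output name
print(detokenize(token_ids)["string_output"])  # assumed output name
```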
4 changes: 4 additions & 0 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -275,6 +275,10 @@ def __post_init__(self):
f"got `{self.max_splits}`"
)

@classmethod
def split_by_chars(cls) -> "RegexSplitStep":
return cls(split_pattern=".", invert=False, behaviour="isolate")

@classmethod
def bert_whitespace_splitter(cls) -> "RegexSplitStep":
return cls(split_pattern=r"\s+", invert=False)
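For context, the new `split_by_chars` step isolates every character of the input string. The actual splitting runs inside the compiled OpenVINO model, but a rough pure-Python analogy of the `isolate` behaviour with pattern `.` looks like this (illustrative only):

```python
import re

# "isolate" with pattern "." keeps each matched character as a separate token.
text = "split by chars"
chars = re.findall(r".", text)
assert tuple(chars) == tuple(text)  # matches the expected value in layer_tests.py
```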
5 changes: 3 additions & 2 deletions tests/conftest.py
@@ -37,13 +37,16 @@ def add_tokenizer_type(row):
return "Tiktoken"
if not pd.isnull(row["hf_tiktoken_tokenizers_with_padding_sides_param"]):
return "Tiktoken"
if not pd.isnull(row["hf_wordlevel_tokenizers_param"]):
return "WordLevel"

results_df = get_session_results_df(session)
results_df["Tokenizer Type"] = results_df.apply(add_tokenizer_type, axis=1)
results_df = results_df[results_df.status != "skipped"] # filter skipped tests
results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_bpe_tokenizers_param, inplace=True)
results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_sentencepiece_tokenizers_param, inplace=True)
results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_tiktoken_tokenizers_param, inplace=True)
results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_wordlevel_tokenizers_param, inplace=True)
results_df.hf_wordpiece_tokenizers_param.fillna(
results_df.hf_wordpiece_tokenizers_with_padding_sides_param, inplace=True
)
@@ -99,8 +102,6 @@ def add_tokenizer_type(row):
"\n### Recreating Tokenizers From Tests\n\n"
"In some tokenizers, you need to select certain settings so that their output is closer "
"to the Huggingface tokenizers:\n"
"- `THUDM/chatglm2-6b` detokenizer always skips special tokens. Use `skip_special_tokens=True` "
"during conversion\n"
"- `THUDM/chatglm3-6b` detokenizer don't skips special tokens. Use `skip_special_tokens=False` "
"during conversion\n"
"- All tested tiktoken based detokenizers leave extra spaces. Use `clean_up_tokenization_spaces=False` "
2 changes: 2 additions & 0 deletions tests/layer_tests.py
@@ -229,6 +229,8 @@ def create_splitting_model(layer: PreTokenizatinStep) -> ov.CompiledModel:
),
("▁", ("▁",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithprevious")),
("No split pattern", ("No split pattern",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithprevious")),
("split", tuple("split"), RegexSplitStep.split_by_chars()),
("split by chars", tuple("split by chars"), RegexSplitStep.split_by_chars()),
],
)
def test_regex_split(test_string, expected, layer):
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
{
"tests/tokenizers_test.py::test_": 0.9297414485305926
"tests/tokenizers_test.py::test_": 0.9306500079076387
}