diff --git a/README.md b/README.md index f6ae75da..7678bbb2 100644 --- a/README.md +++ b/README.md @@ -525,6 +525,11 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The 96.56 524 + + WordLevel + 98.96 + 192 + WordPiece 98.39 @@ -827,6 +832,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The 93.16 263 + + WordLevel + cisco-ai/mini-bart-g2p + 98.96 + 192 + WordPiece ProsusAI/finbert @@ -875,6 +886,5 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The ### Recreating Tokenizers From Tests In some tokenizers, you need to select certain settings so that their output is closer to the Huggingface tokenizers: -- `THUDM/chatglm2-6b` detokenizer always skips special tokens. Use `skip_special_tokens=True` during conversion - `THUDM/chatglm3-6b` detokenizer don't skips special tokens. Use `skip_special_tokens=False` during conversion - All tested tiktoken based detokenizers leave extra spaces. Use `clean_up_tokenization_spaces=False` during conversion diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py index 18b86fe1..6dca996d 100644 --- a/python/openvino_tokenizers/hf_parser.py +++ b/python/openvino_tokenizers/hf_parser.py @@ -390,19 +390,26 @@ def add_padding(self, use_max_padding: bool = False) -> None: } def decoding(self) -> None: + skip_tokens = parse_special_tokens(self.original_tokenizer) + if self.tokenizer_json["model"]["type"] == "WordLevel": self.pipeline.add_steps( [ - VocabDecoderStep(vocab=[f" {token}" for token in self.pipeline.vocab]), + VocabDecoderStep( + vocab=[f" {token}" for token in self.pipeline.vocab], + skip_tokens=list(skip_tokens), + do_skip_tokens=self.skip_special_tokens, + ), FuseStep(), RegexDecodingStep.strip_forward_space(), ] ) + if self.clean_up_tokenization_spaces: + self.pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces()) return elif self.tokenizer_json["decoder"] is None or self.tokenizer_json["model"]["type"] == "WordPiece": return - skip_tokens = parse_special_tokens(self.original_tokenizer) self.pipeline.add_steps( VocabDecoderStep(skip_tokens=list(skip_tokens), do_skip_tokens=self.skip_special_tokens) ) diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index fa99f6b8..21b75bcf 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -275,6 +275,10 @@ def __post_init__(self): f"got `{self.max_splits}`" ) + @classmethod + def split_by_chars(cls) -> "RegexSplitStep": + return cls(split_pattern=".", invert=False, behaviour="isolate") + @classmethod def bert_whitespace_splitter(cls) -> "RegexSplitStep": return cls(split_pattern=r"\s+", invert=False) diff --git a/tests/conftest.py b/tests/conftest.py index f1d3fcfc..90869daf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,8 @@ def add_tokenizer_type(row): return "Tiktoken" if not pd.isnull(row["hf_tiktoken_tokenizers_with_padding_sides_param"]): return "Tiktoken" + if not pd.isnull(row["hf_wordlevel_tokenizers_param"]): + return "WordLevel" results_df = get_session_results_df(session) results_df["Tokenizer Type"] = results_df.apply(add_tokenizer_type, axis=1) @@ -44,6 +46,7 @@ def add_tokenizer_type(row): results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_bpe_tokenizers_param, inplace=True) results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_sentencepiece_tokenizers_param, inplace=True) results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_tiktoken_tokenizers_param, inplace=True) + results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_wordlevel_tokenizers_param, inplace=True) results_df.hf_wordpiece_tokenizers_param.fillna( results_df.hf_wordpiece_tokenizers_with_padding_sides_param, inplace=True ) @@ -99,8 +102,6 @@ def add_tokenizer_type(row): "\n### Recreating Tokenizers From Tests\n\n" "In some tokenizers, you need to select certain settings so that their output is closer " "to the Huggingface tokenizers:\n" - "- `THUDM/chatglm2-6b` detokenizer always skips special tokens. Use `skip_special_tokens=True` " - "during conversion\n" "- `THUDM/chatglm3-6b` detokenizer don't skips special tokens. Use `skip_special_tokens=False` " "during conversion\n" "- All tested tiktoken based detokenizers leave extra spaces. Use `clean_up_tokenization_spaces=False` " diff --git a/tests/layer_tests.py b/tests/layer_tests.py index f46b1be9..eac73549 100644 --- a/tests/layer_tests.py +++ b/tests/layer_tests.py @@ -229,6 +229,8 @@ def create_splitting_model(layer: PreTokenizatinStep) -> ov.CompiledModel: ), ("▁", ("▁",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithprevious")), ("No split pattern", ("No split pattern",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithprevious")), + ("split", tuple("split"), RegexSplitStep.split_by_chars()), + ("split by chars", tuple("split by chars"), RegexSplitStep.split_by_chars()), ], ) def test_regex_split(test_string, expected, layer): diff --git a/tests/pass_rates.json b/tests/pass_rates.json index 04fab4a3..5c86997a 100644 --- a/tests/pass_rates.json +++ b/tests/pass_rates.json @@ -1,3 +1,3 @@ { - "tests/tokenizers_test.py::test_": 0.9297414485305926 + "tests/tokenizers_test.py::test_": 0.9306500079076387 } \ No newline at end of file diff --git a/tests/stats.json b/tests/stats.json index 0572d2da..438fde68 100644 --- a/tests/stats.json +++ b/tests/stats.json @@ -10978,6 +10978,196 @@ "tokenizers_test.py::test_tiktoken_tokenizers[add_tokens-glm-4-9b-chat- \\t\\n]": "passed", "tokenizers_test.py::test_tiktoken_model_tokenizer_chat[add_tokens-glm-4-9b-chat-test_chat0]": "passed", "tokenizers_test.py::test_rt_info_tiktoken[glm-4-9b-chat]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string0]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string1]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string2]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string3]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string4]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string5]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string6]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string7]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string8]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string9]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string10]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string11]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string12]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string13]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string14]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string15]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string16]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string17]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string18]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string19]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string20]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string21]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string22]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string23]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string24]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string25]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string26]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string27]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string28]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string29]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string30]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-no_clean_spaces-mini-bart-g2p-test_string31]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string0]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string1]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string2]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string3]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string4]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string5]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string6]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string7]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string8]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string9]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string10]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string11]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string12]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string13]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string14]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string15]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string16]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string17]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string18]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string19]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string20]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string21]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string22]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string23]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string24]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string25]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string26]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string27]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string28]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string29]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string30]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-no_clean_spaces-mini-bart-g2p-test_string31]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string0]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string1]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string2]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string3]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string4]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string5]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string7]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string8]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string9]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string10]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string11]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string12]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string13]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string14]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string15]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string16]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string17]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string18]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string19]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string20]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string21]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string22]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string23]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string24]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string25]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string26]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string27]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string28]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string29]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string30]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string31]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string0]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string1]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string2]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string3]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string4]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string5]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string7]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string8]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string9]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string10]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string11]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string12]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string13]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string14]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string15]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string16]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string17]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string18]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string19]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string20]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string21]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string22]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string23]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string24]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string25]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string26]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string27]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string28]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string29]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string30]": "passed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string31]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string0]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string1]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string2]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string3]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string4]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string5]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string6]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string7]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string8]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string9]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string10]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string11]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string12]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string13]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string14]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string15]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string16]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string17]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string18]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string19]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string20]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string21]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string22]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string23]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string24]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string25]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string26]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string27]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string28]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string29]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string30]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[no_add_tokens-mini-bart-g2p-test_string31]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string0]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string1]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string2]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string3]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string4]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string5]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string6]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string7]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string8]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string9]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string10]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string11]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string12]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string13]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string14]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string15]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string16]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string17]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string18]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string19]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string20]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string21]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string22]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string23]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string24]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string25]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string26]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string27]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string28]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string29]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string30]": "passed", + "tokenizers_test.py::test_wordlevel_tokenizers[add_tokens-mini-bart-g2p-test_string31]": "passed", "tokenizers_test.py::test_sentencepiece_model_detokenizer[skip_tokens-no_clean_spaces-Baichuan2-7B-Chat-sp_backend-Slow-Eng... test, string?!]": "passed", "tokenizers_test.py::test_sentencepiece_model_detokenizer[skip_tokens-no_clean_spaces-Baichuan2-7B-Chat-sp_backend-Slow-Multiline\\nstring!\\nWow!]": "passed", "tokenizers_test.py::test_sentencepiece_model_detokenizer[skip_tokens-no_clean_spaces-Baichuan2-7B-Chat-sp_backend-Slow-A lot\\t w!]": "passed", @@ -17880,6 +18070,8 @@ "tokenizers_test.py::test_tiktoken_detokenizer[no_skip_tokens-clean_spaces-glm-4-9b-chat-The process of Origami seems simple at the first glance, but in fact, it still requires a very complicated process to do it well. Taking folding a rose as an example, we can divide the entire process into three stages, including: firstly creating a grid of creases, secondly making a three-dimensional base, and thirdly finishing petal decoration. The first step is to create a grid of creases: this step is a bit like the first step of folding a gift of thousand-paper-crane. That is to say, we can fold the paper in half (or namedly equal-folds) through the symmetrical axis, and repeat such step in the other symmetrical axis. And then apply multiple equal-folds in sequence relative to each smaller rectangle divided by the two creases; After that, the creases in each direction will interweave into a complete set of uniform small square splicing patterns; these small squares form a reference space similar to a two-dimensional coordinate system, allowing us to combine adjacent creases on the plane from Three-dimensional high platforms or depressions are folded on the two-dimensional small squares to facilitate the next steps of folding. It should be noted that, in the process of creating grid creases, there may be rare cases when the folds are not aligned. The consequences of this error can be very serious. And just like the butterfly effect, it is only a slight difference at the beginning , and in the end it may generate a disaster world which is completely different from plan. Anyway, let's continue. The second step is make the three-dimensional base: In this step, we need to fold a set of symmetrical three-dimensional high platforms or depressions based on the grid creases. From the symmetry analysis, it is not difficult to find that the rose will have four symmetrical three-dimensional high platforms and supporting depressions. Therefore, we can firstly fold out a quarter of the depression and plateau patterns, which would help build a base to compose into a complex 3D structure. And then, we use this quarter as a template, and fold out the repeating patterns on the remaining three parts of the whole structure in turn. It is worth noting that the layout of the high platform not only needs to consider the regular contrast and symmetrical distribution of the length and width, but also needs to ensure the orderliness of the height dimension. This is very important, since we will never go back to this step after all parts were made, and you would better start from first step if you make anything wrong in the this step. Similar to the precautions in the first stage, please handle all the corners in three dimensions to ensure that they conform to the layout required in the plan, which would help us avoid the butterfly effect and increase the robustness in the process of three-dimensional folding. Just like building a skyscrapper in the real world, people usually take a lot of time when building the base but soon get finished when extending the structure after that. Time is worth to cost in the base, but would be saved in the future after you succeed in base. Anyway, let's continue. During the first quarter of the pattern, repeated comparisons with the finished rose were made to eliminate any possible errors in the first place. The final stage is to finish the petal grooming. At this stage, we often emphasize an important term called folding-by-heart. The intention here is no longer literally serious, but focus is moved to our understanding of the shape of a rose in nature, and we usually use natural curves to continuously correct the shape of petals in order to approach the shape of rose petals in reality. One more comment: this is also the cause of randomness to the art, which can be generated differently by different people. Recall that rose should be adjusted close to reality, so in the last step of this stage, we need to open the bloom in the center of the rose, by pulling on the four petals that have been bent. This process may be accompanied by the collapse of the overall structure of the rose, so we should be very careful to save strength of adjustment, and it must be well controlled to avoid irreversible consequences. Ultimately, after three stages of folding, we end up with a crown of rose with a similar shape close to reality. If condition is permited, we can wrap a green paper strip twisted on a straightened iron wire, and insert the rose crown we just created onto one side of the iron wire. In this way, we got a hand-made rose with a green stem. We can also repeat the steps above to increase the number of rose, so that it can be made into a cluster. Different color of rose is usually more attractive and can be considered as a better plan of gift to your friend. In summary, by creating a grid of creases, making a three-dimensional base, and finishing with petals, we created a three-dimensional rose from a two-dimensional paper. Although this process may seem simple, it is indeed a work of art created by us humans with the help of imagination and common materials. At last, Please comment to assess the above content.]": "failed", "tokenizers_test.py::test_tiktoken_detokenizer[no_skip_tokens-clean_spaces-glm-4-9b-chat-]": "failed", "tokenizers_test.py::test_tiktoken_detokenizer[skip_tokens-clean_spaces-glm-4-9b-chat-The process of Origami seems simple at the first glance, but in fact, it still requires a very complicated process to do it well. Taking folding a rose as an example, we can divide the entire process into three stages, including: firstly creating a grid of creases, secondly making a three-dimensional base, and thirdly finishing petal decoration. The first step is to create a grid of creases: this step is a bit like the first step of folding a gift of thousand-paper-crane. That is to say, we can fold the paper in half (or namedly equal-folds) through the symmetrical axis, and repeat such step in the other symmetrical axis. And then apply multiple equal-folds in sequence relative to each smaller rectangle divided by the two creases; After that, the creases in each direction will interweave into a complete set of uniform small square splicing patterns; these small squares form a reference space similar to a two-dimensional coordinate system, allowing us to combine adjacent creases on the plane from Three-dimensional high platforms or depressions are folded on the two-dimensional small squares to facilitate the next steps of folding. It should be noted that, in the process of creating grid creases, there may be rare cases when the folds are not aligned. The consequences of this error can be very serious. And just like the butterfly effect, it is only a slight difference at the beginning , and in the end it may generate a disaster world which is completely different from plan. Anyway, let's continue. The second step is make the three-dimensional base: In this step, we need to fold a set of symmetrical three-dimensional high platforms or depressions based on the grid creases. From the symmetry analysis, it is not difficult to find that the rose will have four symmetrical three-dimensional high platforms and supporting depressions. Therefore, we can firstly fold out a quarter of the depression and plateau patterns, which would help build a base to compose into a complex 3D structure. And then, we use this quarter as a template, and fold out the repeating patterns on the remaining three parts of the whole structure in turn. It is worth noting that the layout of the high platform not only needs to consider the regular contrast and symmetrical distribution of the length and width, but also needs to ensure the orderliness of the height dimension. This is very important, since we will never go back to this step after all parts were made, and you would better start from first step if you make anything wrong in the this step. Similar to the precautions in the first stage, please handle all the corners in three dimensions to ensure that they conform to the layout required in the plan, which would help us avoid the butterfly effect and increase the robustness in the process of three-dimensional folding. Just like building a skyscrapper in the real world, people usually take a lot of time when building the base but soon get finished when extending the structure after that. Time is worth to cost in the base, but would be saved in the future after you succeed in base. Anyway, let's continue. During the first quarter of the pattern, repeated comparisons with the finished rose were made to eliminate any possible errors in the first place. The final stage is to finish the petal grooming. At this stage, we often emphasize an important term called folding-by-heart. The intention here is no longer literally serious, but focus is moved to our understanding of the shape of a rose in nature, and we usually use natural curves to continuously correct the shape of petals in order to approach the shape of rose petals in reality. One more comment: this is also the cause of randomness to the art, which can be generated differently by different people. Recall that rose should be adjusted close to reality, so in the last step of this stage, we need to open the bloom in the center of the rose, by pulling on the four petals that have been bent. This process may be accompanied by the collapse of the overall structure of the rose, so we should be very careful to save strength of adjustment, and it must be well controlled to avoid irreversible consequences. Ultimately, after three stages of folding, we end up with a crown of rose with a similar shape close to reality. If condition is permited, we can wrap a green paper strip twisted on a straightened iron wire, and insert the rose crown we just created onto one side of the iron wire. In this way, we got a hand-made rose with a green stem. We can also repeat the steps above to increase the number of rose, so that it can be made into a cluster. Different color of rose is usually more attractive and can be considered as a better plan of gift to your friend. In summary, by creating a grid of creases, making a three-dimensional base, and finishing with petals, we created a three-dimensional rose from a two-dimensional paper. Although this process may seem simple, it is indeed a work of art created by us humans with the help of imagination and common materials. At last, Please comment to assess the above content.]": "failed", + "tokenizers_test.py::test_wordlevel_detokenizer[no_skip_tokens-clean_spaces-mini-bart-g2p-test_string6]": "failed", + "tokenizers_test.py::test_wordlevel_detokenizer[skip_tokens-clean_spaces-mini-bart-g2p-test_string6]": "failed", "tokenizers_test.py::test_sentencepiece_model_tokenizer[no_add_tokens-TinyLlama-1.1B-Chat-v1.0-sp_backend-Slow-[INST] <> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <> You will act as a Christian, and fully summarize following text:\\nSometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been 10 years since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy.If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. Take the absolution to heart -- you now have a brand new, clean slate to work with. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry and that you wish for nothing more than to be forgiven. Thanks. [/INST]]": "failed", "tokenizers_test.py::test_sentencepiece_model_tokenizer_chat[no_add_tokens-TinyLlama-1.1B-Chat-v1.0-sp_backend-Slow-test_chat0]": "failed", "tokenizers_test.py::test_sentencepiece_model_tokenizer[add_tokens-TinyLlama-1.1B-Chat-v1.0-sp_backend-Slow-[INST] <> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <> You will act as a Christian, and fully summarize following text:\\nSometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been 10 years since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy.If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. Take the absolution to heart -- you now have a brand new, clean slate to work with. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry and that you wish for nothing more than to be forgiven. Thanks. [/INST]]": "failed", diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py index 6c6d63c9..1c021ce7 100644 --- a/tests/tokenizers_test.py +++ b/tests/tokenizers_test.py @@ -138,6 +138,9 @@ # "Salesforce/xgen-7b-8k-base", # not compatible with transformers 4.44.0 "THUDM/glm-4-9b-chat", ] +wordlevel_models =[ + "cisco-ai/mini-bart-g2p" +] def get_tokenizer(hf_tokenizer, add_special_tokens=True, use_max_padding=False, use_sentencepiece_backend=False): @@ -256,10 +259,6 @@ def hf_sentencepiece_tokenizers_with_padding_sides( ): pytest.skip("Unigram model supports only sentencepiece backend.") - if hf_tokenizer.pad_token is None: - hf_tokenizer.pad_token = hf_tokenizer.eos_token - hf_tokenizer.pad_token_id = hf_tokenizer.eos_token_id or 0 - return hf_tokenizer @@ -271,9 +270,6 @@ def hf_bpe_tokenizers(request): @pytest.fixture(scope="session", params=bpe_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def hf_bpe_tokenizers_with_padding_sides(request, use_left_padding): hf_tokenizer = get_hf_tokenizer(request, left_padding=use_left_padding) - if hf_tokenizer.pad_token is None: - hf_tokenizer.pad_token = hf_tokenizer.eos_token - hf_tokenizer.pad_token_id = hf_tokenizer.eos_token_id or 0 return hf_tokenizer @@ -285,12 +281,14 @@ def hf_tiktoken_tokenizers(request): @pytest.fixture(scope="session", params=tiktiken_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) def hf_tiktoken_tokenizers_with_padding_sides(request, use_left_padding): hf_tokenizer = get_hf_tokenizer(request, trust_remote_code=True, left_padding=use_left_padding) - if hf_tokenizer.pad_token is None: - hf_tokenizer.pad_token = hf_tokenizer.eos_token - hf_tokenizer.pad_token_id = hf_tokenizer.eos_token_id or getattr(hf_tokenizer, "eod_id") or 0 return hf_tokenizer +@pytest.fixture(scope="session", params=wordlevel_models, ids=lambda checkpoint: checkpoint.split("/")[-1]) +def hf_wordlevel_tokenizers(request): + return get_hf_tokenizer(request) + + @pytest.fixture(scope="session") def wordpiece_tokenizers(hf_wordpiece_tokenizers, do_add_special_tokens): return get_tokenizer(hf_wordpiece_tokenizers, add_special_tokens=do_add_special_tokens) @@ -413,6 +411,18 @@ def tiktoken_tokenizers_detokenizers(hf_tiktoken_tokenizers, do_skip_special_tok ) +@pytest.fixture(scope="session") +def wordlevel_tokenizers(hf_wordlevel_tokenizers, do_add_special_tokens): + return get_tokenizer(hf_wordlevel_tokenizers, add_special_tokens=do_add_special_tokens) + + +@pytest.fixture(scope="session") +def wordlevel_tokenizers_detokenizers(hf_wordlevel_tokenizers, do_skip_special_tokens, do_clean_up_tokenization_spaces): + return get_tokenizer_detokenizer( + hf_wordlevel_tokenizers, skip_special_tokens=do_skip_special_tokens, clean_up_tokenization_spaces=do_clean_up_tokenization_spaces + ) + + @pytest.fixture( scope="session", params=["openlm-research/open_llama_3b_v2"], ids=lambda checkpoint: checkpoint.split("/")[-1] ) @@ -472,7 +482,7 @@ def check_detokenizer_output( hf_tokenizer, _, ov_detokenizer = detokenizers hf_detokenizer_kwargs = {} if hf_detokenizer_kwargs is None else hf_detokenizer_kwargs - token_ids = hf_tokenizer(test_string, return_tensors="np").input_ids + token_ids = hf_tokenizer(test_string, return_tensors="np", padding=True).input_ids hf_output = hf_tokenizer.batch_decode(token_ids, **hf_detokenizer_kwargs) ov_output = ov_detokenizer(token_ids.astype("int32"))["string_output"].tolist() @@ -788,6 +798,48 @@ def test_tiktoken_detokenizer( ) +@pytest.mark.parametrize( + "test_string", + [string.split() for string in ( + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ) if string.split()], +) +def test_wordlevel_tokenizers(wordlevel_tokenizers, test_string, do_add_special_tokens): + hf_tokenizer_kwargs = {"add_special_tokens": do_add_special_tokens, "padding": True} + check_tokenizer_output( + wordlevel_tokenizers, + test_string=test_string, + skip_missing_outputs=True, + hf_tokenizer_kwargs=hf_tokenizer_kwargs, + calculate_diff=True, + ) + + +@pytest.mark.parametrize( + "test_string", + [string.split() for string in ( + *eng_test_strings, + *multilingual_test_strings, + *emoji_test_strings, + *misc_strings, + ) if string.split()], +) +def test_wordlevel_detokenizer( + wordlevel_tokenizers_detokenizers, test_string, do_skip_special_tokens, do_clean_up_tokenization_spaces +): + hf_detokenizer_kwargs = { + "skip_special_tokens": do_skip_special_tokens, "padding": True, "clean_up_tokenization_spaces": do_clean_up_tokenization_spaces + } + check_detokenizer_output( + wordlevel_tokenizers_detokenizers, + test_string=test_string, + hf_detokenizer_kwargs=hf_detokenizer_kwargs, + ) + + def test_streaming_detokenizer(sentencepiece_streaming_tokenizers): hf_tokenizer, _, ov_detokenizer = sentencepiece_streaming_tokenizers diff --git a/tests/utils.py b/tests/utils.py index 30421801..d56045ce 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,16 +2,24 @@ from transformers import AutoTokenizer +MAX_RETRY = 2 + + def get_hf_tokenizer(request, fast_tokenizer=True, trust_remote_code=False, left_padding=None): kwargs = {} if left_padding is not None: kwargs["padding_side"] = "left" if left_padding else "right" kwargs["truncation_side"] = "left" if left_padding else "right" - for retry in range(2): + for retry in range(1, MAX_RETRY + 1): try: - return AutoTokenizer.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( request.param, use_fast=fast_tokenizer, trust_remote_code=trust_remote_code, **kwargs ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id or getattr(tokenizer, "eod_id", None) or 0 + return tokenizer except requests.ReadTimeout: - pass + if retry == MAX_RETRY: + raise