Test add_special_tokens properly (#1586)
- In HF, `add_special_tokens` works for `tiny-random-phi3`, while for `Qwen2-0.5B-Instruct` the flag has no effect even in HF itself (see the sketch below).
- Use `"katuni4ka/tiny-random-phi3"` instead of `"Qwen/Qwen2-0.5B-Instruct"` for special tokens testing.

---------

Co-authored-by: Ilya Lavrenov <[email protected]>
pavel-esir and ilya-lavrenov authored Jan 18, 2025
1 parent ca6f5cb, commit d4bb7c1
Showing 3 changed files with 26 additions and 14 deletions.
tests/python_tests/test_continuous_batching.py (2 changes: 1 addition & 1 deletion)

@@ -117,7 +117,7 @@ def test_cb_streamer_vs_return_vs_stateful(prompt):
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict):
-    model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1]))
     cb_pipe = get_continuous_batching(models_path)
 
     ov_pipe.start_chat()
tests/python_tests/test_llm_pipeline.py (2 changes: 1 addition & 1 deletion)

@@ -129,7 +129,7 @@ def test_chat_scenario(model_descr, generation_config_kwargs: Dict):
     chat_history_hf = []
     chat_history_ov = []
 
-    model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1]))
 
     ov_generation_config = GenerationConfig(**generation_config_kwargs)
     hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config)
tests/python_tests/test_tokenizer.py (36 changes: 24 additions & 12 deletions)

@@ -192,7 +192,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
 @pytest.mark.nightly
 def test_set_chat_template():
     model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1]))
 
     prompt = "how are you?"
     dummy_conversation = [
@@ -223,24 +223,36 @@ def test_set_chat_template():
 ]
 @pytest.mark.precommit
 @pytest.mark.nightly
-@pytest.mark.parametrize("add_special_tokens", [True, False])
-@pytest.mark.parametrize("skip_special_tokens", [True, False])
 @pytest.mark.parametrize("prompt", prompts)
-def test_encode_decode_with_special_tokens_option(add_special_tokens, skip_special_tokens, prompt):
+def test_encode_decode_with_special_tokens_option(prompt):
     import numpy as np
-    model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    model_descr = get_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1]))
     ov_tokenizer = ov_pipe.get_tokenizer()
 
     # Calling encode with 'add_special_tokens' will set state flag.
-    ov_res = ov_tokenizer.encode(prompt, add_special_tokens=add_special_tokens).input_ids.data
-    hf_res = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
-    assert np.all(ov_res == hf_res)
+    ov_res_add_spec = ov_tokenizer.encode(prompt, add_special_tokens=True).input_ids.data
+    ov_res_no_spec = ov_tokenizer.encode(prompt, add_special_tokens=False).input_ids.data
+    hf_res_add_spec = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=True)["input_ids"]
+    hf_res_no_spec = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=False)["input_ids"]
+    assert np.all(ov_res_add_spec == hf_res_add_spec)
+    assert np.all(ov_res_no_spec == hf_res_no_spec)
+
+    # Check that the add_special_tokens flag indeed makes a difference.
+    assert ov_res_add_spec.size != ov_res_no_spec.size
+    assert hf_res_add_spec.size != hf_res_no_spec.size
 
     # Decode with 'skip_special_tokens'
-    decoded_genai = ov_tokenizer.decode(ov_res, skip_special_tokens=skip_special_tokens)[0]
-    decoded_hf = hf_tokenizer.decode(hf_res[0], skip_special_tokens=skip_special_tokens)
-    assert decoded_genai == decoded_hf
+    decoded_genai_skip_spec = ov_tokenizer.decode(ov_res_add_spec, skip_special_tokens=True)[0]
+    decoded_genai_no_skip = ov_tokenizer.decode(ov_res_add_spec, skip_special_tokens=False)[0]
+    decoded_hf_skip_spec = hf_tokenizer.decode(hf_res_add_spec[0], skip_special_tokens=True)
+    decoded_hf_no_skip = hf_tokenizer.decode(hf_res_add_spec[0], skip_special_tokens=False)
+    assert decoded_genai_skip_spec == decoded_hf_skip_spec
+    assert decoded_genai_no_skip == decoded_hf_no_skip
+
+    # Check that skip_special_tokens indeed makes a difference.
+    assert decoded_genai_skip_spec != decoded_genai_no_skip
+    assert decoded_hf_skip_spec != decoded_hf_no_skip
 
 
 @pytest.mark.precommit
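
For reference, the round trip that the updated test enforces can also be exercised with the standalone `openvino_genai.Tokenizer`. A minimal sketch, not part of the commit, assuming the model has already been converted to OpenVINO IR in a hypothetical `models_path` directory; the encode/decode calls mirror those in the test above:

```python
import numpy as np
import openvino_genai
from transformers import AutoTokenizer

model_id = "katuni4ka/tiny-random-phi3"
models_path = "tiny-random-phi3-ov"  # hypothetical: output of a prior model conversion

hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
ov_tokenizer = openvino_genai.Tokenizer(models_path)

prompt = "table is made of"

# Encode with special tokens and compare against HF token ids.
ov_ids = ov_tokenizer.encode(prompt, add_special_tokens=True).input_ids.data
hf_ids = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=True)["input_ids"]
assert np.all(ov_ids == hf_ids)

# Decode both ways; with this model the skip_special_tokens flag
# is expected to change the output.
assert ov_tokenizer.decode(ov_ids, skip_special_tokens=True)[0] != \
       ov_tokenizer.decode(ov_ids, skip_special_tokens=False)[0]
```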
