From 7c9a6f58af1fc2c690454385b3e39ad3bf029d91 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 27 Dec 2024 12:37:41 +0100
Subject: [PATCH] Some fixes

---
 .../beam_search_causal_lm.cpp                 |  1 +
 .../beam_search_causal_lm.py                  |  1 +
 src/cpp/src/generation_config.cpp             | 20 ++++++----
 src/cpp/src/json_utils.hpp                    | 12 ++++++
 .../openvino_genai/py_openvino_genai.pyi      |  2 +
 src/python/py_generation_config.cpp           |  1 +
 tests/python_tests/common.py                  |  4 +-
 tests/python_tests/test_generation_config.py  | 38 +++++++++++++++++++
 tests/python_tests/test_tokenizer.py          | 13 +++++--
 .../continuous_batching_benchmark.cpp         |  5 ---
 10 files changed, 79 insertions(+), 18 deletions(-)

diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp
index 236b31b351..fc18fa8e0c 100644
--- a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp
+++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp
@@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try {
     config.max_new_tokens = 20;
     config.num_beam_groups = 3;
     config.num_beams = 15;
+    config.diversity_penalty = 1.0f;
     config.num_return_sequences = config.num_beams;
 
     // Since the streamer is set, the results will
diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
index 16b8b76175..4e2430a47f 100755
--- a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
+++ b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
@@ -19,6 +19,7 @@ def main():
     config.max_new_tokens = 20
     config.num_beam_groups = 3
     config.num_beams = 15
+    config.diversity_penalty = 1
    config.num_return_sequences = config.num_beams
 
     beams = pipe.generate(args.prompts, config)
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 29d7119a43..329b68a44c 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -33,8 +33,14 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) {
     read_json_param(data, "stop_strings", stop_strings);
     // note that include_stop_str_in_output is not present in HF GenerationConfig
     read_json_param(data, "include_stop_str_in_output", include_stop_str_in_output);
-    // note that stop_token_ids is not present in HF GenerationConfig
-    read_json_param(data, "stop_token_ids", stop_token_ids);
+    // note that stop_token_ids is not present in HF GenerationConfig, but some generation_config.json define
+    // multiple eos_token_id (e.g.
+    // https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/generation_config.json),
+    // so we need to read them as 'stop_token_ids'
+    read_json_param(data, "eos_token_id", stop_token_ids);
+
+    if (eos_token_id == -1 && !stop_token_ids.empty()) {
+        eos_token_id = *stop_token_ids.begin();
+    }
 
     // note that echo is not present in HF GenerationConfig
     read_json_param(data, "echo", echo);
@@ -195,7 +201,7 @@ void GenerationConfig::validate() const {
                     "ignore_eos is true, in this case either 'max_new_tokens', or 'max_length' should be defined.");
     OPENVINO_ASSERT(eos_token_id != -1 || !stop_token_ids.empty() || !stop_strings.empty() || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX,
-                    "Either 'eos_token_id', or 'stop_token_ids', or ''stop_strings'', or 'max_new_tokens', or 'max_length' should be defined.");
+                    "Either 'eos_token_id', or 'stop_token_ids', or 'stop_strings', or 'max_new_tokens', or 'max_length' should be defined.");
     OPENVINO_ASSERT(max_new_tokens > 0 || (max_new_tokens == 0 && echo), "'max_new_tokens' must be greater than 0, if `echo` is set, 0 is also accepted");
     OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens");
 
@@ -217,8 +223,8 @@ void GenerationConfig::validate() const {
     }
 
     if (is_multinomial()) {
-        OPENVINO_ASSERT(top_k > 0, "When 'do_sample' is true, top_k must be a strictly positive, but got ", top_k);
-        OPENVINO_ASSERT(top_p > 0 && top_p <= 1.0f, "When 'do_sample' is true, top_p must be a positive float > 0 and < 1, but got ", top_p);
+        OPENVINO_ASSERT(top_k >= 0, "When 'do_sample' is true, top_k must be non-negative, but got ", top_k);
+        OPENVINO_ASSERT(top_p > 0 && top_p <= 1.0f, "When 'do_sample' is true, top_p must be a float > 0.0 and <= 1.0, but got ", top_p);
         OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature);
     } else {
         // parameters requiring multinomial
@@ -246,8 +252,8 @@ void GenerationConfig::validate() const {
         // parameters requiring beam search
         OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
         OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
-        OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to non default value 0.0f, but got ", diversity_penalty, ", which is supported only by beam search sampling, but 'num_beams' is set to 1");
-        OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to non default value 1.0f, but got ", length_penalty, ", which is supported only by beam search sampling, but 'num_beams' is set to 1");
+        OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
+        OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
     }
 
     // assistant generation
diff --git a/src/cpp/src/json_utils.hpp b/src/cpp/src/json_utils.hpp
index 13d792e9db..4a4bb001df 100644
--- a/src/cpp/src/json_utils.hpp
+++ b/src/cpp/src/json_utils.hpp
@@ -4,6 +4,9 @@
 
 #pragma once
 
+#include <set>
+#include <vector>
+
 #include <nlohmann/json.hpp>
 
 namespace ov {
@@ -40,6 +43,15 @@ void read_json_param(const nlohmann::json& data, const std::string& name, std::vector<T>& param) {
     }
 }
 
+template <typename V>
+void read_json_param(const nlohmann::json& data, const std::string& name, std::set<V>& param) {
+    if (data.contains(name) && data[name].is_array()) {
+        for (const auto elem : data[name]) {
+            param.insert(elem.get<V>());
+        }
+    }
+}
+
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index ea9ad1bbf3..5d82fa89a3 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -609,6 +609,8 @@ class GenerationConfig:
         ...
     def is_greedy_decoding(self) -> bool:
         ...
+    def is_multinomial(self) -> bool:
+        ...
     def is_prompt_lookup(self) -> bool:
         ...
     def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None:
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index c7517400f3..a4bf99856b 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -118,6 +118,7 @@ void init_generation_config(py::module_& m) {
         .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"))
         .def("is_beam_search", &GenerationConfig::is_beam_search)
         .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding)
+        .def("is_multinomial", &GenerationConfig::is_multinomial)
         .def("is_assisting_generation", &GenerationConfig::is_assisting_generation)
         .def("is_prompt_lookup", &GenerationConfig::is_prompt_lookup)
         .def("validate", &GenerationConfig::validate)
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index f940d272ed..3dcae674ce 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -299,7 +299,7 @@ def convert_to_hf(
     kwargs['pad_token_id'] = default_generation_config.pad_token_id
     kwargs['repetition_penalty'] = generation_config.repetition_penalty
 
-    if generation_config.num_beams > 1:
+    if generation_config.is_beam_search():
         # beam search case
         kwargs['num_beam_groups'] = generation_config.num_beam_groups
         kwargs['num_beams'] = generation_config.num_beams
@@ -309,7 +309,7 @@ def convert_to_hf(
         kwargs['output_scores'] = True
         if generation_config.num_beam_groups > 1:
             kwargs['diversity_penalty'] = generation_config.diversity_penalty
-    elif generation_config.do_sample:
+    elif generation_config.is_multinomial():
         # multinomial
         kwargs['temperature'] = generation_config.temperature
         kwargs['top_k'] = generation_config.top_k
diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py
index 28477c58a3..6e4ae49716 100644
--- a/tests/python_tests/test_generation_config.py
+++ b/tests/python_tests/test_generation_config.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from openvino_genai import GenerationConfig
+from typing import Tuple, List
+import json
+import os
 import pytest
 
 configs = [
@@ -94,3 +97,38 @@ def test_invalid_generation_configs_throws(generation_config):
     config = GenerationConfig()
     with pytest.raises(RuntimeError):
         config.update_generation_config(**generation_config)
+
+
+def load_genai_generation_config_from_file(configs: List[Tuple], temp_path):
+    for json_file in temp_path.glob("*.json"):
+        json_file.unlink()
+
+    for config_json, config_name in configs:
+        with (temp_path / config_name).open('w') as f:
+            json.dump(config_json, f)
+
+    ov_generation_config = GenerationConfig(temp_path / "generation_config.json")
+
+    for _, config_name in configs:
+        os.remove(temp_path / config_name)
+
+    return ov_generation_config
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_multiple_eos_are_read_as_stop_token_ids(tmp_path):
+    generation_config_json = {
+        "eos_token_id": [
+            2,
+            32000,
+            32007
+        ]
+    }
+    configs = [
+        (generation_config_json, "generation_config.json"),
+    ]
+
+    generation_config = load_genai_generation_config_from_file(configs, tmp_path)
+
+    assert generation_config.eos_token_id == 2
+    assert generation_config.stop_token_ids == {2, 32000, 32007}
diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py
index 6c27edcd71..8129298763 100644
--- a/tests/python_tests/test_tokenizer.py
+++ b/tests/python_tests/test_tokenizer.py
@@ -1,6 +1,7 @@
 # Copyright (C) 2023-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import pytest
 import numpy as np
 from transformers import AutoTokenizer
@@ -17,15 +18,19 @@
 
 
 def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path):
-    # load Tokenizer where all configs are cleared.
-    # remove existing jsons from previous tests
     for json_file in temp_path.glob("*.json"):
         json_file.unlink()
 
     for config_json, config_name in configs:
         with (temp_path / config_name).open('w') as f:
             json.dump(config_json, f)
-    return openvino_genai.Tokenizer(temp_path)
+
+    ov_tokenizer = openvino_genai.Tokenizer(temp_path)
+
+    for _, config_name in configs:
+        os.remove(temp_path / config_name)
+
+    return ov_tokenizer
 
 
 def get_chat_templates():
@@ -181,7 +186,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
 @pytest.mark.nightly
 def test_set_chat_template():
     model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
     prompt = "how are you?"
     dummy_conversation = [
diff --git a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp
index 6cf462fdf8..e0c50cda02 100644
--- a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp
+++ b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp
@@ -123,11 +123,6 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data
         ov::genai::GenerationConfig greedy_search = ov::genai::greedy();
         greedy_search.max_new_tokens = std::min(max_output_len, output_len);
         greedy_search.ignore_eos = true;
-        greedy_search.repetition_penalty = 1.0;
-        greedy_search.frequency_penalty = 0.0;
-        greedy_search.presence_penalty = 0.0;
-        greedy_search.diversity_penalty = 0.0;
-        greedy_search.length_penalty = 0.0;
 
         dataset.push_data(human_question, greedy_search);
         dataset.push_lens(input_len, output_len);
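
The core of the patch is the pair of changes in json_utils.hpp and generation_config.cpp: a std::set overload of read_json_param that collects an array-valued "eos_token_id", and a fallback that promotes one of those ids into the scalar eos_token_id. The standalone program below is a minimal sketch of that behaviour for review purposes, not the library code: it assumes only nlohmann/json and re-declares an illustrative copy of the overload added above.

#include <cstdint>
#include <iostream>
#include <set>
#include <string>

#include <nlohmann/json.hpp>

// Illustrative copy of the overload added to json_utils.hpp: the set is only
// populated when the field exists and is a JSON array, so a scalar
// "eos_token_id" leaves it untouched (the scalar form is read elsewhere).
template <typename V>
void read_json_param(const nlohmann::json& data, const std::string& name, std::set<V>& param) {
    if (data.contains(name) && data[name].is_array()) {
        for (const auto elem : data[name]) {
            param.insert(elem.get<V>());
        }
    }
}

int main() {
    // Trimmed-down version of InternVL2-4B's generation_config.json, which
    // defines multiple EOS token ids.
    auto data = nlohmann::json::parse(R"({"eos_token_id": [2, 32000, 32007]})");

    int64_t eos_token_id = -1;  // -1 means "not set", as in GenerationConfig
    std::set<int64_t> stop_token_ids;

    read_json_param(data, "eos_token_id", stop_token_ids);

    // Fallback from the GenerationConfig constructor: if no scalar
    // eos_token_id was read, promote one entry of stop_token_ids so that
    // validate() still sees a usable id.
    if (eos_token_id == -1 && !stop_token_ids.empty()) {
        eos_token_id = *stop_token_ids.begin();
    }

    std::cout << "eos_token_id: " << eos_token_id << "\n";  // prints 2
    std::cout << "stop_token_ids:";
    for (int64_t id : stop_token_ids) {
        std::cout << " " << id;
    }
    std::cout << "\n";  // prints 2 32000 32007
    return 0;
}

One consequence worth noting: std::set keeps its elements ordered, so *stop_token_ids.begin() selects the numerically smallest id rather than the first one listed in the JSON. That is why the new test asserts eos_token_id == 2 for the [2, 32000, 32007] config.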