From 4b9dd6a48daac6ef8efe95d3a617887ba399dbd2 Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Mon, 9 Dec 2024 23:04:55 +0400
Subject: [PATCH] Port to master (#1335)

Ported from https://github.com/openvinotoolkit/openvino.genai/pull/1319 and https://github.com/openvinotoolkit/openvino.genai/pull/1219

---------

Co-authored-by: Ilya Lavrenov
---
 .../openvino/genai/generation_config.hpp  |  8 +++---
 src/cpp/src/generation_config.cpp         |  3 +++
 .../openvino_genai/py_openvino_genai.pyi  | 18 ++++++-------
 src/python/py_generation_config.cpp       |  6 ++---
 tests/python_tests/test_generate_api.py   | 26 +++++++++++++++++++
 5 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 8d23b298ba..2402f57fba 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -36,11 +36,11 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
  * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
  * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
  * @param eos_token_id token_id of <eos> (end of sentence)
- * @param min_new_tokens set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching.
+ * @param min_new_tokens set 0 probability for eos_token_id for the first eos_token_id generated tokens.
  *
- * @param stop_strings vector of strings that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+ * @param stop_strings A set of strings that will cause pipeline to stop generating further tokens.
  * @param include_stop_str_in_output if set to true stop string that matched generation will be included in generation output (default: false)
- * @param stop_token_ids vector of tokens that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+ * @param stop_token_ids A set of tokens that will cause pipeline to stop generating further tokens.
  * @param echo if set to true, output will include user prompt (default: false).
  * @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
  *                 Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
@@ -154,7 +154,7 @@ static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
 static constexpr ov::Property<size_t> max_length{"max_length"};
 static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};
 static constexpr ov::Property<size_t> min_new_tokens{"min_new_tokens"};
-static constexpr ov::Property<std::vector<std::string>> stop_strings{"stop_strings"};
+static constexpr ov::Property<std::set<std::string>> stop_strings{"stop_strings"};
 static constexpr ov::Property<bool> include_stop_str_in_output{"include_stop_str_in_output"};
 static constexpr ov::Property<std::set<int64_t>> stop_token_ids{"stop_token_ids"};
 
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 3d7e05a762..0829e8376a 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -131,6 +131,9 @@ bool GenerationConfig::is_speculative_decoding() const {
 }
 
 void GenerationConfig::validate() const {
+    OPENVINO_ASSERT(eos_token_id == -1 || stop_token_ids.find(eos_token_id) != stop_token_ids.end(),
+                    "'stop_token_ids' must contain 'eos_token_id'. Please, call 'set_eos_token_id' with 'eos_token_id' value");
+
     OPENVINO_ASSERT(!do_sample || num_beams == 1,
                     "Beam search with sampling is not supported yet. "
                     "Please either set do_sample=false to use beam search "
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 7c6555f38f..b13ee37f24 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -483,10 +483,10 @@ class GenerationConfig:
     max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
     ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
     eos_token_id: token_id of <eos> (end of sentence)
-    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching.
-    stop_strings: list of strings that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
+    stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
     include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
-    stop_token_ids: list of tokens that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
     echo: if set to true, the model will echo the prompt in the output.
     logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
               Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
@@ -756,10 +756,10 @@ class LLMPipeline:
     max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
     ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
     eos_token_id: token_id of <eos> (end of sentence)
-    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching.
-    stop_strings: list of strings that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
+    stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
     include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
-    stop_token_ids: list of tokens that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
     echo: if set to true, the model will echo the prompt in the output.
     logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
               Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
@@ -837,10 +837,10 @@ class LLMPipeline:
     max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
     ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
     eos_token_id: token_id of <eos> (end of sentence)
-    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching.
-    stop_strings: list of strings that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
+    stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
     include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
-    stop_token_ids: list of tokens that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
     echo: if set to true, the model will echo the prompt in the output.
     logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
               Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index dc5ec98ed6..d24a915dd6 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -40,10 +40,10 @@ char generation_config_docstring[] = R"(
     max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
     ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
     eos_token_id: token_id of <eos> (end of sentence)
-    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching.
-    stop_strings: list of strings that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
+    stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
     include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
-    stop_token_ids: list of tokens that will cause pipeline to stop generating further tokens. Ignored for non continuous batching.
+    stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
     echo: if set to true, the model will echo the prompt in the output.
     logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
               Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index 80df79f31b..d17f3c0232 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -844,3 +844,29 @@ def test_batch_switch():
     pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
     pipe.generate(["a"], max_new_tokens=2)
     pipe.generate(["1", "2"], max_new_tokens=2)
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_stop_token_ids():
+    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    res = pipe.generate(
+        ov.Tensor([(1,)]),
+        max_new_tokens=3,
+        stop_token_ids={-1, 9935, pipe.get_tokenizer().get_eos_token_id()},
+        include_stop_str_in_output=False
+    )
+    assert 2 == len(res.tokens[0])
+    assert 9935 in res.tokens[0]
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_stop_strings():
+    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    res = pipe.generate(
+        "",
+        max_new_tokens=5,
+        stop_strings={"ignored", "боль"}
+    )
+    assert "боль" not in res
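
Usage note (a sketch, not part of the patch itself): the snippet below shows how the stop_strings and stop_token_ids options exercised by the new tests can be passed as keyword arguments to LLMPipeline.generate(). The model directory "./tiny-random-phi3", the prompt, and the extra token id 9935 are placeholders; only the keyword names and the get_eos_token_id() call come from the diff above.

    import openvino_genai

    # Assumes a model has already been exported to this directory (placeholder path).
    pipe = openvino_genai.LLMPipeline("./tiny-random-phi3", "CPU")

    # stop_strings is a set of strings; generation halts once any of them is produced.
    text = pipe.generate(
        "Tell me a story",
        max_new_tokens=100,
        stop_strings={"\n\n", "The End"},
        include_stop_str_in_output=False,
    )

    # stop_token_ids is a set of token ids. The new validate() check requires that,
    # when eos_token_id is set (not -1), it is present in stop_token_ids, so the
    # tokenizer's EOS id is added explicitly, mirroring test_stop_token_ids above.
    eos_id = pipe.get_tokenizer().get_eos_token_id()
    text = pipe.generate(
        "Tell me a story",
        max_new_tokens=100,
        stop_token_ids={9935, eos_id},
    )
    print(text)

Switching the stop_strings property to a set (and the docstrings to "a set of ...") means duplicates and ordering of stop values are irrelevant, which is what the Python set literals in the tests rely on.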