From d51655319062f50f8ac2ff2cbd4c8d69d5918d01 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Thu, 2 Jan 2025 13:05:25 +0100
Subject: [PATCH] Refactored tests

---
 tests/python_tests/common.py            | 142 +++-------------
 tests/python_tests/test_llm_pipeline.py | 148 +----------------
 tests/python_tests/test_sampling.py     | 210 +++++++++++-------------
 tests/python_tests/test_vlm_pipeline.py |  17 +-
 4 files changed, 127 insertions(+), 390 deletions(-)

diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 6b5dc8419e..951ab08230 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -20,20 +20,6 @@ def get_greedy() -> GenerationConfig:
     generation_config.max_new_tokens = 30
     return generation_config
 
-def get_greedy_with_min_and_max_tokens() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 30
-    return generation_config
-
-def get_greedy_with_repetition_penalty() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.repetition_penalty = 2.0
-    generation_config.max_new_tokens = 30
-    return generation_config
-
 def get_greedy_with_penalties() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_return_sequences = 1
@@ -42,33 +28,6 @@ def get_greedy_with_penalties() -> GenerationConfig:
     generation_config.max_new_tokens = 30
     return generation_config
 
-def get_greedy_with_single_stop_string() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 50
-    generation_config.stop_strings = {"anag"} # expected match on "manage"
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_with_multiple_stop_strings() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 1
-    generation_config.max_new_tokens = 50
-    generation_config.stop_strings = {".", "software", "Intel"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_with_multiple_stop_strings_no_match() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 1
-    generation_config.max_new_tokens = 50
-    generation_config.stop_strings = {"Einstein", "sunny", "geothermal"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
 def get_beam_search() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
@@ -79,78 +38,6 @@ def get_beam_search() -> GenerationConfig:
     generation_config.num_return_sequences = generation_config.num_beams
     return generation_config
 
-def get_beam_search_min_and_max_tokens() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 30
-    generation_config.num_return_sequences = 3
-    generation_config.num_return_sequences = generation_config.num_beams
-    return generation_config
-
-def get_beam_search_with_single_stop_string() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.max_new_tokens = 50
-    generation_config.num_return_sequences = generation_config.num_beams
-    generation_config.stop_strings = {"open sour"} # expected match on "open source"
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_beam_search_with_multiple_stop_strings() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.max_new_tokens = 50
-    generation_config.num_return_sequences = generation_config.num_beams
-    generation_config.stop_strings = {".", "software", "Intel"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.max_new_tokens = 30
-    generation_config.num_return_sequences = generation_config.num_beams
-    generation_config.stop_strings = {"Einstein", "sunny", "geothermal"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines" }
-    generation_config.include_stop_str_in_output = False
-    return generation_config
-
-def get_greedy_stop_strings_include_to_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines" }
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines", "manage" }
-    generation_config.include_stop_str_in_output = False
-    return generation_config
-
-def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines", "manage" }
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
 def get_multinomial_temperature() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.do_sample = True
@@ -300,9 +187,15 @@ def convert_to_hf(
 
     # copy default parameters
     kwargs['bos_token_id'] = default_generation_config.bos_token_id
-    kwargs['eos_token_id'] = default_generation_config.eos_token_id
     kwargs['pad_token_id'] = default_generation_config.pad_token_id
 
+    if len(generation_config.stop_token_ids) > 0:
+        kwargs['eos_token_id'] = list(generation_config.stop_token_ids)
+    elif generation_config.eos_token_id != -1:
+        kwargs['eos_token_id'] = generation_config.eos_token_id
+    else:
+        kwargs['eos_token_id'] = default_generation_config.eos_token_id
+
     # copy penalties
     kwargs['repetition_penalty'] = generation_config.repetition_penalty
 
@@ -314,8 +207,12 @@ def convert_to_hf(
         kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size
         kwargs['num_return_sequences'] = generation_config.num_return_sequences
         kwargs['output_scores'] = True
+
         if generation_config.num_beam_groups > 1:
            kwargs['diversity_penalty'] = generation_config.diversity_penalty
+
+        from ov_genai_test_utils import STOP_CRITERIA_MAP
+        kwargs['early_stopping'] = STOP_CRITERIA_MAP[generation_config.stop_criteria]
     elif generation_config.is_multinomial():
         # mulitinomial
         kwargs['temperature'] = generation_config.temperature
@@ -338,9 +235,10 @@ def run_hugging_face(
 ) -> List[GenerationResult]:
     hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config)
     generation_results = []
+
     for prompt in prompts:
         inputs = hf_tokenizer(prompt, return_tensors="pt")
-        prompt_len = inputs['input_ids'].numel()
+        prompt_len = 0 if generation_config.echo else inputs['input_ids'].numel()
         generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=hf_generation_config, return_dict_in_generate=True, tokenizer=hf_tokenizer)
         all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
@@ -379,9 +277,11 @@ def run_continuous_batching(
 def run_llm_pipeline(
     models_path : Path,
     prompts: List[str],
-    generation_config : GenerationConfig
+    generation_config : GenerationConfig,
+    use_cb : bool = False
 ) -> List[GenerationResult]:
-    ov_pipe = LLMPipeline(models_path, device='CPU')
+    properties = { 'scheduler_config' : SchedulerConfig() } if use_cb else { }
+    ov_pipe = LLMPipeline(models_path, device='CPU', **properties)
     generation_results = []
 
     for prompt in prompts:
@@ -400,7 +300,7 @@ def run_llm_pipeline(
     return generation_results
 
 
-def compare_generation_result(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig ):
+def compare_generation_result(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig):
     if generation_config.is_beam_search():
         assert len(hf_result.m_scores) == len(ov_result.m_scores)
         for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores):
@@ -429,7 +329,7 @@ def compare_generation_results(prompts: List[str], hf_results: List[GenerationRe
         compare_generation_result(ref_result, ov_result, generation_config)
 
 
-def get_hugging_face_models(model_id: str, use_optimum = True):
+def get_hugging_face_models(model_id: str, use_optimum = True):
     hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \
         AutoModelForCausalLM.from_pretrained(model_id)
@@ -448,7 +348,7 @@ def convert_models(opt_model : OVModelForCausalLM, hf_tokenizer : AutoTokenizer,
     serialize(detokenizer, models_path / "openvino_detokenizer.xml")
 
 
-def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], generation_config: GenerationConfig | dict, tmp_path: Path):
+def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], generation_config: GenerationConfig | dict, tmp_path: Path, use_cb : bool = False):
     use_optimum = True
     models_path : Path = tmp_path / model_id
     opt_model, hf_tokenizer = get_hugging_face_models(model_id, use_optimum)
@@ -459,7 +359,7 @@ def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], generation_conf
     if use_optimum:
         convert_models(opt_model, hf_tokenizer, models_path)
 
-    ov_results = run_llm_pipeline(models_path, prompts, generation_config)
+    ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb)
     hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config)
 
     compare_generation_results(prompts, hf_results, ov_results, generation_config)
diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py
index 6e3cce06d0..f5a23ca276 100644
--- a/tests/python_tests/test_llm_pipeline.py
+++ b/tests/python_tests/test_llm_pipeline.py
@@ -143,18 +143,14 @@ def run_hf_ov_genai_comparison_encoded_inputs(
 #
 test_cases = [
-    (dict(max_new_tokens=20), 'table is made of'),
     (dict(max_new_tokens=20), '你好! 你好嗎?'),
     (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'),
-    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
-    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'),
-    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'),
 ]
 @pytest.mark.parametrize("generation_config,prompt", test_cases)
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_decoding(model_descr, generation_config, prompt):
+def test_string_inputs(model_descr, generation_config, prompt):
     run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
 
@@ -171,120 +167,6 @@ def test_encoded_inputs(model_descr, inputs):
     run_hf_ov_genai_comparison_encoded_inputs(read_model(model_descr), dict(max_new_tokens=20), *inputs)
 
 
-test_configs = [
-    dict(max_new_tokens=20),
-    dict(max_new_tokens=200, ignore_eos=True),
-    dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0)
-]
-batched_prompts = [
-    ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'],
-    ['hello', 'Here is the longest nowel ever: '],
-    ['Alan Turing was a', 'return 0', '你好! 你好嗎?'],
-    ['table is made', 'table is made [force left pad tokens]']
-]
-@pytest.mark.parametrize("generation_config", test_configs)
-@pytest.mark.parametrize("prompts", batched_prompts)
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_batch_text_input(model_descr, generation_config, prompts):
-    run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)
-
-
-prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of']
-@pytest.mark.parametrize("num_beam_groups", [2, 3, 8])
-@pytest.mark.parametrize("group_size", [5, 3, 10])
-@pytest.mark.parametrize("max_new_tokens", [20, 15])
-@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5])
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt):
-    generation_config = dict(
-        num_beam_groups=num_beam_groups,
-        num_beams=num_beam_groups * group_size,
-        diversity_penalty=diversity_penalty,
-        num_return_sequences=num_beam_groups * group_size,
-        max_new_tokens=max_new_tokens,
-    )
-    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
-
-
-@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC])
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("max_new_tokens", [10, 80])
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_beam_search_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
-    # todo: with EARLY stop_criteria looks like HF return invalid out with sentence
-    # while genai ends sentence with
-    if (stop_criteria == StopCriteria.EARLY):
-        pytest.skip()
-    generation_config = dict(
-        num_beam_groups=2,
-        num_beams=2 * 3,
-        diversity_penalty=1.0,
-        num_return_sequences=2 * 3,
-        max_new_tokens=max_new_tokens,
-        stop_criteria=stop_criteria,
-    )
-    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
-
-
-# test long sequences
-@pytest.mark.parametrize("num_beam_groups", [2])
-@pytest.mark.parametrize("group_size", [5])
-@pytest.mark.parametrize("max_new_tokens", [800, 2000])
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.nightly
-def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size,
-                                    max_new_tokens, prompt):
-    generation_config = dict(
-        num_beam_groups=num_beam_groups,
-        num_beams=num_beam_groups * group_size,
-        diversity_penalty=1.0,
-        num_return_sequences=num_beam_groups * group_size,
-        max_new_tokens=max_new_tokens,
-    )
-    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
-
-
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_greedy_repetition_penalty(model_descr, prompt):
-    model_id, path, tokenizer, model, pipe = read_model(model_descr)
-
-    generation_config = dict(
-        repetition_penalty=2.0,
-        max_new_tokens=20,
-        do_sample=False
-    )
-    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
-
-    generation_config = dict(
-        repetition_penalty=1.0,
-        max_new_tokens=20,
-        do_sample=False
-    )
-    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
-
-    ov_output = pipe.generate(prompt, **generation_config)
-
-    generation_config = dict(
-        repetition_penalty=0.5,
-        max_new_tokens=20,
-        do_sample=False
-    )
-    ov_output_half_penalty = pipe.generate(prompt, **generation_config)
-
-    assert(len(set(ov_output.split(' '))) > len(set(ov_output_half_penalty.split(' '))))
-
-
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_batch_size_switch():
@@ -657,34 +539,6 @@ def test_perf_metrics(model_descr, generation_config, prompt):
 # Misc
 #
 
-# TODO: move to test_sampling.py
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_stop_token_ids():
-    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = ov_pipe.generate(
-        ov.Tensor([(1,)]),
-        max_new_tokens=3,
-        stop_token_ids={9935, ov_pipe.get_tokenizer().get_eos_token_id()},
-        include_stop_str_in_output=False
-    )
-    assert 2 == len(res.tokens[0])
-    assert 9935 in res.tokens[0]
-
-
-# TODO: move to test_sampling.py
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_stop_strings():
-    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = ov_pipe.generate(
-        "",
-        max_new_tokens=5,
-        stop_strings={"ignored", "боль"}
-    )
-    assert "боль" not in res
-
-
 # TODO: move this test to test_tokenizer.py
 @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
 @pytest.mark.precommit
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 3475ee5cba..3df4223219 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -1,76 +1,93 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-import os
+
 import sys
 import pytest
-import shutil
 import sys
 from dataclasses import dataclass
 from pathlib import Path
-from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
+from openvino_genai import GenerationConfig, StopCriteria
 from typing import List, TypedDict
 
-from common import get_hugging_face_models, convert_models, \
-    get_greedy, get_beam_search, get_multinomial_temperature, \
-    get_greedy_with_penalties, get_multinomial_temperature, \
-    get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \
-    get_multinomial_temperature_top_p_and_top_k, get_greedy_with_repetition_penalty, \
-    get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
-    get_greedy, get_greedy_with_min_and_max_tokens, \
-    get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \
-    get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \
-    get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \
-    get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \
-    get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \
-    get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \
-    run_llm_pipeline_with_ref, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \
-    run_continuous_batching
+from common import get_hugging_face_models, convert_models, run_llm_pipeline_with_ref, run_llm_pipeline
 
-# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests
 @pytest.mark.precommit
-def test_beam_search_has_eos_token_at_end(tmp_path):
-    '''
-    Current test checks that in case of beam search, some generation results
-    explicitly have EOS token at the end, which is aligned with HF
+@pytest.mark.parametrize("generation_config,prompt",
+    [(dict(max_new_tokens=30), 'table is made of'),
+     (dict(max_new_tokens=30, min_new_tokens=30), '你好! 你好嗎?'),
+     (dict(max_new_tokens=30, ignore_eos=True), 'Alan Turing was a'),
+     # (dict(max_length=40), 'table is made of'),
+     (dict(stop_token_ids={28998}), 'The Sun is yellow because'), # if the test does not hang, the stop token was met
+     # (dict(max_new_tokens=1, min_new_tokens=0, echo=True), 'What is OpenVINO?')
+     ],
+    ids=["max_new_tokens",
+         "min_and_max_new_tokens",
+         "max_new_tokens_and_ignore_eos_true",
+         # "max_length",
+         "stop_token_ids",
+         # "echo_with_generation",
+         ])
+def test_basic_stop_criteria(tmp_path, generation_config, prompt):
+    model_id : str = "katuni4ka/tiny-random-phi3"
+    run_llm_pipeline_with_ref(model_id, [prompt], generation_config, tmp_path)
 
-    Example of current output:
-    { -1.23264, that I don't know about.
-    I don't know what you're talking about, but I'm pretty sure it's a Canadian thing. }
-    '''
-    model_id = "facebook/opt-125m"
-    prompts = ["Tell me something about Canada"]
-    run_llm_pipeline_with_ref(model_id, prompts, get_beam_search(), tmp_path)
-
-# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests
 @pytest.mark.precommit
-def test_greedy_has_eos_token_at_end(tmp_path):
-    '''
-    Current test checks that in case of gready, some generation results
-    explicitly have EOS token at the end, which is aligned with HF:
+@pytest.mark.parametrize("generation_config",
+    [dict(max_new_tokens=50, min_new_tokens=15, stop_strings={"anag"}, include_stop_str_in_output=True), # expected match on "manage"
+     dict(max_new_tokens=50, min_new_tokens=1, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True),
+     dict(max_new_tokens=50, min_new_tokens=1, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True), # expected no match
+     dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=False),
+     dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=True),
+     dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=False),
+     dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=True),],
+    ids=["single_stop_string",
+         "multiple_stop_strings_match",
+         "multiple_stop_strings_no_match",
+         "single_stop_string_exclude_from_output",
+         "single_stop_string_include_to_output",
+         "multiple_stop_strings_exclude_from_output",
+         "multiple_stop_strings_include_to_output"])
+def test_stop_strings(tmp_path, generation_config):
+    prompts = [ "What is OpenVINO?" ]
+    model_id : str = "facebook/opt-125m"
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
 
-    Example of current output:
-    { a software program }
-    '''
-    model_id = "bigscience/bloomz-560m"
-    prompts = ["What is OpenVINO?"]
-    run_llm_pipeline_with_ref(model_id, prompts, get_greedy(), tmp_path)
+
+@pytest.mark.precommit
+@pytest.mark.parametrize("generation_config",
+    [dict(max_new_tokens=30),
+     dict(max_new_tokens=30, repetition_penalty=2.0),],
+    ids=["basic",
+         "repetition_penalty",])
+def test_greedy(tmp_path, generation_config):
+    prompts = [ "What is OpenVINO?" ]
+    model_id : str = "katuni4ka/tiny-random-phi3"
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
 
 
-# TODO: consider removing all these functions with generation configs and use Dict with properties, which can be converted to generation config
 @pytest.mark.precommit
 @pytest.mark.parametrize("generation_config",
-    [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(),
-     get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(),
-     get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(),
-     get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(),
-     get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output()],
-    ids=["greedy", "greedy_with_min_and_max_tokens", "greedy_with_repetition_penalty", "greedy_with_single_stop_string",
-         "greedy_with_multiple_stop_strings", "greedy_with_multiple_stop_strings_no_match", "beam_search", "beam_search_min_and_max_tokens",
-         "beam_search_with_multiple_stop_strings_no_match", "greedy_stop_strings_exclude_from_output", "greedy_stop_strings_include_to_output",
-         "greedy_n_stop_strings_exclude_from_output", "greedy_n_stop_strings_include_to_output"])
-def test_sampling_against_optimum(tmp_path, generation_config):
+    [dict(max_new_tokens=30, num_beams=2),
+     dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.NEVER),
+     dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.EARLY),
+     # dict(max_new_tokens=30, num_beams=2, echo=True),
+     dict(max_new_tokens=30, num_beams=2, length_penalty=1.0),
+     dict(max_new_tokens=30, num_beams=2, no_repeat_ngram_size=2),
+     dict(max_new_tokens=30, num_beams=6, num_beam_groups=3, diversity_penalty=1.2, num_return_sequences=3),
+     dict(max_new_tokens=30, min_new_tokens=15, num_beams=2, num_return_sequences=1),
+     dict(max_new_tokens=30, num_beams=2, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True),],
+    ids=["single_group_stop_criteria_heuristic",
+         "single_group_stop_criteria_never",
+         "single_group_stop_criteria_early",
+         # "single_group_with_echo",
+         "single_group_length_penalty",
+         "single_group_no_repeat_ngram_size",
+         "multiple_groups",
+         "single_group_min_new_tokens",
+         "single_group_with_multiple_stop_strings_no_match",])
+def test_beam_search(tmp_path, generation_config):
     prompts = [ "What is OpenVINO?" ]
     model_id : str = "facebook/opt-125m"
     run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
@@ -82,14 +99,30 @@ def test_sampling_against_optimum(tmp_path, generation_config):
     reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. If it changes, these cases shall be merged to the test above.",
     strict=True,
 )
-@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings()],
-                         ids=["beam_search_with_single_stop_string", "beam_search_with_multiple_stop_strings"])
+@pytest.mark.parametrize("generation_config",
+    [dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={"open sour"}, include_stop_str_in_output=True),
+     dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True),],
+    ids=["single_stop_string_match", "multiple_stop_strings_match"])
 def test_beam_search_with_stop_string(tmp_path, generation_config):
     prompts = [ "What is OpenVINO?" ]
     model_id : str = "facebook/opt-125m"
     run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
 
 
+@pytest.mark.precommit
+@pytest.mark.parametrize("generation_config",
+    [dict(max_new_tokens=1, min_new_tokens=0, echo=True),
+     dict(max_new_tokens=30, num_beams=2, echo=True),],
+    ids=["echo_with_generation",
+         "single_group_with_echo",])
+def test_echo(tmp_path, generation_config):
+    prompts = [ "What is OpenVINO?" ]
+    model_id : str = "facebook/opt-125m"
+    # TODO: support echo in stateful mode, then remove 'use_cb=True' and this test entirely,
+    # as the new parameter sets can be enabled in other tests
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path, use_cb=True)
+
+
 # TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF
 # and merge this tests with 'test_sampling_against_optimum' by extending a list of generation configs
 
@@ -117,6 +150,12 @@ class RandomSamplingTestStruct:
     prompts: List[str]
     ref_texts: List[List[str]]
 
+from common import get_multinomial_temperature, get_greedy_with_penalties, \
+    get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \
+    get_multinomial_temperature_top_p_and_top_k, get_multinomial_all_parameters, \
+    get_multinomial_temperature_and_num_return_sequence, get_multinomial_max_and_min_token, \
+    get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \
+    get_multinomial_temperature_and_repetition_penalty
 
 RANDOM_SAMPLING_TEST_CASES = [
     RandomSamplingTestStruct(
@@ -279,7 +318,7 @@ def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSam
     prompts = test_struct.prompts
     generation_config.rng_seed = 0
-    generation_configs = [generation_config]
+    generation_configs = generation_config
 
     model_id : str = "facebook/opt-125m"
     model, hf_tokenizer = get_hugging_face_models(model_id, use_optimum=True)
@@ -287,64 +326,7 @@ def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSam
     convert_models(model, hf_tokenizer, models_path)
 
     # run multinomial without comparison with reference
-    _ = run_continuous_batching(models_path, get_scheduler_config(), prompts, generation_configs)
+    _ = run_llm_pipeline(models_path, prompts, generation_configs)
 
     # Reference comparison is not performed as sampling results are non-deterministic.
     # Discrete_distribution impl depends on platform, model inference results may depend on CPU.
-
-
-@pytest.mark.precommit
-@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters],
-                         ids=["greedy", "beam_search", "multinomial_all_parameters"])
-@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256])
-def test_echo_prompt_phase_only(tmp_path, get_generation_config, max_num_batched_tokens):
-    generation_config = get_generation_config()
-    generation_config.max_new_tokens = 0
-    generation_config.echo = True
-
-    scheduler_config = get_scheduler_config()
-    scheduler_config.max_num_batched_tokens = max_num_batched_tokens
-    generation_configs = [generation_config]
-    model_id : str = "facebook/opt-125m"
-    opt_model, hf_tokenizer = get_hugging_face_models(model_id, use_optimum=True)
-
-    model_path : Path = tmp_path / model_id
-    convert_models(opt_model, hf_tokenizer, model_path)
-
-    cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
-
-    outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs)
-    assert(len(outputs))
-    for output in outputs:
-        assert(len(output.m_generation_ids))
-        for sequence in output.m_generation_ids:
-            assert(sequence == "What is OpenVINO?")
-
-
-@pytest.mark.precommit
-@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters],
-                         ids=["greedy", "beam_search", "multinomial_all_parameters"])
-@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256])
-def test_echo_with_generation_phase(tmp_path, get_generation_config, max_num_batched_tokens):
-    generation_config = get_generation_config()
-    generation_config.max_new_tokens = 10
-    generation_config.echo = True
-
-    scheduler_config = get_scheduler_config()
-    scheduler_config.max_num_batched_tokens = max_num_batched_tokens
-    generation_configs = [generation_config]
-    model_id : str = "facebook/opt-125m"
-    opt_model, hf_tokenizer = get_hugging_face_models(model_id, use_optimum=True)
-
-    model_path : Path = tmp_path / model_id
-    convert_models(opt_model, hf_tokenizer, model_path)
-
-    cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
-    outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs)
-    assert(len(outputs))
-
-    for output in outputs:
-        assert(len(output.m_generation_ids))
-        for sequence in output.m_generation_ids:
-            assert(sequence.startswith("What is OpenVINO?"))
-            assert(len(sequence) > len("What is OpenVINO?"))
diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py
index b4df6492bb..81c181bc54 100644
--- a/tests/python_tests/test_vlm_pipeline.py
+++ b/tests/python_tests/test_vlm_pipeline.py
@@ -6,8 +6,8 @@
 import pytest
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
-from openvino_genai import VLMPipeline
-from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters
+from openvino_genai import VLMPipeline, GenerationConfig
+from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters
 
 def get_ov_model(cache):
     model_dir = cache.mkdir("tiny-random-minicpmv-2_6")
@@ -49,21 +49,22 @@ def streamer(word: str) -> bool:
         return False
 
     models_path = get_ov_model(cache)
+    generation_config = GenerationConfig(max_new_tokens=30)
 
     for links in image_links_for_testing:
         images = []
         for link in links:
             images.append(get_image_by_link(link))
 
-        pipe = VLMPipeline(models_path, "CPU")
-        pipe.start_chat()
+        ov_pipe = VLMPipeline(models_path, "CPU")
+        ov_pipe.start_chat()
 
-        pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer)
+        ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
 
         for prompt in prompts[1:]:
-            pipe.generate(prompt, generation_config=get_greedy(), streamer=streamer)
+            ov_pipe.generate(prompt, generation_config=generation_config, streamer=streamer)
 
-        pipe.finish_chat()
+        ov_pipe.finish_chat()
 
 
 @pytest.mark.precommit
@@ -95,7 +96,7 @@ def test_perf_metrics(cache):
     images = [get_image_by_link(image_links[0])]
 
     pipe = VLMPipeline(models_path, "CPU")
-    result = pipe.generate(prompts[0], images=images, generation_config=get_greedy())
+    result = pipe.generate(prompts[0], images=images, generation_config=GenerationConfig(max_new_tokens=30))
     perf_metrics = result.perf_metrics