diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 4aad3d4bc3..fb0c9c4b0b 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -16,10 +16,10 @@ concurrency:
cancel-in-progress: true
env:
- l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241224_x86_64.tgz
- l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
- m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241224_x86_64.tgz
- w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
+ l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241230_x86_64.tgz
+ l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
+ m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241230_x86_64.tgz
+ w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip
jobs:
cpp-multinomial-greedy_causal_lm-ubuntu:
runs-on: ubuntu-20.04-8-cores
diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/genai-tools.yml
similarity index 78%
rename from .github/workflows/llm_bench-python.yml
rename to .github/workflows/genai-tools.yml
index 56145c080c..bd6cb46362 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/genai-tools.yml
@@ -1,7 +1,7 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-name: llm_bench Python Test
+name: GenAI tools
on:
workflow_dispatch:
@@ -44,9 +44,10 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
- build:
+ llm_bench:
+ name: 'LLM bench tests'
defaults:
run:
shell: bash
@@ -60,7 +61,6 @@ jobs:
OV_INSTALL_DIR: ${{ github.workspace }}/ov
SRC_DIR: ${{ github.workspace }}
LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
- WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -70,6 +70,12 @@ jobs:
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
+ - name: Lint with flake8
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install flake8 pytest black
+ # stop the build if there are Python syntax errors or undefined names
+ python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
- name: Download OpenVINO package
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
@@ -78,59 +84,42 @@ jobs:
merge-multiple: true
- name: Install dependencies
run: |
- python -m pip install --upgrade pip
- python -m pip install flake8 pytest black
python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
- GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+ python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
working-directory: ${{ env.OV_INSTALL_DIR }}
- - name: Lint with flake8
- run: |
- # stop the build if there are Python syntax errors or undefined names
- python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
- python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
- - name: Create code style diff for samples
- if: failure()
- run: |
- python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/
- git diff > llm.bench_diff.diff
- - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
- if: failure()
- with:
- name: llm.bench_diff
- path: llm.bench_diff.diff
- - name: Test native pytorch model on Linux
+ - name: Test native pytorch model
run: |
git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen
python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt -ic 20
rm -rf tiny-random-qwen
env:
GIT_LFS_SKIP_SMUDGE: 0
- - name: Test tiny-random-baichuan2 on Linux Optimum Intel
+ - name: Test tiny-random-baichuan2 Optimum Intel
run: |
optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum -ic 10
rm -rf ./ov_models/tiny-random-baichuan2
- - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
+ - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov Optimum Intel
run: |
huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4
- - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
+ - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI
run: |
python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4
- - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
+ - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI and LoRA
run: |
wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4
rm -rf ./ov_models/lcm_dreamshaper_v7/
- - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
+ - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding via GenAI
run: |
optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 -ic 20
python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 -ic 20
rm -rf ov_models/TinyLlama-1.1B-Chat-v1.0
- - name: Test whisper-tiny on Linux
+ - name: Test whisper-tiny via GenAI
run: |
GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
cd multilingual_librispeech
@@ -143,60 +132,64 @@ jobs:
python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
rm -rf ./ov_models/whisper-tiny
rm -rf multilingual_librispeech
- - name: Text InternVL2-1B on Linux
+ - name: Test InternVL2-1B via GenAI
run: |
optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code
python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20
python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum
rm -rf ./ov_models/internvl2-1B
- - name: WWB Tests
- run: |
- pip install git+https://github.com/huggingface/optimum-intel.git
- GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
- python -m pytest -v ${{ env.WWB_PATH }}/tests
- stateful:
+
+ wwb:
+ name: 'WWB tests'
defaults:
run:
shell: bash
runs-on: ubuntu-22.04
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.11"]
needs: [ openvino_download ]
env:
OV_INSTALL_DIR: ${{ github.workspace }}/ov
SRC_DIR: ${{ github.workspace }}
- LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
submodules: recursive
- - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
- python-version: "3.11"
+ python-version: ${{ matrix.python-version }}
+ - name: Lint with flake8
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install flake8 pytest black
+ # stop the build if there are Python syntax errors or undefined names
+ python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
- name: Download OpenVINO package
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: ${{ needs.openvino_download.outputs.ov_artifact_name }}
path: ${{ env.OV_INSTALL_DIR }}
merge-multiple: true
- - name: Test stateful
+ - name: Install dependencies
run: |
python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
- GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
- python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful
- grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml
+ python -m pip install -r ${{ env.WWB_PATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+ python -m pip install git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
working-directory: ${{ env.OV_INSTALL_DIR }}
- name: WWB Tests
run: |
- pip install pytest
- pip install git+https://github.com/huggingface/optimum-intel.git
- GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
+ python -m pip install -v ${{ env.WWB_PATH }}
python -m pytest -v ${{ env.WWB_PATH }}/tests
Overall_Status:
name: ci/gha_overall_status_llm_bench
- needs: [openvino_download, build, stateful]
+ needs: [openvino_download, llm_bench, wwb]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
diff --git a/.github/workflows/job_vlm_sample_llava.yml b/.github/workflows/job_vlm_sample_llava.yml
index 5f4634616a..781526f71f 100644
--- a/.github/workflows/job_vlm_sample_llava.yml
+++ b/.github/workflows/job_vlm_sample_llava.yml
@@ -11,7 +11,7 @@ on:
type: string
env:
- l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+ l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
jobs:
visual_language_chat_sample-ubuntu-llava:
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
index c525b0be68..cbd847240d 100644
--- a/.github/workflows/lcm_dreamshaper_cpp.yml
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -18,8 +18,8 @@ concurrency:
env:
PYTHON_VERSION: '3.9'
- LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
- WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
+ LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
+ WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip
OV_INSTALL_DIR: ${{ github.workspace }}/ov
jobs:
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 9b21491f9b..0d7a5b7bae 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -52,7 +52,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
- name: Clone docker tag from OpenVINO repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -109,10 +109,10 @@ jobs:
merge-multiple: true
- name: CMake Build
- run: |
+ run: |
source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ${{ env.SRC_DIR}} -B ${{ env.BUILD_DIR }}
- cmake --build ${{ env.BUILD_DIR}} --config ${{ matrix.build-type }} --parallel $(nproc)
+ cmake --build ${{ env.BUILD_DIR}} --config ${{ matrix.build-type }} --parallel $(nproc) --verbose
cmake --install ${{ env.BUILD_DIR }} --config ${{ matrix.build-type }} --prefix ${{ env.INSTALL_DIR }}
- name: Pack Artifacts
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 4d9b7f032b..062b83fc27 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: macOS (12, Python 3.9)
+name: macOS (12, Python 3.10)
on:
workflow_dispatch:
pull_request:
@@ -16,8 +16,8 @@ concurrency:
cancel-in-progress: true
env:
- PYTHON_VERSION: '3.9'
- OV_BRANCH: master
+ PYTHON_VERSION: '3.10'
+ OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
OV_TARBALL: ''
jobs:
@@ -219,7 +219,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release -j
+ cmake --build ./build/ --config Release --parallel --verbose
- name: Test bindings
run: |
@@ -284,7 +284,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release --target py_openvino_genai -j
+ cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
- name: Test bindings
run: |
@@ -350,7 +350,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
- cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
+ cmake --build ./build/ --config ${{ matrix.build-type }} --target package --parallel --verbose
- name: Build and Install dependencies
run: |
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index 34c5a0f87e..3b01697f26 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -45,7 +45,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
openvino_download_windows:
name: Download OpenVINO for Windows
@@ -71,7 +71,7 @@ jobs:
with:
platform: windows
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
stable_diffusion_1_5_cpp-linux:
runs-on: ubuntu-22.04-8-cores
@@ -122,6 +122,8 @@ jobs:
source openvino_sd_cpp/bin/activate
optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 --task stable-diffusion models/dreamlike-art-dreamlike-anime-1.0/FP16
wget -O ./models/soulcard.safetensors https://civitai.com/api/download/models/72591
+ env:
+ HF_HUB_ENABLE_HF_TRANSFER: 1
- name: Run text2image app
run: |
@@ -198,6 +200,8 @@ jobs:
. "./openvino_sd_cpp/Scripts/Activate.ps1"
optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike-art-dreamlike-anime-1.0/FP16
Invoke-WebRequest -Uri 'https://civitai.com/api/download/models/72591' -OutFile 'models/soulcard.safetensors'
+ env:
+ HF_HUB_ENABLE_HF_TRANSFER: 1
- name: Run text2image app
run: |
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index fc63129281..95a713d7a1 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:
env:
PYTHON_VERSION: '3.11'
- OV_BRANCH: master
+ OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
OV_TARBALL: ''
jobs:
@@ -230,7 +230,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release -j
+ cmake --build ./build/ --config Release --parallel --verbose
- name: Test bindings
run: |
@@ -295,7 +295,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release --target py_openvino_genai -j
+ cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
- name: Test bindings
run: |
@@ -360,7 +360,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release --target py_openvino_genai -j
+ cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
- name: Test bindings
run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fec8df34af..3a67a24bab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,7 @@ if(NOT OpenVINODeveloperPackage_FOUND)
endif()
include(cmake/features.cmake)
+include(cmake/version.cmake)
if(ENABLE_PYTHON)
# the following two calls are required for cross-compilation
@@ -85,7 +86,6 @@ if(MSVC AND MSVC_VERSION GREATER_EQUAL 1930 AND MSVC_VERSION LESS 1941)
add_compile_definitions(_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
endif()
-
add_subdirectory(thirdparty)
add_subdirectory(src)
if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")
diff --git a/README.md b/README.md
index be3de5e8ce..c5cf799973 100644
--- a/README.md
+++ b/README.md
@@ -133,13 +133,15 @@ from PIL import Image
# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
pipe = ov_genai.VLMPipeline("./InternVL2-1B", "CPU")
+pipe.start_chat()
image = Image.open("dog.jpg")
image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
image_data = ov.Tensor(image_data)
prompt = "Can you describe the image?"
-print(pipe.generate(prompt, image=image_data, max_new_tokens=100))
+result = pipe.generate(prompt, image=image_data, max_new_tokens=100)
+print(result.texts[0])
```
### Run generation using VLMPipeline in C++
@@ -392,7 +394,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Automati
## Additional materials
-- [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet)
+- [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet)
- [OpenVINO Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
- [Optimum-intel and OpenVINO](https://huggingface.co/docs/optimum/intel/openvino/export)
diff --git a/src/docs/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
similarity index 95%
rename from src/docs/SUPPORTED_MODELS.md
rename to SUPPORTED_MODELS.md
index 44da29ced4..6b45f47890 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -147,6 +147,8 @@
+> [!NOTE]
+> LoRA adapters are supported.
The pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. The model is required to have the following inputs after the conversion:
1. `input_ids` contains the tokens.
@@ -165,12 +167,14 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Latent Consistency Model |
Supported |
Supported |
+ Supported |
SimianLuo/LCM_Dreamshaper_v7
@@ -181,6 +185,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Stable Diffusion |
Supported |
Supported |
+ Supported |
CompVis/stable-diffusion-v1-1
@@ -213,6 +218,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Stable Diffusion XL |
Supported |
Supported |
+ Supported |
stabilityai/stable-diffusion-xl-base-0.9
@@ -225,6 +231,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Stable Diffusion 3 |
Supported |
Not supported |
+ Not supported |
stabilityai/stable-diffusion-3-medium-diffusers
@@ -237,6 +244,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Flux |
Supported |
Not supported |
+ Not supported |
black-forest-labs/FLUX.1-schnell
@@ -260,10 +268,12 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
Architecture |
+ LoRA support |
Example HuggingFace Models |
Stable Diffusion |
+ Supported |
|
Stable Diffusion XL |
+ Supported |
|
-
+
@@ -292,11 +311,13 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
Architecture |
Models |
+ LoRA support |
Example HuggingFace Models |
InternVL2 |
InternVL2 |
+ Not supported |
OpenGVLab/InternVL2-1B
@@ -309,6 +330,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
LLaVA |
LLaVA-v1.5 |
+ Not supported |
llava-hf/llava-1.5-7b-hf
@@ -318,6 +340,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
LLaVA-NeXT |
LLaVa-v1.6 |
+ Not supported |
llava-hf/llava-v1.6-mistral-7b-hf
@@ -329,6 +352,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
MiniCPMV |
MiniCPM-V-2_6 |
+ Not supported |
openbmb/MiniCPM-V-2_6
@@ -345,11 +369,13 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
Architecture |
Models |
+ LoRA support |
Example HuggingFace Models |
WhisperForConditionalGeneration |
Whisper |
+ Not supported |
openai/whisper-tiny
@@ -366,6 +392,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
|
Distil-Whisper |
+ Not supported |
distil-whisper/distil-small.en
diff --git a/cmake/templates/__version__.py.in b/cmake/templates/__version__.py.in
deleted file mode 100644
index ce8e01a246..0000000000
--- a/cmake/templates/__version__.py.in
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# Will be overwritten by cmake.
-__version__ = "@OpenVINOGenAI_VERSION@"
diff --git a/cmake/templates/version.cpp.in b/cmake/templates/version.cpp.in
new file mode 100644
index 0000000000..f6015832f9
--- /dev/null
+++ b/cmake/templates/version.cpp.in
@@ -0,0 +1,19 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "openvino/genai/version.hpp"
+
+namespace ov {
+namespace genai {
+
+const Version get_version() {
+ const static Version version = {
+ "@OpenVINOGenAI_FULL_VERSION@",
+ "OpenVINO GenAI version",
+ };
+
+ return version;
+}
+
+} // namespace genai
+} // namespace ov
diff --git a/cmake/templates/version.hpp.in b/cmake/templates/version.hpp.in
new file mode 100644
index 0000000000..34120ef632
--- /dev/null
+++ b/cmake/templates/version.hpp.in
@@ -0,0 +1,34 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/core/version.hpp"
+#include "openvino/genai/visibility.hpp"
+
+/**
+ * OpenVINO GenAI major version
+ */
+#define OPENVINO_GENAI_VERSION_MAJOR @OpenVINOGenAI_VERSION_MAJOR@
+
+/**
+ * OpenVINO GenAI minor version
+ */
+#define OPENVINO_GENAI_VERSION_MINOR @OpenVINOGenAI_VERSION_MINOR@
+
+/**
+ * OpenVINO GenAI patch version
+ */
+#define OPENVINO_GENAI_VERSION_PATCH @OpenVINOGenAI_VERSION_PATCH@
+
+namespace ov {
+namespace genai {
+
+/**
+ * Returns OpenVINO GenAI full version including git commit and hash information in form of:
+ * <MAJOR>.<MINOR>.<PATCH>.<REVISION>-<COMMIT NUMBER>-<COMMIT HASH>[-<BRANCH SUFFIX>]
+ */
+OPENVINO_EXTERN_C OPENVINO_GENAI_EXPORTS const ov::Version OPENVINO_CDECL get_version();
+
+} // namespace genai
+} // namespace ov
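
A minimal consumer sketch for the API declared above (not part of this patch); it assumes the header is installed as `openvino/genai/version.hpp` and that `ov::Version` exposes `buildNumber` and `description` as in OpenVINO core:

```
// Sketch: query the OpenVINO GenAI build version exposed by get_version().
#include <iostream>

#include "openvino/genai/version.hpp"

int main() {
    const ov::Version version = ov::genai::get_version();
    // description reads "OpenVINO GenAI version"; buildNumber carries the full
    // <MAJOR>.<MINOR>.<PATCH>.<REVISION>-<COMMIT NUMBER>-<COMMIT HASH>[-<BRANCH SUFFIX>] string.
    std::cout << version.description << ": " << version.buildNumber << std::endl;

    // Compile-time components are available through the generated macros.
    std::cout << OPENVINO_GENAI_VERSION_MAJOR << "."
              << OPENVINO_GENAI_VERSION_MINOR << "."
              << OPENVINO_GENAI_VERSION_PATCH << std::endl;
    return 0;
}
```
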
diff --git a/cmake/version.cmake b/cmake/version.cmake
new file mode 100644
index 0000000000..b9b51e8fe2
--- /dev/null
+++ b/cmake/version.cmake
@@ -0,0 +1,72 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+find_package(Git QUIET)
+
+function(ov_genai_branch_name VAR)
+ if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+ WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+ OUTPUT_VARIABLE GIT_BRANCH
+ RESULT_VARIABLE EXIT_CODE
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(EXIT_CODE EQUAL 0)
+ set(${VAR} ${GIT_BRANCH} PARENT_SCOPE)
+ endif()
+ endif()
+endfunction()
+
+function(ov_genai_commit_hash VAR)
+ if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-parse --short=11 HEAD
+ WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+ OUTPUT_VARIABLE GIT_COMMIT_HASH
+ RESULT_VARIABLE EXIT_CODE
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(EXIT_CODE EQUAL 0)
+ set(${VAR} ${GIT_COMMIT_HASH} PARENT_SCOPE)
+ endif()
+ endif()
+endfunction()
+
+function(ov_genai_commit_number VAR)
+ set(GIT_COMMIT_NUMBER_FOUND OFF)
+ if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+ WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+ OUTPUT_VARIABLE GIT_COMMIT_NUMBER
+ RESULT_VARIABLE EXIT_CODE
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(EXIT_CODE EQUAL 0)
+ set(GIT_COMMIT_NUMBER_FOUND ON)
+ set(${VAR} ${GIT_COMMIT_NUMBER} PARENT_SCOPE)
+ endif()
+ endif()
+ if(NOT GIT_COMMIT_NUMBER_FOUND)
+ # set zeros since git is not available
+ set(${VAR} "000" PARENT_SCOPE)
+ endif()
+endfunction()
+
+function(ov_genai_full_version full_version)
+ if(GIT_FOUND)
+ ov_genai_branch_name(GIT_BRANCH)
+ ov_genai_commit_hash(GIT_COMMIT_HASH)
+ ov_genai_commit_number(GIT_COMMIT_NUMBER)
+
+ if(NOT GIT_BRANCH MATCHES "^(master|HEAD)$")
+ set(GIT_BRANCH_POSTFIX "-${GIT_BRANCH}")
+ endif()
+
+ set(${full_version} "${OpenVINOGenAI_VERSION}-${GIT_COMMIT_NUMBER}-${GIT_COMMIT_HASH}${GIT_BRANCH_POSTFIX}" PARENT_SCOPE)
+ else()
+ set(${full_version} "${OpenVINOGenAI_VERSION}" PARENT_SCOPE)
+ endif()
+endfunction()
+
+ov_genai_full_version(OpenVINOGenAI_FULL_VERSION)
+message(STATUS "OpenVINO GenAI full version: ${OpenVINOGenAI_FULL_VERSION}")
diff --git a/llm_bench/python/README.md b/llm_bench/python/README.md
deleted file mode 100644
index 272ed11d1b..0000000000
--- a/llm_bench/python/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Benchmarking Script for Large Language Models
-
-> [!IMPORTANT]
-> LLM bench code was moved to [tools](../../tools/llm_bench/) directory. Please navigate to the new directory for continue of tool usage.
\ No newline at end of file
diff --git a/llm_bench/python/who_what_benchmark/README.md b/llm_bench/python/who_what_benchmark/README.md
deleted file mode 100644
index 414b4d9342..0000000000
--- a/llm_bench/python/who_what_benchmark/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Simple Accuracy Benchmark for Generative AI models
-
-> [!IMPORTANT]
-> Who What Benchmark code was moved to [tools](../../../tools/who_what_benchmark/) directory. Please navigate to the new directory for continue of tool usage.
\ No newline at end of file
diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md
index 39364d51ee..73baf0088a 100644
--- a/samples/cpp/visual_language_chat/README.md
+++ b/samples/cpp/visual_language_chat/README.md
@@ -29,7 +29,7 @@ Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/o
Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model `llava-hf/llava-v1.6-mistral-7b-hf` can benefit from being run on a dGPU. Modify the source code to change the device for inference to the `GPU`.
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#visual-language-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#visual-language-models) for the list of supported models.
## Run benchmark:
diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md
index d649266613..2ea3322dee 100644
--- a/samples/cpp/whisper_speech_recognition/README.md
+++ b/samples/cpp/whisper_speech_recognition/README.md
@@ -31,7 +31,7 @@ Output:
timestamps: [0, 2] text: How are you doing today?
```
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
# Whisper pipeline usage
diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt
index 428e0955a5..c6ad9eaaa8 100644
--- a/samples/deployment-requirements.txt
+++ b/samples/deployment-requirements.txt
@@ -2,4 +2,4 @@
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
openvino_genai~=2025.0.0.0.dev
librosa==0.10.2.post1 # For Whisper
-pillow==11.0.0 # Image processing for VLMs
+pillow==11.1.0 # Image processing for VLMs
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index a589696beb..2f71891b7b 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
@@ -10,3 +10,4 @@ diffusers==0.32.1 # For image generation pipelines
timm==1.0.12 # For exporting InternVL2
torchvision # For visual language models
transformers>=4.43 # For Whisper
+hf_transfer # for faster model downloads; should be used with env var HF_HUB_ENABLE_HF_TRANSFER=1
\ No newline at end of file
diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
index 953388ed6a..5ec9d54601 100755
--- a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
+++ b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
@@ -90,7 +90,7 @@ def put(self, token_id: int) -> bool:
word = text[self.print_len:]
self.tokens_cache = []
self.print_len = 0
- elif len(text) >= 3 and text[-3:] == chr(65533):
+ elif len(text) >= 3 and text[-1] == chr(65533):
# Don't print incomplete text.
pass
elif len(text) > self.print_len:
diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md
index aeb46444bf..5f373df2b7 100644
--- a/samples/python/whisper_speech_recognition/README.md
+++ b/samples/python/whisper_speech_recognition/README.md
@@ -38,7 +38,7 @@ Output:
timestamps: [0, 2] text: How are you doing today?
```
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
# Whisper pipeline usage
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index d02f32ded9..e954037daf 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -54,19 +54,32 @@ FetchContent_MakeAvailable(safetensors.h)
ov_genai_build_jinja2cpp()
+# generate version files
+
+configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.hpp.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/openvino/genai/version.hpp" @ONLY)
+
+configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.cpp.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/version.cpp" @ONLY)
+
# Library
file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")
+list(APPEND SOURCE_FILES "${CMAKE_CURRENT_BINARY_DIR}/version.cpp")
set(TARGET_NAME openvino_genai)
+
add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
+add_library(openvino::genai ALIAS ${TARGET_NAME})
+
if(TARGET openvino_tokenizers)
add_dependencies(${TARGET_NAME} openvino_tokenizers)
endif()
-add_library(openvino::genai ALIAS ${TARGET_NAME})
target_include_directories(${TARGET_NAME}
- PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>"
+ PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+        "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>"
+        "$<INSTALL_INTERFACE:runtime/include>"
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src")
target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")
@@ -81,6 +94,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
)
+
# Extract two last digits from OpenVINOGenAI_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols.
string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${OpenVINOGenAI_VERSION_MAJOR})
if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND LINUX)
@@ -98,7 +112,7 @@ endif()
if(OpenVINODeveloperPackage_FOUND)
# must be called after all target_link_libraries
- # ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
ov_ncc_naming_style(FOR_TARGET ${TARGET_NAME}
SOURCE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include")
@@ -142,6 +156,9 @@ install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
DESTINATION runtime/include COMPONENT core_genai_dev)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/openvino/genai/version.hpp
+ DESTINATION runtime/include/openvino/genai COMPONENT core_genai_dev)
+
install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake
NAMESPACE openvino:: DESTINATION runtime/cmake
COMPONENT core_genai_dev)
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index 74466ee488..ed9fc3a30d 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -52,8 +52,9 @@ struct PipelineMetrics {
class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
protected:
- class ImplInterface;
+ class IContinuousBatchingPipeline;
class ContinuousBatchingImpl;
+
class ContinuousBatchingForSpeculativeDecodingImpl;
class ContinuousBatchingForPromptLookupImpl;
class SpeculativeDecodingImpl;
@@ -64,7 +65,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
friend class SpeculativeDecodingImpl;
friend class PromptLookupImpl;
- std::shared_ptr<ImplInterface> m_impl;
+ std::shared_ptr<IContinuousBatchingPipeline> m_impl;
ContinuousBatchingPipeline() = default;
diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp
index 277ec57cc3..b6b91bee20 100644
--- a/src/cpp/include/openvino/genai/lora_adapter.hpp
+++ b/src/cpp/include/openvino/genai/lora_adapter.hpp
@@ -188,7 +188,7 @@ class OPENVINO_GENAI_EXPORTS AdapterController {
AdapterController(std::shared_ptr<ov::Model> model, const AdapterConfig& config, std::string device);
// Apply adapters configured in the current config set last time, or set and use new config given as optional `config` argument
- void apply(ov::InferRequest& request, const std::optional<AdapterConfig>& config = std::nullopt);
+ void apply(ov::InferRequest request, const std::optional<AdapterConfig>& config = std::nullopt);
// Returns true if a given name is one of the state names created by this adapter controller for dynamic LoRA
// Helps to distinguish LoRA states from other states (e.g. KV cache state) in the model for a partial state reset.
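
The `apply()` change above takes `ov::InferRequest` by value; the request is a lightweight handle to shared state, so a copy is cheap and temporaries can be passed directly. A rough usage sketch, not taken from this patch, assuming `Adapter` is constructible from a safetensors path and `AdapterConfig` from an `Adapter`, as declared in this header:

```
// Sketch: route LoRA adapter state into an infer request via AdapterController.
#include "openvino/genai/lora_adapter.hpp"
#include "openvino/openvino.hpp"

void run_with_lora(const std::shared_ptr<ov::Model>& model, const std::string& device) {
    ov::genai::Adapter adapter("soulcard.safetensors");   // hypothetical adapter file
    ov::genai::AdapterConfig config(adapter);

    // The controller transforms the model so adapter weights become state variables.
    ov::genai::AdapterController controller(model, config, device);

    ov::Core core;
    ov::CompiledModel compiled = core.compile_model(model, device);
    ov::InferRequest request = compiled.create_infer_request();

    // Apply the configured adapters (or switch to a new AdapterConfig) before inference.
    controller.apply(request, config);
    request.infer();
}
```
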
diff --git a/src/cpp/src/continuous_batching_adapter.hpp b/src/cpp/src/continuous_batching_adapter.hpp
new file mode 100644
index 0000000000..0b0065aa1f
--- /dev/null
+++ b/src/cpp/src/continuous_batching_adapter.hpp
@@ -0,0 +1,171 @@
+
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "llm_pipeline_base.hpp"
+
+#include "openvino/genai/continuous_batching_pipeline.hpp"
+
+namespace ov::genai {
+
+Tokenizer dont_construct() {
+    OPENVINO_THROW("Continuous Batching backend can't be constructed "
+        "from ireq because the model must be transformed");
+}
+
+template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
+ ContinuousBatchingPipeline m_impl;
+public:
+ ContinuousBatchingAdapter(
+ const ov::InferRequest& request,
+ const Tokenizer& tokenizer,
+ OptionalGenerationConfig generation_config
+ ): LLMPipelineImplBase{dont_construct(), GenerationConfig{}},
+ m_impl{std::filesystem::path{}, SchedulerConfig{}, std::string{}} { }
+
+ ContinuousBatchingAdapter(
+ const std::filesystem::path& models_path,
+ const Tokenizer& tokenizer,
+ const SchedulerConfig& scheduler_config,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{
+ models_path,
+ tokenizer,
+ scheduler_config,
+ device,
+ plugin_config} {
+ m_generation_config = m_impl.get_config();
+ }
+
+ ContinuousBatchingAdapter(
+ const std::string& model_str,
+ const ov::Tensor& weights_tensor,
+ const Tokenizer& tokenizer,
+ const SchedulerConfig& scheduler_config,
+ const std::string& device,
+ const ov::AnyMap& plugin_config,
+ const ov::genai::GenerationConfig& generation_config
+ ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{
+ model_str,
+ weights_tensor,
+ tokenizer,
+ scheduler_config,
+ device,
+ plugin_config,
+ generation_config} {}
+
+ ContinuousBatchingAdapter(
+ const std::filesystem::path& models_path,
+ const SchedulerConfig& scheduler_config,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ ): LLMPipelineImplBase{Tokenizer(models_path), GenerationConfig()}, m_impl{
+ models_path,
+ m_tokenizer,
+ scheduler_config,
+ device,
+ plugin_config} {
+ m_generation_config = m_impl.get_config();
+ }
+
+ DecodedResults generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override {
+ std::vector<std::string> prompts = std::visit(overloaded{
+ [](const std::string& prompt) {
+ return std::vector{prompt};
+ },
+ [](std::vector<std::string>& prompts) {
+ return prompts;
+ }
+ }, inputs);
+ const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
+ // -1 == config.eos_token_id and config.validate() are handled in m_impl.
+ std::vector<GenerationResult> generated = m_impl.generate(
+ prompts,
+ std::vector<GenerationConfig>{prompts.size(), config},
+ streamer
+ );
+ std::vector<std::string> plain_replies;
+ std::vector<float> plain_scores;
+ for (GenerationResult& res : generated) {
+ OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
+ std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
+ std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
+ }
+ return {std::move(plain_replies), std::move(plain_scores)};
+ }
+
+ EncodedResults generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override {
+ std::vector<ov::Tensor> input_ids = std::visit(overloaded{
+ [](const ov::Tensor& inp) {
+ size_t batch_size = inp.get_shape().at(0);
+ if (1 == batch_size) {
+ return std::vector{inp};
+ }
+ std::vector<ov::Tensor> input_ids;
+ input_ids.reserve(batch_size);
+ size_t max_len = inp.get_shape().at(1);
+ const int64_t* const source = inp.data<int64_t>();
+ for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+ input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
+ int64_t* destination = input_ids.back().data<int64_t>();
+ std::copy_n(source + batch_id * max_len, max_len, destination);
+ }
+ return input_ids;
+ },
+ [](const TokenizedInputs& inp) {
+ size_t batch_size = inp.input_ids.get_shape().at(0);
+ std::vector<ov::Tensor> input_ids;
+ input_ids.reserve(batch_size);
+ size_t max_len = inp.input_ids.get_shape().at(1);
+ const int64_t* const source = inp.input_ids.data<int64_t>();
+ const int64_t* const attention_mask = inp.attention_mask.data<int64_t>();
+ for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+ input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
+ int64_t* destination = input_ids.back().data<int64_t>();
+ size_t copy_count = 0;
+ for (size_t idx = 0; idx < max_len; ++idx) {
+ if (1 == attention_mask[batch_id * max_len + idx]) {
+ destination[copy_count++] = source[batch_id * max_len + idx];
+ }
+ }
+ input_ids.back().set_shape({1, copy_count});
+ }
+ return input_ids;
+ }
+ }, inputs);
+
+ const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
+ // -1 == config.eos_token_id and config.validate() are handled in m_impl.
+ std::vector<EncodedGenerationResult> generated = m_impl.generate(input_ids, std::vector<GenerationConfig>{input_ids.size(), config}, streamer);
+ std::vector<std::vector<int64_t>> plain_tokens;
+ std::vector<float> plain_scores;
+ for (EncodedGenerationResult& res : generated) {
+ OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
+ std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
+ std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
+ }
+ return {std::move(plain_tokens), std::move(plain_scores)};
+ }
+
+ void start_chat(const std::string& system_message) override {
+ m_impl.start_chat();
+ };
+
+ void finish_chat() override {
+ m_impl.finish_chat();
+ };
+};
+
+} // namespace ov::genai
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 52ec6a8302..44bfaf7f21 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -5,6 +5,7 @@
#include "continuous_batching_impl.hpp"
#include "utils.hpp"
#include "utils/paged_attention_transformations.hpp"
+#include "lora_helper.hpp"
namespace ov::genai {
template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
@@ -17,38 +18,45 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
- bool is_validation_mode_enabled
- ) {
+ bool is_validation_mode_enabled) {
m_tokenizer = tokenizer;
m_generation_config = generation_config;
m_is_validation_mode_enabled = is_validation_mode_enabled;
- ov::Core core;
-
- auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
- core.set_property(core_properties);
-
- DeviceConfig device_config(core, scheduler_config, device, compile_properties);
+ ov::Core core = utils::singleton_core();
+ DeviceConfig device_config(core, scheduler_config, device, properties);
bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control);
+ utils::apply_gather_before_matmul_transformation(model);
- init(model, scheduler_config, compile_properties, device_config, core);
+ initialize_pipeline(model, scheduler_config, properties, device_config, core);
}
void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests() {
std::lock_guard lock{m_awaiting_requests_mutex};
m_requests.insert(m_requests.end(), m_awaiting_requests.begin(), m_awaiting_requests.end());
m_awaiting_requests.clear();
+ m_pipeline_metrics.requests = m_requests.size();
}
-void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline(
std::shared_ptr model,
const SchedulerConfig& scheduler_config,
const ov::AnyMap& properties,
const DeviceConfig& device_config,
ov::Core& core) {
- auto compiled_model = core.compile_model(model, device_config.get_device(), properties);
+ ov::CompiledModel compiled_model;
+
+ // apply LoRA
+ if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) {
+ m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
+ m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device_config.get_device()); // TODO: Make the prefix name configurable
+ compiled_model = core.compile_model(model, device_config.get_device(), *filtered_properties);
+ } else {
+ compiled_model = core.compile_model(model, device_config.get_device(), properties);
+ }
+
ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention");
ov::InferRequest infer_request = compiled_model.create_infer_request();
@@ -68,9 +76,12 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
can_use_partial_preemption = false;
}
m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption);
- // and finally create model runner
+
+ // model runner
bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
+
+ // sampler
m_sampler = std::make_shared<Sampler>(m_tokenizer);
m_sampler->set_seed(m_generation_config.rng_seed);
@@ -91,9 +102,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids,
sampling_params,
- m_scheduler->get_block_size(),
- m_scheduler->get_config().enable_prefix_caching);
- sequence_group->set_sequence_group_ptr(sequence_group);
+ m_scheduler->get_block_size());
+
if (m_scheduler->get_config().enable_prefix_caching) {
m_scheduler->restore_cached_blocks(sequence_group);
}
@@ -102,6 +112,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
std::lock_guard lock{m_awaiting_requests_mutex};
m_awaiting_requests.push_back(sequence_group);
}
+
return std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sampling_params);
};
@@ -113,6 +124,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
timer.start();
ov::Tensor input_ids = m_tokenizer.encode(prompt).input_ids;
timer.end();
+
return add_request(request_id, input_ids, sampling_params);
}
@@ -127,24 +139,26 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
_pull_awaiting_requests();
- m_pipeline_metrics.requests = m_requests.size();
Scheduler::Output scheduler_output;
{
- static ManualTimer timer("scheduling");
- timer.start();
- m_scheduler->clean_empty_blocks(m_requests);
+ static ManualTimer scheduling_timer("scheduling");
+ scheduling_timer.start();
scheduler_output = m_scheduler->schedule(m_requests);
+ scheduling_timer.end();
+
m_pipeline_metrics.scheduled_requests = scheduler_output.m_scheduled_sequence_groups_ids.size();
m_pipeline_metrics.cache_usage = scheduler_output.m_cache_usage;
- m_pipeline_metrics.max_cache_usage =
- std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage);
+ m_pipeline_metrics.max_cache_usage = std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage);
_register_step_cache_usage(scheduler_output.m_cache_usage);
m_pipeline_metrics.avg_cache_usage = _get_current_running_average_cache_usage();
+
+ static ManualTimer copy_blocks_timer("scheduling");
+ copy_blocks_timer.start();
m_cache_manager->copy_blocks(scheduler_output.m_block_copy_map);
- timer.end();
+ copy_blocks_timer.end();
}
- // if no tokens were scheduled, we are out of memory
+ // if no tokens were scheduled, we are out of memory => free all requests and return
if (scheduler_output.m_total_num_scheduled_tokens == 0) {
for (size_t i = 0; i < m_requests.size(); ++i) {
SequenceGroup::Ptr sequence_group = m_requests[i];
@@ -166,15 +180,14 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
}
#ifdef DEBUG_CACHE_STATE_DUMP
-
CacheStateDumper dumper(CacheStateDumper::get_run_id_for_generation_step(step_count, "before_eviction"));
dumper.dump_cache_state(*m_scheduler, m_requests, step_count);
#endif
- const auto& sched_config = m_scheduler->get_config();
// evict unimportant blocks from KV cache, if requested
+ const auto& sched_config = m_scheduler->get_config();
if (sched_config.use_cache_eviction) {
- maybe_evict_cache_blocks(sched_config);
+ _maybe_evict_cache_blocks(sched_config);
}
#ifdef DEBUG_CACHE_STATE_DUMP
@@ -183,6 +196,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
step_count++;
#endif
+ // process generation_config.echo parameter
_fill_prompt_log_probs(m_requests, logits);
SamplerOutput sampler_output;
@@ -195,8 +209,8 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
// process sampler_output (e.g. fork or drop sequences from BlockScheduler)
{
- static ManualTimer timer("fork / free sequence");
- timer.start();
+ static ManualTimer free_fork_timer("fork / free sequence");
+ free_fork_timer.start();
for (const auto& pair : sampler_output.m_forked_sequences) {
uint64_t parent_id = pair.first;
@@ -208,35 +222,49 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
for (auto seq_id : sampler_output.m_dropped_sequences)
m_scheduler->free_sequence(seq_id);
- timer.end();
+ free_fork_timer.end();
}
// notify requests dropped by handle
{
- static ManualTimer timer("notify requests dropped by handle");
- timer.start();
+ static ManualTimer report_tokens_timer("notify requests dropped by handle");
+ report_tokens_timer.start();
_notify_requests_dropped_by_handle();
- timer.end();
+ report_tokens_timer.end();
}
// free non running requests for current step
{
- static ManualTimer timer("free non running requests");
- timer.start();
+ static ManualTimer clean_up_requests_timer("free non running requests");
+ clean_up_requests_timer.start();
_free_non_running_requests();
- timer.end();
+ clean_up_requests_timer.end();
}
step_timer.end();
}
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::set_adapters(const std::optional<AdapterConfig>& adapters) {
+ if (m_adapter_controller) {
+ m_adapter_controller->apply(m_model_runner->get_infer_request(), adapters);
+ }
+}
+
std::vector<EncodedGenerationResult>
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
OPENVINO_ASSERT(input_ids.size() == sampling_params.size());
+
+ // checks that all requests have the same LoRA adapters property value
+ for (size_t i = 1; i < sampling_params.size(); ++i) {
+ OPENVINO_ASSERT(sampling_params[i - 1].adapters == sampling_params[i].adapters,
+ "LoRA adapters value must be the same for all requests");
+ }
+ set_adapters(sampling_params[0].adapters);
+
const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{
[](std::monostate) -> std::shared_ptr<StreamerBase> {
return nullptr;
@@ -320,7 +348,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<
- const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
+ const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_prob();
const auto & generated_ids = sequence->get_generated_ids();
if (sampling_params.echo)
@@ -375,7 +403,7 @@ float ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_current_running_a
return std::accumulate(m_previous_step_cache_usages.begin(), m_previous_step_cache_usages.end(), 0.0) / m_previous_step_cache_usages.size();
}
-void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_blocks(const SchedulerConfig& sched_config) {
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::_maybe_evict_cache_blocks(const SchedulerConfig& sched_config) {
std::unordered_map seq_group_to_num_blocks_evicted_map;
auto sequence_attention_scores = m_model_runner->get_last_attention_scores();
for (auto& seq_id_and_attention_scores : sequence_attention_scores) {
@@ -417,7 +445,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
const float * logits_data = logits.data<float>();
ov::Shape logits_shape = logits.get_shape();
OPENVINO_ASSERT(logits_shape.size() == 3);
- size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2];
+ size_t vocab_size = logits_shape[2];
for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
// requests not scheduled, in decoding phase or not echoing are not processed
@@ -427,18 +455,17 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
size_t num_running_sequences = sequence_group->num_running_seqs();
OPENVINO_ASSERT(num_running_sequences == 1);
- size_t actual_seq_len = sequence_group->get_num_scheduled_tokens();
- size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len);
+ size_t output_seq_len = sequence_group->get_output_seq_len();
const float * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens();
- OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len());
+ OPENVINO_ASSERT(num_prompt_tokens_processed + output_seq_len <= sequence_group->get_prompt_len());
// if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion)
// otherwise we include it as it will be used in the next part of the prompt
int exclude_last_logprob = 1;
- if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len())
+ if (num_prompt_tokens_processed + output_seq_len < sequence_group->get_prompt_len())
exclude_last_logprob = 0;
// if we start processing the prompt we add "fake" log prob for the first position (begin of sequence)
@@ -446,7 +473,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
sequence_group->append_prompt_log_prob(1.0);
for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1;
- token_logits_offset < actual_seq_len - exclude_last_logprob;
+ token_logits_offset < output_seq_len - exclude_last_logprob;
token_logits_offset++, token_id_offset++) {
const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size);
@@ -471,7 +498,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
sequence_group->append_prompt_log_prob(token_logit - max_value - log_sum);
}
- currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;
+ currently_processed_tokens += output_seq_len * num_running_sequences;
// For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
sequence_group->notify_handle_echo_only();
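The prompt log-prob loop above evaluates a numerically stable log-softmax per position: log P(token) = token_logit - max_value - log_sum, where log_sum is the log of the max-shifted exponential sum over the vocabulary. A minimal self-contained sketch of that formula (hypothetical helper, names are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Stable log-softmax for one prompt position: subtracting the max keeps exp() in range.
    float prompt_log_prob(const std::vector<float>& logits, std::size_t token_id) {
        const float max_value = *std::max_element(logits.begin(), logits.end());
        float sum_exp = 0.0f;
        for (float logit : logits)
            sum_exp += std::exp(logit - max_value);
        const float log_sum = std::log(sum_exp);
        return logits[token_id] - max_value - log_sum;
    }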
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
index 8da05c6dfa..d319147f2c 100644
--- a/src/cpp/src/continuous_batching_impl.hpp
+++ b/src/cpp/src/continuous_batching_impl.hpp
@@ -3,16 +3,19 @@
#pragma once
-#include "continuous_batching_impl_interface.hpp"
-#include "openvino/genai/continuous_batching_pipeline.hpp"
+#include "icontinuous_batching.hpp"
+
+#include "openvino/genai/lora_adapter.hpp"
#include "cache_eviction.hpp"
namespace ov::genai {
-class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::ImplInterface {
+
+class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::IContinuousBatchingPipeline {
protected:
 std::shared_ptr<Scheduler> m_scheduler;
 std::shared_ptr<CacheManager> m_cache_manager;
 std::shared_ptr<ModelRunner> m_model_runner;
+ std::optional<AdapterController> m_adapter_controller;
 std::shared_ptr<Sampler> m_sampler;
// current requests to process
@@ -26,7 +29,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
static const size_t AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS = 1000;
 std::deque<float> m_previous_step_cache_usages;
-
+
// flag to enable validation mode for sampler
bool m_is_validation_mode_enabled = false;
@@ -37,21 +40,41 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
// used by tests only
ContinuousBatchingImpl() = default;
+ void initialize_pipeline(std::shared_ptr<ov::Model> model,
+ const SchedulerConfig& scheduler_config,
+ const ov::AnyMap& plugin_config,
+ const DeviceConfig& device_config,
+ ov::Core& core);
+
+ /**
+ * Pulls requests from awaiting queue to running queue
+ * Should be called within each call of step()
+ */
+ virtual void _pull_awaiting_requests();
+
+ /**
+ * Releases non-running (finished, dropped or OOM) requests from running queue
+ */
void _free_non_running_requests();
+
+ /**
+ * Notifies dropped requests by pushing an empty output
+ */
void _notify_requests_dropped_by_handle();
- void _register_step_cache_usage(float step_cache_usage);
- float _get_current_running_average_cache_usage() const;
- void maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
- void init(std::shared_ptr<ov::Model> model,
- const SchedulerConfig& scheduler_config,
- const ov::AnyMap& plugin_config,
- const DeviceConfig& device_config,
- ov::Core& core);
+ /**
+ * Handles 'echo' generation parameter
+ */
+ void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
- virtual void _pull_awaiting_requests();
+ /**
+ * Performs KV cache eviction if enabled / required
+ */
+ void _maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
+
+ void _register_step_cache_usage(float step_cache_usage);
+ float _get_current_running_average_cache_usage() const;
- void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
public:
ContinuousBatchingImpl(const std::shared_ptr& model,
const Tokenizer& tokenizer,
@@ -64,6 +87,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) override;
+
GenerationHandle add_request(uint64_t request_id,
const std::string& prompt,
ov::genai::GenerationConfig sampling_params) override;
@@ -76,5 +100,11 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
 generate(const std::vector<ov::Tensor>& input_ids,
 const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) override;
+
+ /**
+ * Updates LoRA adapters for current generation call
+ */
+ void set_adapters(const std::optional<AdapterConfig>& adapters);
};
-}
\ No newline at end of file
+
+} // namespace ov::genai
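The new m_adapter_controller member and set_adapters() hook above wire LoRA adapters into the continuous batching backend. A minimal usage sketch, assuming the public ContinuousBatchingPipeline constructor shown later in this diff and the Adapter/AdapterConfig/adapters() helpers from lora_adapter.hpp; the paths and prompt are placeholders:

    #include "openvino/genai/continuous_batching_pipeline.hpp"
    #include "openvino/genai/lora_adapter.hpp"

    int main() {
        ov::genai::Adapter adapter("adapter_model.safetensors");  // placeholder path
        ov::genai::SchedulerConfig scheduler_config;
        ov::genai::ContinuousBatchingPipeline pipe(
            "model_dir", scheduler_config, "CPU",
            ov::AnyMap{ov::genai::adapters(adapter)});  // register the adapter at load time

        ov::genai::GenerationConfig config = pipe.get_config();
        config.max_new_tokens = 32;
        config.adapters = ov::genai::AdapterConfig(adapter);  // picked up via set_adapters() per generate()
        auto results = pipe.generate({"What is OpenVINO?"}, {config});
    }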
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 148eb2fa9f..c1c0677ff3 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -47,19 +47,19 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
auto properties_without_draft_model = properties;
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
-
- std::filesystem::path openvino_model_name = "openvino_model.xml";
- auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+
+ auto model = utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, properties);
auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
auto generation_config = utils::from_config_json_if_exists(models_path);
+
if (is_prompt_lookup_enabled) {
- OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+ OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
- } else if (draft_model_desr.model == nullptr) {
- m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
- } else {
+ } else if (draft_model_desr.model != nullptr) {
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ } else {
+ m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
}
}
@@ -73,17 +73,17 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
std::filesystem::path openvino_model_name = "openvino_model.xml";
- auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+ auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, properties_without_draft_model);
auto generation_config = utils::from_config_json_if_exists(models_path);
if (is_prompt_lookup_enabled) {
- OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+ OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
- } else if (draft_model_desr.model == nullptr) {
- m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
- } else {
+ } else if (draft_model_desr.model != nullptr) {
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ } else {
+ m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
}
}
@@ -101,13 +101,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto model = utils::singleton_core().read_model(model_str, weights_tensor);
if (is_prompt_lookup_enabled) {
- OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+ OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
- } else if (draft_model_desr.model == nullptr) {
- m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
- } else {
+ } else if (draft_model_desr.model != nullptr) {
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
- m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ } else {
+ m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
}
}
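The reordered branches above choose the backend from the supplied properties: a draft model selects the speculative decoding implementation, a prompt-lookup flag selects the prompt lookup implementation, and plain continuous batching is the fallback. A minimal sketch of how a caller reaches the speculative branch, assuming ov::genai::draft_model() (defined later in this diff) is declared in llm_pipeline.hpp; directories are placeholders:

    #include "openvino/genai/continuous_batching_pipeline.hpp"
    #include "openvino/genai/llm_pipeline.hpp"  // assumed home of ov::genai::draft_model()

    int main() {
        ov::genai::SchedulerConfig scheduler_config;

        // No draft model and no prompt-lookup flag: the plain continuous batching branch.
        ov::genai::ContinuousBatchingPipeline cb("main_model_dir", scheduler_config, "CPU");

        // A draft model in the properties routes construction to the speculative decoding branch.
        ov::genai::ContinuousBatchingPipeline speculative(
            "main_model_dir", scheduler_config, "CPU",
            ov::AnyMap{ov::genai::draft_model("draft_model_dir", "CPU")});
    }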
diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index cc2e21b9a1..fee6c7abd1 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -117,22 +117,22 @@ class DeviceConfig {
}
for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
- m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
- ov::Dimension(m_num_kv_heads[layer_id]),
- ov::Dimension(m_block_size),
- ov::Dimension(m_head_size)});
-
m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
ov::Dimension(m_num_kv_heads[layer_id]),
ov::Dimension(m_block_size),
ov::Dimension(m_head_size)});
- if (m_device.find("GPU") != std::string::npos) {
+ if (m_device.find("GPU") == std::string::npos) {
+ m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
+ ov::Dimension(m_num_kv_heads[layer_id]),
+ ov::Dimension(m_block_size),
+ ov::Dimension(m_head_size)});
+ } else if (m_device.find("GPU") != std::string::npos) {
// Update key shape, as the key's shape is different from the value's shape
m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
- ov::Dimension(m_num_kv_heads[layer_id]),
- ov::Dimension(m_head_size),
- ov::Dimension(m_block_size)});
+ ov::Dimension(m_num_kv_heads[layer_id]),
+ ov::Dimension(m_head_size),
+ ov::Dimension(m_block_size)});
}
}
}
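The reordered logic above keeps the value cache shape identical on all devices while giving GPU a transposed key cache layout (head_size before block_size). A small sketch of the resulting per-layer dimensions, assuming the leading dynamic dimension is the number of allocated KV blocks; the helper is illustrative only:

    #include <array>
    #include <cstddef>
    #include <string>

    // Per-layer key cache dims; the value cache is always
    // {num_blocks, num_kv_heads, block_size, head_size} on every device.
    std::array<std::size_t, 4> key_cache_dims(const std::string& device,
                                              std::size_t num_blocks,
                                              std::size_t num_kv_heads,
                                              std::size_t block_size,
                                              std::size_t head_size) {
        if (device.find("GPU") != std::string::npos)
            return {num_blocks, num_kv_heads, head_size, block_size};  // GPU: last two dims swapped
        return {num_blocks, num_kv_heads, block_size, head_size};      // CPU and others
    }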
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 59be603fd9..25402e22e7 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -230,9 +230,9 @@ void GenerationConfig::validate() const {
OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature);
} else {
// parameters requiring multinomial
- OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
- OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
- OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
+ // OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
+ // OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
+ // OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
}
if (is_beam_search()) {
@@ -252,10 +252,10 @@ void GenerationConfig::validate() const {
}
} else {
// parameters requiring beam search
- OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
- OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
- OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
- OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
+ // OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
+ // OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
+ // OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
+ // OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
}
// assistant generation
diff --git a/src/cpp/src/continuous_batching_impl_interface.cpp b/src/cpp/src/icontinuous_batching.cpp
similarity index 79%
rename from src/cpp/src/continuous_batching_impl_interface.cpp
rename to src/cpp/src/icontinuous_batching.cpp
index 10fc102aa0..e32616b0aa 100644
--- a/src/cpp/src/continuous_batching_impl_interface.cpp
+++ b/src/cpp/src/icontinuous_batching.cpp
@@ -1,40 +1,41 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
-#include "continuous_batching_impl_interface.hpp"
+#include "icontinuous_batching.hpp"
namespace ov::genai {
-GenerationConfig ContinuousBatchingPipeline::ImplInterface::get_config() const {
+GenerationConfig ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_config() const {
return m_generation_config;
}
-PipelineMetrics ContinuousBatchingPipeline::ImplInterface::get_metrics() const {
+PipelineMetrics ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_metrics() const {
return m_pipeline_metrics;
}
-Tokenizer ContinuousBatchingPipeline::ImplInterface::get_tokenizer() {
+Tokenizer ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_tokenizer() {
return m_tokenizer;
}
-void ContinuousBatchingPipeline::ImplInterface::start_chat(const std::string& system_message) {
+void ContinuousBatchingPipeline::IContinuousBatchingPipeline::start_chat(const std::string& system_message) {
if (!system_message.empty()) {
m_history.push_back({{"role", "system"}, {"content", system_message}});
}
m_is_chat_conversation = true;
};
-void ContinuousBatchingPipeline::ImplInterface::finish_chat() {
+void ContinuousBatchingPipeline::IContinuousBatchingPipeline::finish_chat() {
m_is_chat_conversation = false;
m_history.clear();
};
 std::vector<GenerationResult>
-ContinuousBatchingPipeline::ImplInterface::generate(
+ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
 const std::vector<std::string>& prompts,
 std::vector<GenerationConfig> sampling_params,
 const StreamerVariant& streamer) {
 std::vector<ov::Tensor> input_ids;
+
static ManualTimer timer("tokenize");
if (m_is_chat_conversation) {
OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
@@ -47,13 +48,15 @@ ContinuousBatchingPipeline::ImplInterface::generate(
timer.end();
} else {
input_ids.reserve(prompts.size());
+ timer.start();
for (const std::string& prompt : prompts) {
- timer.start();
input_ids.push_back(m_tokenizer.encode(prompt).input_ids);
- timer.end();
}
+ timer.end();
}
+
 std::vector<EncodedGenerationResult> encoded = generate(input_ids, sampling_params, streamer);
+
 std::vector<GenerationResult> decoded;
decoded.reserve(encoded.size());
for (EncodedGenerationResult& res : encoded) {
@@ -65,6 +68,7 @@ ContinuousBatchingPipeline::ImplInterface::generate(
m_history.push_back({{"role", "assistant"}, {"content", generated.back()}});
}
}
+
decoded.push_back(GenerationResult{
res.m_request_id,
std::move(generated),
@@ -72,6 +76,7 @@ ContinuousBatchingPipeline::ImplInterface::generate(
res.m_status
});
}
+
return decoded;
}
-}
\ No newline at end of file
+}
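The base-class generate() above tokenizes the prompts (now timing the whole batch with a single start/end pair), delegates to the encoded overload, decodes the results, and in chat mode appends the assistant reply to the accumulated history. A minimal end-to-end sketch of that chat flow through the public pipeline; the model directory and prompts are placeholders:

    #include "openvino/genai/continuous_batching_pipeline.hpp"

    int main() {
        ov::genai::SchedulerConfig scheduler_config;
        ov::genai::ContinuousBatchingPipeline pipe("model_dir", scheduler_config, "CPU");
        ov::genai::GenerationConfig config = pipe.get_config();
        config.max_new_tokens = 64;

        pipe.start_chat("You are a helpful assistant.");
        auto first  = pipe.generate({"What is continuous batching?"}, {config});
        auto second = pipe.generate({"How does it differ from static batching?"}, {config});  // history is reused
        pipe.finish_chat();  // clears the accumulated chat history
    }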
diff --git a/src/cpp/src/continuous_batching_impl_interface.hpp b/src/cpp/src/icontinuous_batching.hpp
similarity index 72%
rename from src/cpp/src/continuous_batching_impl_interface.hpp
rename to src/cpp/src/icontinuous_batching.hpp
index 909383c98a..12030f06f7 100644
--- a/src/cpp/src/continuous_batching_impl_interface.hpp
+++ b/src/cpp/src/icontinuous_batching.hpp
@@ -12,7 +12,10 @@
namespace ov::genai {
-class ContinuousBatchingPipeline::ImplInterface {
+/**
+ * Base interface for all continuous batching based pipelines
+ */
+class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
protected:
Tokenizer m_tokenizer;
@@ -35,6 +38,7 @@ class ContinuousBatchingPipeline::ImplInterface {
// std::cout << std::endl;
}
} m_perf;
+
bool m_is_chat_conversation = false;
ChatHistory m_history;
@@ -43,27 +47,57 @@ class ContinuousBatchingPipeline::ImplInterface {
PipelineMetrics get_metrics() const;
ov::genai::Tokenizer get_tokenizer();
+ /**
+ * Adds a request to the awaiting queue using encoded inputs
+ */
virtual GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) = 0;
+
+ /**
+ * Adds a request to the awaiting queue based on a string input
+ * This step also tokenizes the prompt
+ */
virtual GenerationHandle add_request(uint64_t request_id,
const std::string& prompt,
ov::genai::GenerationConfig sampling_params) = 0;
+ /**
+ * Checks whether the pipeline (server) still has unfinished requests, i.e. whether step() should keep being called in a loop
+ */
virtual bool has_non_finished_requests() = 0;
+ /**
+ * Performs a single inference step over all running requests, pulling awaiting requests into the running queue first
+ */
virtual void step() = 0;
+ /**
+ * Performs monolithic generation based on encoded prompts
+ */
 virtual std::vector<EncodedGenerationResult>
 generate(const std::vector<ov::Tensor>& input_ids,
 const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) = 0;
+
+ /**
+ * Performs monolithic generation based on text prompts
+ */
 std::vector<GenerationResult>
 generate(const std::vector<std::string>& prompts,
 std::vector<GenerationConfig> sampling_params,
const StreamerVariant& streamer);
+ /**
+ * Starts chat with a given system prompt
+ *
+ * In a chat scenario, prompts passed to the `generate` method are accumulated inside the pipeline until `finish_chat` is called
+ */
void start_chat(const std::string& system_message);
+
+ /**
+ * Ends chat
+ */
void finish_chat();
};
}
\ No newline at end of file
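The interface comments above describe the server-style contract: add_request() enqueues work, step() advances all running requests (pulling awaiting ones first), and has_non_finished_requests() tells the caller when to stop looping. A minimal sketch of that loop through the public pipeline, assuming GenerationHandle exposes read_all() for collecting the finished output:

    #include "openvino/genai/continuous_batching_pipeline.hpp"

    int main() {
        ov::genai::SchedulerConfig scheduler_config;
        ov::genai::ContinuousBatchingPipeline pipe("model_dir", scheduler_config, "CPU");
        ov::genai::GenerationConfig config = pipe.get_config();
        config.max_new_tokens = 32;

        // add_request() only enqueues; step() pulls awaiting requests and runs one inference step.
        ov::genai::GenerationHandle handle = pipe.add_request(/*request_id=*/0, "Hello!", config);

        while (pipe.has_non_finished_requests()) {
            pipe.step();
        }
        auto outputs = handle->read_all();  // assumed accessor for the finished generations
    }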
diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
index 4ffab62c53..a5608db80f 100644
--- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp
+++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
@@ -91,8 +91,7 @@ AutoencoderKL::Config::Config(const std::filesystem::path& config_path) {
AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path)
: m_config(vae_decoder_path / "config.json") {
- ov::Core core = utils::singleton_core();
- m_decoder_model = core.read_model((vae_decoder_path / "openvino_model.xml").string());
+ m_decoder_model = utils::singleton_core().read_model(vae_decoder_path / "openvino_model.xml");
// apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model
merge_vae_image_post_processing();
}
@@ -100,8 +99,7 @@ AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path)
AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_encoder_path,
const std::filesystem::path& vae_decoder_path)
: AutoencoderKL(vae_decoder_path) {
- ov::Core core = utils::singleton_core();
- m_encoder_model = core.read_model((vae_encoder_path / "openvino_model.xml").string());
+ m_encoder_model = utils::singleton_core().read_model(vae_encoder_path / "openvino_model.xml");
}
AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path,
@@ -131,8 +129,7 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
const Tensor& vae_decoder_weights,
const Config& vae_decoder_config)
: m_config(vae_decoder_config) {
- ov::Core core = utils::singleton_core();
- m_decoder_model = core.read_model(vae_decoder_model, vae_decoder_weights);
+ m_decoder_model = utils::singleton_core().read_model(vae_decoder_model, vae_decoder_weights);
// apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model
merge_vae_image_post_processing();
}
@@ -143,8 +140,7 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model,
const Tensor& vae_decoder_weights,
const Config& vae_decoder_config)
: AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) {
- ov::Core core = utils::singleton_core();
- m_encoder_model = core.read_model(vae_encoder_model, vae_encoder_weights);
+ m_encoder_model = utils::singleton_core().read_model(vae_encoder_model, vae_encoder_weights);
}
AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index a119483417..ece88572f9 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -37,8 +37,7 @@ CLIPTextModel::Config::Config(const std::filesystem::path& config_path) {
CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir) :
m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
m_config(root_dir / "config.json") {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
}
CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir,
@@ -53,8 +52,7 @@ CLIPTextModel::CLIPTextModel(const std::string& model,
const Config& config,
const Tokenizer& clip_tokenizer) :
m_clip_tokenizer(clip_tokenizer), m_config(config) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
CLIPTextModel::CLIPTextModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 685c1f6c0e..e695c763cb 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -28,8 +28,7 @@ CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_
CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir) :
m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
m_config(root_dir / "config.json") {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
}
CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
@@ -44,8 +43,7 @@ CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& mode
const Config& config,
const Tokenizer& clip_tokenizer) :
m_clip_tokenizer(clip_tokenizer), m_config(config) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
index 285ea197e7..71193a38e7 100644
--- a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
@@ -26,7 +26,7 @@ FluxTransformer2DModel::Config::Config(const std::filesystem::path& config_path)
FluxTransformer2DModel::FluxTransformer2DModel(const std::filesystem::path& root_dir)
: m_config(root_dir / "config.json") {
- m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
m_vae_scale_factor = ov::genai::get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
}
@@ -42,8 +42,7 @@ FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
const Config& config,
const size_t vae_scale_factor) :
m_config(config), m_vae_scale_factor(vae_scale_factor) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
index b6f74acc51..69b0e6dcff 100644
--- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
@@ -28,7 +28,7 @@ SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path)
SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir)
: m_config(root_dir / "config.json") {
- m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
}
@@ -44,8 +44,7 @@ SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
const Config& config,
const size_t vae_scale_factor) :
m_config(config), m_vae_scale_factor(vae_scale_factor) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index bb133c3aac..ef41898cc3 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -16,8 +16,7 @@ std::filesystem::path get_tokenizer_path_by_text_encoder(const std::filesystem::
T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir) :
m_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
}
T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir,
@@ -31,8 +30,7 @@ T5EncoderModel::T5EncoderModel(const std::string& model,
const Tensor& weights,
const Tokenizer& tokenizer) :
m_tokenizer(tokenizer) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
T5EncoderModel::T5EncoderModel(const std::string& model,
@@ -60,9 +58,7 @@ T5EncoderModel& T5EncoderModel::reshape(int batch_size, int max_sequence_length)
T5EncoderModel& T5EncoderModel::compile(const std::string& device, const ov::AnyMap& properties) {
OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
- ov::Core core = utils::singleton_core();
- ov::CompiledModel compiled_model;
- compiled_model = core.compile_model(m_model, device, properties);
+ ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties);
ov::genai::utils::print_compiled_model_properties(compiled_model, "T5 encoder model");
m_request = compiled_model.create_infer_request();
// release the original model
diff --git a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
index 40d0a6125d..fd3e97314d 100644
--- a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
+++ b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
@@ -30,8 +30,7 @@ UNet2DConditionModel::Config::Config(const std::filesystem::path& config_path) {
UNet2DConditionModel::UNet2DConditionModel(const std::filesystem::path& root_dir) :
m_config(root_dir / "config.json") {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
}
@@ -47,8 +46,7 @@ UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
const Config& config,
const size_t vae_scale_factor) :
m_config(config), m_vae_scale_factor(vae_scale_factor) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index 7db7ca9451..2dc1b9ef0b 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -10,13 +10,10 @@
namespace ov {
namespace genai {
-
class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
public:
virtual void compile(std::shared_ptr model, const std::string& device, const ov::AnyMap& properties) override {
- ov::Core core = utils::singleton_core();
-
- ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
+ ov::CompiledModel compiled_model = utils::singleton_core().compile_model(model, device, properties);
ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model");
m_request = compiled_model.create_infer_request();
}
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3e378e78cf..11efed8b32 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -1,475 +1,48 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
-#include
#include
-#include
-#include
+
#include
-#include
-#include "openvino/genai/continuous_batching_pipeline.hpp"
-#include "openvino/genai/generation_config.hpp"
+
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"
-#include "llm_pipeline_base.hpp"
+
#include "llm_pipeline_static.hpp"
-#include "utils.hpp"
-#include "text_callback_streamer.hpp"
-#include "openvino/genai/lora_adapter.hpp"
-#include "lora_helper.hpp"
+#include "llm_pipeline_stateful.hpp"
+#include "continuous_batching_adapter.hpp"
#include "speculative_decoding/speculative_decoding_impl.hpp"
-#include "sampler.hpp"
-#include "lm_encoding.hpp"
namespace ov {
namespace genai {
-class StatefulLLMPipeline final : public LLMPipelineImplBase {
-public:
- ov::InferRequest m_model_runner;
- bool is_chat_conversation = false;
- bool m_trust_encoded_history = true;
- ChatHistory m_history;
- std::string m_templated_chat_history = {};
- std::vector m_tokenized_chat_history;
- ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
- size_t m_kv_cache_seq_length_axis = 2;
- Sampler m_sampler;
- // Tail of previous output in chat mode is missing in KV cache, let's keep it
- std::optional m_last_disappeared_token = std::nullopt;
- // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache
- // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history
- // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
- ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
-
- StatefulLLMPipeline(
- const ov::InferRequest& request,
- const ov::genai::Tokenizer& tokenizer,
- OptionalGenerationConfig generation_config=std::nullopt
- ) : LLMPipelineImplBase(tokenizer),
- m_model_runner(request) {
- GenerationConfig default_config;
- m_generation_config = (generation_config.has_value()) ? *generation_config : default_config;
- }
-
- StatefulLLMPipeline(
- const std::filesystem::path& models_path,
- const ov::genai::Tokenizer& tokenizer,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ) : StatefulLLMPipeline{
- ov::genai::utils::read_model_with_config(models_path, plugin_config),
- tokenizer,
- device,
- plugin_config,
- utils::from_config_json_if_exists(models_path)
- } {}
-
- StatefulLLMPipeline(
- const std::shared_ptr& model,
- const ov::genai::Tokenizer& tokenizer,
- const std::string& device,
- const ov::AnyMap& config,
- const ov::genai::GenerationConfig& generation_config
- ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
- ov::CompiledModel compiled_model;
- auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
- utils::slice_matmul_stateful_model(model);
- m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
-
- if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
- m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
- m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable
- compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config);
- m_model_runner = compiled_model.create_infer_request();
- } else {
- compiled_model = utils::singleton_core().compile_model(model, device, plugin_config);
- m_model_runner = compiled_model.create_infer_request();
- }
- ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
-
- // If eos_token_id was not provided, take value
- if (m_generation_config.eos_token_id == -1)
- m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
-
- m_sampler.set_seed(m_generation_config.rng_seed);
- }
-
- StatefulLLMPipeline(
- const std::filesystem::path& models_path,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ) : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {}
-
- DecodedResults generate(
- StringInputs inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
-
- if (is_chat_conversation)
- OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
- "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");
-
- auto start_time = std::chrono::steady_clock::now();
- GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
- // If eos_token_id was not provided, take value from default m_generation_config
- if (config.eos_token_id == -1)
- config.set_eos_token_id(m_generation_config.eos_token_id);
- config.validate();
-
- TokenizedInputs encoded_input;
-
- if (auto input_vector = std::get_if>(&inputs)) {
- OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
- encoded_input = m_tokenizer.encode(*input_vector);
- } else if (auto input_prompt = std::get_if(&inputs)) {
- std::string& prompt = *input_prompt;
-
- if (is_chat_conversation) {
- // KV cache in model already contains prompts and answers from previous iterations.
- // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
- // token_ids = {, ...}. So if tokenizer applies only to the new prompt,
- // will be inserted on every iteration.
- // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
- // and takes only the difference between them.
- // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but
- // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
-
- m_history.push_back({{"role", "user"}, {"content", prompt}});
- constexpr bool add_generation_prompt = true;
- auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
- // Do not add special tokens in chat scenario to be aligned with HF.
- auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
- auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
-
- // some symbols combinations can be encoded by the tokenizer in different ways
- // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
- // so let's check it out, find the trusted part and use it in on the next step
- size_t trusted_history_length = 0;
- if (!m_tokenized_chat_history.empty()) {
- std::set stop_tokens = config.stop_token_ids;
- trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
- m_trust_encoded_history = trusted_history_length == SIZE_MAX;
- }
-
- if (m_tokenized_chat_history.empty()) {
- encoded_input = new_chat_tokens;
- } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
- // does_kv_cache_need_to_update will be true here if beam search is activated
- // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
- // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
- if (m_kv_history_manager.does_kv_cache_need_to_update()) {
- trusted_history_length = m_kv_history_manager.trusted_history_length;
- } else {
- m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
- // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
- m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
- }
-
- ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
- {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
- new_chat_tokens.input_ids.data() + trusted_history_length);
-
- ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
- std::fill_n(new_attention_mask.data(), new_tensor.get_shape()[1], 1);
-
- encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
- {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
- new_tensor.copy_to(encoded_input.input_ids);
- encoded_input.attention_mask = new_attention_mask;
- m_last_disappeared_token = std::nullopt;
- } else {
- encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
- }
- m_templated_chat_history = new_templated_chat_history;
-
- m_tokenized_chat_history.clear();
- m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
- std::copy_n(new_chat_tokens.input_ids.data(), new_chat_tokens.input_ids.get_size(),
- std::back_inserter(m_tokenized_chat_history));
-
- // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
- } else {
- encoded_input = m_tokenizer.encode(prompt);
- }
- }
-
- auto encode_stop_time = std::chrono::steady_clock::now();
- auto encoded_results = generate(encoded_input, config, streamer);
-
- auto decode_start_time = std::chrono::steady_clock::now();
- DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
- auto decode_stop_time = std::chrono::steady_clock::now();
-
- if (is_chat_conversation) {
- // Tail of chat template is missing in KV cache.
- // Find the tail to concatenate it with the next input prompt.
- auto answer = decoded_results.texts[0];
- m_templated_chat_history.append(answer);
- m_history.push_back({{"role", "assistant"}, {"content", answer}});
- }
-
- // generate_durations
- decoded_results.perf_metrics = encoded_results.perf_metrics;
-
- auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
- auto stop_time = std::chrono::steady_clock::now();
- raw_counters.generate_durations = std::vector();
- raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
- raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
- raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
-
- // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics.
- decoded_results.perf_metrics.m_evaluated = false;
- decoded_results.perf_metrics.evaluate_statistics(start_time);
- return decoded_results;
- }
-
- void reset_kv_state() {
- if(m_adapter_controller) {
- for(auto& state: m_model_runner.query_state()) {
- if(!m_adapter_controller->has_state_name(state.get_name())) {
- state.reset();
- }
- }
- } else {
- m_model_runner.reset_state();
- }
- }
-
- EncodedResults generate(
- const EncodedInputs& inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
-
- if (is_chat_conversation)
- // if chat was run in StringInputs mode, but it was called EncodedInputs generate, last m_history entry will be with assistant role
- OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
- "Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");
-
- auto start_time = std::chrono::steady_clock::now();
- ov::Tensor input_ids;
- ov::Tensor attention_mask;
- if (auto data = std::get_if(&inputs)) {
- input_ids = *data;
- attention_mask = ov::genai::utils::init_attention_mask(input_ids);
- } else if (auto data = std::get_if(&inputs)) {
- input_ids = data->input_ids;
- attention_mask = data->attention_mask;
- }
-
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
- std::copy(input_ids.data(), input_ids.data() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
-
- // Tail of previous output in chat mode is missing in KV cache.
- if (m_last_disappeared_token.has_value()) {
- attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
- input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
- }
-
- GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
-
- // If eos_token_id was not provided, take value from default m_generation_config
- if (config.eos_token_id == -1)
- config.set_eos_token_id(m_generation_config.eos_token_id);
- config.validate();
-
- // Stateful pipeline does not provide logprobs for prompt tokens
- OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
-
- std::shared_ptr streamer_ptr;
- if (auto streamer_obj = std::get_if(&streamer)) {
- streamer_ptr = nullptr;
- } else if (auto streamer_obj = std::get_if>(&streamer)) {
- streamer_ptr = *streamer_obj;
- } else if (auto callback = std::get_if>(&streamer)) {
- streamer_ptr = std::make_shared(m_tokenizer, *callback);
- }
-
- auto batch_size = input_ids.get_shape().at(0);
- OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
- (config.is_greedy_decoding() || config.is_multinomial()),
- "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
-
- auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
- OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
- "either (input_ids, attention_mask, beam_idx) or "
- "(input_ids, attention_mask, position_ids, beam_idx) "
- "but you have '" + std::to_string(num_inputs) + "' inputs");
-
- ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
-
- size_t kv_cache_len = 0;
- ov::Tensor concatenated_attention_mask;
- if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
- OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
- // If history is saved in KV cache, concatenate new attention_mask with the already existing.
- // Between subsequent runs attention_mask should not be modified.
- auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
- auto prompt_len = attention_mask.get_shape()[1];
-
- kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
-
- ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
- auto start_atten_hst = atten_mask_history.data();
-
- std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
- new_atten_mask.data());
- std::copy(attention_mask.data(), attention_mask.data() + prompt_len,
- new_atten_mask.data() + kv_cache_len);
- concatenated_attention_mask = new_atten_mask;
- } else {
- concatenated_attention_mask = attention_mask;
- }
-
- size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
-
- bool position_ids_available = (num_inputs == 4);
- std::optional position_ids = std::nullopt;
- if (position_ids_available) {
- position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
- utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
- }
-
- if(m_adapter_controller) {
- m_adapter_controller->apply(m_model_runner, config.adapters);
- }
-
- if (is_chat_conversation && !m_trust_encoded_history) {
- m_trust_encoded_history = true;
- m_kv_history_manager.reset();
- }
-
- std::vector requests;
- size_t block_size = 1;
- bool enable_prefix_caching = false;
-
- for (size_t request_id = 0; request_id < batch_size; request_id++) {
- SequenceGroup::Ptr sequence_group;
- if (is_chat_conversation) {
- ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
- sequence_group = std::make_shared(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
- } else {
- size_t seq_len = input_ids.get_shape().at(1);
- size_t batch_offset = request_id * seq_len;
- const int64_t* prompt_start = input_ids.data() + batch_offset;
- std::vector tokenized_prompt(prompt_start, prompt_start + seq_len);
-
- sequence_group = std::make_shared(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
- }
-
- sequence_group->set_sequence_group_ptr(sequence_group);
- requests.push_back(sequence_group);
- }
-
- if (m_sampler.get_seed() != config.rng_seed) {
- m_sampler.set_seed(config.rng_seed);
- }
-
- ov::genai::EncodedResults result;
- std::tie(result, m_last_disappeared_token) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
- streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
-
- if (is_chat_conversation) {
- // force remove from kv_cache last answer
- if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
- m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
- m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
- }
-
- std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
- } else {
- reset_kv_state();
- m_last_disappeared_token = std::nullopt;
- }
-
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
- std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
-
- auto stop_time = std::chrono::steady_clock::now();
-
- // If is called without tokenization then that stat will not be reported.
- auto& metrics = result.perf_metrics;
- metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
- metrics.load_time = this->m_load_time_ms;
- metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
- metrics.evaluate_statistics(start_time);
- return result;
- }
-
- void start_chat(const std::string& system_message) override {
- is_chat_conversation = true;
- m_trust_encoded_history = true;
- m_kv_history_manager.reset();
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
- m_last_disappeared_token = std::nullopt;
- if (!m_tokenized_chat_history.empty()) {
- reset_kv_state();
- m_history = {};
- m_templated_chat_history = "";
- m_tokenized_chat_history.clear();
- }
- if (system_message.empty())
- return;
-
- m_history.push_back({{"role", "system"}, {"content", system_message}});
- constexpr bool add_generation_prompt = false;
+namespace {
- m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
- }
+/*
+* NPU reads some properties from the config file, but when LLMPipeline is initialized
+* from the model_str and weights_tensor, there are no files.
+* In the latter case the ModelDesc is stored in the properties.
+* This function pops the ModelDesc from the properties and returns a pair of the updated properties and the ModelDesc.
+*/
+std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
+ ov::AnyMap main_properties = properties;
+ ov::genai::static_llm::ModelConfigDesc model_descr;
- void finish_chat() override {
- is_chat_conversation = false;
- m_trust_encoded_history = true;
- m_kv_history_manager.reset();
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
- m_last_disappeared_token = std::nullopt;
- if (!m_tokenized_chat_history.empty()) {
- reset_kv_state();
- m_history.clear();
- m_templated_chat_history.clear();
- m_tokenized_chat_history.clear();
+ auto pop_property = [](ov::AnyMap& orig_properties, const std::string& key, auto& value) {
+ if (orig_properties.find(key) != orig_properties.end()) {
+ value = orig_properties.at(key).as<std::decay_t<decltype(value)>>();
+ orig_properties.erase(key);
}
- }
-};
-
-DecodedResults LLMPipeline::generate(
- StringInputs inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
-) {
- return m_pimpl->generate(inputs, generation_config, streamer);
-}
-
-DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) {
- auto config_arg = utils::get_config_from_map(config_map);
- GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
- config.update_generation_config(config_map);
-
- return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map));
-}
-
-EncodedResults LLMPipeline::generate(
- const EncodedInputs& inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
-) {
- return m_pimpl->generate(inputs, generation_config, streamer);
+ };
+ pop_property(main_properties, "name_or_path", model_descr.name_or_path);
+ pop_property(main_properties, "type", model_descr.type);
+ pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
+
+ return {main_properties, model_descr};
}
-EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) {
- auto config_arg = utils::get_config_from_map(config_map);
- GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
- config.update_generation_config(config_map);
+} // namespace
- return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map));
-}
 std::pair<std::string, Any> streamer(StreamerVariant func) {
 if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&func)) {
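For illustration, a minimal sketch of what the new split_model_descr() helper does with a property map; the function is internal to llm_pipeline.cpp, the keys follow the pop_property calls above, and the values are placeholders:

    // Inside llm_pipeline.cpp (anonymous namespace), after the definition above:
    ov::AnyMap properties = {
        {"name_or_path", std::string("my-llm")},   // placeholder model id
        {"type", std::string("llama")},            // placeholder architecture type
        {"num_key_value_heads", 4},
        {"NPU_USE_NPUW", std::string("YES")}       // ordinary plugin option, left untouched
    };
    auto [plugin_properties, model_descr] = split_model_descr(properties);
    // plugin_properties now contains only "NPU_USE_NPUW";
    // model_descr carries name_or_path, type and num_key_value_heads for the static NPU pipeline.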
@@ -491,7 +64,7 @@ std::pair draft_model(
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
std::filesystem::path openvino_model_name = "openvino_model.xml";
- auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+ auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
auto generation_config = utils::from_config_json_if_exists(models_path);
auto tokenizer = ov::genai::Tokenizer(models_path);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
@@ -510,194 +83,7 @@ std::pair draft_model(
return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}
-} // namespace genai
-} // namespace ov
-
-namespace {
-using namespace ov::genai;
-
-template struct overloaded : Ts... {using Ts::operator()...;};
-template overloaded(Ts...) -> overloaded;
-
-Tokenizer dont_construct() {
- OPENVINO_THROW("Continuous Batching backend can't be constructed"
- "from ireq because the model must be transformed");
-}
-
-class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
-public:
- ContinuousBatchingPipeline m_impl;
-
- ContinuousBatchingAdapter(
- const ov::InferRequest& request,
- const Tokenizer& tokenizer,
- OptionalGenerationConfig generation_config
- ): LLMPipelineImplBase{dont_construct()}, m_impl{{}, {}, {}} {}
-
- ContinuousBatchingAdapter(
- const std::filesystem::path& models_path,
- const Tokenizer& tokenizer,
- const SchedulerConfig& scheduler_config,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ): LLMPipelineImplBase{tokenizer}, m_impl{
- models_path.string(),
- tokenizer,
- scheduler_config,
- device,
- plugin_config} {
- m_generation_config = m_impl.get_config();
- }
-
- ContinuousBatchingAdapter(
- const std::string& model_str,
- const ov::Tensor& weights_tensor,
- const Tokenizer& tokenizer,
- const SchedulerConfig& scheduler_config,
- const std::string& device,
- const ov::AnyMap& plugin_config,
- const ov::genai::GenerationConfig& generation_config
- ): LLMPipelineImplBase{tokenizer}, m_impl{
- model_str,
- weights_tensor,
- tokenizer,
- scheduler_config,
- device,
- plugin_config,
- generation_config} {}
-
- ContinuousBatchingAdapter(
- const std::filesystem::path& models_path,
- const SchedulerConfig& scheduler_config,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ): LLMPipelineImplBase{Tokenizer(models_path.string())}, m_impl{
- models_path.string(),
- m_tokenizer,
- scheduler_config,
- device,
- plugin_config} {
- m_generation_config = m_impl.get_config();
- }
-
- DecodedResults generate(
- StringInputs inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- std::vector prompts = std::visit(overloaded{
- [](const std::string& prompt) {
- return std::vector{prompt};
- },
- [](std::vector& prompts) {
- return prompts;
- }
- }, inputs);
- const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
- // -1 == config.eos_token_id and config.validate() are handled in m_impl.
- std::vector generated = m_impl.generate(
- prompts,
- std::vector{prompts.size(), config},
- streamer
- );
- std::vector plain_replies;
- std::vector plain_scores;
- for (GenerationResult& res : generated) {
- OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
- std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
- std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
- }
- return {std::move(plain_replies), std::move(plain_scores)};
- }
-
- EncodedResults generate(
- const EncodedInputs& inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- std::vector input_ids = std::visit(overloaded{
- [](const ov::Tensor& inp) {
- size_t batch_size = inp.get_shape().at(0);
- if (1 == batch_size) {
- return std::vector{inp};
- }
- std::vector input_ids;
- input_ids.reserve(batch_size);
- size_t max_len = inp.get_shape().at(1);
- const int64_t* const source = inp.data();
- for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
- input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
- int64_t* destination = input_ids.back().data();
- std::copy_n(source + batch_id * max_len, max_len, destination);
- }
- return input_ids;
- },
- [](const TokenizedInputs& inp) {
- size_t batch_size = inp.input_ids.get_shape().at(0);
- std::vector input_ids;
- input_ids.reserve(batch_size);
- size_t max_len = inp.input_ids.get_shape().at(1);
-                const int64_t* const source = inp.input_ids.data<const int64_t>();
-                const int64_t* const attention_mask = inp.attention_mask.data<const int64_t>();
- for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
- input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
-                    int64_t* destination = input_ids.back().data<int64_t>();
- size_t copy_count = 0;
- for (size_t idx = 0; idx < max_len; ++idx) {
- if (1 == attention_mask[batch_id * max_len + idx]) {
- destination[copy_count++] = source[batch_id * max_len + idx];
- }
- }
- input_ids.back().set_shape({1, copy_count});
- }
- return input_ids;
- }
- }, inputs);
- const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
- // -1 == config.eos_token_id and config.validate() are handled in m_impl.
-        std::vector<EncodedGenerationResult> generated = m_impl.generate(input_ids, std::vector<GenerationConfig>{input_ids.size(), config}, streamer);
-        std::vector<std::vector<int64_t>> plain_tokens;
-        std::vector<float> plain_scores;
- for (EncodedGenerationResult& res : generated) {
- OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
- std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
- std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
- }
- return {std::move(plain_tokens), std::move(plain_scores)};
- }
-
- void start_chat(const std::string& system_message) override {
- m_impl.start_chat();
- };
-
- void finish_chat() override {
- m_impl.finish_chat();
- };
-};
-
-/*
-* NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are no files.
-* In the latter case ModelDesc is stored in properties.
-* This function pops ModelDesc from the properties and returns a pair of the updated properties and the ModelDesc.
-*/
-std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
- ov::AnyMap main_properties = properties;
- ov::genai::ModelConfigDesc model_descr;
-
- auto pop_property = [](ov::AnyMap& orig_propertis, const std::string& key, auto& value) {
- if (orig_propertis.find(key) != orig_propertis.end()) {
-            value = orig_propertis.at(key).as<std::decay_t<decltype(value)>>();
- orig_propertis.erase(key);
- }
- };
- pop_property(main_properties, "name_or_path", model_descr.name_or_path);
- pop_property(main_properties, "type", model_descr.type);
- pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-
- return {main_properties, model_descr};
-}
-}
+// Public LLMPipeline
ov::genai::LLMPipeline::LLMPipeline(
const ov::InferRequest& request,
@@ -705,8 +91,6 @@ ov::genai::LLMPipeline::LLMPipeline(
OptionalGenerationConfig generation_config) {
auto start_time = std::chrono::steady_clock::now();
    m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config);
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
}
ov::genai::LLMPipeline::LLMPipeline(
@@ -721,32 +105,31 @@ ov::genai::LLMPipeline::LLMPipeline(
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
} else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
+ m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
} else {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
}
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+ m_pimpl->save_load_time(start_time);
}
ov::genai::LLMPipeline::LLMPipeline(
const std::filesystem::path& models_path,
const std::string& device,
- const ov::AnyMap& config) {
+ const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();
- if (config.find(ov::genai::scheduler_config.name()) != config.end() ||
- config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() ||
- config.find(ov::genai::prompt_lookup.name()) != config.end()) {
- auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
-        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
+ if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+ properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+ properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
+ auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
} else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
+ m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
} else {
-        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
}
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+
+ m_pimpl->save_load_time(start_time);
}
ov::genai::LLMPipeline::LLMPipeline(
@@ -754,18 +137,17 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
- const ov::AnyMap& config,
+ const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config) {
- auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
-
auto start_time = std::chrono::steady_clock::now();
- if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() ||
- plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() ||
- plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){
- auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
+ if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+ properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+ properties.find(ov::genai::prompt_lookup.name()) != properties.end()){
+
+ auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
- tokenizer, scheduler_config, device, plugin_config_, generation_config);
+ tokenizer, scheduler_config, device, device_properties, generation_config);
} else if (device == "NPU") {
// TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
@@ -778,34 +160,64 @@ ov::genai::LLMPipeline::LLMPipeline(
// {"num_key_value_heads", 32}};
// ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
// This will convert from AnyMap to ModelDesc.
- auto [properties, model_descr] = split_model_descr(plugin_config);
+ auto [filtered_properties, model_descr] = split_model_descr(properties);
-        m_pimpl = std::make_unique<StaticLLMPipeline>(
+ m_pimpl = static_llm::LLMPipelineFactory::create(
utils::singleton_core().read_model(model_str, weights_tensor),
model_descr,
tokenizer,
device,
- properties,
+ filtered_properties,
generation_config
);
} else {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(
- utils::singleton_core().read_model(model_str, weights_tensor),
+ utils::singleton_core().read_model(model_str, weights_tensor),
tokenizer,
device,
- plugin_config,
+ properties,
generation_config);
}
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+
+ m_pimpl->save_load_time(start_time);
+}
+
+DecodedResults LLMPipeline::generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ return m_pimpl->generate(inputs, generation_config, streamer);
+}
+
+DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) {
+ auto config_arg = utils::get_config_from_map(config_map);
+ GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
+ config.update_generation_config(config_map);
+
+ return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map));
+}
+
+EncodedResults LLMPipeline::generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ return m_pimpl->generate(inputs, generation_config, streamer);
+}
+
+EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) {
+ auto config_arg = utils::get_config_from_map(config_map);
+ GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
+ config.update_generation_config(config_map);
+
+ return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map));
}
ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
- return m_pimpl->m_generation_config;
+ return m_pimpl->get_generation_config();
}
ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
- return m_pimpl->m_tokenizer;
+ return m_pimpl->get_tokenizer();
}
void ov::genai::LLMPipeline::start_chat(const std::string& system_message) {
@@ -817,13 +229,10 @@ void ov::genai::LLMPipeline::finish_chat() {
}
void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
- int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id;
- m_pimpl->m_generation_config = config;
- // if eos_token_id was not provided in config forward from default config
- if (config.eos_token_id == -1)
- m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
-
- m_pimpl->m_generation_config.validate();
+ m_pimpl->set_generation_config(config);
}
ov::genai::LLMPipeline::~LLMPipeline() = default;
+
+} // namespace genai
+} // namespace ov
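For orientation, here is a minimal usage sketch of the public API after this refactoring. It is not part of the patch and only exercises the ov::genai::LLMPipeline methods shown above; the model directory and the prompt are placeholders.

// Illustrative sketch, not part of the patch. "TinyLlama-1.1B-Chat-v1.0" is a placeholder model directory.
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 64;
    pipe.set_generation_config(config);  // validated via LLMPipelineImplBase::set_generation_config

    // String overload; the ov::AnyMap-based overload accepts generation properties instead.
    std::cout << pipe.generate("Why is the sky blue?") << '\n';
    return 0;
}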
diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp
index b2ad581e0b..5573272d7e 100644
--- a/src/cpp/src/llm_pipeline_base.hpp
+++ b/src/cpp/src/llm_pipeline_base.hpp
@@ -13,8 +13,26 @@ namespace genai {
class LLMPipelineImplBase {
public:
LLMPipelineImplBase(const Tokenizer& tokenizer,
- const GenerationConfig& config = {})
- : m_tokenizer(tokenizer), m_generation_config(config) {
+ const GenerationConfig& config)
+ : m_tokenizer(tokenizer), m_generation_config(config) { }
+
+ Tokenizer get_tokenizer() {
+ return m_tokenizer;
+ }
+
+ GenerationConfig get_generation_config() const {
+ return m_generation_config;
+ }
+
+ void set_generation_config(GenerationConfig config) {
+ int64_t default_eos_token_id = m_generation_config.eos_token_id;
+ m_generation_config = config;
+
+ // if eos_token_id was not provided in config forward from default config
+ if (m_generation_config.eos_token_id == -1)
+ m_generation_config.set_eos_token_id(default_eos_token_id);
+
+ m_generation_config.validate();
}
virtual DecodedResults generate(
@@ -34,6 +52,12 @@ class LLMPipelineImplBase {
virtual ~LLMPipelineImplBase() = default;
+ void save_load_time(std::chrono::steady_clock::time_point start_time) {
+ auto stop_time = std::chrono::steady_clock::now();
+        m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+ }
+
+protected:
Tokenizer m_tokenizer;
GenerationConfig m_generation_config;
    std::optional<AdapterController> m_adapter_controller;
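As a reference for the save_load_time() helper introduced above, here is a standalone sketch of the same steady_clock bookkeeping pattern; TimedLoader is an illustrative stand-in, not a library class.

// Standalone sketch of the load-time measurement pattern used by LLMPipelineImplBase.
#include <chrono>
#include <iostream>

struct TimedLoader {
    float m_load_time_ms = 0.0f;

    void save_load_time(std::chrono::steady_clock::time_point start_time) {
        auto stop_time = std::chrono::steady_clock::now();
        m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
    }
};

int main() {
    auto start = std::chrono::steady_clock::now();
    TimedLoader loader;  // heavy construction (model reading, compilation) would happen here
    loader.save_load_time(start);
    std::cout << "load time: " << loader.m_load_time_ms << " ms\n";
    return 0;
}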
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
new file mode 100644
index 0000000000..153fcc6fce
--- /dev/null
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -0,0 +1,402 @@
+
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "llm_pipeline_stateful.hpp"
+
+#include "lora_helper.hpp"
+#include "lm_encoding.hpp"
+#include "text_callback_streamer.hpp"
+#include "utils.hpp"
+
+namespace ov::genai {
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+ const ov::InferRequest& request,
+ const ov::genai::Tokenizer& tokenizer,
+ OptionalGenerationConfig generation_config)
+ : LLMPipelineImplBase(tokenizer, generation_config.has_value() ? *generation_config : GenerationConfig()),
+ m_model_runner(request) {}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& properties)
+ : StatefulLLMPipeline{
+ utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, properties),
+ tokenizer,
+ device,
+ properties,
+ utils::from_config_json_if_exists(models_path)
+ } {}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const std::shared_ptr<ov::Model>& model,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& properties,
+ const ov::genai::GenerationConfig& generation_config)
+ : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
+ utils::apply_slice_before_matmul_transformation(model);
+ m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
+
+ ov::CompiledModel compiled_model;
+ if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) {
+ m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
+ m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable
+ compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
+ m_model_runner = compiled_model.create_infer_request();
+ } else {
+ compiled_model = utils::singleton_core().compile_model(model, device, properties);
+ m_model_runner = compiled_model.create_infer_request();
+ }
+ ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
+
+    // If eos_token_id was not provided, take the value from the tokenizer
+ if (m_generation_config.eos_token_id == -1)
+ m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
+
+ m_sampler.set_seed(m_generation_config.rng_seed);
+}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const std::string& device,
+ const ov::AnyMap& plugin_config)
+ : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {}
+
+DecodedResults StatefulLLMPipeline::generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
+
+ if (is_chat_conversation)
+ OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
+ "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");
+
+ auto start_time = std::chrono::steady_clock::now();
+ GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+ // If eos_token_id was not provided, take value from default m_generation_config
+ if (config.eos_token_id == -1)
+ config.set_eos_token_id(m_generation_config.eos_token_id);
+ config.validate();
+
+ TokenizedInputs encoded_input;
+
+    if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
+ OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
+ encoded_input = m_tokenizer.encode(*input_vector);
+    } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
+ std::string& prompt = *input_prompt;
+
+ if (is_chat_conversation) {
+            // The KV cache of the model already contains prompts and answers from previous iterations.
+            // So only the new prompt, wrapped into the chat template, has to be sent to the model. The tokenizer always returns
+            // token_ids = {<bos token_id>, ...}, so if the tokenizer were applied only to the new prompt,
+            // <bos token_id> would be inserted on every iteration.
+            // Therefore the pipeline calculates input_ids for the whole chat history and for the chat history without the new prompt,
+            // and takes only the difference between them.
+            // The chat history cannot be stored as already encoded tokens because the generate call does not return the <eos> token,
+            // while the KV cache contains it. So we either add it manually or obtain it by tokenizing the whole chat history.
+
+ m_history.push_back({{"role", "user"}, {"content", prompt}});
+ constexpr bool add_generation_prompt = true;
+ auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+ // Do not add special tokens in chat scenario to be aligned with HF.
+ auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
+ auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+
+            // Some symbol combinations can be encoded by the tokenizer in different ways.
+            // If the sequence contains such a combination, we cannot correctly subtract the new history from the old history,
+            // so check for it, find the trusted part and use it on the next step.
+ size_t trusted_history_length = 0;
+ if (!m_tokenized_chat_history.empty()) {
+                std::set<int64_t> stop_tokens = config.stop_token_ids;
+ trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+ m_trust_encoded_history = trusted_history_length == SIZE_MAX;
+ }
+
+ if (m_tokenized_chat_history.empty()) {
+ encoded_input = new_chat_tokens;
+ } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
+                // does_kv_cache_need_to_update() is true here if beam search is activated.
+                // In beam search mode we want to remove the whole last model answer from the KV cache and add the best answer directly.
+                // If the model answer and the decoded answer differ, the difference is still smaller than the entire history, so use the data from m_kv_history_manager.
+ if (m_kv_history_manager.does_kv_cache_need_to_update()) {
+ trusted_history_length = m_kv_history_manager.trusted_history_length;
+ } else {
+ m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
+                    // if the previous generation finished because the maximum length was reached, the KV cache is missing the last token, so keep it
+ m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
+ }
+
+ ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+ {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
+                                                   new_chat_tokens.input_ids.data<int64_t>() + trusted_history_length);
+
+ ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
+                std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
+
+ encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+ {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
+ new_tensor.copy_to(encoded_input.input_ids);
+ encoded_input.attention_mask = new_attention_mask;
+ m_last_disappeared_token = std::nullopt;
+ } else {
+ encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
+ }
+ m_templated_chat_history = new_templated_chat_history;
+
+ m_tokenized_chat_history.clear();
+ m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
+            std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
+ std::back_inserter(m_tokenized_chat_history));
+
+ // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
+ } else {
+ encoded_input = m_tokenizer.encode(prompt);
+ }
+ }
+
+ auto encode_stop_time = std::chrono::steady_clock::now();
+ auto encoded_results = generate(encoded_input, config, streamer);
+
+ auto decode_start_time = std::chrono::steady_clock::now();
+ DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
+ auto decode_stop_time = std::chrono::steady_clock::now();
+
+ if (is_chat_conversation) {
+ // Tail of chat template is missing in KV cache.
+ // Find the tail to concatenate it with the next input prompt.
+ auto answer = decoded_results.texts[0];
+ m_templated_chat_history.append(answer);
+ m_history.push_back({{"role", "assistant"}, {"content", answer}});
+ }
+
+ // generate_durations
+ decoded_results.perf_metrics = encoded_results.perf_metrics;
+
+ auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
+ auto stop_time = std::chrono::steady_clock::now();
+    raw_counters.generate_durations = std::vector<MicroSeconds>();
+ raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+ raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
+ raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
+
+ // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics.
+ decoded_results.perf_metrics.m_evaluated = false;
+ decoded_results.perf_metrics.evaluate_statistics(start_time);
+ return decoded_results;
+}
+
+EncodedResults StatefulLLMPipeline::generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
+
+ if (is_chat_conversation)
+        // if the chat was run in StringInputs mode but generate() is now called with EncodedInputs, the last m_history entry will have the assistant role
+ OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
+ "Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");
+
+ auto start_time = std::chrono::steady_clock::now();
+ ov::Tensor input_ids;
+ ov::Tensor attention_mask;
+    if (auto data = std::get_if<ov::Tensor>(&inputs)) {
+ input_ids = *data;
+ attention_mask = ov::genai::utils::init_attention_mask(input_ids);
+    } else if (auto data = std::get_if<TokenizedInputs>(&inputs)) {
+ input_ids = data->input_ids;
+ attention_mask = data->attention_mask;
+ }
+
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+        std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
+
+ // Tail of previous output in chat mode is missing in KV cache.
+ if (m_last_disappeared_token.has_value()) {
+ attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
+ input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
+ }
+
+ GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+
+ // If eos_token_id was not provided, take value from default m_generation_config
+ if (config.eos_token_id == -1)
+ config.set_eos_token_id(m_generation_config.eos_token_id);
+ config.validate();
+
+ // Stateful pipeline does not provide logprobs for prompt tokens
+ OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
+
+    std::shared_ptr<StreamerBase> streamer_ptr;
+    if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) {
+        streamer_ptr = nullptr;
+    } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) {
+        streamer_ptr = *streamer_obj;
+    } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) {
+        streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
+ }
+
+ auto batch_size = input_ids.get_shape().at(0);
+ OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
+ (config.is_greedy_decoding() || config.is_multinomial()),
+ "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
+
+ auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
+ OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
+ "either (input_ids, attention_mask, beam_idx) or "
+ "(input_ids, attention_mask, position_ids, beam_idx) "
+ "but you have '" + std::to_string(num_inputs) + "' inputs");
+
+ ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
+
+ size_t kv_cache_len = 0;
+ ov::Tensor concatenated_attention_mask;
+ if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
+ OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
+ // If history is saved in KV cache, concatenate new attention_mask with the already existing.
+ // Between subsequent runs attention_mask should not be modified.
+ auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
+ auto prompt_len = attention_mask.get_shape()[1];
+
+ kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
+
+ ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
+        auto start_atten_hst = atten_mask_history.data<int64_t>();
+
+        std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
+                  new_atten_mask.data<int64_t>());
+        std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
+                  new_atten_mask.data<int64_t>() + kv_cache_len);
+ concatenated_attention_mask = new_atten_mask;
+ } else {
+ concatenated_attention_mask = attention_mask;
+ }
+
+ size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
+
+ bool position_ids_available = (num_inputs == 4);
+    std::optional<ov::Tensor> position_ids = std::nullopt;
+ if (position_ids_available) {
+ position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
+ utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
+ }
+
+ if(m_adapter_controller) {
+ m_adapter_controller->apply(m_model_runner, config.adapters);
+ }
+
+ if (is_chat_conversation && !m_trust_encoded_history) {
+ m_trust_encoded_history = true;
+ m_kv_history_manager.reset();
+ }
+
+    std::vector<SequenceGroup::Ptr> requests;
+ size_t block_size = 1;
+
+ for (size_t request_id = 0; request_id < batch_size; request_id++) {
+ SequenceGroup::Ptr sequence_group;
+ if (is_chat_conversation) {
+ ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size);
+ } else {
+ size_t seq_len = input_ids.get_shape().at(1);
+ size_t batch_offset = request_id * seq_len;
+            const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
+            std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
+
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size);
+ }
+
+ requests.push_back(sequence_group);
+ }
+
+ if (m_sampler.get_seed() != config.rng_seed) {
+ m_sampler.set_seed(config.rng_seed);
+ }
+
+ ov::genai::EncodedResults result;
+ std::tie(result, m_last_disappeared_token) = get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
+ streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
+
+ if (is_chat_conversation) {
+ // force remove from kv_cache last answer
+ if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
+ m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
+ m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
+ }
+
+ std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+ } else {
+ reset_kv_state();
+ m_last_disappeared_token = std::nullopt;
+ }
+
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+ std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+
+ auto stop_time = std::chrono::steady_clock::now();
+
+    // If generate() is called directly with encoded inputs (no tokenization), tokenization/detokenization stats are not reported.
+ auto& metrics = result.perf_metrics;
+ metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
+ metrics.load_time = m_load_time_ms;
+ metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+ metrics.evaluate_statistics(start_time);
+ return result;
+}
+
+void StatefulLLMPipeline::start_chat(const std::string& system_message) {
+ is_chat_conversation = true;
+ m_trust_encoded_history = true;
+ m_kv_history_manager.reset();
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+ m_last_disappeared_token = std::nullopt;
+ if (!m_tokenized_chat_history.empty()) {
+ reset_kv_state();
+ m_history = {};
+ m_templated_chat_history = "";
+ m_tokenized_chat_history.clear();
+ }
+ if (system_message.empty())
+ return;
+
+ m_history.push_back({{"role", "system"}, {"content", system_message}});
+ constexpr bool add_generation_prompt = false;
+
+ m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+}
+
+void StatefulLLMPipeline::reset_kv_state() {
+ if(m_adapter_controller) {
+ for(auto& state: m_model_runner.query_state()) {
+ if(!m_adapter_controller->has_state_name(state.get_name())) {
+ state.reset();
+ }
+ }
+ } else {
+ m_model_runner.reset_state();
+ }
+}
+
+void StatefulLLMPipeline::finish_chat() {
+ is_chat_conversation = false;
+ m_trust_encoded_history = true;
+ m_kv_history_manager.reset();
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+ m_last_disappeared_token = std::nullopt;
+ if (!m_tokenized_chat_history.empty()) {
+ reset_kv_state();
+ m_history.clear();
+ m_templated_chat_history.clear();
+ m_tokenized_chat_history.clear();
+ }
+}
+
+} // namespace ov::genai
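The streamer selection in StatefulLLMPipeline::generate() above is a plain std::get_if dispatch over the StreamerVariant alternatives. A self-contained sketch of that pattern follows; EchoStreamer and Variant are illustrative stand-ins, not the ov::genai definitions.

// Self-contained sketch of the std::get_if dispatch used for streamers above.
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <variant>

struct EchoStreamer {
    void put(const std::string& chunk) { std::cout << chunk; }
};

using Variant = std::variant<std::monostate,
                             std::shared_ptr<EchoStreamer>,
                             std::function<bool(std::string)>>;

void dispatch(const Variant& streamer, const std::string& chunk) {
    if (std::get_if<std::monostate>(&streamer)) {
        // no streaming requested
    } else if (auto obj = std::get_if<std::shared_ptr<EchoStreamer>>(&streamer)) {
        (*obj)->put(chunk);
    } else if (auto cb = std::get_if<std::function<bool(std::string)>>(&streamer)) {
        (*cb)(chunk);  // returning true would request early stop
    }
}

int main() {
    dispatch(Variant{std::make_shared<EchoStreamer>()}, "token ");
    dispatch(Variant{std::in_place_type<std::function<bool(std::string)>>,
                     [](std::string s) { std::cout << s << '\n'; return false; }}, "token");
    return 0;
}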
diff --git a/src/cpp/src/llm_pipeline_stateful.hpp b/src/cpp/src/llm_pipeline_stateful.hpp
new file mode 100644
index 0000000000..dbf8d89391
--- /dev/null
+++ b/src/cpp/src/llm_pipeline_stateful.hpp
@@ -0,0 +1,77 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+
+#include "llm_pipeline_base.hpp"
+#include "sampler.hpp"
+#include "utils.hpp"
+
+namespace ov::genai {
+
+class StatefulLLMPipeline final : public LLMPipelineImplBase {
+ ov::InferRequest m_model_runner;
+ Sampler m_sampler;
+
+ // Chat scenario specific parameters
+ bool is_chat_conversation = false;
+ bool m_trust_encoded_history = true;
+ ChatHistory m_history;
+ std::string m_templated_chat_history = {};
+    std::vector<int64_t> m_tokenized_chat_history;
+ ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+ // Tail of previous output in chat mode is missing in KV cache, let's keep it
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If the sequence contains symbols that the tokenizer can encode ambiguously, we need to trim the KV cache.
+    // If beam search sampling is used in chat mode, we need to remove the last model answer from the KV cache and add the best answer to the history.
+    // So keep track of how many tokens to trim from the KV cache and how many tokens of the history to keep.
+ ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
+ size_t m_kv_cache_seq_length_axis = 2;
+
+ void reset_kv_state();
+public:
+
+ StatefulLLMPipeline(
+ const ov::InferRequest& request,
+ const ov::genai::Tokenizer& tokenizer,
+ OptionalGenerationConfig generation_config = std::nullopt
+ );
+
+ StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ );
+
+ StatefulLLMPipeline(
+        const std::shared_ptr<ov::Model>& model,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& config,
+ const ov::genai::GenerationConfig& generation_config
+ );
+
+ StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ );
+
+ DecodedResults generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override;
+
+ EncodedResults generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override;
+
+ void start_chat(const std::string& system_message) override;
+
+ void finish_chat() override;
+};
+
+} // namespace ov::genai
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 6f4f124894..c98b571179 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -1,8 +1,10 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "llm_pipeline_static.hpp"
+#include "sampler.hpp"
+
#include
#include
@@ -235,12 +237,12 @@ enum class GenerateHint {
std::string to_string(GenerateHint h) {
switch(h) {
- case GenerateHint::FAST_COMPILE :
+ case GenerateHint::FAST_COMPILE :
return "FAST_COMPILE";
- case GenerateHint::BEST_PERF :
+ case GenerateHint::BEST_PERF :
return "BEST_PERF";
default:
- OPENVINO_THROW("Unsupported value for type GenerateHint provided");
+ OPENVINO_THROW("Unsupported value for type GenerateHint provided");
}
}
@@ -396,12 +398,12 @@ KVAxesPosition get_kv_axes(const std::string& model_type) {
return axes;
}
-ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
+ov::genai::static_llm::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
std::ifstream file(filepath);
- OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string());
+ OPENVINO_ASSERT(file.is_open(), "Could not open file: ", filepath);
nlohmann::json config_data = nlohmann::json::parse(file);
- ov::genai::ModelConfigDesc desc;
+ ov::genai::static_llm::ModelConfigDesc desc;
    desc.type = config_data["model_type"].get<std::string>();
// NB: In case _name_or_path field isn't presented in config.json
if (config_data.contains("_name_or_path")) {
@@ -586,6 +588,19 @@ std::optional<uint32_t> pop_int_and_cast(ov::AnyMap& config, const std::string&
return std::nullopt;
}
+void update_config(ov::AnyMap& config, const std::pair<std::string, ov::Any>& pair) {