diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 4aad3d4bc3..fb0c9c4b0b 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -16,10 +16,10 @@ concurrency:
cancel-in-progress: true
env:
- l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241224_x86_64.tgz
- l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
- m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241224_x86_64.tgz
- w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
+ l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241230_x86_64.tgz
+ l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
+ m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241230_x86_64.tgz
+ w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip
jobs:
cpp-multinomial-greedy_causal_lm-ubuntu:
runs-on: ubuntu-20.04-8-cores
diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/genai-tools.yml
similarity index 78%
rename from .github/workflows/llm_bench-python.yml
rename to .github/workflows/genai-tools.yml
index 56145c080c..bd6cb46362 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/genai-tools.yml
@@ -1,7 +1,7 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-name: llm_bench Python Test
+name: GenAI tools
on:
workflow_dispatch:
@@ -44,9 +44,10 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
- build:
+ llm_bench:
+ name: 'LLM bench tests'
defaults:
run:
shell: bash
@@ -60,7 +61,6 @@ jobs:
OV_INSTALL_DIR: ${{ github.workspace }}/ov
SRC_DIR: ${{ github.workspace }}
LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
- WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -70,6 +70,12 @@ jobs:
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
+ - name: Lint with flake8
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install flake8 pytest black
+ # stop the build if there are Python syntax errors or undefined names
+ python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
- name: Download OpenVINO package
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
@@ -78,59 +84,42 @@ jobs:
merge-multiple: true
- name: Install dependencies
run: |
- python -m pip install --upgrade pip
- python -m pip install flake8 pytest black
python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
- GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+ python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
working-directory: ${{ env.OV_INSTALL_DIR }}
- - name: Lint with flake8
- run: |
- # stop the build if there are Python syntax errors or undefined names
- python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
- python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
- - name: Create code style diff for samples
- if: failure()
- run: |
- python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/
- git diff > llm.bench_diff.diff
- - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
- if: failure()
- with:
- name: llm.bench_diff
- path: llm.bench_diff.diff
- - name: Test native pytorch model on Linux
+ - name: Test native pytorch model
run: |
git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen
python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt -ic 20
rm -rf tiny-random-qwen
env:
GIT_LFS_SKIP_SMUDGE: 0
- - name: Test tiny-random-baichuan2 on Linux Optimum Intel
+ - name: Test tiny-random-baichuan2 Optimum Intel
run: |
optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum -ic 10
rm -rf ./ov_models/tiny-random-baichuan2
- - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
+ - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov Optimum Intel
run: |
huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4
- - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
+ - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI
run: |
python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4
- - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
+ - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI and LoRA
run: |
wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4
rm -rf ./ov_models/lcm_dreamshaper_v7/
- - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
+ - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding via GenAI
run: |
optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 -ic 20
python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 -ic 20
rm -rf ov_models/TinyLlama-1.1B-Chat-v1.0
- - name: Test whisper-tiny on Linux
+ - name: Test whisper-tiny via GenAI
run: |
GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
cd multilingual_librispeech
@@ -143,60 +132,64 @@ jobs:
python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
rm -rf ./ov_models/whisper-tiny
rm -rf multilingual_librispeech
- - name: Text InternVL2-1B on Linux
+ - name: Test InternVL2-1B via GenAI
run: |
optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code
python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20
python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum
rm -rf ./ov_models/internvl2-1B
- - name: WWB Tests
- run: |
- pip install git+https://github.com/huggingface/optimum-intel.git
- GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
- python -m pytest -v ${{ env.WWB_PATH }}/tests
- stateful:
+
+ wwb:
+ name: 'WWB tests'
defaults:
run:
shell: bash
runs-on: ubuntu-22.04
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.11"]
needs: [ openvino_download ]
env:
OV_INSTALL_DIR: ${{ github.workspace }}/ov
SRC_DIR: ${{ github.workspace }}
- LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
submodules: recursive
- - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
- python-version: "3.11"
+ python-version: ${{ matrix.python-version }}
+ - name: Lint with flake8
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install flake8 pytest black
+ # stop the build if there are Python syntax errors or undefined names
+ python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
- name: Download OpenVINO package
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: ${{ needs.openvino_download.outputs.ov_artifact_name }}
path: ${{ env.OV_INSTALL_DIR }}
merge-multiple: true
- - name: Test stateful
+ - name: Install dependencies
run: |
python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
- GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
- python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful
- grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml
+ python -m pip install -r ${{ env.WWB_PATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+ python -m pip install git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
working-directory: ${{ env.OV_INSTALL_DIR }}
- name: WWB Tests
run: |
- pip install pytest
- pip install git+https://github.com/huggingface/optimum-intel.git
- GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
+ python -m pip install -v ${{ env.WWB_PATH }}
python -m pytest -v ${{ env.WWB_PATH }}/tests
Overall_Status:
name: ci/gha_overall_status_llm_bench
- needs: [openvino_download, build, stateful]
+ needs: [openvino_download, llm_bench, wwb]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
diff --git a/.github/workflows/job_vlm_sample_llava.yml b/.github/workflows/job_vlm_sample_llava.yml
index 5f4634616a..781526f71f 100644
--- a/.github/workflows/job_vlm_sample_llava.yml
+++ b/.github/workflows/job_vlm_sample_llava.yml
@@ -11,7 +11,7 @@ on:
type: string
env:
- l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+ l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
jobs:
visual_language_chat_sample-ubuntu-llava:
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
index c525b0be68..cbd847240d 100644
--- a/.github/workflows/lcm_dreamshaper_cpp.yml
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -18,8 +18,8 @@ concurrency:
env:
PYTHON_VERSION: '3.9'
- LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
- WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
+ LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
+ WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip
OV_INSTALL_DIR: ${{ github.workspace }}/ov
jobs:
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 9b21491f9b..0d7a5b7bae 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -52,7 +52,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
- name: Clone docker tag from OpenVINO repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -109,10 +109,10 @@ jobs:
merge-multiple: true
- name: CMake Build
- run: |
+ run: |
source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ${{ env.SRC_DIR}} -B ${{ env.BUILD_DIR }}
- cmake --build ${{ env.BUILD_DIR}} --config ${{ matrix.build-type }} --parallel $(nproc)
+ cmake --build ${{ env.BUILD_DIR}} --config ${{ matrix.build-type }} --parallel $(nproc) --verbose
cmake --install ${{ env.BUILD_DIR }} --config ${{ matrix.build-type }} --prefix ${{ env.INSTALL_DIR }}
- name: Pack Artifacts
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 4d9b7f032b..062b83fc27 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: macOS (12, Python 3.9)
+name: macOS (12, Python 3.10)
on:
workflow_dispatch:
pull_request:
@@ -16,8 +16,8 @@ concurrency:
cancel-in-progress: true
env:
- PYTHON_VERSION: '3.9'
- OV_BRANCH: master
+ PYTHON_VERSION: '3.10'
+ OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
OV_TARBALL: ''
jobs:
@@ -219,7 +219,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release -j
+ cmake --build ./build/ --config Release --parallel --verbose
- name: Test bindings
run: |
@@ -284,7 +284,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release --target py_openvino_genai -j
+ cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
- name: Test bindings
run: |
@@ -350,7 +350,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
- cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
+ cmake --build ./build/ --config ${{ matrix.build-type }} --target package --parallel --verbose
- name: Build and Install dependencies
run: |
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index 34c5a0f87e..3b01697f26 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -45,7 +45,7 @@ jobs:
with:
platform: ubuntu22
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
openvino_download_windows:
name: Download OpenVINO for Windows
@@ -71,7 +71,7 @@ jobs:
with:
platform: windows
commit_packages_to_provide: wheels
- revision: latest_available_commit
+ revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
stable_diffusion_1_5_cpp-linux:
runs-on: ubuntu-22.04-8-cores
@@ -122,6 +122,8 @@ jobs:
source openvino_sd_cpp/bin/activate
optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 --task stable-diffusion models/dreamlike-art-dreamlike-anime-1.0/FP16
wget -O ./models/soulcard.safetensors https://civitai.com/api/download/models/72591
+ env:
+ HF_HUB_ENABLE_HF_TRANSFER: 1
- name: Run text2image app
run: |
@@ -198,6 +200,8 @@ jobs:
. "./openvino_sd_cpp/Scripts/Activate.ps1"
optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike-art-dreamlike-anime-1.0/FP16
Invoke-WebRequest -Uri 'https://civitai.com/api/download/models/72591' -OutFile 'models/soulcard.safetensors'
+ env:
+ HF_HUB_ENABLE_HF_TRANSFER: 1
- name: Run text2image app
run: |
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index fc63129281..95a713d7a1 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:
env:
PYTHON_VERSION: '3.11'
- OV_BRANCH: master
+ OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
OV_TARBALL: ''
jobs:
@@ -230,7 +230,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release -j
+ cmake --build ./build/ --config Release --parallel --verbose
- name: Test bindings
run: |
@@ -295,7 +295,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release --target py_openvino_genai -j
+ cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
- name: Test bindings
run: |
@@ -360,7 +360,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- cmake --build ./build/ --config Release --target py_openvino_genai -j
+ cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
- name: Test bindings
run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fec8df34af..3a67a24bab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,7 @@ if(NOT OpenVINODeveloperPackage_FOUND)
endif()
include(cmake/features.cmake)
+include(cmake/version.cmake)
if(ENABLE_PYTHON)
# the following two calls are required for cross-compilation
@@ -85,7 +86,6 @@ if(MSVC AND MSVC_VERSION GREATER_EQUAL 1930 AND MSVC_VERSION LESS 1941)
add_compile_definitions(_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
endif()
-
add_subdirectory(thirdparty)
add_subdirectory(src)
if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")
diff --git a/README.md b/README.md
index be3de5e8ce..c5cf799973 100644
--- a/README.md
+++ b/README.md
@@ -133,13 +133,15 @@ from PIL import Image
# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
pipe = ov_genai.VLMPipeline("./InternVL2-1B", "CPU")
+pipe.start_chat()
image = Image.open("dog.jpg")
image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
image_data = ov.Tensor(image_data)
prompt = "Can you describe the image?"
-print(pipe.generate(prompt, image=image_data, max_new_tokens=100))
+result = pipe.generate(prompt, image=image_data, max_new_tokens=100)
+print(result.texts[0])
```
### Run generation using VLMPipeline in C++
@@ -392,7 +394,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Automati
## Additional materials
-- [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet)
+- [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet)
- [OpenVINO Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
- [Optimum-intel and OpenVINO](https://huggingface.co/docs/optimum/intel/openvino/export)
diff --git a/src/docs/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
similarity index 95%
rename from src/docs/SUPPORTED_MODELS.md
rename to SUPPORTED_MODELS.md
index 44da29ced4..6b45f47890 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -147,6 +147,8 @@
+> [!NOTE]
+> LoRA adapters are supported.
The pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. The model is required to have the following inputs after the conversion:
1. `input_ids` contains the tokens.
@@ -165,12 +167,14 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Latent Consistency Model |
Supported |
Supported |
+ Supported |
SimianLuo/LCM_Dreamshaper_v7
@@ -181,6 +185,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Stable Diffusion |
Supported |
Supported |
+ Supported |
CompVis/stable-diffusion-v1-1
@@ -213,6 +218,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Stable Diffusion XL |
Supported |
Supported |
+ Supported |
stabilityai/stable-diffusion-xl-base-0.9
@@ -225,6 +231,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Stable Diffusion 3 |
Supported |
Not supported |
+ Not supported |
stabilityai/stable-diffusion-3-medium-diffusers
@@ -237,6 +244,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Flux |
Supported |
Not supported |
+ Not supported |
black-forest-labs/FLUX.1-schnell
@@ -260,10 +268,12 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
Architecture |
+ LoRA support |
Example HuggingFace Models |
Stable Diffusion |
+ Supported |
|
Stable Diffusion XL |
+ Supported |
|
-
+
@@ -292,11 +311,13 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
Architecture |
Models |
+ LoRA support |
Example HuggingFace Models |
InternVL2 |
InternVL2 |
+ Not supported |
OpenGVLab/InternVL2-1B
@@ -309,6 +330,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
LLaVA |
LLaVA-v1.5 |
+ Not supported |
llava-hf/llava-1.5-7b-hf
@@ -318,6 +340,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
LLaVA-NeXT |
LLaVa-v1.6 |
+ Not supported |
llava-hf/llava-v1.6-mistral-7b-hf
@@ -329,6 +352,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
MiniCPMV |
MiniCPM-V-2_6 |
+ Not supported |
openbmb/MiniCPM-V-2_6
@@ -345,11 +369,13 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
Architecture |
Models |
+ LoRA support |
Example HuggingFace Models |
WhisperForConditionalGeneration |
Whisper |
+ Not supported |
openai/whisper-tiny
@@ -366,6 +392,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
|
Distil-Whisper |
+ Not supported |
distil-whisper/distil-small.en
diff --git a/cmake/templates/__version__.py.in b/cmake/templates/__version__.py.in
deleted file mode 100644
index ce8e01a246..0000000000
--- a/cmake/templates/__version__.py.in
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# Will be overwritten by cmake.
-__version__ = "@OpenVINOGenAI_VERSION@"
diff --git a/cmake/templates/version.cpp.in b/cmake/templates/version.cpp.in
new file mode 100644
index 0000000000..f6015832f9
--- /dev/null
+++ b/cmake/templates/version.cpp.in
@@ -0,0 +1,19 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "openvino/genai/version.hpp"
+
+namespace ov {
+namespace genai {
+
+const Version get_version() {
+ const static Version version = {
+ "@OpenVINOGenAI_FULL_VERSION@",
+ "OpenVINO GenAI version",
+ };
+
+ return version;
+}
+
+} // namespace genai
+} // namespace ov
diff --git a/cmake/templates/version.hpp.in b/cmake/templates/version.hpp.in
new file mode 100644
index 0000000000..34120ef632
--- /dev/null
+++ b/cmake/templates/version.hpp.in
@@ -0,0 +1,34 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/core/version.hpp"
+#include "openvino/genai/visibility.hpp"
+
+/**
+ * OpenVINO GenAI major version
+ */
+#define OPENVINO_GENAI_VERSION_MAJOR @OpenVINOGenAI_VERSION_MAJOR@
+
+/**
+ * OpenVINO GenAI minor version
+ */
+#define OPENVINO_GENAI_VERSION_MINOR @OpenVINOGenAI_VERSION_MINOR@
+
+/**
+ * OpenVINO GenAI patch version
+ */
+#define OPENVINO_GENAI_VERSION_PATCH @OpenVINOGenAI_VERSION_PATCH@
+
+namespace ov {
+namespace genai {
+
+/**
+ * Returns OpenVINO GenAI full version including git commit and hash information in form of:
+ * <MAJOR>.<MINOR>.<PATCH>.<REVISION>-<COMMIT NUMBER>-<COMMIT HASH>[-<BRANCH SUFFIX>]
+ */
+OPENVINO_EXTERN_C OPENVINO_GENAI_EXPORTS const ov::Version OPENVINO_CDECL get_version();
+
+} // namespace genai
+} // namespace ov
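
A minimal consumer sketch for the API declared above (not part of this patch); it assumes the header is installed as `openvino/genai/version.hpp` and that `ov::Version` exposes `buildNumber` and `description` as in OpenVINO core:

```
// Sketch: query the OpenVINO GenAI build version exposed by get_version().
#include <iostream>

#include "openvino/genai/version.hpp"

int main() {
    const ov::Version version = ov::genai::get_version();
    // description reads "OpenVINO GenAI version"; buildNumber carries the full
    // <MAJOR>.<MINOR>.<PATCH>.<REVISION>-<COMMIT NUMBER>-<COMMIT HASH>[-<BRANCH SUFFIX>] string.
    std::cout << version.description << ": " << version.buildNumber << std::endl;

    // Compile-time components are available through the generated macros.
    std::cout << OPENVINO_GENAI_VERSION_MAJOR << "."
              << OPENVINO_GENAI_VERSION_MINOR << "."
              << OPENVINO_GENAI_VERSION_PATCH << std::endl;
    return 0;
}
```
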
diff --git a/cmake/version.cmake b/cmake/version.cmake
new file mode 100644
index 0000000000..b9b51e8fe2
--- /dev/null
+++ b/cmake/version.cmake
@@ -0,0 +1,72 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+find_package(Git QUIET)
+
+function(ov_genai_branch_name VAR)
+ if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+ WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+ OUTPUT_VARIABLE GIT_BRANCH
+ RESULT_VARIABLE EXIT_CODE
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(EXIT_CODE EQUAL 0)
+ set(${VAR} ${GIT_BRANCH} PARENT_SCOPE)
+ endif()
+ endif()
+endfunction()
+
+function(ov_genai_commit_hash VAR)
+ if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-parse --short=11 HEAD
+ WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+ OUTPUT_VARIABLE GIT_COMMIT_HASH
+ RESULT_VARIABLE EXIT_CODE
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(EXIT_CODE EQUAL 0)
+ set(${VAR} ${GIT_COMMIT_HASH} PARENT_SCOPE)
+ endif()
+ endif()
+endfunction()
+
+function(ov_genai_commit_number VAR)
+ set(GIT_COMMIT_NUMBER_FOUND OFF)
+ if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+ WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+ OUTPUT_VARIABLE GIT_COMMIT_NUMBER
+ RESULT_VARIABLE EXIT_CODE
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(EXIT_CODE EQUAL 0)
+ set(GIT_COMMIT_NUMBER_FOUND ON)
+ set(${VAR} ${GIT_COMMIT_NUMBER} PARENT_SCOPE)
+ endif()
+ endif()
+ if(NOT GIT_COMMIT_NUMBER_FOUND)
+ # set zeros since git is not available
+ set(${VAR} "000" PARENT_SCOPE)
+ endif()
+endfunction()
+
+function(ov_genai_full_version full_version)
+ if(GIT_FOUND)
+ ov_genai_branch_name(GIT_BRANCH)
+ ov_genai_commit_hash(GIT_COMMIT_HASH)
+ ov_genai_commit_number(GIT_COMMIT_NUMBER)
+
+ if(NOT GIT_BRANCH MATCHES "^(master|HEAD)$")
+ set(GIT_BRANCH_POSTFIX "-${GIT_BRANCH}")
+ endif()
+
+ set(${full_version} "${OpenVINOGenAI_VERSION}-${GIT_COMMIT_NUMBER}-${GIT_COMMIT_HASH}${GIT_BRANCH_POSTFIX}" PARENT_SCOPE)
+ else()
+ set(${full_version} "${OpenVINOGenAI_VERSION}" PARENT_SCOPE)
+ endif()
+endfunction()
+
+ov_genai_full_version(OpenVINOGenAI_FULL_VERSION)
+message(STATUS "OpenVINO GenAI full version: ${OpenVINOGenAI_FULL_VERSION}")
diff --git a/llm_bench/python/README.md b/llm_bench/python/README.md
deleted file mode 100644
index 272ed11d1b..0000000000
--- a/llm_bench/python/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Benchmarking Script for Large Language Models
-
-> [!IMPORTANT]
-> LLM bench code was moved to [tools](../../tools/llm_bench/) directory. Please navigate to the new directory for continue of tool usage.
\ No newline at end of file
diff --git a/llm_bench/python/who_what_benchmark/README.md b/llm_bench/python/who_what_benchmark/README.md
deleted file mode 100644
index 414b4d9342..0000000000
--- a/llm_bench/python/who_what_benchmark/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Simple Accuracy Benchmark for Generative AI models
-
-> [!IMPORTANT]
-> Who What Benchmark code was moved to [tools](../../../tools/who_what_benchmark/) directory. Please navigate to the new directory for continue of tool usage.
\ No newline at end of file
diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md
index 39364d51ee..73baf0088a 100644
--- a/samples/cpp/visual_language_chat/README.md
+++ b/samples/cpp/visual_language_chat/README.md
@@ -29,7 +29,7 @@ Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/o
Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model `llava-hf/llava-v1.6-mistral-7b-hf` can benefit from being run on a dGPU. Modify the source code to change the device for inference to the `GPU`.
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#visual-language-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#visual-language-models) for the list of supported models.
## Run benchmark:
diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md
index d649266613..2ea3322dee 100644
--- a/samples/cpp/whisper_speech_recognition/README.md
+++ b/samples/cpp/whisper_speech_recognition/README.md
@@ -31,7 +31,7 @@ Output:
timestamps: [0, 2] text: How are you doing today?
```
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
# Whisper pipeline usage
diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt
index 428e0955a5..c6ad9eaaa8 100644
--- a/samples/deployment-requirements.txt
+++ b/samples/deployment-requirements.txt
@@ -2,4 +2,4 @@
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
openvino_genai~=2025.0.0.0.dev
librosa==0.10.2.post1 # For Whisper
-pillow==11.0.0 # Image processing for VLMs
+pillow==11.1.0 # Image processing for VLMs
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index a589696beb..2f71891b7b 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
@@ -10,3 +10,4 @@ diffusers==0.32.1 # For image generation pipelines
timm==1.0.12 # For exporting InternVL2
torchvision # For visual language models
transformers>=4.43 # For Whisper
+hf_transfer # for faster model downloads; should be used with env var HF_HUB_ENABLE_HF_TRANSFER=1
\ No newline at end of file
diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
index 953388ed6a..5ec9d54601 100755
--- a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
+++ b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
@@ -90,7 +90,7 @@ def put(self, token_id: int) -> bool:
word = text[self.print_len:]
self.tokens_cache = []
self.print_len = 0
- elif len(text) >= 3 and text[-3:] == chr(65533):
+ elif len(text) >= 3 and text[-1] == chr(65533):
# Don't print incomplete text.
pass
elif len(text) > self.print_len:
diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md
index aeb46444bf..5f373df2b7 100644
--- a/samples/python/whisper_speech_recognition/README.md
+++ b/samples/python/whisper_speech_recognition/README.md
@@ -38,7 +38,7 @@ Output:
timestamps: [0, 2] text: How are you doing today?
```
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
# Whisper pipeline usage
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index d02f32ded9..e954037daf 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -54,19 +54,32 @@ FetchContent_MakeAvailable(safetensors.h)
ov_genai_build_jinja2cpp()
+# generate version files
+
+configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.hpp.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/openvino/genai/version.hpp" @ONLY)
+
+configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.cpp.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/version.cpp" @ONLY)
+
# Library
file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")
+list(APPEND SOURCE_FILES "${CMAKE_CURRENT_BINARY_DIR}/version.cpp")
set(TARGET_NAME openvino_genai)
+
add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
+add_library(openvino::genai ALIAS ${TARGET_NAME})
+
if(TARGET openvino_tokenizers)
add_dependencies(${TARGET_NAME} openvino_tokenizers)
endif()
-add_library(openvino::genai ALIAS ${TARGET_NAME})
target_include_directories(${TARGET_NAME}
- PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>"
+ PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+        "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>"
+        "$<INSTALL_INTERFACE:runtime/include>"
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src")
target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")
@@ -81,6 +94,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
)
+
# Extract two last digits from OpenVINOGenAI_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols.
string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${OpenVINOGenAI_VERSION_MAJOR})
if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND LINUX)
@@ -98,7 +112,7 @@ endif()
if(OpenVINODeveloperPackage_FOUND)
# must be called after all target_link_libraries
- # ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
ov_ncc_naming_style(FOR_TARGET ${TARGET_NAME}
SOURCE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include")
@@ -142,6 +156,9 @@ install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
DESTINATION runtime/include COMPONENT core_genai_dev)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/openvino/genai/version.hpp
+ DESTINATION runtime/include/openvino/genai COMPONENT core_genai_dev)
+
install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake
NAMESPACE openvino:: DESTINATION runtime/cmake
COMPONENT core_genai_dev)
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index 74466ee488..ed9fc3a30d 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -52,8 +52,9 @@ struct PipelineMetrics {
class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
protected:
- class ImplInterface;
+ class IContinuousBatchingPipeline;
class ContinuousBatchingImpl;
+
class ContinuousBatchingForSpeculativeDecodingImpl;
class ContinuousBatchingForPromptLookupImpl;
class SpeculativeDecodingImpl;
@@ -64,7 +65,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
friend class SpeculativeDecodingImpl;
friend class PromptLookupImpl;
- std::shared_ptr<ImplInterface> m_impl;
+ std::shared_ptr<IContinuousBatchingPipeline> m_impl;
ContinuousBatchingPipeline() = default;
diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp
index 277ec57cc3..b6b91bee20 100644
--- a/src/cpp/include/openvino/genai/lora_adapter.hpp
+++ b/src/cpp/include/openvino/genai/lora_adapter.hpp
@@ -188,7 +188,7 @@ class OPENVINO_GENAI_EXPORTS AdapterController {
AdapterController(std::shared_ptr<ov::Model> model, const AdapterConfig& config, std::string device);
// Apply adapters configured in the current config set last time, or set and use new config given as optional `config` argument
- void apply(ov::InferRequest& request, const std::optional<AdapterConfig>& config = std::nullopt);
+ void apply(ov::InferRequest request, const std::optional<AdapterConfig>& config = std::nullopt);
// Returns true if a given name is one of the state names created by this adapter controller for dynamic LoRA
// Helps to distinguish LoRA states from other states (e.g. KV cache state) in the model for a partial state reset.
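
The `apply()` change above takes `ov::InferRequest` by value; the request is a lightweight handle to shared state, so a copy is cheap and temporaries can be passed directly. A rough usage sketch, not taken from this patch, assuming `Adapter` is constructible from a safetensors path and `AdapterConfig` from an `Adapter`, as declared in this header:

```
// Sketch: route LoRA adapter state into an infer request via AdapterController.
#include "openvino/genai/lora_adapter.hpp"
#include "openvino/openvino.hpp"

void run_with_lora(const std::shared_ptr<ov::Model>& model, const std::string& device) {
    ov::genai::Adapter adapter("soulcard.safetensors");   // hypothetical adapter file
    ov::genai::AdapterConfig config(adapter);

    // The controller transforms the model so adapter weights become state variables.
    ov::genai::AdapterController controller(model, config, device);

    ov::Core core;
    ov::CompiledModel compiled = core.compile_model(model, device);
    ov::InferRequest request = compiled.create_infer_request();

    // Apply the configured adapters (or switch to a new AdapterConfig) before inference.
    controller.apply(request, config);
    request.infer();
}
```
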
diff --git a/src/cpp/src/continuous_batching_adapter.hpp b/src/cpp/src/continuous_batching_adapter.hpp
new file mode 100644
index 0000000000..0b0065aa1f
--- /dev/null
+++ b/src/cpp/src/continuous_batching_adapter.hpp
@@ -0,0 +1,171 @@
+
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "llm_pipeline_base.hpp"
+
+#include "openvino/genai/continuous_batching_pipeline.hpp"
+
+namespace ov::genai {
+
+Tokenizer dont_construct() {
+    OPENVINO_THROW("Continuous Batching backend can't be constructed "
+        "from ireq because the model must be transformed");
+}
+
+template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
+ ContinuousBatchingPipeline m_impl;
+public:
+ ContinuousBatchingAdapter(
+ const ov::InferRequest& request,
+ const Tokenizer& tokenizer,
+ OptionalGenerationConfig generation_config
+ ): LLMPipelineImplBase{dont_construct(), GenerationConfig{}},
+ m_impl{std::filesystem::path{}, SchedulerConfig{}, std::string{}} { }
+
+ ContinuousBatchingAdapter(
+ const std::filesystem::path& models_path,
+ const Tokenizer& tokenizer,
+ const SchedulerConfig& scheduler_config,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{
+ models_path,
+ tokenizer,
+ scheduler_config,
+ device,
+ plugin_config} {
+ m_generation_config = m_impl.get_config();
+ }
+
+ ContinuousBatchingAdapter(
+ const std::string& model_str,
+ const ov::Tensor& weights_tensor,
+ const Tokenizer& tokenizer,
+ const SchedulerConfig& scheduler_config,
+ const std::string& device,
+ const ov::AnyMap& plugin_config,
+ const ov::genai::GenerationConfig& generation_config
+ ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{
+ model_str,
+ weights_tensor,
+ tokenizer,
+ scheduler_config,
+ device,
+ plugin_config,
+ generation_config} {}
+
+ ContinuousBatchingAdapter(
+ const std::filesystem::path& models_path,
+ const SchedulerConfig& scheduler_config,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ ): LLMPipelineImplBase{Tokenizer(models_path), GenerationConfig()}, m_impl{
+ models_path,
+ m_tokenizer,
+ scheduler_config,
+ device,
+ plugin_config} {
+ m_generation_config = m_impl.get_config();
+ }
+
+ DecodedResults generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override {
+ std::vector<std::string> prompts = std::visit(overloaded{
+ [](const std::string& prompt) {
+ return std::vector{prompt};
+ },
+ [](std::vector<std::string>& prompts) {
+ return prompts;
+ }
+ }, inputs);
+ const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
+ // -1 == config.eos_token_id and config.validate() are handled in m_impl.
+ std::vector<GenerationResult> generated = m_impl.generate(
+ prompts,
+ std::vector<GenerationConfig>{prompts.size(), config},
+ streamer
+ );
+ std::vector<std::string> plain_replies;
+ std::vector<float> plain_scores;
+ for (GenerationResult& res : generated) {
+ OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
+ std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
+ std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
+ }
+ return {std::move(plain_replies), std::move(plain_scores)};
+ }
+
+ EncodedResults generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override {
+ std::vector<ov::Tensor> input_ids = std::visit(overloaded{
+ [](const ov::Tensor& inp) {
+ size_t batch_size = inp.get_shape().at(0);
+ if (1 == batch_size) {
+ return std::vector{inp};
+ }
+ std::vector<ov::Tensor> input_ids;
+ input_ids.reserve(batch_size);
+ size_t max_len = inp.get_shape().at(1);
+ const int64_t* const source = inp.data<int64_t>();
+ for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+ input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
+ int64_t* destination = input_ids.back().data<int64_t>();
+ std::copy_n(source + batch_id * max_len, max_len, destination);
+ }
+ return input_ids;
+ },
+ [](const TokenizedInputs& inp) {
+ size_t batch_size = inp.input_ids.get_shape().at(0);
+ std::vector<ov::Tensor> input_ids;
+ input_ids.reserve(batch_size);
+ size_t max_len = inp.input_ids.get_shape().at(1);
+ const int64_t* const source = inp.input_ids.data<int64_t>();
+ const int64_t* const attention_mask = inp.attention_mask.data<int64_t>();
+ for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+ input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
+ int64_t* destination = input_ids.back().data<int64_t>();
+ size_t copy_count = 0;
+ for (size_t idx = 0; idx < max_len; ++idx) {
+ if (1 == attention_mask[batch_id * max_len + idx]) {
+ destination[copy_count++] = source[batch_id * max_len + idx];
+ }
+ }
+ input_ids.back().set_shape({1, copy_count});
+ }
+ return input_ids;
+ }
+ }, inputs);
+
+ const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
+ // -1 == config.eos_token_id and config.validate() are handled in m_impl.
+ std::vector<EncodedGenerationResult> generated = m_impl.generate(input_ids, std::vector<GenerationConfig>{input_ids.size(), config}, streamer);
+ std::vector<std::vector<int64_t>> plain_tokens;
+ std::vector<float> plain_scores;
+ for (EncodedGenerationResult& res : generated) {
+ OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
+ std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
+ std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
+ }
+ return {std::move(plain_tokens), std::move(plain_scores)};
+ }
+
+ void start_chat(const std::string& system_message) override {
+ m_impl.start_chat();
+ };
+
+ void finish_chat() override {
+ m_impl.finish_chat();
+ };
+};
+
+} // namespace ov::genai
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 52ec6a8302..44bfaf7f21 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -5,6 +5,7 @@
#include "continuous_batching_impl.hpp"
#include "utils.hpp"
#include "utils/paged_attention_transformations.hpp"
+#include "lora_helper.hpp"
namespace ov::genai {
template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
@@ -17,38 +18,45 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
- bool is_validation_mode_enabled
- ) {
+ bool is_validation_mode_enabled) {
m_tokenizer = tokenizer;
m_generation_config = generation_config;
m_is_validation_mode_enabled = is_validation_mode_enabled;
- ov::Core core;
-
- auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
- core.set_property(core_properties);
-
- DeviceConfig device_config(core, scheduler_config, device, compile_properties);
+ ov::Core core = utils::singleton_core();
+ DeviceConfig device_config(core, scheduler_config, device, properties);
bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control);
+ utils::apply_gather_before_matmul_transformation(model);
- init(model, scheduler_config, compile_properties, device_config, core);
+ initialize_pipeline(model, scheduler_config, properties, device_config, core);
}
void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests() {
std::lock_guard lock{m_awaiting_requests_mutex};
m_requests.insert(m_requests.end(), m_awaiting_requests.begin(), m_awaiting_requests.end());
m_awaiting_requests.clear();
+ m_pipeline_metrics.requests = m_requests.size();
}
-void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline(
std::shared_ptr model,
const SchedulerConfig& scheduler_config,
const ov::AnyMap& properties,
const DeviceConfig& device_config,
ov::Core& core) {
- auto compiled_model = core.compile_model(model, device_config.get_device(), properties);
+ ov::CompiledModel compiled_model;
+
+ // apply LoRA
+ if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) {
+ m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
+ m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device_config.get_device()); // TODO: Make the prefix name configurable
+ compiled_model = core.compile_model(model, device_config.get_device(), *filtered_properties);
+ } else {
+ compiled_model = core.compile_model(model, device_config.get_device(), properties);
+ }
+
ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention");
ov::InferRequest infer_request = compiled_model.create_infer_request();
@@ -68,9 +76,12 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
can_use_partial_preemption = false;
}
m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption);
- // and finally create model runner
+
+ // model runner
bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
+
+ // sampler
m_sampler = std::make_shared<Sampler>(m_tokenizer);
m_sampler->set_seed(m_generation_config.rng_seed);
@@ -91,9 +102,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids,
sampling_params,
- m_scheduler->get_block_size(),
- m_scheduler->get_config().enable_prefix_caching);
- sequence_group->set_sequence_group_ptr(sequence_group);
+ m_scheduler->get_block_size());
+
if (m_scheduler->get_config().enable_prefix_caching) {
m_scheduler->restore_cached_blocks(sequence_group);
}
@@ -102,6 +112,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
std::lock_guard lock{m_awaiting_requests_mutex};
m_awaiting_requests.push_back(sequence_group);
}
+
return std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sampling_params);
};
@@ -113,6 +124,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
timer.start();
ov::Tensor input_ids = m_tokenizer.encode(prompt).input_ids;
timer.end();
+
return add_request(request_id, input_ids, sampling_params);
}
@@ -127,24 +139,26 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
_pull_awaiting_requests();
- m_pipeline_metrics.requests = m_requests.size();
Scheduler::Output scheduler_output;
{
- static ManualTimer timer("scheduling");
- timer.start();
- m_scheduler->clean_empty_blocks(m_requests);
+ static ManualTimer scheduling_timer("scheduling");
+ scheduling_timer.start();
scheduler_output = m_scheduler->schedule(m_requests);
+ scheduling_timer.end();
+
m_pipeline_metrics.scheduled_requests = scheduler_output.m_scheduled_sequence_groups_ids.size();
m_pipeline_metrics.cache_usage = scheduler_output.m_cache_usage;
- m_pipeline_metrics.max_cache_usage =
- std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage);
+ m_pipeline_metrics.max_cache_usage = std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage);
_register_step_cache_usage(scheduler_output.m_cache_usage);
m_pipeline_metrics.avg_cache_usage = _get_current_running_average_cache_usage();
+
+ static ManualTimer copy_blocks_timer("scheduling");
+ copy_blocks_timer.start();
m_cache_manager->copy_blocks(scheduler_output.m_block_copy_map);
- timer.end();
+ copy_blocks_timer.end();
}
- // if no tokens were scheduled, we are out of memory
+ // if no tokens were scheduled, we are out of memory => free all requests and return
if (scheduler_output.m_total_num_scheduled_tokens == 0) {
for (size_t i = 0; i < m_requests.size(); ++i) {
SequenceGroup::Ptr sequence_group = m_requests[i];
@@ -166,15 +180,14 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
}
#ifdef DEBUG_CACHE_STATE_DUMP
-
CacheStateDumper dumper(CacheStateDumper::get_run_id_for_generation_step(step_count, "before_eviction"));
dumper.dump_cache_state(*m_scheduler, m_requests, step_count);
#endif
- const auto& sched_config = m_scheduler->get_config();
// evict unimportant blocks from KV cache, if requested
+ const auto& sched_config = m_scheduler->get_config();
if (sched_config.use_cache_eviction) {
- maybe_evict_cache_blocks(sched_config);
+ _maybe_evict_cache_blocks(sched_config);
}
#ifdef DEBUG_CACHE_STATE_DUMP
@@ -183,6 +196,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
step_count++;
#endif
+ // process generation_config.echo parameter
_fill_prompt_log_probs(m_requests, logits);
SamplerOutput sampler_output;
@@ -195,8 +209,8 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
// process sampler_output (e.g. fork or drop sequences from BlockScheduler)
{
- static ManualTimer timer("fork / free sequence");
- timer.start();
+ static ManualTimer free_fork_timer("fork / free sequence");
+ free_fork_timer.start();
for (const auto& pair : sampler_output.m_forked_sequences) {
uint64_t parent_id = pair.first;
@@ -208,35 +222,49 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
for (auto seq_id : sampler_output.m_dropped_sequences)
m_scheduler->free_sequence(seq_id);
- timer.end();
+ free_fork_timer.end();
}
// notify requests dropped by handle
{
- static ManualTimer timer("notify requests dropped by handle");
- timer.start();
+ static ManualTimer report_tokens_timer("notify requests dropped by handle");
+ report_tokens_timer.start();
_notify_requests_dropped_by_handle();
- timer.end();
+ report_tokens_timer.end();
}
// free non running requests for current step
{
- static ManualTimer timer("free non running requests");
- timer.start();
+ static ManualTimer clean_up_requests_timer("free non running requests");
+ clean_up_requests_timer.start();
_free_non_running_requests();
- timer.end();
+ clean_up_requests_timer.end();
}
step_timer.end();
}
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::set_adapters(const std::optional<AdapterConfig>& adapters) {
+ if (m_adapter_controller) {
+ m_adapter_controller->apply(m_model_runner->get_infer_request(), adapters);
+ }
+}
+
std::vector<EncodedGenerationResult>
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
OPENVINO_ASSERT(input_ids.size() == sampling_params.size());
+
+ // checks that all requests have the same LoRA adapters property value
+ for (size_t i = 1; i < sampling_params.size(); ++i) {
+ OPENVINO_ASSERT(sampling_params[i - 1].adapters == sampling_params[i].adapters,
+ "LoRA adapters value must be the same for all requests");
+ }
+ set_adapters(sampling_params[0].adapters);
+
const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{
[](std::monostate) -> std::shared_ptr<StreamerBase> {
return nullptr;
@@ -320,7 +348,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<
- const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
+ const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_prob();
const auto & generated_ids = sequence->get_generated_ids();
if (sampling_params.echo)
@@ -375,7 +403,7 @@ float ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_current_running_a
return std::accumulate(m_previous_step_cache_usages.begin(), m_previous_step_cache_usages.end(), 0.0) / m_previous_step_cache_usages.size();
}
-void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_blocks(const SchedulerConfig& sched_config) {
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::_maybe_evict_cache_blocks(const SchedulerConfig& sched_config) {
std::unordered_map seq_group_to_num_blocks_evicted_map;
auto sequence_attention_scores = m_model_runner->get_last_attention_scores();
for (auto& seq_id_and_attention_scores : sequence_attention_scores) {
@@ -417,7 +445,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
const float * logits_data = logits.data<float>();
ov::Shape logits_shape = logits.get_shape();
OPENVINO_ASSERT(logits_shape.size() == 3);
- size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2];
+ size_t vocab_size = logits_shape[2];
for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
// requests not scheduled, in decoding phase or not echoing are not processed
@@ -427,18 +455,17 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
size_t num_running_sequences = sequence_group->num_running_seqs();
OPENVINO_ASSERT(num_running_sequences == 1);
- size_t actual_seq_len = sequence_group->get_num_scheduled_tokens();
- size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len);
+ size_t output_seq_len = sequence_group->get_output_seq_len();
const float * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens();
- OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len());
+ OPENVINO_ASSERT(num_prompt_tokens_processed + output_seq_len <= sequence_group->get_prompt_len());
// if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion)
// otherwise we include it as it will be used in the next part of the prompt
int exclude_last_logprob = 1;
- if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len())
+ if (num_prompt_tokens_processed + output_seq_len < sequence_group->get_prompt_len())
exclude_last_logprob = 0;
// if we start processing the prompt we add "fake" log prob for the first position (begin of sequence)
@@ -446,7 +473,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
sequence_group->append_prompt_log_prob(1.0);
for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1;
- token_logits_offset < actual_seq_len - exclude_last_logprob;
+ token_logits_offset < output_seq_len - exclude_last_logprob;
token_logits_offset++, token_id_offset++) {
const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size);
@@ -471,7 +498,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
sequence_group->append_prompt_log_prob(token_logit - max_value - log_sum);
}
- currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;
+ currently_processed_tokens += output_seq_len * num_running_sequences;
// For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
sequence_group->notify_handle_echo_only();
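The prompt log-prob loop above evaluates a numerically stable log-softmax per position: log P(token) = token_logit - max_value - log_sum, where log_sum is the log of the max-shifted exponential sum over the vocabulary. A minimal self-contained sketch of that formula (hypothetical helper, names are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Stable log-softmax for one prompt position: subtracting the max keeps exp() in range.
    float prompt_log_prob(const std::vector<float>& logits, std::size_t token_id) {
        const float max_value = *std::max_element(logits.begin(), logits.end());
        float sum_exp = 0.0f;
        for (float logit : logits)
            sum_exp += std::exp(logit - max_value);
        const float log_sum = std::log(sum_exp);
        return logits[token_id] - max_value - log_sum;
    }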
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
index 8da05c6dfa..d319147f2c 100644
--- a/src/cpp/src/continuous_batching_impl.hpp
+++ b/src/cpp/src/continuous_batching_impl.hpp
@@ -3,16 +3,19 @@
#pragma once
-#include "continuous_batching_impl_interface.hpp"
-#include "openvino/genai/continuous_batching_pipeline.hpp"
+#include "icontinuous_batching.hpp"
+
+#include "openvino/genai/lora_adapter.hpp"
#include "cache_eviction.hpp"
namespace ov::genai {
-class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::ImplInterface {
+
+class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::IContinuousBatchingPipeline {
protected:
 std::shared_ptr<Scheduler> m_scheduler;
 std::shared_ptr<CacheManager> m_cache_manager;
 std::shared_ptr<ModelRunner> m_model_runner;
+ std::optional<AdapterController> m_adapter_controller;
 std::shared_ptr<Sampler> m_sampler;
// current requests to process
@@ -26,7 +29,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
static const size_t AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS = 1000;
 std::deque<float> m_previous_step_cache_usages;
-
+
// flag to enable validation mode for sampler
bool m_is_validation_mode_enabled = false;
@@ -37,21 +40,41 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
// used by tests only
ContinuousBatchingImpl() = default;
+ void initialize_pipeline(std::shared_ptr<ov::Model> model,
+ const SchedulerConfig& scheduler_config,
+ const ov::AnyMap& plugin_config,
+ const DeviceConfig& device_config,
+ ov::Core& core);
+
+ /**
+ * Pulls requests from awaiting queue to running queue
+ * Should be called within each call of step()
+ */
+ virtual void _pull_awaiting_requests();
+
+ /**
+ * Releases non-running (finished, dropped or OOM) requests from running queue
+ */
void _free_non_running_requests();
+
+ /**
+ * Notifies dropped requests by pushing an empty output
+ */
void _notify_requests_dropped_by_handle();
- void _register_step_cache_usage(float step_cache_usage);
- float _get_current_running_average_cache_usage() const;
- void maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
- void init(std::shared_ptr<ov::Model> model,
- const SchedulerConfig& scheduler_config,
- const ov::AnyMap& plugin_config,
- const DeviceConfig& device_config,
- ov::Core& core);
+ /**
+ * Handles 'echo' generation parameter
+ */
+ void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
- virtual void _pull_awaiting_requests();
+ /**
+ * Performs KV cache eviction if enabled / required
+ */
+ void _maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
+
+ void _register_step_cache_usage(float step_cache_usage);
+ float _get_current_running_average_cache_usage() const;
- void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
public:
ContinuousBatchingImpl(const std::shared_ptr& model,
const Tokenizer& tokenizer,
@@ -64,6 +87,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) override;
+
GenerationHandle add_request(uint64_t request_id,
const std::string& prompt,
ov::genai::GenerationConfig sampling_params) override;
@@ -76,5 +100,11 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
 generate(const std::vector<ov::Tensor>& input_ids,
 const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) override;
+
+ /**
+ * Updates LoRA adapters for current generation call
+ */
+ void set_adapters(const std::optional<AdapterConfig>& adapters);
};
-}
\ No newline at end of file
+
+} // namespace ov::genai
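The new m_adapter_controller member and set_adapters() hook above wire LoRA adapters into the continuous batching backend. A minimal usage sketch, assuming the public ContinuousBatchingPipeline constructor shown later in this diff and the Adapter/AdapterConfig/adapters() helpers from lora_adapter.hpp; the paths and prompt are placeholders:

    #include "openvino/genai/continuous_batching_pipeline.hpp"
    #include "openvino/genai/lora_adapter.hpp"

    int main() {
        ov::genai::Adapter adapter("adapter_model.safetensors");  // placeholder path
        ov::genai::SchedulerConfig scheduler_config;
        ov::genai::ContinuousBatchingPipeline pipe(
            "model_dir", scheduler_config, "CPU",
            ov::AnyMap{ov::genai::adapters(adapter)});  // register the adapter at load time

        ov::genai::GenerationConfig config = pipe.get_config();
        config.max_new_tokens = 32;
        config.adapters = ov::genai::AdapterConfig(adapter);  // picked up via set_adapters() per generate()
        auto results = pipe.generate({"What is OpenVINO?"}, {config});
    }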
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 148eb2fa9f..c1c0677ff3 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -47,19 +47,19 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
auto properties_without_draft_model = properties;
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
-
- std::filesystem::path openvino_model_name = "openvino_model.xml";
- auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+
+ auto model = utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, properties);
auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
auto generation_config = utils::from_config_json_if_exists(models_path);
+
if (is_prompt_lookup_enabled) {
- OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+ OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
- } else if (draft_model_desr.model == nullptr) {
- m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
- } else {
+ } else if (draft_model_desr.model != nullptr) {
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ } else {
+ m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
}
}
@@ -73,17 +73,17 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
std::filesystem::path openvino_model_name = "openvino_model.xml";
- auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+ auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, properties_without_draft_model);
auto generation_config = utils::from_config_json_if_exists(models_path);
if (is_prompt_lookup_enabled) {
- OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+ OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
- } else if (draft_model_desr.model == nullptr) {
- m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
- } else {
+ } else if (draft_model_desr.model != nullptr) {
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ } else {
+ m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
}
}
@@ -101,13 +101,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto model = utils::singleton_core().read_model(model_str, weights_tensor);
if (is_prompt_lookup_enabled) {
- OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+ OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
- } else if (draft_model_desr.model == nullptr) {
- m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
- } else {
+ } else if (draft_model_desr.model != nullptr) {
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
- m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ m_impl = std::make_shared(main_model_descr, draft_model_desr);
+ } else {
+ m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
}
}
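The reordered branches above choose the backend from the supplied properties: a draft model selects the speculative decoding implementation, a prompt-lookup flag selects the prompt lookup implementation, and plain continuous batching is the fallback. A minimal sketch of how a caller reaches the speculative branch, assuming ov::genai::draft_model() (defined later in this diff) is declared in llm_pipeline.hpp; directories are placeholders:

    #include "openvino/genai/continuous_batching_pipeline.hpp"
    #include "openvino/genai/llm_pipeline.hpp"  // assumed home of ov::genai::draft_model()

    int main() {
        ov::genai::SchedulerConfig scheduler_config;

        // No draft model and no prompt-lookup flag: the plain continuous batching branch.
        ov::genai::ContinuousBatchingPipeline cb("main_model_dir", scheduler_config, "CPU");

        // A draft model in the properties routes construction to the speculative decoding branch.
        ov::genai::ContinuousBatchingPipeline speculative(
            "main_model_dir", scheduler_config, "CPU",
            ov::AnyMap{ov::genai::draft_model("draft_model_dir", "CPU")});
    }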
diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index cc2e21b9a1..fee6c7abd1 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -117,22 +117,22 @@ class DeviceConfig {
}
for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
- m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
- ov::Dimension(m_num_kv_heads[layer_id]),
- ov::Dimension(m_block_size),
- ov::Dimension(m_head_size)});
-
m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
ov::Dimension(m_num_kv_heads[layer_id]),
ov::Dimension(m_block_size),
ov::Dimension(m_head_size)});
- if (m_device.find("GPU") != std::string::npos) {
+ if (m_device.find("GPU") == std::string::npos) {
+ m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
+ ov::Dimension(m_num_kv_heads[layer_id]),
+ ov::Dimension(m_block_size),
+ ov::Dimension(m_head_size)});
+ } else if (m_device.find("GPU") != std::string::npos) {
// Update key shape, as the key's shape is different from the value's shape
m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
- ov::Dimension(m_num_kv_heads[layer_id]),
- ov::Dimension(m_head_size),
- ov::Dimension(m_block_size)});
+ ov::Dimension(m_num_kv_heads[layer_id]),
+ ov::Dimension(m_head_size),
+ ov::Dimension(m_block_size)});
}
}
}
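The reordered logic above keeps the value cache shape identical on all devices while giving GPU a transposed key cache layout (head_size before block_size). A small sketch of the resulting per-layer dimensions, assuming the leading dynamic dimension is the number of allocated KV blocks; the helper is illustrative only:

    #include <array>
    #include <cstddef>
    #include <string>

    // Per-layer key cache dims; the value cache is always
    // {num_blocks, num_kv_heads, block_size, head_size} on every device.
    std::array<std::size_t, 4> key_cache_dims(const std::string& device,
                                              std::size_t num_blocks,
                                              std::size_t num_kv_heads,
                                              std::size_t block_size,
                                              std::size_t head_size) {
        if (device.find("GPU") != std::string::npos)
            return {num_blocks, num_kv_heads, head_size, block_size};  // GPU: last two dims swapped
        return {num_blocks, num_kv_heads, block_size, head_size};      // CPU and others
    }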
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 59be603fd9..25402e22e7 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -230,9 +230,9 @@ void GenerationConfig::validate() const {
OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature);
} else {
// parameters requiring multinomial
- OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
- OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
- OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
+ // OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
+ // OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
+ // OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
}
if (is_beam_search()) {
@@ -252,10 +252,10 @@ void GenerationConfig::validate() const {
}
} else {
// parameters requiring beam search
- OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
- OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
- OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
- OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
+ // OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
+ // OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
+ // OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
+ // OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
}
// assistant generation
diff --git a/src/cpp/src/continuous_batching_impl_interface.cpp b/src/cpp/src/icontinuous_batching.cpp
similarity index 79%
rename from src/cpp/src/continuous_batching_impl_interface.cpp
rename to src/cpp/src/icontinuous_batching.cpp
index 10fc102aa0..e32616b0aa 100644
--- a/src/cpp/src/continuous_batching_impl_interface.cpp
+++ b/src/cpp/src/icontinuous_batching.cpp
@@ -1,40 +1,41 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
-#include "continuous_batching_impl_interface.hpp"
+#include "icontinuous_batching.hpp"
namespace ov::genai {
-GenerationConfig ContinuousBatchingPipeline::ImplInterface::get_config() const {
+GenerationConfig ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_config() const {
return m_generation_config;
}
-PipelineMetrics ContinuousBatchingPipeline::ImplInterface::get_metrics() const {
+PipelineMetrics ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_metrics() const {
return m_pipeline_metrics;
}
-Tokenizer ContinuousBatchingPipeline::ImplInterface::get_tokenizer() {
+Tokenizer ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_tokenizer() {
return m_tokenizer;
}
-void ContinuousBatchingPipeline::ImplInterface::start_chat(const std::string& system_message) {
+void ContinuousBatchingPipeline::IContinuousBatchingPipeline::start_chat(const std::string& system_message) {
if (!system_message.empty()) {
m_history.push_back({{"role", "system"}, {"content", system_message}});
}
m_is_chat_conversation = true;
};
-void ContinuousBatchingPipeline::ImplInterface::finish_chat() {
+void ContinuousBatchingPipeline::IContinuousBatchingPipeline::finish_chat() {
m_is_chat_conversation = false;
m_history.clear();
};
 std::vector<GenerationResult>
-ContinuousBatchingPipeline::ImplInterface::generate(
+ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
 const std::vector<std::string>& prompts,
 std::vector<GenerationConfig> sampling_params,
 const StreamerVariant& streamer) {
 std::vector<ov::Tensor> input_ids;
+
static ManualTimer timer("tokenize");
if (m_is_chat_conversation) {
OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
@@ -47,13 +48,15 @@ ContinuousBatchingPipeline::ImplInterface::generate(
timer.end();
} else {
input_ids.reserve(prompts.size());
+ timer.start();
for (const std::string& prompt : prompts) {
- timer.start();
input_ids.push_back(m_tokenizer.encode(prompt).input_ids);
- timer.end();
}
+ timer.end();
}
+
 std::vector<EncodedGenerationResult> encoded = generate(input_ids, sampling_params, streamer);
+
 std::vector<GenerationResult> decoded;
decoded.reserve(encoded.size());
for (EncodedGenerationResult& res : encoded) {
@@ -65,6 +68,7 @@ ContinuousBatchingPipeline::ImplInterface::generate(
m_history.push_back({{"role", "assistant"}, {"content", generated.back()}});
}
}
+
decoded.push_back(GenerationResult{
res.m_request_id,
std::move(generated),
@@ -72,6 +76,7 @@ ContinuousBatchingPipeline::ImplInterface::generate(
res.m_status
});
}
+
return decoded;
}
-}
\ No newline at end of file
+}
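The base-class generate() above tokenizes the prompts (now timing the whole batch with a single start/end pair), delegates to the encoded overload, decodes the results, and in chat mode appends the assistant reply to the accumulated history. A minimal end-to-end sketch of that chat flow through the public pipeline; the model directory and prompts are placeholders:

    #include "openvino/genai/continuous_batching_pipeline.hpp"

    int main() {
        ov::genai::SchedulerConfig scheduler_config;
        ov::genai::ContinuousBatchingPipeline pipe("model_dir", scheduler_config, "CPU");
        ov::genai::GenerationConfig config = pipe.get_config();
        config.max_new_tokens = 64;

        pipe.start_chat("You are a helpful assistant.");
        auto first  = pipe.generate({"What is continuous batching?"}, {config});
        auto second = pipe.generate({"How does it differ from static batching?"}, {config});  // history is reused
        pipe.finish_chat();  // clears the accumulated chat history
    }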
diff --git a/src/cpp/src/continuous_batching_impl_interface.hpp b/src/cpp/src/icontinuous_batching.hpp
similarity index 72%
rename from src/cpp/src/continuous_batching_impl_interface.hpp
rename to src/cpp/src/icontinuous_batching.hpp
index 909383c98a..12030f06f7 100644
--- a/src/cpp/src/continuous_batching_impl_interface.hpp
+++ b/src/cpp/src/icontinuous_batching.hpp
@@ -12,7 +12,10 @@
namespace ov::genai {
-class ContinuousBatchingPipeline::ImplInterface {
+/**
+ * Base interface for all continuous batching based pipelines
+ */
+class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
protected:
Tokenizer m_tokenizer;
@@ -35,6 +38,7 @@ class ContinuousBatchingPipeline::ImplInterface {
// std::cout << std::endl;
}
} m_perf;
+
bool m_is_chat_conversation = false;
ChatHistory m_history;
@@ -43,27 +47,57 @@ class ContinuousBatchingPipeline::ImplInterface {
PipelineMetrics get_metrics() const;
ov::genai::Tokenizer get_tokenizer();
+ /**
+ * Adds a request to the awaiting queue using encoded inputs
+ */
virtual GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) = 0;
+
+ /**
+ * Adds a request to the awaiting queue based on a string input
+ * This step also tokenizes the prompt
+ */
virtual GenerationHandle add_request(uint64_t request_id,
const std::string& prompt,
ov::genai::GenerationConfig sampling_params) = 0;
+ /**
+ * Checks whether the pipeline (server) still has unfinished requests, i.e. whether step() should keep being called in a loop
+ */
virtual bool has_non_finished_requests() = 0;
+ /**
+ * Performs a single inference step over all running requests, pulling awaiting requests into the running queue first
+ */
virtual void step() = 0;
+ /**
+ * Performs monolithic generation based on encoded prompts
+ */
 virtual std::vector<EncodedGenerationResult>
 generate(const std::vector<ov::Tensor>& input_ids,
 const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) = 0;
+
+ /**
+ * Performs monolithic generation based on text prompts
+ */
 std::vector<GenerationResult>
 generate(const std::vector<std::string>& prompts,
 std::vector<GenerationConfig> sampling_params,
const StreamerVariant& streamer);
+ /**
+ * Starts chat with a given system prompt
+ *
+ * In a chat scenario, prompts passed to the `generate` method are accumulated inside the pipeline until `finish_chat` is called
+ */
void start_chat(const std::string& system_message);
+
+ /**
+ * Ends chat
+ */
void finish_chat();
};
}
\ No newline at end of file
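The interface comments above describe the server-style contract: add_request() enqueues work, step() advances all running requests (pulling awaiting ones first), and has_non_finished_requests() tells the caller when to stop looping. A minimal sketch of that loop through the public pipeline, assuming GenerationHandle exposes read_all() for collecting the finished output:

    #include "openvino/genai/continuous_batching_pipeline.hpp"

    int main() {
        ov::genai::SchedulerConfig scheduler_config;
        ov::genai::ContinuousBatchingPipeline pipe("model_dir", scheduler_config, "CPU");
        ov::genai::GenerationConfig config = pipe.get_config();
        config.max_new_tokens = 32;

        // add_request() only enqueues; step() pulls awaiting requests and runs one inference step.
        ov::genai::GenerationHandle handle = pipe.add_request(/*request_id=*/0, "Hello!", config);

        while (pipe.has_non_finished_requests()) {
            pipe.step();
        }
        auto outputs = handle->read_all();  // assumed accessor for the finished generations
    }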
diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
index 4ffab62c53..a5608db80f 100644
--- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp
+++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
@@ -91,8 +91,7 @@ AutoencoderKL::Config::Config(const std::filesystem::path& config_path) {
AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path)
: m_config(vae_decoder_path / "config.json") {
- ov::Core core = utils::singleton_core();
- m_decoder_model = core.read_model((vae_decoder_path / "openvino_model.xml").string());
+ m_decoder_model = utils::singleton_core().read_model(vae_decoder_path / "openvino_model.xml");
// apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model
merge_vae_image_post_processing();
}
@@ -100,8 +99,7 @@ AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path)
AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_encoder_path,
const std::filesystem::path& vae_decoder_path)
: AutoencoderKL(vae_decoder_path) {
- ov::Core core = utils::singleton_core();
- m_encoder_model = core.read_model((vae_encoder_path / "openvino_model.xml").string());
+ m_encoder_model = utils::singleton_core().read_model(vae_encoder_path / "openvino_model.xml");
}
AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path,
@@ -131,8 +129,7 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
const Tensor& vae_decoder_weights,
const Config& vae_decoder_config)
: m_config(vae_decoder_config) {
- ov::Core core = utils::singleton_core();
- m_decoder_model = core.read_model(vae_decoder_model, vae_decoder_weights);
+ m_decoder_model = utils::singleton_core().read_model(vae_decoder_model, vae_decoder_weights);
// apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model
merge_vae_image_post_processing();
}
@@ -143,8 +140,7 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model,
const Tensor& vae_decoder_weights,
const Config& vae_decoder_config)
: AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) {
- ov::Core core = utils::singleton_core();
- m_encoder_model = core.read_model(vae_encoder_model, vae_encoder_weights);
+ m_encoder_model = utils::singleton_core().read_model(vae_encoder_model, vae_encoder_weights);
}
AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index a119483417..ece88572f9 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -37,8 +37,7 @@ CLIPTextModel::Config::Config(const std::filesystem::path& config_path) {
CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir) :
m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
m_config(root_dir / "config.json") {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
}
CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir,
@@ -53,8 +52,7 @@ CLIPTextModel::CLIPTextModel(const std::string& model,
const Config& config,
const Tokenizer& clip_tokenizer) :
m_clip_tokenizer(clip_tokenizer), m_config(config) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
CLIPTextModel::CLIPTextModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 685c1f6c0e..e695c763cb 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -28,8 +28,7 @@ CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_
CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir) :
m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
m_config(root_dir / "config.json") {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
}
CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
@@ -44,8 +43,7 @@ CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& mode
const Config& config,
const Tokenizer& clip_tokenizer) :
m_clip_tokenizer(clip_tokenizer), m_config(config) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
index 285ea197e7..71193a38e7 100644
--- a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
@@ -26,7 +26,7 @@ FluxTransformer2DModel::Config::Config(const std::filesystem::path& config_path)
FluxTransformer2DModel::FluxTransformer2DModel(const std::filesystem::path& root_dir)
: m_config(root_dir / "config.json") {
- m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
m_vae_scale_factor = ov::genai::get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
}
@@ -42,8 +42,7 @@ FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
const Config& config,
const size_t vae_scale_factor) :
m_config(config), m_vae_scale_factor(vae_scale_factor) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
index b6f74acc51..69b0e6dcff 100644
--- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
@@ -28,7 +28,7 @@ SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path)
SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir)
: m_config(root_dir / "config.json") {
- m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
}
@@ -44,8 +44,7 @@ SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
const Config& config,
const size_t vae_scale_factor) :
m_config(config), m_vae_scale_factor(vae_scale_factor) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index bb133c3aac..ef41898cc3 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -16,8 +16,7 @@ std::filesystem::path get_tokenizer_path_by_text_encoder(const std::filesystem::
T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir) :
m_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
}
T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir,
@@ -31,8 +30,7 @@ T5EncoderModel::T5EncoderModel(const std::string& model,
const Tensor& weights,
const Tokenizer& tokenizer) :
m_tokenizer(tokenizer) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
T5EncoderModel::T5EncoderModel(const std::string& model,
@@ -60,9 +58,7 @@ T5EncoderModel& T5EncoderModel::reshape(int batch_size, int max_sequence_length)
T5EncoderModel& T5EncoderModel::compile(const std::string& device, const ov::AnyMap& properties) {
OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
- ov::Core core = utils::singleton_core();
- ov::CompiledModel compiled_model;
- compiled_model = core.compile_model(m_model, device, properties);
+ ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties);
ov::genai::utils::print_compiled_model_properties(compiled_model, "T5 encoder model");
m_request = compiled_model.create_infer_request();
// release the original model
diff --git a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
index 40d0a6125d..fd3e97314d 100644
--- a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
+++ b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
@@ -30,8 +30,7 @@ UNet2DConditionModel::Config::Config(const std::filesystem::path& config_path) {
UNet2DConditionModel::UNet2DConditionModel(const std::filesystem::path& root_dir) :
m_config(root_dir / "config.json") {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model((root_dir / "openvino_model.xml").string());
+ m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
}
@@ -47,8 +46,7 @@ UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
const Config& config,
const size_t vae_scale_factor) :
m_config(config), m_vae_scale_factor(vae_scale_factor) {
- ov::Core core = utils::singleton_core();
- m_model = core.read_model(model, weights);
+ m_model = utils::singleton_core().read_model(model, weights);
}
UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index 7db7ca9451..2dc1b9ef0b 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -10,13 +10,10 @@
namespace ov {
namespace genai {
-
class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
public:
virtual void compile(std::shared_ptr model, const std::string& device, const ov::AnyMap& properties) override {
- ov::Core core = utils::singleton_core();
-
- ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
+ ov::CompiledModel compiled_model = utils::singleton_core().compile_model(model, device, properties);
ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model");
m_request = compiled_model.create_infer_request();
}
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3e378e78cf..11efed8b32 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -1,475 +1,48 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
-#include
#include
-#include
-#include
+
#include
-#include
-#include "openvino/genai/continuous_batching_pipeline.hpp"
-#include "openvino/genai/generation_config.hpp"
+
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"
-#include "llm_pipeline_base.hpp"
+
#include "llm_pipeline_static.hpp"
-#include "utils.hpp"
-#include "text_callback_streamer.hpp"
-#include "openvino/genai/lora_adapter.hpp"
-#include "lora_helper.hpp"
+#include "llm_pipeline_stateful.hpp"
+#include "continuous_batching_adapter.hpp"
#include "speculative_decoding/speculative_decoding_impl.hpp"
-#include "sampler.hpp"
-#include "lm_encoding.hpp"
namespace ov {
namespace genai {
-class StatefulLLMPipeline final : public LLMPipelineImplBase {
-public:
- ov::InferRequest m_model_runner;
- bool is_chat_conversation = false;
- bool m_trust_encoded_history = true;
- ChatHistory m_history;
- std::string m_templated_chat_history = {};
- std::vector m_tokenized_chat_history;
- ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
- size_t m_kv_cache_seq_length_axis = 2;
- Sampler m_sampler;
- // Tail of previous output in chat mode is missing in KV cache, let's keep it
- std::optional m_last_disappeared_token = std::nullopt;
- // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache
- // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history
- // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
- ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
-
- StatefulLLMPipeline(
- const ov::InferRequest& request,
- const ov::genai::Tokenizer& tokenizer,
- OptionalGenerationConfig generation_config=std::nullopt
- ) : LLMPipelineImplBase(tokenizer),
- m_model_runner(request) {
- GenerationConfig default_config;
- m_generation_config = (generation_config.has_value()) ? *generation_config : default_config;
- }
-
- StatefulLLMPipeline(
- const std::filesystem::path& models_path,
- const ov::genai::Tokenizer& tokenizer,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ) : StatefulLLMPipeline{
- ov::genai::utils::read_model_with_config(models_path, plugin_config),
- tokenizer,
- device,
- plugin_config,
- utils::from_config_json_if_exists(models_path)
- } {}
-
- StatefulLLMPipeline(
- const std::shared_ptr& model,
- const ov::genai::Tokenizer& tokenizer,
- const std::string& device,
- const ov::AnyMap& config,
- const ov::genai::GenerationConfig& generation_config
- ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
- ov::CompiledModel compiled_model;
- auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
- utils::slice_matmul_stateful_model(model);
- m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
-
- if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
- m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
- m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable
- compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config);
- m_model_runner = compiled_model.create_infer_request();
- } else {
- compiled_model = utils::singleton_core().compile_model(model, device, plugin_config);
- m_model_runner = compiled_model.create_infer_request();
- }
- ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
-
- // If eos_token_id was not provided, take value
- if (m_generation_config.eos_token_id == -1)
- m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
-
- m_sampler.set_seed(m_generation_config.rng_seed);
- }
-
- StatefulLLMPipeline(
- const std::filesystem::path& models_path,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ) : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {}
-
- DecodedResults generate(
- StringInputs inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
-
- if (is_chat_conversation)
- OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
- "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");
-
- auto start_time = std::chrono::steady_clock::now();
- GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
- // If eos_token_id was not provided, take value from default m_generation_config
- if (config.eos_token_id == -1)
- config.set_eos_token_id(m_generation_config.eos_token_id);
- config.validate();
-
- TokenizedInputs encoded_input;
-
- if (auto input_vector = std::get_if>(&inputs)) {
- OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
- encoded_input = m_tokenizer.encode(*input_vector);
- } else if (auto input_prompt = std::get_if(&inputs)) {
- std::string& prompt = *input_prompt;
-
- if (is_chat_conversation) {
- // KV cache in model already contains prompts and answers from previous iterations.
- // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
- // token_ids = {, ...}. So if tokenizer applies only to the new prompt,
- // will be inserted on every iteration.
- // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
- // and takes only the difference between them.
- // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but
- // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
-
- m_history.push_back({{"role", "user"}, {"content", prompt}});
- constexpr bool add_generation_prompt = true;
- auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
- // Do not add special tokens in chat scenario to be aligned with HF.
- auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
- auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
-
- // some symbols combinations can be encoded by the tokenizer in different ways
- // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
- // so let's check it out, find the trusted part and use it in on the next step
- size_t trusted_history_length = 0;
- if (!m_tokenized_chat_history.empty()) {
- std::set stop_tokens = config.stop_token_ids;
- trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
- m_trust_encoded_history = trusted_history_length == SIZE_MAX;
- }
-
- if (m_tokenized_chat_history.empty()) {
- encoded_input = new_chat_tokens;
- } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
- // does_kv_cache_need_to_update will be true here if beam search is activated
- // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
- // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
- if (m_kv_history_manager.does_kv_cache_need_to_update()) {
- trusted_history_length = m_kv_history_manager.trusted_history_length;
- } else {
- m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
- // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
- m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
- }
-
- ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
- {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
- new_chat_tokens.input_ids.data() + trusted_history_length);
-
- ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
- std::fill_n(new_attention_mask.data(), new_tensor.get_shape()[1], 1);
-
- encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
- {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
- new_tensor.copy_to(encoded_input.input_ids);
- encoded_input.attention_mask = new_attention_mask;
- m_last_disappeared_token = std::nullopt;
- } else {
- encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
- }
- m_templated_chat_history = new_templated_chat_history;
-
- m_tokenized_chat_history.clear();
- m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
- std::copy_n(new_chat_tokens.input_ids.data(), new_chat_tokens.input_ids.get_size(),
- std::back_inserter(m_tokenized_chat_history));
-
- // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
- } else {
- encoded_input = m_tokenizer.encode(prompt);
- }
- }
-
- auto encode_stop_time = std::chrono::steady_clock::now();
- auto encoded_results = generate(encoded_input, config, streamer);
-
- auto decode_start_time = std::chrono::steady_clock::now();
- DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
- auto decode_stop_time = std::chrono::steady_clock::now();
-
- if (is_chat_conversation) {
- // Tail of chat template is missing in KV cache.
- // Find the tail to concatenate it with the next input prompt.
- auto answer = decoded_results.texts[0];
- m_templated_chat_history.append(answer);
- m_history.push_back({{"role", "assistant"}, {"content", answer}});
- }
-
- // generate_durations
- decoded_results.perf_metrics = encoded_results.perf_metrics;
-
- auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
- auto stop_time = std::chrono::steady_clock::now();
- raw_counters.generate_durations = std::vector();
- raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
- raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
- raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
-
- // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics.
- decoded_results.perf_metrics.m_evaluated = false;
- decoded_results.perf_metrics.evaluate_statistics(start_time);
- return decoded_results;
- }
-
- void reset_kv_state() {
- if(m_adapter_controller) {
- for(auto& state: m_model_runner.query_state()) {
- if(!m_adapter_controller->has_state_name(state.get_name())) {
- state.reset();
- }
- }
- } else {
- m_model_runner.reset_state();
- }
- }
-
- EncodedResults generate(
- const EncodedInputs& inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
-
- if (is_chat_conversation)
- // if chat was run in StringInputs mode, but it was called EncodedInputs generate, last m_history entry will be with assistant role
- OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
- "Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");
-
- auto start_time = std::chrono::steady_clock::now();
- ov::Tensor input_ids;
- ov::Tensor attention_mask;
- if (auto data = std::get_if(&inputs)) {
- input_ids = *data;
- attention_mask = ov::genai::utils::init_attention_mask(input_ids);
- } else if (auto data = std::get_if(&inputs)) {
- input_ids = data->input_ids;
- attention_mask = data->attention_mask;
- }
-
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
- std::copy(input_ids.data(), input_ids.data() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
-
- // Tail of previous output in chat mode is missing in KV cache.
- if (m_last_disappeared_token.has_value()) {
- attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
- input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
- }
-
- GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
-
- // If eos_token_id was not provided, take value from default m_generation_config
- if (config.eos_token_id == -1)
- config.set_eos_token_id(m_generation_config.eos_token_id);
- config.validate();
-
- // Stateful pipeline does not provide logprobs for prompt tokens
- OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
-
- std::shared_ptr streamer_ptr;
- if (auto streamer_obj = std::get_if(&streamer)) {
- streamer_ptr = nullptr;
- } else if (auto streamer_obj = std::get_if>(&streamer)) {
- streamer_ptr = *streamer_obj;
- } else if (auto callback = std::get_if>(&streamer)) {
- streamer_ptr = std::make_shared(m_tokenizer, *callback);
- }
-
- auto batch_size = input_ids.get_shape().at(0);
- OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
- (config.is_greedy_decoding() || config.is_multinomial()),
- "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
-
- auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
- OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
- "either (input_ids, attention_mask, beam_idx) or "
- "(input_ids, attention_mask, position_ids, beam_idx) "
- "but you have '" + std::to_string(num_inputs) + "' inputs");
-
- ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
-
- size_t kv_cache_len = 0;
- ov::Tensor concatenated_attention_mask;
- if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
- OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
- // If history is saved in KV cache, concatenate new attention_mask with the already existing.
- // Between subsequent runs attention_mask should not be modified.
- auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
- auto prompt_len = attention_mask.get_shape()[1];
-
- kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
-
- ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
- auto start_atten_hst = atten_mask_history.data();
-
- std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
- new_atten_mask.data());
- std::copy(attention_mask.data(), attention_mask.data() + prompt_len,
- new_atten_mask.data() + kv_cache_len);
- concatenated_attention_mask = new_atten_mask;
- } else {
- concatenated_attention_mask = attention_mask;
- }
-
- size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
-
- bool position_ids_available = (num_inputs == 4);
- std::optional position_ids = std::nullopt;
- if (position_ids_available) {
- position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
- utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
- }
-
- if(m_adapter_controller) {
- m_adapter_controller->apply(m_model_runner, config.adapters);
- }
-
- if (is_chat_conversation && !m_trust_encoded_history) {
- m_trust_encoded_history = true;
- m_kv_history_manager.reset();
- }
-
- std::vector requests;
- size_t block_size = 1;
- bool enable_prefix_caching = false;
-
- for (size_t request_id = 0; request_id < batch_size; request_id++) {
- SequenceGroup::Ptr sequence_group;
- if (is_chat_conversation) {
- ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
- sequence_group = std::make_shared(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
- } else {
- size_t seq_len = input_ids.get_shape().at(1);
- size_t batch_offset = request_id * seq_len;
- const int64_t* prompt_start = input_ids.data() + batch_offset;
- std::vector tokenized_prompt(prompt_start, prompt_start + seq_len);
-
- sequence_group = std::make_shared(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
- }
-
- sequence_group->set_sequence_group_ptr(sequence_group);
- requests.push_back(sequence_group);
- }
-
- if (m_sampler.get_seed() != config.rng_seed) {
- m_sampler.set_seed(config.rng_seed);
- }
-
- ov::genai::EncodedResults result;
- std::tie(result, m_last_disappeared_token) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
- streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
-
- if (is_chat_conversation) {
- // force remove from kv_cache last answer
- if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
- m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
- m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
- }
-
- std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
- } else {
- reset_kv_state();
- m_last_disappeared_token = std::nullopt;
- }
-
- if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
- std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
-
- auto stop_time = std::chrono::steady_clock::now();
-
- // If is called without tokenization then that stat will not be reported.
- auto& metrics = result.perf_metrics;
- metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
- metrics.load_time = this->m_load_time_ms;
- metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
- metrics.evaluate_statistics(start_time);
- return result;
- }
-
- void start_chat(const std::string& system_message) override {
- is_chat_conversation = true;
- m_trust_encoded_history = true;
- m_kv_history_manager.reset();
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
- m_last_disappeared_token = std::nullopt;
- if (!m_tokenized_chat_history.empty()) {
- reset_kv_state();
- m_history = {};
- m_templated_chat_history = "";
- m_tokenized_chat_history.clear();
- }
- if (system_message.empty())
- return;
-
- m_history.push_back({{"role", "system"}, {"content", system_message}});
- constexpr bool add_generation_prompt = false;
+namespace {
- m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
- }
+/*
+* NPU reads some properties from the config file, but when LLMPipeline is initialized
+* from the model_str and weights_tensor, there are no files.
+* In the latter case the ModelDesc is stored in the properties.
+* This function pops the ModelDesc from the properties and returns a pair of the updated properties and the ModelDesc.
+*/
+std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
+ ov::AnyMap main_properties = properties;
+ ov::genai::static_llm::ModelConfigDesc model_descr;
- void finish_chat() override {
- is_chat_conversation = false;
- m_trust_encoded_history = true;
- m_kv_history_manager.reset();
- m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
- m_last_disappeared_token = std::nullopt;
- if (!m_tokenized_chat_history.empty()) {
- reset_kv_state();
- m_history.clear();
- m_templated_chat_history.clear();
- m_tokenized_chat_history.clear();
+ auto pop_property = [](ov::AnyMap& orig_properties, const std::string& key, auto& value) {
+ if (orig_properties.find(key) != orig_properties.end()) {
+ value = orig_properties.at(key).as<std::decay_t<decltype(value)>>();
+ orig_properties.erase(key);
}
- }
-};
-
-DecodedResults LLMPipeline::generate(
- StringInputs inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
-) {
- return m_pimpl->generate(inputs, generation_config, streamer);
-}
-
-DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) {
- auto config_arg = utils::get_config_from_map(config_map);
- GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
- config.update_generation_config(config_map);
-
- return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map));
-}
-
-EncodedResults LLMPipeline::generate(
- const EncodedInputs& inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
-) {
- return m_pimpl->generate(inputs, generation_config, streamer);
+ };
+ pop_property(main_properties, "name_or_path", model_descr.name_or_path);
+ pop_property(main_properties, "type", model_descr.type);
+ pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
+
+ return {main_properties, model_descr};
}
-EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) {
- auto config_arg = utils::get_config_from_map(config_map);
- GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
- config.update_generation_config(config_map);
+} // namespace
- return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map));
-}
 std::pair<std::string, Any> streamer(StreamerVariant func) {
 if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&func)) {
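For illustration, a minimal sketch of what the new split_model_descr() helper does with a property map; the function is internal to llm_pipeline.cpp, the keys follow the pop_property calls above, and the values are placeholders:

    // Inside llm_pipeline.cpp (anonymous namespace), after the definition above:
    ov::AnyMap properties = {
        {"name_or_path", std::string("my-llm")},   // placeholder model id
        {"type", std::string("llama")},            // placeholder architecture type
        {"num_key_value_heads", 4},
        {"NPU_USE_NPUW", std::string("YES")}       // ordinary plugin option, left untouched
    };
    auto [plugin_properties, model_descr] = split_model_descr(properties);
    // plugin_properties now contains only "NPU_USE_NPUW";
    // model_descr carries name_or_path, type and num_key_value_heads for the static NPU pipeline.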
@@ -491,7 +64,7 @@ std::pair draft_model(
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
std::filesystem::path openvino_model_name = "openvino_model.xml";
- auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+ auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
auto generation_config = utils::from_config_json_if_exists(models_path);
auto tokenizer = ov::genai::Tokenizer(models_path);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
@@ -510,194 +83,7 @@ std::pair draft_model(
return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}
-} // namespace genai
-} // namespace ov
-
-namespace {
-using namespace ov::genai;
-
-template struct overloaded : Ts... {using Ts::operator()...;};
-template overloaded(Ts...) -> overloaded;
-
-Tokenizer dont_construct() {
- OPENVINO_THROW("Continuous Batching backend can't be constructed"
- "from ireq because the model must be transformed");
-}
-
-class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
-public:
- ContinuousBatchingPipeline m_impl;
-
- ContinuousBatchingAdapter(
- const ov::InferRequest& request,
- const Tokenizer& tokenizer,
- OptionalGenerationConfig generation_config
- ): LLMPipelineImplBase{dont_construct()}, m_impl{{}, {}, {}} {}
-
- ContinuousBatchingAdapter(
- const std::filesystem::path& models_path,
- const Tokenizer& tokenizer,
- const SchedulerConfig& scheduler_config,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ): LLMPipelineImplBase{tokenizer}, m_impl{
- models_path.string(),
- tokenizer,
- scheduler_config,
- device,
- plugin_config} {
- m_generation_config = m_impl.get_config();
- }
-
- ContinuousBatchingAdapter(
- const std::string& model_str,
- const ov::Tensor& weights_tensor,
- const Tokenizer& tokenizer,
- const SchedulerConfig& scheduler_config,
- const std::string& device,
- const ov::AnyMap& plugin_config,
- const ov::genai::GenerationConfig& generation_config
- ): LLMPipelineImplBase{tokenizer}, m_impl{
- model_str,
- weights_tensor,
- tokenizer,
- scheduler_config,
- device,
- plugin_config,
- generation_config} {}
-
- ContinuousBatchingAdapter(
- const std::filesystem::path& models_path,
- const SchedulerConfig& scheduler_config,
- const std::string& device,
- const ov::AnyMap& plugin_config
- ): LLMPipelineImplBase{Tokenizer(models_path.string())}, m_impl{
- models_path.string(),
- m_tokenizer,
- scheduler_config,
- device,
- plugin_config} {
- m_generation_config = m_impl.get_config();
- }
-
- DecodedResults generate(
- StringInputs inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- std::vector prompts = std::visit(overloaded{
- [](const std::string& prompt) {
- return std::vector{prompt};
- },
- [](std::vector& prompts) {
- return prompts;
- }
- }, inputs);
- const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
- // -1 == config.eos_token_id and config.validate() are handled in m_impl.
- std::vector generated = m_impl.generate(
- prompts,
- std::vector{prompts.size(), config},
- streamer
- );
- std::vector plain_replies;
- std::vector plain_scores;
- for (GenerationResult& res : generated) {
- OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
- std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
- std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
- }
- return {std::move(plain_replies), std::move(plain_scores)};
- }
-
- EncodedResults generate(
- const EncodedInputs& inputs,
- OptionalGenerationConfig generation_config,
- StreamerVariant streamer
- ) override {
- std::vector input_ids = std::visit(overloaded{
- [](const ov::Tensor& inp) {
- size_t batch_size = inp.get_shape().at(0);
- if (1 == batch_size) {
- return std::vector{inp};
- }
- std::vector input_ids;
- input_ids.reserve(batch_size);
- size_t max_len = inp.get_shape().at(1);
- const int64_t* const source = inp.data();
- for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
- input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
- int64_t* destination = input_ids.back().data();
- std::copy_n(source + batch_id * max_len, max_len, destination);
- }
- return input_ids;
- },
- [](const TokenizedInputs& inp) {
- size_t batch_size = inp.input_ids.get_shape().at(0);
- std::vector input_ids;
- input_ids.reserve(batch_size);
- size_t max_len = inp.input_ids.get_shape().at(1);
-                const int64_t* const source = inp.input_ids.data<const int64_t>();
-                const int64_t* const attention_mask = inp.attention_mask.data<const int64_t>();
- for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
- input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
-                    int64_t* destination = input_ids.back().data<int64_t>();
- size_t copy_count = 0;
- for (size_t idx = 0; idx < max_len; ++idx) {
- if (1 == attention_mask[batch_id * max_len + idx]) {
- destination[copy_count++] = source[batch_id * max_len + idx];
- }
- }
- input_ids.back().set_shape({1, copy_count});
- }
- return input_ids;
- }
- }, inputs);
- const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
- // -1 == config.eos_token_id and config.validate() are handled in m_impl.
-        std::vector<EncodedGenerationResult> generated = m_impl.generate(input_ids, std::vector<GenerationConfig>{input_ids.size(), config}, streamer);
-        std::vector<std::vector<int64_t>> plain_tokens;
-        std::vector<float> plain_scores;
- for (EncodedGenerationResult& res : generated) {
- OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
- std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
- std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
- }
- return {std::move(plain_tokens), std::move(plain_scores)};
- }
-
- void start_chat(const std::string& system_message) override {
- m_impl.start_chat();
- };
-
- void finish_chat() override {
- m_impl.finish_chat();
- };
-};
-
-/*
-* NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are no files.
-* In the latter case ModelDesc is stored in properties.
-* This function pops ModelDesc from the properties and returns a pair of the updated properties and the ModelDesc.
-*/
-std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
- ov::AnyMap main_properties = properties;
- ov::genai::ModelConfigDesc model_descr;
-
- auto pop_property = [](ov::AnyMap& orig_propertis, const std::string& key, auto& value) {
- if (orig_propertis.find(key) != orig_propertis.end()) {
-            value = orig_propertis.at(key).as<std::decay_t<decltype(value)>>();
- orig_propertis.erase(key);
- }
- };
- pop_property(main_properties, "name_or_path", model_descr.name_or_path);
- pop_property(main_properties, "type", model_descr.type);
- pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-
- return {main_properties, model_descr};
-}
-}
+// Public LLMPipeline
ov::genai::LLMPipeline::LLMPipeline(
const ov::InferRequest& request,
@@ -705,8 +91,6 @@ ov::genai::LLMPipeline::LLMPipeline(
OptionalGenerationConfig generation_config) {
auto start_time = std::chrono::steady_clock::now();
    m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config);
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
}
ov::genai::LLMPipeline::LLMPipeline(
@@ -721,32 +105,31 @@ ov::genai::LLMPipeline::LLMPipeline(
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
} else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
+ m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
} else {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
}
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+ m_pimpl->save_load_time(start_time);
}
ov::genai::LLMPipeline::LLMPipeline(
const std::filesystem::path& models_path,
const std::string& device,
- const ov::AnyMap& config) {
+ const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();
- if (config.find(ov::genai::scheduler_config.name()) != config.end() ||
- config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() ||
- config.find(ov::genai::prompt_lookup.name()) != config.end()) {
- auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
-        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
+ if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+ properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+ properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
+ auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
} else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
+ m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
} else {
-        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
}
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+
+ m_pimpl->save_load_time(start_time);
}
ov::genai::LLMPipeline::LLMPipeline(
@@ -754,18 +137,17 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
- const ov::AnyMap& config,
+ const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config) {
- auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
-
auto start_time = std::chrono::steady_clock::now();
- if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() ||
- plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() ||
- plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){
- auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
+ if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+ properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+ properties.find(ov::genai::prompt_lookup.name()) != properties.end()){
+
+ auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
- tokenizer, scheduler_config, device, plugin_config_, generation_config);
+ tokenizer, scheduler_config, device, device_properties, generation_config);
} else if (device == "NPU") {
// TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
@@ -778,34 +160,64 @@ ov::genai::LLMPipeline::LLMPipeline(
// {"num_key_value_heads", 32}};
// ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
// This will convert from AnyMap to ModelDesc.
- auto [properties, model_descr] = split_model_descr(plugin_config);
+ auto [filtered_properties, model_descr] = split_model_descr(properties);
-        m_pimpl = std::make_unique<StaticLLMPipeline>(
+ m_pimpl = static_llm::LLMPipelineFactory::create(
utils::singleton_core().read_model(model_str, weights_tensor),
model_descr,
tokenizer,
device,
- properties,
+ filtered_properties,
generation_config
);
} else {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(
- utils::singleton_core().read_model(model_str, weights_tensor),
+ utils::singleton_core().read_model(model_str, weights_tensor),
tokenizer,
device,
- plugin_config,
+ properties,
generation_config);
}
- auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+
+ m_pimpl->save_load_time(start_time);
+}
+
+DecodedResults LLMPipeline::generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ return m_pimpl->generate(inputs, generation_config, streamer);
+}
+
+DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) {
+ auto config_arg = utils::get_config_from_map(config_map);
+ GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
+ config.update_generation_config(config_map);
+
+ return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map));
+}
+
+EncodedResults LLMPipeline::generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ return m_pimpl->generate(inputs, generation_config, streamer);
+}
+
+EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) {
+ auto config_arg = utils::get_config_from_map(config_map);
+ GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
+ config.update_generation_config(config_map);
+
+ return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map));
}
ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
- return m_pimpl->m_generation_config;
+ return m_pimpl->get_generation_config();
}
ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
- return m_pimpl->m_tokenizer;
+ return m_pimpl->get_tokenizer();
}
void ov::genai::LLMPipeline::start_chat(const std::string& system_message) {
@@ -817,13 +229,10 @@ void ov::genai::LLMPipeline::finish_chat() {
}
void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
- int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id;
- m_pimpl->m_generation_config = config;
- // if eos_token_id was not provided in config forward from default config
- if (config.eos_token_id == -1)
- m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
-
- m_pimpl->m_generation_config.validate();
+ m_pimpl->set_generation_config(config);
}
ov::genai::LLMPipeline::~LLMPipeline() = default;
+
+} // namespace genai
+} // namespace ov
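For orientation, here is a minimal usage sketch of the public API after this refactoring. It is not part of the patch and only exercises the ov::genai::LLMPipeline methods shown above; the model directory and the prompt are placeholders.

// Illustrative sketch, not part of the patch. "TinyLlama-1.1B-Chat-v1.0" is a placeholder model directory.
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 64;
    pipe.set_generation_config(config);  // validated via LLMPipelineImplBase::set_generation_config

    // String overload; the ov::AnyMap-based overload accepts generation properties instead.
    std::cout << pipe.generate("Why is the sky blue?") << '\n';
    return 0;
}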
diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp
index b2ad581e0b..5573272d7e 100644
--- a/src/cpp/src/llm_pipeline_base.hpp
+++ b/src/cpp/src/llm_pipeline_base.hpp
@@ -13,8 +13,26 @@ namespace genai {
class LLMPipelineImplBase {
public:
LLMPipelineImplBase(const Tokenizer& tokenizer,
- const GenerationConfig& config = {})
- : m_tokenizer(tokenizer), m_generation_config(config) {
+ const GenerationConfig& config)
+ : m_tokenizer(tokenizer), m_generation_config(config) { }
+
+ Tokenizer get_tokenizer() {
+ return m_tokenizer;
+ }
+
+ GenerationConfig get_generation_config() const {
+ return m_generation_config;
+ }
+
+ void set_generation_config(GenerationConfig config) {
+ int64_t default_eos_token_id = m_generation_config.eos_token_id;
+ m_generation_config = config;
+
+ // if eos_token_id was not provided in config forward from default config
+ if (m_generation_config.eos_token_id == -1)
+ m_generation_config.set_eos_token_id(default_eos_token_id);
+
+ m_generation_config.validate();
}
virtual DecodedResults generate(
@@ -34,6 +52,12 @@ class LLMPipelineImplBase {
virtual ~LLMPipelineImplBase() = default;
+ void save_load_time(std::chrono::steady_clock::time_point start_time) {
+ auto stop_time = std::chrono::steady_clock::now();
+        m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+ }
+
+protected:
Tokenizer m_tokenizer;
GenerationConfig m_generation_config;
    std::optional<AdapterController> m_adapter_controller;
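As a reference for the save_load_time() helper introduced above, here is a standalone sketch of the same steady_clock bookkeeping pattern; TimedLoader is an illustrative stand-in, not a library class.

// Standalone sketch of the load-time measurement pattern used by LLMPipelineImplBase.
#include <chrono>
#include <iostream>

struct TimedLoader {
    float m_load_time_ms = 0.0f;

    void save_load_time(std::chrono::steady_clock::time_point start_time) {
        auto stop_time = std::chrono::steady_clock::now();
        m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
    }
};

int main() {
    auto start = std::chrono::steady_clock::now();
    TimedLoader loader;  // heavy construction (model reading, compilation) would happen here
    loader.save_load_time(start);
    std::cout << "load time: " << loader.m_load_time_ms << " ms\n";
    return 0;
}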
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
new file mode 100644
index 0000000000..153fcc6fce
--- /dev/null
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -0,0 +1,402 @@
+
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "llm_pipeline_stateful.hpp"
+
+#include "lora_helper.hpp"
+#include "lm_encoding.hpp"
+#include "text_callback_streamer.hpp"
+#include "utils.hpp"
+
+namespace ov::genai {
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+ const ov::InferRequest& request,
+ const ov::genai::Tokenizer& tokenizer,
+ OptionalGenerationConfig generation_config)
+ : LLMPipelineImplBase(tokenizer, generation_config.has_value() ? *generation_config : GenerationConfig()),
+ m_model_runner(request) {}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& properties)
+ : StatefulLLMPipeline{
+ utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, properties),
+ tokenizer,
+ device,
+ properties,
+ utils::from_config_json_if_exists(models_path)
+ } {}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const std::shared_ptr<ov::Model>& model,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& properties,
+ const ov::genai::GenerationConfig& generation_config)
+ : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
+ utils::apply_slice_before_matmul_transformation(model);
+ m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
+
+ ov::CompiledModel compiled_model;
+ if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) {
+ m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
+ m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable
+ compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
+ m_model_runner = compiled_model.create_infer_request();
+ } else {
+ compiled_model = utils::singleton_core().compile_model(model, device, properties);
+ m_model_runner = compiled_model.create_infer_request();
+ }
+ ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
+
+    // If eos_token_id was not provided, take the value from the tokenizer
+ if (m_generation_config.eos_token_id == -1)
+ m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
+
+ m_sampler.set_seed(m_generation_config.rng_seed);
+}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const std::string& device,
+ const ov::AnyMap& plugin_config)
+ : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {}
+
+DecodedResults StatefulLLMPipeline::generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
+
+ if (is_chat_conversation)
+ OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
+ "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");
+
+ auto start_time = std::chrono::steady_clock::now();
+ GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+ // If eos_token_id was not provided, take value from default m_generation_config
+ if (config.eos_token_id == -1)
+ config.set_eos_token_id(m_generation_config.eos_token_id);
+ config.validate();
+
+ TokenizedInputs encoded_input;
+
+    if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
+ OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
+ encoded_input = m_tokenizer.encode(*input_vector);
+    } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
+ std::string& prompt = *input_prompt;
+
+ if (is_chat_conversation) {
+            // The KV cache of the model already contains prompts and answers from previous iterations.
+            // So only the new prompt, wrapped into the chat template, has to be sent to the model. The tokenizer always returns
+            // token_ids = {<bos token_id>, ...}, so if the tokenizer were applied only to the new prompt,
+            // <bos token_id> would be inserted on every iteration.
+            // Therefore the pipeline calculates input_ids for the whole chat history and for the chat history without the new prompt,
+            // and takes only the difference between them.
+            // The chat history cannot be stored as already encoded tokens because the generate call does not return the <eos> token,
+            // while the KV cache contains it. So we either add it manually or obtain it by tokenizing the whole chat history.
+
+ m_history.push_back({{"role", "user"}, {"content", prompt}});
+ constexpr bool add_generation_prompt = true;
+ auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+ // Do not add special tokens in chat scenario to be aligned with HF.
+ auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
+ auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+
+            // Some symbol combinations can be encoded by the tokenizer in different ways.
+            // If the sequence contains such a combination, we cannot correctly subtract the new history from the old history,
+            // so check for it, find the trusted part and use it on the next step.
+ size_t trusted_history_length = 0;
+ if (!m_tokenized_chat_history.empty()) {
+                std::set<int64_t> stop_tokens = config.stop_token_ids;
+ trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+ m_trust_encoded_history = trusted_history_length == SIZE_MAX;
+ }
+
+ if (m_tokenized_chat_history.empty()) {
+ encoded_input = new_chat_tokens;
+ } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
+                // does_kv_cache_need_to_update() is true here if beam search is activated.
+                // In beam search mode we want to remove the whole last model answer from the KV cache and add the best answer directly.
+                // If the model answer and the decoded answer differ, the difference is still smaller than the entire history, so use the data from m_kv_history_manager.
+ if (m_kv_history_manager.does_kv_cache_need_to_update()) {
+ trusted_history_length = m_kv_history_manager.trusted_history_length;
+ } else {
+ m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
+                    // if the previous generation finished because the maximum length was reached, the KV cache is missing the last token, so keep it
+ m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
+ }
+
+ ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+ {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
+                                                   new_chat_tokens.input_ids.data<int64_t>() + trusted_history_length);
+
+ ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
+                std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
+
+ encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+ {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
+ new_tensor.copy_to(encoded_input.input_ids);
+ encoded_input.attention_mask = new_attention_mask;
+ m_last_disappeared_token = std::nullopt;
+ } else {
+ encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
+ }
+ m_templated_chat_history = new_templated_chat_history;
+
+ m_tokenized_chat_history.clear();
+ m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
+            std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
+ std::back_inserter(m_tokenized_chat_history));
+
+ // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
+ } else {
+ encoded_input = m_tokenizer.encode(prompt);
+ }
+ }
+
+ auto encode_stop_time = std::chrono::steady_clock::now();
+ auto encoded_results = generate(encoded_input, config, streamer);
+
+ auto decode_start_time = std::chrono::steady_clock::now();
+ DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
+ auto decode_stop_time = std::chrono::steady_clock::now();
+
+ if (is_chat_conversation) {
+ // Tail of chat template is missing in KV cache.
+ // Find the tail to concatenate it with the next input prompt.
+ auto answer = decoded_results.texts[0];
+ m_templated_chat_history.append(answer);
+ m_history.push_back({{"role", "assistant"}, {"content", answer}});
+ }
+
+ // generate_durations
+ decoded_results.perf_metrics = encoded_results.perf_metrics;
+
+ auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
+ auto stop_time = std::chrono::steady_clock::now();
+    raw_counters.generate_durations = std::vector<MicroSeconds>();
+ raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+ raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
+ raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
+
+ // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics.
+ decoded_results.perf_metrics.m_evaluated = false;
+ decoded_results.perf_metrics.evaluate_statistics(start_time);
+ return decoded_results;
+}
+
+EncodedResults StatefulLLMPipeline::generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer) {
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
+
+ if (is_chat_conversation)
+        // if the chat was run in StringInputs mode but generate() is now called with EncodedInputs, the last m_history entry will have the assistant role
+ OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
+ "Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");
+
+ auto start_time = std::chrono::steady_clock::now();
+ ov::Tensor input_ids;
+ ov::Tensor attention_mask;
+    if (auto data = std::get_if<ov::Tensor>(&inputs)) {
+ input_ids = *data;
+ attention_mask = ov::genai::utils::init_attention_mask(input_ids);
+    } else if (auto data = std::get_if<TokenizedInputs>(&inputs)) {
+ input_ids = data->input_ids;
+ attention_mask = data->attention_mask;
+ }
+
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+        std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
+
+ // Tail of previous output in chat mode is missing in KV cache.
+ if (m_last_disappeared_token.has_value()) {
+ attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
+ input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
+ }
+
+ GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+
+ // If eos_token_id was not provided, take value from default m_generation_config
+ if (config.eos_token_id == -1)
+ config.set_eos_token_id(m_generation_config.eos_token_id);
+ config.validate();
+
+ // Stateful pipeline does not provide logprobs for prompt tokens
+ OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
+
+    std::shared_ptr<StreamerBase> streamer_ptr;
+    if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) {
+        streamer_ptr = nullptr;
+    } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) {
+        streamer_ptr = *streamer_obj;
+    } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) {
+        streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
+ }
+
+ auto batch_size = input_ids.get_shape().at(0);
+ OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
+ (config.is_greedy_decoding() || config.is_multinomial()),
+ "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
+
+ auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
+ OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
+ "either (input_ids, attention_mask, beam_idx) or "
+ "(input_ids, attention_mask, position_ids, beam_idx) "
+ "but you have '" + std::to_string(num_inputs) + "' inputs");
+
+ ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
+
+ size_t kv_cache_len = 0;
+ ov::Tensor concatenated_attention_mask;
+ if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
+ OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
+ // If history is saved in KV cache, concatenate new attention_mask with the already existing.
+ // Between subsequent runs attention_mask should not be modified.
+ auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
+ auto prompt_len = attention_mask.get_shape()[1];
+
+ kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
+
+ ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
+        auto start_atten_hst = atten_mask_history.data<int64_t>();
+
+        std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
+                  new_atten_mask.data<int64_t>());
+        std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
+                  new_atten_mask.data<int64_t>() + kv_cache_len);
+ concatenated_attention_mask = new_atten_mask;
+ } else {
+ concatenated_attention_mask = attention_mask;
+ }
+
+ size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
+
+ bool position_ids_available = (num_inputs == 4);
+    std::optional<ov::Tensor> position_ids = std::nullopt;
+ if (position_ids_available) {
+ position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
+ utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
+ }
+
+ if(m_adapter_controller) {
+ m_adapter_controller->apply(m_model_runner, config.adapters);
+ }
+
+ if (is_chat_conversation && !m_trust_encoded_history) {
+ m_trust_encoded_history = true;
+ m_kv_history_manager.reset();
+ }
+
+    std::vector<SequenceGroup::Ptr> requests;
+ size_t block_size = 1;
+
+ for (size_t request_id = 0; request_id < batch_size; request_id++) {
+ SequenceGroup::Ptr sequence_group;
+ if (is_chat_conversation) {
+ ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size);
+ } else {
+ size_t seq_len = input_ids.get_shape().at(1);
+ size_t batch_offset = request_id * seq_len;
+            const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
+            std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
+
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size);
+ }
+
+ requests.push_back(sequence_group);
+ }
+
+ if (m_sampler.get_seed() != config.rng_seed) {
+ m_sampler.set_seed(config.rng_seed);
+ }
+
+ ov::genai::EncodedResults result;
+ std::tie(result, m_last_disappeared_token) = get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
+ streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
+
+ if (is_chat_conversation) {
+ // force remove from kv_cache last answer
+ if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
+ m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
+ m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
+ }
+
+ std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+ } else {
+ reset_kv_state();
+ m_last_disappeared_token = std::nullopt;
+ }
+
+ if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+ std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+
+ auto stop_time = std::chrono::steady_clock::now();
+
+    // If generate() is called directly with encoded inputs (no tokenization), tokenization/detokenization stats are not reported.
+ auto& metrics = result.perf_metrics;
+ metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
+ metrics.load_time = m_load_time_ms;
+ metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+ metrics.evaluate_statistics(start_time);
+ return result;
+}
+
+void StatefulLLMPipeline::start_chat(const std::string& system_message) {
+ is_chat_conversation = true;
+ m_trust_encoded_history = true;
+ m_kv_history_manager.reset();
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+ m_last_disappeared_token = std::nullopt;
+ if (!m_tokenized_chat_history.empty()) {
+ reset_kv_state();
+ m_history = {};
+ m_templated_chat_history = "";
+ m_tokenized_chat_history.clear();
+ }
+ if (system_message.empty())
+ return;
+
+ m_history.push_back({{"role", "system"}, {"content", system_message}});
+ constexpr bool add_generation_prompt = false;
+
+ m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+}
+
+void StatefulLLMPipeline::reset_kv_state() {
+ if(m_adapter_controller) {
+ for(auto& state: m_model_runner.query_state()) {
+ if(!m_adapter_controller->has_state_name(state.get_name())) {
+ state.reset();
+ }
+ }
+ } else {
+ m_model_runner.reset_state();
+ }
+}
+
+void StatefulLLMPipeline::finish_chat() {
+ is_chat_conversation = false;
+ m_trust_encoded_history = true;
+ m_kv_history_manager.reset();
+ m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+ m_last_disappeared_token = std::nullopt;
+ if (!m_tokenized_chat_history.empty()) {
+ reset_kv_state();
+ m_history.clear();
+ m_templated_chat_history.clear();
+ m_tokenized_chat_history.clear();
+ }
+}
+
+} // namespace ov::genai
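The streamer selection in StatefulLLMPipeline::generate() above is a plain std::get_if dispatch over the StreamerVariant alternatives. A self-contained sketch of that pattern follows; EchoStreamer and Variant are illustrative stand-ins, not the ov::genai definitions.

// Self-contained sketch of the std::get_if dispatch used for streamers above.
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <variant>

struct EchoStreamer {
    void put(const std::string& chunk) { std::cout << chunk; }
};

using Variant = std::variant<std::monostate,
                             std::shared_ptr<EchoStreamer>,
                             std::function<bool(std::string)>>;

void dispatch(const Variant& streamer, const std::string& chunk) {
    if (std::get_if<std::monostate>(&streamer)) {
        // no streaming requested
    } else if (auto obj = std::get_if<std::shared_ptr<EchoStreamer>>(&streamer)) {
        (*obj)->put(chunk);
    } else if (auto cb = std::get_if<std::function<bool(std::string)>>(&streamer)) {
        (*cb)(chunk);  // returning true would request early stop
    }
}

int main() {
    dispatch(Variant{std::make_shared<EchoStreamer>()}, "token ");
    dispatch(Variant{std::in_place_type<std::function<bool(std::string)>>,
                     [](std::string s) { std::cout << s << '\n'; return false; }}, "token");
    return 0;
}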
diff --git a/src/cpp/src/llm_pipeline_stateful.hpp b/src/cpp/src/llm_pipeline_stateful.hpp
new file mode 100644
index 0000000000..dbf8d89391
--- /dev/null
+++ b/src/cpp/src/llm_pipeline_stateful.hpp
@@ -0,0 +1,77 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+
+#include "llm_pipeline_base.hpp"
+#include "sampler.hpp"
+#include "utils.hpp"
+
+namespace ov::genai {
+
+class StatefulLLMPipeline final : public LLMPipelineImplBase {
+ ov::InferRequest m_model_runner;
+ Sampler m_sampler;
+
+ // Chat scenario specific parameters
+ bool is_chat_conversation = false;
+ bool m_trust_encoded_history = true;
+ ChatHistory m_history;
+ std::string m_templated_chat_history = {};
+    std::vector<int64_t> m_tokenized_chat_history;
+ ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+ // Tail of previous output in chat mode is missing in KV cache, let's keep it
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If the sequence contains symbols that the tokenizer can encode ambiguously, we need to trim the KV cache.
+    // If beam search sampling is used in chat mode, we need to remove the last model answer from the KV cache and add the best answer to the history.
+    // So keep track of how many tokens to trim from the KV cache and how many tokens of the history to keep.
+ ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
+ size_t m_kv_cache_seq_length_axis = 2;
+
+ void reset_kv_state();
+public:
+
+ StatefulLLMPipeline(
+ const ov::InferRequest& request,
+ const ov::genai::Tokenizer& tokenizer,
+ OptionalGenerationConfig generation_config = std::nullopt
+ );
+
+ StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ );
+
+ StatefulLLMPipeline(
+        const std::shared_ptr<ov::Model>& model,
+ const ov::genai::Tokenizer& tokenizer,
+ const std::string& device,
+ const ov::AnyMap& config,
+ const ov::genai::GenerationConfig& generation_config
+ );
+
+ StatefulLLMPipeline(
+ const std::filesystem::path& models_path,
+ const std::string& device,
+ const ov::AnyMap& plugin_config
+ );
+
+ DecodedResults generate(
+ StringInputs inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override;
+
+ EncodedResults generate(
+ const EncodedInputs& inputs,
+ OptionalGenerationConfig generation_config,
+ StreamerVariant streamer
+ ) override;
+
+ void start_chat(const std::string& system_message) override;
+
+ void finish_chat() override;
+};
+
+} // namespace ov::genai
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 6f4f124894..c98b571179 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -1,8 +1,10 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "llm_pipeline_static.hpp"
+#include "sampler.hpp"
+
#include
#include
@@ -235,12 +237,12 @@ enum class GenerateHint {
std::string to_string(GenerateHint h) {
switch(h) {
- case GenerateHint::FAST_COMPILE :
+ case GenerateHint::FAST_COMPILE :
return "FAST_COMPILE";
- case GenerateHint::BEST_PERF :
+ case GenerateHint::BEST_PERF :
return "BEST_PERF";
default:
- OPENVINO_THROW("Unsupported value for type GenerateHint provided");
+ OPENVINO_THROW("Unsupported value for type GenerateHint provided");
}
}
@@ -396,12 +398,12 @@ KVAxesPosition get_kv_axes(const std::string& model_type) {
return axes;
}
-ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
+ov::genai::static_llm::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
std::ifstream file(filepath);
- OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string());
+ OPENVINO_ASSERT(file.is_open(), "Could not open file: ", filepath);
nlohmann::json config_data = nlohmann::json::parse(file);
- ov::genai::ModelConfigDesc desc;
+ ov::genai::static_llm::ModelConfigDesc desc;
    desc.type = config_data["model_type"].get<std::string>();
// NB: In case _name_or_path field isn't presented in config.json
if (config_data.contains("_name_or_path")) {
@@ -586,6 +588,19 @@ std::optional<uint32_t> pop_int_and_cast(ov::AnyMap& config, const std::string&
return std::nullopt;
}
+void update_config(ov::AnyMap& config, const std::pair<std::string, ov::Any>& pair) {