From c4857e3ce0d526f6918498a836eeecd059506476 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 10 Dec 2024 20:32:14 +0400 Subject: [PATCH] Added Python API for other image generation models (#1349) --- .../flux_transformer_2d_model.hpp | 4 +- .../sd3_transformer_2d_model.hpp | 2 +- .../image_generation/text2image_pipeline.hpp | 2 +- .../models/flux_transformer_2d_model.cpp | 1 - src/python/openvino_genai/__init__.py | 3 + src/python/openvino_genai/__init__.pyi | 5 +- .../openvino_genai/py_openvino_genai.pyi | 143 +++++- src/python/py_image_generation_models.cpp | 428 ++++++++++++------ src/python/py_image_generation_pipelines.cpp | 8 + 9 files changed, 455 insertions(+), 141 deletions(-) diff --git a/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp b/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp index c49eb56b21..95f846668b 100644 --- a/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp +++ b/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp @@ -18,7 +18,7 @@ namespace genai { class OPENVINO_GENAI_EXPORTS FluxTransformer2DModel { public: - struct Config { + struct OPENVINO_GENAI_EXPORTS Config { size_t in_channels = 64; bool guidance_embeds = false; size_t m_default_sample_size = 128; @@ -69,7 +69,7 @@ class OPENVINO_GENAI_EXPORTS FluxTransformer2DModel { template ov::util::EnableIfAllStringAny compile(const std::string& device, - Properties&&... properties) { + Properties&&... 
properties) { return compile(device, ov::AnyMap{std::forward(properties)...}); } diff --git a/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp b/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp index e4641066ec..7f96af49c2 100644 --- a/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp +++ b/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp @@ -19,7 +19,7 @@ namespace genai { class OPENVINO_GENAI_EXPORTS SD3Transformer2DModel { public: - struct Config { + struct OPENVINO_GENAI_EXPORTS Config { size_t sample_size = 128; size_t patch_size = 2; size_t in_channels = 16; diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp index be594f13fc..4fce33856f 100644 --- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp @@ -82,7 +82,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { const CLIPTextModel& clip_text_model, const T5EncoderModel t5_encoder_model, const FluxTransformer2DModel& transformer, - const AutoencoderKL& vae_decoder); + const AutoencoderKL& vae); ImageGenerationConfig get_generation_config() const; void set_generation_config(const ImageGenerationConfig& generation_config); diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp index 3c6bb94505..6b28b116b0 100644 --- a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp +++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp @@ -22,7 +22,6 @@ FluxTransformer2DModel::Config::Config(const std::filesystem::path& config_path) read_json_param(data, "in_channels", in_channels); read_json_param(data, "guidance_embeds", guidance_embeds); - file.close(); } 
FluxTransformer2DModel::FluxTransformer2DModel(const std::filesystem::path& root_dir) diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 66cf12aad1..a97812261f 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -63,7 +63,10 @@ from .py_openvino_genai import ( CLIPTextModel, CLIPTextModelWithProjection, + T5EncoderModel, UNet2DConditionModel, + FluxTransformer2DModel, + SD3Transformer2DModel, AutoencoderKL, Text2ImagePipeline, Scheduler, diff --git a/src/python/openvino_genai/__init__.pyi b/src/python/openvino_genai/__init__.pyi index 5a406043b7..e7e4e2619c 100644 --- a/src/python/openvino_genai/__init__.pyi +++ b/src/python/openvino_genai/__init__.pyi @@ -15,6 +15,7 @@ from openvino_genai.py_openvino_genai import ContinuousBatchingPipeline from openvino_genai.py_openvino_genai import CppStdGenerator from openvino_genai.py_openvino_genai import DecodedResults from openvino_genai.py_openvino_genai import EncodedResults +from openvino_genai.py_openvino_genai import FluxTransformer2DModel from openvino_genai.py_openvino_genai import GenerationConfig from openvino_genai.py_openvino_genai import GenerationResult from openvino_genai.py_openvino_genai import Generator @@ -22,10 +23,12 @@ from openvino_genai.py_openvino_genai import ImageGenerationConfig from openvino_genai.py_openvino_genai import LLMPipeline from openvino_genai.py_openvino_genai import PerfMetrics from openvino_genai.py_openvino_genai import RawPerfMetrics +from openvino_genai.py_openvino_genai import SD3Transformer2DModel from openvino_genai.py_openvino_genai import Scheduler from openvino_genai.py_openvino_genai import SchedulerConfig from openvino_genai.py_openvino_genai import StopCriteria from openvino_genai.py_openvino_genai import StreamerBase +from openvino_genai.py_openvino_genai import T5EncoderModel from openvino_genai.py_openvino_genai import Text2ImagePipeline from openvino_genai.py_openvino_genai 
import TokenizedInputs from openvino_genai.py_openvino_genai import Tokenizer @@ -38,5 +41,5 @@ from openvino_genai.py_openvino_genai import WhisperRawPerfMetrics from openvino_genai.py_openvino_genai import draft_model import os as os from . import py_openvino_genai -__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'GenerationConfig', 'GenerationResult', 'Generator', 'ImageGenerationConfig', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai'] +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'ImageGenerationConfig', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai'] __version__: str = '2025.0.0.0' diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index b13ee37f24..24bf6fd785 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -5,7 
+5,7 @@ from __future__ import annotations import openvino._pyopenvino import os import typing -__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'ImageGenerationConfig', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model'] +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'ImageGenerationConfig', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model'] class Adapter: """ 
Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. @@ -222,7 +222,7 @@ class CLIPTextModel: """ max_position_embeddings: int num_hidden_layers: int - def __init__(self, config_path: str) -> None: + def __init__(self, config_path: os.PathLike) -> None: ... @typing.overload def __init__(self, root_dir: os.PathLike) -> None: @@ -470,6 +470,53 @@ class EncodedResults: @property def tokens(self) -> list[list[int]]: ... +class FluxTransformer2DModel: + """ + FluxTransformer2DModel class. + """ + class Config: + """ + This class is used for storing FluxTransformer2DModel config. + """ + default_sample_size: int + in_channels: int + def __init__(self, config_path: os.PathLike) -> None: + ... + @typing.overload + def __init__(self, root_dir: os.PathLike) -> None: + """ + FluxTransformer2DModel class + root_dir (os.PathLike): Model root directory. + """ + @typing.overload + def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: + """ + FluxTransformer2DModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, model: FluxTransformer2DModel) -> None: + """ + FluxTransformer2DModel model + FluxTransformer2DModel class + model (FluxTransformer2DModel): FluxTransformer2DModel model + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def get_config(self) -> FluxTransformer2DModel.Config: + ... + def infer(self, sample: openvino._pyopenvino.Tensor, timestep: openvino._pyopenvino.Tensor) -> openvino._pyopenvino.Tensor: + ... + def reshape(self, batch_size: int, height: int, width: int, tokenizer_model_max_length: int) -> FluxTransformer2DModel: + ... 
+ def set_hidden_states(self, tensor_name: str, encoder_hidden_states: openvino._pyopenvino.Tensor) -> None: + ... class GenerationConfig: """ @@ -1068,6 +1115,55 @@ class RawPerfMetrics: @property def tokenization_durations(self) -> list[float]: ... +class SD3Transformer2DModel: + """ + SD3Transformer2DModel class. + """ + class Config: + """ + This class is used for storing SD3Transformer2DModel config. + """ + in_channels: int + joint_attention_dim: int + patch_size: int + sample_size: int + def __init__(self, config_path: os.PathLike) -> None: + ... + @typing.overload + def __init__(self, root_dir: os.PathLike) -> None: + """ + SD3Transformer2DModel class + root_dir (os.PathLike): Model root directory. + """ + @typing.overload + def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: + """ + SD3Transformer2DModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, model: SD3Transformer2DModel) -> None: + """ + SD3Transformer2DModel model + SD3Transformer2DModel class + model (SD3Transformer2DModel): SD3Transformer2DModel model + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def get_config(self) -> SD3Transformer2DModel.Config: + ... + def infer(self, sample: openvino._pyopenvino.Tensor, timestep: openvino._pyopenvino.Tensor) -> openvino._pyopenvino.Tensor: + ... + def reshape(self, batch_size: int, height: int, width: int, tokenizer_model_max_length: int) -> SD3Transformer2DModel: + ... + def set_hidden_states(self, tensor_name: str, encoder_hidden_states: openvino._pyopenvino.Tensor) -> None: + ... class Scheduler: """ Scheduler for image generation pipelines. @@ -1220,17 +1316,60 @@ class StreamerBase: """ Put is called every time new token is decoded. 
Returns a bool flag to indicate whether generation should be stopped, if return true generation stops """ +class T5EncoderModel: + """ + T5EncoderModel class. + """ + @typing.overload + def __init__(self, root_dir: os.PathLike) -> None: + """ + T5EncoderModel class + root_dir (os.PathLike): Model root directory. + """ + @typing.overload + def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: + """ + T5EncoderModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, model: T5EncoderModel) -> None: + """ + T5EncoderModel model + T5EncoderModel class + model (T5EncoderModel): T5EncoderModel model + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def get_output_tensor(self, idx: int) -> openvino._pyopenvino.Tensor: + ... + def infer(self, pos_prompt: str, neg_prompt: str, do_classifier_free_guidance: bool, max_sequence_length: int) -> openvino._pyopenvino.Tensor: + ... + def reshape(self, batch_size: int, max_sequence_length: int) -> T5EncoderModel: + ... class Text2ImagePipeline: """ This class is used for generation with text-to-image models. """ @staticmethod + def flux(scheduler: Scheduler, clip_text_model: CLIPTextModel, t5_encoder_model: T5EncoderModel, transformer: FluxTransformer2DModel, vae: AutoencoderKL) -> Text2ImagePipeline: + ... + @staticmethod def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: ... @staticmethod def stable_diffusion(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: ... 
@staticmethod + def stable_diffusion_3(scheduler: Scheduler, clip_text_model_1: CLIPTextModelWithProjection, clip_text_model_2: CLIPTextModelWithProjection, t5_encoder_model: T5EncoderModel, transformer: SD3Transformer2DModel, vae: AutoencoderKL) -> Text2ImagePipeline: + ... + @staticmethod def stable_diffusion_xl(scheduler: Scheduler, clip_text_model: CLIPTextModel, clip_text_model_with_projection: CLIPTextModelWithProjection, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: ... @typing.overload diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp index 72a8970cb4..75be28233f 100644 --- a/src/python/py_image_generation_models.cpp +++ b/src/python/py_image_generation_models.cpp @@ -12,7 +12,10 @@ #include "openvino/genai/image_generation/autoencoder_kl.hpp" #include "openvino/genai/image_generation/clip_text_model.hpp" #include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" +#include "openvino/genai/image_generation/t5_encoder_model.hpp" #include "openvino/genai/image_generation/unet2d_condition_model.hpp" +#include "openvino/genai/image_generation/sd3_transformer_2d_model.hpp" +#include "openvino/genai/image_generation/flux_transformer_2d_model.hpp" #include "tokenizers_path.hpp" #include "py_utils.hpp" @@ -22,13 +25,11 @@ namespace pyutils = ov::genai::pybind::utils; void init_clip_text_model(py::module_& m) { auto clip_text_model = py::class_(m, "CLIPTextModel", "CLIPTextModel class.") - .def(py::init([]( - const std::filesystem::path& root_dir - ) { + .def(py::init([](const std::filesystem::path& root_dir) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return std::make_unique(root_dir); }), - py::arg("root_dir"), "Model root directory", + py::arg("root_dir"), "Model root directory", R"( CLIPTextModel class root_dir (os.PathLike): Model root directory. 
@@ -41,17 +42,15 @@ void init_clip_text_model(py::module_& m) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); }), - py::arg("root_dir"), "Model root directory", + py::arg("root_dir"), "Model root directory", py::arg("device"), "Device on which inference will be done", R"( CLIPTextModel class root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. - )") - .def(py::init([]( - const ov::genai::CLIPTextModel& model - ) { + )") + .def(py::init([](const ov::genai::CLIPTextModel& model) { return std::make_unique(model); }), py::arg("model"), "CLIPText model" @@ -59,25 +58,23 @@ void init_clip_text_model(py::module_& m) { CLIPTextModel class model (CLIPTextModel): CLIPText model )"); - + py::class_(clip_text_model, "Config", "This class is used for storing CLIPTextModel config.") - .def(py::init([]( - const std::string& config_path - ) { + .def(py::init([](const std::filesystem::path& config_path) { return std::make_unique(config_path); }), py::arg("config_path")) .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModel::Config::max_position_embeddings) .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModel::Config::num_hidden_layers); - clip_text_model.def("get_config", &ov::genai::CLIPTextModel::get_config); - clip_text_model.def("reshape", &ov::genai::CLIPTextModel::reshape, py::arg("batch_size")); - clip_text_model.def("set_adapters", &ov::genai::CLIPTextModel::set_adapters, py::arg("adapters")); - clip_text_model.def("infer", &ov::genai::CLIPTextModel::infer, py::arg("pos_prompt"), py::arg("neg_prompt"), py::arg("do_classifier_free_guidance")); - clip_text_model.def("get_output_tensor", &ov::genai::CLIPTextModel::get_output_tensor, py::arg("idx")); - clip_text_model.def( - "compile", - [](ov::genai::CLIPTextModel& self, + clip_text_model.def("get_config", 
&ov::genai::CLIPTextModel::get_config) + .def("reshape", &ov::genai::CLIPTextModel::reshape, py::arg("batch_size")) + .def("set_adapters", &ov::genai::CLIPTextModel::set_adapters, py::arg("adapters")) + .def("infer", &ov::genai::CLIPTextModel::infer, py::arg("pos_prompt"), py::arg("neg_prompt"), py::arg("do_classifier_free_guidance")) + .def("get_output_tensor", &ov::genai::CLIPTextModel::get_output_tensor, py::arg("idx")) + .def( + "compile", + [](ov::genai::CLIPTextModel& self, const std::string& device, const py::kwargs& kwargs ) { @@ -91,14 +88,132 @@ void init_clip_text_model(py::module_& m) { )"); } -void init_unet2d_condition_model(py::module_& m) { - auto unet2d_condition_model = py::class_(m, "UNet2DConditionModel", "UNet2DConditionModel class.") +void init_clip_text_model_with_projection(py::module_& m) { + auto clip_text_model_with_projection = py::class_(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.") + .def(py::init([](const std::filesystem::path& root_dir) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(root_dir); + }), + py::arg("root_dir"), "Model root directory", + R"( + CLIPTextModelWithProjection class + root_dir (os.PathLike): Model root directory. + )") + .def(py::init([]( + const std::filesystem::path& root_dir, + const std::string& device, + const py::kwargs& kwargs + ) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); + }), + py::arg("root_dir"), "Model root directory", + py::arg("device"), "Device on which inference will be done", + R"( + CLIPTextModelWithProjection class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. 
+ )") + .def(py::init([](const ov::genai::CLIPTextModelWithProjection& model) { + return std::make_unique(model); + }), + py::arg("model"), "CLIPTextModelWithProjection model" + R"( + CLIPTextModelWithProjection class + model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model + )"); + + py::class_(clip_text_model_with_projection, "Config", "This class is used for storing CLIPTextModelWithProjection config.") + .def(py::init([](const std::filesystem::path& config_path) { + return std::make_unique(config_path); + }), + py::arg("config_path")) + .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModelWithProjection::Config::max_position_embeddings) + .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModelWithProjection::Config::num_hidden_layers); + + clip_text_model_with_projection.def("reshape", &ov::genai::CLIPTextModelWithProjection::reshape, py::arg("batch_size")) + .def("infer", &ov::genai::CLIPTextModelWithProjection::infer, py::arg("pos_prompt"), py::arg("neg_prompt"), py::arg("do_classifier_free_guidance")) + .def("get_config", &ov::genai::CLIPTextModelWithProjection::get_config) + .def("get_output_tensor", &ov::genai::CLIPTextModelWithProjection::get_output_tensor, py::arg("idx")) + .def("set_adapters", &ov::genai::CLIPTextModelWithProjection::set_adapters, py::arg("adapters")) + .def( + "compile", + [](ov::genai::CLIPTextModelWithProjection& self, + const std::string& device, + const py::kwargs& kwargs + ) { + self.compile(device, pyutils::kwargs_to_any_map(kwargs)); + }, + py::arg("device"), "device on which inference will be done", + R"( + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. 
+ )"); +} + +void init_t5_encoder_model(py::module_& m) { + auto t5_encoder_model = py::class_(m, "T5EncoderModel", "T5EncoderModel class.") + .def(py::init([](const std::filesystem::path& root_dir) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(root_dir); + }), + py::arg("root_dir"), "Model root directory", + R"( + T5EncoderModel class + root_dir (os.PathLike): Model root directory. + )") .def(py::init([]( - const std::filesystem::path& root_dir + const std::filesystem::path& root_dir, + const std::string& device, + const py::kwargs& kwargs ) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); + }), + py::arg("root_dir"), "Model root directory", + py::arg("device"), "Device on which inference will be done", + R"( + T5EncoderModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + )") + .def(py::init([](const ov::genai::T5EncoderModel& model) { + return std::make_unique(model); + }), + py::arg("model"), "T5EncoderModel model" + R"( + T5EncoderModel class + model (T5EncoderModel): T5EncoderModel model + )") + .def("reshape", &ov::genai::T5EncoderModel::reshape, py::arg("batch_size"), py::arg("max_sequence_length")) + .def("infer", &ov::genai::T5EncoderModel::infer, py::arg("pos_prompt"), py::arg("neg_prompt"), py::arg("do_classifier_free_guidance"), py::arg("max_sequence_length")) + .def("get_output_tensor", &ov::genai::T5EncoderModel::get_output_tensor, py::arg("idx")) + // .def("set_adapters", &ov::genai::T5EncoderModel::set_adapters, py::arg("adapters")) + .def( + "compile", + [](ov::genai::T5EncoderModel& self, + const std::string& device, + const py::kwargs& kwargs + ) { + self.compile(device, pyutils::kwargs_to_any_map(kwargs)); + }, + py::arg("device"), "device on which inference will be done", + R"( + Compiles the model. 
+ device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + )"); +} + +void init_unet2d_condition_model(py::module_& m) { + auto unet2d_condition_model = py::class_(m, "UNet2DConditionModel", "UNet2DConditionModel class.") + .def(py::init([](const std::filesystem::path& root_dir) { return std::make_unique(root_dir); }), - py::arg("root_dir"), "Model root directory", + py::arg("root_dir"), "Model root directory", R"( UNet2DConditionModel class root_dir (os.PathLike): Model root directory. @@ -110,17 +225,15 @@ void init_unet2d_condition_model(py::module_& m) { ) { return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); }), - py::arg("root_dir"), "Model root directory", + py::arg("root_dir"), "Model root directory", py::arg("device"), "Device on which inference will be done", R"( UNet2DConditionModel class root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. - )") - .def(py::init([]( - const ov::genai::UNet2DConditionModel& model - ) { + )") + .def(py::init([](const ov::genai::UNet2DConditionModel& model) { return std::make_unique(model); }), py::arg("model"), "UNet2DConditionModel model" @@ -130,9 +243,7 @@ void init_unet2d_condition_model(py::module_& m) { )"); py::class_(unet2d_condition_model, "Config", "This class is used for storing UNet2DConditionModel config.") - .def(py::init([]( - const std::filesystem::path& config_path - ) { + .def(py::init([](const std::filesystem::path& config_path) { return std::make_unique(config_path); }), py::arg("config_path")) @@ -140,14 +251,14 @@ void init_unet2d_condition_model(py::module_& m) { .def_readwrite("sample_size", &ov::genai::UNet2DConditionModel::Config::sample_size) .def_readwrite("time_cond_proj_dim", &ov::genai::UNet2DConditionModel::Config::time_cond_proj_dim); - unet2d_condition_model.def("get_config", &ov::genai::UNet2DConditionModel::get_config); - 
unet2d_condition_model.def("reshape", &ov::genai::UNet2DConditionModel::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width"), py::arg("tokenizer_model_max_length")); - unet2d_condition_model.def("set_adapters", &ov::genai::UNet2DConditionModel::set_adapters, py::arg("adapters")); - unet2d_condition_model.def("infer", &ov::genai::UNet2DConditionModel::infer, py::arg("sample"), py::arg("timestep")); - unet2d_condition_model.def("set_hidden_states", &ov::genai::UNet2DConditionModel::set_hidden_states, py::arg("tensor_name"), py::arg("encoder_hidden_states")); - unet2d_condition_model.def("do_classifier_free_guidance", &ov::genai::UNet2DConditionModel::do_classifier_free_guidance, py::arg("guidance_scale")); - unet2d_condition_model.def( - "compile", + unet2d_condition_model.def("get_config", &ov::genai::UNet2DConditionModel::get_config) + .def("reshape", &ov::genai::UNet2DConditionModel::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width"), py::arg("tokenizer_model_max_length")) + .def("set_adapters", &ov::genai::UNet2DConditionModel::set_adapters, py::arg("adapters")) + .def("infer", &ov::genai::UNet2DConditionModel::infer, py::arg("sample"), py::arg("timestep")) + .def("set_hidden_states", &ov::genai::UNet2DConditionModel::set_hidden_states, py::arg("tensor_name"), py::arg("encoder_hidden_states")) + .def("do_classifier_free_guidance", &ov::genai::UNet2DConditionModel::do_classifier_free_guidance, py::arg("guidance_scale")) + .def( + "compile", [](ov::genai::UNet2DConditionModel& self, const std::string& device, const py::kwargs& kwargs @@ -162,14 +273,140 @@ void init_unet2d_condition_model(py::module_& m) { )"); } -void init_autoencoder_kl(py::module_& m) { - auto autoencoder_kl = py::class_(m, "AutoencoderKL", "AutoencoderKL class.") +void init_sd3_transformer_2d_model(py::module_& m) { + auto sd3_transformer_2d_model = py::class_(m, "SD3Transformer2DModel", "SD3Transformer2DModel class.") + .def(py::init([](const 
std::filesystem::path& root_dir) { + return std::make_unique(root_dir); + }), + py::arg("root_dir"), "Model root directory", + R"( + SD3Transformer2DModel class + root_dir (os.PathLike): Model root directory. + )") .def(py::init([]( - const std::filesystem::path& vae_decoder_path + const std::filesystem::path& root_dir, + const std::string& device, + const py::kwargs& kwargs + ) { + return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); + }), + py::arg("root_dir"), "Model root directory", + py::arg("device"), "Device on which inference will be done", + R"( + SD3Transformer2DModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + )") + .def(py::init([](const ov::genai::SD3Transformer2DModel& model) { + return std::make_unique(model); + }), + py::arg("model"), "SD3Transformer2DModel model" + R"( + SD3Transformer2DModel class + model (SD3Transformer2DModel): SD3Transformer2DModel model + )"); + + py::class_(sd3_transformer_2d_model, "Config", "This class is used for storing SD3Transformer2DModel config.") + .def(py::init([](const std::filesystem::path& config_path) { + return std::make_unique(config_path); + }), + py::arg("config_path")) + .def_readwrite("in_channels", &ov::genai::SD3Transformer2DModel::Config::in_channels) + .def_readwrite("sample_size", &ov::genai::SD3Transformer2DModel::Config::sample_size) + .def_readwrite("patch_size", &ov::genai::SD3Transformer2DModel::Config::patch_size) + .def_readwrite("joint_attention_dim", &ov::genai::SD3Transformer2DModel::Config::joint_attention_dim); + + sd3_transformer_2d_model.def("get_config", &ov::genai::SD3Transformer2DModel::get_config) + .def("reshape", &ov::genai::SD3Transformer2DModel::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width"), py::arg("tokenizer_model_max_length")) + // .def("set_adapters", &ov::genai::SD3Transformer2DModel::set_adapters, py::arg("adapters")) + 
.def("infer", &ov::genai::SD3Transformer2DModel::infer, py::arg("sample"), py::arg("timestep")) + .def("set_hidden_states", &ov::genai::SD3Transformer2DModel::set_hidden_states, py::arg("tensor_name"), py::arg("encoder_hidden_states")) + .def( + "compile", + [](ov::genai::SD3Transformer2DModel& self, + const std::string& device, + const py::kwargs& kwargs + ) { + self.compile(device, pyutils::kwargs_to_any_map(kwargs)); + }, + py::arg("device"), "device on which inference will be done", + R"( + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + )"); +} + +void init_flux_transformer_2d_model(py::module_& m) { + auto flux_transformer_2d_model = py::class_(m, "FluxTransformer2DModel", "FluxTransformer2DModel class.") + .def(py::init([](const std::filesystem::path& root_dir) { + return std::make_unique(root_dir); + }), + py::arg("root_dir"), "Model root directory", + R"( + FluxTransformer2DModel class + root_dir (os.PathLike): Model root directory. + )") + .def(py::init([]( + const std::filesystem::path& root_dir, + const std::string& device, + const py::kwargs& kwargs ) { + return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); + }), + py::arg("root_dir"), "Model root directory", + py::arg("device"), "Device on which inference will be done", + R"( + FluxTransformer2DModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. 
+ )") + .def(py::init([](const ov::genai::FluxTransformer2DModel& model) { + return std::make_unique(model); + }), + py::arg("model"), "FluxTransformer2DModel model" + R"( + FluxTransformer2DModel class + model (FluxTransformer2DModel): FluxTransformer2DModel model + )"); + + py::class_(flux_transformer_2d_model, "Config", "This class is used for storing FluxTransformer2DModel config.") + .def(py::init([](const std::filesystem::path& config_path) { + return std::make_unique(config_path); + }), + py::arg("config_path")) + .def_readwrite("in_channels", &ov::genai::FluxTransformer2DModel::Config::in_channels) + .def_readwrite("default_sample_size", &ov::genai::FluxTransformer2DModel::Config::m_default_sample_size); + + flux_transformer_2d_model.def("get_config", &ov::genai::FluxTransformer2DModel::get_config) + .def("reshape", &ov::genai::FluxTransformer2DModel::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width"), py::arg("tokenizer_model_max_length")) + // .def("set_adapters", &ov::genai::FluxTransformer2DModel::set_adapters, py::arg("adapters")) + .def("infer", &ov::genai::FluxTransformer2DModel::infer, py::arg("sample"), py::arg("timestep")) + .def("set_hidden_states", &ov::genai::FluxTransformer2DModel::set_hidden_states, py::arg("tensor_name"), py::arg("encoder_hidden_states")) + .def( + "compile", + [](ov::genai::FluxTransformer2DModel& self, + const std::string& device, + const py::kwargs& kwargs + ) { + self.compile(device, pyutils::kwargs_to_any_map(kwargs)); + }, + py::arg("device"), "device on which inference will be done", + R"( + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. 
+ )"); +} + +void init_autoencoder_kl(py::module_& m) { + auto autoencoder_kl = py::class_(m, "AutoencoderKL", "AutoencoderKL class.") + .def(py::init([](const std::filesystem::path& vae_decoder_path) { return std::make_unique(vae_decoder_path); }), - py::arg("vae_decoder_path"), "VAE decoder directory", + py::arg("vae_decoder_path"), "VAE decoder directory", R"( AutoencoderKL class initialized only with decoder model. vae_decoder_path (os.PathLike): VAE decoder directory. @@ -194,7 +431,7 @@ void init_autoencoder_kl(py::module_& m) { ) { return std::make_unique(vae_decoder_path, device, pyutils::kwargs_to_any_map(kwargs)); }), - py::arg("vae_decoder_path"), "Root directory", + py::arg("vae_decoder_path"), "Root directory", py::arg("device"), "Device on which inference will be done", R"( AutoencoderKL class initialized only with decoder model. @@ -219,10 +456,8 @@ void init_autoencoder_kl(py::module_& m) { vae_decoder_path (os.PathLike): VAE decoder directory. device (str): Device on which inference will be done. kwargs: Device properties. 
- )") - .def(py::init([]( - const ov::genai::AutoencoderKL& model - ) { + )") + .def(py::init([](const ov::genai::AutoencoderKL& model) { return std::make_unique(model); }), py::arg("model"), "AutoencoderKL model" @@ -232,11 +467,9 @@ void init_autoencoder_kl(py::module_& m) { )"); py::class_(autoencoder_kl, "Config", "This class is used for storing AutoencoderKL config.") - .def(py::init([]( - const std::filesystem::path& config_path - ) { + .def(py::init([](const std::filesystem::path& config_path) { return std::make_unique(config_path); - }), + }), py::arg("config_path")) .def_readwrite("in_channels", &ov::genai::AutoencoderKL::Config::in_channels) .def_readwrite("latent_channels", &ov::genai::AutoencoderKL::Config::latent_channels) @@ -244,9 +477,9 @@ void init_autoencoder_kl(py::module_& m) { .def_readwrite("scaling_factor", &ov::genai::AutoencoderKL::Config::scaling_factor) .def_readwrite("block_out_channels", &ov::genai::AutoencoderKL::Config::block_out_channels); - autoencoder_kl.def("reshape", &ov::genai::AutoencoderKL::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width")); - autoencoder_kl.def( - "compile", + autoencoder_kl.def("reshape", &ov::genai::AutoencoderKL::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width")) + .def( + "compile", [](ov::genai::AutoencoderKL& self, const std::string& device, const py::kwargs& kwargs @@ -258,80 +491,9 @@ void init_autoencoder_kl(py::module_& m) { Compiles the model. device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Device properties. 
- )"); - autoencoder_kl.def("decode", &ov::genai::AutoencoderKL::decode, py::arg("latent")); - autoencoder_kl.def("encode", &ov::genai::AutoencoderKL::encode, py::arg("image"), py::arg("generator")); - autoencoder_kl.def("get_config", &ov::genai::AutoencoderKL::get_config); - autoencoder_kl.def("get_vae_scale_factor", &ov::genai::AutoencoderKL::get_vae_scale_factor); -} - -void init_clip_text_model_with_projection(py::module_& m) { - auto clip_text_model_with_projection = py::class_(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.") - .def(py::init([]( - const std::filesystem::path& root_dir - ) { - ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(root_dir); - }), - py::arg("root_dir"), "Model root directory", - R"( - CLIPTextModelWithProjection class - root_dir (os.PathLike): Model root directory. - )") - .def(py::init([]( - const std::filesystem::path& root_dir, - const std::string& device, - const py::kwargs& kwargs - ) { - ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); - }), - py::arg("root_dir"), "Model root directory", - py::arg("device"), "Device on which inference will be done", - R"( - CLIPTextModelWithProjection class - root_dir (os.PathLike): Model root directory. - device (str): Device on which inference will be done. - kwargs: Device properties. 
- )") - .def(py::init([]( - const ov::genai::CLIPTextModelWithProjection& model - ) { - return std::make_unique(model); - }), - py::arg("model"), "CLIPTextModelWithProjection model" - R"( - CLIPTextModelWithProjection class - model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model - )"); - - py::class_(clip_text_model_with_projection, "Config", "This class is used for storing CLIPTextModelWithProjection config.") - .def(py::init([]( - const std::filesystem::path& config_path - ) { - return std::make_unique(config_path); - }), - py::arg("config_path")) - .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModelWithProjection::Config::max_position_embeddings) - .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModelWithProjection::Config::num_hidden_layers); - - clip_text_model_with_projection.def("reshape", &ov::genai::CLIPTextModelWithProjection::reshape, py::arg("batch_size")); - clip_text_model_with_projection.def("infer", &ov::genai::CLIPTextModelWithProjection::infer, py::arg("pos_prompt"), py::arg("neg_prompt"), py::arg("do_classifier_free_guidance")); - clip_text_model_with_projection.def("get_config", &ov::genai::CLIPTextModelWithProjection::get_config); - clip_text_model_with_projection.def("get_output_tensor", &ov::genai::CLIPTextModelWithProjection::get_output_tensor, py::arg("idx")); - clip_text_model_with_projection.def("set_adapters", &ov::genai::CLIPTextModelWithProjection::set_adapters, py::arg("adapters")); - clip_text_model_with_projection.def( - "compile", - [](ov::genai::CLIPTextModelWithProjection& self, - const std::string& device, - const py::kwargs& kwargs - ) { - self.compile(device, pyutils::kwargs_to_any_map(kwargs)); - }, - py::arg("device"), "device on which inference will be done", - R"( - Compiles the model. - device (str): Device to run the model on (e.g., CPU, GPU). - kwargs: Device properties. 
- )"); + )") + .def("decode", &ov::genai::AutoencoderKL::decode, py::arg("latent")) + .def("encode", &ov::genai::AutoencoderKL::encode, py::arg("image"), py::arg("generator")) + .def("get_config", &ov::genai::AutoencoderKL::get_config) + .def("get_vae_scale_factor", &ov::genai::AutoencoderKL::get_vae_scale_factor); } diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index d0d2f18a92..64ea64feb0 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -72,7 +72,10 @@ auto text2image_generate_docstring = R"( void init_clip_text_model(py::module_& m); void init_clip_text_model_with_projection(py::module_& m); +void init_t5_encoder_model(py::module_& m); void init_unet2d_condition_model(py::module_& m); +void init_sd3_transformer_2d_model(py::module_& m); +void init_flux_transformer_2d_model(py::module_& m); void init_autoencoder_kl(py::module_& m); void init_image_generation_pipelines(py::module_& m) { @@ -92,7 +95,10 @@ void init_image_generation_pipelines(py::module_& m) { // init image generation models init_clip_text_model(m); init_clip_text_model_with_projection(m); + init_t5_encoder_model(m); init_unet2d_condition_model(m); + init_sd3_transformer_2d_model(m); + init_flux_transformer_2d_model(m); init_autoencoder_kl(m); auto image_generation_scheduler = py::class_>(m, "Scheduler", "Scheduler for image generation pipelines."); @@ -167,6 +173,8 @@ void init_image_generation_pipelines(py::module_& m) { .def_static("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) .def_static("latent_consistency_model", &ov::genai::Text2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) .def_static("stable_diffusion_xl", &ov::genai::Text2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), 
py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae")) + .def_static("stable_diffusion_3", &ov::genai::Text2ImagePipeline::stable_diffusion_3, py::arg("scheduler"), py::arg("clip_text_model_1"), py::arg("clip_text_model_2"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae")) + .def_static("flux", &ov::genai::Text2ImagePipeline::flux, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae")) .def( "compile", [](ov::genai::Text2ImagePipeline& pipe,