diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index a1e1b3934b..d0b753b358 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -3,6 +3,7 @@
 import shutil
 import pytest
 import logging
+import tempfile
 
 logging.basicConfig(level=logging.INFO)
 
@@ -10,9 +11,10 @@
 
 
 def run_wwb(args):
-    logger.info(" ".join(["wwb"] + args))
+    logger.info(" ".join(["TRANSFORMERS_VERBOSITY=debug wwb"] + args))
     result = subprocess.run(["wwb"] + args, capture_output=True, text=True)
     logger.info(result)
+    print(" ".join(["TRANSFORMERS_VERBOSITY=debug wwb"] + args))
     return result
 
 
@@ -42,9 +44,11 @@ def test_image_model_types(model_id, model_type, backend):
     ]
     if backend == "hf":
         wwb_args.append("--hf")
+    elif backend == "genai":
+        wwb_args.append("--genai")
 
     result = run_wwb(wwb_args)
-    print(f"WWB result: {result}, {result.stderr}")
+    print(result.stderr, result.stdout)
 
     try:
         os.remove(GT_FILE)
@@ -58,6 +62,64 @@ def test_image_model_types(model_id, model_type, backend):
     assert "## Reference text" not in result.stderr
 
 
+@pytest.mark.parametrize(
+    ("model_id", "model_type"),
+    [
+        ("echarlaix/tiny-random-stable-diffusion-xl", "text-to-image"),
+    ],
+)
+def test_image_model_genai(model_id, model_type):
+    GT_FILE = "test_sd.json"
+    MODEL_PATH = tempfile.mkdtemp()
+
+    result = subprocess.run(["optimum-cli", "export",
+                             "openvino", "-m", model_id,
+                             MODEL_PATH], capture_output=True, text=True)
+    assert result.returncode == 0
+
+    wwb_args = [
+        "--base-model",
+        MODEL_PATH,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+    ]
+    result = run_wwb(wwb_args)
+    assert result.returncode == 0
+
+    wwb_args = [
+        "--target-model",
+        MODEL_PATH,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--genai",
+    ]
+    result = run_wwb(wwb_args)
+
+    try:
+        os.remove(GT_FILE)
+    except OSError:
+        pass
+    shutil.rmtree("reference", ignore_errors=True)
+    shutil.rmtree("target", ignore_errors=True)
+    shutil.rmtree(MODEL_PATH, ignore_errors=True)
+
+    assert result.returncode == 0
+    assert "Metrics for model" in result.stderr
+    assert "## Reference text" not in result.stderr
+
+
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
@@ -84,6 +146,8 @@ def test_image_custom_dataset(model_id, model_type, backend):
     ]
     if backend == "hf":
         wwb_args.append("--hf")
+    elif backend == "genai":
+        wwb_args.append("--genai")
 
     result = run_wwb(wwb_args)
 
diff --git a/tools/who_what_benchmark/whowhatbench/registry.py b/tools/who_what_benchmark/whowhatbench/registry.py
index 867b53e27a..83bb8b1c06 100644
--- a/tools/who_what_benchmark/whowhatbench/registry.py
+++ b/tools/who_what_benchmark/whowhatbench/registry.py
@@ -35,3 +35,7 @@ def score(self, model, **kwargs):
     @abstractmethod
     def worst_examples(self, top_k: int = 5, metric="similarity"):
         pass
+
+    @abstractmethod
+    def get_generation_fn(self):
+        pass
diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
index 79dda2dcc9..16c781c0eb 100644
--- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -5,6 +5,7 @@
 from tqdm import tqdm
 from transformers import set_seed
 import torch
+import openvino_genai
 
 from .registry import register_evaluator, BaseEvaluator
 
@@ -26,6 +27,17 @@
 }
 
 
+class Generator(openvino_genai.Generator):
+    def __init__(self, seed, rng, mu=0.0, sigma=1.0):
+        openvino_genai.Generator.__init__(self)
+        self.mu = mu
+        self.sigma = sigma
+        self.rng = rng
+
+    def next(self):
+        return torch.normal(torch.tensor(self.mu), self.sigma, generator=self.rng).item()
+
+
 @register_evaluator("text-to-image")
 class Text2ImageEvaluator(BaseEvaluator):
     def __init__(
@@ -41,6 +53,7 @@ def __init__(
         num_samples=None,
         gen_image_fn=None,
         seed=42,
+        is_genai=False,
     ) -> None:
         assert (
             base_model is not None or gt_data is not None
@@ -57,17 +70,25 @@ def __init__(
         self.similarity = ImageSimilarity(similarity_model_id)
         self.last_cmp = None
         self.gt_dir = os.path.dirname(gt_data)
+        self.generation_fn = gen_image_fn
+        self.is_genai = is_genai
+
         if base_model:
+            base_model.resolution = self.resolution
             self.gt_data = self._generate_data(
                 base_model, gen_image_fn, os.path.join(self.gt_dir, "reference")
             )
         else:
            self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
 
+    def get_generation_fn(self):
+        return self.generation_fn
+
     def dump_gt(self, csv_name: str):
         self.gt_data.to_csv(csv_name)
 
     def score(self, model, gen_image_fn=None):
+        model.resolution = self.resolution
         predictions = self._generate_data(
             model, gen_image_fn, os.path.join(self.gt_dir, "target")
         )
@@ -100,12 +121,13 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
     def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
         if hasattr(model, "reshape") and self.resolution is not None:
-            model.reshape(
-                batch_size=1,
-                height=self.resolution[0],
-                width=self.resolution[1],
-                num_images_per_prompt=1,
-            )
+            if gen_image_fn is None:
+                model.reshape(
+                    batch_size=1,
+                    height=self.resolution[0],
+                    width=self.resolution[1],
+                    num_images_per_prompt=1,
+                )
 
         def default_gen_image_fn(model, prompt, num_inference_steps, generator=None):
             output = model(
@@ -118,7 +140,7 @@ def default_gen_image_fn(model, prompt, num_inference_steps, generator=None):
             )
             return output.images[0]
 
-        gen_image_fn = gen_image_fn or default_gen_image_fn
+        generation_fn = gen_image_fn or default_gen_image_fn
 
         if self.test_data:
             if isinstance(self.test_data, str):
@@ -144,13 +166,15 @@ def default_gen_image_fn(model, prompt, num_inference_steps, generator=None):
 
         if not os.path.exists(image_dir):
             os.makedirs(image_dir)
+
         for i, prompt in tqdm(enumerate(prompts), desc="Evaluate pipeline"):
             set_seed(self.seed)
-            image = gen_image_fn(
+            rng = rng.manual_seed(self.seed)
+            image = generation_fn(
                 model,
                 prompt,
                 self.num_inference_steps,
-                generator=rng.manual_seed(self.seed),
+                generator=Generator(self.seed, rng) if self.is_genai else rng,
             )
             image_path = os.path.join(image_dir, f"{i}.png")
             image.save(image_path)
diff --git a/tools/who_what_benchmark/whowhatbench/text_evaluator.py b/tools/who_what_benchmark/whowhatbench/text_evaluator.py
index 436d2be034..8672105489 100644
--- a/tools/who_what_benchmark/whowhatbench/text_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text_evaluator.py
@@ -121,6 +121,7 @@ def __init__(
         self.generation_config = generation_config
         self.generation_config_base = generation_config
         self.seqs_per_request = seqs_per_request
+        self.generation_fn = gen_answer_fn
 
         if self.generation_config is not None:
             assert self.seqs_per_request is not None
@@ -151,6 +152,9 @@ def __init__(
 
         self.last_cmp = None
 
+    def get_generation_fn(self):
+        return self.generation_fn
+
     def dump_gt(self, csv_name: str):
         self.gt_data.to_csv(csv_name)
 
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 2cf9ad8024..72365dcd41 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -3,6 +3,7 @@
 import os
 import json
 import pandas as pd
+from PIL import Image
 import logging
 from datasets import load_dataset
 from diffusers import DiffusionPipeline
@@ -35,9 +36,14 @@ class GenAIModelWrapper:
     A helper class to store additional attributes for GenAI models
     """
 
-    def __init__(self, model, model_dir):
+    def __init__(self, model, model_dir, model_type):
         self.model = model
-        self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+        self.model_type = model_type
+
+        if model_type == "text":
+            self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+        elif model_type == "text-to-image":
+            self.config = DiffusionPipeline.load_config(model_dir, trust_remote_code=True)
 
     def __getattr__(self, attr):
         if attr in self.__dict__:
@@ -53,40 +59,41 @@ def load_text_genai_pipeline(model_dir, device="CPU"):
         logger.error("Failed to import openvino_genai package. Please install it.")
         exit(-1)
     logger.info("Using OpenVINO GenAI API")
-    return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device), model_dir)
+    return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device), model_dir, "text")
 
 
 def load_text_model(
     model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
 ):
-    if use_hf:
-        logger.info("Using HF Transformers API")
-        return AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, device_map=device.lower()
-        )
-
-    if use_genai:
-        return load_text_genai_pipeline(model_id, device)
-
     if ov_config:
         with open(ov_config) as f:
             ov_options = json.load(f)
     else:
         ov_options = None
-    try:
-        model = OVModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, device=device, ov_config=ov_options
-        )
-    except ValueError:
-        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
-        model = OVModelForCausalLM.from_pretrained(
-            model_id,
-            config=config,
-            trust_remote_code=True,
-            use_cache=True,
-            device=device,
-            ov_config=ov_options,
+
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id, trust_remote_code=True, device_map=device.lower()
         )
+    elif use_genai:
+        model = load_text_genai_pipeline(model_id, device)
+    else:
+        try:
+            model = OVModelForCausalLM.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_options
+            )
+        except ValueError:
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+            model = OVModelForCausalLM.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_options,
+            )
+
     return model
 
 
@@ -95,6 +102,20 @@ def load_text_model(
 }
 
 
+def load_text2image_genai_pipeline(model_dir, device="CPU"):
+    try:
+        import openvino_genai
+    except ImportError:
+        logger.error("Failed to import openvino_genai package. Please install it.")
Please install it.") + exit(-1) + logger.info("Using OpenVINO GenAI API") + return GenAIModelWrapper( + openvino_genai.Text2ImagePipeline(model_dir, device), + model_dir, + "text-to-image" + ) + + def load_text2image_model( model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False ): @@ -104,25 +125,28 @@ def load_text2image_model( else: ov_options = None - if use_hf: - return DiffusionPipeline.from_pretrained(model_id, trust_remote_code=True) + if use_genai: + model = load_text2image_genai_pipeline(model_id, device) + elif use_hf: + model = DiffusionPipeline.from_pretrained(model_id, trust_remote_code=True) + else: + TEXT2IMAGEPipeline = TEXT2IMAGE_TASK2CLASS[model_type] - TEXT2IMAGEPipeline = TEXT2IMAGE_TASK2CLASS[model_type] + try: + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) + except ValueError: + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + use_cache=True, + device=device, + ov_config=ov_options, + ) - try: - model = TEXT2IMAGEPipeline.from_pretrained( - model_id, trust_remote_code=True, device=device, ov_config=ov_options - ) - except ValueError: - config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) - model = TEXT2IMAGEPipeline.from_pretrained( - model_id, - config=config, - trust_remote_code=True, - use_cache=True, - device=device, - ov_config=ov_options, - ) return model @@ -278,6 +302,24 @@ def parse_args(): action="store_true", help="Use LLMPipeline from transformers library to instantiate the model.", ) + parser.add_argument( + "--image-size", + type=int, + default=512, + help="Text-to-image specific parameter that defines the image resolution.", + ) + parser.add_argument( + "--num-inference-steps", + type=int, + default=4, + help="Text-to-image specific parameter that defines the number of denoising steps.", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Text-to-image specific parameter that defines the seed value.", + ) return parser.parse_args() @@ -340,6 +382,18 @@ def genai_gen_answer(model, tokenizer, question, max_new_tokens, skip_question): return out +def genai_gen_image(model, prompt, num_inference_steps, generator=None): + image_tensor = model.generate( + prompt, + width=model.resolution[0], + height=model.resolution[1], + num_inference_steps=num_inference_steps, + random_generator=generator + ) + image = Image.fromarray(image_tensor.data[0]) + return image + + def get_evaluator(base_model, args): # config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) # task = TasksManager.infer_task_from_model(config._name_or_path) @@ -368,6 +422,11 @@ def get_evaluator(base_model, args): gt_data=args.gt_data, test_data=prompts, num_samples=args.num_samples, + resolution=(args.image_size, args.image_size), + num_inference_steps=args.num_inference_steps, + gen_image_fn=genai_gen_image if args.genai else None, + is_genai=args.genai, + seed=args.seed, ) else: raise ValueError(f"Unsupported task: {task}") @@ -446,7 +505,7 @@ def main(): args.genai, ) all_metrics_per_question, all_metrics = evaluator.score( - target_model, genai_gen_answer if args.genai else None + target_model, evaluator.get_generation_fn() if args.genai else None ) logger.info("Metrics for model: %s", args.target_model) logger.info(all_metrics)