diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 9384477eb9..eb1ed88467 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -15,7 +15,7 @@ import logging import os from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union import openvino @@ -339,11 +339,11 @@ def compile(self): if self.request is None: logger.info(f"Compiling the model to {self._device} ...") ov_config = {**self.ov_config} - if "CACHE_DIR" not in self.ov_config.keys(): - # Set default CACHE_DIR only if it is not set. + if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + # Set default CACHE_DIR only if it is not set, and if the model is not in a temporary directory cache_dir = Path(self.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) - logger.info(f"Set CACHE_DIR to {str(cache_dir)}") + logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.request = core.compile_model(self.model, self._device, ov_config) def _reshape( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1ca0b93643..2707260606 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -17,7 +17,7 @@ import os import shutil from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Any, Dict, List, Optional, Union import numpy as np @@ -539,10 +539,8 @@ def __init__( self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - - # TODO : disable if self._model_dir tmp directory - if "CACHE_DIR" not in self.ov_config: - self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name) + if "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()): + self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") def _compile(self): if self.request is None: diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 87cd18d875..6b759054d0 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -14,6 +14,7 @@ import logging from pathlib import Path +from tempfile import gettempdir from typing import Dict, Optional, Tuple import numpy as np @@ -202,31 +203,32 @@ def __init__( self.decoder_with_past = None enable_compilation = kwargs.get("compile", True) encoder_cache_dir = Path(self.model_save_dir).joinpath("encoder_cache") - encoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_encoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(encoder_cache_dir)} - ) + ov_encoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_encoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_encoder_config["CACHE_DIR"] = str(encoder_cache_dir) + self.encoder = OVEncoder( self.encoder_model, self._device, ov_encoder_config, main_input_name=self.main_input_name ) + decoder_cache_dir = Path(self.model_save_dir).joinpath("decoder_cache") - decoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_cache_dir)} - ) + ov_decoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_decoder_config["CACHE_DIR"] = str(decoder_cache_dir) + self.decoder = OVDecoder(self.decoder_model, self._device, ov_decoder_config) + if self.use_cache: decoder_past_cache_dir = Path(self.model_save_dir).joinpath("decoder_past_cache") - decoder_past_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_past_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_past_cache_dir)} - ) + ov_decoder_past_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_past_config.keys() and not str(self.model_save_dir).startswith( + gettempdir() + ): + ov_decoder_past_config["CACHE_DIR"] = str(decoder_past_cache_dir) + self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, ov_decoder_past_config) if enable_compilation: self.compile() diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index bf1a007844..f3978b2965 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -111,6 +111,19 @@ def test_load_from_hub_and_save_model(self): self.assertIsInstance(loaded_model.config, PretrainedConfig) loaded_model_outputs = loaded_model(**tokens) + # Test that model caching is automatically enabled + openvino_cache_dir = loaded_model.model_save_dir / "model_cache" + self.assertTrue(openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(openvino_cache_dir.glob("*.blob"))), 1) + + # Test specifying ov_config with throughput hint and manual cache dir + manual_openvino_cache_dir = loaded_model.model_save_dir / "manual_model_cache" + ov_config = {"CACHE_DIR": str(manual_openvino_cache_dir), "PERFORMANCE_HINT": "THROUGHPUT"} + loaded_model = OVModelForSequenceClassification.from_pretrained(self.OV_MODEL_ID, ov_config=ov_config) + self.assertTrue(manual_openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(manual_openvino_cache_dir.glob("*.blob"))), 1) + self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") + with tempfile.TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) @@ -120,6 +133,7 @@ def test_load_from_hub_and_save_model(self): outputs = model(**tokens) self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits)) + del loaded_model del model gc.collect() @@ -276,6 +290,10 @@ def test_pipeline(self, model_arch): self.assertTrue(not model.is_dynamic) self.assertGreaterEqual(outputs[0]["score"], 0.0) self.assertIsInstance(outputs[0]["label"], str) + # Test that model caching was not automatically enabled for exported model + openvino_cache_dir = model.model_save_dir / "model_cache" + self.assertFalse(openvino_cache_dir.is_dir()) + del model del pipe gc.collect() @@ -466,7 +484,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "pegasus", ) GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -539,29 +556,17 @@ def test_compare_with_and_without_past_key_values(self): tokens = tokenizer("This is a sample input", return_tensors="pt") model_with_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) - # Warmup - _ = model_with_pkv.generate(**tokens) - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) model_without_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False) - - # Warmup - _ = model_without_pkv.generate(**tokens) - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + del model_with_pkv del model_without_pkv gc.collect()