From b845f8e5b29fc847f643f3ef2524b5b7e3ae918f Mon Sep 17 00:00:00 2001 From: Vinayak Baddi <68580231+vbaddi@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:43:44 +0530 Subject: [PATCH 1/4] nit: update the toml file with correct huggingface-hub version for finetune stack (#217) Signed-off-by: Swati Allabadi Signed-off-by: Shubham Agrawal Signed-off-by: Rishin Raj Signed-off-by: vbaddi Co-authored-by: Swati Allabadi Co-authored-by: Swati Allabadi Co-authored-by: shubhagr-quic Co-authored-by: Rishin Raj --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7234d72b..9867181c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ classifiers = [ requires-python = ">=3.8,<3.11" dependencies = [ "transformers==4.45.2", + "huggingface-hub==0.27.0", "peft==0.13.2", "datasets==2.20.0", "fsspec==2023.6.0", From 314009eebfb87206988ae6373d8ff2017f1846aa Mon Sep 17 00:00:00 2001 From: Onkar Chougule <168134249+ochougul@users.noreply.github.com> Date: Sat, 11 Jan 2025 11:48:19 +0530 Subject: [PATCH 2/4] Scratch prefix caching (#218) Signed-off-by: Onkar Chougule --- .../transformers/models/modeling_auto.py | 30 ++- scripts/Jenkinsfile | 4 +- tests/finetune/test_finetune.py | 1 + tests/peft/lora/test_lora_model.py | 1 - tests/text_generation/test_text_generation.py | 1 + .../models/test_prefix_caching.py | 183 ++++++++++++++++++ tests/transformers/spd/test_spd_inference.py | 1 + 7 files changed, 212 insertions(+), 9 deletions(-) create mode 100644 tests/transformers/models/test_prefix_caching.py diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index f565cbca..ff657d29 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -209,6 +209,7 @@ def export(self, export_dir: Optional[str] = None) -> str: 2: "ctx_len", } output_names = ["logits"] + for i in range(self.num_layers): for kv in ["key", "value"]: example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) @@ -240,6 +241,7 @@ def compile( ctx_len: int = 128, batch_size: int = 1, full_batch_size: Optional[int] = None, + kv_cache_batch_size: Optional[int] = None, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, @@ -291,15 +293,28 @@ def compile( if self.continuous_batching and full_batch_size is None: raise TypeError("missing required argument: 'full_batch_size'") + if kv_cache_batch_size and not full_batch_size: + raise ValueError( + "Prefix caching is enabled only for continuous batching as of now. Please pass `full_batch_size` argument and make sure you pass `continuous_batching=True` in the `from_pretrained` call" + ) + + kv_cache_batch_size = ( + kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size) + ) # Define prefill specialization prefill_specialization = { # Prefill is always run with single BS for continuous batching. "batch_size": 1 if self.continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + # TODO: should be renamed to kv_cache_batch_size in specialzation too } - prefill_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None - prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else None + prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ... 
+ if self.continuous_batching: + prefill_specialization.update({"full_batch_size": kv_cache_batch_size}) + else: + prefill_specialization.update({"batch_size": kv_cache_batch_size}) + prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ... specializations = [ prefill_specialization, ] @@ -311,8 +326,11 @@ def compile( "seq_len": num_speculative_tokens + 1 if self.is_tlm else 1, "ctx_len": ctx_len, } - decode_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None - decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else None + if self.continuous_batching: + decode_specialization.update({"full_batch_size": kv_cache_batch_size}) + else: + decode_specialization.update({"batch_size": kv_cache_batch_size}) + decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... specializations.append(decode_specialization) if enable_qnn: @@ -363,7 +381,7 @@ def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = [0], + device_id: List[int] = None, runtime_ai100: bool = True, **kwargs, ): @@ -569,7 +587,7 @@ def compile( def generate( self, inputs: torch.Tensor, - device_ids: List[int] = [0], + device_ids: List[int] = None, runtime_ai100: bool = True, ) -> Union[torch.Tensor, np.ndarray]: """ diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index d1bb02a2..0d802b83 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -48,7 +48,7 @@ pipeline { } stage('Run Non-CLI QAIC Tests') { steps { - timeout(time: 70, unit: 'MINUTES') { + timeout(time: 200, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && @@ -56,7 +56,7 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 3 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 4 --junitxml=tests/tests_log2.xml && deactivate" ''' } diff --git a/tests/finetune/test_finetune.py b/tests/finetune/test_finetune.py index 4d7d061f..45330cad 100644 --- a/tests/finetune/test_finetune.py +++ b/tests/finetune/test_finetune.py @@ -26,6 +26,7 @@ def clean_up(path): # TODO:enable this once docker is available +@pytest.mark.on_qaic @pytest.mark.skip(reason="eager docker not available in sdk") @pytest.mark.parametrize( "model_name,max_eval_step,max_train_step,intermediate_step_save,context_length,run_validation,use_peft,device", diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index a91555b3..a1bea604 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -229,6 +229,5 @@ def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, qeff_model.generate( tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), prompts=prompts, - device_id=[0], prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], ) diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py index 15f4b7dc..b8915859 100644 --- a/tests/text_generation/test_text_generation.py +++ b/tests/text_generation/test_text_generation.py @@ -44,6 +44,7 @@ def load_causal_lm_model(model_config): # Use @pytest.mark.parametrize to apply the configurations +@pytest.mark.on_qaic 
@pytest.mark.parametrize("model_name, n_layer, full_batch_size, max_gen_len", configs) def test_generate_text_stream( model_name: str, diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py new file mode 100644 index 00000000..fa79f33c --- /dev/null +++ b/tests/transformers/models/test_prefix_caching.py @@ -0,0 +1,183 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import numpy as np +import pytest +from transformers import AutoTokenizer + +from QEfficient.generation.text_generation_inference import TextGeneration +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + +test_models = ["gpt2"] + + +# The test should first generate output with some prefix+suffix1 or batch_id and then confirm that we are still able to execute of prefix+suffix2 on same batch id and getting correct output. +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", test_models) +def test_simple_prefix_caching(model_name): + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True) + qeff_model.compile( + prefill_seq_len=128, + ctx_len=256, + full_batch_size=2, + kv_cache_batch_size=4, + num_cores=14, + ) + + prefixes = ["Once upon a time ", "Once upon a time "] + suffixes1 = ["in a land far away", "there was a small village"] + suffixes2 = ["a little girl", "in a bustling city"] + + tokenizer = AutoTokenizer.from_pretrained(model_name) + + generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256) + + prompts = [pref + suff for pref, suff in zip(prefixes, suffixes1)] + + # generation for batch_indices = 0, 1 + prompts_exec_info = generator.generate(prompts) + ############################## + # generation for batch_indices + ############################## + # Run prefill for indices 2, 3 with same prompts + out2, pos2, gen_len2 = generator._qaic_model.run_prefill( + prompts[0], generation_len=None, decode_batch_id=np.array(2, dtype=np.int64).reshape(1, 1) + ) + out3, pos3, gen_len3 = generator._qaic_model.run_prefill( + prompts[1], generation_len=None, decode_batch_id=np.array(3, dtype=np.int64).reshape(1, 1) + ) + + # Run decode for batch indices 2, 3 + decode_inputs = { + "input_ids": np.array([[out2["logits"].argmax(2)[0][0]], [out3["logits"].argmax(2)[0][0]]]), + "position_ids": np.array([[pos2[0][0]], [pos3[0][0]]]), + "batch_index": np.array([[2], [3]], dtype=np.int64), + } + + # Set logits placeholder for decode + logits_out_placeholder = np.zeros( + ( + generator._qaic_model.full_batch_size, + generator._qaic_model._decode_seq_len, + generator._qaic_model._vocab_size, + ), + dtype=np.float32, + ) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + generation_outputs = [] + for i in range(gen_len2): + generation_outputs.append(decode_inputs["input_ids"]) + outputs = generator._qaic_model._session.run(decode_inputs) + logits = outputs["logits"] + if len(logits.shape) == 2: + logits = np.expand_dims(logits, 1) + next_token_id = logits.argmax(2) + + decode_inputs["input_ids"] = next_token_id + decode_inputs["position_ids"] += 1 + + assert np.all(generator._qaic_model.generated_ids[0, :gen_len2] == [int(val[0]) for val in generation_outputs]) + assert 
np.all(generator._qaic_model.generated_ids[1, :gen_len2] == [int(val[1]) for val in generation_outputs]) + + ############################## + # Now rerun with cached prefix on 0th index with prompt3 and use -1 for 1st index + ############################## + + nprompts = [pref + suff for pref, suff in zip(prefixes, suffixes2)] + + ## Prefill run on index 0 + prompt = nprompts[0] + inputs = tokenizer(prompt, return_tensors="np", padding=True) + position_ids = inputs["attention_mask"].sum(1, keepdims=True) + padded_len = inputs["input_ids"].shape[1] + num_chunks = -(padded_len // -generator._qaic_model._prefill_seq_len) + padded_len = num_chunks * generator._qaic_model._prefill_seq_len # Convert to a multiple of prompt_len + + # Initialize variables specific to request + # Calculate the max generation length. + max_gen_len = generator._qaic_model._ctx_len - position_ids.max() + + # Set the prefill logic buffer + logits_out_placeholder = np.zeros((1, 1, generator._qaic_model._vocab_size), dtype=np.float32) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) + inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) + inputs.pop("token_type_ids", None) + inputs["batch_index"] = np.array([[0]], dtype=np.int64) + norm_outputs = generator._qaic_model._session.run(inputs) + inputs["input_ids"][:, :3] = inputs["input_ids"][:, 4:7] + inputs["input_ids"][:, 3:] = 50256 + inputs["position_ids"][:, :3] = inputs["position_ids"][:, 4:7] + inputs["position_ids"][:, 3:] = -1 + mod_outputs = generator._qaic_model._session.run(inputs) + assert (mod_outputs["logits"] == norm_outputs["logits"]).all() + decode_inputs = { + "input_ids": np.array([[mod_outputs["logits"].argmax(2)[0][0]], [0]]), + "position_ids": np.array([[position_ids[0][0]], [-1]]), + "batch_index": np.array([[0], [1]], dtype=np.int64), + } + + # Set logits placeholder for decode + logits_out_placeholder = np.zeros( + ( + generator._qaic_model.full_batch_size, + generator._qaic_model._decode_seq_len, + generator._qaic_model._vocab_size, + ), + dtype=np.float32, + ) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + generation_outputs = [] + for i in range(max_gen_len): + generation_outputs.append(decode_inputs["input_ids"]) + outputs = generator._qaic_model._session.run(decode_inputs) + logits = outputs["logits"] + if len(logits.shape) == 2: + logits = np.expand_dims(logits, 1) + next_token_id = logits.argmax(2) + + decode_inputs["input_ids"] = next_token_id + decode_inputs["position_ids"][0][0] += 1 + + # TODO: add a check if this matches normal execution for same prompt + ############## + # Now run decode on 1st index again with mod_inputs and check if output is correct + ############## + decode_inputs = { + "input_ids": np.array([[0], [prompts_exec_info.generated_ids[1][0]]]), + "position_ids": np.array([[-1], [9]]), + "batch_index": np.array([[0], [1]], dtype=np.int64), + } + + # Set logits placeholder for decode + logits_out_placeholder = np.zeros( + ( + generator._qaic_model.full_batch_size, + generator._qaic_model._decode_seq_len, + generator._qaic_model._vocab_size, + ), + dtype=np.float32, + ) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + generation_outputs_prefill_cached = [] + for i in range(max_gen_len): + generation_outputs_prefill_cached.append(decode_inputs["input_ids"]) + outputs = 
generator._qaic_model._session.run(decode_inputs) + logits = outputs["logits"] + if len(logits.shape) == 2: + logits = np.expand_dims(logits, 1) + next_token_id = logits.argmax(2) + + decode_inputs["input_ids"] = next_token_id + decode_inputs["position_ids"][1][0] += 1 + + assert np.all( + prompts_exec_info.generated_ids[1][:247] == [int(val[1]) for val in generation_outputs_prefill_cached][:247] + ) diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index 2e5f55cc..18334e81 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -92,6 +92,7 @@ def split_dlm_bonus_token_inputs(dlm_decode_inputs): return bonus_token_inputs, dlm_decode_inputs +@pytest.mark.on_qaic @pytest.mark.parametrize( "prompt, num_speculative_tokens, prefill_seq_len, ctx_len, prefill_bsz, draft_model_name, target_model_name, full_batch_size", configs, From 1517d6ae384a4cae74a73b14eeb3bd0a6ab6a3f6 Mon Sep 17 00:00:00 2001 From: shubhagr-quic Date: Sat, 11 Jan 2025 14:41:21 +0530 Subject: [PATCH 3/4] Support for mxint8 kv-cache added in QNN Compilation path. (#215) Signed-off-by: Shubham Agrawal --- QEfficient/compile/qnn_compiler.py | 36 ++++++++++----- QEfficient/utils/constants.py | 9 ++-- ...erate_qnn_network_specialization_config.py | 44 ++++++++++++++++++- 3 files changed, 72 insertions(+), 17 deletions(-) diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py index ad5da976..11926c9a 100644 --- a/QEfficient/compile/qnn_compiler.py +++ b/QEfficient/compile/qnn_compiler.py @@ -11,7 +11,7 @@ from QEfficient.utils._utils import create_json, execute_command, load_json from QEfficient.utils.constants import QnnConstants -from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info +from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config from QEfficient.utils.logging_utils import logger @@ -38,6 +38,8 @@ def __init__( qnn_target: str = QnnConstants.TARGET, qnn_config_path: Optional[str] = None, qnn_binary_dir: Optional[str] = None, + mxint8: Optional[bool] = False, + compiler_mxint8_mdp_io: Optional[bool] = False, **kwargs, ) -> None: self.onnx_path = onnx_path @@ -52,6 +54,8 @@ def __init__( self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights self.qnn_config_path = qnn_config_path self.qnn_binary_dir = qnn_binary_dir + self.mxint8 = mxint8 + self.compiler_mxint8_mdp_io = compiler_mxint8_mdp_io self.custom_io_path = custom_io_path self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc") self.qnn_target = qnn_target @@ -148,6 +152,7 @@ def create_qnn_compile_backend_json(self) -> str: "compiler_stat_level": QnnConstants.COMPILER_STAT_LEVEL, "compiler_stats_batch_size": QnnConstants.COMPILER_STATS_BATCH_SIZE, "compiler_time_passes": QnnConstants.COMPILER_TIME_PASSES, + "compiler_mxint8_mdp_io": self.compiler_mxint8_mdp_io, } if self.compiler_max_out_channel_split > 0: qnn_compile_backend["compiler_max_out_channel_split"] = str(self.compiler_max_out_channel_split) @@ -225,10 +230,10 @@ def converter(self) -> str: IMMUTABLE parameters which can not be overridden by the user using qnn_config.json: :input_network (str): Generated ``ONNX`` Model Path. 
:output_path (str): Path to generated DLC file, which is provided qpc_base_path/model.dlc - :io_config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py + :config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py :float_bias_bitwidth (int): Bitwidth to use for float bias tensor :float_bitwidth (int): Converts the graph to the specified float bitwidth, either 32 or 16(Default). - :keep_int64_inputs(flag): Passed by default. + :preserve_io_datatype(flag): Passed by default. CONVERTOR_ARGS_EXTENSION passed in qnn_config.json is appended to the command created. @@ -240,7 +245,7 @@ def converter(self) -> str: cmd = ( f"{converter_tool} --input_network {self.onnx_path} " f"--output_path {self.dlc_model_path} " - f"--io_config {self.custom_io_path} " + f"--config {self.custom_io_path} " f"--float_bias_bitwidth {QnnConstants.FLOAT_BIAS_BITWIDTH} " f"--float_bitwidth {QnnConstants.FLOAT_BITWIDTH} " ) @@ -287,6 +292,17 @@ def generate_context_binary(self) -> str: f"--config_file {config_file_path} " ) + if self.mxint8: + data_format_file_path = os.path.join(self.qpc_base_path, QnnConstants.QNN_DATA_FORMAT_CONFIG_NAME) + generate_data_format_config( + self.onnx_path, model_dlc_name=QnnConstants.MODEL_NAME, file_path=data_format_file_path + ) + if not os.path.isfile(data_format_file_path): + raise FileNotFoundError( + f"file {data_format_file_path} needs to exist in the qpc_base_path for mxint8 compilation. Please rerun infer/compile Api" + ) + cmd += f"--data_format_config {data_format_file_path} " + if self.qnn_config and QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR in self.qnn_config: if "--log_level " not in self.qnn_config[QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR]: cmd += f"--log_level {QnnConstants.LOG_LEVEL} " @@ -353,20 +369,15 @@ def compile( if kwargs: logger.warning("Extra arguments to QNN compilation are not supported as of now!") - raise NotImplementedError("Can't handle extra compilation args now!") - if allow_mxint8_mdp_io: - logger.warning("QNN doesn't support allow_mxint8_mdp_io. Bypassing the value passed for allow_mxint8_mdp_io") - - if mxint8: - logger.warning("QNN doesn't support mxint8. Bypassing the value passed for mxint8") - os.makedirs(qpc_base_path, exist_ok=True) # Created custom_io_config.yaml file for QNN-Convertor stage. # TODO To make custom_io_config.yaml configurable as not all models need it. 
custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml") + + kv_precision = "uint8" if mxint8 else "float16" fetch_nodes_info( onnx_graph_path=onnx_path, batch_size=batch_size, @@ -374,6 +385,7 @@ def compile( context_length=ctx_len, file_path=custom_io_file_path, full_batch_size=full_batch_size, + kv_precision=kv_precision, ) if not os.path.isfile(custom_io_file_path): @@ -395,6 +407,8 @@ def compile( ctx_len=ctx_len, compiler_mxfp6_matmul_weights=mxfp6, qnn_binary_dir=qnn_binary_dir, + mxint8=mxint8, + compiler_mxint8_mdp_io=allow_mxint8_mdp_io, ) compiled_binary_path = qnn_obj.compile() diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index bfbac905..ab861a78 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -81,6 +81,7 @@ class QnnConstants: # QNN Compilation target names MODEL_NAME = "model" + QNN_DATA_FORMAT_CONFIG_NAME = "qnn_data_format_config.json" CONTEXT_BIN_NAME = "qnngraph.serialized" CONTEXT_BIN_QPC_NAME = "programqpc.bin" @@ -90,7 +91,7 @@ class QnnConstants: # Convertor Arguments FLOAT_BITWIDTH = 16 FLOAT_BIAS_BITWIDTH = 32 - CONVERTOR_DEFAULT_ARGS = "--keep_int64_inputs --onnx_no_simplification " + CONVERTOR_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification " # Context-Binary-Generator Arguments LOG_LEVEL = "error" @@ -118,11 +119,11 @@ class QnnConstants: IMMUTABLE_CONVERTOR_ARGS = [ "--input_network ", "--output_path ", - "--io_config ", + "--config ", "--float_bias_bitwidth ", "--float_bitwidth ", - "--keep_int64_inputs", - "--onnx_no_simplification", + "--preserve_io_datatype", + "--onnx_skip_simplification", "--onnx_defer_loading", ] diff --git a/QEfficient/utils/generate_qnn_network_specialization_config.py b/QEfficient/utils/generate_qnn_network_specialization_config.py index 0e5e17c0..ca78c658 100644 --- a/QEfficient/utils/generate_qnn_network_specialization_config.py +++ b/QEfficient/utils/generate_qnn_network_specialization_config.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json from typing import Optional import onnx @@ -24,6 +25,7 @@ def fetch_nodes_info( file_path: str = "custom_io_config.yaml", full_batch_size: Optional[int] = None, decode_only: Optional[bool] = False, + kv_precision: Optional[str] = "float16", ) -> None: # Load the ONNX model onnx_model = onnx.load(onnx_graph_path) @@ -38,7 +40,7 @@ def fetch_nodes_info( input_info = {} input_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(node.type.tensor_type.elem_type)) if "past_key" in node.name or "past_value" in node.name: - input_info["DataType"] = "float16" + input_info["DataType"] = kv_precision if "batch_index" in node.name: if full_batch_size: @@ -128,7 +130,7 @@ def fetch_nodes_info( output_info = {} output_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(output.type.tensor_type.elem_type)) if "past_key" in output.name or "past_value" in output.name: - output_info["DataType"] = "float16" + output_info["DataType"] = kv_precision elif "logits" in output.name: output_info["DataType"] = "float32" output_nodes_info.append({"Name": output.name, "Desired Model Parameters": output_info}) @@ -142,3 +144,41 @@ def fetch_nodes_info( yaml.dump(final_dict, yaml_file, default_flow_style=False, sort_keys=False) except Exception as e: print(f"Failed to create YAML File for QNN Network Specialization Configuration{file_path}: {e}") + + +def generate_data_format_config( + onnx_graph_path: str, + *, + data_format: Optional[str] = 
"QNN_TENSOR_DATA_FORMAT_MX", + model_dlc_name: Optional[str] = "model", + file_path: str = "qnn_data_format_config.json", +) -> None: + # Load the ONNX model + onnx_model = onnx.load(onnx_graph_path) + + kv_nodes: list = [] + + for input in onnx_model.graph.input: + if "past_key" in input.name or "past_value" in input.name: + kv_nodes.append((input.name).replace(".", "_")) + for output in onnx_model.graph.output: + if "past_key" in output.name or "past_value" in output.name: + kv_nodes.append((output.name).replace(".", "_")) + kv_overrides = {} + + kv_overrides["graphs"] = [ + { + "graph_name": model_dlc_name + "_configuration_1", + "tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes], + }, + { + "graph_name": model_dlc_name + "_configuration_2", + "tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes], + }, + ] + + try: + with open(file_path, "w") as json_file: + json.dump(kv_overrides, json_file, indent=4) + except Exception as e: + print(f"Failed to create JSON File for QNN Data Format Configuration{file_path}: {e}") From 05275e599e030319ef422eeef0d98c07464aa1f5 Mon Sep 17 00:00:00 2001 From: quic-jouachen Date: Sat, 11 Jan 2025 06:50:57 -0800 Subject: [PATCH 4/4] Fix finite lorax generation in cb mode (#216) The `examples/lora_models.py` script encounters issues in cb mode. This PR addresses the following: * Resolves the regression in finite lorax generation within cb mode in `QEfficient/generation/text_generation_inference.py` that occurred after the last refactoring. * Adds an additional unit test in `tests/peft/lora/test_lora_model.py` to verify the compile-generate flow for finite lorax cb mode. * [Addressed after comments] Uses auto device picking in `tests/peft/lora/test_lora_model.py`; Updates auto device picking option for `generate()` in `QEfficient/peft/lora/auto.py` Signed-off-by: Jou-An Chen --- .../generation/text_generation_inference.py | 4 ++- QEfficient/peft/lora/auto.py | 6 ++-- tests/peft/lora/test_lora_model.py | 30 +++++++++++++++++-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 4ddd57ad..54b6f057 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -341,7 +341,9 @@ def cloud_ai_100_exec_kv( perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time), ) else: - exec_info = generate_text.generate(prompt=prompt, generation_len=generation_len) + exec_info = generate_text.generate( + prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + ) print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation) return exec_info diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py index 2ccfac12..c1397996 100644 --- a/QEfficient/peft/lora/auto.py +++ b/QEfficient/peft/lora/auto.py @@ -342,9 +342,9 @@ def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = None, prompt_to_adapter_mapping: List[str] = None, - runtime: str = "AI_100", + device_id: Optional[List[int]] = None, + runtime: Optional[str] = "AI_100", **kwargs, ): """ @@ -355,9 +355,9 @@ def generate( ``Mandatory`` Args: :tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): The tokenizer used in the inference :prompts (List[str]): List of prompts to run the execution. 
- :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model :prompt_to_adapter_mapping (List[str]): The sequence of the adapter names will be matched with sequence of prompts and corresponding adapters will be used for the prompts."base" for base model (no adapter). ``optional`` Args: + :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100". """ diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index a1bea604..4726fb8c 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -195,10 +195,12 @@ def test_auto_lora_model_for_causal_lm_load_unload_adapter(base_model_name, adap assert qeff_model.unload_adapter("adapter_0") # valid unload -# test the export, export caching, compile, generate workflow +# test the export, export caching, compile and generate workflow in noncb mode @pytest.mark.on_qaic @pytest.mark.parametrize("base_model_name,adapter_id_0,adapter_id_1", model_samples[:1]) -def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, adapter_id_0, adapter_id_1, tmp_path): +def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate( + base_model_name, adapter_id_0, adapter_id_1, tmp_path +): qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(base_model_name, num_hidden_layers=1) qeff_model.load_adapter(adapter_id_0, "adapter_0") @@ -231,3 +233,27 @@ def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, prompts=prompts, prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], ) + + +# test the compile and generate workflow in cb mode +@pytest.mark.on_qaic +@pytest.mark.parametrize("base_model_name,adapter_id_0,adapter_id_1", model_samples[:1]) +def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adapter_id_0, adapter_id_1, tmp_path): + qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained( + base_model_name, continuous_batching=True, num_hidden_layers=1 + ) + + qeff_model.load_adapter(adapter_id_0, "adapter_0") + qeff_model.load_adapter(adapter_id_1, "adapter_1") + + # test compile + qeff_model.compile(prefill_seq_len=32, ctx_len=64, full_batch_size=2) + assert Path(qeff_model.qpc_path).is_dir() + + # test generate + prompts = ["hello!", "hi", "hello, my name is", "hey"] + qeff_model.generate( + tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), + prompts=prompts, + prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], + )
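Taken together, PATCH 2/4 exposes prefix caching through a single new compile-time knob: `kv_cache_batch_size` sizes the KV cache independently of the decode batch and, when omitted, falls back to `full_batch_size` (or `batch_size` in the non-continuous-batching case). Below is a minimal usage sketch, not part of the patches themselves, assuming a Cloud AI 100 device is available; the model name, core count, and sequence lengths simply mirror the values used in `test_simple_prefix_caching` above and are illustrative, not requirements.

# Illustrative sketch of the prefix-caching compile path from PATCH 2/4.
# Assumes a Cloud AI 100 device and an installed QEfficient package.
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import TextGeneration
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

model_name = "gpt2"  # example model, as in the test above

# Prefix caching currently requires continuous batching, so compile() raises a
# ValueError if kv_cache_batch_size is passed without full_batch_size.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True)
qeff_model.compile(
    prefill_seq_len=128,
    ctx_len=256,
    full_batch_size=2,      # number of sequences decoded concurrently
    kv_cache_batch_size=4,  # KV-cache slots; may exceed full_batch_size
    num_cores=14,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256)

# Prompts sharing a common prefix can reuse KV entries already written for it.
prompts = ["Once upon a time in a land far away", "Once upon a time there was a small village"]
exec_info = generator.generate(prompts)
print(exec_info.generated_ids)

The KV slots beyond `full_batch_size` are addressed explicitly through `batch_index`, which is what the test above does by hand when it runs prefill against indices 2 and 3 and then replays a cached prefix on index 0.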
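Similarly, the mxint8 KV-cache support in PATCH 3/4 hinges on two generated artifacts: the converter's `custom_io_config.yaml` (KV tensors tagged `uint8` instead of `float16`) and a data-format override JSON consumed by the context-binary generator. A small sketch of producing the latter directly, assuming an already-exported KV-cache ONNX graph at a hypothetical `model.onnx` path:

# Hypothetical path; the ONNX graph must expose past_key*/past_value* inputs and outputs.
from QEfficient.utils.generate_qnn_network_specialization_config import generate_data_format_config

generate_data_format_config(
    "model.onnx",                             # exported KV-cache model (placeholder path)
    model_dlc_name="model",                   # matches QnnConstants.MODEL_NAME
    file_path="qnn_data_format_config.json",  # matches QnnConstants.QNN_DATA_FORMAT_CONFIG_NAME
)
# The resulting JSON tags every past_key*/past_value* tensor with
# QNN_TENSOR_DATA_FORMAT_MX under "model_configuration_1" and "model_configuration_2";
# qnn_compiler passes this file to the context-binary generator via
# --data_format_config when mxint8 compilation is requested.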