From b845f8e5b29fc847f643f3ef2524b5b7e3ae918f Mon Sep 17 00:00:00 2001 From: Vinayak Baddi <68580231+vbaddi@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:43:44 +0530 Subject: [PATCH 1/4] nit: update the toml file with correct huggingface-hub version for finetune stack (#217) Signed-off-by: Swati Allabadi Signed-off-by: Shubham Agrawal Signed-off-by: Rishin Raj Signed-off-by: vbaddi Co-authored-by: Swati Allabadi Co-authored-by: Swati Allabadi Co-authored-by: shubhagr-quic Co-authored-by: Rishin Raj --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7234d72b..9867181c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ classifiers = [ requires-python = ">=3.8,<3.11" dependencies = [ "transformers==4.45.2", + "huggingface-hub==0.27.0", "peft==0.13.2", "datasets==2.20.0", "fsspec==2023.6.0", From 314009eebfb87206988ae6373d8ff2017f1846aa Mon Sep 17 00:00:00 2001 From: Onkar Chougule <168134249+ochougul@users.noreply.github.com> Date: Sat, 11 Jan 2025 11:48:19 +0530 Subject: [PATCH 2/4] Scratch prefix caching (#218) Signed-off-by: Onkar Chougule --- .../transformers/models/modeling_auto.py | 30 ++- scripts/Jenkinsfile | 4 +- tests/finetune/test_finetune.py | 1 + tests/peft/lora/test_lora_model.py | 1 - tests/text_generation/test_text_generation.py | 1 + .../models/test_prefix_caching.py | 183 ++++++++++++++++++ tests/transformers/spd/test_spd_inference.py | 1 + 7 files changed, 212 insertions(+), 9 deletions(-) create mode 100644 tests/transformers/models/test_prefix_caching.py diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index f565cbca..ff657d29 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -209,6 +209,7 @@ def export(self, export_dir: Optional[str] = None) -> str: 2: "ctx_len", } output_names = ["logits"] + for i in range(self.num_layers): for kv in ["key", "value"]: example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) @@ -240,6 +241,7 @@ def compile( ctx_len: int = 128, batch_size: int = 1, full_batch_size: Optional[int] = None, + kv_cache_batch_size: Optional[int] = None, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, @@ -291,15 +293,28 @@ def compile( if self.continuous_batching and full_batch_size is None: raise TypeError("missing required argument: 'full_batch_size'") + if kv_cache_batch_size and not full_batch_size: + raise ValueError( + "Prefix caching is enabled only for continuous batching as of now. Please pass `full_batch_size` argument and make sure you pass `continuous_batching=True` in the `from_pretrained` call" + ) + + kv_cache_batch_size = ( + kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size) + ) # Define prefill specialization prefill_specialization = { # Prefill is always run with single BS for continuous batching. "batch_size": 1 if self.continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + # TODO: should be renamed to kv_cache_batch_size in specialzation too } - prefill_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None - prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else None + prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ... 
+ if self.continuous_batching: + prefill_specialization.update({"full_batch_size": kv_cache_batch_size}) + else: + prefill_specialization.update({"batch_size": kv_cache_batch_size}) + prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ... specializations = [ prefill_specialization, ] @@ -311,8 +326,11 @@ def compile( "seq_len": num_speculative_tokens + 1 if self.is_tlm else 1, "ctx_len": ctx_len, } - decode_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None - decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else None + if self.continuous_batching: + decode_specialization.update({"full_batch_size": kv_cache_batch_size}) + else: + decode_specialization.update({"batch_size": kv_cache_batch_size}) + decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... specializations.append(decode_specialization) if enable_qnn: @@ -363,7 +381,7 @@ def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = [0], + device_id: List[int] = None, runtime_ai100: bool = True, **kwargs, ): @@ -569,7 +587,7 @@ def compile( def generate( self, inputs: torch.Tensor, - device_ids: List[int] = [0], + device_ids: List[int] = None, runtime_ai100: bool = True, ) -> Union[torch.Tensor, np.ndarray]: """ diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index d1bb02a2..0d802b83 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -48,7 +48,7 @@ pipeline { } stage('Run Non-CLI QAIC Tests') { steps { - timeout(time: 70, unit: 'MINUTES') { + timeout(time: 200, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && @@ -56,7 +56,7 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 3 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 4 --junitxml=tests/tests_log2.xml && deactivate" ''' } diff --git a/tests/finetune/test_finetune.py b/tests/finetune/test_finetune.py index 4d7d061f..45330cad 100644 --- a/tests/finetune/test_finetune.py +++ b/tests/finetune/test_finetune.py @@ -26,6 +26,7 @@ def clean_up(path): # TODO:enable this once docker is available +@pytest.mark.on_qaic @pytest.mark.skip(reason="eager docker not available in sdk") @pytest.mark.parametrize( "model_name,max_eval_step,max_train_step,intermediate_step_save,context_length,run_validation,use_peft,device", diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index a91555b3..a1bea604 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -229,6 +229,5 @@ def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, qeff_model.generate( tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), prompts=prompts, - device_id=[0], prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], ) diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py index 15f4b7dc..b8915859 100644 --- a/tests/text_generation/test_text_generation.py +++ b/tests/text_generation/test_text_generation.py @@ -44,6 +44,7 @@ def load_causal_lm_model(model_config): # Use @pytest.mark.parametrize to apply the configurations +@pytest.mark.on_qaic 
@pytest.mark.parametrize("model_name, n_layer, full_batch_size, max_gen_len", configs) def test_generate_text_stream( model_name: str, diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py new file mode 100644 index 00000000..fa79f33c --- /dev/null +++ b/tests/transformers/models/test_prefix_caching.py @@ -0,0 +1,183 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import numpy as np +import pytest +from transformers import AutoTokenizer + +from QEfficient.generation.text_generation_inference import TextGeneration +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + +test_models = ["gpt2"] + + +# The test should first generate output with some prefix+suffix1 or batch_id and then confirm that we are still able to execute of prefix+suffix2 on same batch id and getting correct output. +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", test_models) +def test_simple_prefix_caching(model_name): + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True) + qeff_model.compile( + prefill_seq_len=128, + ctx_len=256, + full_batch_size=2, + kv_cache_batch_size=4, + num_cores=14, + ) + + prefixes = ["Once upon a time ", "Once upon a time "] + suffixes1 = ["in a land far away", "there was a small village"] + suffixes2 = ["a little girl", "in a bustling city"] + + tokenizer = AutoTokenizer.from_pretrained(model_name) + + generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256) + + prompts = [pref + suff for pref, suff in zip(prefixes, suffixes1)] + + # generation for batch_indices = 0, 1 + prompts_exec_info = generator.generate(prompts) + ############################## + # generation for batch_indices + ############################## + # Run prefill for indices 2, 3 with same prompts + out2, pos2, gen_len2 = generator._qaic_model.run_prefill( + prompts[0], generation_len=None, decode_batch_id=np.array(2, dtype=np.int64).reshape(1, 1) + ) + out3, pos3, gen_len3 = generator._qaic_model.run_prefill( + prompts[1], generation_len=None, decode_batch_id=np.array(3, dtype=np.int64).reshape(1, 1) + ) + + # Run decode for batch indices 2, 3 + decode_inputs = { + "input_ids": np.array([[out2["logits"].argmax(2)[0][0]], [out3["logits"].argmax(2)[0][0]]]), + "position_ids": np.array([[pos2[0][0]], [pos3[0][0]]]), + "batch_index": np.array([[2], [3]], dtype=np.int64), + } + + # Set logits placeholder for decode + logits_out_placeholder = np.zeros( + ( + generator._qaic_model.full_batch_size, + generator._qaic_model._decode_seq_len, + generator._qaic_model._vocab_size, + ), + dtype=np.float32, + ) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + generation_outputs = [] + for i in range(gen_len2): + generation_outputs.append(decode_inputs["input_ids"]) + outputs = generator._qaic_model._session.run(decode_inputs) + logits = outputs["logits"] + if len(logits.shape) == 2: + logits = np.expand_dims(logits, 1) + next_token_id = logits.argmax(2) + + decode_inputs["input_ids"] = next_token_id + decode_inputs["position_ids"] += 1 + + assert np.all(generator._qaic_model.generated_ids[0, :gen_len2] == [int(val[0]) for val in generation_outputs]) + assert 
np.all(generator._qaic_model.generated_ids[1, :gen_len2] == [int(val[1]) for val in generation_outputs]) + + ############################## + # Now rerun with cached prefix on 0th index with prompt3 and use -1 for 1st index + ############################## + + nprompts = [pref + suff for pref, suff in zip(prefixes, suffixes2)] + + ## Prefill run on index 0 + prompt = nprompts[0] + inputs = tokenizer(prompt, return_tensors="np", padding=True) + position_ids = inputs["attention_mask"].sum(1, keepdims=True) + padded_len = inputs["input_ids"].shape[1] + num_chunks = -(padded_len // -generator._qaic_model._prefill_seq_len) + padded_len = num_chunks * generator._qaic_model._prefill_seq_len # Convert to a multiple of prompt_len + + # Initialize variables specific to request + # Calculate the max generation length. + max_gen_len = generator._qaic_model._ctx_len - position_ids.max() + + # Set the prefill logic buffer + logits_out_placeholder = np.zeros((1, 1, generator._qaic_model._vocab_size), dtype=np.float32) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) + inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) + inputs.pop("token_type_ids", None) + inputs["batch_index"] = np.array([[0]], dtype=np.int64) + norm_outputs = generator._qaic_model._session.run(inputs) + inputs["input_ids"][:, :3] = inputs["input_ids"][:, 4:7] + inputs["input_ids"][:, 3:] = 50256 + inputs["position_ids"][:, :3] = inputs["position_ids"][:, 4:7] + inputs["position_ids"][:, 3:] = -1 + mod_outputs = generator._qaic_model._session.run(inputs) + assert (mod_outputs["logits"] == norm_outputs["logits"]).all() + decode_inputs = { + "input_ids": np.array([[mod_outputs["logits"].argmax(2)[0][0]], [0]]), + "position_ids": np.array([[position_ids[0][0]], [-1]]), + "batch_index": np.array([[0], [1]], dtype=np.int64), + } + + # Set logits placeholder for decode + logits_out_placeholder = np.zeros( + ( + generator._qaic_model.full_batch_size, + generator._qaic_model._decode_seq_len, + generator._qaic_model._vocab_size, + ), + dtype=np.float32, + ) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + generation_outputs = [] + for i in range(max_gen_len): + generation_outputs.append(decode_inputs["input_ids"]) + outputs = generator._qaic_model._session.run(decode_inputs) + logits = outputs["logits"] + if len(logits.shape) == 2: + logits = np.expand_dims(logits, 1) + next_token_id = logits.argmax(2) + + decode_inputs["input_ids"] = next_token_id + decode_inputs["position_ids"][0][0] += 1 + + # TODO: add a check if this matches normal execution for same prompt + ############## + # Now run decode on 1st index again with mod_inputs and check if output is correct + ############## + decode_inputs = { + "input_ids": np.array([[0], [prompts_exec_info.generated_ids[1][0]]]), + "position_ids": np.array([[-1], [9]]), + "batch_index": np.array([[0], [1]], dtype=np.int64), + } + + # Set logits placeholder for decode + logits_out_placeholder = np.zeros( + ( + generator._qaic_model.full_batch_size, + generator._qaic_model._decode_seq_len, + generator._qaic_model._vocab_size, + ), + dtype=np.float32, + ) + generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + generation_outputs_prefill_cached = [] + for i in range(max_gen_len): + generation_outputs_prefill_cached.append(decode_inputs["input_ids"]) + outputs = 
generator._qaic_model._session.run(decode_inputs) + logits = outputs["logits"] + if len(logits.shape) == 2: + logits = np.expand_dims(logits, 1) + next_token_id = logits.argmax(2) + + decode_inputs["input_ids"] = next_token_id + decode_inputs["position_ids"][1][0] += 1 + + assert np.all( + prompts_exec_info.generated_ids[1][:247] == [int(val[1]) for val in generation_outputs_prefill_cached][:247] + ) diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index 2e5f55cc..18334e81 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -92,6 +92,7 @@ def split_dlm_bonus_token_inputs(dlm_decode_inputs): return bonus_token_inputs, dlm_decode_inputs +@pytest.mark.on_qaic @pytest.mark.parametrize( "prompt, num_speculative_tokens, prefill_seq_len, ctx_len, prefill_bsz, draft_model_name, target_model_name, full_batch_size", configs, From 1517d6ae384a4cae74a73b14eeb3bd0a6ab6a3f6 Mon Sep 17 00:00:00 2001 From: shubhagr-quic Date: Sat, 11 Jan 2025 14:41:21 +0530 Subject: [PATCH 3/4] Support for mxint8 kv-cache added in QNN Compilation path. (#215) Signed-off-by: Shubham Agrawal --- QEfficient/compile/qnn_compiler.py | 36 ++++++++++----- QEfficient/utils/constants.py | 9 ++-- ...erate_qnn_network_specialization_config.py | 44 ++++++++++++++++++- 3 files changed, 72 insertions(+), 17 deletions(-) diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py index ad5da976..11926c9a 100644 --- a/QEfficient/compile/qnn_compiler.py +++ b/QEfficient/compile/qnn_compiler.py @@ -11,7 +11,7 @@ from QEfficient.utils._utils import create_json, execute_command, load_json from QEfficient.utils.constants import QnnConstants -from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info +from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config from QEfficient.utils.logging_utils import logger @@ -38,6 +38,8 @@ def __init__( qnn_target: str = QnnConstants.TARGET, qnn_config_path: Optional[str] = None, qnn_binary_dir: Optional[str] = None, + mxint8: Optional[bool] = False, + compiler_mxint8_mdp_io: Optional[bool] = False, **kwargs, ) -> None: self.onnx_path = onnx_path @@ -52,6 +54,8 @@ def __init__( self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights self.qnn_config_path = qnn_config_path self.qnn_binary_dir = qnn_binary_dir + self.mxint8 = mxint8 + self.compiler_mxint8_mdp_io = compiler_mxint8_mdp_io self.custom_io_path = custom_io_path self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc") self.qnn_target = qnn_target @@ -148,6 +152,7 @@ def create_qnn_compile_backend_json(self) -> str: "compiler_stat_level": QnnConstants.COMPILER_STAT_LEVEL, "compiler_stats_batch_size": QnnConstants.COMPILER_STATS_BATCH_SIZE, "compiler_time_passes": QnnConstants.COMPILER_TIME_PASSES, + "compiler_mxint8_mdp_io": self.compiler_mxint8_mdp_io, } if self.compiler_max_out_channel_split > 0: qnn_compile_backend["compiler_max_out_channel_split"] = str(self.compiler_max_out_channel_split) @@ -225,10 +230,10 @@ def converter(self) -> str: IMMUTABLE parameters which can not be overridden by the user using qnn_config.json: :input_network (str): Generated ``ONNX`` Model Path. 
:output_path (str): Path to generated DLC file, which is provided qpc_base_path/model.dlc - :io_config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py + :config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py :float_bias_bitwidth (int): Bitwidth to use for float bias tensor :float_bitwidth (int): Converts the graph to the specified float bitwidth, either 32 or 16(Default). - :keep_int64_inputs(flag): Passed by default. + :preserve_io_datatype(flag): Passed by default. CONVERTOR_ARGS_EXTENSION passed in qnn_config.json is appended to the command created. @@ -240,7 +245,7 @@ def converter(self) -> str: cmd = ( f"{converter_tool} --input_network {self.onnx_path} " f"--output_path {self.dlc_model_path} " - f"--io_config {self.custom_io_path} " + f"--config {self.custom_io_path} " f"--float_bias_bitwidth {QnnConstants.FLOAT_BIAS_BITWIDTH} " f"--float_bitwidth {QnnConstants.FLOAT_BITWIDTH} " ) @@ -287,6 +292,17 @@ def generate_context_binary(self) -> str: f"--config_file {config_file_path} " ) + if self.mxint8: + data_format_file_path = os.path.join(self.qpc_base_path, QnnConstants.QNN_DATA_FORMAT_CONFIG_NAME) + generate_data_format_config( + self.onnx_path, model_dlc_name=QnnConstants.MODEL_NAME, file_path=data_format_file_path + ) + if not os.path.isfile(data_format_file_path): + raise FileNotFoundError( + f"file {data_format_file_path} needs to exist in the qpc_base_path for mxint8 compilation. Please rerun infer/compile Api" + ) + cmd += f"--data_format_config {data_format_file_path} " + if self.qnn_config and QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR in self.qnn_config: if "--log_level " not in self.qnn_config[QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR]: cmd += f"--log_level {QnnConstants.LOG_LEVEL} " @@ -353,20 +369,15 @@ def compile( if kwargs: logger.warning("Extra arguments to QNN compilation are not supported as of now!") - raise NotImplementedError("Can't handle extra compilation args now!") - if allow_mxint8_mdp_io: - logger.warning("QNN doesn't support allow_mxint8_mdp_io. Bypassing the value passed for allow_mxint8_mdp_io") - - if mxint8: - logger.warning("QNN doesn't support mxint8. Bypassing the value passed for mxint8") - os.makedirs(qpc_base_path, exist_ok=True) # Created custom_io_config.yaml file for QNN-Convertor stage. # TODO To make custom_io_config.yaml configurable as not all models need it. 
custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml") + + kv_precision = "uint8" if mxint8 else "float16" fetch_nodes_info( onnx_graph_path=onnx_path, batch_size=batch_size, @@ -374,6 +385,7 @@ def compile( context_length=ctx_len, file_path=custom_io_file_path, full_batch_size=full_batch_size, + kv_precision=kv_precision, ) if not os.path.isfile(custom_io_file_path): @@ -395,6 +407,8 @@ def compile( ctx_len=ctx_len, compiler_mxfp6_matmul_weights=mxfp6, qnn_binary_dir=qnn_binary_dir, + mxint8=mxint8, + compiler_mxint8_mdp_io=allow_mxint8_mdp_io, ) compiled_binary_path = qnn_obj.compile() diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index bfbac905..ab861a78 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -81,6 +81,7 @@ class QnnConstants: # QNN Compilation target names MODEL_NAME = "model" + QNN_DATA_FORMAT_CONFIG_NAME = "qnn_data_format_config.json" CONTEXT_BIN_NAME = "qnngraph.serialized" CONTEXT_BIN_QPC_NAME = "programqpc.bin" @@ -90,7 +91,7 @@ class QnnConstants: # Convertor Arguments FLOAT_BITWIDTH = 16 FLOAT_BIAS_BITWIDTH = 32 - CONVERTOR_DEFAULT_ARGS = "--keep_int64_inputs --onnx_no_simplification " + CONVERTOR_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification " # Context-Binary-Generator Arguments LOG_LEVEL = "error" @@ -118,11 +119,11 @@ class QnnConstants: IMMUTABLE_CONVERTOR_ARGS = [ "--input_network ", "--output_path ", - "--io_config ", + "--config ", "--float_bias_bitwidth ", "--float_bitwidth ", - "--keep_int64_inputs", - "--onnx_no_simplification", + "--preserve_io_datatype", + "--onnx_skip_simplification", "--onnx_defer_loading", ] diff --git a/QEfficient/utils/generate_qnn_network_specialization_config.py b/QEfficient/utils/generate_qnn_network_specialization_config.py index 0e5e17c0..ca78c658 100644 --- a/QEfficient/utils/generate_qnn_network_specialization_config.py +++ b/QEfficient/utils/generate_qnn_network_specialization_config.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json from typing import Optional import onnx @@ -24,6 +25,7 @@ def fetch_nodes_info( file_path: str = "custom_io_config.yaml", full_batch_size: Optional[int] = None, decode_only: Optional[bool] = False, + kv_precision: Optional[str] = "float16", ) -> None: # Load the ONNX model onnx_model = onnx.load(onnx_graph_path) @@ -38,7 +40,7 @@ def fetch_nodes_info( input_info = {} input_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(node.type.tensor_type.elem_type)) if "past_key" in node.name or "past_value" in node.name: - input_info["DataType"] = "float16" + input_info["DataType"] = kv_precision if "batch_index" in node.name: if full_batch_size: @@ -128,7 +130,7 @@ def fetch_nodes_info( output_info = {} output_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(output.type.tensor_type.elem_type)) if "past_key" in output.name or "past_value" in output.name: - output_info["DataType"] = "float16" + output_info["DataType"] = kv_precision elif "logits" in output.name: output_info["DataType"] = "float32" output_nodes_info.append({"Name": output.name, "Desired Model Parameters": output_info}) @@ -142,3 +144,41 @@ def fetch_nodes_info( yaml.dump(final_dict, yaml_file, default_flow_style=False, sort_keys=False) except Exception as e: print(f"Failed to create YAML File for QNN Network Specialization Configuration{file_path}: {e}") + + +def generate_data_format_config( + onnx_graph_path: str, + *, + data_format: Optional[str] = 
"QNN_TENSOR_DATA_FORMAT_MX", + model_dlc_name: Optional[str] = "model", + file_path: str = "qnn_data_format_config.json", +) -> None: + # Load the ONNX model + onnx_model = onnx.load(onnx_graph_path) + + kv_nodes: list = [] + + for input in onnx_model.graph.input: + if "past_key" in input.name or "past_value" in input.name: + kv_nodes.append((input.name).replace(".", "_")) + for output in onnx_model.graph.output: + if "past_key" in output.name or "past_value" in output.name: + kv_nodes.append((output.name).replace(".", "_")) + kv_overrides = {} + + kv_overrides["graphs"] = [ + { + "graph_name": model_dlc_name + "_configuration_1", + "tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes], + }, + { + "graph_name": model_dlc_name + "_configuration_2", + "tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes], + }, + ] + + try: + with open(file_path, "w") as json_file: + json.dump(kv_overrides, json_file, indent=4) + except Exception as e: + print(f"Failed to create JSON File for QNN Data Format Configuration{file_path}: {e}") From 05275e599e030319ef422eeef0d98c07464aa1f5 Mon Sep 17 00:00:00 2001 From: quic-jouachen Date: Sat, 11 Jan 2025 06:50:57 -0800 Subject: [PATCH 4/4] Fix finite lorax generation in cb mode (#216) The `examples/lora_models.py` script encounters issues in cb mode. This PR addresses the following: * Resolves the regression in finite lorax generation within cb mode in `QEfficient/generation/text_generation_inference.py` that occurred after the last refactoring. * Adds an additional unit test in `tests/peft/lora/test_lora_model.py` to verify the compile-generate flow for finite lorax cb mode. * [Addressed after comments] Uses auto device picking in `tests/peft/lora/test_lora_model.py`; Updates auto device picking option for `generate()` in `QEfficient/peft/lora/auto.py` Signed-off-by: Jou-An Chen --- .../generation/text_generation_inference.py | 4 ++- QEfficient/peft/lora/auto.py | 6 ++-- tests/peft/lora/test_lora_model.py | 30 +++++++++++++++++-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 4ddd57ad..54b6f057 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -341,7 +341,9 @@ def cloud_ai_100_exec_kv( perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time), ) else: - exec_info = generate_text.generate(prompt=prompt, generation_len=generation_len) + exec_info = generate_text.generate( + prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + ) print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation) return exec_info diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py index 2ccfac12..c1397996 100644 --- a/QEfficient/peft/lora/auto.py +++ b/QEfficient/peft/lora/auto.py @@ -342,9 +342,9 @@ def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = None, prompt_to_adapter_mapping: List[str] = None, - runtime: str = "AI_100", + device_id: Optional[List[int]] = None, + runtime: Optional[str] = "AI_100", **kwargs, ): """ @@ -355,9 +355,9 @@ def generate( ``Mandatory`` Args: :tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): The tokenizer used in the inference :prompts (List[str]): List of prompts to run the execution. 
- :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model :prompt_to_adapter_mapping (List[str]): The sequence of the adapter names will be matched with sequence of prompts and corresponding adapters will be used for the prompts."base" for base model (no adapter). ``optional`` Args: + :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100". """ diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index a1bea604..4726fb8c 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -195,10 +195,12 @@ def test_auto_lora_model_for_causal_lm_load_unload_adapter(base_model_name, adap assert qeff_model.unload_adapter("adapter_0") # valid unload -# test the export, export caching, compile, generate workflow +# test the export, export caching, compile and generate workflow in noncb mode @pytest.mark.on_qaic @pytest.mark.parametrize("base_model_name,adapter_id_0,adapter_id_1", model_samples[:1]) -def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, adapter_id_0, adapter_id_1, tmp_path): +def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate( + base_model_name, adapter_id_0, adapter_id_1, tmp_path +): qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(base_model_name, num_hidden_layers=1) qeff_model.load_adapter(adapter_id_0, "adapter_0") @@ -231,3 +233,27 @@ def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, prompts=prompts, prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], ) + + +# test the compile and generate workflow in cb mode +@pytest.mark.on_qaic +@pytest.mark.parametrize("base_model_name,adapter_id_0,adapter_id_1", model_samples[:1]) +def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adapter_id_0, adapter_id_1, tmp_path): + qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained( + base_model_name, continuous_batching=True, num_hidden_layers=1 + ) + + qeff_model.load_adapter(adapter_id_0, "adapter_0") + qeff_model.load_adapter(adapter_id_1, "adapter_1") + + # test compile + qeff_model.compile(prefill_seq_len=32, ctx_len=64, full_batch_size=2) + assert Path(qeff_model.qpc_path).is_dir() + + # test generate + prompts = ["hello!", "hi", "hello, my name is", "hey"] + qeff_model.generate( + tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), + prompts=prompts, + prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], + )
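Taken together, PATCH 2/4 exposes prefix caching through a single new compile-time knob: `kv_cache_batch_size` sizes the KV cache independently of the decode batch and, when omitted, falls back to `full_batch_size` (or `batch_size` in the non-continuous-batching case). Below is a minimal usage sketch, not part of the patches themselves, assuming a Cloud AI 100 device is available; the model name, core count, and sequence lengths simply mirror the values used in `test_simple_prefix_caching` above and are illustrative, not requirements.

# Illustrative sketch of the prefix-caching compile path from PATCH 2/4.
# Assumes a Cloud AI 100 device and an installed QEfficient package.
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import TextGeneration
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

model_name = "gpt2"  # example model, as in the test above

# Prefix caching currently requires continuous batching, so compile() raises a
# ValueError if kv_cache_batch_size is passed without full_batch_size.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True)
qeff_model.compile(
    prefill_seq_len=128,
    ctx_len=256,
    full_batch_size=2,      # number of sequences decoded concurrently
    kv_cache_batch_size=4,  # KV-cache slots; may exceed full_batch_size
    num_cores=14,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256)

# Prompts sharing a common prefix can reuse KV entries already written for it.
prompts = ["Once upon a time in a land far away", "Once upon a time there was a small village"]
exec_info = generator.generate(prompts)
print(exec_info.generated_ids)

The KV slots beyond `full_batch_size` are addressed explicitly through `batch_index`, which is what the test above does by hand when it runs prefill against indices 2 and 3 and then replays a cached prefix on index 0.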
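Similarly, the mxint8 KV-cache support in PATCH 3/4 hinges on two generated artifacts: the converter's `custom_io_config.yaml` (KV tensors tagged `uint8` instead of `float16`) and a data-format override JSON consumed by the context-binary generator. A small sketch of producing the latter directly, assuming an already-exported KV-cache ONNX graph at a hypothetical `model.onnx` path:

# Hypothetical path; the ONNX graph must expose past_key*/past_value* inputs and outputs.
from QEfficient.utils.generate_qnn_network_specialization_config import generate_data_format_config

generate_data_format_config(
    "model.onnx",                             # exported KV-cache model (placeholder path)
    model_dlc_name="model",                   # matches QnnConstants.MODEL_NAME
    file_path="qnn_data_format_config.json",  # matches QnnConstants.QNN_DATA_FORMAT_CONFIG_NAME
)
# The resulting JSON tags every past_key*/past_value* tensor with
# QNN_TENSOR_DATA_FORMAT_MX under "model_configuration_1" and "model_configuration_2";
# qnn_compiler passes this file to the context-binary generator via
# --data_format_config when mxint8 compilation is requested.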