Merge branch 'quic:main' into finetune
Signed-off-by: Mamta Singh <[email protected]>
quic-mamta committed Jan 13, 2025
2 parents 2c41404 + 05275e5 commit c3079b0
Showing 14 changed files with 340 additions and 45 deletions.
34 changes: 21 additions & 13 deletions QEfficient/__init__.py
@@ -5,21 +5,27 @@
#
# -----------------------------------------------------------------------------

try:
import platform
import sys

sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
import qaicrt # noqa: F401
def check_qaic_sdk():
"""Check if QAIC SDK is installed"""
try:
import platform
import sys

qaic_sdk_installed = True
except ModuleNotFoundError:
qaic_sdk_installed = False
sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
import qaicrt # noqa: F401

__version__ = "0.0.1.dev0"
return True
except ImportError:
return False


QAIC_INSTALLED = check_qaic_sdk()

if qaic_sdk_installed:
from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"
if QAIC_INSTALLED:
from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
@@ -34,10 +40,12 @@
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEffAutoModel",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFCommonLoader",
]

print("QAIC SDK is installed.")
else:
print("QAIC SDK is not found, skipping QEfficient imports.")
print("QAIC SDK is not installed. Proceeding without it.")
36 changes: 25 additions & 11 deletions QEfficient/compile/qnn_compiler.py
@@ -11,7 +11,7 @@

from QEfficient.utils._utils import create_json, execute_command, load_json
from QEfficient.utils.constants import QnnConstants
from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info
from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config
from QEfficient.utils.logging_utils import logger


@@ -38,6 +38,8 @@ def __init__(
qnn_target: str = QnnConstants.TARGET,
qnn_config_path: Optional[str] = None,
qnn_binary_dir: Optional[str] = None,
mxint8: Optional[bool] = False,
compiler_mxint8_mdp_io: Optional[bool] = False,
**kwargs,
) -> None:
self.onnx_path = onnx_path
@@ -52,6 +54,8 @@ def __init__(
self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights
self.qnn_config_path = qnn_config_path
self.qnn_binary_dir = qnn_binary_dir
self.mxint8 = mxint8
self.compiler_mxint8_mdp_io = compiler_mxint8_mdp_io
self.custom_io_path = custom_io_path
self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc")
self.qnn_target = qnn_target
@@ -148,6 +152,7 @@ def create_qnn_compile_backend_json(self) -> str:
"compiler_stat_level": QnnConstants.COMPILER_STAT_LEVEL,
"compiler_stats_batch_size": QnnConstants.COMPILER_STATS_BATCH_SIZE,
"compiler_time_passes": QnnConstants.COMPILER_TIME_PASSES,
"compiler_mxint8_mdp_io": self.compiler_mxint8_mdp_io,
}
if self.compiler_max_out_channel_split > 0:
qnn_compile_backend["compiler_max_out_channel_split"] = str(self.compiler_max_out_channel_split)
@@ -225,10 +230,10 @@ def converter(self) -> str:
IMMUTABLE parameters which can not be overridden by the user using qnn_config.json:
:input_network (str): Generated ``ONNX`` Model Path.
:output_path (str): Path to generated DLC file, which is provided qpc_base_path/model.dlc
:io_config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py
:config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py
:float_bias_bitwidth (int): Bitwidth to use for float bias tensor
:float_bitwidth (int): Converts the graph to the specified float bitwidth, either 32 or 16(Default).
:keep_int64_inputs(flag): Passed by default.
:preserve_io_datatype(flag): Passed by default.
CONVERTOR_ARGS_EXTENSION passed in qnn_config.json is appended to the command created.
@@ -240,7 +245,7 @@ def converter(self) -> str:
cmd = (
f"{converter_tool} --input_network {self.onnx_path} "
f"--output_path {self.dlc_model_path} "
f"--io_config {self.custom_io_path} "
f"--config {self.custom_io_path} "
f"--float_bias_bitwidth {QnnConstants.FLOAT_BIAS_BITWIDTH} "
f"--float_bitwidth {QnnConstants.FLOAT_BITWIDTH} "
)
@@ -287,6 +292,17 @@ def generate_context_binary(self) -> str:
f"--config_file {config_file_path} "
)

if self.mxint8:
data_format_file_path = os.path.join(self.qpc_base_path, QnnConstants.QNN_DATA_FORMAT_CONFIG_NAME)
generate_data_format_config(
self.onnx_path, model_dlc_name=QnnConstants.MODEL_NAME, file_path=data_format_file_path
)
if not os.path.isfile(data_format_file_path):
raise FileNotFoundError(
f"file {data_format_file_path} needs to exist in the qpc_base_path for mxint8 compilation. Please rerun infer/compile Api"
)
cmd += f"--data_format_config {data_format_file_path} "

if self.qnn_config and QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR in self.qnn_config:
if "--log_level " not in self.qnn_config[QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR]:
cmd += f"--log_level {QnnConstants.LOG_LEVEL} "
@@ -353,27 +369,23 @@ def compile(

if kwargs:
logger.warning("Extra arguments to QNN compilation are not supported as of now!")

raise NotImplementedError("Can't handle extra compilation args now!")

if allow_mxint8_mdp_io:
logger.warning("QNN doesn't support allow_mxint8_mdp_io. Bypassing the value passed for allow_mxint8_mdp_io")

if mxint8:
logger.warning("QNN doesn't support mxint8. Bypassing the value passed for mxint8")

os.makedirs(qpc_base_path, exist_ok=True)

# Created custom_io_config.yaml file for QNN-Convertor stage.
# TODO To make custom_io_config.yaml configurable as not all models need it.
custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml")

kv_precision = "uint8" if mxint8 else "float16"
fetch_nodes_info(
onnx_graph_path=onnx_path,
batch_size=batch_size,
sequence_length=prompt_len,
context_length=ctx_len,
file_path=custom_io_file_path,
full_batch_size=full_batch_size,
kv_precision=kv_precision,
)

if not os.path.isfile(custom_io_file_path):
@@ -395,6 +407,8 @@ def compile(
ctx_len=ctx_len,
compiler_mxfp6_matmul_weights=mxfp6,
qnn_binary_dir=qnn_binary_dir,
mxint8=mxint8,
compiler_mxint8_mdp_io=allow_mxint8_mdp_io,
)

compiled_binary_path = qnn_obj.compile()
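Note: taken together, these hunks mean the QNN path no longer rejects mxint8: compile() derives a KV-cache precision for the network-specialization YAML, forwards both flags into the QNN object, and generate_context_binary() passes the emitted data-format JSON via --data_format_config. A hedged caller sketch follows, using only keyword names visible in this diff; any other required arguments of compile() are elided, the paths and shape values are placeholders, and the return value is assumed to be the compiled-binary path:

```python
# Sketch of invoking the QNN compile helper with the newly honoured mxint8 flags.
from QEfficient.compile.qnn_compiler import compile as qnn_compile

qpc_path = qnn_compile(
    onnx_path="onnx/model.onnx",
    qpc_base_path="qpc",
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=True,
    mxint8=True,               # KV I/O becomes uint8 and qnn_data_format_config.json is generated
    allow_mxint8_mdp_io=True,  # forwarded as compiler_mxint8_mdp_io in the backend JSON
)
```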
4 changes: 3 additions & 1 deletion QEfficient/generation/text_generation_inference.py
@@ -341,7 +341,9 @@ def cloud_ai_100_exec_kv(
perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
)
else:
exec_info = generate_text.generate(prompt=prompt, generation_len=generation_len)
exec_info = generate_text.generate(
prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
)

print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)
return exec_info
6 changes: 3 additions & 3 deletions QEfficient/peft/lora/auto.py
@@ -342,9 +342,9 @@ def generate(
self,
tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
prompts: List[str],
device_id: List[int] = None,
prompt_to_adapter_mapping: List[str] = None,
runtime: str = "AI_100",
device_id: Optional[List[int]] = None,
runtime: Optional[str] = "AI_100",
**kwargs,
):
"""
@@ -355,9 +355,9 @@ def generate(
``Mandatory`` Args:
:tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): The tokenizer used in the inference
:prompts (List[str]): List of prompts to run the execution.
:device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
:prompt_to_adapter_mapping (List[str]): The sequence of the adapter names will be matched with sequence of prompts and corresponding adapters will be used for the prompts."base" for base model (no adapter).
``optional`` Args:
:device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.
:runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
"""
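Note: with this reordering, device_id and runtime are explicitly optional keywords and each prompt is paired with an adapter name. A usage sketch, under the assumption that lora_model is an already-compiled instance of the LoRA auto class defined in this file and tokenizer is its matching Hugging Face tokenizer; both names, the prompts, and the adapter names are placeholders:

```python
# Hedged sketch of the new generate() signature: prompt_to_adapter_mapping pairs
# each prompt with an adapter name; "base" selects the base model (no adapter).
exec_info = lora_model.generate(
    tokenizer=tokenizer,
    prompts=["Summarize this ticket.", "What is the capital of France?"],
    prompt_to_adapter_mapping=["ticket_summarizer", "base"],
    device_id=None,  # None -> auto device picker; pass e.g. [0, 1, 2, 3] for tensor slicing
)
```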
30 changes: 24 additions & 6 deletions QEfficient/transformers/models/modeling_auto.py
@@ -209,6 +209,7 @@ def export(self, export_dir: Optional[str] = None) -> str:
2: "ctx_len",
}
output_names = ["logits"]

for i in range(self.num_layers):
for kv in ["key", "value"]:
example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
@@ -240,6 +241,7 @@ def compile(
ctx_len: int = 128,
batch_size: int = 1,
full_batch_size: Optional[int] = None,
kv_cache_batch_size: Optional[int] = None,
num_devices: int = 1,
num_cores: int = 16, # FIXME: Make this mandatory arg
mxfp6_matmul: bool = False,
@@ -291,15 +293,28 @@
if self.continuous_batching and full_batch_size is None:
raise TypeError("missing required argument: 'full_batch_size'")

if kv_cache_batch_size and not full_batch_size:
raise ValueError(
"Prefix caching is enabled only for continuous batching as of now. Please pass `full_batch_size` argument and make sure you pass `continuous_batching=True` in the `from_pretrained` call"
)

kv_cache_batch_size = (
kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size)
)
# Define prefill specialization
prefill_specialization = {
# Prefill is always run with single BS for continuous batching.
"batch_size": 1 if self.continuous_batching else batch_size,
"seq_len": prefill_seq_len,
"ctx_len": ctx_len,
# TODO: should be renamed to kv_cache_batch_size in specialzation too
}
prefill_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None
prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else None
prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ...
if self.continuous_batching:
prefill_specialization.update({"full_batch_size": kv_cache_batch_size})
else:
prefill_specialization.update({"batch_size": kv_cache_batch_size})
prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ...
specializations = [
prefill_specialization,
]
@@ -311,8 +326,11 @@
"seq_len": num_speculative_tokens + 1 if self.is_tlm else 1,
"ctx_len": ctx_len,
}
decode_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None
decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else None
if self.continuous_batching:
decode_specialization.update({"full_batch_size": kv_cache_batch_size})
else:
decode_specialization.update({"batch_size": kv_cache_batch_size})
decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ...
specializations.append(decode_specialization)

if enable_qnn:
@@ -363,7 +381,7 @@ def generate(
self,
tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
prompts: List[str],
device_id: List[int] = [0],
device_id: List[int] = None,
runtime_ai100: bool = True,
**kwargs,
):
@@ -569,7 +587,7 @@ def compile(
def generate(
self,
inputs: torch.Tensor,
device_ids: List[int] = [0],
device_ids: List[int] = None,
runtime_ai100: bool = True,
) -> Union[torch.Tensor, np.ndarray]:
"""
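Note: the new kv_cache_batch_size argument lets the KV-cache pool be sized independently of the decode batch, which is what prefix caching needs, and it is rejected unless continuous batching is enabled. A hedged end-to-end sketch follows; the model card and shape values are illustrative, and the continuous_batching flag on from_pretrained plus the prefill_seq_len argument are assumed from the surrounding (unchanged) code rather than shown in this diff:

```python
# Sketch of compiling with prefix caching: continuous batching must be on and
# full_batch_size must be provided, while kv_cache_batch_size sizes the KV pool.
from QEfficient import QEFFAutoModelForCausalLM

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", continuous_batching=True)
qpc_path = model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    full_batch_size=4,       # decode-time batch for continuous batching
    kv_cache_batch_size=8,   # larger KV-cache pool reserved for prefix caching
    num_cores=16,
    num_devices=1,
)
```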
9 changes: 5 additions & 4 deletions QEfficient/utils/constants.py
@@ -81,6 +81,7 @@ class QnnConstants:

# QNN Compilation target names
MODEL_NAME = "model"
QNN_DATA_FORMAT_CONFIG_NAME = "qnn_data_format_config.json"
CONTEXT_BIN_NAME = "qnngraph.serialized"
CONTEXT_BIN_QPC_NAME = "programqpc.bin"

@@ -90,7 +91,7 @@ class QnnConstants:
# Convertor Arguments
FLOAT_BITWIDTH = 16
FLOAT_BIAS_BITWIDTH = 32
CONVERTOR_DEFAULT_ARGS = "--keep_int64_inputs --onnx_no_simplification "
CONVERTOR_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification "

# Context-Binary-Generator Arguments
LOG_LEVEL = "error"
@@ -118,11 +119,11 @@ class QnnConstants:
IMMUTABLE_CONVERTOR_ARGS = [
"--input_network ",
"--output_path ",
"--io_config ",
"--config ",
"--float_bias_bitwidth ",
"--float_bitwidth ",
"--keep_int64_inputs",
"--onnx_no_simplification",
"--preserve_io_datatype",
"--onnx_skip_simplification",
"--onnx_defer_loading",
]

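Note: these constant changes track a rename of the converter flags: --io_config becomes --config, and the default arguments switch from --keep_int64_inputs --onnx_no_simplification to --preserve_io_datatype --onnx_skip_simplification (mirrored in the immutable-argument list so qnn_config.json cannot override them). A sketch of the command string converter() now assembles; the tool path, ONNX path, and qpc directory are placeholders:

```python
# Illustrative reconstruction of the converter command after this change; only the
# flags come from the diff above, the paths below are placeholders.
converter_tool = "<path-to-qnn-converter>"  # placeholder, resolved elsewhere in qnn_compiler.py
cmd = (
    f"{converter_tool} --input_network onnx/model.onnx "
    f"--output_path qpc/model.dlc "
    f"--config qpc/custom_io_config.yaml "  # was --io_config
    f"--float_bias_bitwidth 32 "
    f"--float_bitwidth 16 "
    f"--preserve_io_datatype --onnx_skip_simplification "  # new CONVERTOR_DEFAULT_ARGS
)
```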
44 changes: 42 additions & 2 deletions QEfficient/utils/generate_qnn_network_specialization_config.py
@@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

import json
from typing import Optional

import onnx
@@ -24,6 +25,7 @@ def fetch_nodes_info(
file_path: str = "custom_io_config.yaml",
full_batch_size: Optional[int] = None,
decode_only: Optional[bool] = False,
kv_precision: Optional[str] = "float16",
) -> None:
# Load the ONNX model
onnx_model = onnx.load(onnx_graph_path)
@@ -38,7 +40,7 @@
input_info = {}
input_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(node.type.tensor_type.elem_type))
if "past_key" in node.name or "past_value" in node.name:
input_info["DataType"] = "float16"
input_info["DataType"] = kv_precision

if "batch_index" in node.name:
if full_batch_size:
@@ -128,7 +130,7 @@ def fetch_nodes_info(
output_info = {}
output_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(output.type.tensor_type.elem_type))
if "past_key" in output.name or "past_value" in output.name:
output_info["DataType"] = "float16"
output_info["DataType"] = kv_precision
elif "logits" in output.name:
output_info["DataType"] = "float32"
output_nodes_info.append({"Name": output.name, "Desired Model Parameters": output_info})
@@ -142,3 +144,41 @@
yaml.dump(final_dict, yaml_file, default_flow_style=False, sort_keys=False)
except Exception as e:
print(f"Failed to create YAML File for QNN Network Specialization Configuration{file_path}: {e}")


def generate_data_format_config(
onnx_graph_path: str,
*,
data_format: Optional[str] = "QNN_TENSOR_DATA_FORMAT_MX",
model_dlc_name: Optional[str] = "model",
file_path: str = "qnn_data_format_config.json",
) -> None:
# Load the ONNX model
onnx_model = onnx.load(onnx_graph_path)

kv_nodes: list = []

for input in onnx_model.graph.input:
if "past_key" in input.name or "past_value" in input.name:
kv_nodes.append((input.name).replace(".", "_"))
for output in onnx_model.graph.output:
if "past_key" in output.name or "past_value" in output.name:
kv_nodes.append((output.name).replace(".", "_"))
kv_overrides = {}

kv_overrides["graphs"] = [
{
"graph_name": model_dlc_name + "_configuration_1",
"tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes],
},
{
"graph_name": model_dlc_name + "_configuration_2",
"tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes],
},
]

try:
with open(file_path, "w") as json_file:
json.dump(kv_overrides, json_file, indent=4)
except Exception as e:
print(f"Failed to create JSON File for QNN Data Format Configuration{file_path}: {e}")
1 change: 1 addition & 0 deletions pyproject.toml
@@ -20,6 +20,7 @@ classifiers = [
requires-python = ">=3.8,<3.11"
dependencies = [
"transformers==4.45.2",
"huggingface-hub==0.27.0",
"peft==0.13.2",
"datasets==2.20.0",
"fsspec==2023.6.0",