Merge branch 'quic:main' into finetune
Signed-off-by: Mamta Singh <[email protected]>
quic-mamta committed Jan 13, 2025
2 parents 2c41404 + 05275e5 commit c3079b0
Showing 14 changed files with 340 additions and 45 deletions.
34 changes: 21 additions & 13 deletions QEfficient/__init__.py
@@ -5,21 +5,27 @@
#
# -----------------------------------------------------------------------------

try:
import platform
import sys

sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
import qaicrt # noqa: F401
def check_qaic_sdk():
"""Check if QAIC SDK is installed"""
try:
import platform
import sys

qaic_sdk_installed = True
except ModuleNotFoundError:
qaic_sdk_installed = False
sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
import qaicrt # noqa: F401

__version__ = "0.0.1.dev0"
return True
except ImportError:
return False


QAIC_INSTALLED = check_qaic_sdk()

if qaic_sdk_installed:
from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"
if QAIC_INSTALLED:
from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
@@ -34,10 +40,12 @@
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEffAutoModel",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFCommonLoader",
]

print("QAIC SDK is installed.")
else:
print("QAIC SDK is not found, skipping QEfficient imports.")
print("QAIC SDK is not installed. Proceeding without it.")
36 changes: 25 additions & 11 deletions QEfficient/compile/qnn_compiler.py
@@ -11,7 +11,7 @@

from QEfficient.utils._utils import create_json, execute_command, load_json
from QEfficient.utils.constants import QnnConstants
from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info
from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config
from QEfficient.utils.logging_utils import logger


@@ -38,6 +38,8 @@ def __init__(
qnn_target: str = QnnConstants.TARGET,
qnn_config_path: Optional[str] = None,
qnn_binary_dir: Optional[str] = None,
mxint8: Optional[bool] = False,
compiler_mxint8_mdp_io: Optional[bool] = False,
**kwargs,
) -> None:
self.onnx_path = onnx_path
@@ -52,6 +54,8 @@ def __init__(
self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights
self.qnn_config_path = qnn_config_path
self.qnn_binary_dir = qnn_binary_dir
self.mxint8 = mxint8
self.compiler_mxint8_mdp_io = compiler_mxint8_mdp_io
self.custom_io_path = custom_io_path
self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc")
self.qnn_target = qnn_target
@@ -148,6 +152,7 @@ def create_qnn_compile_backend_json(self) -> str:
"compiler_stat_level": QnnConstants.COMPILER_STAT_LEVEL,
"compiler_stats_batch_size": QnnConstants.COMPILER_STATS_BATCH_SIZE,
"compiler_time_passes": QnnConstants.COMPILER_TIME_PASSES,
"compiler_mxint8_mdp_io": self.compiler_mxint8_mdp_io,
}
if self.compiler_max_out_channel_split > 0:
qnn_compile_backend["compiler_max_out_channel_split"] = str(self.compiler_max_out_channel_split)
@@ -225,10 +230,10 @@ def converter(self) -> str:
IMMUTABLE parameters which can not be overridden by the user using qnn_config.json:
:input_network (str): Generated ``ONNX`` Model Path.
:output_path (str): Path to generated DLC file, which is provided qpc_base_path/model.dlc
:io_config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py
:config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py
:float_bias_bitwidth (int): Bitwidth to use for float bias tensor
:float_bitwidth (int): Converts the graph to the specified float bitwidth, either 32 or 16(Default).
:keep_int64_inputs(flag): Passed by default.
:preserve_io_datatype(flag): Passed by default.
CONVERTOR_ARGS_EXTENSION passed in qnn_config.json is appended to the command created.
@@ -240,7 +245,7 @@ def converter(self) -> str:
cmd = (
f"{converter_tool} --input_network {self.onnx_path} "
f"--output_path {self.dlc_model_path} "
f"--io_config {self.custom_io_path} "
f"--config {self.custom_io_path} "
f"--float_bias_bitwidth {QnnConstants.FLOAT_BIAS_BITWIDTH} "
f"--float_bitwidth {QnnConstants.FLOAT_BITWIDTH} "
)
@@ -287,6 +292,17 @@ def generate_context_binary(self) -> str:
f"--config_file {config_file_path} "
)

if self.mxint8:
data_format_file_path = os.path.join(self.qpc_base_path, QnnConstants.QNN_DATA_FORMAT_CONFIG_NAME)
generate_data_format_config(
self.onnx_path, model_dlc_name=QnnConstants.MODEL_NAME, file_path=data_format_file_path
)
if not os.path.isfile(data_format_file_path):
raise FileNotFoundError(
f"file {data_format_file_path} needs to exist in the qpc_base_path for mxint8 compilation. Please rerun infer/compile Api"
)
cmd += f"--data_format_config {data_format_file_path} "

if self.qnn_config and QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR in self.qnn_config:
if "--log_level " not in self.qnn_config[QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR]:
cmd += f"--log_level {QnnConstants.LOG_LEVEL} "
@@ -353,27 +369,23 @@ def compile(

if kwargs:
logger.warning("Extra arguments to QNN compilation are not supported as of now!")

raise NotImplementedError("Can't handle extra compilation args now!")

if allow_mxint8_mdp_io:
logger.warning("QNN doesn't support allow_mxint8_mdp_io. Bypassing the value passed for allow_mxint8_mdp_io")

if mxint8:
logger.warning("QNN doesn't support mxint8. Bypassing the value passed for mxint8")

os.makedirs(qpc_base_path, exist_ok=True)

# Created custom_io_config.yaml file for QNN-Convertor stage.
# TODO To make custom_io_config.yaml configurable as not all models need it.
custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml")

kv_precision = "uint8" if mxint8 else "float16"
fetch_nodes_info(
onnx_graph_path=onnx_path,
batch_size=batch_size,
sequence_length=prompt_len,
context_length=ctx_len,
file_path=custom_io_file_path,
full_batch_size=full_batch_size,
kv_precision=kv_precision,
)

if not os.path.isfile(custom_io_file_path):
@@ -395,6 +407,8 @@ def compile(
ctx_len=ctx_len,
compiler_mxfp6_matmul_weights=mxfp6,
qnn_binary_dir=qnn_binary_dir,
mxint8=mxint8,
compiler_mxint8_mdp_io=allow_mxint8_mdp_io,
)

compiled_binary_path = qnn_obj.compile()
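Note: taken together, these hunks mean the QNN path no longer rejects mxint8: compile() derives a KV-cache precision for the network-specialization YAML, forwards both flags into the QNN object, and generate_context_binary() passes the emitted data-format JSON via --data_format_config. A hedged caller sketch follows, using only keyword names visible in this diff; any other required arguments of compile() are elided, the paths and shape values are placeholders, and the return value is assumed to be the compiled-binary path:

```python
# Sketch of invoking the QNN compile helper with the newly honoured mxint8 flags.
from QEfficient.compile.qnn_compiler import compile as qnn_compile

qpc_path = qnn_compile(
    onnx_path="onnx/model.onnx",
    qpc_base_path="qpc",
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=True,
    mxint8=True,               # KV I/O becomes uint8 and qnn_data_format_config.json is generated
    allow_mxint8_mdp_io=True,  # forwarded as compiler_mxint8_mdp_io in the backend JSON
)
```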
4 changes: 3 additions & 1 deletion QEfficient/generation/text_generation_inference.py
@@ -341,7 +341,9 @@ def cloud_ai_100_exec_kv(
perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
)
else:
exec_info = generate_text.generate(prompt=prompt, generation_len=generation_len)
exec_info = generate_text.generate(
prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
)

print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)
return exec_info
6 changes: 3 additions & 3 deletions QEfficient/peft/lora/auto.py
@@ -342,9 +342,9 @@ def generate(
self,
tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
prompts: List[str],
device_id: List[int] = None,
prompt_to_adapter_mapping: List[str] = None,
runtime: str = "AI_100",
device_id: Optional[List[int]] = None,
runtime: Optional[str] = "AI_100",
**kwargs,
):
"""
@@ -355,9 +355,9 @@ def generate(
``Mandatory`` Args:
:tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): The tokenizer used in the inference
:prompts (List[str]): List of prompts to run the execution.
:device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
:prompt_to_adapter_mapping (List[str]): The sequence of the adapter names will be matched with sequence of prompts and corresponding adapters will be used for the prompts."base" for base model (no adapter).
``optional`` Args:
:device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.
:runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
"""
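Note: with this reordering, device_id and runtime are explicitly optional keywords and each prompt is paired with an adapter name. A usage sketch, under the assumption that lora_model is an already-compiled instance of the LoRA auto class defined in this file and tokenizer is its matching Hugging Face tokenizer; both names, the prompts, and the adapter names are placeholders:

```python
# Hedged sketch of the new generate() signature: prompt_to_adapter_mapping pairs
# each prompt with an adapter name; "base" selects the base model (no adapter).
exec_info = lora_model.generate(
    tokenizer=tokenizer,
    prompts=["Summarize this ticket.", "What is the capital of France?"],
    prompt_to_adapter_mapping=["ticket_summarizer", "base"],
    device_id=None,  # None -> auto device picker; pass e.g. [0, 1, 2, 3] for tensor slicing
)
```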
30 changes: 24 additions & 6 deletions QEfficient/transformers/models/modeling_auto.py
@@ -209,6 +209,7 @@ def export(self, export_dir: Optional[str] = None) -> str:
2: "ctx_len",
}
output_names = ["logits"]

for i in range(self.num_layers):
for kv in ["key", "value"]:
example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
@@ -240,6 +241,7 @@ def compile(
ctx_len: int = 128,
batch_size: int = 1,
full_batch_size: Optional[int] = None,
kv_cache_batch_size: Optional[int] = None,
num_devices: int = 1,
num_cores: int = 16, # FIXME: Make this mandatory arg
mxfp6_matmul: bool = False,
@@ -291,15 +293,28 @@
if self.continuous_batching and full_batch_size is None:
raise TypeError("missing required argument: 'full_batch_size'")

if kv_cache_batch_size and not full_batch_size:
raise ValueError(
"Prefix caching is enabled only for continuous batching as of now. Please pass `full_batch_size` argument and make sure you pass `continuous_batching=True` in the `from_pretrained` call"
)

kv_cache_batch_size = (
kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size)
)
# Define prefill specialization
prefill_specialization = {
# Prefill is always run with single BS for continuous batching.
"batch_size": 1 if self.continuous_batching else batch_size,
"seq_len": prefill_seq_len,
"ctx_len": ctx_len,
# TODO: should be renamed to kv_cache_batch_size in specialzation too
}
prefill_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None
prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else None
prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ...
if self.continuous_batching:
prefill_specialization.update({"full_batch_size": kv_cache_batch_size})
else:
prefill_specialization.update({"batch_size": kv_cache_batch_size})
prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ...
specializations = [
prefill_specialization,
]
@@ -311,8 +326,11 @@
"seq_len": num_speculative_tokens + 1 if self.is_tlm else 1,
"ctx_len": ctx_len,
}
decode_specialization.update({"full_batch_size": full_batch_size}) if self.continuous_batching else None
decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else None
if self.continuous_batching:
decode_specialization.update({"full_batch_size": kv_cache_batch_size})
else:
decode_specialization.update({"batch_size": kv_cache_batch_size})
decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ...
specializations.append(decode_specialization)

if enable_qnn:
@@ -363,7 +381,7 @@ def generate(
self,
tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
prompts: List[str],
device_id: List[int] = [0],
device_id: List[int] = None,
runtime_ai100: bool = True,
**kwargs,
):
@@ -569,7 +587,7 @@ def compile(
def generate(
self,
inputs: torch.Tensor,
device_ids: List[int] = [0],
device_ids: List[int] = None,
runtime_ai100: bool = True,
) -> Union[torch.Tensor, np.ndarray]:
"""
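Note: the new kv_cache_batch_size argument lets the KV-cache pool be sized independently of the decode batch, which is what prefix caching needs, and it is rejected unless continuous batching is enabled. A hedged end-to-end sketch follows; the model card and shape values are illustrative, and the continuous_batching flag on from_pretrained plus the prefill_seq_len argument are assumed from the surrounding (unchanged) code rather than shown in this diff:

```python
# Sketch of compiling with prefix caching: continuous batching must be on and
# full_batch_size must be provided, while kv_cache_batch_size sizes the KV pool.
from QEfficient import QEFFAutoModelForCausalLM

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", continuous_batching=True)
qpc_path = model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    full_batch_size=4,       # decode-time batch for continuous batching
    kv_cache_batch_size=8,   # larger KV-cache pool reserved for prefix caching
    num_cores=16,
    num_devices=1,
)
```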
9 changes: 5 additions & 4 deletions QEfficient/utils/constants.py
@@ -81,6 +81,7 @@ class QnnConstants:

# QNN Compilation target names
MODEL_NAME = "model"
QNN_DATA_FORMAT_CONFIG_NAME = "qnn_data_format_config.json"
CONTEXT_BIN_NAME = "qnngraph.serialized"
CONTEXT_BIN_QPC_NAME = "programqpc.bin"

@@ -90,7 +91,7 @@ class QnnConstants:
# Convertor Arguments
FLOAT_BITWIDTH = 16
FLOAT_BIAS_BITWIDTH = 32
CONVERTOR_DEFAULT_ARGS = "--keep_int64_inputs --onnx_no_simplification "
CONVERTOR_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification "

# Context-Binary-Generator Arguments
LOG_LEVEL = "error"
@@ -118,11 +119,11 @@ class QnnConstants:
IMMUTABLE_CONVERTOR_ARGS = [
"--input_network ",
"--output_path ",
"--io_config ",
"--config ",
"--float_bias_bitwidth ",
"--float_bitwidth ",
"--keep_int64_inputs",
"--onnx_no_simplification",
"--preserve_io_datatype",
"--onnx_skip_simplification",
"--onnx_defer_loading",
]

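Note: these constant changes track a rename of the converter flags: --io_config becomes --config, and the default arguments switch from --keep_int64_inputs --onnx_no_simplification to --preserve_io_datatype --onnx_skip_simplification (mirrored in the immutable-argument list so qnn_config.json cannot override them). A sketch of the command string converter() now assembles; the tool path, ONNX path, and qpc directory are placeholders:

```python
# Illustrative reconstruction of the converter command after this change; only the
# flags come from the diff above, the paths below are placeholders.
converter_tool = "<path-to-qnn-converter>"  # placeholder, resolved elsewhere in qnn_compiler.py
cmd = (
    f"{converter_tool} --input_network onnx/model.onnx "
    f"--output_path qpc/model.dlc "
    f"--config qpc/custom_io_config.yaml "  # was --io_config
    f"--float_bias_bitwidth 32 "
    f"--float_bitwidth 16 "
    f"--preserve_io_datatype --onnx_skip_simplification "  # new CONVERTOR_DEFAULT_ARGS
)
```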
44 changes: 42 additions & 2 deletions QEfficient/utils/generate_qnn_network_specialization_config.py
@@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

import json
from typing import Optional

import onnx
@@ -24,6 +25,7 @@ def fetch_nodes_info(
file_path: str = "custom_io_config.yaml",
full_batch_size: Optional[int] = None,
decode_only: Optional[bool] = False,
kv_precision: Optional[str] = "float16",
) -> None:
# Load the ONNX model
onnx_model = onnx.load(onnx_graph_path)
@@ -38,7 +40,7 @@
input_info = {}
input_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(node.type.tensor_type.elem_type))
if "past_key" in node.name or "past_value" in node.name:
input_info["DataType"] = "float16"
input_info["DataType"] = kv_precision

if "batch_index" in node.name:
if full_batch_size:
@@ -128,7 +130,7 @@ def fetch_nodes_info(
output_info = {}
output_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(output.type.tensor_type.elem_type))
if "past_key" in output.name or "past_value" in output.name:
output_info["DataType"] = "float16"
output_info["DataType"] = kv_precision
elif "logits" in output.name:
output_info["DataType"] = "float32"
output_nodes_info.append({"Name": output.name, "Desired Model Parameters": output_info})
@@ -142,3 +144,41 @@
yaml.dump(final_dict, yaml_file, default_flow_style=False, sort_keys=False)
except Exception as e:
print(f"Failed to create YAML File for QNN Network Specialization Configuration{file_path}: {e}")


def generate_data_format_config(
onnx_graph_path: str,
*,
data_format: Optional[str] = "QNN_TENSOR_DATA_FORMAT_MX",
model_dlc_name: Optional[str] = "model",
file_path: str = "qnn_data_format_config.json",
) -> None:
# Load the ONNX model
onnx_model = onnx.load(onnx_graph_path)

kv_nodes: list = []

for input in onnx_model.graph.input:
if "past_key" in input.name or "past_value" in input.name:
kv_nodes.append((input.name).replace(".", "_"))
for output in onnx_model.graph.output:
if "past_key" in output.name or "past_value" in output.name:
kv_nodes.append((output.name).replace(".", "_"))
kv_overrides = {}

kv_overrides["graphs"] = [
{
"graph_name": model_dlc_name + "_configuration_1",
"tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes],
},
{
"graph_name": model_dlc_name + "_configuration_2",
"tensors": [{"tensor_name": node, "dataFormat": data_format} for node in kv_nodes],
},
]

try:
with open(file_path, "w") as json_file:
json.dump(kv_overrides, json_file, indent=4)
except Exception as e:
print(f"Failed to create JSON File for QNN Data Format Configuration{file_path}: {e}")
1 change: 1 addition & 0 deletions pyproject.toml
@@ -20,6 +20,7 @@ classifiers = [
requires-python = ">=3.8,<3.11"
dependencies = [
"transformers==4.45.2",
"huggingface-hub==0.27.0",
"peft==0.13.2",
"datasets==2.20.0",
"fsspec==2023.6.0",