
Commit

Merge branch 'main' into new_docker_test_setup
Signed-off-by: Amit Raj <[email protected]>
quic-amitraj authored Sep 11, 2024
2 parents 539b24d + 67922d7 commit 3f86176
Showing 90 changed files with 4,501 additions and 1,947 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -88,3 +88,4 @@ cython_debug/
 # Local Files
 cache_dir
 qeff_models
+.vscode/*
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted (subject to the limitations in the
2 changes: 1 addition & 1 deletion QEfficient/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion QEfficient/base/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
4 changes: 0 additions & 4 deletions QEfficient/base/modeling_qeff.py
@@ -76,10 +76,6 @@ def __init__(self, model: torch.nn.Module) -> None:
     def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
         raise NotImplementedError("Must implement for child classes")
 
-    @property
-    def is_transformed(self) -> bool:
-        raise NotImplementedError("Must implement for child classes")
-
     def run_pytorch(self, inputs):
         raise NotImplementedError("Reached too far!!")
 
70 changes: 62 additions & 8 deletions QEfficient/base/onnx_transforms.py
@@ -5,7 +5,10 @@
 #
 # ----------------------------------------------------------------------------
 
-from onnx import ModelProto
+from typing import Optional, Tuple
+
+import numpy as np
+from onnx import ModelProto, external_data_helper, numpy_helper
 
 
 class OnnxTransform:
@@ -17,23 +20,74 @@ def __init__(self):
         raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.")
 
     @classmethod
-    def apply(cls, model: ModelProto) -> ModelProto:
+    def apply(cls, model: ModelProto, **kwargs) -> Tuple[ModelProto, bool]:
         """
         Override this class to apply a transformation.
         :param model: The model's ONNX graph to transform
+        :param kwargs: Parameters needed for specific transforms. All transforms should take **kwargs to ignore unneeded kwargs.
         :returns: ONNX graph after applying the transform
+        :returns: Boolean indicating whether transform was applied
         """
         raise NotImplementedError("Use subclasses for ONNX transform")
 
 
-class FP16Clip(OnnxTransform):
-    pass
+class FP16ClipTransform(OnnxTransform):
+    """
+    Clips the tensor values to be in FP16 range.
+    """
+
+    @classmethod
+    def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwargs) -> Tuple[ModelProto, bool]:
+        """
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        """
+        finfo = np.finfo(np.float16)
+        fp16_max = finfo.max
+        fp16_min = finfo.min
+        transformed = False
+        for tensor in external_data_helper._get_all_tensors(model):
+            nptensor = numpy_helper.to_array(tensor, onnx_base_dir)
+            if nptensor.dtype == np.float32 and (np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)):
+                nptensor = np.clip(nptensor, fp16_min, fp16_max)
+                new_tensor = numpy_helper.from_array(nptensor, tensor.name)
+                tensor.CopyFrom(new_tensor)
+                transformed = True
+        return model, transformed
 
 
-class SplitWeights(OnnxTransform):
-    pass
+class SplitTensorsTransform(OnnxTransform):
+    """
+    Split external tensors file
+    """
 
-
-class LoraAdapters(OnnxTransform):
-    pass
+    @classmethod
+    def apply(
+        cls,
+        model: ModelProto,
+        *,
+        model_name: str,
+        onnx_base_dir: Optional[str] = None,
+        file_chunk_size: int = 10 * 2**30,  # 10 GiB
+        size_threshold: int = 1024,
+        **kwargs,
+    ) -> Tuple[ModelProto, bool]:
+        """
+        :param model_name: Used for naming external files. i.e. {model_name}_0.onnx.data
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        :param file_chunk_size: Chunk size to split external files into.
+        :param size_threshold: Only tensors greater than this threshold (in bytes) will be saved externally.
+        """
+        file_num = 0
+        current_file_size = 0
+        transformed = False
+        external_data_helper.load_external_data_for_model(model, onnx_base_dir)
+        for tensor in external_data_helper._get_all_tensors(model):
+            if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold):
+                transformed = True
+                current_file_size += tsize
+                if current_file_size > file_chunk_size:
+                    file_num += 1
+                    current_file_size = tsize
+                external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data")
+        return model, transformed
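
For context, a minimal usage sketch of the new transform API (not part of this commit; the file paths and model name below are illustrative assumptions):

import onnx

from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform

# Load the exported graph without pulling external tensors into memory yet;
# the transforms accept onnx_base_dir and read tensor data relative to it.
model = onnx.load("onnx/model.onnx", load_external_data=False)

# Each transform returns the (possibly modified) model plus a boolean that
# reports whether anything actually changed; unrelated kwargs are ignored.
model, clipped = FP16ClipTransform.apply(model, onnx_base_dir="onnx")
model, split = SplitTensorsTransform.apply(model, model_name="model", onnx_base_dir="onnx", size_threshold=1024)

# Saving writes tensors marked external to their {model_name}_{i}.onnx.data files.
onnx.save(model, "onnx/model_transformed.onnx")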
23 changes: 14 additions & 9 deletions QEfficient/base/pytorch_transforms.py
@@ -4,8 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # ----------------------------------------------------------------------------
-
-from typing import Dict, Type
+from typing import Dict, Tuple, Type
 
 from torch import nn
 
@@ -19,32 +18,38 @@ def __init__(self):
         raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.")
 
     @classmethod
-    def apply(cls, model: nn.Module) -> nn.Module:
+    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         """
         Override this class method to apply a transformation.
-        :param model: The torch module to transform, this module may be tranformed in-place
+        :param model: The torch module to transform, this module may be transformed in-place
-        :returns: Torch module after applying the tranform
+        :returns: Torch module after applying the transform
+        :returns: Boolean indicating whether transform was applied
         """
         raise NotImplementedError("Use subclasses for Pytorch transform")
 
 
-class ModuleMapping(PytorchTransform):
+class ModuleMappingTransform(PytorchTransform):
     """
     Replaces the PyTorch modules based on the _module_mapping class variable.
     """
 
     _module_mapping: Dict[Type[nn.Module], Type[nn.Module]]
 
     @classmethod
-    def apply(cls, model: nn.Module) -> nn.Module:
+    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
+        transformed = False
         for module in model.modules():
             if repl_module := cls._module_mapping.get(type(module)):
                 module.__class__ = repl_module
-        return model
+                # Handling the __init__ calls in the models
+                if hasattr(module, "__qeff_init__"):
+                    module.__qeff_init__()
+                transformed = True
+        return model, transformed
 
     @classmethod
-    def register(cls, from_module: type, to_module: type):
+    def register(cls, from_module: Type[nn.Module], to_module: Type[nn.Module]):
         """
         Add a new module type in the module mapping for this transform. ::
             FlashAttention.register(LLamaAttention, LlamaFlashAttention)
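
As a usage note, here is a hypothetical sketch of subclassing the renamed ModuleMappingTransform (SlowGELU and FastGELU are made-up module names for illustration):

import torch
from torch import nn

from QEfficient.base.pytorch_transforms import ModuleMappingTransform

class SlowGELU(nn.Module):  # stand-in for an original module class
    def forward(self, x):
        return nn.functional.gelu(x)

class FastGELU(nn.Module):  # stand-in replacement with identical attributes
    def forward(self, x):
        return x * torch.sigmoid(1.702 * x)

class GELUSwapTransform(ModuleMappingTransform):
    _module_mapping = {SlowGELU: FastGELU}

model = nn.Sequential(nn.Linear(4, 4), SlowGELU())
model, transformed = GELUSwapTransform.apply(model)  # re-classes SlowGELU in-place
assert transformed and isinstance(model[1], FastGELU)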
10 changes: 8 additions & 2 deletions QEfficient/cloud/compile.py
@@ -55,7 +55,7 @@
     "--device-group",
     required=True,
     type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
-    help="Cloud AI 100 device ids (comma-separated) e.g. [0] ",
+    help="Cloud AI 100 device ids (comma-separated) e.g. [0,1] ",
 )
 parser.add_argument(
     "--aic_enable_depth_first",
@@ -69,7 +69,13 @@
     default=-1,
     help=" Effort level to reduce the on-chip memory",
 )
-
+parser.add_argument(
+    "--full_batch_size",
+    "--full-batch-size",
+    type=int,
+    default=None,
+    help="Set full batch size to enable continuous batching mode, default is None",
+)
 # FIXME(ochougul): Allow extra compilation arguments
 args = parser.parse_args()
 QEfficient.compile(**vars(args))
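
Since the parsed arguments are forwarded as-is via QEfficient.compile(**vars(args)), the new flag reaches the compile API directly. A hypothetical sketch of the equivalent API call (the path arguments and values are illustrative assumptions, not confirmed by this diff):

import QEfficient

# Roughly what the CLI does for:
#   python -m QEfficient.cloud.compile ... --device-group [0,1] --full_batch_size 4
QEfficient.compile(
    onnx_path="qeff_models/model.onnx",  # assumed input path
    qpc_path="qeff_models/qpc",          # assumed output location
    device_group=[0, 1],
    full_batch_size=4,  # None (the default) leaves continuous batching disabled
)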

