
Commit

Merge branch 'main' into new_docker_test_setup
Signed-off-by: Amit Raj <[email protected]>
quic-amitraj authored Sep 11, 2024
2 parents 539b24d + 67922d7 commit 3f86176
Showing 90 changed files with 4,501 additions and 1,947 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -88,3 +88,4 @@ cython_debug/
 # Local Files
 cache_dir
 qeff_models
+.vscode/*
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted (subject to the limitations in the
2 changes: 1 addition & 1 deletion QEfficient/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion QEfficient/base/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
4 changes: 0 additions & 4 deletions QEfficient/base/modeling_qeff.py
@@ -76,10 +76,6 @@ def __init__(self, model: torch.nn.Module) -> None:
     def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
         raise NotImplementedError("Must implement for child classes")
 
-    @property
-    def is_transformed(self) -> bool:
-        raise NotImplementedError("Must implement for child classes")
-
     def run_pytorch(self, inputs):
         raise NotImplementedError("Reached too far!!")
 
70 changes: 62 additions & 8 deletions QEfficient/base/onnx_transforms.py
@@ -5,7 +5,10 @@
 #
 # ----------------------------------------------------------------------------
 
-from onnx import ModelProto
+from typing import Optional, Tuple
+
+import numpy as np
+from onnx import ModelProto, external_data_helper, numpy_helper
 
 
 class OnnxTransform:
@@ -17,23 +20,74 @@ def __init__(self):
         raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.")
 
     @classmethod
-    def apply(cls, model: ModelProto) -> ModelProto:
+    def apply(cls, model: ModelProto, **kwargs) -> Tuple[ModelProto, bool]:
         """
         Override this class to apply a transformation.
         :param model: The model's ONNX graph to transform
+        :param kwargs: Parameters needed for specific transforms. All transforms should take **kwargs to ignore unneeded kwargs.
         :returns: ONNX graph after applying the transform
+        :returns: Boolean indicating whether transform was applied
         """
         raise NotImplementedError("Use subclasses for ONNX transform")
 
 
-class FP16Clip(OnnxTransform):
-    pass
+class FP16ClipTransform(OnnxTransform):
+    """
+    Clips the tensor values to be in FP16 range.
+    """
+
+    @classmethod
+    def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwargs) -> Tuple[ModelProto, bool]:
+        """
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        """
+        finfo = np.finfo(np.float16)
+        fp16_max = finfo.max
+        fp16_min = finfo.min
+        transformed = False
+        for tensor in external_data_helper._get_all_tensors(model):
+            nptensor = numpy_helper.to_array(tensor, onnx_base_dir)
+            if nptensor.dtype == np.float32 and (np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)):
+                nptensor = np.clip(nptensor, fp16_min, fp16_max)
+                new_tensor = numpy_helper.from_array(nptensor, tensor.name)
+                tensor.CopyFrom(new_tensor)
+                transformed = True
+        return model, transformed
 
 
-class SplitWeights(OnnxTransform):
-    pass
+class SplitTensorsTransform(OnnxTransform):
+    """
+    Split external tensors file
+    """
 
-
-class LoraAdapters(OnnxTransform):
-    pass
+    @classmethod
+    def apply(
+        cls,
+        model: ModelProto,
+        *,
+        model_name: str,
+        onnx_base_dir: Optional[str] = None,
+        file_chunk_size: int = 10 * 2**30,  # 10 GiB
+        size_threshold: int = 1024,
+        **kwargs,
+    ) -> Tuple[ModelProto, bool]:
+        """
+        :param model_name: Used for naming external files. i.e. {model_name}_0.onnx.data
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        :param file_chunk_size: Chunk size to split external files into.
+        :param size_threshold: Only tensors greater than this threshold (in bytes) will be saved externally.
+        """
+        file_num = 0
+        current_file_size = 0
+        transformed = False
+        external_data_helper.load_external_data_for_model(model, onnx_base_dir)
+        for tensor in external_data_helper._get_all_tensors(model):
+            if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold):
+                transformed = True
+                current_file_size += tsize
+                if current_file_size > file_chunk_size:
+                    file_num += 1
+                    current_file_size = tsize
+                external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data")
+        return model, transformed
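
For context, a minimal usage sketch of the new transform API (not part of this commit; the file paths and model name below are illustrative assumptions):

import onnx

from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform

# Load the exported graph without pulling external tensors into memory yet;
# the transforms accept onnx_base_dir and read tensor data relative to it.
model = onnx.load("onnx/model.onnx", load_external_data=False)

# Each transform returns the (possibly modified) model plus a boolean that
# reports whether anything actually changed; unrelated kwargs are ignored.
model, clipped = FP16ClipTransform.apply(model, onnx_base_dir="onnx")
model, split = SplitTensorsTransform.apply(model, model_name="model", onnx_base_dir="onnx", size_threshold=1024)

# Saving writes tensors marked external to their {model_name}_{i}.onnx.data files.
onnx.save(model, "onnx/model_transformed.onnx")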
23 changes: 14 additions & 9 deletions QEfficient/base/pytorch_transforms.py
@@ -4,8 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # ----------------------------------------------------------------------------
-
-from typing import Dict, Type
+from typing import Dict, Tuple, Type
 
 from torch import nn
 
@@ -19,32 +18,38 @@ def __init__(self):
         raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.")
 
     @classmethod
-    def apply(cls, model: nn.Module) -> nn.Module:
+    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         """
         Override this class method to apply a transformation.
-        :param model: The torch module to transform, this module may be tranformed in-place
+        :param model: The torch module to transform, this module may be transformed in-place
-        :returns: Torch module after applying the tranform
+        :returns: Torch module after applying the transform
+        :returns: Boolean indicating whether transform was applied
         """
         raise NotImplementedError("Use subclasses for Pytorch transform")
 
 
-class ModuleMapping(PytorchTransform):
+class ModuleMappingTransform(PytorchTransform):
     """
     Replaces the PyTorch modules based on the _module_mapping class variable.
     """
 
     _module_mapping: Dict[Type[nn.Module], Type[nn.Module]]
 
     @classmethod
-    def apply(cls, model: nn.Module) -> nn.Module:
+    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
+        transformed = False
         for module in model.modules():
             if repl_module := cls._module_mapping.get(type(module)):
                 module.__class__ = repl_module
-        return model
+                # Handling the __init__ calls in the models
+                if hasattr(module, "__qeff_init__"):
+                    module.__qeff_init__()
+                transformed = True
+        return model, transformed
 
     @classmethod
-    def register(cls, from_module: type, to_module: type):
+    def register(cls, from_module: Type[nn.Module], to_module: Type[nn.Module]):
         """
         Add a new module type in the module mapping for this transform. ::
             FlashAttention.register(LLamaAttention, LlamaFlashAttention)
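
As a usage note, here is a hypothetical sketch of subclassing the renamed ModuleMappingTransform (SlowGELU and FastGELU are made-up module names for illustration):

import torch
from torch import nn

from QEfficient.base.pytorch_transforms import ModuleMappingTransform

class SlowGELU(nn.Module):  # stand-in for an original module class
    def forward(self, x):
        return nn.functional.gelu(x)

class FastGELU(nn.Module):  # stand-in replacement with identical attributes
    def forward(self, x):
        return x * torch.sigmoid(1.702 * x)

class GELUSwapTransform(ModuleMappingTransform):
    _module_mapping = {SlowGELU: FastGELU}

model = nn.Sequential(nn.Linear(4, 4), SlowGELU())
model, transformed = GELUSwapTransform.apply(model)  # re-classes SlowGELU in-place
assert transformed and isinstance(model[1], FastGELU)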
10 changes: 8 additions & 2 deletions QEfficient/cloud/compile.py
@@ -55,7 +55,7 @@
     "--device-group",
     required=True,
     type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
-    help="Cloud AI 100 device ids (comma-separated) e.g. [0] ",
+    help="Cloud AI 100 device ids (comma-separated) e.g. [0,1] ",
 )
 parser.add_argument(
     "--aic_enable_depth_first",
@@ -69,7 +69,13 @@
     default=-1,
     help=" Effort level to reduce the on-chip memory",
 )
-
+parser.add_argument(
+    "--full_batch_size",
+    "--full-batch-size",
+    type=int,
+    default=None,
+    help="Set full batch size to enable continuous batching mode, default is None",
+)
 # FIXME(ochougul): Allow extra compilation arguments
 args = parser.parse_args()
 QEfficient.compile(**vars(args))
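
Since the parsed arguments are forwarded as-is via QEfficient.compile(**vars(args)), the new flag reaches the compile API directly. A hypothetical sketch of the equivalent API call (the path arguments and values are illustrative assumptions, not confirmed by this diff):

import QEfficient

# Roughly what the CLI does for:
#   python -m QEfficient.cloud.compile ... --device-group [0,1] --full_batch_size 4
QEfficient.compile(
    onnx_path="qeff_models/model.onnx",  # assumed input path
    qpc_path="qeff_models/qpc",          # assumed output location
    device_group=[0, 1],
    full_batch_size=4,  # None (the default) leaves continuous batching disabled
)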

