From 0cf32b0d1d61bd8ab4176bee86f56d07a437a152 Mon Sep 17 00:00:00 2001 From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:16:54 +0530 Subject: [PATCH 1/5] Updated documents (#222) Signed-off-by: Amit Raj --- .../transformers/models/modeling_auto.py | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ff657d29..c2e3777b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -83,11 +83,14 @@ class QEFFAutoModelForCausalLM(QEFFTransformersBase): .. code-block:: python from QEfficient import QEFFAutoModelForCausalLM + from transformers import AutoTokenizer + model_name = "gpt2" model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=2) - model.compile(prefill_seq_len=32, ctx_len=1024) + model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, num_devices=1) - model.generate(prompts=["Hi there!!"]) + tokenizer = AutoTokenizer.from_pretrained(model_name) + model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) """ _hf_auto_class = AutoModelForCausalLM @@ -141,15 +144,18 @@ def from_pretrained( .. code-block:: python from QEfficient import QEFFAutoModelForCausalLM + from transformers import AutoTokenizer # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM - model = QEFFAutoModelForCausalLM.from_pretrained("gpt2") + model_name = "gpt2" + model = QEFFAutoModelForCausalLM.from_pretrained(model_name) # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=6, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + model.compile(num_cores=16) # Considering you have a Cloud AI 100 Standard SKU # You can now execute the model - model.generate(prompts=["Hi there!!"]) + tokenizer = AutoTokenizer.from_pretrained(model_name) + model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) """ if kwargs.pop("full_batch_size", None): @@ -391,9 +397,11 @@ def generate( If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Pass tokenizer of the model. :prompts (List[str]): List of prompts to run the execution. - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + ``optional`` Args: + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. 
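A minimal end-to-end sketch of this call (``gpt2`` and ``num_cores=16`` are illustrative values reused from the docstrings above, not additional requirements of the API):

.. code-block:: python

    from transformers import AutoTokenizer
    from QEfficient import QEFFAutoModelForCausalLM

    model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")
    model.compile(num_cores=16)  # compile for Cloud AI 100 before generating

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model.generate(prompts=["Hi there!!"], tokenizer=tokenizer)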
""" @@ -430,7 +438,7 @@ class QEFFAutoModel(QEFFTransformersBase): model = QEFFAutoModel.from_pretrained("model_name") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU #prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -469,7 +477,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): model = QEFFAutoModel.from_pretrained("model_name") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU #prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -594,10 +602,9 @@ def generate( This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. ``Mandatory`` Args: :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model ``optional`` Args: + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. - :eq_len (int, optional): Sequence length for the inputs. Defaults to constants.Constants.CTX_LEN. Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. """ @@ -660,7 +667,7 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray Generates features from a list of text prompts using a PyTorch model. ``Mandatory`` Args: - model: The transformed PyTorch model used for generating features. + :model: The transformed PyTorch model used for generating features. :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. Returns: From b6358032d08d676ddf208344b3620683189c5fc2 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:17:30 +0530 Subject: [PATCH 2/5] [QEff. Finetune] : Bypass qeff import and add trust_remote_code flag for samsum dataset (#206) 1. Bypass qeff import 2. 
Add trust_remote_code flag for samsum dataset --------- Signed-off-by: Mamta Singh --- QEfficient/__init__.py | 61 +++++++++++++------ QEfficient/cloud/finetune.py | 3 - QEfficient/finetune/configs/peft_config.py | 2 +- QEfficient/finetune/configs/training.py | 2 +- QEfficient/finetune/dataset/samsum_dataset.py | 18 +----- scripts/finetune/run_ft_model.py | 7 ++- 6 files changed, 49 insertions(+), 44 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 8e32a1e6..1bc06ccf 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,24 +5,47 @@ # # ----------------------------------------------------------------------------- -from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader -from QEfficient.compile.compile_helper import compile -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv -from QEfficient.peft import QEffAutoPeftModelForCausalLM -from QEfficient.transformers.transform import transform - -# Users can use QEfficient.export for exporting models to ONNX -export = qualcomm_efficient_converter + +def check_qaic_sdk(): + """Check if QAIC SDK is installed""" + try: + import platform + import sys + + sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}") + import qaicrt # noqa: F401 + + return True + except ImportError: + return False + + +QAIC_INSTALLED = check_qaic_sdk() + +# Conditionally import QAIC-related modules if the SDK is installed __version__ = "0.0.1.dev0" +if QAIC_INSTALLED: + from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader + from QEfficient.compile.compile_helper import compile + from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter + from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv + from QEfficient.peft import QEffAutoPeftModelForCausalLM + from QEfficient.transformers.transform import transform + + # Users can use QEfficient.export for exporting models to ONNX + export = qualcomm_efficient_converter + + __all__ = [ + "transform", + "export", + "compile", + "cloud_ai_100_exec_kv", + "QEFFAutoModel", + "QEFFAutoModelForCausalLM", + "QEffAutoPeftModelForCausalLM", + "QEFFCommonLoader", + ] -__all__ = [ - "transform", - "export", - "compile", - "cloud_ai_100_exec_kv", - "QEFFAutoModel", - "QEFFAutoModelForCausalLM", - "QEffAutoPeftModelForCausalLM", - "QEFFCommonLoader", -] + print("QAIC SDK is installed.") +else: + print("QAIC SDK is not installed. 
Proceeding without it.") diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index b0ce3d99..eadab0d9 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import os import random import warnings @@ -58,8 +57,6 @@ def main(**kwargs): update_config(train_config, **kwargs) device = train_config.device - os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "True" - # dist init if train_config.enable_ddp: # TODO: may have to init qccl backend, next try run with torchrun command diff --git a/QEfficient/finetune/configs/peft_config.py b/QEfficient/finetune/configs/peft_config.py index 13b9d6aa..e2d018f0 100644 --- a/QEfficient/finetune/configs/peft_config.py +++ b/QEfficient/finetune/configs/peft_config.py @@ -20,7 +20,7 @@ class lora_config: bias = "none" task_type: str = "CAUSAL_LM" lora_dropout: float = 0.05 - inference_mode: bool = False + inference_mode: bool = False # should be False for finetuning # CAUTION prefix tuning is currently not supported diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py index ac750ebe..41ffa3fb 100644 --- a/QEfficient/finetune/configs/training.py +++ b/QEfficient/finetune/configs/training.py @@ -38,7 +38,7 @@ class train_config: save_metrics: bool = True # saves training metrics to a json file for later plotting intermediate_step_save: int = 1000 batching_strategy: str = "packing" - enable_sorting_for_ddp: bool = "True" + enable_sorting_for_ddp: bool = True # TODO: vbaddi: Uncomment post adding qaic to Pytorch Profiler # flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time. diff --git a/QEfficient/finetune/dataset/samsum_dataset.py b/QEfficient/finetune/dataset/samsum_dataset.py index e6680cf7..71814599 100644 --- a/QEfficient/finetune/dataset/samsum_dataset.py +++ b/QEfficient/finetune/dataset/samsum_dataset.py @@ -5,27 +5,11 @@ # # ----------------------------------------------------------------------------- -from unittest.mock import patch - import datasets -@patch("builtins.input", return_value="N") -def load_samsum(split, _): - try: - ds = datasets.load_dataset("Samsung/samsum", split=split) - except ValueError as e: - if "trust_remote_code" in str(e): - raise ValueError( - "Loading Samsung/samsum requires you to execute the dataset script in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set HF_DATASETS_TRUST_REMOTE_CODE env variable to True." 
- ) from e - else: - raise e - return ds - - def get_preprocessed_samsum(dataset_config, tokenizer, split, context_length=None): - dataset = load_samsum(split) + dataset = datasets.load_dataset("Samsung/samsum", split=split, trust_remote_code=True) prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n" diff --git a/scripts/finetune/run_ft_model.py b/scripts/finetune/run_ft_model.py index f607d253..5e88db64 100644 --- a/scripts/finetune/run_ft_model.py +++ b/scripts/finetune/run_ft_model.py @@ -9,17 +9,18 @@ import warnings import torch -from configs.training import train_config as TRAIN_CONFIG from peft import AutoPeftModelForCausalLM from transformers import AutoModelForCausalLM, AutoTokenizer +from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG + # Suppress all warnings warnings.filterwarnings("ignore") try: import torch_qaic # noqa: F401 - device = "qaic:1" + device = "qaic:0" except ImportError as e: print(f"Warning: {e}. Moving ahead without these qaic modules.") device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -29,7 +30,7 @@ train_config.model_name, use_cache=False, attn_implementation="sdpa", - torch_dtype=torch.float16 if torch.cuda.is_available() or device == "qaic:1" else None, + torch_dtype=torch.float16 if torch.cuda.is_available() or device == "qaic:0" else None, ) # Load the tokenizer and add special tokens From ba1f984ccd2e4986fcc884e317855767875eb066 Mon Sep 17 00:00:00 2001 From: shubhagr-quic Date: Mon, 13 Jan 2025 18:59:04 +0530 Subject: [PATCH 3/5] Updated Qefficient documentation with QNN Compilation. (#223) Signed-off-by: Shubham Agrawal --- docs/source/hl_api.md | 7 --- docs/source/quick_start.md | 97 +++++++++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 34 deletions(-) diff --git a/docs/source/hl_api.md b/docs/source/hl_api.md index 5662b23a..d5f2e10f 100644 --- a/docs/source/hl_api.md +++ b/docs/source/hl_api.md @@ -47,13 +47,6 @@ import QEfficient base_path, onnx_model_path = QEfficient.export(model_name="gpt2") qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) - - # Similarly for QPC Compiled via QNN SDK - # 1. export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder - # 2. add --enable_qnn in the command - # 3. An optional config file can be provided via qnn_config if user wish to override the default parameters. - qpc_path_qnn = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0], - enable_qnn=True, qnn_config = "QEfficient/compile/qnn_config.json") .. deprecated:: This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.compile instead ``` diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 6b117322..55e0746e 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -50,16 +50,6 @@ You can also pass path of txt file with input prompts when you want to run infer python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` -For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. 
-Without QNN Config -```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn -``` - -With QNN Config -```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json -```` ### QEfficient.cloud.execute You can first run `infer` API and then use `execute` to run the pre-compiled model on Cloud AI 100 cards. Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts. Make sure to pass same `--device_group` as used during infer. Refer [Execute API doc](execute_api) for more details. @@ -83,10 +73,6 @@ You can also enable MQ, just based on the number of devices. Based on the `--dev python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first ``` -For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. -```bash -python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json -``` Above step will save the `qpc` files under `efficient-transformers/qeff_models/{model_card_name}`, you can use the execute API to run for different prompts. This will automatically pick the pre-compiled `qpc` files. ```bash @@ -99,12 +85,6 @@ To disable MQ, just pass single soc like below, below step will compile the mode python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first ``` -For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. -```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json -``` - - ### Continuous Batching Users can compile a model utilizing the continuous batching feature by specifying full_batch_size in the infer and compiler APIs. If full_batch_size is not provided, the model will be compiled in the regular way. @@ -118,11 +98,77 @@ python -m QEfficient.cloud.infer --model_name TinyLlama/TinyLlama_v1.1 --prompt_ theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first --full_batch_size 3 ``` -For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. +### QNN Compilation + +Users can compile a model with QNN SDK by following the steps below: + +* Set QNN SDK Path: export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder +* Enabled QNN by passing enable_qnn flag, add --enable_qnn in the cli command. 
+* An optional config file can be passed to override the default parameters. + +**CLI Inference Command** + +Without QNN Config ```bash -python -m QEfficient.cloud.infer --model_name TinyLlama/TinyLlama_v1.1 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth -theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first --full_batch_size 3 --enable_qnn QEfficient/compile/qnn_config.json +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn +``` + +With QNN Config +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json +```` + +**CLI Compile Command** + +Users can also use `compile` API to compile pre exported onnx models using QNN SDK. + +Without QNN Config +```bash +python -m QEfficient.cloud.compile --onnx_path --qpc-path --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn ``` + +With QNN Config +```bash +python -m QEfficient.cloud.compile --onnx_path --qpc-path --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json +```` + +**CLI Execute Command** + +Once we have compiled the QPC using `infer` or `compile` API, we can now use the precompiled QPC in `execute` API to run for different prompts. + +Make sure to pass same `--device_group` as used during infer. Refer [Execute API doc](execute_api) for more details. + +```bash +python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_qnn_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] +``` + +**QNN Compilation via Python API** + +Users can also use python API to export, compile and execute onnx models using QNN SDK. + +```Python +# We can now export the modified models to ONNX framework +# This will generate single ONNX Model for both Prefill and Decode Variations which are optimized for +# Cloud AI 100 Platform. +from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM + +# Model-Card name (This is HF Model Card name) : https://huggingface.co/gpt2-xl +model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. + +qeff_model = AutoModelForCausalLM.from_pretrained(model_name) + +generated_qpc_path = qeff_model.compile( + num_cores=14, + mxfp6=True, + enable_qnn=True, + qnn_config = qnn_config_file_path # QNN compilation configuration is passed. +) + +qeff_model.generate(prompts=["My name is"]) +``` + +**Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.** + ## Python API ### 1. Model download and Optimize for Cloud AI 100 @@ -169,9 +215,6 @@ Use the qualcomm_efficient_converter API to export the KV transformed Model to O generated_qpc_path = qeff_model.compile( num_cores=14, mxfp6=True, - device_group=[0], - enable_qnn=True # if QNN Compilation path {default = False} - qnn_config = qnn_config_file_path # if QNN compilation configuration is passed {default = None}. 
) ``` @@ -202,4 +245,4 @@ tlm.compile(num_speculative_tokens=k) dlm.compile() ``` -The `is_tlm` flag is fed during the instantiation of the model because slight changes to the ONNX graph are required. Once complete, the user can specify `num_speculative_tokens` to define the actual number of speculations that the TLM will take as input during the decode phase. As for the DLM, no new changes are required at the ONNX or compile level. \ No newline at end of file +The `is_tlm` flag is fed during the instantiation of the model because slight changes to the ONNX graph are required. Once complete, the user can specify `num_speculative_tokens` to define the actual number of speculations that the TLM will take as input during the decode phase. As for the DLM, no new changes are required at the ONNX or compile level. From eba49be8775a756d6d84851c0c6a76ae824f08a3 Mon Sep 17 00:00:00 2001 From: Erick Platero <40013722+eplatero97@users.noreply.github.com> Date: Wed, 15 Jan 2025 00:12:11 -0600 Subject: [PATCH 4/5] add draft spf inference to examples with cli (#221) add spd inference script to `examples/` directory with CLI to make it easy for users to test functionality --------- Signed-off-by: eplatero --- examples/draft_spd_inference.py | 444 +++++++++++++++++++ tests/transformers/spd/test_spd_inference.py | 19 +- 2 files changed, 452 insertions(+), 11 deletions(-) create mode 100644 examples/draft_spd_inference.py diff --git a/examples/draft_spd_inference.py b/examples/draft_spd_inference.py new file mode 100644 index 00000000..82b51274 --- /dev/null +++ b/examples/draft_spd_inference.py @@ -0,0 +1,444 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from argparse import ArgumentParser +from dataclasses import dataclass +from time import perf_counter +from typing import List, Optional, Union + +import numpy as np +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM +from QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.utils.constants import Constants + + +@dataclass +class PerfMetrics: + """ + Holds all performance metrics + + Args: + :mean_ttft (float): Average TLM+DLM TTFT. + :batch_ttft (float): Total TLM+DLM Batch TTFT. + :decode_throughput (float): Decode throughput. + :e2e_throughput (float): E2E throughput. + :mean_num_accepted_tokens (float): Average number of accepted tokens. + :max_gen_len (int): Max generation length. + :generated_tokens_per_prompt (List[int]): Total generated tokens per prompt. + """ + + mean_ttft: float + batch_ttft: float + decode_throughput: float + e2e_throughput: float + mean_num_accepted_tokens: float + max_gen_len: int + generated_tokens_per_prompt: List[int] + + +@dataclass +class CloudAI100ExecInfo: + """ + Holds all the information about Cloud AI 100 execution + + Args: + :prompts (List[str]): Prompts to perfrom inferencing on. + :batch_size (int): Batch size of the QPC compilation. + :generated_texts (Union[List[List[str]], List[str]]): Generated text(s). + :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs. + :perf_metrics (PerfMetrics): Performance metrics. + :num_speculative_tokens (int): Number of speculative tokens. + :prefill_seq_len (int): Prefill sequence length. + :ctx_len (int): Context length. 
+ :prefill_bsz (int): Prefill batch size. + :draft_model_name (str): Draft model name. + :target_model_name (str): Target model name. + :full_batch_size (Optional[int]): Full batch size. + """ + + prompts: List[str] + batch_size: int + generated_texts: Union[List[str], List[List[str]]] + generated_ids: Union[List[np.ndarray], np.ndarray] + perf_metrics: PerfMetrics + num_speculative_tokens: int + prefill_seq_len: int + ctx_len: int + prefill_bsz: int + draft_model_name: str + target_model_name: str + full_batch_size: Optional[int] + + def __repr__(self): + return ( + f"Avg TLM+DLM TTFT = {round(self.perf_metrics.mean_ttft, 2)}\n" + f"Total TLM+DLM Batch TTFT = {round(self.perf_metrics.batch_ttft, 2)}\n" + f"Decode Throughput = {round(self.perf_metrics.decode_throughput, 2)}\n" + f"E2E Throughput = {round(self.perf_metrics.e2e_throughput, 2)}\n" + f"Avg number of accepted tokens = {round(self.perf_metrics.mean_num_accepted_tokens, 2)}\n" + f"Max generation len = {self.perf_metrics.max_gen_len}\n" + f"Total Generated Tokens per Prompt: = {self.perf_metrics.generated_tokens_per_prompt}" + ) + + +def run_prefill_on_draft_and_target( + tlm_session: QAICInferenceSession, + dlm_session: QAICInferenceSession, + inputs: dict, + prefill_seq_len: int, + slot_idx: int, +): + input_len = inputs.input_ids.shape[1] + num_chunks = input_len // prefill_seq_len + cache_index = np.array([[0]], np.int64) + batch_index = np.array([[slot_idx]], np.int64) + inputs["batch_index"] = batch_index + + # Run chunked prefill + for i in range(num_chunks): + chunk_inputs = inputs.copy() + chunk_inputs["input_ids"] = inputs["input_ids"][:, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len] + chunk_inputs["position_ids"] = inputs["position_ids"][ + :, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len + ] + + tlm_outputs = tlm_session.run(chunk_inputs) + _ = dlm_session.run(chunk_inputs) + cache_index += prefill_seq_len + + tlm_logits = tlm_outputs["logits"] + return tlm_logits + + +def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int): + """return padded input length (must be factor of `prefill_seq_len`) + + Args: + input_len (int): prompt length + prefill_seq_len (int): prefill sequence length + ctx_len (int): context length + + Returns: + input_len_padded (int): padded input length + """ + num_chunks = -(input_len // -prefill_seq_len) # ceil divide without float + input_len_padded = num_chunks * prefill_seq_len # Convert input_len to a multiple of prefill_seq_len + assert input_len_padded <= ctx_len, ( + "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + ) + return input_len_padded + + +def split_dlm_bonus_token_inputs(dlm_decode_inputs): + bonus_token_inputs = dict() + bonus, regular = np.hsplit(dlm_decode_inputs["input_ids"], 2) + bonus_token_inputs["input_ids"] = bonus + dlm_decode_inputs["input_ids"] = regular + bonus, regular = np.hsplit(dlm_decode_inputs["position_ids"], 2) + bonus_token_inputs["position_ids"] = bonus + dlm_decode_inputs["position_ids"] = regular + bonus_token_inputs["batch_index"] = dlm_decode_inputs["batch_index"] + return bonus_token_inputs, dlm_decode_inputs + + +def draft_spec_decode_inference( + prompts: List[str], + num_speculative_tokens: int, + prefill_seq_len: int, + ctx_len: int, + prefill_bsz: int, + draft_model_name: str, + target_model_name: str, + full_batch_size: Optional[int], + device_group: List[int], +) -> CloudAI100ExecInfo: + """ + Perform draft speculative decode inference on the given prompts. 
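In outline: both models are first prefilled on the padded prompts; the draft model then greedily proposes ``num_speculative_tokens`` tokens one at a time, the target model scores the proposal in a single forward pass (returning ``num_speculative_tokens + 1`` logits per sequence), and tokens are accepted up to the first position where the target's greedy choice disagrees with the draft, plus one bonus token taken from the target. A sequence stops decoding once it reaches its maximum generation length or the context limit.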
+ + Args: + prompts (List[str]): List of prompts to perform inference on. + num_speculative_tokens (int): Number of speculative tokens. + prefill_seq_len (int): Prefill sequence length. + ctx_len (int): Context length. + prefill_bsz (int): Prefill batch size. + draft_model_name (str): Name of the draft model. + target_model_name (str): Name of the target model. + full_batch_size (Optional[int]): Full batch size. + device_group (List[int]): List of device IDs. + + Returns: + CloudAI100ExecInfo: Execution information, including performance metrics and generated text. + """ + # assumes dlm and tlm are compiled to the same prompt-chunk-size, context length and full_batch_size/batch-size + # get vocab size + tokenizer = AutoTokenizer.from_pretrained(target_model_name, padding_side="right") + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + vocab_size = len(tokenizer) + + # export_and_compile tlm and dlm + continuous_batching = full_batch_size is not None + target_model = AutoModelForCausalLM.from_pretrained( + target_model_name, continuous_batching=continuous_batching, is_tlm=True + ) + draft_model = AutoModelForCausalLM.from_pretrained(draft_model_name, continuous_batching=continuous_batching) + + num_devices = len(device_group) + target_model_qpc_path: str = target_model.compile( + num_cores=11, + num_devices=num_devices, + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + aic_enable_depth_first=True, + full_batch_size=full_batch_size, + num_speculative_tokens=num_speculative_tokens, + ) + draft_model_qpc_path: str = draft_model.compile( + num_cores=5, + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + aic_enable_depth_first=True, + full_batch_size=full_batch_size, + ) + # init qaic session + target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group) + draft_model_session = QAICInferenceSession(draft_model_qpc_path, device_ids=device_group) + + # skip inputs/outputs buffers + target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")])) + target_model_session.skip_buffers( + set([x for x in target_model_session.output_names if x.endswith("_RetainedState")]) + ) + draft_model_session.skip_buffers(set([x for x in draft_model_session.input_names if x.startswith("past_")])) + draft_model_session.skip_buffers(set([x for x in draft_model_session.output_names if x.endswith("_RetainedState")])) + + is_cb = full_batch_size is not None + decode_batch_size = full_batch_size if is_cb else prefill_bsz + if len(prompts) < decode_batch_size: + prompts_exp = prompts * decode_batch_size + prompts = prompts_exp[:decode_batch_size] + # tokenize the prompts + prompts_tokenized: List[dict] = [] + for p in prompts: + input_len: int = tokenizer(p, return_tensors="np", padding=True).input_ids.shape[1] + input_len_padded: int = get_padded_input_len(input_len, prefill_seq_len, ctx_len) + p_tok: dict = tokenizer(p, return_tensors="np", padding="max_length", max_length=input_len_padded) + position_ids = np.where(p_tok.pop("attention_mask"), np.arange(input_len_padded), -1) + p_tok["position_ids"] = position_ids + prompts_tokenized.append(p_tok) + # create caches to hold generated ids and input prompt lengths + generated_ids = [[] for i in range(decode_batch_size)] + input_lengths = [0] * decode_batch_size + # run prefill on both draft and target models + dlm_decode_inputs = dict() + dlm_decode_inputs["position_ids"] = np.zeros((decode_batch_size, 1), np.int64) + dlm_decode_inputs["input_ids"] 
= np.full((decode_batch_size, 1), tokenizer.pad_token_id) + dlm_decode_inputs["batch_index"] = np.reshape( + np.array(np.arange(decode_batch_size), np.int64), (decode_batch_size, 1) + ) + # mock input key "logits" to store the first batch of output logits + tlm_precode_inputs = dict( + input_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64), + position_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64), + batch_index=np.arange(decode_batch_size, dtype=np.int64).reshape(-1, 1), + ) + max_gen_len = [ctx_len] * decode_batch_size + num_logits_to_keep = num_speculative_tokens + 1 + # setup buffers + tlm_prefill_logits_ph = np.zeros((prefill_bsz, 1, vocab_size), dtype=np.float32) + dlm_prefill_logits_ph = np.zeros((prefill_bsz, 1, vocab_size), dtype=np.float32) + decode_logits_ph = np.zeros((decode_batch_size, 1, vocab_size), dtype=np.float32) + precode_logits_ph = np.zeros((decode_batch_size, num_logits_to_keep, vocab_size), dtype=np.float32) + + target_model_session.set_buffers({"logits": tlm_prefill_logits_ph}) + draft_model_session.set_buffers({"logits": dlm_prefill_logits_ph}) + e2e_start = perf_counter() + ttfts = [] + for bi in range(decode_batch_size): + # assumes that prefill queue will always be popped from the front + start = perf_counter() + tlm_logits = run_prefill_on_draft_and_target( + tlm_session=target_model_session, + dlm_session=draft_model_session, + inputs=prompts_tokenized[bi], + prefill_seq_len=prefill_seq_len, + slot_idx=bi, + ) + ttft = perf_counter() - start + ttfts.append(ttft) + input_ids = tlm_logits.argmax(2).astype(np.int64) + generated_ids[bi].append(input_ids.item()) + dlm_decode_inputs["input_ids"][bi, 0] = input_ids + tlm_precode_inputs["input_ids"][bi, 0] = input_ids.item() + input_len = prompts_tokenized[bi]["position_ids"].max(1).item() + 1 + dlm_decode_inputs["position_ids"][bi, 0] = input_len + tlm_precode_inputs["position_ids"][bi] = np.arange( + input_len, input_len + num_speculative_tokens + 1, dtype=np.int64 + ) + # assumes that prefill queue will always be popped from the front + input_lengths[bi] = input_len + max_gen_len[bi] -= input_lengths[bi] + batch_ttft = perf_counter() - e2e_start + + # set decode logits buffers + target_model_session.set_buffers({"logits": precode_logits_ph}) + draft_model_session.set_buffers({"logits": decode_logits_ph}) + # start decode phase + valid_batch_indices = np.full(decode_batch_size, True, dtype=bool) + all_accept = False + it = 0 + decode_start = perf_counter() + mean_num_accepted_tokens = 0 + all_accept = np.full(decode_batch_size, False, dtype=bool) + while True: + it += 1 + # generate proposals from draft model + for k_ in range(num_speculative_tokens): + if all_accept.any(): + # running decode one extra time in the first speculative iteration + # workaround to avoid the incorrect precode with 3-specialized multi-batch DLM + bonus_token_inputs, dlm_decode_inputs = split_dlm_bonus_token_inputs(dlm_decode_inputs) + _ = draft_model_session.run(bonus_token_inputs) + all_accept[:] = False + dlm_outputs = draft_model_session.run(dlm_decode_inputs) + input_ids = dlm_outputs["logits"].argmax(2) + tlm_precode_inputs["input_ids"][:, k_ + 1] = input_ids.flatten() + dlm_decode_inputs["input_ids"] = input_ids + dlm_decode_inputs["position_ids"][valid_batch_indices] += 1 + # run precode on TLM to score the proposed tokens + tlm_outputs = target_model_session.run(tlm_precode_inputs) + target_logits = tlm_outputs["logits"] + # greedy sampling from target model + 
target_tokens = target_logits.argmax(-1) + # exact matching between draft and target tokens + draft_tokens = tlm_precode_inputs["input_ids"][:, 1:] + matching = draft_tokens == target_tokens[:, :-1] # shape: [decode_batch_size, num_speculative_tokens] + num_tokens_selected = matching.cumprod(axis=1).sum(axis=1) + 1 # shape: [decode_batch_size] + all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1 + mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item() + # append selected tokens to the generated_ids + tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape( + decode_batch_size, 1 + ) + # tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(decode_batch_size,1)+1 + for bi, valid in enumerate(valid_batch_indices): + if not valid: + continue + accepted_tokens = num_tokens_selected[bi] + num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi])) + generated_ids[bi].extend(target_tokens[bi, :num_tokens_to_append].tolist()) + # position_ids > ctx_len-1 result in erronous output for logits at each seq_len of TLM + # (e.g., ctx_len=128 -> position_ids=[127,128,129] will give erronous output at each predicted token) + if len(generated_ids[bi]) >= max_gen_len[bi] or (tlm_precode_position_ids[bi] > ctx_len - 1).any(): + valid_batch_indices[bi] = False + # check if all generations are done + if not valid_batch_indices.any(): + break + # prepare decode inputs for next decode iteration + num_valid_batch_indices = valid_batch_indices.sum().item() + common_input_ids = target_tokens[valid_batch_indices, num_tokens_selected[valid_batch_indices] - 1].reshape( + num_valid_batch_indices, 1 + ) + common_position_ids = ( + tlm_precode_inputs["position_ids"][ + valid_batch_indices, num_tokens_selected[valid_batch_indices] - 1 + ].reshape(num_valid_batch_indices, 1) + + 1 + ) + if all_accept.any(): + # all_accept input_ids + input_ids = np.zeros((decode_batch_size, 2), dtype=np.int64) + last_spec_token_id = target_tokens[valid_batch_indices, -2].reshape(-1, 1) + input_ids[valid_batch_indices] = np.concatenate([last_spec_token_id, common_input_ids], axis=1) + dlm_decode_inputs["input_ids"] = input_ids + # all_accept position_ids + position_ids = np.full((decode_batch_size, 2), -1, dtype=np.int64) + last_spec_position_id = tlm_precode_inputs["position_ids"][valid_batch_indices, -1].reshape(-1, 1) + position_ids[valid_batch_indices] = np.concatenate([last_spec_position_id, common_position_ids], axis=1) + dlm_decode_inputs["position_ids"] = position_ids + else: + dlm_decode_inputs["input_ids"][valid_batch_indices] = common_input_ids + dlm_decode_inputs["position_ids"][valid_batch_indices] = common_position_ids + tlm_precode_inputs["input_ids"][valid_batch_indices, 0] = common_input_ids.flatten() + tlm_precode_inputs["position_ids"][valid_batch_indices] += num_tokens_selected[valid_batch_indices].reshape( + num_valid_batch_indices, 1 + ) + end = perf_counter() + # calculate performance metrics + decode_end = end - decode_start + e2e_end = end - e2e_start + mean_ttft = sum(ttfts) / len(ttfts) + generated_tokens_per_prompt = [len(gid) + 1 for gid in generated_ids] + decode_throughput = sum(generated_tokens_per_prompt) / decode_end + e2e_throughput = (sum(generated_tokens_per_prompt) + decode_batch_size) / e2e_end + batch_decode = tokenizer.batch_decode(generated_ids) + mean_num_accepted_tokens /= it + perf_metrics = PerfMetrics( + mean_ttft, 
+ batch_ttft, + decode_throughput, + e2e_throughput, + mean_num_accepted_tokens, + max_gen_len, + generated_tokens_per_prompt, + ) + exec_info = CloudAI100ExecInfo( + prompts, + decode_batch_size, + batch_decode, + generated_ids, + perf_metrics, + num_speculative_tokens, + prefill_seq_len, + ctx_len, + prefill_bsz, + draft_model_name, + target_model_name, + full_batch_size, + ) + return exec_info + + +def optional_int(x): + if x is None: + return None + return int(x) + + +def arg_parse(): + parser = ArgumentParser(description="Draft-based SpD Inference") + parser.add_argument("--prompts", type=str, nargs="+", default=Constants.INPUT_STR, help="Input prompt(s)") + parser.add_argument("--num-speculative-tokens", type=int, default=4, help="Number of speculative tokens") + parser.add_argument("--prefill-seq-len", type=int, default=32, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=128, help="Context length") + parser.add_argument("--prefill-bsz", type=int, default=1, help="Prefill batch size") + parser.add_argument( + "--draft-model-name", type=str, default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", help="Draft model name" + ) + parser.add_argument( + "--target-model-name", type=str, default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", help="Target model name" + ) + parser.add_argument("--full-batch-size", type=optional_int, default=None, help="Full batch size") + parser.add_argument("--device-group", type=int, nargs="+", default=[0], help="device QIDs") + args = parser.parse_args() + return args + + +def main(): + args = arg_parse() + exec_info = draft_spec_decode_inference(**vars(args)) + print(exec_info) + prompts = exec_info.prompts + generated_texts = exec_info.generated_texts + for prompt, generation in zip(prompts, generated_texts): + print(f"{prompt=} {generation=}") + + +if __name__ == "__main__": + main() diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index 18334e81..b9f07e4b 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -19,7 +19,7 @@ configs = [ pytest.param( - Constants.INPUT_STR, # prompt + Constants.INPUT_STR, # prompts 4, # num_speculative_tokens 32, # prefill_seq_len 128, # ctx_len @@ -92,13 +92,12 @@ def split_dlm_bonus_token_inputs(dlm_decode_inputs): return bonus_token_inputs, dlm_decode_inputs -@pytest.mark.on_qaic @pytest.mark.parametrize( - "prompt, num_speculative_tokens, prefill_seq_len, ctx_len, prefill_bsz, draft_model_name, target_model_name, full_batch_size", + "prompts, num_speculative_tokens, prefill_seq_len, ctx_len, prefill_bsz, draft_model_name, target_model_name, full_batch_size", configs, ) def test_spec_decode_inference( - prompt: List[str], + prompts: List[str], num_speculative_tokens: int, prefill_seq_len: int, ctx_len: int, @@ -155,12 +154,10 @@ def test_spec_decode_inference( draft_model_session.skip_buffers(set([x for x in draft_model_session.output_names if x.endswith("_RetainedState")])) is_cb = full_batch_size is not None - if not is_cb: - prompts = prompt * prefill_bsz - decode_batch_size = prefill_bsz - else: - prompts = prompt - decode_batch_size = full_batch_size + decode_batch_size = full_batch_size if is_cb else prefill_bsz + if len(prompts) < decode_batch_size: + prompts_exp = prompts * decode_batch_size + prompts = prompts_exp[:decode_batch_size] # tokenize the prompts prompts_tokenized: List[dict] = [] for p in prompts: @@ -331,7 +328,7 @@ def test_spec_decode_inference( ) del 
target_model_session del draft_model_session - generated_ids = np.asarray(generated_ids).flatten() + generated_ids = np.asarray(generated_ids[0]).flatten() gen_len = generated_ids.shape[0] exec_info = draft_model.generate(tokenizer, Constants.INPUT_STR, device_group) cloud_ai_100_tokens = exec_info.generated_ids[0][ From 29041830e32a1c58bea5dd838a98bdf70a0fd4d3 Mon Sep 17 00:00:00 2001 From: quic-akuruvil Date: Thu, 16 Jan 2025 09:58:16 +0530 Subject: [PATCH 5/5] Docs Update for Granite (#225) Signed-off-by: Ann --- docs/source/validate.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/validate.md b/docs/source/validate.md index 63fe5bd0..36b66085 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -17,6 +17,8 @@ | [Gemma-2-2b](https://huggingface.co/google/gemma-2-2b) |✔️ | | [Gemma-2-9b](https://huggingface.co/google/gemma-2-9b) |✔️ | | [Gemma-2-27b](https://huggingface.co/google/gemma-2-27b) |✔️ | +| [Granite-20b-code-base](https://huggingface.co/ibm-granite/granite-20b-code-base-8k) | ✔️ | +| [Granite-20b-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | ✔️ | | [Jais-adapted-7b](https://huggingface.co/inceptionai/jais-adapted-7b) |✔️ | | [Jais-adapted-13b-chat](https://huggingface.co/inceptionai/jais-adapted-13b-chat) |✔️ | | [Jais-adapted-70b](https://huggingface.co/inceptionai/jais-adapted-70b) |✔️ |