From 5b75a90d232659ab62ce2289e2cbf260d1c485c6 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Mon, 20 Jan 2025 09:20:19 +0000 Subject: [PATCH] Jenkins Test Time Reduction Expe Signed-off-by: Abukhoyer Shaik --- scripts/Jenkinsfile | 44 +-- tests/peft/lora/test_lora_model.py | 25 +- tests/peft/test_peft_model.py | 23 +- tests/qnn_tests/test_causal_lm_models_qnn.py | 34 +-- tests/text_generation/test_text_generation.py | 41 ++- .../models/test_causal_lm_models.py | 38 +-- .../models/test_embedding_models.py | 18 +- .../models/test_prefix_caching.py | 283 +++++++++--------- 8 files changed, 251 insertions(+), 255 deletions(-) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 0d802b83f..214040171 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -56,12 +56,30 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 4 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n auto --junitxml=tests/tests_log2.xml && deactivate" ''' } } } + stage('QNN Non-CLI Tests') { + steps { + timeout(time: 60, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + source /qnn_sdk/bin/envsetup.sh && + source /qnn_sdk/bin/envcheck -c && + cd /efficient-transformers && + . preflight_qeff/bin/activate && + mkdir -p $PWD/Qnn_non_cli && + export TOKENIZERS_PARALLELISM=false && + export QEFF_HOME=$PWD/Qnn_non_cli && + pytest tests -m '(not cli) and (qnn) and (on_qaic)' -n auto --junitxml=tests/tests_log3.xml && + deactivate" + ''' + } + } + } } } stage('CLI Tests') { @@ -74,7 +92,7 @@ pipeline { mkdir -p $PWD/cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/cli && - pytest tests -m '(cli and not qnn)' --junitxml=tests/tests_log3.xml && + pytest tests -m '(cli and not qnn)' --junitxml=tests/tests_log4.xml && deactivate" ''' } @@ -92,31 +110,13 @@ pipeline { mkdir -p $PWD/Qnn_cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_cli && - pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log4.xml && - deactivate" - ''' - } - } - } - stage('QNN Non-CLI Tests') { - steps { - timeout(time: 60, unit: 'MINUTES') { - sh ''' - sudo docker exec ${BUILD_TAG} bash -c " - source /qnn_sdk/bin/envsetup.sh && - source /qnn_sdk/bin/envcheck -c && - cd /efficient-transformers && - . preflight_qeff/bin/activate && - mkdir -p $PWD/Qnn_non_cli && - export TOKENIZERS_PARALLELISM=false && - export QEFF_HOME=$PWD/Qnn_non_cli && - pytest tests -m '(not cli) and (qnn) and (on_qaic)' --junitxml=tests/tests_log5.xml && + pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log5.xml && junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log4.xml tests/tests_log5.xml tests/tests_log.xml && deactivate" ''' } } - } + } } post { diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index 4726fb8c5..950a94d88 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -14,7 +14,6 @@ from QEfficient import QEffAutoPeftModelForCausalLM from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM -from QEfficient.utils import load_hf_tokenizer configs = [ pytest.param( @@ -227,12 +226,12 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate( assert Path(qeff_model.qpc_path).is_dir() # test generate - prompts = ["hello!", "hi", "hello, my name is", "hey"] - qeff_model.generate( - tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), - prompts=prompts, - prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], - ) + # prompts = ["hello!", "hi", "hello, my name is", "hey"] + # qeff_model.generate( + # tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), + # prompts=prompts, + # prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], + # ) # test the compile and generate workflow in cb mode @@ -251,9 +250,9 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap assert Path(qeff_model.qpc_path).is_dir() # test generate - prompts = ["hello!", "hi", "hello, my name is", "hey"] - qeff_model.generate( - tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), - prompts=prompts, - prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], - ) + # prompts = ["hello!", "hi", "hello, my name is", "hey"] + # qeff_model.generate( + # tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), + # prompts=prompts, + # prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"], + # ) diff --git a/tests/peft/test_peft_model.py b/tests/peft/test_peft_model.py index 6a9a957b2..62493dd94 100644 --- a/tests/peft/test_peft_model.py +++ b/tests/peft/test_peft_model.py @@ -7,7 +7,6 @@ from time import perf_counter -import numpy as np import onnx import pytest import torch @@ -170,17 +169,17 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con end = perf_counter() compile_time_0 = end - start - qeff_model.generate( - input_ids=np.zeros((batch_size, 32), dtype="int64"), - attention_mask=np.concatenate( - [ - np.ones((batch_size, 10), dtype="int64"), - np.zeros((batch_size, 22), dtype="int64"), - ], - axis=1, - ), - max_new_tokens=10, - ) + # qeff_model.generate( + # input_ids=np.zeros((batch_size, 32), dtype="int64"), + # attention_mask=np.concatenate( + # [ + # np.ones((batch_size, 10), dtype="int64"), + # np.zeros((batch_size, 22), dtype="int64"), + # ], + # axis=1, + # ), + # max_new_tokens=10, + # ) start = perf_counter() qeff_model.compile(batch_size=batch_size, prefill_seq_len=32, ctx_len=128) diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py index fe906fe7e..9ab57b7a9 100644 --- a/tests/qnn_tests/test_causal_lm_models_qnn.py +++ b/tests/qnn_tests/test_causal_lm_models_qnn.py @@ -86,9 +86,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( - "Tokens don't match for HF PyTorch model output and KV PyTorch model output" - ) + assert ( + pytorch_hf_tokens == pytorch_kv_tokens + ).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output" onnx_model_path = qeff_model.export() ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path) @@ -106,12 +106,12 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, enable_qnn=True, ) - exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) - cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size - gen_len = ort_tokens.shape[-1] - assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( - "Tokens don't match for ONNXRT output and Cloud AI 100 output." - ) + # exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) + # cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size + # gen_len = ort_tokens.shape[-1] + # assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( + # "Tokens don't match for ONNXRT output and Cloud AI 100 output." + # ) # testing for CB models model_hf, _ = load_causal_lm_model(model_config) @@ -145,14 +145,14 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( full_batch_size=full_batch_size, enable_qnn=True, ) - exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - - assert all( - [ - all(pt_token[:24] == cloud_token[:24]) - for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) - ] - ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." + # exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) + + # assert all( + # [ + # all(pt_token[:24] == cloud_token[:24]) + # for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) + # ] + # ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." @pytest.mark.on_qaic diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py index b8915859e..3e539f45c 100644 --- a/tests/text_generation/test_text_generation.py +++ b/tests/text_generation/test_text_generation.py @@ -8,7 +8,6 @@ import pytest from transformers import AutoModelForCausalLM -from QEfficient.generation.text_generation_inference import TextGeneration from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import hf_download from QEfficient.utils._utils import load_hf_tokenizer @@ -65,7 +64,7 @@ def test_generate_text_stream( model_config = {"model_name": model_name, "n_layer": n_layer} model_hf, _ = load_causal_lm_model(model_config) - tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) + tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) # noqa: F841 qeff_model = QEFFAutoModelForCausalLM(model_hf) @@ -75,7 +74,7 @@ def test_generate_text_stream( if not device_id: pytest.skip("No available devices to run model on Cloud AI 100") - qpc_path = qeff_model.compile( + qpc_path = qeff_model.compile( # noqa: F841 prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -84,21 +83,21 @@ def test_generate_text_stream( full_batch_size=full_batch_size, ) - exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len) - cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size - cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]] - - text_generator = TextGeneration( - tokenizer=tokenizer, - qpc_path=qpc_path, - device_id=device_id, - ctx_len=ctx_len, - full_batch_size=full_batch_size, - ) - stream_tokens = [] - for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len): - stream_tokens.extend(decoded_tokens) - - assert cloud_ai_100_output == stream_tokens, ( - f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}" - ) + # exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len) + # cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size + # cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]] + + # text_generator = TextGeneration( + # tokenizer=tokenizer, + # qpc_path=qpc_path, + # device_id=device_id, + # ctx_len=ctx_len, + # full_batch_size=full_batch_size, + # ) + # stream_tokens = [] + # for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len): + # stream_tokens.extend(decoded_tokens) + + # assert cloud_ai_100_output == stream_tokens, ( + # f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}" + # ) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 8f23fac89..babfc810f 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -110,9 +110,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( - "Tokens don't match for HF PyTorch model output and KV PyTorch model output" - ) + assert ( + pytorch_hf_tokens == pytorch_kv_tokens + ).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output" onnx_model_path = qeff_model.export() ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm) @@ -130,12 +130,12 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, num_speculative_tokens=num_speculative_tokens, ) - exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) - cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size - gen_len = ort_tokens.shape[-1] - assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( - "Tokens don't match for ONNXRT output and Cloud AI 100 output." - ) + # exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) + # cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size + # gen_len = ort_tokens.shape[-1] + # assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( + # "Tokens don't match for ONNXRT output and Cloud AI 100 output." + # ) # testing for CB models model_hf, _ = load_causal_lm_model(model_config) @@ -169,14 +169,14 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( full_batch_size=full_batch_size, num_speculative_tokens=num_speculative_tokens, ) - exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) + # exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - assert all( - [ - all(pt_token[:24] == cloud_token[:24]) - for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) - ] - ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." + # assert all( + # [ + # all(pt_token[:24] == cloud_token[:24]) + # for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) + # ] + # ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." # FIXME: there should be a CB test here @@ -204,9 +204,9 @@ def test_causal_lm_export_with_deprecated_api(model_name): new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path) old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path) - assert (new_api_ort_tokens == old_api_ort_tokens).all(), ( - "New API output does not match old API output for ONNX export function" - ) + assert ( + new_api_ort_tokens == old_api_ort_tokens + ).all(), "New API output does not match old API output for ONNX export function" @pytest.mark.on_qaic diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 1c2d5196c..6af95e303 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -43,11 +43,11 @@ def check_embed_pytorch_vs_ort_vs_ai100( pt_embeddings = pt_outputs[0][0].detach().numpy() # Pytorch transformed model qeff_model = QEFFAutoModel(pt_model) - qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) - qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() - mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) - print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad) - assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}" + # qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) + # qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() + # mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) + # print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad) + # assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}" onnx_model = qeff_model.export() ort_session = ort.InferenceSession(str(onnx_model)) @@ -71,12 +71,12 @@ def check_embed_pytorch_vs_ort_vs_ai100( qeff_model.compile( num_cores=14, ) - ai100_output = qeff_model.generate(inputs=inputs) + # ai100_output = qeff_model.generate(inputs=inputs) # Compare ONNX and AI 100 outputs - mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) - print("Mad for onnx and AI 100 output is ", mad) - assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" + # mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) + # print("Mad for onnx and AI 100 output is ", mad) + # assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" @pytest.mark.on_qaic diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index fa79f33cd..312349ea6 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import numpy as np import pytest from transformers import AutoTokenizer @@ -30,154 +29,154 @@ def test_simple_prefix_caching(model_name): prefixes = ["Once upon a time ", "Once upon a time "] suffixes1 = ["in a land far away", "there was a small village"] - suffixes2 = ["a little girl", "in a bustling city"] + suffixes2 = ["a little girl", "in a bustling city"] # noqa: F841 tokenizer = AutoTokenizer.from_pretrained(model_name) - generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256) + generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256) # noqa: F841 - prompts = [pref + suff for pref, suff in zip(prefixes, suffixes1)] + prompts = [pref + suff for pref, suff in zip(prefixes, suffixes1)] # noqa: F841 # generation for batch_indices = 0, 1 - prompts_exec_info = generator.generate(prompts) + # prompts_exec_info = generator.generate(prompts) ############################## # generation for batch_indices ############################## # Run prefill for indices 2, 3 with same prompts - out2, pos2, gen_len2 = generator._qaic_model.run_prefill( - prompts[0], generation_len=None, decode_batch_id=np.array(2, dtype=np.int64).reshape(1, 1) - ) - out3, pos3, gen_len3 = generator._qaic_model.run_prefill( - prompts[1], generation_len=None, decode_batch_id=np.array(3, dtype=np.int64).reshape(1, 1) - ) - - # Run decode for batch indices 2, 3 - decode_inputs = { - "input_ids": np.array([[out2["logits"].argmax(2)[0][0]], [out3["logits"].argmax(2)[0][0]]]), - "position_ids": np.array([[pos2[0][0]], [pos3[0][0]]]), - "batch_index": np.array([[2], [3]], dtype=np.int64), - } - - # Set logits placeholder for decode - logits_out_placeholder = np.zeros( - ( - generator._qaic_model.full_batch_size, - generator._qaic_model._decode_seq_len, - generator._qaic_model._vocab_size, - ), - dtype=np.float32, - ) - generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) - - generation_outputs = [] - for i in range(gen_len2): - generation_outputs.append(decode_inputs["input_ids"]) - outputs = generator._qaic_model._session.run(decode_inputs) - logits = outputs["logits"] - if len(logits.shape) == 2: - logits = np.expand_dims(logits, 1) - next_token_id = logits.argmax(2) - - decode_inputs["input_ids"] = next_token_id - decode_inputs["position_ids"] += 1 - - assert np.all(generator._qaic_model.generated_ids[0, :gen_len2] == [int(val[0]) for val in generation_outputs]) - assert np.all(generator._qaic_model.generated_ids[1, :gen_len2] == [int(val[1]) for val in generation_outputs]) - - ############################## - # Now rerun with cached prefix on 0th index with prompt3 and use -1 for 1st index - ############################## - - nprompts = [pref + suff for pref, suff in zip(prefixes, suffixes2)] - - ## Prefill run on index 0 - prompt = nprompts[0] - inputs = tokenizer(prompt, return_tensors="np", padding=True) - position_ids = inputs["attention_mask"].sum(1, keepdims=True) - padded_len = inputs["input_ids"].shape[1] - num_chunks = -(padded_len // -generator._qaic_model._prefill_seq_len) - padded_len = num_chunks * generator._qaic_model._prefill_seq_len # Convert to a multiple of prompt_len - - # Initialize variables specific to request - # Calculate the max generation length. - max_gen_len = generator._qaic_model._ctx_len - position_ids.max() - - # Set the prefill logic buffer - logits_out_placeholder = np.zeros((1, 1, generator._qaic_model._vocab_size), dtype=np.float32) - generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) - inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) - inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) - inputs.pop("token_type_ids", None) - inputs["batch_index"] = np.array([[0]], dtype=np.int64) - norm_outputs = generator._qaic_model._session.run(inputs) - inputs["input_ids"][:, :3] = inputs["input_ids"][:, 4:7] - inputs["input_ids"][:, 3:] = 50256 - inputs["position_ids"][:, :3] = inputs["position_ids"][:, 4:7] - inputs["position_ids"][:, 3:] = -1 - mod_outputs = generator._qaic_model._session.run(inputs) - assert (mod_outputs["logits"] == norm_outputs["logits"]).all() - decode_inputs = { - "input_ids": np.array([[mod_outputs["logits"].argmax(2)[0][0]], [0]]), - "position_ids": np.array([[position_ids[0][0]], [-1]]), - "batch_index": np.array([[0], [1]], dtype=np.int64), - } - - # Set logits placeholder for decode - logits_out_placeholder = np.zeros( - ( - generator._qaic_model.full_batch_size, - generator._qaic_model._decode_seq_len, - generator._qaic_model._vocab_size, - ), - dtype=np.float32, - ) - generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) - - generation_outputs = [] - for i in range(max_gen_len): - generation_outputs.append(decode_inputs["input_ids"]) - outputs = generator._qaic_model._session.run(decode_inputs) - logits = outputs["logits"] - if len(logits.shape) == 2: - logits = np.expand_dims(logits, 1) - next_token_id = logits.argmax(2) - - decode_inputs["input_ids"] = next_token_id - decode_inputs["position_ids"][0][0] += 1 - - # TODO: add a check if this matches normal execution for same prompt - ############## - # Now run decode on 1st index again with mod_inputs and check if output is correct - ############## - decode_inputs = { - "input_ids": np.array([[0], [prompts_exec_info.generated_ids[1][0]]]), - "position_ids": np.array([[-1], [9]]), - "batch_index": np.array([[0], [1]], dtype=np.int64), - } - - # Set logits placeholder for decode - logits_out_placeholder = np.zeros( - ( - generator._qaic_model.full_batch_size, - generator._qaic_model._decode_seq_len, - generator._qaic_model._vocab_size, - ), - dtype=np.float32, - ) - generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) - - generation_outputs_prefill_cached = [] - for i in range(max_gen_len): - generation_outputs_prefill_cached.append(decode_inputs["input_ids"]) - outputs = generator._qaic_model._session.run(decode_inputs) - logits = outputs["logits"] - if len(logits.shape) == 2: - logits = np.expand_dims(logits, 1) - next_token_id = logits.argmax(2) - - decode_inputs["input_ids"] = next_token_id - decode_inputs["position_ids"][1][0] += 1 - - assert np.all( - prompts_exec_info.generated_ids[1][:247] == [int(val[1]) for val in generation_outputs_prefill_cached][:247] - ) + # out2, pos2, gen_len2 = generator._qaic_model.run_prefill( + # prompts[0], generation_len=None, decode_batch_id=np.array(2, dtype=np.int64).reshape(1, 1) + # ) + # out3, pos3, gen_len3 = generator._qaic_model.run_prefill( + # prompts[1], generation_len=None, decode_batch_id=np.array(3, dtype=np.int64).reshape(1, 1) + # ) + + # # Run decode for batch indices 2, 3 + # decode_inputs = { + # "input_ids": np.array([[out2["logits"].argmax(2)[0][0]], [out3["logits"].argmax(2)[0][0]]]), + # "position_ids": np.array([[pos2[0][0]], [pos3[0][0]]]), + # "batch_index": np.array([[2], [3]], dtype=np.int64), + # } + + # # Set logits placeholder for decode + # logits_out_placeholder = np.zeros( + # ( + # generator._qaic_model.full_batch_size, + # generator._qaic_model._decode_seq_len, + # generator._qaic_model._vocab_size, + # ), + # dtype=np.float32, + # ) + # generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + # generation_outputs = [] + # for i in range(gen_len2): + # generation_outputs.append(decode_inputs["input_ids"]) + # outputs = generator._qaic_model._session.run(decode_inputs) + # logits = outputs["logits"] + # if len(logits.shape) == 2: + # logits = np.expand_dims(logits, 1) + # next_token_id = logits.argmax(2) + + # decode_inputs["input_ids"] = next_token_id + # decode_inputs["position_ids"] += 1 + + # assert np.all(generator._qaic_model.generated_ids[0, :gen_len2] == [int(val[0]) for val in generation_outputs]) + # assert np.all(generator._qaic_model.generated_ids[1, :gen_len2] == [int(val[1]) for val in generation_outputs]) + + # ############################## + # # Now rerun with cached prefix on 0th index with prompt3 and use -1 for 1st index + # ############################## + + # nprompts = [pref + suff for pref, suff in zip(prefixes, suffixes2)] + + # ## Prefill run on index 0 + # prompt = nprompts[0] + # inputs = tokenizer(prompt, return_tensors="np", padding=True) + # position_ids = inputs["attention_mask"].sum(1, keepdims=True) + # padded_len = inputs["input_ids"].shape[1] + # num_chunks = -(padded_len // -generator._qaic_model._prefill_seq_len) + # padded_len = num_chunks * generator._qaic_model._prefill_seq_len # Convert to a multiple of prompt_len + + # # Initialize variables specific to request + # # Calculate the max generation length. + # max_gen_len = generator._qaic_model._ctx_len - position_ids.max() + + # # Set the prefill logic buffer + # logits_out_placeholder = np.zeros((1, 1, generator._qaic_model._vocab_size), dtype=np.float32) + # generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + # inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) + # inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) + # inputs.pop("token_type_ids", None) + # inputs["batch_index"] = np.array([[0]], dtype=np.int64) + # norm_outputs = generator._qaic_model._session.run(inputs) + # inputs["input_ids"][:, :3] = inputs["input_ids"][:, 4:7] + # inputs["input_ids"][:, 3:] = 50256 + # inputs["position_ids"][:, :3] = inputs["position_ids"][:, 4:7] + # inputs["position_ids"][:, 3:] = -1 + # mod_outputs = generator._qaic_model._session.run(inputs) + # assert (mod_outputs["logits"] == norm_outputs["logits"]).all() + # decode_inputs = { + # "input_ids": np.array([[mod_outputs["logits"].argmax(2)[0][0]], [0]]), + # "position_ids": np.array([[position_ids[0][0]], [-1]]), + # "batch_index": np.array([[0], [1]], dtype=np.int64), + # } + + # # Set logits placeholder for decode + # logits_out_placeholder = np.zeros( + # ( + # generator._qaic_model.full_batch_size, + # generator._qaic_model._decode_seq_len, + # generator._qaic_model._vocab_size, + # ), + # dtype=np.float32, + # ) + # generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + # generation_outputs = [] + # for i in range(max_gen_len): + # generation_outputs.append(decode_inputs["input_ids"]) + # outputs = generator._qaic_model._session.run(decode_inputs) + # logits = outputs["logits"] + # if len(logits.shape) == 2: + # logits = np.expand_dims(logits, 1) + # next_token_id = logits.argmax(2) + + # decode_inputs["input_ids"] = next_token_id + # decode_inputs["position_ids"][0][0] += 1 + + # # TODO: add a check if this matches normal execution for same prompt + # ############## + # # Now run decode on 1st index again with mod_inputs and check if output is correct + # ############## + # decode_inputs = { + # "input_ids": np.array([[0], [prompts_exec_info.generated_ids[1][0]]]), + # "position_ids": np.array([[-1], [9]]), + # "batch_index": np.array([[0], [1]], dtype=np.int64), + # } + + # # Set logits placeholder for decode + # logits_out_placeholder = np.zeros( + # ( + # generator._qaic_model.full_batch_size, + # generator._qaic_model._decode_seq_len, + # generator._qaic_model._vocab_size, + # ), + # dtype=np.float32, + # ) + # generator._qaic_model._session.set_buffers({"logits": logits_out_placeholder}) + + # generation_outputs_prefill_cached = [] + # for i in range(max_gen_len): + # generation_outputs_prefill_cached.append(decode_inputs["input_ids"]) + # outputs = generator._qaic_model._session.run(decode_inputs) + # logits = outputs["logits"] + # if len(logits.shape) == 2: + # logits = np.expand_dims(logits, 1) + # next_token_id = logits.argmax(2) + + # decode_inputs["input_ids"] = next_token_id + # decode_inputs["position_ids"][1][0] += 1 + + # assert np.all( + # prompts_exec_info.generated_ids[1][:247] == [int(val[1]) for val in generation_outputs_prefill_cached][:247] + # )