Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Testing: Jenkins Optimization #229

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 23 additions & 23 deletions scripts/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ pipeline {
parallel {
stage('Run Non-CLI Non-QAIC Tests') {
steps {
timeout(time: 25, unit: 'MINUTES') {
timeout(time: 60, unit: 'MINUTES') {
sh '''
sudo docker exec ${BUILD_TAG} bash -c "
cd /efficient-transformers &&
Expand All @@ -56,12 +56,30 @@ pipeline {
mkdir -p $PWD/Non_qaic &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Non_qaic &&
pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 4 --junitxml=tests/tests_log2.xml &&
pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n auto --junitxml=tests/tests_log2.xml &&
deactivate"
'''
}
}
}
stage('QNN Non-CLI Tests') {
steps {
timeout(time: 60, unit: 'MINUTES') {
sh '''
sudo docker exec ${BUILD_TAG} bash -c "
source /qnn_sdk/bin/envsetup.sh &&
source /qnn_sdk/bin/envcheck -c &&
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/Qnn_non_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_non_cli &&
pytest tests -m '(not cli) and (qnn) and (on_qaic)' -n auto --junitxml=tests/tests_log3.xml &&
deactivate"
'''
}
}
}
}
}
stage('CLI Tests') {
Expand All @@ -74,7 +92,7 @@ pipeline {
mkdir -p $PWD/cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/cli &&
pytest tests -m '(cli and not qnn)' --junitxml=tests/tests_log3.xml &&
pytest tests -m '(cli and not qnn)' --junitxml=tests/tests_log4.xml &&
deactivate"
'''
}
Expand All @@ -92,31 +110,13 @@ pipeline {
mkdir -p $PWD/Qnn_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_cli &&
pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log4.xml &&
deactivate"
'''
}
}
}
stage('QNN Non-CLI Tests') {
steps {
timeout(time: 60, unit: 'MINUTES') {
sh '''
sudo docker exec ${BUILD_TAG} bash -c "
source /qnn_sdk/bin/envsetup.sh &&
source /qnn_sdk/bin/envcheck -c &&
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/Qnn_non_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_non_cli &&
pytest tests -m '(not cli) and (qnn) and (on_qaic)' --junitxml=tests/tests_log5.xml &&
pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log5.xml &&
junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log4.xml tests/tests_log5.xml tests/tests_log.xml &&
deactivate"
'''
}
}
}
}
}

post {
Expand Down
25 changes: 12 additions & 13 deletions tests/peft/lora/test_lora_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

from QEfficient import QEffAutoPeftModelForCausalLM
from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM
from QEfficient.utils import load_hf_tokenizer

configs = [
pytest.param(
Expand Down Expand Up @@ -227,12 +226,12 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate(
assert Path(qeff_model.qpc_path).is_dir()

# test generate
prompts = ["hello!", "hi", "hello, my name is", "hey"]
qeff_model.generate(
tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
prompts=prompts,
prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
)
# prompts = ["hello!", "hi", "hello, my name is", "hey"]
# qeff_model.generate(
# tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
# prompts=prompts,
# prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
# )


# test the compile and generate workflow in cb mode
Expand All @@ -251,9 +250,9 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap
assert Path(qeff_model.qpc_path).is_dir()

# test generate
prompts = ["hello!", "hi", "hello, my name is", "hey"]
qeff_model.generate(
tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
prompts=prompts,
prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
)
# prompts = ["hello!", "hi", "hello, my name is", "hey"]
# qeff_model.generate(
# tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
# prompts=prompts,
# prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
# )
23 changes: 11 additions & 12 deletions tests/peft/test_peft_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from time import perf_counter

import numpy as np
import onnx
import pytest
import torch
Expand Down Expand Up @@ -170,17 +169,17 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con
end = perf_counter()
compile_time_0 = end - start

qeff_model.generate(
input_ids=np.zeros((batch_size, 32), dtype="int64"),
attention_mask=np.concatenate(
[
np.ones((batch_size, 10), dtype="int64"),
np.zeros((batch_size, 22), dtype="int64"),
],
axis=1,
),
max_new_tokens=10,
)
# qeff_model.generate(
# input_ids=np.zeros((batch_size, 32), dtype="int64"),
# attention_mask=np.concatenate(
# [
# np.ones((batch_size, 10), dtype="int64"),
# np.zeros((batch_size, 22), dtype="int64"),
# ],
# axis=1,
# ),
# max_new_tokens=10,
# )

start = perf_counter()
qeff_model.compile(batch_size=batch_size, prefill_seq_len=32, ctx_len=128)
Expand Down
28 changes: 14 additions & 14 deletions tests/qnn_tests/test_causal_lm_models_qnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,12 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
aic_enable_depth_first=False,
enable_qnn=True,
)
exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
gen_len = ort_tokens.shape[-1]
assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
"Tokens don't match for ONNXRT output and Cloud AI 100 output."
)
# exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
# cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
# gen_len = ort_tokens.shape[-1]
# assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
# "Tokens don't match for ONNXRT output and Cloud AI 100 output."
# )

# testing for CB models
model_hf, _ = load_causal_lm_model(model_config)
Expand Down Expand Up @@ -145,14 +145,14 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
full_batch_size=full_batch_size,
enable_qnn=True,
)
exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)

assert all(
[
all(pt_token[:24] == cloud_token[:24])
for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
]
), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."
# exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)

# assert all(
# [
# all(pt_token[:24] == cloud_token[:24])
# for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
# ]
# ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."


@pytest.mark.on_qaic
Expand Down
41 changes: 20 additions & 21 deletions tests/text_generation/test_text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import pytest
from transformers import AutoModelForCausalLM

from QEfficient.generation.text_generation_inference import TextGeneration
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils import hf_download
from QEfficient.utils._utils import load_hf_tokenizer
Expand Down Expand Up @@ -65,7 +64,7 @@ def test_generate_text_stream(
model_config = {"model_name": model_name, "n_layer": n_layer}
model_hf, _ = load_causal_lm_model(model_config)

tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) # noqa: F841

qeff_model = QEFFAutoModelForCausalLM(model_hf)

Expand All @@ -75,7 +74,7 @@ def test_generate_text_stream(
if not device_id:
pytest.skip("No available devices to run model on Cloud AI 100")

qpc_path = qeff_model.compile(
qpc_path = qeff_model.compile( # noqa: F841
prefill_seq_len=prompt_len,
ctx_len=ctx_len,
num_cores=14,
Expand All @@ -84,21 +83,21 @@ def test_generate_text_stream(
full_batch_size=full_batch_size,
)

exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len)
cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]]

text_generator = TextGeneration(
tokenizer=tokenizer,
qpc_path=qpc_path,
device_id=device_id,
ctx_len=ctx_len,
full_batch_size=full_batch_size,
)
stream_tokens = []
for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len):
stream_tokens.extend(decoded_tokens)

assert cloud_ai_100_output == stream_tokens, (
f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
)
# exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len)
# cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
# cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]]

# text_generator = TextGeneration(
# tokenizer=tokenizer,
# qpc_path=qpc_path,
# device_id=device_id,
# ctx_len=ctx_len,
# full_batch_size=full_batch_size,
# )
# stream_tokens = []
# for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len):
# stream_tokens.extend(decoded_tokens)

# assert cloud_ai_100_output == stream_tokens, (
# f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
# )
28 changes: 14 additions & 14 deletions tests/transformers/models/test_causal_lm_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,12 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
aic_enable_depth_first=False,
num_speculative_tokens=num_speculative_tokens,
)
exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
gen_len = ort_tokens.shape[-1]
assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
"Tokens don't match for ONNXRT output and Cloud AI 100 output."
)
# exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
# cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
# gen_len = ort_tokens.shape[-1]
# assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
# "Tokens don't match for ONNXRT output and Cloud AI 100 output."
# )

# testing for CB models
model_hf, _ = load_causal_lm_model(model_config)
Expand Down Expand Up @@ -169,14 +169,14 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
full_batch_size=full_batch_size,
num_speculative_tokens=num_speculative_tokens,
)
exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)

assert all(
[
all(pt_token[:24] == cloud_token[:24])
for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
]
), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."
# exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)

# assert all(
# [
# all(pt_token[:24] == cloud_token[:24])
# for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
# ]
# ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."


# FIXME: there should be a CB test here
Expand Down
18 changes: 9 additions & 9 deletions tests/transformers/models/test_embedding_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ def check_embed_pytorch_vs_ort_vs_ai100(
pt_embeddings = pt_outputs[0][0].detach().numpy()
# Pytorch transformed model
qeff_model = QEFFAutoModel(pt_model)
qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
# qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
# qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
# mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
# print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
# assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"

onnx_model = qeff_model.export()
ort_session = ort.InferenceSession(str(onnx_model))
Expand All @@ -71,12 +71,12 @@ def check_embed_pytorch_vs_ort_vs_ai100(
qeff_model.compile(
num_cores=14,
)
ai100_output = qeff_model.generate(inputs=inputs)
# ai100_output = qeff_model.generate(inputs=inputs)

# Compare ONNX and AI 100 outputs
mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))
print("Mad for onnx and AI 100 output is ", mad)
assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
# mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))
# print("Mad for onnx and AI 100 output is ", mad)
# assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"


@pytest.mark.on_qaic
Expand Down
Loading
Loading