From 620800fc407e9b360eef13650d3de83d63818417 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Thu, 16 May 2024 03:42:03 +0000 Subject: [PATCH] [Inference ] Integrate chat template in llm-on-ray (#199) * integrate inference chat template Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * Update query_http_requests.py * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update * update * update * update yaml file * update yaml * format yaml * update * Update mpt_deltatuner.yaml * update * Update neural-chat-7b-v3-1.yaml * Update predictor_deployment.py * 1. add jinja file 2. add chat template unit test 3. fix comments Signed-off-by: minmingzhu * add license header Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * Update bloom-560m-ci.yaml * debug CI Signed-off-by: minmingzhu * debug CI Signed-off-by: minmingzhu * Update VLLM installation script and documentation (#212) * Update VLLM installation script and documentation Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang * Update vLLM installation message Signed-off-by: Wu, Xiaochang * Update installation instructions for vLLM CPU Signed-off-by: Wu, Xiaochang * Update Dockerfile.vllm Signed-off-by: Wu, Xiaochang * Update VLLM version to 0.4.1 Signed-off-by: Wu, Xiaochang * update doc Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang --------- Signed-off-by: Wu, Xiaochang * [Workflow] Unify Docker operations into bash (#123) * docker2sh test * codepath * codepath * codepath * add * add * add * add * add * add * df * docker.sh * docker bash * docker bash * docker bash * docker bash * inference docker bash * merge main0312 * merge main0312 * merge main0312 * test set-e * fix test * fix * fix * fix * test error * test error * add map * test install error * test install error * test install error * test install error * test * test * fix * fix * fix * only inference * fux * fux * fux * target * target * target * fix proxy * fix proxy * fix proxy * fix proxy * fix proxy * fix proxy * fix proxy * fix fuc * fix fuc * fix fuc * all inference * add finetune * fix * fix * fix * fix * fix finetune * fix finetune * fix review * fix review * fix review * add info output * Update proxy settings and Docker configurations Signed-off-by: Wu, Xiaochang * fix vllm pr212 * fix * fix * change name --------- Signed-off-by: Wu, Xiaochang Co-authored-by: Wu, Xiaochang * fix comments Signed-off-by: minmingzhu * update code style Signed-off-by: minmingzhu * Fix openai response for vLLM (#213) * [CI] Add llama2-70b inference workflow (#208) * add llama-2-70b * nit * fix vllm inference ci * Revert "fix vllm inference ci" This reverts commit 36062bdac79df6c20e631c56cc3dfaae748e816d. * Fix StoppingCriteriaSub parameters to be compatible with latest Transformers (#215) * 1. fix CI 2. 
fix comments Signed-off-by: minmingzhu * format Signed-off-by: minmingzhu * modify jinja path Signed-off-by: minmingzhu * fix comments Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * fix comments Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update jinja Signed-off-by: minmingzhu * update jinja file Signed-off-by: minmingzhu --------- Signed-off-by: minmingzhu Signed-off-by: Wu, Xiaochang Signed-off-by: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Co-authored-by: Xiaochang Wu Co-authored-by: yutianchen Co-authored-by: KepingYan Co-authored-by: Yizhong Zhang Co-authored-by: Zhi Lin --- .github/workflows/config/bloom-560m-ci.yaml | 7 +- .github/workflows/config/gpt2-ci.yaml | 7 +- .../config/llama-2-7b-chat-hf-vllm-fp32.yaml | 8 - .github/workflows/config/mpt_deltatuner.yaml | 13 - .../config/mpt_deltatuner_deepspeed.yaml | 13 - .github/workflows/config/opt-125m-ci.yaml | 7 +- README.md | 2 +- docs/serve.md | 2 +- .../openai_tools_call_query.py | 4 +- .../api_server_openai/query_http_requests.py | 2 +- .../query_http_requests_tool.py | 2 +- llm_on_ray/inference/chat_process.py | 222 ----------------- llm_on_ray/inference/chat_template_process.py | 86 +++++++ llm_on_ray/inference/inference_config.py | 4 + .../inference/models/CodeLlama-7b-hf.yaml | 7 +- llm_on_ray/inference/models/bloom-560m.yaml | 7 +- .../models/deepseek-coder-33b-instruct.yaml | 13 +- llm_on_ray/inference/models/deplot.yaml | 14 +- llm_on_ray/inference/models/falcon-7b.yaml | 7 +- llm_on_ray/inference/models/fuyu8b.yaml | 14 +- llm_on_ray/inference/models/gemma-2b.yaml | 9 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 13 - llm_on_ray/inference/models/gpt2.yaml | 7 +- .../models/hpu/llama-2-70b-chat-hf-hpu.yaml | 8 - .../models/hpu/llama-2-7b-chat-hf-hpu.yaml | 8 - .../models/hpu/llama-3-70b-chat-hf-hpu.yaml | 8 - .../models/hpu/llama-3-8b-instruct-hpu.yaml | 8 - .../models/hpu/neural-chat-7b-v3-3.yaml | 11 +- .../ipex-llm/mistral-7b-v0.1-ipex-llm.yaml | 7 +- .../models/ipex-llm/mpt-7b-ipex-llm.yaml | 13 - .../inference/models/llama-2-7b-chat-hf.yaml | 8 - .../models/mistral-7b-Instruct-v0.2.yaml | 6 - .../inference/models/mistral-7b-v0.1.yaml | 7 +- llm_on_ray/inference/models/mpt-7b.yaml | 13 - .../inference/models/neural-chat-7b-v3-1.yaml | 11 +- llm_on_ray/inference/models/opt-125m.yaml | 7 +- .../inference/models/sqlcoder-7b-2.yaml | 7 +- llm_on_ray/inference/models/starcoder.yaml | 7 +- .../export_inference_config_to_yaml.py | 24 -- .../template/inference_config_template.yaml | 35 --- .../models/templates/default_template.jinja | 23 ++ .../models/templates/template_codellama.jinja | 22 ++ .../models/templates/template_gemma.jinja | 18 ++ .../models/templates/template_gpt2.jinja | 20 ++ .../models/templates/template_llama2.jinja | 21 ++ .../models/templates/template_mistral.jinja | 16 ++ .../templates/template_neuralchat.jinja | 23 ++ .../models/vllm/llama-2-7b-chat-hf-vllm.yaml | 8 - llm_on_ray/inference/predictor_deployment.py | 31 +-- llm_on_ray/inference/utils.py | 28 ++- llm_on_ray/ui/start_ui.py | 6 +- tests/inference/test_chat_template.py | 227 ++++++++++++++++++ tests/test_getting_started.sh | 2 +- 53 files changed, 523 insertions(+), 580 deletions(-) delete mode 100644 llm_on_ray/inference/chat_process.py create mode 100644 llm_on_ray/inference/chat_template_process.py delete mode 100644 llm_on_ray/inference/models/template/export_inference_config_to_yaml.py delete mode 100644 
llm_on_ray/inference/models/template/inference_config_template.yaml create mode 100644 llm_on_ray/inference/models/templates/default_template.jinja create mode 100644 llm_on_ray/inference/models/templates/template_codellama.jinja create mode 100644 llm_on_ray/inference/models/templates/template_gemma.jinja create mode 100644 llm_on_ray/inference/models/templates/template_gpt2.jinja create mode 100644 llm_on_ray/inference/models/templates/template_llama2.jinja create mode 100644 llm_on_ray/inference/models/templates/template_mistral.jinja create mode 100644 llm_on_ray/inference/models/templates/template_neuralchat.jinja create mode 100644 tests/inference/test_chat_template.py diff --git a/.github/workflows/config/bloom-560m-ci.yaml b/.github/workflows/config/bloom-560m-ci.yaml index 16a97d896..06d0064bb 100644 --- a/.github/workflows/config/bloom-560m-ci.yaml +++ b/.github/workflows/config/bloom-560m-ci.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index e528123cc..b3927953b 100644 --- a/.github/workflows/config/gpt2-ci.yaml +++ b/.github/workflows/config/gpt2-ci.yaml @@ -14,10 +14,5 @@ ipex: model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml index 46be6eb57..d3d96a0e1 100644 --- a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml +++ b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 250004dc2..e0c0d6946 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -13,20 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. 
- - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml index 40051e0fa..a4fdd0709 100644 --- a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml +++ b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml @@ -13,20 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index 047d0008c..f13ec7e54 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/README.md b/README.md index 4728e5fc0..5ba4410fa 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt2", - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], "temperature": 0.7 }' diff --git a/docs/serve.md b/docs/serve.md index b822cd416..7a19e139d 100644 --- a/docs/serve.md +++ b/docs/serve.md @@ -52,7 +52,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": $MODEL_NAME, - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], "temperature": 0.7 }' diff --git a/examples/inference/api_server_openai/openai_tools_call_query.py b/examples/inference/api_server_openai/openai_tools_call_query.py index 897ddba13..6a3ea6990 100644 --- a/examples/inference/api_server_openai/openai_tools_call_query.py +++ b/examples/inference/api_server_openai/openai_tools_call_query.py @@ -75,11 +75,11 @@ ] messages = [ [ - {"role": "user", "content": "You are a helpful assistant"}, + {"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "What's the weather like in Boston today?"}, ], [ - {"role": "user", "content": "You are a helpful assistant"}, + {"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "Tell me a short joke?"}, ], ] diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index 536deb30e..2ef1ac0c6 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -58,7 +58,7 @@ body = { "model": args.model_name, "messages": [ - 
{"role": "assistant", "content": "You are a helpful assistant."}, + {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": args.input_text}, ], "stream": args.streaming_response, diff --git a/examples/inference/api_server_openai/query_http_requests_tool.py b/examples/inference/api_server_openai/query_http_requests_tool.py index 217f2b792..bc77c3272 100644 --- a/examples/inference/api_server_openai/query_http_requests_tool.py +++ b/examples/inference/api_server_openai/query_http_requests_tool.py @@ -73,7 +73,7 @@ messages = [ [ - {"role": "user", "content": "You are a helpful assistant"}, + {"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "What's the weather like in Boston today?"}, ], ] diff --git a/llm_on_ray/inference/chat_process.py b/llm_on_ray/inference/chat_process.py deleted file mode 100644 index 3ee238fb7..000000000 --- a/llm_on_ray/inference/chat_process.py +++ /dev/null @@ -1,222 +0,0 @@ -# -# Copyright 2023 The LLM-on-Ray Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class ChatModel: - human_id = "" - bot_id = "" - unknown_id = "" - MEANINGLESS_WORDS = ["", "", "<|endoftext|>", "
"] - stop_words = [""] - - def __init__(self, intro, human_id, bot_id, stop_words) -> None: - self.intro = intro - self.human_id = human_id - self.bot_id = bot_id - self.stop_words = stop_words - self.MEANINGLESS_WORDS.extend(self.stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - role, content = msg.role, msg.content - if role == "user": - prompt += f"{self.human_id}: {content}\n" - elif role == "assistant": - prompt += f"{self.bot_id}: {content}\n" - else: - prompt += f"{self.unknown_id}: {content}\n" - prompt += f"{self.bot_id}:" - return prompt - - def convert_output(self, output: str): - """Convert the model output to final answer.""" - human_id = self.human_id.strip() - bot_id = self.bot_id.strip() - if human_id != "": - output = output.split(human_id)[0] - if bot_id != "": - output = output.split(bot_id)[0] - for word in self.MEANINGLESS_WORDS: - output = output.replace(word, "") - text = output - # remove partial human_id or bot id - if "\n" in text and ( - human_id.startswith(text[text.rfind("\n") + 1 :]) - or bot_id.startswith(text[text.rfind("\n") + 1]) - ): - text = text[: text.rfind("\n")] - return text - - def get_prompt(self, messages): - """Generate response based on messages.""" - prompt = self.prepare_prompt(messages) - return prompt - - -class ChatModelGptJ(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id}:\n{content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id}:\n{content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelLLama(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - elif role == "tool": - prompt += f"{content}\n" - elif role == "system": - prompt += f"### system:\n{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelwithImage(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - text_prompt = [] - image_prompt = [] - for item in content: - if item["type"] == "text": - text_prompt.append(item["text"]) - elif item["type"] == "image_url": - image_prompt.append(item["image_url"]) - else: - raise 
ValueError(f"Unknown content type {item['type']}") - - content = "\n".join(text_prompt) - # prepare images - images = [] - for img in image_prompt: - if "url" not in img: - continue - is_data = len(re.findall("^data:image/.+;base64,", img["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", img["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(img["url"], stream=True).raw)) - - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt, images - - -class ChatModelGemma(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id} {content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id} {content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelNoFormat(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - msg = dict(msg) - prompt += msg["content"] - return prompt - - -if __name__ == "__main__": - process_tool = ChatModelGptJ( - "", "### Instruction", "### Response", stop_words=["##", "### Instruction"] - ) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py new file mode 100644 index 000000000..004081f03 --- /dev/null +++ b/llm_on_ray/inference/chat_template_process.py @@ -0,0 +1,86 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import List + +from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage +from llm_on_ray.inference.utils import parse_jinja_file + + +class ChatTemplatePreprocess: + def __init__(self, predictor) -> None: + self.predictor = predictor + + def get_prompt(self, input: List, is_mllm=False): + """Generate prompt based on chat templates.""" + self.predictor.tokenizer.chat_template = ( + parse_jinja_file(self.predictor.infer_conf.model_description.chat_template) + or self.predictor.tokenizer.chat_template + or parse_jinja_file(self.predictor.infer_conf.model_description.default_chat_template) + ) + """ChatMessage for OpenAI backend and dict for simple backend.""" + if input and isinstance(input[0], (ChatMessage, dict)): + messages = ( + [dict(chat_message) for chat_message in input] + if isinstance(input[0], ChatMessage) + else input + ) + if is_mllm: + texts, images = self._extract_messages(messages) + image = self._prepare_image(images) + prompt = self.predictor.tokenizer.apply_chat_template( + texts, add_generation_prompt=True, tokenize=False + ) + return prompt, image + + prompt = self.predictor.tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + return prompt + + raise TypeError(f"Unsupported type {type(input)} for text. Expected dict or list of dicts.") + + def _extract_messages(self, messages): + texts, images = [], [] + for message in messages: + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: list): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images: List = [] + for msg in messages: + msg = dict(msg) + content = msg["content"] + is_data = len(re.findall("^data:image/.+;base64,", content)) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content, stream=True).raw)) + return images diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 323adc548..a731af55f 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -119,6 +119,10 @@ class ModelDescription(BaseModel): input_processor: str = "AutoProcessor" model_loader: str = "AutoModel" + chat_model_with_image: bool = False + chat_template: Union[str, None] = None + default_chat_template: str = "llm_on_ray/inference/models/templates/default_template.jinja" + @validator("quantization_type") def _check_quant_type(cls, v: str): if v: diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 9ea2d77db..37e18acf4 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_codellama.jinja" diff --git 
a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index ba2a6d962..12d2b0372 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml index 75e646a44..adc1d158c 100644 --- a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml +++ b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml @@ -10,16 +10,7 @@ device: cpu ipex: enabled: false precision: bf16 -model_description: +model_description: model_id_or_path: deepseek-ai/deepseek-coder-33b-instruct tokenizer_name_or_path: deepseek-ai/deepseek-coder-33b-instruct - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ['<|EOT|>', ""] - - - - + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 4e732a4fe..6e5bde761 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -13,15 +13,5 @@ ipex: model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot - chat_processor: ChatModelwithImage - input_processor: 'AutoProcessor' - model_loader: 'Pix2StructForConditionalGeneration' - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 8176a2689..119337d70 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 551a85789..77d33ff9b 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -13,15 +13,5 @@ ipex: model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b - chat_processor: ChatModelwithImage - input_processor: FuyuProcessor - model_loader: FuyuForCausalLM - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 8335857ca..5b013b371 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -13,13 +13,6 @@ ipex: model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b - chat_processor: ChatModelGemma - prompt: - intro: '' 
- human_id: 'user - {msg}' - bot_id: 'model - {msg}' - stop_words: [] config: use_auth_token: ' ' + chat_template: "llm_on_ray/inference/models/templates/template_gemma.jinja" diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index c7778c12e..9719b2f7e 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -14,17 +14,4 @@ ipex: model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: 'Below is an instruction that describes a task. Write a response that appropriately - completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 48287670a..06a4b1b8b 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -13,10 +13,5 @@ ipex: model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml index d68da8428..ab411ff0e 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml @@ -10,13 +10,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-70b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml index 374a98f77..b7b19f02a 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml @@ -8,13 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml index 6976eafb3..32cf9bb4e 100644 --- a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml @@ -9,13 +9,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml index 35e3fc260..d57ffcc22 100644 --- a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml @@ -8,13 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct 
tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 848358bec..64566a6d8 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -14,13 +14,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' - human_id: ' - - ### User' - bot_id: ' - - ### Assistant' + chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" diff --git a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml index 6a8523467..ed03ad82d 100644 --- a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml @@ -14,12 +14,7 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 ipexllm: true tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true load_in_4bit: true + chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja" diff --git a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml index d352a6517..ecb129973 100644 --- a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml @@ -14,19 +14,6 @@ model_description: model_id_or_path: mosaicml/mpt-7b-chat ipexllm: true tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. 
- - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true load_in_4bit: true diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 4b3e11e98..7fdae3933 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -13,13 +13,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index 1af9aad1b..ea50f6af7 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -13,11 +13,5 @@ model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index c8a0ff385..3654f18f0 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -14,11 +14,6 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true + chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja" diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 4ea12adb3..89ce086ed 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -13,18 +13,5 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 13a29676c..8f32c28b7 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -13,13 +13,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' 
- human_id: ' - - ### User' - bot_id: ' - - ### Assistant' + chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 545cd2145..81e05fc19 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 7130148a3..daa5256c5 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -12,11 +12,6 @@ ipex: model_description: model_id_or_path: defog/sqlcoder-7b-2 tokenizer_name_or_path: defog/sqlcoder-7b-2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ["```"] config: use_auth_token: '' + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 0da59ac02..199926353 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -13,11 +13,6 @@ device: cpu model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py b/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py deleted file mode 100644 index 74630030b..000000000 --- a/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py +++ /dev/null @@ -1,24 +0,0 @@ -# -# Copyright 2023 The LLM-on-Ray Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import yaml -import os -from llm_on_ray.inference.inference_config import InferenceConfig - -ic = InferenceConfig() - -with open(os.path.dirname(__file__) + "/inference_config_template.yaml", "w") as f: - yaml.dump(ic.dict(), f, sort_keys=False) diff --git a/llm_on_ray/inference/models/template/inference_config_template.yaml b/llm_on_ray/inference/models/template/inference_config_template.yaml deleted file mode 100644 index 137ddb2dc..000000000 --- a/llm_on_ray/inference/models/template/inference_config_template.yaml +++ /dev/null @@ -1,35 +0,0 @@ -port: 8000 -name: null -route_prefix: null -num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -hpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 -device: cpu -ipex: - enabled: true - precision: bf16 -model_description: - model_id_or_path: null - ipexllm:: false - tokenizer_name_or_path: null - chat_processor: null - gpt_base_model: false - quantized_model_id_or_path: null - quantization_type: null - peft_model_id_or_path: null - peft_type: null - use_hpu_graphs: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] - config: - trust_remote_code: false - use_auth_token: null - load_in_4bit: false - ipexllm_config: - load_in_low_bit: '' diff --git a/llm_on_ray/inference/models/templates/default_template.jinja b/llm_on_ray/inference/models/templates/default_template.jinja new file mode 100644 index 000000000..d098fc154 --- /dev/null +++ b/llm_on_ray/inference/models/templates/default_template.jinja @@ -0,0 +1,23 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message %} +{{ system_message }} +{% endif %} +{% if message['role'] == 'user' %} +{{ '### Instruction: ' + message['content'].strip() }} +{% elif message['role'] == 'assistant' %} +{{ '### Response:' + message['content'].strip() }} +{% endif %} +{% endfor %} +{% if add_generation_prompt %} +{{ '### Response:' }} +{% endif %} diff --git a/llm_on_ray/inference/models/templates/template_codellama.jinja b/llm_on_ray/inference/models/templates/template_codellama.jinja new file mode 100644 index 000000000..62213c3ec --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_codellama.jinja @@ -0,0 +1,22 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message != false %} +{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %} +{% else %} +{% set content = message['content'] %} +{% endif %} +{% if message['role'] == 'user' %} +{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }} +{% elif message['role'] == 'assistant' %} +{{ ' ' + content | trim + ' ' + eos_token }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_gemma.jinja 
b/llm_on_ray/inference/models/templates/template_gemma.jinja new file mode 100644 index 000000000..9dfacbb9a --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_gemma.jinja @@ -0,0 +1,18 @@ +{{ bos_token }} +{% if messages[0]['role'] == 'system' %} +{% set messages = messages[1:] %} +{% endif %} +{% for message in messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if (message['role'] == 'assistant') %} +{% set role = 'model' %} +{% else %} +{% set role = message['role'] %} +{% endif %} +{{ '' + role + '\n' + message['content'] | trim + '\n' }} +{% endfor %} +{% if add_generation_prompt %} +{{'model\n'}} +{% endif %} diff --git a/llm_on_ray/inference/models/templates/template_gpt2.jinja b/llm_on_ray/inference/models/templates/template_gpt2.jinja new file mode 100644 index 000000000..bdd0814ba --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_gpt2.jinja @@ -0,0 +1,20 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message %} +{{ system_message }} +{% endif %} +{% if message['role'] == 'user' %} +{{ message['content'].strip() }} +{% elif message['role'] == 'assistant' %} +{{ message['content'].strip() }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_llama2.jinja b/llm_on_ray/inference/models/templates/template_llama2.jinja new file mode 100644 index 000000000..1fcd5ec2d --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_llama2.jinja @@ -0,0 +1,21 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %}{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message != false %} +{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %} +{% else %} +{% set content = message['content'] %} +{% endif %} +{% if message['role'] == 'user' %} +{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }} +{% elif message['role'] == 'assistant' %} +{{ ' ' + content.strip() + ' ' + eos_token }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_mistral.jinja b/llm_on_ray/inference/models/templates/template_mistral.jinja new file mode 100644 index 000000000..1ad2bc4fc --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_mistral.jinja @@ -0,0 +1,16 @@ +{{ bos_token }} +{% if messages[0]['role'] == 'system' %} +{% set messages = messages[1:] %} +{% endif %} +{% for message in messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if message['role'] == 'user' %} +{{ '[INST] ' + message['content'] + ' [/INST]' }} +{% 
elif message['role'] == 'assistant' %} +{{ message['content'] + eos_token}} +{% else %} +{{ raise_exception('Only user and assistant roles are supported!') }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_neuralchat.jinja b/llm_on_ray/inference/models/templates/template_neuralchat.jinja new file mode 100644 index 000000000..872e225b6 --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_neuralchat.jinja @@ -0,0 +1,23 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message != false %} +{{ '###System: ' + system_message.strip() }} +{% endif %} +{% if message['role'] == 'user' %} +{{ '###User: ' + message['content'].strip() }} +{% elif message['role'] == 'assistant' %} +{{ '###Assistant: ' + message['content'].strip() }} +{% endif %} +{% endfor %} +{% if add_generation_prompt %} +{{ '###Assistant: ' }} +{% endif %} diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml index acbf58455..9302b9be2 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml +++ b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index e96dd5563..f5ac35d80 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -26,6 +26,8 @@ from starlette.requests import Request from starlette.responses import StreamingResponse, JSONResponse from fastapi import HTTPException + +from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess from llm_on_ray.inference.inference_config import InferenceConfig from llm_on_ray.inference.api_openai_backend.openai_protocol import ( ChatMessage, @@ -52,31 +54,11 @@ def __init__( max_batch_size=_DEFAULT_MAX_BATCH_SIZE, ): self.device = torch.device(infer_conf.device) - self.process_tool = None - chat_processor_name = infer_conf.model_description.chat_processor - prompt = infer_conf.model_description.prompt self.handle_dynamic_batch.set_max_batch_size(max_batch_size) - - if chat_processor_name: - try: - module = __import__("chat_process") - except Exception: - sys.path.append(os.path.dirname(__file__)) - module = __import__("chat_process") - chat_processor = getattr(module, chat_processor_name, None) - if chat_processor is None: - raise ValueError( - infer_conf.name - + " deployment failed. chat_processor(" - + chat_processor_name - + ") does not exist." 
- ) - self.process_tool = chat_processor(**prompt.dict()) - self.use_deepspeed = infer_conf.deepspeed self.use_vllm = infer_conf.vllm.enabled - self.is_mllm = True if chat_processor_name in ["ChatModelwithImage"] else False + self.is_mllm = infer_conf.model_description.chat_model_with_image # Used to determine if openai backend is used self.use_openai = False @@ -103,6 +85,7 @@ def __init__( self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() + self.process_tool = ChatTemplatePreprocess(self.predictor) def consume_streamer(self, streamer): for text in streamer: @@ -351,6 +334,7 @@ def preprocess_prompts( Raises: HTTPException: If the input prompt format is invalid or not supported. """ + if isinstance(input, str): return input elif isinstance(input, List): @@ -376,7 +360,7 @@ def preprocess_prompts( # Process the input prompts with MLLM tool if self.process_tool is not None: if self.is_mllm: - input, image = self.process_tool.get_prompt(input) + input, image = self.process_tool.get_prompt(input, self.is_mllm) prompts.append(input) images.extend(image) return (prompts, images) @@ -403,16 +387,15 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON status_code=400, content="Invalid JSON format from http request.", ) - streaming_response = json_request["stream"] if "stream" in json_request else False input = json_request["text"] if "text" in json_request else "" + if input == "": return JSONResponse( status_code=400, content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 2d1a4d878..6712d7bfb 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # - +import os +import pathlib from transformers import StoppingCriteria, TextStreamer from ray.util.queue import Queue import torch @@ -194,3 +195,28 @@ def module_import_and_init(module_name, clazz, **clazzs_kwargs): module = importlib.import_module(module_name) class_ = getattr(module, clazz) return class_(**clazzs_kwargs) + + +def parse_jinja_file(chat_template: Union[str, None]): + if chat_template is None: + return None + + try: + # Get the absolute path of the provided chat template + jinja_path = os.path.abspath(chat_template) + + # If the user specifies a jinja file, the absolute path to jinja_path exists. + # If jinja_path does not exist, it means that the user did not specify jinja and the default jinja is used. 
+ if not os.path.exists(jinja_path): + jinja_path = str( + pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent.parent + / chat_template + ) + + with open(jinja_path, "r") as file: + content = file.read() + return content + except FileNotFoundError: + raise FileNotFoundError(f"File {jinja_path} not found.") + except Exception as e: + raise Exception(f"An error occurred: {str(e)}") diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index c30851a8e..5cdece259 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -31,11 +31,7 @@ from ray.util import queue from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig -from llm_on_ray.inference.chat_process import ( - ChatModelGptJ, - ChatModelLLama, - ChatModelwithImage, -) + from llm_on_ray.inference.predictor_deployment import PredictorDeployment from llm_on_ray.ui.html_format import cpu_memory_html, ray_status_html, custom_css from langchain.vectorstores import FAISS diff --git a/tests/inference/test_chat_template.py b/tests/inference/test_chat_template.py new file mode 100644 index 000000000..4a987a841 --- /dev/null +++ b/tests/inference/test_chat_template.py @@ -0,0 +1,227 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import pathlib + +import pytest +from transformers import AutoTokenizer + +from llm_on_ray.inference.utils import parse_jinja_file + +# Define the base path for templates +base_path = ( + pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent.parent + / "llm_on_ray/inference/models/templates" +) + + +# Define models, templates, and their corresponding expected outputs +MODEL_TEMPLATE_GENERATON_OUTPUT = [ + ( + "EleutherAI/gpt-j-6b", + base_path / "default_template.jinja", + True, + "Below is an instruction that describes a task. Write a response that " + "appropriately completes the request.\n" + "### Instruction: Hello\n" + "### Response:Hi there!\n" + "### Response:\n", + ), + ( + "EleutherAI/gpt-j-6b", + base_path / "default_template.jinja", + False, + "Below is an instruction that describes a task. 
Write a response that " + "appropriately completes the request.\n" + "### Instruction: Hello\n" + "### Response:Hi there!\n", + ), + ("gpt2", base_path / "template_gpt2.jinja", True, "Hello\nHi there!\nWhat is the capital of\n"), + ( + "gpt2", + base_path / "template_gpt2.jinja", + False, + "Hello\nHi there!\nWhat is the capital of\n", + ), + ( + "google/gemma-2b", + base_path / "template_gemma.jinja", + True, + "<|endoftext|>\n" + "user\n" + "Hello\n" + "\n" + "model\n" + "Hi there!\n" + "\n" + "user\n" + "What is the capital of\n" + "\n" + "model\n" + "\n", + ), + ( + "google/gemma-2b", + base_path / "template_gemma.jinja", + False, + "<|endoftext|>\n" + "user\n" + "Hello\n" + "\n" + "model\n" + "Hi there!\n" + "\n" + "user\n" + "What is the capital of\n" + "\n", + ), + ( + "mistralai/Mistral-7B-v0.1", + base_path / "template_mistral.jinja", + True, + "\n" + "[INST] Hello [/INST]\n" + "Hi there!\n" + "[INST] What is the capital of [/INST]\n", + ), + ( + "mistralai/Mistral-7B-v0.1", + base_path / "template_mistral.jinja", + False, + "\n" + "[INST] Hello [/INST]\n" + "Hi there!\n" + "[INST] What is the capital of [/INST]\n", + ), + ( + "Intel/neural-chat-7b-v3-1", + base_path / "template_neuralchat.jinja", + True, + "###System: You are a chatbot developed by Intel. Please answer all " + "questions to the best of your ability.\n" + "###User: Hello\n" + "###Assistant: Hi there!\n" + "###Assistant: \n", + ), + ( + "Intel/neural-chat-7b-v3-1", + base_path / "template_neuralchat.jinja", + False, + "###System: You are a chatbot developed by Intel. Please answer all " + "questions to the best of your ability.\n" + "###User: Hello\n" + "###Assistant: Hi there!\n", + ), + ( + "adept/fuyu-8b", + base_path / "template_llama2.jinja", + True, + "|ENDOFTEXT|[INST] Hello [/INST]\n" + " Hi there! |ENDOFTEXT|\n" + "|ENDOFTEXT|[INST] What is the capital of [/INST]\n", + ), + ( + "adept/fuyu-8b", + base_path / "template_llama2.jinja", + False, + "|ENDOFTEXT|[INST] Hello [/INST]\n" + " Hi there! |ENDOFTEXT|\n" + "|ENDOFTEXT|[INST] What is the capital of [/INST]\n", + ), + ( + "codellama/CodeLlama-7b-hf", + base_path / "template_codellama.jinja", + True, + "[INST] Hello [/INST]\n" + " Hi there! \n" + "[INST] What is the capital of [/INST]\n", + ), + ( + "codellama/CodeLlama-7b-hf", + base_path / "template_codellama.jinja", + False, + "[INST] Hello [/INST]\n" + " Hi there! \n" + "[INST] What is the capital of [/INST]\n", + ), +] + + +TEST_MESSAGES = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "What is the capital of"}, +] + +TEST_NEURALCHAT_MESSAGES = [ + { + "role": "system", + "content": "You are a chatbot developed by Intel. Please answer all questions to the best of your ability.", + }, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, +] + +TEST_DEFAULT_MESSAGES = [ + { + "role": "system", + "content": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.", + }, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, +] + + +@pytest.mark.parametrize( + "model,template,add_generation_prompt,expected_output", MODEL_TEMPLATE_GENERATON_OUTPUT +) +def test_get_gen_default_prompt( + model: object, template: object, add_generation_prompt: object, expected_output: object +) -> object: + # Initialize the tokenizer + tokenizer = AutoTokenizer.from_pretrained("gpt2") + tokenizer.chat_template = parse_jinja_file(template) + + if model == "mistralai/Mistral-7B-v0.1" or model == "codellama/CodeLlama-7b-hf": + tokenizer.bos_token = "" + tokenizer.eos_token = "" + elif model == "adept/fuyu-8b": + tokenizer.bos_token = "|ENDOFTEXT|" + tokenizer.eos_token = "|ENDOFTEXT|" + + # Call the function and get the result + if model == "Intel/neural-chat-7b-v3-1": + result = tokenizer.apply_chat_template( + conversation=TEST_NEURALCHAT_MESSAGES, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ) + elif model == "EleutherAI/gpt-j-6b": + result = tokenizer.apply_chat_template( + conversation=TEST_DEFAULT_MESSAGES, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ) + else: + result = tokenizer.apply_chat_template( + conversation=TEST_MESSAGES, tokenize=False, add_generation_prompt=add_generation_prompt + ) + # Test assertion + assert result == expected_output, ( + f"The generated prompt does not match the expected output for " + f"model {model} and template {template}" + ) diff --git a/tests/test_getting_started.sh b/tests/test_getting_started.sh index 6a900a553..a84bfe334 100755 --- a/tests/test_getting_started.sh +++ b/tests/test_getting_started.sh @@ -33,7 +33,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt2", - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], "temperature": 0.7 }'
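Note: the YAML changes above replace the per-model `chat_processor`/`prompt` blocks with a single `chat_template` entry pointing at a Jinja file, which `ChatTemplatePreprocess` applies through the tokenizer. Below is a minimal standalone sketch of that resolution order (template named in the model YAML, else the tokenizer's own template, else the bundled default); the model name and file paths are placeholders for illustration, not values required by this patch.

```python
# Sketch of the template resolution performed in ChatTemplatePreprocess.get_prompt().
# Assumptions: "gpt2" stands in for any HF model; paths are relative to the repo root.
from transformers import AutoTokenizer


def read_template(path):
    # Simplified stand-in for llm_on_ray.inference.utils.parse_jinja_file
    if path is None:
        return None
    with open(path, "r") as f:
        return f.read()


configured_template = "llm_on_ray/inference/models/templates/template_gpt2.jinja"  # model YAML: chat_template
default_template = "llm_on_ray/inference/models/templates/default_template.jinja"  # ModelDescription default

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.chat_template = (
    read_template(configured_template)  # 1) template named in the YAML, if any
    or tokenizer.chat_template          # 2) template shipped with the tokenizer
    or read_template(default_template)  # 3) repo-wide fallback
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
```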
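Note: the README, docs/serve.md, example scripts, and tests/test_getting_started.sh now send the leading message with role "system" instead of "assistant", which is the role the templates above key on. A hedged Python equivalent of the updated curl call is sketched below; the endpoint URL and model name are assumptions for illustration, following the $ENDPOINT_URL convention used in the docs.

```python
# Mirrors the updated curl example with a "system" message; endpoint/model are assumed values.
import json
import os

import requests

endpoint_url = os.environ.get("ENDPOINT_URL", "http://localhost:8000/v1")  # assumed default
body = {
    "model": "gpt2",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    "temperature": 0.7,
}
resp = requests.post(
    f"{endpoint_url}/chat/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps(body),
)
print(resp.json()["choices"][0]["message"]["content"])
```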
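Note: for image-capable models (deplot, fuyu-8b), the deleted ChatModelwithImage logic now lives in ChatTemplatePreprocess behind the new `chat_model_with_image: true` flag. The sketch below restates that text/image split and image loading as standalone helpers so the behavior can be tried outside the deployment; it follows the patch and adds no new project API.

```python
# Standalone restatement of _extract_messages/_prepare_image from chat_template_process.py.
import base64
import re
from io import BytesIO

import requests
from PIL import Image


def split_mllm_messages(messages):
    """Separate text parts from image_url parts of OpenAI-style multimodal messages."""
    texts, images = [], []
    for message in messages:
        if message["role"] == "user" and isinstance(message["content"], list):
            texts.append({"role": "user", "content": message["content"][0]["text"]})
            images.append({"role": "user", "content": message["content"][1]["image_url"]["url"]})
        else:
            texts.append(message)
    return texts, images


def load_images(image_messages):
    """Decode base64 data URLs or fetch remote URLs into PIL images."""
    images = []
    for msg in image_messages:
        url = msg["content"]
        if re.match(r"^data:image/.+;base64,", url):
            encoded = re.sub(r"^data:image/.+;base64,", "", url)
            images.append(Image.open(BytesIO(base64.b64decode(encoded))))
        else:
            images.append(Image.open(requests.get(url, stream=True).raw))
    return images
```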