From 620800fc407e9b360eef13650d3de83d63818417 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Thu, 16 May 2024 03:42:03 +0000 Subject: [PATCH] [Inference ] Integrate chat template in llm-on-ray (#199) * integrate inference chat template Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * Update query_http_requests.py * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update * update * update * update yaml file * update yaml * format yaml * update * Update mpt_deltatuner.yaml * update * Update neural-chat-7b-v3-1.yaml * Update predictor_deployment.py * 1. add jinja file 2. add chat template unit test 3. fix comments Signed-off-by: minmingzhu * add license header Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * Update bloom-560m-ci.yaml * debug CI Signed-off-by: minmingzhu * debug CI Signed-off-by: minmingzhu * Update VLLM installation script and documentation (#212) * Update VLLM installation script and documentation Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang * Update vLLM installation message Signed-off-by: Wu, Xiaochang * Update installation instructions for vLLM CPU Signed-off-by: Wu, Xiaochang * Update Dockerfile.vllm Signed-off-by: Wu, Xiaochang * Update VLLM version to 0.4.1 Signed-off-by: Wu, Xiaochang * update doc Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang --------- Signed-off-by: Wu, Xiaochang * [Workflow] Unify Docker operations into bash (#123) * docker2sh test * codepath * codepath * codepath * add * add * add * add * add * add * df * docker.sh * docker bash * docker bash * docker bash * docker bash * inference docker bash * merge main0312 * merge main0312 * merge main0312 * test set-e * fix test * fix * fix * fix * test error * test error * add map * test install error * test install error * test install error * test install error * test * test * fix * fix * fix * only inference * fux * fux * fux * target * target * target * fix proxy * fix proxy * fix proxy * fix proxy * fix proxy * fix proxy * fix proxy * fix fuc * fix fuc * fix fuc * all inference * add finetune * fix * fix * fix * fix * fix finetune * fix finetune * fix review * fix review * fix review * add info output * Update proxy settings and Docker configurations Signed-off-by: Wu, Xiaochang * fix vllm pr212 * fix * fix * change name --------- Signed-off-by: Wu, Xiaochang Co-authored-by: Wu, Xiaochang * fix comments Signed-off-by: minmingzhu * update code style Signed-off-by: minmingzhu * Fix openai response for vLLM (#213) * [CI] Add llama2-70b inference workflow (#208) * add llama-2-70b * nit * fix vllm inference ci * Revert "fix vllm inference ci" This reverts commit 36062bdac79df6c20e631c56cc3dfaae748e816d. * Fix StoppingCriteriaSub parameters to be compatible with latest Transformers (#215) * 1. fix CI 2. 
fix comments Signed-off-by: minmingzhu * format Signed-off-by: minmingzhu * modify jinja path Signed-off-by: minmingzhu * fix comments Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * fix comments Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update jinja Signed-off-by: minmingzhu * update jinja file Signed-off-by: minmingzhu --------- Signed-off-by: minmingzhu Signed-off-by: Wu, Xiaochang Signed-off-by: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Co-authored-by: Xiaochang Wu Co-authored-by: yutianchen Co-authored-by: KepingYan Co-authored-by: Yizhong Zhang Co-authored-by: Zhi Lin --- .github/workflows/config/bloom-560m-ci.yaml | 7 +- .github/workflows/config/gpt2-ci.yaml | 7 +- .../config/llama-2-7b-chat-hf-vllm-fp32.yaml | 8 - .github/workflows/config/mpt_deltatuner.yaml | 13 - .../config/mpt_deltatuner_deepspeed.yaml | 13 - .github/workflows/config/opt-125m-ci.yaml | 7 +- README.md | 2 +- docs/serve.md | 2 +- .../openai_tools_call_query.py | 4 +- .../api_server_openai/query_http_requests.py | 2 +- .../query_http_requests_tool.py | 2 +- llm_on_ray/inference/chat_process.py | 222 ----------------- llm_on_ray/inference/chat_template_process.py | 86 +++++++ llm_on_ray/inference/inference_config.py | 4 + .../inference/models/CodeLlama-7b-hf.yaml | 7 +- llm_on_ray/inference/models/bloom-560m.yaml | 7 +- .../models/deepseek-coder-33b-instruct.yaml | 13 +- llm_on_ray/inference/models/deplot.yaml | 14 +- llm_on_ray/inference/models/falcon-7b.yaml | 7 +- llm_on_ray/inference/models/fuyu8b.yaml | 14 +- llm_on_ray/inference/models/gemma-2b.yaml | 9 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 13 - llm_on_ray/inference/models/gpt2.yaml | 7 +- .../models/hpu/llama-2-70b-chat-hf-hpu.yaml | 8 - .../models/hpu/llama-2-7b-chat-hf-hpu.yaml | 8 - .../models/hpu/llama-3-70b-chat-hf-hpu.yaml | 8 - .../models/hpu/llama-3-8b-instruct-hpu.yaml | 8 - .../models/hpu/neural-chat-7b-v3-3.yaml | 11 +- .../ipex-llm/mistral-7b-v0.1-ipex-llm.yaml | 7 +- .../models/ipex-llm/mpt-7b-ipex-llm.yaml | 13 - .../inference/models/llama-2-7b-chat-hf.yaml | 8 - .../models/mistral-7b-Instruct-v0.2.yaml | 6 - .../inference/models/mistral-7b-v0.1.yaml | 7 +- llm_on_ray/inference/models/mpt-7b.yaml | 13 - .../inference/models/neural-chat-7b-v3-1.yaml | 11 +- llm_on_ray/inference/models/opt-125m.yaml | 7 +- .../inference/models/sqlcoder-7b-2.yaml | 7 +- llm_on_ray/inference/models/starcoder.yaml | 7 +- .../export_inference_config_to_yaml.py | 24 -- .../template/inference_config_template.yaml | 35 --- .../models/templates/default_template.jinja | 23 ++ .../models/templates/template_codellama.jinja | 22 ++ .../models/templates/template_gemma.jinja | 18 ++ .../models/templates/template_gpt2.jinja | 20 ++ .../models/templates/template_llama2.jinja | 21 ++ .../models/templates/template_mistral.jinja | 16 ++ .../templates/template_neuralchat.jinja | 23 ++ .../models/vllm/llama-2-7b-chat-hf-vllm.yaml | 8 - llm_on_ray/inference/predictor_deployment.py | 31 +-- llm_on_ray/inference/utils.py | 28 ++- llm_on_ray/ui/start_ui.py | 6 +- tests/inference/test_chat_template.py | 227 ++++++++++++++++++ tests/test_getting_started.sh | 2 +- 53 files changed, 523 insertions(+), 580 deletions(-) delete mode 100644 llm_on_ray/inference/chat_process.py create mode 100644 llm_on_ray/inference/chat_template_process.py delete mode 100644 llm_on_ray/inference/models/template/export_inference_config_to_yaml.py delete mode 100644 
llm_on_ray/inference/models/template/inference_config_template.yaml create mode 100644 llm_on_ray/inference/models/templates/default_template.jinja create mode 100644 llm_on_ray/inference/models/templates/template_codellama.jinja create mode 100644 llm_on_ray/inference/models/templates/template_gemma.jinja create mode 100644 llm_on_ray/inference/models/templates/template_gpt2.jinja create mode 100644 llm_on_ray/inference/models/templates/template_llama2.jinja create mode 100644 llm_on_ray/inference/models/templates/template_mistral.jinja create mode 100644 llm_on_ray/inference/models/templates/template_neuralchat.jinja create mode 100644 tests/inference/test_chat_template.py diff --git a/.github/workflows/config/bloom-560m-ci.yaml b/.github/workflows/config/bloom-560m-ci.yaml index 16a97d896..06d0064bb 100644 --- a/.github/workflows/config/bloom-560m-ci.yaml +++ b/.github/workflows/config/bloom-560m-ci.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index e528123cc..b3927953b 100644 --- a/.github/workflows/config/gpt2-ci.yaml +++ b/.github/workflows/config/gpt2-ci.yaml @@ -14,10 +14,5 @@ ipex: model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml index 46be6eb57..d3d96a0e1 100644 --- a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml +++ b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 250004dc2..e0c0d6946 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -13,20 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. 
- - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml index 40051e0fa..a4fdd0709 100644 --- a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml +++ b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml @@ -13,20 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index 047d0008c..f13ec7e54 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/README.md b/README.md index 4728e5fc0..5ba4410fa 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt2", - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], "temperature": 0.7 }' diff --git a/docs/serve.md b/docs/serve.md index b822cd416..7a19e139d 100644 --- a/docs/serve.md +++ b/docs/serve.md @@ -52,7 +52,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": $MODEL_NAME, - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], "temperature": 0.7 }' diff --git a/examples/inference/api_server_openai/openai_tools_call_query.py b/examples/inference/api_server_openai/openai_tools_call_query.py index 897ddba13..6a3ea6990 100644 --- a/examples/inference/api_server_openai/openai_tools_call_query.py +++ b/examples/inference/api_server_openai/openai_tools_call_query.py @@ -75,11 +75,11 @@ ] messages = [ [ - {"role": "user", "content": "You are a helpful assistant"}, + {"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "What's the weather like in Boston today?"}, ], [ - {"role": "user", "content": "You are a helpful assistant"}, + {"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "Tell me a short joke?"}, ], ] diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index 536deb30e..2ef1ac0c6 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -58,7 +58,7 @@ body = { "model": args.model_name, "messages": [ - 
{"role": "assistant", "content": "You are a helpful assistant."}, + {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": args.input_text}, ], "stream": args.streaming_response, diff --git a/examples/inference/api_server_openai/query_http_requests_tool.py b/examples/inference/api_server_openai/query_http_requests_tool.py index 217f2b792..bc77c3272 100644 --- a/examples/inference/api_server_openai/query_http_requests_tool.py +++ b/examples/inference/api_server_openai/query_http_requests_tool.py @@ -73,7 +73,7 @@ messages = [ [ - {"role": "user", "content": "You are a helpful assistant"}, + {"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "What's the weather like in Boston today?"}, ], ] diff --git a/llm_on_ray/inference/chat_process.py b/llm_on_ray/inference/chat_process.py deleted file mode 100644 index 3ee238fb7..000000000 --- a/llm_on_ray/inference/chat_process.py +++ /dev/null @@ -1,222 +0,0 @@ -# -# Copyright 2023 The LLM-on-Ray Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class ChatModel: - human_id = "" - bot_id = "" - unknown_id = "" - MEANINGLESS_WORDS = ["", "", "<|endoftext|>", "
"] - stop_words = [""] - - def __init__(self, intro, human_id, bot_id, stop_words) -> None: - self.intro = intro - self.human_id = human_id - self.bot_id = bot_id - self.stop_words = stop_words - self.MEANINGLESS_WORDS.extend(self.stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - role, content = msg.role, msg.content - if role == "user": - prompt += f"{self.human_id}: {content}\n" - elif role == "assistant": - prompt += f"{self.bot_id}: {content}\n" - else: - prompt += f"{self.unknown_id}: {content}\n" - prompt += f"{self.bot_id}:" - return prompt - - def convert_output(self, output: str): - """Convert the model output to final answer.""" - human_id = self.human_id.strip() - bot_id = self.bot_id.strip() - if human_id != "": - output = output.split(human_id)[0] - if bot_id != "": - output = output.split(bot_id)[0] - for word in self.MEANINGLESS_WORDS: - output = output.replace(word, "") - text = output - # remove partial human_id or bot id - if "\n" in text and ( - human_id.startswith(text[text.rfind("\n") + 1 :]) - or bot_id.startswith(text[text.rfind("\n") + 1]) - ): - text = text[: text.rfind("\n")] - return text - - def get_prompt(self, messages): - """Generate response based on messages.""" - prompt = self.prepare_prompt(messages) - return prompt - - -class ChatModelGptJ(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id}:\n{content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id}:\n{content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelLLama(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - elif role == "tool": - prompt += f"{content}\n" - elif role == "system": - prompt += f"### system:\n{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelwithImage(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - text_prompt = [] - image_prompt = [] - for item in content: - if item["type"] == "text": - text_prompt.append(item["text"]) - elif item["type"] == "image_url": - image_prompt.append(item["image_url"]) - else: - raise 
ValueError(f"Unknown content type {item['type']}") - - content = "\n".join(text_prompt) - # prepare images - images = [] - for img in image_prompt: - if "url" not in img: - continue - is_data = len(re.findall("^data:image/.+;base64,", img["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", img["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(img["url"], stream=True).raw)) - - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt, images - - -class ChatModelGemma(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id} {content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id} {content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelNoFormat(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - msg = dict(msg) - prompt += msg["content"] - return prompt - - -if __name__ == "__main__": - process_tool = ChatModelGptJ( - "", "### Instruction", "### Response", stop_words=["##", "### Instruction"] - ) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py new file mode 100644 index 000000000..004081f03 --- /dev/null +++ b/llm_on_ray/inference/chat_template_process.py @@ -0,0 +1,86 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import List + +from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage +from llm_on_ray.inference.utils import parse_jinja_file + + +class ChatTemplatePreprocess: + def __init__(self, predictor) -> None: + self.predictor = predictor + + def get_prompt(self, input: List, is_mllm=False): + """Generate prompt based on chat templates.""" + self.predictor.tokenizer.chat_template = ( + parse_jinja_file(self.predictor.infer_conf.model_description.chat_template) + or self.predictor.tokenizer.chat_template + or parse_jinja_file(self.predictor.infer_conf.model_description.default_chat_template) + ) + """ChatMessage for OpenAI backend and dict for simple backend.""" + if input and isinstance(input[0], (ChatMessage, dict)): + messages = ( + [dict(chat_message) for chat_message in input] + if isinstance(input[0], ChatMessage) + else input + ) + if is_mllm: + texts, images = self._extract_messages(messages) + image = self._prepare_image(images) + prompt = self.predictor.tokenizer.apply_chat_template( + texts, add_generation_prompt=True, tokenize=False + ) + return prompt, image + + prompt = self.predictor.tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + return prompt + + raise TypeError(f"Unsupported type {type(input)} for text. Expected dict or list of dicts.") + + def _extract_messages(self, messages): + texts, images = [], [] + for message in messages: + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: list): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images: List = [] + for msg in messages: + msg = dict(msg) + content = msg["content"] + is_data = len(re.findall("^data:image/.+;base64,", content)) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content, stream=True).raw)) + return images diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 323adc548..a731af55f 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -119,6 +119,10 @@ class ModelDescription(BaseModel): input_processor: str = "AutoProcessor" model_loader: str = "AutoModel" + chat_model_with_image: bool = False + chat_template: Union[str, None] = None + default_chat_template: str = "llm_on_ray/inference/models/templates/default_template.jinja" + @validator("quantization_type") def _check_quant_type(cls, v: str): if v: diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 9ea2d77db..37e18acf4 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_codellama.jinja" diff --git 
a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index ba2a6d962..12d2b0372 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml index 75e646a44..adc1d158c 100644 --- a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml +++ b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml @@ -10,16 +10,7 @@ device: cpu ipex: enabled: false precision: bf16 -model_description: +model_description: model_id_or_path: deepseek-ai/deepseek-coder-33b-instruct tokenizer_name_or_path: deepseek-ai/deepseek-coder-33b-instruct - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ['<|EOT|>', ""] - - - - + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 4e732a4fe..6e5bde761 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -13,15 +13,5 @@ ipex: model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot - chat_processor: ChatModelwithImage - input_processor: 'AutoProcessor' - model_loader: 'Pix2StructForConditionalGeneration' - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 8176a2689..119337d70 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 551a85789..77d33ff9b 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -13,15 +13,5 @@ ipex: model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b - chat_processor: ChatModelwithImage - input_processor: FuyuProcessor - model_loader: FuyuForCausalLM - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 8335857ca..5b013b371 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -13,13 +13,6 @@ ipex: model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b - chat_processor: ChatModelGemma - prompt: - intro: '' 
- human_id: 'user - {msg}' - bot_id: 'model - {msg}' - stop_words: [] config: use_auth_token: ' ' + chat_template: "llm_on_ray/inference/models/templates/template_gemma.jinja" diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index c7778c12e..9719b2f7e 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -14,17 +14,4 @@ ipex: model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: 'Below is an instruction that describes a task. Write a response that appropriately - completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 48287670a..06a4b1b8b 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -13,10 +13,5 @@ ipex: model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml index d68da8428..ab411ff0e 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml @@ -10,13 +10,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-70b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml index 374a98f77..b7b19f02a 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml @@ -8,13 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml index 6976eafb3..32cf9bb4e 100644 --- a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml @@ -9,13 +9,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml index 35e3fc260..d57ffcc22 100644 --- a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml @@ -8,13 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct 
tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 848358bec..64566a6d8 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -14,13 +14,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' - human_id: ' - - ### User' - bot_id: ' - - ### Assistant' + chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" diff --git a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml index 6a8523467..ed03ad82d 100644 --- a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml @@ -14,12 +14,7 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 ipexllm: true tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true load_in_4bit: true + chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja" diff --git a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml index d352a6517..ecb129973 100644 --- a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml @@ -14,19 +14,6 @@ model_description: model_id_or_path: mosaicml/mpt-7b-chat ipexllm: true tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. 
- - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true load_in_4bit: true diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 4b3e11e98..7fdae3933 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -13,13 +13,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index 1af9aad1b..ea50f6af7 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -13,11 +13,5 @@ model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index c8a0ff385..3654f18f0 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -14,11 +14,6 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true + chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja" diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 4ea12adb3..89ce086ed 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -13,18 +13,5 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 13a29676c..8f32c28b7 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -13,13 +13,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' 
- human_id: ' - - ### User' - bot_id: ' - - ### Assistant' + chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 545cd2145..81e05fc19 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -13,9 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 7130148a3..daa5256c5 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -12,11 +12,6 @@ ipex: model_description: model_id_or_path: defog/sqlcoder-7b-2 tokenizer_name_or_path: defog/sqlcoder-7b-2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ["```"] config: use_auth_token: '' + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 0da59ac02..199926353 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -13,11 +13,6 @@ device: cpu model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" diff --git a/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py b/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py deleted file mode 100644 index 74630030b..000000000 --- a/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py +++ /dev/null @@ -1,24 +0,0 @@ -# -# Copyright 2023 The LLM-on-Ray Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import yaml -import os -from llm_on_ray.inference.inference_config import InferenceConfig - -ic = InferenceConfig() - -with open(os.path.dirname(__file__) + "/inference_config_template.yaml", "w") as f: - yaml.dump(ic.dict(), f, sort_keys=False) diff --git a/llm_on_ray/inference/models/template/inference_config_template.yaml b/llm_on_ray/inference/models/template/inference_config_template.yaml deleted file mode 100644 index 137ddb2dc..000000000 --- a/llm_on_ray/inference/models/template/inference_config_template.yaml +++ /dev/null @@ -1,35 +0,0 @@ -port: 8000 -name: null -route_prefix: null -num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -hpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 -device: cpu -ipex: - enabled: true - precision: bf16 -model_description: - model_id_or_path: null - ipexllm:: false - tokenizer_name_or_path: null - chat_processor: null - gpt_base_model: false - quantized_model_id_or_path: null - quantization_type: null - peft_model_id_or_path: null - peft_type: null - use_hpu_graphs: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] - config: - trust_remote_code: false - use_auth_token: null - load_in_4bit: false - ipexllm_config: - load_in_low_bit: '' diff --git a/llm_on_ray/inference/models/templates/default_template.jinja b/llm_on_ray/inference/models/templates/default_template.jinja new file mode 100644 index 000000000..d098fc154 --- /dev/null +++ b/llm_on_ray/inference/models/templates/default_template.jinja @@ -0,0 +1,23 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message %} +{{ system_message }} +{% endif %} +{% if message['role'] == 'user' %} +{{ '### Instruction: ' + message['content'].strip() }} +{% elif message['role'] == 'assistant' %} +{{ '### Response:' + message['content'].strip() }} +{% endif %} +{% endfor %} +{% if add_generation_prompt %} +{{ '### Response:' }} +{% endif %} diff --git a/llm_on_ray/inference/models/templates/template_codellama.jinja b/llm_on_ray/inference/models/templates/template_codellama.jinja new file mode 100644 index 000000000..62213c3ec --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_codellama.jinja @@ -0,0 +1,22 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message != false %} +{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %} +{% else %} +{% set content = message['content'] %} +{% endif %} +{% if message['role'] == 'user' %} +{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }} +{% elif message['role'] == 'assistant' %} +{{ ' ' + content | trim + ' ' + eos_token }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_gemma.jinja 
b/llm_on_ray/inference/models/templates/template_gemma.jinja new file mode 100644 index 000000000..9dfacbb9a --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_gemma.jinja @@ -0,0 +1,18 @@ +{{ bos_token }} +{% if messages[0]['role'] == 'system' %} +{% set messages = messages[1:] %} +{% endif %} +{% for message in messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if (message['role'] == 'assistant') %} +{% set role = 'model' %} +{% else %} +{% set role = message['role'] %} +{% endif %} +{{ '' + role + '\n' + message['content'] | trim + '\n' }} +{% endfor %} +{% if add_generation_prompt %} +{{'model\n'}} +{% endif %} diff --git a/llm_on_ray/inference/models/templates/template_gpt2.jinja b/llm_on_ray/inference/models/templates/template_gpt2.jinja new file mode 100644 index 000000000..bdd0814ba --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_gpt2.jinja @@ -0,0 +1,20 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message %} +{{ system_message }} +{% endif %} +{% if message['role'] == 'user' %} +{{ message['content'].strip() }} +{% elif message['role'] == 'assistant' %} +{{ message['content'].strip() }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_llama2.jinja b/llm_on_ray/inference/models/templates/template_llama2.jinja new file mode 100644 index 000000000..1fcd5ec2d --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_llama2.jinja @@ -0,0 +1,21 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %}{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message != false %} +{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %} +{% else %} +{% set content = message['content'] %} +{% endif %} +{% if message['role'] == 'user' %} +{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }} +{% elif message['role'] == 'assistant' %} +{{ ' ' + content.strip() + ' ' + eos_token }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_mistral.jinja b/llm_on_ray/inference/models/templates/template_mistral.jinja new file mode 100644 index 000000000..1ad2bc4fc --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_mistral.jinja @@ -0,0 +1,16 @@ +{{ bos_token }} +{% if messages[0]['role'] == 'system' %} +{% set messages = messages[1:] %} +{% endif %} +{% for message in messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if message['role'] == 'user' %} +{{ '[INST] ' + message['content'] + ' [/INST]' }} +{% 
elif message['role'] == 'assistant' %} +{{ message['content'] + eos_token}} +{% else %} +{{ raise_exception('Only user and assistant roles are supported!') }} +{% endif %} +{% endfor %} diff --git a/llm_on_ray/inference/models/templates/template_neuralchat.jinja b/llm_on_ray/inference/models/templates/template_neuralchat.jinja new file mode 100644 index 000000000..872e225b6 --- /dev/null +++ b/llm_on_ray/inference/models/templates/template_neuralchat.jinja @@ -0,0 +1,23 @@ +{% if messages[0]['role'] == 'system' %} +{% set loop_messages = messages[1:] %} +{% set system_message = messages[0]['content'] %} +{% else %} +{% set loop_messages = messages %} +{% set system_message = false %} +{% endif %} +{% for message in loop_messages %} +{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} +{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} +{% endif %} +{% if loop.index0 == 0 and system_message != false %} +{{ '###System: ' + system_message.strip() }} +{% endif %} +{% if message['role'] == 'user' %} +{{ '###User: ' + message['content'].strip() }} +{% elif message['role'] == 'assistant' %} +{{ '###Assistant: ' + message['content'].strip() }} +{% endif %} +{% endfor %} +{% if add_generation_prompt %} +{{ '###Assistant: ' }} +{% endif %} diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml index acbf58455..9302b9be2 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml +++ b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index e96dd5563..f5ac35d80 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -26,6 +26,8 @@ from starlette.requests import Request from starlette.responses import StreamingResponse, JSONResponse from fastapi import HTTPException + +from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess from llm_on_ray.inference.inference_config import InferenceConfig from llm_on_ray.inference.api_openai_backend.openai_protocol import ( ChatMessage, @@ -52,31 +54,11 @@ def __init__( max_batch_size=_DEFAULT_MAX_BATCH_SIZE, ): self.device = torch.device(infer_conf.device) - self.process_tool = None - chat_processor_name = infer_conf.model_description.chat_processor - prompt = infer_conf.model_description.prompt self.handle_dynamic_batch.set_max_batch_size(max_batch_size) - - if chat_processor_name: - try: - module = __import__("chat_process") - except Exception: - sys.path.append(os.path.dirname(__file__)) - module = __import__("chat_process") - chat_processor = getattr(module, chat_processor_name, None) - if chat_processor is None: - raise ValueError( - infer_conf.name - + " deployment failed. chat_processor(" - + chat_processor_name - + ") does not exist." 
- ) - self.process_tool = chat_processor(**prompt.dict()) - self.use_deepspeed = infer_conf.deepspeed self.use_vllm = infer_conf.vllm.enabled - self.is_mllm = True if chat_processor_name in ["ChatModelwithImage"] else False + self.is_mllm = infer_conf.model_description.chat_model_with_image # Used to determine if openai backend is used self.use_openai = False @@ -103,6 +85,7 @@ def __init__( self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() + self.process_tool = ChatTemplatePreprocess(self.predictor) def consume_streamer(self, streamer): for text in streamer: @@ -351,6 +334,7 @@ def preprocess_prompts( Raises: HTTPException: If the input prompt format is invalid or not supported. """ + if isinstance(input, str): return input elif isinstance(input, List): @@ -376,7 +360,7 @@ def preprocess_prompts( # Process the input prompts with MLLM tool if self.process_tool is not None: if self.is_mllm: - input, image = self.process_tool.get_prompt(input) + input, image = self.process_tool.get_prompt(input, self.is_mllm) prompts.append(input) images.extend(image) return (prompts, images) @@ -403,16 +387,15 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON status_code=400, content="Invalid JSON format from http request.", ) - streaming_response = json_request["stream"] if "stream" in json_request else False input = json_request["text"] if "text" in json_request else "" + if input == "": return JSONResponse( status_code=400, content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 2d1a4d878..6712d7bfb 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # - +import os +import pathlib from transformers import StoppingCriteria, TextStreamer from ray.util.queue import Queue import torch @@ -194,3 +195,28 @@ def module_import_and_init(module_name, clazz, **clazzs_kwargs): module = importlib.import_module(module_name) class_ = getattr(module, clazz) return class_(**clazzs_kwargs) + + +def parse_jinja_file(chat_template: Union[str, None]): + if chat_template is None: + return None + + try: + # Get the absolute path of the provided chat template + jinja_path = os.path.abspath(chat_template) + + # If the user specifies a jinja file, the absolute path to jinja_path exists. + # If jinja_path does not exist, it means that the user did not specify jinja and the default jinja is used. 
+ if not os.path.exists(jinja_path): + jinja_path = str( + pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent.parent + / chat_template + ) + + with open(jinja_path, "r") as file: + content = file.read() + return content + except FileNotFoundError: + raise FileNotFoundError(f"File {jinja_path} not found.") + except Exception as e: + raise Exception(f"An error occurred: {str(e)}") diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index c30851a8e..5cdece259 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -31,11 +31,7 @@ from ray.util import queue from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig -from llm_on_ray.inference.chat_process import ( - ChatModelGptJ, - ChatModelLLama, - ChatModelwithImage, -) + from llm_on_ray.inference.predictor_deployment import PredictorDeployment from llm_on_ray.ui.html_format import cpu_memory_html, ray_status_html, custom_css from langchain.vectorstores import FAISS diff --git a/tests/inference/test_chat_template.py b/tests/inference/test_chat_template.py new file mode 100644 index 000000000..4a987a841 --- /dev/null +++ b/tests/inference/test_chat_template.py @@ -0,0 +1,227 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import pathlib + +import pytest +from transformers import AutoTokenizer + +from llm_on_ray.inference.utils import parse_jinja_file + +# Define the base path for templates +base_path = ( + pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent.parent + / "llm_on_ray/inference/models/templates" +) + + +# Define models, templates, and their corresponding expected outputs +MODEL_TEMPLATE_GENERATON_OUTPUT = [ + ( + "EleutherAI/gpt-j-6b", + base_path / "default_template.jinja", + True, + "Below is an instruction that describes a task. Write a response that " + "appropriately completes the request.\n" + "### Instruction: Hello\n" + "### Response:Hi there!\n" + "### Response:\n", + ), + ( + "EleutherAI/gpt-j-6b", + base_path / "default_template.jinja", + False, + "Below is an instruction that describes a task. 
Write a response that " + "appropriately completes the request.\n" + "### Instruction: Hello\n" + "### Response:Hi there!\n", + ), + ("gpt2", base_path / "template_gpt2.jinja", True, "Hello\nHi there!\nWhat is the capital of\n"), + ( + "gpt2", + base_path / "template_gpt2.jinja", + False, + "Hello\nHi there!\nWhat is the capital of\n", + ), + ( + "google/gemma-2b", + base_path / "template_gemma.jinja", + True, + "<|endoftext|>\n" + "user\n" + "Hello\n" + "\n" + "model\n" + "Hi there!\n" + "\n" + "user\n" + "What is the capital of\n" + "\n" + "model\n" + "\n", + ), + ( + "google/gemma-2b", + base_path / "template_gemma.jinja", + False, + "<|endoftext|>\n" + "user\n" + "Hello\n" + "\n" + "model\n" + "Hi there!\n" + "\n" + "user\n" + "What is the capital of\n" + "\n", + ), + ( + "mistralai/Mistral-7B-v0.1", + base_path / "template_mistral.jinja", + True, + "\n" + "[INST] Hello [/INST]\n" + "Hi there!\n" + "[INST] What is the capital of [/INST]\n", + ), + ( + "mistralai/Mistral-7B-v0.1", + base_path / "template_mistral.jinja", + False, + "\n" + "[INST] Hello [/INST]\n" + "Hi there!\n" + "[INST] What is the capital of [/INST]\n", + ), + ( + "Intel/neural-chat-7b-v3-1", + base_path / "template_neuralchat.jinja", + True, + "###System: You are a chatbot developed by Intel. Please answer all " + "questions to the best of your ability.\n" + "###User: Hello\n" + "###Assistant: Hi there!\n" + "###Assistant: \n", + ), + ( + "Intel/neural-chat-7b-v3-1", + base_path / "template_neuralchat.jinja", + False, + "###System: You are a chatbot developed by Intel. Please answer all " + "questions to the best of your ability.\n" + "###User: Hello\n" + "###Assistant: Hi there!\n", + ), + ( + "adept/fuyu-8b", + base_path / "template_llama2.jinja", + True, + "|ENDOFTEXT|[INST] Hello [/INST]\n" + " Hi there! |ENDOFTEXT|\n" + "|ENDOFTEXT|[INST] What is the capital of [/INST]\n", + ), + ( + "adept/fuyu-8b", + base_path / "template_llama2.jinja", + False, + "|ENDOFTEXT|[INST] Hello [/INST]\n" + " Hi there! |ENDOFTEXT|\n" + "|ENDOFTEXT|[INST] What is the capital of [/INST]\n", + ), + ( + "codellama/CodeLlama-7b-hf", + base_path / "template_codellama.jinja", + True, + "[INST] Hello [/INST]\n" + " Hi there! \n" + "[INST] What is the capital of [/INST]\n", + ), + ( + "codellama/CodeLlama-7b-hf", + base_path / "template_codellama.jinja", + False, + "[INST] Hello [/INST]\n" + " Hi there! \n" + "[INST] What is the capital of [/INST]\n", + ), +] + + +TEST_MESSAGES = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "What is the capital of"}, +] + +TEST_NEURALCHAT_MESSAGES = [ + { + "role": "system", + "content": "You are a chatbot developed by Intel. Please answer all questions to the best of your ability.", + }, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, +] + +TEST_DEFAULT_MESSAGES = [ + { + "role": "system", + "content": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.", + }, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, +] + + +@pytest.mark.parametrize( + "model,template,add_generation_prompt,expected_output", MODEL_TEMPLATE_GENERATON_OUTPUT +) +def test_get_gen_default_prompt( + model: object, template: object, add_generation_prompt: object, expected_output: object +) -> object: + # Initialize the tokenizer + tokenizer = AutoTokenizer.from_pretrained("gpt2") + tokenizer.chat_template = parse_jinja_file(template) + + if model == "mistralai/Mistral-7B-v0.1" or model == "codellama/CodeLlama-7b-hf": + tokenizer.bos_token = "" + tokenizer.eos_token = "" + elif model == "adept/fuyu-8b": + tokenizer.bos_token = "|ENDOFTEXT|" + tokenizer.eos_token = "|ENDOFTEXT|" + + # Call the function and get the result + if model == "Intel/neural-chat-7b-v3-1": + result = tokenizer.apply_chat_template( + conversation=TEST_NEURALCHAT_MESSAGES, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ) + elif model == "EleutherAI/gpt-j-6b": + result = tokenizer.apply_chat_template( + conversation=TEST_DEFAULT_MESSAGES, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ) + else: + result = tokenizer.apply_chat_template( + conversation=TEST_MESSAGES, tokenize=False, add_generation_prompt=add_generation_prompt + ) + # Test assertion + assert result == expected_output, ( + f"The generated prompt does not match the expected output for " + f"model {model} and template {template}" + ) diff --git a/tests/test_getting_started.sh b/tests/test_getting_started.sh index 6a900a553..a84bfe334 100755 --- a/tests/test_getting_started.sh +++ b/tests/test_getting_started.sh @@ -33,7 +33,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt2", - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], "temperature": 0.7 }'
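Note: the YAML changes above replace the per-model `chat_processor`/`prompt` blocks with a single `chat_template` entry pointing at a Jinja file, which `ChatTemplatePreprocess` applies through the tokenizer. Below is a minimal standalone sketch of that resolution order (template named in the model YAML, else the tokenizer's own template, else the bundled default); the model name and file paths are placeholders for illustration, not values required by this patch.

```python
# Sketch of the template resolution performed in ChatTemplatePreprocess.get_prompt().
# Assumptions: "gpt2" stands in for any HF model; paths are relative to the repo root.
from transformers import AutoTokenizer


def read_template(path):
    # Simplified stand-in for llm_on_ray.inference.utils.parse_jinja_file
    if path is None:
        return None
    with open(path, "r") as f:
        return f.read()


configured_template = "llm_on_ray/inference/models/templates/template_gpt2.jinja"  # model YAML: chat_template
default_template = "llm_on_ray/inference/models/templates/default_template.jinja"  # ModelDescription default

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.chat_template = (
    read_template(configured_template)  # 1) template named in the YAML, if any
    or tokenizer.chat_template          # 2) template shipped with the tokenizer
    or read_template(default_template)  # 3) repo-wide fallback
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
```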
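Note: the README, docs/serve.md, example scripts, and tests/test_getting_started.sh now send the leading message with role "system" instead of "assistant", which is the role the templates above key on. A hedged Python equivalent of the updated curl call is sketched below; the endpoint URL and model name are assumptions for illustration, following the $ENDPOINT_URL convention used in the docs.

```python
# Mirrors the updated curl example with a "system" message; endpoint/model are assumed values.
import json
import os

import requests

endpoint_url = os.environ.get("ENDPOINT_URL", "http://localhost:8000/v1")  # assumed default
body = {
    "model": "gpt2",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    "temperature": 0.7,
}
resp = requests.post(
    f"{endpoint_url}/chat/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps(body),
)
print(resp.json()["choices"][0]["message"]["content"])
```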
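Note: for image-capable models (deplot, fuyu-8b), the deleted ChatModelwithImage logic now lives in ChatTemplatePreprocess behind the new `chat_model_with_image: true` flag. The sketch below restates that text/image split and image loading as standalone helpers so the behavior can be tried outside the deployment; it follows the patch and adds no new project API.

```python
# Standalone restatement of _extract_messages/_prepare_image from chat_template_process.py.
import base64
import re
from io import BytesIO

import requests
from PIL import Image


def split_mllm_messages(messages):
    """Separate text parts from image_url parts of OpenAI-style multimodal messages."""
    texts, images = [], []
    for message in messages:
        if message["role"] == "user" and isinstance(message["content"], list):
            texts.append({"role": "user", "content": message["content"][0]["text"]})
            images.append({"role": "user", "content": message["content"][1]["image_url"]["url"]})
        else:
            texts.append(message)
    return texts, images


def load_images(image_messages):
    """Decode base64 data URLs or fetch remote URLs into PIL images."""
    images = []
    for msg in image_messages:
        url = msg["content"]
        if re.match(r"^data:image/.+;base64,", url):
            encoded = re.sub(r"^data:image/.+;base64,", "", url)
            images.append(Image.open(BytesIO(base64.b64decode(encoded))))
        else:
            images.append(Image.open(requests.get(url, stream=True).raw))
    return images
```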