Skip to content

Commit

Permalink
chore: add Qwen2.5 coder (#39)
Browse files Browse the repository at this point in the history
  • Loading branch information
xianml authored Nov 15, 2024
1 parent 99ba6bd commit db84e2f
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 2 deletions.
3 changes: 1 addition & 2 deletions src/llamacpp-chat/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,11 @@ async def catch_all(full_path: str):
@bentoml.mount_asgi_app(openai_api_app, path="/v1")
@bentoml.service(**SERVICE_CONFIG)
class LlamaCppChat:

# NOTE(review): leading indentation here reflects the diff rendering of the
# commit page; in the actual source these lines sit inside the LlamaCppChat
# class body.
# Class attribute: registers the Hugging Face model (id taken from
# ENGINE_CONFIG["model"]) with BentoML so the framework resolves/downloads
# it for the service — presumably replacing the previous download-at-init
# behavior (this commit removed two lines here; TODO confirm against the
# full file).
model = bentoml.models.HuggingFaceModel(ENGINE_CONFIG["model"])
def __init__(self) -> None:
# Deferred import: llama_cpp is only required once the service actually
# starts, keeping module import cheap.
from llama_cpp import Llama
# Build the llama.cpp handle; ENGINE_CONFIG supplies the repo/filename
# and engine options via **kwargs. verbose=False silences llama.cpp's
# start-up banner/log output.
self.llm = Llama.from_pretrained(
**ENGINE_CONFIG,
verbose=False,
)

@bentoml.api(route="/v1/chat/completions")
Expand Down
63 changes: 63 additions & 0 deletions src/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1045,3 +1045,66 @@
gpu_type: nvidia-l4
traffic:
timeout: 300
# Qwen2.5-Coder 7B Instruct (GGUF, fp16) served through the llamacpp-chat
# project on macOS.
'qwen2.5-coder:7b-instruct-ggml-fp16-darwin':
  project: llamacpp-chat
  extra_labels:
    model_name: Qwen/Qwen2.5-Coder-7B-Instruct-GGUF
    openllm_alias: 7b-ggml-fp16-darwin
  platforms: macos
  extra_envs:
    # Build llama.cpp with the Metal backend for Apple-GPU acceleration.
    - name: CMAKE_ARGS
      value: "-DGGML_METAL=on"
  engine_config:
    max_model_len: 2048
    # The fp16 GGUF weights are split into 4 shards; all must be fetched.
    additional_files:
      - qwen2.5-coder-7b-instruct-fp16-00001-of-00004.gguf
      - qwen2.5-coder-7b-instruct-fp16-00002-of-00004.gguf
      - qwen2.5-coder-7b-instruct-fp16-00003-of-00004.gguf
      - qwen2.5-coder-7b-instruct-fp16-00004-of-00004.gguf
    # Entry point is the first shard; llama.cpp locates the rest from it.
    filename: qwen2.5-coder-7b-instruct-fp16-00001-of-00004.gguf
    repo_id: Qwen/Qwen2.5-Coder-7B-Instruct-GGUF
  service_config:
    name: qwen2.5-coder
    resources:
      # NOTE(review): 16Gi presumably sized for the ~14GB fp16 7B weights
      # plus runtime overhead — confirm against actual memory usage.
      memory: 16Gi
    traffic:
      timeout: 300
# Qwen2.5-Coder 7B Instruct (GGUF, fp16) served through the llamacpp-chat
# project on Linux (CPU).
'qwen2.5-coder:7b-instruct-ggml-fp16-linux':
  project: llamacpp-chat
  extra_labels:
    model_name: Qwen/Qwen2.5-Coder-7B-Instruct-GGUF
    openllm_alias: 7b-ggml-fp16-linux
  platforms: linux
  extra_envs:
    # Build llama.cpp against OpenBLAS for accelerated CPU inference.
    - name: CMAKE_ARGS
      value: "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
  engine_config:
    max_model_len: 2048
    # Same 4-shard fp16 GGUF weights as the darwin entry above.
    additional_files:
      - qwen2.5-coder-7b-instruct-fp16-00001-of-00004.gguf
      - qwen2.5-coder-7b-instruct-fp16-00002-of-00004.gguf
      - qwen2.5-coder-7b-instruct-fp16-00003-of-00004.gguf
      - qwen2.5-coder-7b-instruct-fp16-00004-of-00004.gguf
    # Entry point is the first shard; llama.cpp locates the rest from it.
    filename: qwen2.5-coder-7b-instruct-fp16-00001-of-00004.gguf
    repo_id: Qwen/Qwen2.5-Coder-7B-Instruct-GGUF
  service_config:
    name: qwen2.5-coder
    resources:
      memory: 16Gi
    traffic:
      timeout: 300
# Qwen2.5-Coder 7B Instruct served through the vllm-chat project on an
# NVIDIA L4 GPU (full-precision HF weights, not GGUF).
'qwen2.5-coder:7b-instruct':
  project: vllm-chat
  extra_labels:
    model_name: Qwen/Qwen2.5-Coder-7B-Instruct
    openllm_alias: 7b-instruct
  service_config:
    name: qwen2.5-coder
    resources:
      gpu: 1
      gpu_type: nvidia-l4
    traffic:
      timeout: 300
  engine_config:
    model: Qwen/Qwen2.5-Coder-7B-Instruct
    # NOTE(review): 20480 is 10x the 2048 used by the llama.cpp entries —
    # presumably intentional for the GPU deployment; confirm it fits L4
    # (24GB) KV-cache memory.
    max_model_len: 20480

0 comments on commit db84e2f

Please sign in to comment.