Add benchmark run script, figure visualization script #195

Merged: 32 commits, Jun 4, 2024
2 changes: 1 addition & 1 deletion .github/workflows/workflow_inference.yml
@@ -153,4 +153,4 @@ jobs:
        run: |
          TARGET=${{steps.target.outputs.target}}
          source dev/scripts/ci-functions.sh
-         stop_container ${TARGET}
+         stop_container ${TARGET}
5 changes: 5 additions & 0 deletions .github/workflows/workflow_orders_nightly.yml
@@ -20,6 +20,11 @@ jobs:
    uses: ./.github/workflows/workflow_finetune.yml
    with:
      ci_type: nightly

  call-benchmark:
    uses: ./.github/workflows/workflow_test_benchmark.yml
    with:
      ci_type: nightly

  # call-finetune-on-intel-gpu:
  #   uses: ./.github/workflows/workflow_finetune_gpu.yml
4 changes: 4 additions & 0 deletions .github/workflows/workflow_orders_on_merge.yml
@@ -27,3 +27,7 @@ jobs:
  Finetune:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune.yml

  Benchmark:
    needs: Lint
    uses: ./.github/workflows/workflow_test_benchmark.yml
4 changes: 4 additions & 0 deletions .github/workflows/workflow_orders_on_pr.yml
@@ -27,3 +27,7 @@ jobs:
  Finetune:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune.yml

  Benchmark:
    needs: Lint
    uses: ./.github/workflows/workflow_test_benchmark.yml
127 changes: 127 additions & 0 deletions .github/workflows/workflow_test_benchmark.yml
@@ -0,0 +1,127 @@
name: Benchmark

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'
      runner_container_image:
        type: string
        default: '10.1.2.13:5000/llmray-build'
      http_proxy:
        type: string
        default: 'http://10.24.221.169:911'
      https_proxy:
        type: string
        default: 'http://10.24.221.169:911'
      runner_config_path:
        type: string
        default: '/home/ci/llm-ray-actions-runner'
      code_checkout_path:
        type: string
        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
      model_cache_path:
        type: string
        default: '/mnt/DP_disk1/huggingface/cache'

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-bench
  cancel-in-progress: true

jobs:
  setup-test:

    name: benchmark

    runs-on: self-hosted

    defaults:
      run:
        shell: bash
    container:
      image: ${{ inputs.runner_container_image }}
      env:
        http_proxy: ${{ inputs.http_proxy }}
        https_proxy: ${{ inputs.https_proxy }}
        SHELL: bash -eo pipefail
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
        - ${{ inputs.runner_config_path }}:/root/actions-runner-config

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Load environment variables
        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV

      - name: Determine Target
        id: "target"
        run: |
          target="benchmark"
          target="${target}_vllm"
          echo "target is ${target}"
          echo "target=$target" >> $GITHUB_OUTPUT

      - name: Build Docker Image
        run: |
          DF_SUFFIX=".vllm"
          TARGET=${{steps.target.outputs.target}}
          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
          docker container prune -f
          docker image prune -f

      - name: Start Docker Container
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
          # check and remove exited container
          cid=$(docker ps -a -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker rm $cid; fi
          docker run -tid -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -e http_proxy=${{ inputs.http_proxy }} -e https_proxy=${{ inputs.https_proxy }} --name="${TARGET}" --hostname="${TARGET}-container" ${TARGET}:latest

      - name: Start Ray Cluster
        run: |
          TARGET=${{steps.target.outputs.target}}
          docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh"

      - name: Run Benchmark Test
        run: |
          TARGET=${{steps.target.outputs.target}}
          # Additional libraries required for pytest
          docker exec "${TARGET}" bash -c "pip install -r tests/requirements.txt"
          CMD=$(cat << EOF
          import yaml
          conf_path = "llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml"
          with open(conf_path, encoding="utf-8") as reader:
              result = yaml.load(reader, Loader=yaml.FullLoader)
              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
          with open(conf_path, 'w') as output:
              yaml.dump(result, output, sort_keys=False)
          conf_path = "llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml"
          with open(conf_path, encoding="utf-8") as reader:
              result = yaml.load(reader, Loader=yaml.FullLoader)
              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
          with open(conf_path, 'w') as output:
              yaml.dump(result, output, sort_keys=False)
          EOF
          )
          docker exec "${TARGET}" python -c "$CMD"
          docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
          docker exec "${TARGET}" bash -c "./tests/run-tests-benchmark.sh"
      - name: Stop Ray
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then
            docker exec "${TARGET}" bash -c "ray stop"
          fi

      - name: Stop Container
        if: success() || failure()
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
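For reference, the YAML-patching heredoc in the Run Benchmark Test step can also be run as a standalone script outside CI. The sketch below assumes PyYAML is installed and that the token is supplied through a local HF_ACCESS_TOKEN environment variable rather than the workflow's env context; the paths are the two configs patched above.

# Minimal standalone sketch of the token-patching step above (assumption:
# HF_ACCESS_TOKEN is exported locally; in CI the workflow substitutes
# ${{ env.HF_ACCESS_TOKEN }} into the heredoc instead).
import os

import yaml

token = os.environ["HF_ACCESS_TOKEN"]
conf_paths = [
    "llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml",
    "llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml",
]
for conf_path in conf_paths:
    with open(conf_path, encoding="utf-8") as reader:
        result = yaml.load(reader, Loader=yaml.FullLoader)
    # Both benchmark configs need a token because the Llama-2 weights are gated.
    result["model_description"]["config"]["use_auth_token"] = token
    with open(conf_path, "w") as output:
        yaml.dump(result, output, sort_keys=False)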
2 changes: 1 addition & 1 deletion README.md
@@ -30,7 +30,7 @@ LLM-on-Ray's modular workflow structure is designed to comprehensively cater to
* **Interactive Web UI for Enhanced Usability**: In addition to the command line, LLM-on-Ray introduces a Web UI, allowing users to easily finetune and deploy LLMs through a user-friendly interface. Additionally, the UI includes a chatbot application, enabling users to immediately test and refine the models.


-![llm-on-ray](https://github.com/intel/llm-on-ray/assets/9278199/68017c14-c0be-4b91-8d71-4b74ab89bd81)
+![llm-on-ray](./docs/assets/solution_technical_overview.png)


## Getting Started
21 changes: 17 additions & 4 deletions benchmarks/benchmark_serving.py
@@ -308,11 +308,18 @@ async def send_request(
            if args.track_token_latency:
                generate_len = len(tokenizer.encode(response_text))
            else:
-               response_content = json.loads(response_text)
-               if isinstance(response_content, list):
-                   generate_len = response_content[0]["generate_length"]
+               if vllm_engine:
+                   length_name = "num_generated_tokens"
                else:
-                   generate_len = response_content["generate_length"]
+                   length_name = "generate_length"
+               try:
+                   response_content = json.loads(response_text)
+                   if isinstance(response_content, list):
+                       generate_len = response_content[0][length_name]
+                   else:
+                       generate_len = response_content[length_name]
+               except Exception:
+                   generate_len = None
        else:
            if args.track_token_latency:
                response_content = chunks[-2].decode("utf-8")
@@ -470,6 +477,7 @@ def main(args: argparse.Namespace):
        config["top_p"] = float(args.top_p)
    if args.top_k:
        config["top_k"] = float(args.top_k)
    config["do_sample"] = args.do_sample
    # In order to align with vllm test parameters
    if args.vllm_engine:
        config["ignore_eos"] = True
@@ -727,6 +735,11 @@ def main(args: argparse.Namespace):
        help="The number of highest probability vocabulary tokens to keep \
            for top-k-filtering.",
    )
    parser.add_argument(
        "--do_sample",
        action="store_true",
        help="Whether or not to use sampling; use greedy decoding otherwise.",
    )
    parser.add_argument(
        "--vllm-engine",
        action="store_true",
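The length-parsing change in send_request above amounts to the following standalone sketch; parse_generate_len is a hypothetical helper name and the example payloads are illustrative, not captured server output.

import json
from typing import Optional


def parse_generate_len(response_text: str, vllm_engine: bool) -> Optional[int]:
    # vLLM responses report the token count under a different key than the
    # default llm-on-ray serving path, so pick the field name first.
    length_name = "num_generated_tokens" if vllm_engine else "generate_length"
    try:
        response_content = json.loads(response_text)
        if isinstance(response_content, list):
            return response_content[0][length_name]
        return response_content[length_name]
    except Exception:
        # Mirror the new except branch: non-JSON bodies or missing fields
        # leave the generated length unknown.
        return None


# Illustrative payloads only:
print(parse_generate_len('{"generate_length": 42}', vllm_engine=False))         # 42
print(parse_generate_len('[{"num_generated_tokens": 128}]', vllm_engine=True))  # 128
print(parse_generate_len("not json", vllm_engine=False))                        # None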