From 7463d47bad9cad1ab417425fa2c3590cf4c217b8 Mon Sep 17 00:00:00 2001
From: Jiafu Zhang
Date: Wed, 20 Dec 2023 21:18:19 +0800
Subject: [PATCH] verify

Signed-off-by: Jiafu Zhang
---
 .github/workflows/workflow_finetune.yml        | 153 ++++++++++++++++++
 .github/workflows/workflow_finetune_gpu.yml    |  36 +++++
 .github/workflows/workflow_inference.yml       |  13 +-
 .github/workflows/workflow_orders on_merge.yml |  23 +++
 .github/workflows/workflow_orders_nightly.yml  |  20 +++
 ...w_orders.yml => workflow_orders_on_pr.yml}  |   3 +
 6 files changed, 243 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/workflow_finetune.yml
 create mode 100644 .github/workflows/workflow_finetune_gpu.yml
 create mode 100644 .github/workflows/workflow_orders on_merge.yml
 create mode 100644 .github/workflows/workflow_orders_nightly.yml
 rename .github/workflows/{workflow_orders.yml => workflow_orders_on_pr.yml} (82%)

diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml
new file mode 100644
index 000000000..80c9e715d
--- /dev/null
+++ b/.github/workflows/workflow_finetune.yml
@@ -0,0 +1,153 @@
+name: Finetune
+
+on:
+  workflow_call:
+    inputs:
+      ci_type:
+        type: string
+        default: 'pr'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft
+  cancel-in-progress: true
+
+jobs:
+  inference:
+    name: finetune test
+    strategy:
+      matrix:
+        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b ]
+        isPR:
+          - ${{inputs.ci_type == 'pr'}}
+
+        exclude:
+          - { isPR: true }
+        include:
+          - { model: "EleutherAI/gpt-j-6b"}
+          - { model: "meta-llama/Llama-2-7b-chat-hf"}
+
+    runs-on: self-hosted
+
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Load environment variables
+        run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV
+
+      - name: Build Docker Image
+        run: |
+          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes
+          docker image prune -f
+
+      - name: Start Docker Container
+        run: |
+          cid=$(docker ps -q --filter "name=finetune")
+          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
+          docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="finetune" --hostname="finetune-container" finetune:latest
+      - name: Run Finetune Test
+        run: |
+          docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
+          CMD=$(cat << EOF
+          conf_path = "finetune/finetune.conf"
+          with open(conf_path, encoding="utf-8") as reader:
+              result = eval(reader.read())
+          result['General']['base_model'] = "${{ matrix.model }}"
+          if "${{ matrix.model }}" == "mosaicml/mpt-7b-chat":
+              result['General']['config']['trust_remote_code'] = True
+          else:
+              result['General']['config']['trust_remote_code'] = False
+          if "${{ matrix.model }}" == "EleutherAI/gpt-j-6b" or "${{ matrix.model }}" == "gpt2":
+              result['General']["gpt_base_model"] = True
+          else:
+              result['General']["gpt_base_model"] = False
+          if "${{ matrix.model }}" == "meta-llama/Llama-2-7b-chat-hf":
+              result['General']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+          else:
+              result['General']["config"]["use_auth_token"] = None
+          result['Training']['epochs'] = 1
+          if "${{ matrix.model }}" == "gpt2":
+              # to verify oneccl
+              result['Training']['num_training_workers'] = 2
+          else:
+              result['Training']['num_training_workers'] = 1
+          result['General']['lora_config'] = None
+          with open(conf_path, 'w') as output:
+              print(result, file=output)
+          EOF
+          )
+          docker exec "finetune" python -c "$CMD"
+          docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
+      - name: Run PEFT-LoRA Test
+        run: |
+          docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
+          CMD=$(cat << EOF
+          conf_path = "finetune/finetune.conf"
+          with open(conf_path, encoding="utf-8") as reader:
+              result = eval(reader.read())
+          result['General']['lora_config'] = {
+              "task_type": "CAUSAL_LM",
+              "r": 8,
+              "lora_alpha": 32,
+              "lora_dropout": 0.1
+          }
+          with open(conf_path, 'w') as output:
+              print(result, file=output)
+          EOF
+          )
+          docker exec "finetune" python -c "$CMD"
+          docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
+      - name: Run Deltatuner Test on DENAS-LoRA Model
+        run: |
+          if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf)$ ]]; then
+            echo ${{ matrix.model }} is not supported!
+          else
+            docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
+            CMD=$(cat << EOF
+            import os
+            os.system("cp -r $(python -m pip show deltatuner | grep Location | cut -d: -f2)/deltatuner/conf/best_structure examples/")
+            conf_path = "finetune/finetune.conf"
+            with open(conf_path, encoding="utf-8") as reader:
+                result = eval(reader.read())
+            result['General']['lora_config'] = {
+                "task_type": "CAUSAL_LM",
+                "r": 8,
+                "lora_alpha": 32,
+                "lora_dropout": 0.1
+            }
+            result['General']['deltatuner_config'] = {
+                "algo": "lora",
+                "denas": True,
+                "best_model_structure": f"examples/best_structure/${{ matrix.model }}-best_structure.jsonl",
+            }
+            with open(conf_path, 'w') as output:
+                print(result, file=output)
+            EOF)
+            docker exec "finetune" python -c "$CMD"
+            docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
+          fi
+      - name: Stop Ray
+        run: |
+          cid=$(docker ps -q --filter "name=finetune")
+          if [[ ! -z "$cid" ]]; then
+            docker exec "finetune" bash -c "ray stop"
+          fi
+
+      - name: Stop Container
+        if: success() || failure()
+        run: |
+          cid=$(docker ps -q --filter "name=finetune")
+          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
+      - name: Test Summary
+        run: echo "to be continued"
diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml
new file mode 100644
index 000000000..f18e4eaf5
--- /dev/null
+++ b/.github/workflows/workflow_finetune_gpu.yml
@@ -0,0 +1,36 @@
+name: Finetune on Intel GPU
+
+on:
+  workflow_call:
+
+jobs:
+  finetune:
+    name: finetune on gpu test
+    strategy:
+      matrix:
+        model: [ pythia-6.9b, gpt-j-6b ]
+    runs-on: self-hosted
+
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Running task on Intel GPU
+        run: |
+          rm ~/borealis-runner/llm-on-ray.tar.gz -f
+          tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray .
+          cd ~/borealis-runner/
+          python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"
+      - name: Test Summary
+        run: echo "to be continued"
\ No newline at end of file
diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml
index c34acbbd0..21c510a40 100644
--- a/.github/workflows/workflow_inference.yml
+++ b/.github/workflows/workflow_inference.yml
@@ -32,16 +32,18 @@ jobs:
            model: mpt-7b
 
     runs-on: self-hosted
+
     defaults:
       run:
         shell: bash
     container:
-      image: 10.1.2.13:5000/llmray-build
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
       env:
-        http_proxy: http://proxy-chain.intel.com:911
-        https_proxy: http://proxy-chain.intel.com:911
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
       volumes:
         - /var/run/docker.sock:/var/run/docker.sock
+
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -64,7 +66,7 @@
             DF_SUFFIX=".cpu_and_deepspeed"
           fi
           PREFIX=${{steps.prefix.outputs.prefix}}
-          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=http://proxy-chain.intel.com:911 --build-arg https_proxy=http://proxy-chain.intel.com:911 -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes
+          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes
           docker image prune -f
 
       - name: Start Docker Container
@@ -73,7 +75,8 @@
           cid=$(docker ps -q --filter "name=${PREFIX}")
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
           echo "pass is ${GITHUB_WORKSPACE}"
-          docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v /home/ci/actions-runner/_work/llm-on-ray/llm-on-ray:/root/llm-on-ray -e http_proxy=http://proxy-chain.intel.com:911 -e https_proxy=http://proxy-chain.intel.com:911 --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest
+          docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest
+          docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest
 
       - name: Start Ray Cluster
         run: |
diff --git a/.github/workflows/workflow_orders on_merge.yml b/.github/workflows/workflow_orders on_merge.yml
new file mode 100644
index 000000000..e453f242b
--- /dev/null
+++ b/.github/workflows/workflow_orders on_merge.yml
@@ -0,0 +1,23 @@
+name: llm-ray inference & finetune
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - '.github/**'
+      - 'docker/**'
+      - 'common/**'
+      - 'dev/docker/**'
+      - 'finetune/**'
+      - 'inference/**'
+      - 'rlhf/**'
+      - 'tools/**'
+
+jobs:
+
+  call-inference:
+    uses: ./.github/workflows/workflow_inference.yml
+
+  call-finetune:
+    uses: ./.github/workflows/workflow_finetune.yml
diff --git a/.github/workflows/workflow_orders_nightly.yml b/.github/workflows/workflow_orders_nightly.yml
new file mode 100644
index 000000000..9ee0fd202
--- /dev/null
+++ b/.github/workflows/workflow_orders_nightly.yml
@@ -0,0 +1,20 @@
+name: llm-ray inference & finetune nightly
+
+on:
+  schedule:
+    - cron: "0 16 * * *"
+
+jobs:
+
+  call-inference:
+    uses: ./.github/workflows/workflow_inference.yml
+    with:
+      ci_type: nightly
+
+  call-finetune:
+    uses: ./.github/workflows/workflow_finetune.yml
+    with:
+      ci_type: nightly
+
+  call-finetune-on-intel-gpu:
+    uses: ./.github/workflows/workflow_finetune_gpu.yml
\ No newline at end of file
diff --git a/.github/workflows/workflow_orders.yml b/.github/workflows/workflow_orders_on_pr.yml
similarity index 82%
rename from .github/workflows/workflow_orders.yml
rename to .github/workflows/workflow_orders_on_pr.yml
index 605b4cfc9..e13bccecf 100644
--- a/.github/workflows/workflow_orders.yml
+++ b/.github/workflows/workflow_orders_on_pr.yml
@@ -18,3 +18,6 @@ jobs:
 
   call-inference:
     uses: ./.github/workflows/workflow_inference.yml
+
+  call-finetune:
+    uses: ./.github/workflows/workflow_finetune.yml