From b46cff0918dac98b4ce85ca6ee4fae998c6cc476 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Wed, 22 Jan 2025 09:56:45 +0800 Subject: [PATCH 1/9] improve workflows --- .github/workflows/deepspeed.yaml | 220 ++++++++++++++++--------------- .github/workflows/llamacpp.yaml | 85 ++++++------ 2 files changed, 160 insertions(+), 145 deletions(-) diff --git a/.github/workflows/deepspeed.yaml b/.github/workflows/deepspeed.yaml index ebd76df2e..047ebf255 100644 --- a/.github/workflows/deepspeed.yaml +++ b/.github/workflows/deepspeed.yaml @@ -1,8 +1,5 @@ -name: huawei-ascend-npu +name: Unit tests with DeepSpeed on Ascend NPU -defaults: - run: - shell: bash -ieo pipefail {0} on: workflow_dispatch: pull_request: @@ -16,118 +13,129 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -permissions: - contents: read - issues: write +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. +defaults: + run: + shell: bash -el {0} jobs: - unit-tests: - runs-on: [self-hosted, ascend, npu] + deepspeed-ut: + if: ${{ github.repository_owner == 'Ascend' }} + name: Run unit tests with DeepSpeed + runs-on: self-hosted container: - image: ascendai/cann - ports: - - 80 + image: ascendai/cann:latest volumes: - /usr/local/dcmi:/usr/local/dcmi - - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi + - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info - - /etc/ascend_install.info:/etc/ascend_install.info - options: --network host - --name deepspeed_unit-tests - --device /dev/davinci0 - --device /dev/davinci_manager - --device /dev/devmm_svm - --device /dev/hisi_hdc - --shm-size "20g" - --entrypoint /bin/bash - + - /etc/ascend_install.info:/etc/ascend_install.info + options: >- + --network host + --device /dev/davinci0 + --device /dev/davinci_manager + --device /dev/devmm_svm + --device /dev/hisi_hdc steps: - - uses: actions/checkout@v4 - - - name: Install pytorch - run: | - npu-smi info - apt-get update - apt-get install sudo - pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple - source /root/.bashrc - - pip install torch==2.2.0 torchvision==0.17.0 torch_npu==2.2.0 torchaudio==2.2.0 numpy==1.26.4 cloudpickle tornado ml-dtypes - - python << EOF - if __name__ == '__main__': - import torch - import torch_npu - torch_npu.npu.set_device("npu:0") - print(f"Device Name: {torch.npu.get_device_name(0)}") - print(f"Device Count: {torch.npu.device_count()}") - print(f"Device Available: {torch.npu.is_available()}") - EOF - - - name: Install transformers - uses: nick-fields/retry@v3 - with: - timeout_minutes: 30 - max_attempts: 3 - retry_on: error - command: | - source /root/.bashrc - echo "y" | apt-get install git - git clone https://github.com/huggingface/transformers - cd transformers - git rev-parse --short HEAD + - name: Show NPU info + run: | + npu-smi info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Install system dependencies + run: | + apt-get update + apt-get install -y \ + git gcc g++ make cmake ninja-build + + - name: Checkout + uses: actions/checkout@v4 + + - name: Install pytorch + run: | + pip install \ + torch==2.2.0 \ + torch_npu==2.2.0 \ + torchvision==0.17.0 \ + torchaudio==2.2.0 \ + numpy==1.26.4 \ + cloudpickle \ + tornado \ + ml-dtypes + + python << EOF + if __name__ == '__main__': + import torch + import torch_npu + torch_npu.npu.set_device("npu:0") + print(f"Device Name: {torch.npu.get_device_name(0)}") + print(f"Device Count: {torch.npu.device_count()}") + print(f"Device Available: {torch.npu.is_available()}") + EOF + + - name: Checkout transformers + uses: actions/checkout@v4 + with: + repository: huggingface/transformers + path: transformers + + - name: Install transformers + working-directory: transformers + run: | pip install . - - - name: Install deepspeed - uses: nick-fields/retry@v3 - with: - timeout_minutes: 30 - max_attempts: 3 - retry_on: error - command: | - source /root/.bashrc - git clone --depth=1 https://github.com/microsoft/DeepSpeed.git + + - name: Checkout deepspeed + uses: actions/checkout@v4 + with: + repository: microsoft/DeepSpeed + path: deepspeed + + - name: Install deepspeed dependencies + run: | pip install -r requirements/requirements_deepspeed.txt - cd DeepSpeed + + - name: Install deepspeed + working-directory: deepspeed + run: | + pip install . pip install .[1bit,autotuning,inf] - + ds_report - - name: Python environment - run: | - source /root/.bashrc - pip list - - - name: Unit tests - run: | - source /root/.bashrc - unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - cd DeepSpeed/tests/unit/ - - pytest --verbose accelerator/* - pytest --verbose autotuning/* - pytest --verbose checkpoint/test_reshape_checkpoint.py - pytest --verbose checkpoint/test_moe_checkpoint.py - pytest --verbose checkpoint/test_shared_weights.py - pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py - pytest --verbose model_parallelism/* - pytest --verbose moe/test_moe_tp.py - pytest --verbose monitor/* - pytest --verbose utils/* - pytest --verbose runtime/test_ds_config_model.py - pytest --verbose runtime/pipe/test_pipe_schedule.py - pytest --verbose runtime/zero/test_zero_config.py - pytest --verbose runtime/zero/test_zero_tiled.py - pytest --verbose runtime/zero/test_zeropp.py - pytest --verbose runtime/test_autocast.py - pytest --verbose runtime/test_data.py - pytest --verbose runtime/test_runtime_utils.py - pytest --verbose runtime/activation_checkpointing/* - pytest --verbose runtime/utils/* - pytest --verbose runtime/zero/test_zero_dynamic_class.py - - - - - + - name: Show environment info + run: | + pip list + + - name: Run unit tests + working-directory: deepspeed/tests/unit + run: | + unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + + pytest --verbose accelerator/* + pytest --verbose autotuning/* + pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose checkpoint/test_moe_checkpoint.py + pytest --verbose checkpoint/test_shared_weights.py + pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose model_parallelism/* + pytest --verbose moe/test_moe_tp.py + pytest --verbose monitor/* + pytest --verbose utils/* + pytest --verbose runtime/test_ds_config_model.py + pytest --verbose runtime/pipe/test_pipe_schedule.py + pytest --verbose runtime/zero/test_zero_config.py + pytest --verbose runtime/zero/test_zero_tiled.py + pytest --verbose runtime/zero/test_zeropp.py + pytest --verbose runtime/test_autocast.py + pytest --verbose runtime/test_data.py + pytest --verbose runtime/test_runtime_utils.py + pytest --verbose runtime/activation_checkpointing/* + pytest --verbose runtime/utils/* + pytest --verbose runtime/zero/test_zero_dynamic_class.py diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index 704a1399f..5fc0035a4 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -1,14 +1,11 @@ -name: llama.cpp +name: Build llama.cpp in CANN container -defaults: - run: - shell: bash -el {0} on: workflow_dispatch: pull_request: - # paths: - # - '.github/workflows/llamacpp.yaml' - # - 'requirements/**' + paths: + - '.github/workflows/llamacpp.yaml' + - 'requirements/**' schedule: - cron: "0 0 * * *" @@ -16,39 +13,49 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -permissions: - contents: read - issues: write +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. +defaults: + run: + shell: bash -el {0} jobs: - unit-tests: - if: contains(github.event.pull_request.labels.*.name, 'Ascend NPU') - - runs-on: ubuntu-latest - strategy: - matrix: - build: ['Release'] - cann: ['openeuler-python3.10-cann8.0.rc3.beta1'] - device: ['ascend910b3'] - container: - image: ascendai/cann:${{ matrix.cann }} - steps: - - uses: actions/checkout@v4 - - name: Install llamacpp - uses: nick-fields/retry@v3 + openeuler-arm64-test: + if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} + name: Build llama.cpp on OpenEuler 22.03 for Arm64 + runs-on: ubuntu-24.04-arm + strategy: + matrix: + cann: + - '8.0.rc3.beta1-910b-openeuler22.03-py3.10' + device: + - 'ascend910b3' + build: + - 'Release' + container: ascendai/cann:${{ matrix.cann }} + steps: + - name: Install dependencies + run: | + yum update -y + yum install -y git gcc gcc-c++ make cmake + + - name: Checkout + uses: actions/checkout@v4 + + - name: Checkout llama.cpp + uses: actions/checkout@v4 with: - timeout_minutes: 30 - max_attempts: 3 - retry_on: error - command: | - yum update -y - yum install git cmake gcc gcc-c++ make -y - git clone https://github.com/ggerganov/llama.cpp.git - - - name: Build + repository: ggerganov/llama.cpp + path: llama.cpp + + - name: Build llama.cpp + working-directory: llama.cpp run: | - cd llama.cpp - mkdir build - cd build - export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib/:${LD_LIBRARY_PATH} - cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_CANN=on -DSOC_TYPE=${{ matrix.device }} && cmake --build . -j $(nproc) + mkdir build + cd build + + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/aarch64-linux/devlib/:${LD_LIBRARY_PATH} + + cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_CANN=on -DSOC_TYPE=${{ matrix.device }} + cmake --build . -j $(nproc) From 16f51b1604b059e66c51ab2f6c9434d87e7c171c Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Wed, 22 Jan 2025 10:03:52 +0800 Subject: [PATCH 2/9] update --- .github/workflows/deepspeed.yaml | 2 -- .github/workflows/llamacpp.yaml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/deepspeed.yaml b/.github/workflows/deepspeed.yaml index 047ebf255..1cae788ce 100644 --- a/.github/workflows/deepspeed.yaml +++ b/.github/workflows/deepspeed.yaml @@ -104,9 +104,7 @@ jobs: - name: Install deepspeed working-directory: deepspeed run: | - pip install . pip install .[1bit,autotuning,inf] - ds_report - name: Show environment info diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index 5fc0035a4..4a0a95840 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -55,7 +55,7 @@ jobs: mkdir build cd build - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/aarch64-linux/devlib/:${LD_LIBRARY_PATH} + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_CANN=on -DSOC_TYPE=${{ matrix.device }} cmake --build . -j $(nproc) From cf1fbf12f5181703a9ac286494868dddacf3defc Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Wed, 22 Jan 2025 15:49:25 +0800 Subject: [PATCH 3/9] update --- .github/workflows/deepspeed.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deepspeed.yaml b/.github/workflows/deepspeed.yaml index 1cae788ce..f16af6ce0 100644 --- a/.github/workflows/deepspeed.yaml +++ b/.github/workflows/deepspeed.yaml @@ -24,7 +24,7 @@ jobs: deepspeed-ut: if: ${{ github.repository_owner == 'Ascend' }} name: Run unit tests with DeepSpeed - runs-on: self-hosted + runs-on: [self-hosted, ascend, npu] container: image: ascendai/cann:latest volumes: From 620724aa001cf30525ba5e2f27bee33c5089ade6 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Wed, 22 Jan 2025 17:20:29 +0800 Subject: [PATCH 4/9] update --- .github/workflows/llamacpp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index 4a0a95840..c08067546 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -23,7 +23,7 @@ defaults: jobs: openeuler-arm64-test: if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} - name: Build llama.cpp on OpenEuler 22.03 for Arm64 + name: Build llama.cpp on OpenEuler for Arm64 runs-on: ubuntu-24.04-arm strategy: matrix: From 9ba099139a8b57753e616a7df4cd1a71840a0997 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Thu, 23 Jan 2025 11:07:30 +0800 Subject: [PATCH 5/9] update --- .github/workflows/llamacpp.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index c08067546..212069f51 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -52,10 +52,11 @@ jobs: - name: Build llama.cpp working-directory: llama.cpp run: | - mkdir build - cd build - + export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_CANN=on -DSOC_TYPE=${{ matrix.device }} - cmake --build . -j $(nproc) + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ + -DGGML_CANN=on \ + -DSOC_TYPE=${{ matrix.device }} + cmake --build build -j $(nproc) From b9397963c802410a7d83b4512a16fa4b8e24b1c8 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Thu, 23 Jan 2025 11:51:33 +0800 Subject: [PATCH 6/9] tmp --- .github/workflows/llamacpp.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index 212069f51..5e05ac086 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -52,6 +52,11 @@ jobs: - name: Build llama.cpp working-directory: llama.cpp run: | + set -x + + echo $PATH + echo $LD_LIBRARY_PATH + export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} From 7dec44f1673ee0ce587c5335956585d79c6aa0fd Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Thu, 23 Jan 2025 14:25:28 +0800 Subject: [PATCH 7/9] update --- .github/workflows/llamacpp.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index 5e05ac086..8d035df13 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -52,13 +52,7 @@ jobs: - name: Build llama.cpp working-directory: llama.cpp run: | - set -x - - echo $PATH - echo $LD_LIBRARY_PATH - - export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} cmake -S . -B build \ -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ From 4eaac32dd893b1ecfef4035bf43ec3a3df5eaff9 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Thu, 23 Jan 2025 14:27:21 +0800 Subject: [PATCH 8/9] tmp --- .github/workflows/llamacpp.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index 8d035df13..4c3e48afe 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -52,8 +52,12 @@ jobs: - name: Build llama.cpp working-directory: llama.cpp run: | + set -x + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + echo $LD_LIBRARY_PATH + cmake -S . -B build \ -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ -DGGML_CANN=on \ From 2c933da393903620eaea66c30cd52d5b4be6111f Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Thu, 23 Jan 2025 14:34:09 +0800 Subject: [PATCH 9/9] tmp --- .github/workflows/llamacpp.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml index 4c3e48afe..8d035df13 100644 --- a/.github/workflows/llamacpp.yaml +++ b/.github/workflows/llamacpp.yaml @@ -52,12 +52,8 @@ jobs: - name: Build llama.cpp working-directory: llama.cpp run: | - set -x - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - echo $LD_LIBRARY_PATH - cmake -S . -B build \ -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ -DGGML_CANN=on \