diff --git a/.github/workflows/deepspeed.yaml b/.github/workflows/deepspeed.yaml
index ebd76df2e..f16af6ce0 100644
--- a/.github/workflows/deepspeed.yaml
+++ b/.github/workflows/deepspeed.yaml
@@ -1,8 +1,5 @@
-name: huawei-ascend-npu
+name: Unit tests with DeepSpeed on Ascend NPU

-defaults:
-  run:
-    shell: bash -ieo pipefail {0}
 on:
   workflow_dispatch:
   pull_request:
@@ -16,118 +13,127 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

-permissions:
-  contents: read
-  issues: write
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}

 jobs:
-  unit-tests:
+  deepspeed-ut:
+    if: ${{ github.repository_owner == 'Ascend' }}
+    name: Run unit tests with DeepSpeed
     runs-on: [self-hosted, ascend, npu]
     container:
-      image: ascendai/cann
-      ports:
-        - 80
+      image: ascendai/cann:latest
       volumes:
         - /usr/local/dcmi:/usr/local/dcmi
-        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
+        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
         - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
         - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
-        - /etc/ascend_install.info:/etc/ascend_install.info
-      options: --network host
-               --name deepspeed_unit-tests
-               --device /dev/davinci0
-               --device /dev/davinci_manager
-               --device /dev/devmm_svm
-               --device /dev/hisi_hdc
-               --shm-size "20g"
-               --entrypoint /bin/bash
-
+        - /etc/ascend_install.info:/etc/ascend_install.info
+      options: >-
+        --network host
+        --device /dev/davinci0
+        --device /dev/davinci_manager
+        --device /dev/devmm_svm
+        --device /dev/hisi_hdc
     steps:
-      - uses: actions/checkout@v4
-
-      - name: Install pytorch
-        run: |
-          npu-smi info
-          apt-get update
-          apt-get install sudo
-          pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
-          source /root/.bashrc
-
-          pip install torch==2.2.0 torchvision==0.17.0 torch_npu==2.2.0 torchaudio==2.2.0 numpy==1.26.4 cloudpickle tornado ml-dtypes
-
-          python << EOF
-          if __name__ == '__main__':
-              import torch
-              import torch_npu
-              torch_npu.npu.set_device("npu:0")
-              print(f"Device Name: {torch.npu.get_device_name(0)}")
-              print(f"Device Count: {torch.npu.device_count()}")
-              print(f"Device Available: {torch.npu.is_available()}")
-          EOF
-
-      - name: Install transformers
-        uses: nick-fields/retry@v3
-        with:
-          timeout_minutes: 30
-          max_attempts: 3
-          retry_on: error
-          command: |
-            source /root/.bashrc
-            echo "y" | apt-get install git
-            git clone https://github.com/huggingface/transformers
-            cd transformers
-            git rev-parse --short HEAD
+      - name: Show NPU info
+        run: |
+          npu-smi info
+
+      - name: Config mirrors
+        run: |
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+      - name: Install system dependencies
+        run: |
+          apt-get update
+          apt-get install -y \
+            git gcc g++ make cmake ninja-build
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install pytorch
+        run: |
+          pip install \
+            torch==2.2.0 \
+            torch_npu==2.2.0 \
+            torchvision==0.17.0 \
+            torchaudio==2.2.0 \
+            numpy==1.26.4 \
+            cloudpickle \
+            tornado \
+            ml-dtypes
+
+          python << EOF
+          if __name__ == '__main__':
+              import torch
+              import torch_npu
+              torch_npu.npu.set_device("npu:0")
+              print(f"Device Name: {torch.npu.get_device_name(0)}")
+              print(f"Device Count: {torch.npu.device_count()}")
+              print(f"Device Available: {torch.npu.is_available()}")
+          EOF
+
+      - name: Checkout transformers
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/transformers
+          path: transformers
+
+      - name: Install transformers
+        working-directory: transformers
+        run: |
           pip install .
-
-      - name: Install deepspeed
-        uses: nick-fields/retry@v3
-        with:
-          timeout_minutes: 30
-          max_attempts: 3
-          retry_on: error
-          command: |
-            source /root/.bashrc
-            git clone --depth=1 https://github.com/microsoft/DeepSpeed.git
+
+      - name: Checkout deepspeed
+        uses: actions/checkout@v4
+        with:
+          repository: microsoft/DeepSpeed
+          path: deepspeed
+
+      - name: Install deepspeed dependencies
+        run: |
           pip install -r requirements/requirements_deepspeed.txt
-            cd DeepSpeed
+
+      - name: Install deepspeed
+        working-directory: deepspeed
+        run: |
           pip install .[1bit,autotuning,inf]
-          ds_report
-      - name: Python environment
-        run: |
-          source /root/.bashrc
-          pip list
-
-      - name: Unit tests
-        run: |
-          source /root/.bashrc
-          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          cd DeepSpeed/tests/unit/
-
-          pytest --verbose accelerator/*
-          pytest --verbose autotuning/*
-          pytest --verbose checkpoint/test_reshape_checkpoint.py
-          pytest --verbose checkpoint/test_moe_checkpoint.py
-          pytest --verbose checkpoint/test_shared_weights.py
-          pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
-          pytest --verbose model_parallelism/*
-          pytest --verbose moe/test_moe_tp.py
-          pytest --verbose monitor/*
-          pytest --verbose utils/*
-          pytest --verbose runtime/test_ds_config_model.py
-          pytest --verbose runtime/pipe/test_pipe_schedule.py
-          pytest --verbose runtime/zero/test_zero_config.py
-          pytest --verbose runtime/zero/test_zero_tiled.py
-          pytest --verbose runtime/zero/test_zeropp.py
-          pytest --verbose runtime/test_autocast.py
-          pytest --verbose runtime/test_data.py
-          pytest --verbose runtime/test_runtime_utils.py
-          pytest --verbose runtime/activation_checkpointing/*
-          pytest --verbose runtime/utils/*
-          pytest --verbose runtime/zero/test_zero_dynamic_class.py
-
-
-
-
-
+      - name: Show environment info
+        run: |
+          pip list
+
+      - name: Run unit tests
+        working-directory: deepspeed/tests/unit
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+
+          pytest --verbose accelerator/*
+          pytest --verbose autotuning/*
+          pytest --verbose checkpoint/test_reshape_checkpoint.py
+          pytest --verbose checkpoint/test_moe_checkpoint.py
+          pytest --verbose checkpoint/test_shared_weights.py
+          pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
+          pytest --verbose model_parallelism/*
+          pytest --verbose moe/test_moe_tp.py
+          pytest --verbose monitor/*
+          pytest --verbose utils/*
+          pytest --verbose runtime/test_ds_config_model.py
+          pytest --verbose runtime/pipe/test_pipe_schedule.py
+          pytest --verbose runtime/zero/test_zero_config.py
+          pytest --verbose runtime/zero/test_zero_tiled.py
+          pytest --verbose runtime/zero/test_zeropp.py
+          pytest --verbose runtime/test_autocast.py
+          pytest --verbose runtime/test_data.py
+          pytest --verbose runtime/test_runtime_utils.py
+          pytest --verbose runtime/activation_checkpointing/*
+          pytest --verbose runtime/utils/*
+          pytest --verbose runtime/zero/test_zero_dynamic_class.py
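Note that the rewritten deepspeed.yaml drops the `ds_report` call the old "Install deepspeed" step ran after installation. As a purely illustrative manual check (not a step in the workflow above), the install can be sanity-checked from the same container before the pytest run; `ds_report` ships with DeepSpeed, and the one-liner relies on DeepSpeed's public `get_accelerator()` API, which should report the NPU accelerator when `torch_npu` is present:

    # Manual sanity check on the NPU runner; not part of the workflow.
    ds_report   # summarizes op compatibility and the detected accelerator
    python -c "from deepspeed.accelerator import get_accelerator; a = get_accelerator(); print(a.device_name(), a.device_count(), a.is_available())"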
diff --git a/.github/workflows/llamacpp.yaml b/.github/workflows/llamacpp.yaml
index 704a1399f..8d035df13 100644
--- a/.github/workflows/llamacpp.yaml
+++ b/.github/workflows/llamacpp.yaml
@@ -1,14 +1,11 @@
-name: llama.cpp
+name: Build llama.cpp in CANN container

-defaults:
-  run:
-    shell: bash -el {0}
 on:
   workflow_dispatch:
   pull_request:
-    # paths:
-    #   - '.github/workflows/llamacpp.yaml'
-    #   - 'requirements/**'
+    paths:
+      - '.github/workflows/llamacpp.yaml'
+      - 'requirements/**'
   schedule:
     - cron: "0 0 * * *"

@@ -16,39 +13,49 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

-permissions:
-  contents: read
-  issues: write
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}

 jobs:
-  unit-tests:
-    if: contains(github.event.pull_request.labels.*.name, 'Ascend NPU')
-
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        build: ['Release']
-        cann: ['openeuler-python3.10-cann8.0.rc3.beta1']
-        device: ['ascend910b3']
-    container:
-      image: ascendai/cann:${{ matrix.cann }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Install llamacpp
-        uses: nick-fields/retry@v3
+  openeuler-arm64-test:
+    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
+    name: Build llama.cpp on OpenEuler for Arm64
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      matrix:
+        cann:
+          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+        device:
+          - 'ascend910b3'
+        build:
+          - 'Release'
+    container: ascendai/cann:${{ matrix.cann }}
+    steps:
+      - name: Install dependencies
+        run: |
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Checkout llama.cpp
+        uses: actions/checkout@v4
         with:
-          timeout_minutes: 30
-          max_attempts: 3
-          retry_on: error
-          command: |
-            yum update -y
-            yum install git cmake gcc gcc-c++ make -y
-            git clone https://github.com/ggerganov/llama.cpp.git
-
-      - name: Build
+          repository: ggerganov/llama.cpp
+          path: llama.cpp
+
+      - name: Build llama.cpp
+        working-directory: llama.cpp
         run: |
-          cd llama.cpp
-          mkdir build
-          cd build
-          export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib/:${LD_LIBRARY_PATH}
-          cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_CANN=on -DSOC_TYPE=${{ matrix.device }} && cmake --build . -j $(nproc)
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+
+          cmake -S . -B build \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+            -DGGML_CANN=on \
+            -DSOC_TYPE=${{ matrix.device }}
+          cmake --build build -j $(nproc)
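The llama.cpp job above only verifies that the CANN backend compiles; the GitHub-hosted `ubuntu-24.04-arm` runner has no NPU, so nothing is executed after the build. On a machine with an Ascend device attached, a hypothetical smoke test of the resulting binary could look like the following, where the model path is a placeholder (this workflow downloads no model) and `-ngl` offloads layers to the CANN backend:

    # Hypothetical follow-up on an NPU host; model.gguf is a placeholder.
    ./build/bin/llama-cli -m /path/to/model.gguf -p "Hello" -n 16 -ngl 32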