Skip to content

Commit

Permalink
Add support for Windows on ARM and MacOS to ggml-kleidiai.cpp
Browse files Browse the repository at this point in the history
This commit adds CPU feature detection for Windows on ARM and macOS.
For Windows, it uses the IsProcessorFeaturePresent API from windows.h
to detect CPU features. However, there is no specific flag for I8MM,
so it runs a small SMMLA program instead; if an illegal-instruction error
is caught, it sets isa.i8mm to false. For macOS, it uses sysctlbyname.

Change-Id: Ifabf3a3d517edc9693cc815d0aba525a8aa4e91d
  • Loading branch information
Hao Wei authored and kshitij-sisodia-arm committed Oct 18, 2024
1 parent 868de9e commit ab1baf4
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 34 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
From deb08cd16a2fe6fe2dc98197d58b4f0fb3dd9c7f Mon Sep 17 00:00:00 2001
From d7ff60d4824e9ffa7fc11c6548462008bea0121f Mon Sep 17 00:00:00 2001
From: Charles Xu <[email protected]>
Date: Wed, 21 Aug 2024 07:31:51 +0200
Subject: [PATCH] Use KleidiAI Int4 Matmul micro-kernels in llama.cpp
Expand All @@ -8,15 +8,15 @@ repository
- Implement a KleidiAI backend for llama.cpp
- Add weight caching feature for KleidiAI

Signed-off-by: Dan Johansson <dan.johansson@arm.com>
Signed-off-by: Hao Wei <hao.wei@arm.com>
---
CMakeLists.txt | 52 ++++
ggml-alloc.c | 13 +
ggml-kleidiai.cpp | 675 ++++++++++++++++++++++++++++++++++++++++++++++
ggml-kleidiai.h | 45 ++++
ggml-kleidiai.cpp | 746 ++++++++++++++++++++++++++++++++++++++++++++++
ggml-kleidiai.h | 45 +++
ggml.c | 27 ++
llama.cpp | 19 +-
6 files changed, 830 insertions(+), 1 deletion(-)
llama.cpp | 23 +-
6 files changed, 901 insertions(+), 5 deletions(-)
create mode 100644 ggml-kleidiai.cpp
create mode 100644 ggml-kleidiai.h

Expand Down Expand Up @@ -123,10 +123,10 @@ index bd367c42..ed4ce0ae 100644
if (this_size > max_size) {
diff --git a/ggml-kleidiai.cpp b/ggml-kleidiai.cpp
new file mode 100644
index 00000000..9129ea99
index 00000000..53236893
--- /dev/null
+++ b/ggml-kleidiai.cpp
@@ -0,0 +1,675 @@
@@ -0,0 +1,746 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
Expand All @@ -151,7 +151,7 @@ index 00000000..9129ea99
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__))
+#if defined(__aarch64__)
+#include "ggml-kleidiai.h"
+
+#include "ggml.h"
Expand All @@ -163,9 +163,21 @@ index 00000000..9129ea99
+#include <cfloat>
+#include <stdint.h>
+#include <string.h>
+#if defined(__linux__)
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <string_view>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#elif defined(_WIN32)
+#include <windows.h>
+#include <excpt.h>
+#endif
+#if defined(GGML_KLEIDIAI_USE_CACHE)
+#if !(defined(__linux__) || defined(__APPLE__))
+#error "GGML_KLEIDIAI_USE_CACHE is only supported on Linux and macOS"
+#endif
+#include <cstring>
+#include <sys/mman.h>
+#include <sys/stat.h>
Expand Down Expand Up @@ -308,13 +320,75 @@ index 00000000..9129ea99
+ return (features & feature_mask);
+}
+
+static void get_cpu_features(cpu_features &isa) {
+#if defined(__APPLE__)
+template <typename T>
+T get_sysctl_by_name(std::string_view name) {
+ T value{};
+ size_t size = sizeof(T);
+ if (sysctlbyname(name.data(), &value, &size, nullptr, 0) != 0) {
+ value = 0;
+ }
+ return value;
+}
+#endif
+
+#if defined(_WIN32)
+inline bool is_feature_supported(DWORD feature) {
+ return IsProcessorFeaturePresent(feature);
+}
+
+#pragma optimize("", off) // Disable optimization for the exception handling
+bool check_i8mm_support() {
+ bool i8mm_supported = true;
+ __try {
+ int8x16_t matA = vdupq_n_s8(1);
+ int8x16_t matB = vdupq_n_s8(2);
+ int32x4_t matC = vmmlaq_s32(vdupq_n_s32(0), matA, matB);
+ int32_t array[4];
+ vst1q_s32(array, matC);
+ for (int i = 0; i < 4; ++i) {
+ assert(array[i]== 16);
+ }
+ }
+ __except (GetExceptionCode() == STATUS_ILLEGAL_INSTRUCTION ? 1 : 0)
+ {
+ i8mm_supported = false;
+ }
+ return i8mm_supported;
+}
+#pragma optimize("", on) // Re-enable optimization
+#endif
+
+static void get_cpu_features_impl(cpu_features &isa) {
+#if defined (__linux__)
+ const uint32_t hwcaps = getauxval(AT_HWCAP);
+ const uint32_t hwcaps2 = getauxval(AT_HWCAP2);
+
+ isa.neon = is_feature_supported(hwcaps, KAI_FEATURE_HWCAP_ASIMD);
+ isa.dot = is_feature_supported(hwcaps, KAI_FEATURE_HWCAP_ASIMDDP);
+ isa.i8mm = is_feature_supported(hwcaps2, KAI_FEATURE_HWCAP2_I8MM);
+
+#elif defined(__APPLE__)
+ isa.neon = get_sysctl_by_name<uint32_t>("hw.optional.AdvSIMD") == 1;
+ isa.dot = get_sysctl_by_name<uint32_t>("hw.optional.arm.FEAT_DotProd") == 1;
+ isa.i8mm = get_sysctl_by_name<uint32_t>("hw.optional.arm.FEAT_I8MM") == 1;
+
+#elif defined(_WIN32)
+ isa.neon = is_feature_supported(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+ isa.dot = is_feature_supported(PF_ARM_V8_INSTRUCTIONS_AVAILABLE);
+ isa.i8mm = check_i8mm_support();
+#endif
+}
+
+static const cpu_features& get_cpu_features() {
+ static cpu_features isa;
+ static bool initialized = false;
+
+ if (!initialized) {
+ get_cpu_features_impl(isa);
+ initialized = true;
+ }
+ return isa;
+}
+
+typedef void (*ggml_kai_func_t)(const struct ggml_compute_params * params, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
Expand Down Expand Up @@ -343,8 +417,7 @@ index 00000000..9129ea99
+ return false;
+ }
+
+ cpu_features cpu;
+ get_cpu_features(cpu);
+ const cpu_features& cpu = get_cpu_features();
+
+ // Check whether the target platfom has i8mm and dotprod features
+ if(!(cpu.i8mm && cpu.dot)) {
Expand Down Expand Up @@ -396,8 +469,7 @@ index 00000000..9129ea99
+ GGML_KAI_UNUSED(k);
+
+ // Get CPU features
+ cpu_features cpu;
+ get_cpu_features(cpu);
+ const cpu_features& cpu = get_cpu_features();
+
+#if defined(__ARM_FEATURE_MATMUL_INT8) && defined(__ARM_FEATURE_DOTPROD)
+ if(cpu.i8mm && cpu.dot) {
Expand Down Expand Up @@ -760,8 +832,7 @@ index 00000000..9129ea99
+ const int32_t b = cur->ne[2];
+
+ // Temporary solution as we should check whether we can run the kleidiai matmul micro-kernels
+ cpu_features cpu;
+ get_cpu_features(cpu);
+ const cpu_features& cpu = get_cpu_features();
+
+ // Check whether the target platfom has i8mm and dotprod features
+ if(!(cpu.i8mm && cpu.dot)) {
Expand Down Expand Up @@ -801,7 +872,7 @@ index 00000000..9129ea99
+ close(g_kai_cached_weight.fd);
+#endif
+}
+#endif // defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__))
+#endif // defined(__aarch64__)
diff --git a/ggml-kleidiai.h b/ggml-kleidiai.h
new file mode 100644
index 00000000..a4cdf1fb
Expand Down Expand Up @@ -920,7 +991,7 @@ index d5d33c2b..84bfd3b1 100644

#if defined(GGML_USE_OPENMP)
diff --git a/llama.cpp b/llama.cpp
index 05591aa4..99461995 100644
index 05591aa4..1c63d5ec 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -19,6 +19,8 @@
Expand All @@ -937,23 +1008,41 @@ index 05591aa4..99461995 100644
llama_mmap(const llama_mmap &) = delete;

-#ifdef _POSIX_MAPPED_FILES
+#ifdef GGML_USE_KLEIDIAI
+#if !defined(GGML_USE_KLEIDIAI) && (defined(_POSIX_MAPPED_FILES) || defined(_WIN32))
+ // With KleidiAI, we disable mmap to allow the backend
+ // to re-use the memory allocated for the weights.
+ // KleidiAI requires to pack the weights in a different format from the original one
+ // to improve the overall computational efficiency.
+ // However, since RAM is very limited on some devices, we want to re-use the original
+ // storage to avoid allocating additional memory.
+ static constexpr bool SUPPORTED = false;
+#elif _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
+#else
+ static constexpr bool SUPPORTED = false;
+#endif
+
+#ifdef _POSIX_MAPPED_FILES

// list of mapped fragments (first_offset, last_offset)
std::vector<std::pair<size_t, size_t>> mapped_fragments;
@@ -15987,6 +16000,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -1473,8 +1486,6 @@ struct llama_mmap {
}
}
#elif defined(_WIN32)
- static constexpr bool SUPPORTED = true;
-
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
GGML_UNUSED(numa);

@@ -1535,8 +1546,6 @@ struct llama_mmap {
}
}
#else
- static constexpr bool SUPPORTED = false;
-
llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
GGML_UNUSED(file);
GGML_UNUSED(prefetch);
@@ -15987,6 +15996,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) {

void llama_backend_free(void) {
ggml_quantize_free();
Expand Down
54 changes: 42 additions & 12 deletions kleidiai-examples/llama_cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
## Prerequisities

- Experience with Arm® cross-compilation on Android™
- Proficiency with Android® shell commands
- An Android® device with an Arm® CPU with <strong>FEAT_DotProd</strong> (dotprod) and <strong>FEAT_I8MM</strong> (i8mm) features
- Proficiency with Android shell commands
- An Android device with an Arm® CPU with <strong>FEAT_DotProd</strong> (dotprod) and <strong>FEAT_I8MM</strong> (i8mm) features

## Dependencies
- A laptop/PC with a Linux®-based operating system (tested on Ubuntu® 20.04.4 LTS)
Expand All @@ -53,7 +53,7 @@ These KleidiAI micro-kernels were fundamental to the Cookie and Ada chatbot, whi

Arm® CPUs with <strong>FEAT_DotProd</strong> (dotprod) and <strong>FEAT_I8MM</strong> (i8mm) features.

## Running llama.cpp with KleidiAI
## Running llama.cpp with KleidiAI on Android™

Connect your Android™ device to your computer and open Terminal. Then, follow the following steps to apply the patch with the KleidiAI backend on top of llama.cpp.

Expand Down Expand Up @@ -99,15 +99,6 @@ export NDK_PATH="your-android-ndk-path"

cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2a+i8mm+dotprod ..

make -j4
```
Build the llama.cpp project for Linux®:

```bash
mkdir build && cd build

cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm ..

make -j4
```
The -DLLAMA_KLEIDIAI_CACHE=ON is used to enable the weights caching. Weights caching is a feature available in the KleidiAI backend to improve the model loading time. Since the layout of the original model weights is transformed by KleidiAI to improve the performance of the matrix-multiplication routines, this option ensures that the weights transformation only happens the first time you run the model.
Expand Down Expand Up @@ -148,4 +139,43 @@ Run the model inference using the `llama-cli` binary using 4 CPU cores:
./llama-cli -m phi-2.Q4_0.gguf -p "Write a code in C for bubble sorting" -n 32 -t 4
```

## Building llama.cpp with KleidiAI for other platforms
KleidiAI can also be enabled on macOS® and Windows® on Arm® with FEAT_DotProd (dotprod) and FEAT_I8MM (i8mm) features.

### Linux®:

```bash
mkdir build && cd build

cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm ..

make -j4
```

### macOS®:
```bash
mkdir build && cd build

# The -DLLAMA_METAL=OFF is used to disable running on Metal GPU
cmake -DLLAMA_KLEIDIAI=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2-a+i8mm+dotprod -DLLAMA_METAL=OFF ..

make -j4
```

### Windows® on Arm®:

- Install [Visual Studio 2022](https://visualstudio.microsoft.com/de/vs/community/)
- Install Required Components in Visual Studio Installer
- Workload Tab: Desktop development with C++
- Individual Components Tab (search for these components): C++ CMake Tools for Windows®, Git for Windows®, C++ Clang Compiler for Windows®, MSBuild Support for LLVM-Toolset (clang)
- Environment Setup:
- If the host machine is x86-based, please use the integrated Developer Command Prompt / PowerShell in VS2022 for building and testing.
  - If the host machine is Arm64-based, please use the system's cmd and set environment variables by running `"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" arm64`, as the integrated Developer Command Prompt / PowerShell in VS2022 is meant for x86
```cmd
cmake --preset arm64-windows-llvm-release -D LLAMA_KLEIDIAI=ON -D KLEIDIAI_BUILD_TESTS=OFF -D LLAMA_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```

The options LLAMA_KLEIDIAI_CACHE and KLEIDIAI_BUILD_TESTS are disabled on Windows®, as they are currently not supported. Please use the LLVM preset, since MSVC is not supported either.

That’s all for this guide!

0 comments on commit ab1baf4

Please sign in to comment.