Skip to content

Commit

Permalink
Add support for Windows on ARM and MacOS to ggml-kleidiai.cpp
Browse files Browse the repository at this point in the history
This commit adds CPU feature detection for Windows on ARM and macOS.
For Windows, it uses the IsProcessorFeaturePresent API from windows.h
to detect CPU features. However, there is no specific flag for I8MM,
so it runs a small SMMLA program instead; if an illegal-instruction error
is caught, it sets isa.i8mm to false. For macOS, it uses sysctlbyname.

Change-Id: Ifabf3a3d517edc9693cc815d0aba525a8aa4e91d
  • Loading branch information
Hao Wei authored and kshitij-sisodia-arm committed Oct 18, 2024
1 parent 868de9e commit ab1baf4
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 34 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
From deb08cd16a2fe6fe2dc98197d58b4f0fb3dd9c7f Mon Sep 17 00:00:00 2001
From d7ff60d4824e9ffa7fc11c6548462008bea0121f Mon Sep 17 00:00:00 2001
From: Charles Xu <[email protected]>
Date: Wed, 21 Aug 2024 07:31:51 +0200
Subject: [PATCH] Use KleidiAI Int4 Matmul micro-kernels in llama.cpp
Expand All @@ -8,15 +8,15 @@ repository
- Implement a KleidiAI backend for llama.cpp
- Add weight caching feature for KleidiAI

Signed-off-by: Dan Johansson <dan.johansson@arm.com>
Signed-off-by: Hao Wei <hao.wei@arm.com>
---
CMakeLists.txt | 52 ++++
ggml-alloc.c | 13 +
ggml-kleidiai.cpp | 675 ++++++++++++++++++++++++++++++++++++++++++++++
ggml-kleidiai.h | 45 ++++
ggml-kleidiai.cpp | 746 ++++++++++++++++++++++++++++++++++++++++++++++
ggml-kleidiai.h | 45 +++
ggml.c | 27 ++
llama.cpp | 19 +-
6 files changed, 830 insertions(+), 1 deletion(-)
llama.cpp | 23 +-
6 files changed, 901 insertions(+), 5 deletions(-)
create mode 100644 ggml-kleidiai.cpp
create mode 100644 ggml-kleidiai.h

Expand Down Expand Up @@ -123,10 +123,10 @@ index bd367c42..ed4ce0ae 100644
if (this_size > max_size) {
diff --git a/ggml-kleidiai.cpp b/ggml-kleidiai.cpp
new file mode 100644
index 00000000..9129ea99
index 00000000..53236893
--- /dev/null
+++ b/ggml-kleidiai.cpp
@@ -0,0 +1,675 @@
@@ -0,0 +1,746 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
Expand All @@ -151,7 +151,7 @@ index 00000000..9129ea99
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__))
+#if defined(__aarch64__)
+#include "ggml-kleidiai.h"
+
+#include "ggml.h"
Expand All @@ -163,9 +163,21 @@ index 00000000..9129ea99
+#include <cfloat>
+#include <stdint.h>
+#include <string.h>
+#if defined(__linux__)
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <string_view>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#elif defined(_WIN32)
+#include <windows.h>
+#include <excpt.h>
+#endif
+#if defined(GGML_KLEIDIAI_USE_CACHE)
+#if !(defined(__linux__) || defined(__APPLE__))
+#error "GGML_KLEIDIAI_USE_CACHE is only supported on Linux and macOS"
+#endif
+#include <cstring>
+#include <sys/mman.h>
+#include <sys/stat.h>
Expand Down Expand Up @@ -308,13 +320,75 @@ index 00000000..9129ea99
+ return (features & feature_mask);
+}
+
+static void get_cpu_features(cpu_features &isa) {
+#if defined(__APPLE__)
+template <typename T>
+T get_sysctl_by_name(std::string_view name) {
+ T value{};
+ size_t size = sizeof(T);
+ if (sysctlbyname(name.data(), &value, &size, nullptr, 0) != 0) {
+ value = 0;
+ }
+ return value;
+}
+#endif
+
+#if defined(_WIN32)
+inline bool is_feature_supported(DWORD feature) {
+ return IsProcessorFeaturePresent(feature);
+}
+
+#pragma optimize("", off) // Disable optimization for the exception handling
+bool check_i8mm_support() {
+ bool i8mm_supported = true;
+ __try {
+ int8x16_t matA = vdupq_n_s8(1);
+ int8x16_t matB = vdupq_n_s8(2);
+ int32x4_t matC = vmmlaq_s32(vdupq_n_s32(0), matA, matB);
+ int32_t array[4];
+ vst1q_s32(array, matC);
+ for (int i = 0; i < 4; ++i) {
+ assert(array[i]== 16);
+ }
+ }
+ __except (GetExceptionCode() == STATUS_ILLEGAL_INSTRUCTION ? 1 : 0)
+ {
+ i8mm_supported = false;
+ }
+ return i8mm_supported;
+}
+#pragma optimize("", on) // Re-enable optimization
+#endif
+
+static void get_cpu_features_impl(cpu_features &isa) {
+#if defined (__linux__)
+ const uint32_t hwcaps = getauxval(AT_HWCAP);
+ const uint32_t hwcaps2 = getauxval(AT_HWCAP2);
+
+ isa.neon = is_feature_supported(hwcaps, KAI_FEATURE_HWCAP_ASIMD);
+ isa.dot = is_feature_supported(hwcaps, KAI_FEATURE_HWCAP_ASIMDDP);
+ isa.i8mm = is_feature_supported(hwcaps2, KAI_FEATURE_HWCAP2_I8MM);
+
+#elif defined(__APPLE__)
+ isa.neon = get_sysctl_by_name<uint32_t>("hw.optional.AdvSIMD") == 1;
+ isa.dot = get_sysctl_by_name<uint32_t>("hw.optional.arm.FEAT_DotProd") == 1;
+ isa.i8mm = get_sysctl_by_name<uint32_t>("hw.optional.arm.FEAT_I8MM") == 1;
+
+#elif defined(_WIN32)
+ isa.neon = is_feature_supported(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+ isa.dot = is_feature_supported(PF_ARM_V8_INSTRUCTIONS_AVAILABLE);
+ isa.i8mm = check_i8mm_support();
+#endif
+}
+
+static const cpu_features& get_cpu_features() {
+ static cpu_features isa;
+ static bool initialized = false;
+
+ if (!initialized) {
+ get_cpu_features_impl(isa);
+ initialized = true;
+ }
+ return isa;
+}
+
+typedef void (*ggml_kai_func_t)(const struct ggml_compute_params * params, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
Expand Down Expand Up @@ -343,8 +417,7 @@ index 00000000..9129ea99
+ return false;
+ }
+
+ cpu_features cpu;
+ get_cpu_features(cpu);
+ const cpu_features& cpu = get_cpu_features();
+
+ // Check whether the target platfom has i8mm and dotprod features
+ if(!(cpu.i8mm && cpu.dot)) {
Expand Down Expand Up @@ -396,8 +469,7 @@ index 00000000..9129ea99
+ GGML_KAI_UNUSED(k);
+
+ // Get CPU features
+ cpu_features cpu;
+ get_cpu_features(cpu);
+ const cpu_features& cpu = get_cpu_features();
+
+#if defined(__ARM_FEATURE_MATMUL_INT8) && defined(__ARM_FEATURE_DOTPROD)
+ if(cpu.i8mm && cpu.dot) {
Expand Down Expand Up @@ -760,8 +832,7 @@ index 00000000..9129ea99
+ const int32_t b = cur->ne[2];
+
+ // Temporary solution as we should check whether we can run the kleidiai matmul micro-kernels
+ cpu_features cpu;
+ get_cpu_features(cpu);
+ const cpu_features& cpu = get_cpu_features();
+
+ // Check whether the target platfom has i8mm and dotprod features
+ if(!(cpu.i8mm && cpu.dot)) {
Expand Down Expand Up @@ -801,7 +872,7 @@ index 00000000..9129ea99
+ close(g_kai_cached_weight.fd);
+#endif
+}
+#endif // defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__))
+#endif // defined(__aarch64__)
diff --git a/ggml-kleidiai.h b/ggml-kleidiai.h
new file mode 100644
index 00000000..a4cdf1fb
Expand Down Expand Up @@ -920,7 +991,7 @@ index d5d33c2b..84bfd3b1 100644

#if defined(GGML_USE_OPENMP)
diff --git a/llama.cpp b/llama.cpp
index 05591aa4..99461995 100644
index 05591aa4..1c63d5ec 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -19,6 +19,8 @@
Expand All @@ -937,23 +1008,41 @@ index 05591aa4..99461995 100644
llama_mmap(const llama_mmap &) = delete;

-#ifdef _POSIX_MAPPED_FILES
+#ifdef GGML_USE_KLEIDIAI
+#if !defined(GGML_USE_KLEIDIAI) && (defined(_POSIX_MAPPED_FILES) || defined(_WIN32))
+ // With KleidiAI, we disable mmap to allow the backend
+ // to re-use the memory allocated for the weights.
+ // KleidiAI requires to pack the weights in a different format from the original one
+ // to improve the overall computational efficiency.
+ // However, since RAM is very limited on some devices, we want to re-use the original
+ // storage to avoid allocating additional memory.
+ static constexpr bool SUPPORTED = false;
+#elif _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
+#else
+ static constexpr bool SUPPORTED = false;
+#endif
+
+#ifdef _POSIX_MAPPED_FILES

// list of mapped fragments (first_offset, last_offset)
std::vector<std::pair<size_t, size_t>> mapped_fragments;
@@ -15987,6 +16000,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -1473,8 +1486,6 @@ struct llama_mmap {
}
}
#elif defined(_WIN32)
- static constexpr bool SUPPORTED = true;
-
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
GGML_UNUSED(numa);

@@ -1535,8 +1546,6 @@ struct llama_mmap {
}
}
#else
- static constexpr bool SUPPORTED = false;
-
llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
GGML_UNUSED(file);
GGML_UNUSED(prefetch);
@@ -15987,6 +15996,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) {

void llama_backend_free(void) {
ggml_quantize_free();
Expand Down
54 changes: 42 additions & 12 deletions kleidiai-examples/llama_cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
## Prerequisities

- Experience with Arm® cross-compilation on Android™
- Proficiency with Android® shell commands
- An Android® device with an Arm® CPU with <strong>FEAT_DotProd</strong> (dotprod) and <strong>FEAT_I8MM</strong> (i8mm) features
- Proficiency with Android shell commands
- An Android device with an Arm® CPU with <strong>FEAT_DotProd</strong> (dotprod) and <strong>FEAT_I8MM</strong> (i8mm) features

## Dependencies
- A laptop/PC with a Linux®-based operating system (tested on Ubuntu® 20.04.4 LTS)
Expand All @@ -53,7 +53,7 @@ These KleidiAI micro-kernels were fundamental to the Cookie and Ada chatbot, whi

Arm® CPUs with <strong>FEAT_DotProd</strong> (dotprod) and <strong>FEAT_I8MM</strong> (i8mm) features.

## Running llama.cpp with KleidiAI
## Running llama.cpp with KleidiAI on Android™

Connect your Android™ device to your computer and open Terminal. Then, follow the following steps to apply the patch with the KleidiAI backend on top of llama.cpp.

Expand Down Expand Up @@ -99,15 +99,6 @@ export NDK_PATH="your-android-ndk-path"

cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2a+i8mm+dotprod ..

make -j4
```
Build the llama.cpp project for Linux®:

```bash
mkdir build && cd build

cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm ..

make -j4
```
The -DLLAMA_KLEIDIAI_CACHE=ON is used to enable the weights caching. Weights caching is a feature available in the KleidiAI backend to improve the model loading time. Since the layout of the original model weights is transformed by KleidiAI to improve the performance of the matrix-multiplication routines, this option ensures that the weights transformation only happens the first time you run the model.
Expand Down Expand Up @@ -148,4 +139,43 @@ Run the model inference using the `llama-cli` binary using 4 CPU cores:
./llama-cli -m phi-2.Q4_0.gguf -p "Write a code in C for bubble sorting" -n 32 -t 4
```

## Building llama.cpp with KleidiAI for other platforms
KleidiAI can also be enabled on macOS® and Windows® on Arm® with FEAT_DotProd (dotprod) and FEAT_I8MM (i8mm) features.

### Linux®:

```bash
mkdir build && cd build

cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm ..

make -j4
```

### macOS®:
```bash
mkdir build && cd build

# The -DLLAMA_METAL=OFF is used to disable running on Metal GPU
cmake -DLLAMA_KLEIDIAI=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2-a+i8mm+dotprod -DLLAMA_METAL=OFF ..

make -j4
```

### Windows® on Arm®:

- Install [Visual Studio 2022](https://visualstudio.microsoft.com/de/vs/community/)
- Install Required Components in Visual Studio Installer
- Workload Tab: Desktop development with C++
- Individual Components Tab (search for these components): C++ CMake Tools for Windows®, Git for Windows®, C++ Clang Compiler for Windows®, MSBuild Support for LLVM-Toolset (clang)
- Environment Setup:
- If the host machine is x86-based, please use the integrated Developer Command Prompt / PowerShell in VS2022 for building and testing.
  - If the host machine is Arm64-based, please use the system's cmd and set environment variables by running `"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" arm64`, as the integrated Developer Command Prompt / PowerShell in VS2022 is meant for x86
```cmd
cmake --preset arm64-windows-llvm-release -D LLAMA_KLEIDIAI=ON -D KLEIDIAI_BUILD_TESTS=OFF -D LLAMA_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```

The options LLAMA_KLEIDIAI_CACHE and KLEIDIAI_BUILD_TESTS are disabled on Windows®, as they are currently not supported. Please use the LLVM preset, since MSVC is not supported either.

That’s all for this guide!

0 comments on commit ab1baf4

Please sign in to comment.