diff --git a/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch b/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch index 249ed64..9bc595c 100644 --- a/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch +++ b/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch @@ -1,4 +1,4 @@ -From deb08cd16a2fe6fe2dc98197d58b4f0fb3dd9c7f Mon Sep 17 00:00:00 2001 +From d7ff60d4824e9ffa7fc11c6548462008bea0121f Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Wed, 21 Aug 2024 07:31:51 +0200 Subject: [PATCH] Use KleidiAI Int4 Matmul micro-kernels in llama.cpp @@ -8,15 +8,15 @@ repository - Implement a KleidiAI backend for llama.cpp - Add weight caching feature for KleidiAI -Signed-off-by: Dan Johansson +Signed-off-by: Hao Wei --- CMakeLists.txt | 52 ++++ ggml-alloc.c | 13 + - ggml-kleidiai.cpp | 675 ++++++++++++++++++++++++++++++++++++++++++++++ - ggml-kleidiai.h | 45 ++++ + ggml-kleidiai.cpp | 746 ++++++++++++++++++++++++++++++++++++++++++++++ + ggml-kleidiai.h | 45 +++ ggml.c | 27 ++ - llama.cpp | 19 +- - 6 files changed, 830 insertions(+), 1 deletion(-) + llama.cpp | 23 +- + 6 files changed, 901 insertions(+), 5 deletions(-) create mode 100644 ggml-kleidiai.cpp create mode 100644 ggml-kleidiai.h @@ -123,10 +123,10 @@ index bd367c42..ed4ce0ae 100644 if (this_size > max_size) { diff --git a/ggml-kleidiai.cpp b/ggml-kleidiai.cpp new file mode 100644 -index 00000000..9129ea99 +index 00000000..53236893 --- /dev/null +++ b/ggml-kleidiai.cpp -@@ -0,0 +1,675 @@ +@@ -0,0 +1,746 @@ +/* + * Copyright (c) 2024 Arm Limited. + * @@ -151,7 +151,7 @@ index 00000000..9129ea99 + * SOFTWARE. + */ + -+#if defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__)) ++#if defined(__aarch64__) +#include "ggml-kleidiai.h" + +#include "ggml.h" @@ -163,9 +163,21 @@ index 00000000..9129ea99 +#include +#include +#include ++#if defined(__linux__) +#include +#include ++#elif defined(__APPLE__) ++#include ++#include ++#include ++#elif defined(_WIN32) ++#include ++#include ++#endif +#if defined(GGML_KLEIDIAI_USE_CACHE) ++#if !(defined(__linux__) || defined(__APPLE__)) ++#error "GGML_KLEIDIAI_USE_CACHE is only supported on Linux and macOS" ++#endif +#include +#include +#include @@ -308,13 +320,75 @@ index 00000000..9129ea99 + return (features & feature_mask); +} + -+static void get_cpu_features(cpu_features &isa) { ++#if defined(__APPLE__) ++template ++T get_sysctl_by_name(std::string_view name) { ++ T value{}; ++ size_t size = sizeof(T); ++ if (sysctlbyname(name.data(), &value, &size, nullptr, 0) != 0) { ++ value = 0; ++ } ++ return value; ++} ++#endif ++ ++#if defined(_WIN32) ++inline bool is_feature_supported(DWORD feature) { ++ return IsProcessorFeaturePresent(feature); ++} ++ ++#pragma optimize("", off) // Disable optimization for the exception handling ++bool check_i8mm_support() { ++ bool i8mm_supported = true; ++ __try { ++ int8x16_t matA = vdupq_n_s8(1); ++ int8x16_t matB = vdupq_n_s8(2); ++ int32x4_t matC = vmmlaq_s32(vdupq_n_s32(0), matA, matB); ++ int32_t array[4]; ++ vst1q_s32(array, matC); ++ for (int i = 0; i < 4; ++i) { ++ assert(array[i]== 16); ++ } ++ } ++ __except (GetExceptionCode() == STATUS_ILLEGAL_INSTRUCTION ? 
1 : 0) ++ { ++ i8mm_supported = false; ++ } ++ return i8mm_supported; ++} ++#pragma optimize("", on) // Re-enable optimization ++#endif ++ ++static void get_cpu_features_impl(cpu_features &isa) { ++#if defined (__linux__) + const uint32_t hwcaps = getauxval(AT_HWCAP); + const uint32_t hwcaps2 = getauxval(AT_HWCAP2); + + isa.neon = is_feature_supported(hwcaps, KAI_FEATURE_HWCAP_ASIMD); + isa.dot = is_feature_supported(hwcaps, KAI_FEATURE_HWCAP_ASIMDDP); + isa.i8mm = is_feature_supported(hwcaps2, KAI_FEATURE_HWCAP2_I8MM); ++ ++#elif defined(__APPLE__) ++ isa.neon = get_sysctl_by_name("hw.optional.AdvSIMD") == 1; ++ isa.dot = get_sysctl_by_name("hw.optional.arm.FEAT_DotProd") == 1; ++ isa.i8mm = get_sysctl_by_name("hw.optional.arm.FEAT_I8MM") == 1; ++ ++#elif defined(_WIN32) ++ isa.neon = is_feature_supported(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE); ++ isa.dot = is_feature_supported(PF_ARM_V8_INSTRUCTIONS_AVAILABLE); ++ isa.i8mm = check_i8mm_support(); ++#endif ++} ++ ++static const cpu_features& get_cpu_features() { ++ static cpu_features isa; ++ static bool initialized = false; ++ ++ if (!initialized) { ++ get_cpu_features_impl(isa); ++ initialized = true; ++ } ++ return isa; +} + +typedef void (*ggml_kai_func_t)(const struct ggml_compute_params * params, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); @@ -343,8 +417,7 @@ index 00000000..9129ea99 + return false; + } + -+ cpu_features cpu; -+ get_cpu_features(cpu); ++ const cpu_features& cpu = get_cpu_features(); + + // Check whether the target platfom has i8mm and dotprod features + if(!(cpu.i8mm && cpu.dot)) { @@ -396,8 +469,7 @@ index 00000000..9129ea99 + GGML_KAI_UNUSED(k); + + // Get CPU features -+ cpu_features cpu; -+ get_cpu_features(cpu); ++ const cpu_features& cpu = get_cpu_features(); + +#if defined(__ARM_FEATURE_MATMUL_INT8) && defined(__ARM_FEATURE_DOTPROD) + if(cpu.i8mm && cpu.dot) { @@ -760,8 +832,7 @@ index 00000000..9129ea99 + const int32_t b = cur->ne[2]; + + // Temporary solution as we should check whether we can run the kleidiai matmul micro-kernels -+ cpu_features cpu; -+ get_cpu_features(cpu); ++ const cpu_features& cpu = get_cpu_features(); + + // Check whether the target platfom has i8mm and dotprod features + if(!(cpu.i8mm && cpu.dot)) { @@ -801,7 +872,7 @@ index 00000000..9129ea99 + close(g_kai_cached_weight.fd); +#endif +} -+#endif // defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__)) ++#endif // defined(__aarch64__) diff --git a/ggml-kleidiai.h b/ggml-kleidiai.h new file mode 100644 index 00000000..a4cdf1fb @@ -920,7 +991,7 @@ index d5d33c2b..84bfd3b1 100644 #if defined(GGML_USE_OPENMP) diff --git a/llama.cpp b/llama.cpp -index 05591aa4..99461995 100644 +index 05591aa4..1c63d5ec 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,8 @@ @@ -937,23 +1008,41 @@ index 05591aa4..99461995 100644 llama_mmap(const llama_mmap &) = delete; -#ifdef _POSIX_MAPPED_FILES -+#ifdef GGML_USE_KLEIDIAI ++#if !defined(GGML_USE_KLEIDIAI) && (defined(_POSIX_MAPPED_FILES) || defined(_WIN32)) + // With KleidiAI, we disable mmap to allow the backend + // to re-use the memory allocated for the weights. + // KleidiAI requires to pack the weights in a different format from the original one + // to improve the overall computational efficiency. + // However, since RAM is very limited on some devices, we want to re-use the original + // storage to avoid allocating additional memory. 
-+ static constexpr bool SUPPORTED = false; -+#elif _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; ++#else ++ static constexpr bool SUPPORTED = false; +#endif + +#ifdef _POSIX_MAPPED_FILES // list of mapped fragments (first_offset, last_offset) std::vector> mapped_fragments; -@@ -15987,6 +16000,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) { +@@ -1473,8 +1486,6 @@ struct llama_mmap { + } + } + #elif defined(_WIN32) +- static constexpr bool SUPPORTED = true; +- + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) { + GGML_UNUSED(numa); + +@@ -1535,8 +1546,6 @@ struct llama_mmap { + } + } + #else +- static constexpr bool SUPPORTED = false; +- + llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) { + GGML_UNUSED(file); + GGML_UNUSED(prefetch); +@@ -15987,6 +15996,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) { void llama_backend_free(void) { ggml_quantize_free(); diff --git a/kleidiai-examples/llama_cpp/README.md b/kleidiai-examples/llama_cpp/README.md index fa379c4..c5400d2 100644 --- a/kleidiai-examples/llama_cpp/README.md +++ b/kleidiai-examples/llama_cpp/README.md @@ -27,8 +27,8 @@ ## Prerequisities - Experience with Arm® cross-compilation on Android™ -- Proficiency with Android® shell commands -- An Android® device with an Arm® CPU with FEAT_DotProd (dotprod) and FEAT_I8MM (i8mm) features +- Proficiency with Android™ shell commands +- An Android™ device with an Arm® CPU with FEAT_DotProd (dotprod) and FEAT_I8MM (i8mm) features ## Dependencies - A laptop/PC with a Linux®-based operating system (tested on Ubuntu® 20.04.4 LTS) @@ -53,7 +53,7 @@ These KleidiAI micro-kernels were fundamental to the Cookie and Ada chatbot, whi Arm® CPUs with FEAT_DotProd (dotprod) and FEAT_I8MM (i8mm) features. -## Running llama.cpp with KleidiAI +## Running llama.cpp with KleidiAI on Android™ Connect your Android™ device to your computer and open Terminal. Then, follow the following steps to apply the patch with the KleidiAI backend on top of llama.cpp. @@ -99,15 +99,6 @@ export NDK_PATH="your-android-ndk-path" cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2a+i8mm+dotprod .. -make -j4 -``` -Build the llama.cpp project for Linux®: - -```bash -mkdir build && cd build - -cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm .. - make -j4 ``` The -DLLAMA_KLEIDIAI_CACHE=ON is used to enable the weights caching. Weights caching is a feature available in the KleidiAI backend to improve the model loading time. Since the layout of the original model weights is transformed by KleidiAI to improve the performance of the matrix-multiplication routines, this option ensures that the weights transformation only happens the first time you run the model. @@ -148,4 +139,43 @@ Run the model inference using the `llama-cli` binary using 4 CPU cores: ./llama-cli -m phi-2.Q4_0.gguf -p "Write a code in C for bubble sorting" -n 32 -t 4 ``` +## Building llama.cpp with KleidiAI for other platforms +KleidiAI can also be enabled on macOS® and Windows® on Arm® with FEAT_DotProd (dotprod) and FEAT_I8MM (i8mm) features. 
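+
+Before building for any of these platforms, it is worth confirming that the target CPU really exposes both features. The commands below are a quick, non-authoritative check (the sysctl names match the ones queried by the backend's CPU feature detection; exact output may vary by OS version):
+
+```bash
+# Linux®: the Features line should include "asimddp" (FEAT_DotProd) and "i8mm" (FEAT_I8MM)
+grep -m1 Features /proc/cpuinfo
+
+# macOS®: both sysctls should report 1
+sysctl hw.optional.arm.FEAT_DotProd hw.optional.arm.FEAT_I8MM
+```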
+
+### Linux®:
+
+```bash
+mkdir build && cd build
+
+cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm ..
+
+make -j4
+```
+
+### macOS®:
+
+```bash
+mkdir build && cd build
+
+# -DLLAMA_METAL=OFF disables the Metal GPU backend so that inference runs on the CPU
+cmake -DLLAMA_KLEIDIAI=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2-a+i8mm+dotprod -DLLAMA_METAL=OFF ..
+
+make -j4
+```
+
+### Windows® on Arm®:
+
+- Install [Visual Studio 2022](https://visualstudio.microsoft.com/de/vs/community/)
+- Install the required components in the Visual Studio Installer:
+  - Workloads tab: Desktop development with C++
+  - Individual Components tab (search for these components): C++ CMake Tools for Windows®, Git for Windows®, C++ Clang Compiler for Windows®, MSBuild Support for LLVM-Toolset (clang)
+- Environment setup:
+  - If the host machine is x86-based, use the integrated Developer Command Prompt / PowerShell in VS2022 for building and testing.
+  - If the host machine is Arm64-based, use the system's cmd and set the environment variables by running `"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" arm64`, as the integrated Developer Command Prompt / PowerShell in VS2022 is meant for x86.
+
+```cmd
+cmake --preset arm64-windows-llvm-release -D LLAMA_KLEIDIAI=ON -D KLEIDIAI_BUILD_TESTS=OFF -D LLAMA_OPENMP=OFF
+cmake --build build-arm64-windows-llvm-release
+```
+
+The LLAMA_KLEIDIAI_CACHE and KLEIDIAI_BUILD_TESTS options are disabled on Windows®, as they are currently not supported there. Please also use the LLVM preset, as building with MSVC is not supported either.
+
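+To check that a build works, you can run the same inference test as on Android™. Below is a minimal sketch for the Windows® build; the model is the phi-2.Q4_0.gguf file from the earlier steps, and the binary path is assumed from the preset's default output directory, so adjust it to your setup:
+
+```cmd
+build-arm64-windows-llvm-release\bin\llama-cli.exe -m phi-2.Q4_0.gguf -p "Write a code in C for bubble sorting" -n 32 -t 4
+```
+
 That’s all for this guide!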