Merge branch 'main' into clamp

gnuradio · Nov 5, 2023 · 8b953f5 · 8b953f5
2 parents fa95bec + d5b317c
commit 8b953f5
Show file tree

Hide file tree

Showing 69 changed files with 775 additions and 1,592 deletions.
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        compiler: 
+        compiler:
           - { name: g++-7, cc: gcc-7, cxx: g++-7, distro: ubuntu-20.04 }
           - { name: g++-8, cc: gcc-8, cxx: g++-8, distro: ubuntu-20.04 }
           - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-latest }
@@ -33,7 +33,7 @@ jobs:
           # - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-latest }
 
     runs-on: ${{ matrix.compiler.distro }}
-    
+
     steps:
     - uses: actions/[email protected]
       with:
@@ -44,7 +44,7 @@ jobs:
       env:
         CC: ${{ matrix.compiler.cc }}
         CXX: ${{ matrix.compiler.cxx }}
-      run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" ..
+      run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
     - name: Build
       run: |
         echo "Build with $(nproc) thread(s)"
@@ -107,14 +107,14 @@ jobs:
           - arch: riscv64
             distro: ubuntu22.04
             compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 }
-    
+
     steps:
       - uses: actions/[email protected]
         with:
           submodules: 'recursive'
       - uses: uraimo/[email protected]
         name: Build in non-x86 container
-        continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x"]'), matrix.arch) || contains(fromJson('["clang-14"]'), matrix.compiler.name) }}
+        continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x"]'), matrix.arch) }}
         id: build
         with:
           arch: ${{ matrix.arch }}
@@ -150,7 +150,7 @@ jobs:
           run: |
             cd /volk
             cd build
-            cmake -DCMAKE_CXX_FLAGS="-Werror" ..
+            cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
             echo "Build with $(nproc) thread(s)"
             make -j$(nproc)
             ./cpu_features/list_cpu_features
@@ -173,7 +173,7 @@ jobs:
     - name: dependencies
       run: sudo apt install python3-mako liborc-dev
     - name: configure
-      run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True ..
+      run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON ..
     - name: build
       run: cmake --build build -j$(nproc)
     - name: Print info
@@ -206,7 +206,7 @@ jobs:
 
   # build-windows-msys2:
   #   name: Build on windows-latest using MinGW and MSYS2
-    
+
   #   runs-on: windows-latest
   #   steps:
   #     - uses: msys2/setup-msys2@v2
@@ -231,7 +231,7 @@ jobs:
   #     - name: Build
   #       shell: msys2 {0}
   #       run: cd build && make -j$(nproc)
-  #     - name: Test 
+  #     - name: Test
   #       shell: msys2 {0}
   #       run: |
   #         cd build
@@ -248,7 +248,7 @@ jobs:
     - name: dependencies
       run: pip3 install mako
     - name: configure
-      run: mkdir build && cd build && cmake ..
+      run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON ..
     - name: build
       run: cmake --build build --config Debug -j3
     - name: Print info

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,6 @@
 #
 # Copyright 2011-2020 Free Software Foundation, Inc.
+# Copyright 2023 Magnus Lundmark <[email protected]>
 #
 # This file is part of VOLK
 #
@@ -144,6 +145,7 @@ if (VOLK_CPU_FEATURES)
       FORCE)
     set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
     set(BUILD_SHARED_LIBS OFF)
+    set(ENABLE_INSTALL OFF)
     add_subdirectory(cpu_features)
     set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
   endif()
@@ -248,6 +250,7 @@ install(FILES
     ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h

diff --git a/cmake/Modules/VolkBuildTypes.cmake b/cmake/Modules/VolkBuildTypes.cmake
@@ -187,9 +187,9 @@ endif(NOT WIN32)
 # NOTE: This is not defined on Windows systems.
 ########################################################################
 if(NOT WIN32)
-  SET(CMAKE_CXX_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fno-omit-frame-pointer" CACHE STRING
+  SET(CMAKE_CXX_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" CACHE STRING
     "Flags used by the C++ compiler during Address Sanitized builds." FORCE)
-  SET(CMAKE_C_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fno-omit-frame-pointer" CACHE STRING
+  SET(CMAKE_C_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" CACHE STRING
     "Flags used by the C compiler during Address Sanitized builds." FORCE)
   MARK_AS_ADVANCED(
     CMAKE_CXX_FLAGS_ASAN

diff --git a/cpu_features b/cpu_features
diff --git a/gen/archs.xml b/gen/archs.xml
@@ -19,29 +19,6 @@ at the top, as a last resort.
   <flag compiler="clang">-mfloat-abi=hard</flag>
 </arch>
 
-<arch name="neon">
-  <flag compiler="gnu">-funsafe-math-optimizations</flag>
-  <flag compiler="clang">-funsafe-math-optimizations</flag>
-  <alignment>16</alignment>
-  <check name="neon"></check>
-</arch>
-
-<arch name="neonv7">
-  <flag compiler="gnu">-mfpu=neon</flag>
-  <flag compiler="gnu">-funsafe-math-optimizations</flag>
-  <flag compiler="clang">-mfpu=neon</flag>
-  <flag compiler="clang">-funsafe-math-optimizations</flag>
-  <alignment>16</alignment>
-  <check name="neon"></check>
-</arch>
-
-<arch name="neonv8">
-  <flag compiler="gnu">-funsafe-math-optimizations</flag>
-  <flag compiler="clang">-funsafe-math-optimizations</flag>
-  <alignment>16</alignment>
-  <check name="neon"></check>
-</arch>
-
 <arch name="32">
   <flag compiler="gnu">-m32</flag>
   <flag compiler="clang">-m32</flag>
@@ -105,6 +82,29 @@ at the top, as a last resort.
 <arch name="norc">
 </arch>
 
+<arch name="neon">
+  <flag compiler="gnu">-funsafe-math-optimizations</flag>
+  <flag compiler="clang">-funsafe-math-optimizations</flag>
+  <alignment>16</alignment>
+  <check name="neon"></check>
+</arch>
+
+<arch name="neonv7">
+  <flag compiler="gnu">-mfpu=neon</flag>
+  <flag compiler="gnu">-funsafe-math-optimizations</flag>
+  <flag compiler="clang">-mfpu=neon</flag>
+  <flag compiler="clang">-funsafe-math-optimizations</flag>
+  <alignment>16</alignment>
+  <check name="neon"></check>
+</arch>
+
+<arch name="neonv8">
+  <flag compiler="gnu">-funsafe-math-optimizations</flag>
+  <flag compiler="clang">-funsafe-math-optimizations</flag>
+  <alignment>16</alignment>
+  <check name="neon"></check>
+</arch>
+
 <arch name="sse3">
   <check name="sse3"></check>
   <flag compiler="gnu">-msse3</flag>

diff --git a/gen/machines.xml b/gen/machines.xml
@@ -13,7 +13,7 @@
 </machine>
 
 <machine name="neonv8">
-<archs>generic neon neonv8</archs>
+<archs>generic neon neonv8 orc|</archs>
 </machine>
 
 <!-- trailing | bar means generate without either for MSVC -->

diff --git a/gen/volk_kernel_defs.py b/gen/volk_kernel_defs.py
@@ -162,6 +162,8 @@ def __init__(self, kernel_file):
                     kern_name=self.name, header=sub_hdr, body=body,
                 ))
         assert(self._impls)
+        if "generic" not in [impl.name for impl in self._impls]:
+            raise Exception(f"{self.name} does not have a generic protokernel.")
         self.has_dispatcher = False
         for impl in self._impls:
             if impl.name == 'dispatcher':
@@ -194,4 +196,3 @@ def __repr__(self):
 
 if __name__ == '__main__':
     print(kernels)
-
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
@@ -0,0 +1,50 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2023 Magnus Lundmark <[email protected]>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold helper functions composed of AVX2 FMA intrinsics.
+ * They should be used in VOLK kernels to avoid copy-pasting intrinsic sequences.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#include <immintrin.h>
+
+/*
+ * Approximate arctan(x) for every packed float in x with an odd degree-13
+ * polynomial; the approximation is meant for inputs on the interval [-1, 1].
+ *
+ * Maximum relative error ~6.5e-7
+ * Evaluated in x*x via Horner's method using fused multiply-adds, then scaled by x
+ */
+static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
+{
+    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x);
+    __m256 arctan;
+    arctan = a13;
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan);
+
+    return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -16,6 +17,43 @@
 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * Approximate arctan(x) for every packed float in x with an odd degree-13
+ * polynomial; the approximation is meant for inputs on the interval [-1, 1].
+ *
+ * Maximum relative error ~6.5e-7
+ * Evaluated in x*x via Horner's method using separate multiplies and adds (no FMA)
+ */
+static inline __m256 _m256_arctan_poly_avx(const __m256 x)
+{
+    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x);
+    __m256 arctan;
+    arctan = a13;
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a11);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a9);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a7);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a5);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a3);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan);
+
+    return arctan;
+}
+
 static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
 {
     __m256 yl, yh, tmp1, tmp2;

diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
 // Constant used to do log10 calculations as faster log2
 ////////////////////////////////////////////////////////////////////////
 // precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
-#define volk_log2to10factor 3.01029995663981209120
+#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120
+
+////////////////////////////////////////////////////////////////////////
+// arctan(x)
+////////////////////////////////////////////////////////////////////////
+static inline float volk_arctan_poly(const float x)
+{
+    /*
+     * Odd degree-13 polynomial approximation of arctan(x) on the interval [-1, 1],
+     * evaluated in x*x via Horner's method; maximum relative error < 6.6e-7
+     */
+    const float a1 = +0x1.ffffeap-1f;
+    const float a3 = -0x1.55437p-2f;
+    const float a5 = +0x1.972be6p-3f;
+    const float a7 = -0x1.1436ap-3f;
+    const float a9 = +0x1.5785aap-4f;
+    const float a11 = -0x1.2f3004p-5f;
+    const float a13 = +0x1.01a37cp-7f;
+
+    const float x_times_x = x * x;
+    float arctan = a13;
+    arctan = fmaf(x_times_x, arctan, a11);
+    arctan = fmaf(x_times_x, arctan, a9);
+    arctan = fmaf(x_times_x, arctan, a7);
+    arctan = fmaf(x_times_x, arctan, a5);
+    arctan = fmaf(x_times_x, arctan, a3);
+    arctan = fmaf(x_times_x, arctan, a1);
+    arctan *= x;
+
+    return arctan;
+}
+
+static inline float volk_arctan(const float x)
+{
+    /* Full-range arctan(x): volk_arctan_poly() handles |x| < 1 directly; otherwise
+     * arctan(x) + arctan(1 / x) == sign(x) * pi / 2 maps the argument into [-1, 1].
+     */
+    const float pi_over_2 = 0x1.921fb6p0f;
+
+    if (fabsf(x) < 1.f) {
+        return volk_arctan_poly(x);
+    } else {
+        return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x);
+    }
+}
 
 #endif /*INCLUDED_LIBVOLK_COMMON_H*/
+9 −4		.dockerignore
+1 −0		.github/workflows/Dockerfile
+26 −0		.github/workflows/aarch64_linux_bazel.yml
+1 −1		.github/workflows/amd64_freebsd_cmake.yml
+9 −9		.github/workflows/amd64_linux_bazel.yml
+35 −0		.github/workflows/amd64_macos_bazel.yml
+1 −1		.github/workflows/clang_format.yml
+57 −8		BUILD.bazel
+76 −43		CMakeLists.txt
+39 −29		README.md
+122 −0		bazel/ci/Makefile
+37 −0		bazel/ci/docker/Dockerfile
+7 −0		bazel/platforms.bzl
+1 −1		cmake/README.md
+4 −0		include/cpu_features_macros.h
+42 −0		include/cpuinfo_aarch64.h
+77 −0		include/cpuinfo_loongarch.h
+7 −0		include/cpuinfo_x86.h
+37 −0		include/internal/hwcaps.h
+61 −46		scripts/generate_badges.d
+118 −0		src/impl_aarch64__base_implementation.inl
+1 −75		src/impl_aarch64_linux_or_android.c
+88 −0		src/impl_aarch64_macos_or_iphone.c
+89 −0		src/impl_loongarch_linux.c
+15 −1		src/impl_x86__base_implementation.inl
+9 −0		src/utils/list_cpu_features.c
+17 −2		test/CMakeLists.txt
+129 −17		test/cpuinfo_aarch64_test.cc
+179 −0		test/cpuinfo_loongarch_test.cc
+61 −0		test/cpuinfo_x86_test.cc