Skip to content

Commit

Permalink
Merge branch 'main' into clamp
Browse files Browse the repository at this point in the history
  • Loading branch information
jdemel authored Nov 5, 2023
2 parents fa95bec + d5b317c commit 8b953f5
Show file tree
Hide file tree
Showing 69 changed files with 775 additions and 1,592 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
compiler:
compiler:
- { name: g++-7, cc: gcc-7, cxx: g++-7, distro: ubuntu-20.04 }
- { name: g++-8, cc: gcc-8, cxx: g++-8, distro: ubuntu-20.04 }
- { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-latest }
Expand All @@ -33,7 +33,7 @@ jobs:
# - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-latest }

runs-on: ${{ matrix.compiler.distro }}

steps:
- uses: actions/[email protected]
with:
Expand All @@ -44,7 +44,7 @@ jobs:
env:
CC: ${{ matrix.compiler.cc }}
CXX: ${{ matrix.compiler.cxx }}
run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" ..
run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
- name: Build
run: |
echo "Build with $(nproc) thread(s)"
Expand Down Expand Up @@ -107,14 +107,14 @@ jobs:
- arch: riscv64
distro: ubuntu22.04
compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 }

steps:
- uses: actions/[email protected]
with:
submodules: 'recursive'
- uses: uraimo/[email protected]
name: Build in non-x86 container
continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x"]'), matrix.arch) || contains(fromJson('["clang-14"]'), matrix.compiler.name) }}
continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x"]'), matrix.arch) }}
id: build
with:
arch: ${{ matrix.arch }}
Expand Down Expand Up @@ -150,7 +150,7 @@ jobs:
run: |
cd /volk
cd build
cmake -DCMAKE_CXX_FLAGS="-Werror" ..
cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
echo "Build with $(nproc) thread(s)"
make -j$(nproc)
./cpu_features/list_cpu_features
Expand All @@ -173,7 +173,7 @@ jobs:
- name: dependencies
run: sudo apt install python3-mako liborc-dev
- name: configure
run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True ..
run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON ..
- name: build
run: cmake --build build -j$(nproc)
- name: Print info
Expand Down Expand Up @@ -206,7 +206,7 @@ jobs:

# build-windows-msys2:
# name: Build on windows-latest using MinGW and MSYS2

# runs-on: windows-latest
# steps:
# - uses: msys2/setup-msys2@v2
Expand All @@ -231,7 +231,7 @@ jobs:
# - name: Build
# shell: msys2 {0}
# run: cd build && make -j$(nproc)
# - name: Test
# - name: Test
# shell: msys2 {0}
# run: |
# cd build
Expand All @@ -248,7 +248,7 @@ jobs:
- name: dependencies
run: pip3 install mako
- name: configure
run: mkdir build && cd build && cmake ..
run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON ..
- name: build
run: cmake --build build --config Debug -j3
- name: Print info
Expand Down
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#
# Copyright 2011-2020 Free Software Foundation, Inc.
# Copyright 2023 Magnus Lundmark <[email protected]>
#
# This file is part of VOLK
#
Expand Down Expand Up @@ -144,6 +145,7 @@ if (VOLK_CPU_FEATURES)
FORCE)
set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
set(BUILD_SHARED_LIBS OFF)
set(ENABLE_INSTALL OFF)
add_subdirectory(cpu_features)
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
endif()
Expand Down Expand Up @@ -248,6 +250,7 @@ install(FILES
${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Expand Down
4 changes: 2 additions & 2 deletions cmake/Modules/VolkBuildTypes.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,9 @@ endif(NOT WIN32)
# NOTE: This is not defined on Windows systems.
########################################################################
if(NOT WIN32)
SET(CMAKE_CXX_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fno-omit-frame-pointer" CACHE STRING
SET(CMAKE_CXX_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" CACHE STRING
"Flags used by the C++ compiler during Address Sanitized builds." FORCE)
SET(CMAKE_C_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fno-omit-frame-pointer" CACHE STRING
SET(CMAKE_C_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" CACHE STRING
"Flags used by the C compiler during Address Sanitized builds." FORCE)
MARK_AS_ADVANCED(
CMAKE_CXX_FLAGS_ASAN
Expand Down
46 changes: 23 additions & 23 deletions gen/archs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,6 @@ at the top, as a last resort.
<flag compiler="clang">-mfloat-abi=hard</flag>
</arch>

<arch name="neon">
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>

<arch name="neonv7">
<flag compiler="gnu">-mfpu=neon</flag>
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-mfpu=neon</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>

<arch name="neonv8">
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>

<arch name="32">
<flag compiler="gnu">-m32</flag>
<flag compiler="clang">-m32</flag>
Expand Down Expand Up @@ -105,6 +82,29 @@ at the top, as a last resort.
<arch name="norc">
</arch>

<arch name="neon">
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>

<arch name="neonv7">
<flag compiler="gnu">-mfpu=neon</flag>
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-mfpu=neon</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>

<arch name="neonv8">
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>

<arch name="sse3">
<check name="sse3"></check>
<flag compiler="gnu">-msse3</flag>
Expand Down
2 changes: 1 addition & 1 deletion gen/machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
</machine>

<machine name="neonv8">
<archs>generic neon neonv8</archs>
<archs>generic neon neonv8 orc|</archs>
</machine>

<!-- trailing | bar means generate without either for MSVC -->
Expand Down
3 changes: 2 additions & 1 deletion gen/volk_kernel_defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ def __init__(self, kernel_file):
kern_name=self.name, header=sub_hdr, body=body,
))
assert(self._impls)
if "generic" not in [impl.name for impl in self._impls]:
raise Exception(f"{self.name} does not have a generic protokernel.")
self.has_dispatcher = False
for impl in self._impls:
if impl.name == 'dispatcher':
Expand Down Expand Up @@ -194,4 +196,3 @@ def __repr__(self):

if __name__ == '__main__':
print(kernels)

50 changes: 50 additions & 0 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/* -*- c++ -*- */
/*
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold helper functions that are built from
* AVX2 and FMA intrinsics. Use them in VOLK kernels instead of
* copy-pasting the same intrinsic sequences.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#include <immintrin.h>

/*
 * Polynomial approximation of arctan(x) for eight packed floats,
 * intended for inputs on the interval [-1, 1].
 *
 * Maximum relative error ~6.5e-7
 *
 * Only odd powers of x appear, so the polynomial is evaluated in x * x
 * with Horner's method (one fused multiply-add per coefficient) and the
 * result is multiplied by x once at the end.
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients of x^13, x^11, ..., x^1, i.e. in the order Horner consumes them. */
    const float coeffs[7] = { +0x1.01a37cp-7f, -0x1.2f3004p-5f, +0x1.5785aap-4f,
                              -0x1.1436ap-3f,  +0x1.972be6p-3f, -0x1.55437p-2f,
                              +0x1.ffffeap-1f };
    const __m256 x_sq = _mm256_mul_ps(x, x);
    __m256 acc = _mm256_set1_ps(coeffs[0]);
    int i;

    for (i = 1; i < 7; i++) {
        /* acc = x^2 * acc + next coefficient */
        acc = _mm256_fmadd_ps(x_sq, acc, _mm256_set1_ps(coeffs[i]));
    }

    return _mm256_mul_ps(x, acc);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
38 changes: 38 additions & 0 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -16,6 +17,43 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Polynomial approximation of arctan(x) for eight packed floats,
 * intended for inputs on the interval [-1, 1].
 *
 * Maximum relative error ~6.5e-7
 *
 * Only odd powers of x appear, so the polynomial is evaluated in x * x
 * with Horner's method (a separate multiply and add per coefficient) and
 * the result is multiplied by x once at the end.
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    /* Coefficients of x^13, x^11, ..., x^1, i.e. in the order Horner consumes them. */
    const float coeffs[7] = { +0x1.01a37cp-7f, -0x1.2f3004p-5f, +0x1.5785aap-4f,
                              -0x1.1436ap-3f,  +0x1.972be6p-3f, -0x1.55437p-2f,
                              +0x1.ffffeap-1f };
    const __m256 x_sq = _mm256_mul_ps(x, x);
    __m256 acc = _mm256_set1_ps(coeffs[0]);
    int i;

    for (i = 1; i < 7; i++) {
        /* acc = x^2 * acc + next coefficient, as two distinct operations */
        acc = _mm256_mul_ps(x_sq, acc);
        acc = _mm256_add_ps(acc, _mm256_set1_ps(coeffs[i]));
    }

    return _mm256_mul_ps(x, acc);
}

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
__m256 yl, yh, tmp1, tmp2;
Expand Down
47 changes: 46 additions & 1 deletion include/volk/volk_common.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand Down Expand Up @@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
// Constant used to do log10 calculations as faster log2
////////////////////////////////////////////////////////////////////////
// precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
#define volk_log2to10factor 3.01029995663981209120
#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120

////////////////////////////////////////////////////////////////////////
// arctan(x)
////////////////////////////////////////////////////////////////////////
static inline float volk_arctan_poly(const float x)
{
/*
* arctan(x) polynomial expansion on the interval [-1, 1]
* Maximum relative error < 6.6e-7
*/
const float a1 = +0x1.ffffeap-1f;
const float a3 = -0x1.55437p-2f;
const float a5 = +0x1.972be6p-3f;
const float a7 = -0x1.1436ap-3f;
const float a9 = +0x1.5785aap-4f;
const float a11 = -0x1.2f3004p-5f;
const float a13 = +0x1.01a37cp-7f;

const float x_times_x = x * x;
float arctan = a13;
arctan = fmaf(x_times_x, arctan, a11);
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
arctan = fmaf(x_times_x, arctan, a5);
arctan = fmaf(x_times_x, arctan, a3);
arctan = fmaf(x_times_x, arctan, a1);
arctan *= x;

return arctan;
}

static inline float volk_arctan(const float x)
{
    /*
     * Approximate arctan(x) for any float x.
     *
     * For |x| < 1 the polynomial approximation is used directly.
     * Otherwise the argument is folded back using the identity
     *
     *   arctan(x) + arctan(1 / x) == sign(x) * pi / 2
     *
     * so the polynomial is called with 1 / x instead of x.
     */
    const float pi_over_2 = 0x1.921fb6p0f;

    /*
     * fabsf keeps the comparison in single precision, matching the other
     * float math calls here; the outcome is the same as with fabs because
     * converting a float to double is exact.
     */
    if (fabsf(x) < 1.f) {
        return volk_arctan_poly(x);
    }

    return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x);
}

#endif /*INCLUDED_LIBVOLK_COMMON_H*/
Loading

0 comments on commit 8b953f5

Please sign in to comment.