From 9f5354e8fb836c6d006689cb5ae97c40428ff611 Mon Sep 17 00:00:00 2001 From: Clayton Smith Date: Thu, 9 Nov 2023 10:26:03 -0500 Subject: [PATCH] Use pointers to pass in s32fc arguments This avoids undefined behaviour arising from incompatibility between complex numbers in C and C++. Signed-off-by: Clayton Smith --- .github/workflows/run-tests.yml | 5 +- docs/kernels.dox | 3 + kernels/volk/volk_32fc_s32fc_multiply2_32fc.h | 413 +++++++++ kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 260 +----- .../volk_32fc_s32fc_rotator2puppet_32fc.h | 173 ++++ .../volk/volk_32fc_s32fc_rotatorpuppet_32fc.h | 169 ---- .../volk/volk_32fc_s32fc_x2_rotator2_32fc.h | 782 ++++++++++++++++++ .../volk/volk_32fc_s32fc_x2_rotator_32fc.h | 617 +------------- ...fc_x2_s32fc_multiply_conjugate_add2_32fc.h | 344 ++++++++ ...2fc_x2_s32fc_multiply_conjugate_add_32fc.h | 198 +---- lib/kernel_tests.h | 8 +- lib/qa_utils.cc | 6 +- lib/qa_utils.h | 6 +- tmpl/volk.tmpl.h | 8 +- 14 files changed, 1772 insertions(+), 1220 deletions(-) create mode 100644 kernels/volk/volk_32fc_s32fc_multiply2_32fc.h create mode 100644 kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h delete mode 100644 kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h create mode 100644 kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h create mode 100644 kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index f0581eeaf..0b79cb82b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -114,7 +114,6 @@ jobs: submodules: 'recursive' - uses: uraimo/run-on-arch-action@v2.5.0 name: Build in non-x86 container - continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x"]'), matrix.arch) }} id: build with: arch: ${{ matrix.arch }} @@ -153,7 +152,9 @@ jobs: cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. echo "Build with $(nproc) thread(s)" make -j$(nproc) - ./cpu_features/list_cpu_features + if [ -f ./cpu_features/list_cpu_features ]; then + ./cpu_features/list_cpu_features + fi ./apps/volk-config-info --alignment ./apps/volk-config-info --avail-machines ./apps/volk-config-info --all-machines diff --git a/docs/kernels.dox b/docs/kernels.dox index e6930ad2f..f67e44440 100644 --- a/docs/kernels.dox +++ b/docs/kernels.dox @@ -50,7 +50,9 @@ \li \subpage volk_32f_cos_32f \li \subpage volk_32fc_s32f_atan2_32f \li \subpage volk_32fc_s32fc_multiply_32fc +\li \subpage volk_32fc_s32fc_multiply2_32fc \li \subpage volk_32fc_s32fc_x2_rotator_32fc +\li \subpage volk_32fc_s32fc_x2_rotator2_32fc \li \subpage volk_32fc_s32f_deinterleave_real_16i \li \subpage volk_32fc_s32f_magnitude_16i \li \subpage volk_32fc_s32f_power_32fc @@ -63,6 +65,7 @@ \li \subpage volk_32fc_x2_multiply_32fc \li \subpage volk_32fc_x2_multiply_conjugate_32fc \li \subpage volk_32fc_x2_s32fc_multiply_conjugate_add_32fc +\li \subpage volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc \li \subpage volk_32fc_x2_s32f_square_dist_scalar_mult_32f \li \subpage volk_32fc_x2_square_dist_32f \li \subpage volk_32f_exp_32f diff --git a/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h new file mode 100644 index 000000000..c1704041f --- /dev/null +++ b/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h @@ -0,0 +1,413 @@ +/* -*- c++ -*- */ +/* + * Copyright 2012, 2014 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32fc_s32fc_multiply2_32fc + * + * \b Overview + * + * Multiplies the input complex vector by a complex scalar and returns + * the results. + * + * Dispatcher Prototype + * \code + * void volk_32fc_s32fc_multiply2_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const + * lv_32fc_t* scalar, unsigned int num_points); \endcode + * + * \b Inputs + * \li aVector: The input vector to be multiplied. + * \li scalar: The complex scalar to multiply against aVector. + * \li num_points: The number of complex values in aVector. + * + * \b Outputs + * \li cVector: The vector where the results will be stored. + * + * \b Example + * Generate points around the unit circle and shift the phase pi/3 rad. + * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t scalar = lv_cmake((float)std::cos(M_PI/3.f), (float)std::sin(M_PI/3.f)); + * + * float delta = 2.f*M_PI / (float)N; + * for(unsigned int ii = 0; ii < N/2; ++ii){ + * // Generate points around the unit circle + * float real = std::cos(delta * (float)ii); + * float imag = std::sin(delta * (float)ii); + * in[ii] = lv_cmake(real, imag); + * in[ii+N/2] = lv_cmake(-real, -imag); + * } + * + * volk_32fc_s32fc_multiply2_32fc(out, in, &scalar, N); + * + * printf(" mag phase | mag phase\n"); + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("%+1.2f %+1.2f | %+1.2f %+1.2f\n", + * std::abs(in[ii]), std::arg(in[ii]), + * std::abs(out[ii]), std::arg(out[ii])); + * } + * + * volk_free(in); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H + +#include +#include +#include +#include + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_u_avx_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = _mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = x; + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_fmaddsub_ps( + tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX && LV_HAVE_FMA */ + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = _mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(*scalar)); + yh = _mm_set_ps1(lv_cimag(*scalar)); + + for (; number < halfPoints; number++) { + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = (*a) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32fc_s32fc_multiply2_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8) { + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * (*scalar); +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply2_32fc_u_H */ +#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H + +#include +#include +#include +#include + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_a_avx_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = _mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = x; + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_fmaddsub_ps( + tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_store_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX && LV_HAVE_FMA */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = _mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_store_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(*scalar)); + yh = _mm_set_ps1(lv_cimag(*scalar)); + + for (; number < halfPoints; number++) { + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c, z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = (*a) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_NEON +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t a_val, scalar_val; + float32x4x2_t tmp_imag; + + scalar_val.val[0] = vld1q_dup_f32((const float*)scalar); + scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1); + for (number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)aPtr); + tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); + + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]); + tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]); + + vst2q_f32((float*)cPtr, tmp_imag); + aPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = *aPtr++ * (*scalar); + } +} +#endif /* LV_HAVE_NEON */ + +#endif /* INCLUDED_volk_32fc_x2_multiply2_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h index 1593b7cbe..ab1bc2be9 100644 --- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h @@ -66,137 +66,39 @@ #include #include #include +#include #include #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = x; - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_fmaddsub_ps( - tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_storeu_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_u_avx_fma(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ #ifdef LV_HAVE_AVX -#include static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_storeu_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_u_avx(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for (; number < halfPoints; number++) { - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c, z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = (*a) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_u_sse3(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ @@ -207,26 +109,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8) { - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; + volk_32fc_s32fc_multiply2_32fc_generic(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -241,172 +124,47 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, #include #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = x; - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_fmaddsub_ps( - tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_store_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_a_avx_fma(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ #ifdef LV_HAVE_AVX -#include static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_store_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_a_avx(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for (; number < halfPoints; number++) { - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c, z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = (*a) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_a_sse3(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_NEON -#include static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - unsigned int quarter_points = num_points / 4; - - float32x4x2_t a_val, scalar_val; - float32x4x2_t tmp_imag; - - scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); - scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); - for (number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)aPtr); - tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); - tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); - - tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]); - tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]); - - vst2q_f32((float*)cPtr, tmp_imag); - aPtr += 4; - cPtr += 4; - } - - for (number = quarter_points * 4; number < num_points; number++) { - *cPtr++ = *aPtr++ * scalar; - } + volk_32fc_s32fc_multiply2_32fc_neon(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_NEON */ diff --git a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h new file mode 100644 index 000000000..3ce071ca4 --- /dev/null +++ b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h @@ -0,0 +1,173 @@ +/* -*- c++ -*- */ +/* + * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + + +#ifndef INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H + + +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_generic(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_generic( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_NEON +#include +#include + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_neon(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_neon( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_NEON */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_a_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_u_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_a_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_a_avx( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_u_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_u_avx( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_AVX */ + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_a_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ + + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ + +#endif /* INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h deleted file mode 100644 index e328a3112..000000000 --- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h +++ /dev/null @@ -1,169 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. - * - * This file is part of VOLK - * - * SPDX-License-Identifier: LGPL-3.0-or-later - */ - - -#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H -#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H - - -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_generic( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_GENERIC */ - - -#ifdef LV_HAVE_NEON -#include -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_neon( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_NEON */ - - -#ifdef LV_HAVE_SSE4_1 -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_SSE4_1 -#include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_AVX -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_avx( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX */ - - -#ifdef LV_HAVE_AVX -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_avx( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX */ - -#if LV_HAVE_AVX && LV_HAVE_FMA -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ - - -#if LV_HAVE_AVX && LV_HAVE_FMA -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ - -#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h new file mode 100644 index 000000000..bee1f0682 --- /dev/null +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h @@ -0,0 +1,782 @@ +/* -*- c++ -*- */ +/* + * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32fc_s32fc_x2_rotator2_32fc + * + * \b Overview + * + * Rotate input vector at fixed rate per sample from initial phase + * offset. + * + * Dispatcher Prototype + * \code + * void volk_32fc_s32fc_x2_rotator2_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, + * const lv_32fc_t* phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode + * + * \b Inputs + * \li inVector: Vector to be rotated. + * \li phase_inc: rotational velocity (scalar, input). + * \li phase: phase offset (scalar, input & output). + * \li num_points: The number of values in inVector to be rotated and stored into + * outVector. + * + * \b Outputs + * \li outVector: The vector where the results will be stored. + * + * \b Example + * Generate a tone at f=0.3 (normalized frequency) and use the rotator with + * f=0.1 to shift the tone to f=0.4. Change this example to start with a DC + * tone (initialize in with lv_cmake(1, 0)) to observe rotator signal generation. + * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * // Generate a tone at f=0.3 + * float real = std::cos(0.3f * (float)ii); + * float imag = std::sin(0.3f * (float)ii); + * in[ii] = lv_cmake(real, imag); + * } + * // The oscillator rotates at f=0.1 + * float frequency = 0.1f; + * lv_32fc_t phase_increment = lv_cmake(std::cos(frequency), std::sin(frequency)); + * lv_32fc_t phase= lv_cmake(1.f, 0.0f); // start at 1 (0 rad phase) + * + * // rotate so the output is a tone at f=0.4 + * volk_32fc_s32fc_x2_rotator2_32fc(out, in, &phase_increment, &phase, N); + * + * // print results for inspection + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %+1.2f %+1.2fj\n", + * ii, lv_creal(out[ii]), lv_cimag(out[ii])); + * } + * + * volk_free(in); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H + + +#include +#include +#include +#include +#define ROTATOR_RELOAD 512 +#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2) +#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4) + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + unsigned int i = 0; + int j = 0; + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= *phase_inc; + } + + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + } + for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= *phase_inc; + } + if (i) { + // Make sure, we normalize phase on every call! + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + } +} + +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_NEON +#include +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) + +{ + lv_32fc_t* outputVectorPtr = outVector; + const lv_32fc_t* inputVectorPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) }; + float32x4x2_t input_vec; + float32x4x2_t output_vec; + + unsigned int i = 0, j = 0; + // const unsigned int quarter_points = num_points / 4; + + for (i = 0; i < 4; ++i) { + phasePtr[i] *= incr; + incr *= (*phase_inc); + } + + // Notice that incr has be incremented in the previous loop + const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr }; + const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr); + float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; j++) { + input_vec = vld2q_f32((float*)inputVectorPtr); + // Prefetch next one, speeds things up + __VOLK_PREFETCH(inputVectorPtr + 4); + // Rotate + output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); + // Increase phase + phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); + // Store output + vst2q_f32((float*)outputVectorPtr, output_vec); + + outputVectorPtr += 4; + inputVectorPtr += 4; + } + // normalize phase so magnitude doesn't grow because of + // floating point rounding error + const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); + const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); + // Multiply complex with real + phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); + phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); + } + + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) { + input_vec = vld2q_f32((float*)inputVectorPtr); + // Prefetch next one, speeds things up + __VOLK_PREFETCH(inputVectorPtr + 4); + // Rotate + output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); + // Increase phase + phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); + // Store output + vst2q_f32((float*)outputVectorPtr, output_vec); + + outputVectorPtr += 4; + inputVectorPtr += 4; + } + // if(i) == true means we looped above + if (i) { + // normalize phase so magnitude doesn't grow because of + // floating point rounding error + const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); + const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); + // Multiply complex with real + phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); + phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); + } + // Store current phase + vst2q_f32((float*)phasePtr, phase_vec); + + // Deal with the rest + for (i = 0; i < num_points % 4; i++) { + *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0]; + phasePtr[0] *= (*phase_inc); + } + + // For continuous phase next time we need to call this function + (*phase) = phasePtr[0]; +} + +#endif /* LV_HAVE_NEON */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 2; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm_loadu_ps((float*)phase_Ptr); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_2; ++j) { + + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + if (i) { + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + + _mm_storeu_ps((float*)phase_Ptr, phase_Val); + if (num_points & 1) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_SSE4_1 for aligned */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 2; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); + printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); + printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ + __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm_loadu_ps((float*)phase_Ptr); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_2; ++j) { + + aVal = _mm_loadu_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_storeu_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { + aVal = _mm_loadu_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_storeu_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + if (i) { + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + + _mm_storeu_ps((float*)phase_Ptr, phase_Val); + if (num_points & 1) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = lv_cmake(1.0f, 0.0f); + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, z; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + + const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_load_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + phase_Val = _mm256_normalize_ps(phase_Val); + } + + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_load_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + phase_Val = _mm256_normalize_ps(phase_Val); + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_generic( + cPtr, aPtr, phase_inc, phase, num_points % 4); +} + +#endif /* LV_HAVE_AVX for aligned */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = lv_cmake(1.0f, 0.0f); + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, z; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + + const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_loadu_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + phase_Val = _mm256_normalize_ps(phase_Val); + } + + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_loadu_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + phase_Val = _mm256_normalize_ps(phase_Val); + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_generic( + cPtr, aPtr, phase_inc, phase, num_points % 4); +} + +#endif /* LV_HAVE_AVX */ + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm256_load_ps((float*)phase_Ptr); + inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + + _mm256_store_ps((float*)phase_Ptr, phase_Val); + for (i = 0; i < num_points % 4; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/ + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_loadu_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_loadu_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + for (i = 0; i < num_points % 4; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ + +#endif /* INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h index aebf46d53..754bfcba7 100644 --- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h @@ -72,10 +72,8 @@ #include #include #include +#include #include -#define ROTATOR_RELOAD 512 -#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2) -#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4) #ifdef LV_HAVE_GENERIC @@ -86,32 +84,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, lv_32fc_t* phase, unsigned int num_points) { - unsigned int i = 0; - int j = 0; - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { - for (j = 0; j < ROTATOR_RELOAD; ++j) { - *outVector++ = *inVector++ * (*phase); - (*phase) *= phase_inc; - } - - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - } - for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { - *outVector++ = *inVector++ * (*phase); - (*phase) *= phase_inc; - } - if (i) { - // Make sure, we normalize phase on every call! - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - } + volk_32fc_s32fc_x2_rotator2_32fc_generic( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEON -#include -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -120,92 +100,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, unsigned int num_points) { - lv_32fc_t* outputVectorPtr = outVector; - const lv_32fc_t* inputVectorPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) }; - float32x4x2_t input_vec; - float32x4x2_t output_vec; - - unsigned int i = 0, j = 0; - // const unsigned int quarter_points = num_points / 4; - - for (i = 0; i < 4; ++i) { - phasePtr[i] *= incr; - incr *= (phase_inc); - } - - // Notice that incr has be incremented in the previous loop - const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr }; - const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr); - float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; j++) { - input_vec = vld2q_f32((float*)inputVectorPtr); - // Prefetch next one, speeds things up - __VOLK_PREFETCH(inputVectorPtr + 4); - // Rotate - output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); - // Increase phase - phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); - // Store output - vst2q_f32((float*)outputVectorPtr, output_vec); - - outputVectorPtr += 4; - inputVectorPtr += 4; - } - // normalize phase so magnitude doesn't grow because of - // floating point rounding error - const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); - const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); - // Multiply complex with real - phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); - phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); - } - - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) { - input_vec = vld2q_f32((float*)inputVectorPtr); - // Prefetch next one, speeds things up - __VOLK_PREFETCH(inputVectorPtr + 4); - // Rotate - output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); - // Increase phase - phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); - // Store output - vst2q_f32((float*)outputVectorPtr, output_vec); - - outputVectorPtr += 4; - inputVectorPtr += 4; - } - // if(i) == true means we looped above - if (i) { - // normalize phase so magnitude doesn't grow because of - // floating point rounding error - const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); - const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); - // Multiply complex with real - phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); - phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); - } - // Store current phase - vst2q_f32((float*)phasePtr, phase_vec); - - // Deal with the rest - for (i = 0; i < num_points % 4; i++) { - *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0]; - phasePtr[0] *= (phase_inc); - } - - // For continuous phase next time we need to call this function - (*phase) = phasePtr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_neon( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_SSE4_1 -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -213,102 +115,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 2; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm_loadu_ps((float*)phase_Ptr); - inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_2; ++j) { - - aVal = _mm_load_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_store_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - tmp1 = _mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { - aVal = _mm_load_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_store_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - if (i) { - tmp1 = _mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - - _mm_storeu_ps((float*)phase_Ptr, phase_Val); - if (num_points & 1) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_SSE4_1 for aligned */ #ifdef LV_HAVE_SSE4_1 -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -316,106 +130,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 2; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); - printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); - printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ - __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm_loadu_ps((float*)phase_Ptr); - inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_2; ++j) { - - aVal = _mm_loadu_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_storeu_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - tmp1 = _mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { - aVal = _mm_loadu_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_storeu_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - if (i) { - tmp1 = _mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - - _mm_storeu_ps((float*)phase_Ptr, phase_Val); - if (num_points & 1) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_AVX -#include -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -423,73 +145,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = lv_cmake(1.0f, 0.0f); - lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, z; - - phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - - const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_load_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - phase_Val = _mm256_normalize_ps(phase_Val); - } - - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_load_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - phase_Val = _mm256_normalize_ps(phase_Val); - } - - _mm256_storeu_ps((float*)phase_Ptr, phase_Val); - (*phase) = phase_Ptr[0]; - volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); + volk_32fc_s32fc_x2_rotator2_32fc_a_avx( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX for aligned */ #ifdef LV_HAVE_AVX -#include -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -497,71 +160,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = lv_cmake(1.0f, 0.0f); - lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, z; - - phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - - const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_loadu_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - phase_Val = _mm256_normalize_ps(phase_Val); - } - - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_loadu_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - phase_Val = _mm256_normalize_ps(phase_Val); - } - - _mm256_storeu_ps((float*)phase_Ptr, phase_Val); - (*phase) = phase_Ptr[0]; - volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); + volk_32fc_s32fc_x2_rotator2_32fc_u_avx( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX */ #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -569,108 +174,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - __VOLK_ATTR_ALIGNED(32) - lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm256_load_ps((float*)phase_Ptr); - inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_load_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_load_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - - _mm256_store_ps((float*)phase_Ptr, phase_Val); - for (i = 0; i < num_points % 4; ++i) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/ #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -678,101 +188,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_loadu_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_loadu_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - - _mm256_storeu_ps((float*)phase_Ptr, phase_Val); - for (i = 0; i < num_points % 4; ++i) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h new file mode 100644 index 000000000..a5014b7c5 --- /dev/null +++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h @@ -0,0 +1,344 @@ +/* -*- c++ -*- */ +/* + * Copyright 2019 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc + * + * \b Overview + * + * Conjugate the input complex vector, multiply them by a complex scalar, + * add the another input complex vector and returns the results. + * + * c[i] = a[i] + conj(b[i]) * (*scalar) + * + * Dispatcher Prototype + * \code + * void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(lv_32fc_t* cVector, const + * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t* scalar, unsigned int + * num_points); \endcode + * + * \b Inputs + * \li aVector: The input vector to be added. + * \li bVector: The input vector to be conjugate and multiplied. + * \li scalar: The complex scalar to multiply against conjugated bVector. + * \li num_points: The number of complex values in aVector and bVector to be conjugate, + * multiplied and stored into cVector. + * + * \b Outputs + * \li cVector: The vector where the results will be stored. + * + * \b Example + * Calculate coefficients. + * + * \code + * int n_filter = 2 * N + 1; + * unsigned int alignment = volk_get_alignment(); + * + * // state is a queue of input IQ data. + * lv_32fc_t* state = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment); + * // weight and next one. + * lv_32fc_t* weight = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment); + * lv_32fc_t* next = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment); + * ... + * // push back input IQ data into state. + * foo_push_back_queue(state, input); + * + * // get filtered output. + * lv_32fc_t output = lv_cmake(0.f,0.f); + * for (int i = 0; i < n_filter; i++) { + * output += state[i] * weight[i]; + * } + * + * // update weight using output. + * float real = lv_creal(output) * (1.0 - std::norm(output)) * MU; + * lv_32fc_t factor = lv_cmake(real, 0.f); + * volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(next, weight, state, &factor, n_filter); + * lv_32fc_t *tmp = next; + * next = weight; + * weight = tmp; + * weight[N + 1] = lv_cmake(lv_creal(weight[N + 1]), 0.f); + * ... + * volk_free(state); + * volk_free(weight); + * volk_free(next); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H +#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + lv_32fc_t* cPtr = cVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8) { + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + number -= 8; + } + + // clean up any remaining + while (number-- > 0) { + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + + __m256 x, y, s, z; + lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm256_loadu_ps((float*)v_scalar); + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)b); + y = _mm256_loadu_ps((float*)a); + z = _mm256_complexconjugatemul_ps(s, x); + z = _mm256_add_ps(y, z); + _mm256_storeu_ps((float*)c, z); + + a += 4; + b += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) + lv_conj(*b++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, s, z; + lv_32fc_t v_scalar[2] = { *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm_loadu_ps((float*)v_scalar); + + for (; number < halfPoints; number++) { + x = _mm_loadu_ps((float*)b); + y = _mm_loadu_ps((float*)a); + z = _mm_complexconjugatemul_ps(s, x); + z = _mm_add_ps(y, z); + _mm_storeu_ps((float*)c, z); + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = *a + lv_conj(*b) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + + __m256 x, y, s, z; + lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm256_loadu_ps((float*)v_scalar); + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)b); + y = _mm256_load_ps((float*)a); + z = _mm256_complexconjugatemul_ps(s, x); + z = _mm256_add_ps(y, z); + _mm256_store_ps((float*)c, z); + + a += 4; + b += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) + lv_conj(*b++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, s, z; + lv_32fc_t v_scalar[2] = { *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm_loadu_ps((float*)v_scalar); + + for (; number < halfPoints; number++) { + x = _mm_load_ps((float*)b); + y = _mm_load_ps((float*)a); + z = _mm_complexconjugatemul_ps(s, x); + z = _mm_add_ps(y, z); + _mm_store_ps((float*)c, z); + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = *a + lv_conj(*b) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + + +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + const lv_32fc_t* bPtr = bVector; + const lv_32fc_t* aPtr = aVector; + lv_32fc_t* cPtr = cVector; + unsigned int number = num_points; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t a_val, b_val, c_val, scalar_val; + float32x4x2_t tmp_val; + + scalar_val.val[0] = vld1q_dup_f32((const float*)scalar); + scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1); + + for (number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)aPtr); + b_val = vld2q_f32((float*)bPtr); + b_val.val[1] = vnegq_f32(b_val.val[1]); + __VOLK_PREFETCH(aPtr + 8); + __VOLK_PREFETCH(bPtr + 8); + + tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]); + tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]); + + tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]); + tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]); + + c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]); + c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]); + + vst2q_f32((float*)cPtr, c_val); + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + } +} +#endif /* LV_HAVE_NEON */ + +#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */ diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h index 85cdaf16a..9926448b4 100644 --- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h +++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h @@ -76,6 +76,7 @@ #include #include #include +#include #include @@ -88,35 +89,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr = bVector; - lv_32fc_t* cPtr = cVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8) { - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) { - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_AVX -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, @@ -125,43 +104,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - - __m256 x, y, s, z; - lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm256_loadu_ps((float*)v_scalar); - - for (; number < quarterPoints; number++) { - x = _mm256_loadu_ps((float*)b); - y = _mm256_loadu_ps((float*)a); - z = _mm256_complexconjugatemul_ps(s, x); - z = _mm256_add_ps(y, z); - _mm256_storeu_ps((float*)c, z); - - a += 4; - b += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) + lv_conj(*b++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, @@ -170,41 +119,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, s, z; - lv_32fc_t v_scalar[2] = { scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm_loadu_ps((float*)v_scalar); - - for (; number < halfPoints; number++) { - x = _mm_loadu_ps((float*)b); - y = _mm_loadu_ps((float*)a); - z = _mm_complexconjugatemul_ps(s, x); - z = _mm_add_ps(y, z); - _mm_storeu_ps((float*)c, z); - - a += 2; - b += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = *a + lv_conj(*b) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_AVX -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, @@ -213,43 +134,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - - __m256 x, y, s, z; - lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm256_loadu_ps((float*)v_scalar); - - for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)b); - y = _mm256_load_ps((float*)a); - z = _mm256_complexconjugatemul_ps(s, x); - z = _mm256_add_ps(y, z); - _mm256_store_ps((float*)c, z); - - a += 4; - b += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) + lv_conj(*b++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, @@ -258,40 +149,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, s, z; - lv_32fc_t v_scalar[2] = { scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm_loadu_ps((float*)v_scalar); - - for (; number < halfPoints; number++) { - x = _mm_load_ps((float*)b); - y = _mm_load_ps((float*)a); - z = _mm_complexconjugatemul_ps(s, x); - z = _mm_add_ps(y, z); - _mm_store_ps((float*)c, z); - - a += 2; - b += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = *a + lv_conj(*b) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_NEON -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, @@ -300,44 +164,8 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - const lv_32fc_t* bPtr = bVector; - const lv_32fc_t* aPtr = aVector; - lv_32fc_t* cPtr = cVector; - unsigned int number = num_points; - unsigned int quarter_points = num_points / 4; - - float32x4x2_t a_val, b_val, c_val, scalar_val; - float32x4x2_t tmp_val; - - scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); - scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); - - for (number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)aPtr); - b_val = vld2q_f32((float*)bPtr); - b_val.val[1] = vnegq_f32(b_val.val[1]); - __VOLK_PREFETCH(aPtr + 8); - __VOLK_PREFETCH(bPtr + 8); - - tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]); - tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]); - - tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]); - tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]); - - c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]); - c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]); - - vst2q_f32((float*)cPtr, c_val); - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for (number = quarter_points * 4; number < num_points; number++) { - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_NEON */ diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index dd430762c..0e7db1439 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -55,8 +55,8 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params)) QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params)) - QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, - volk_32fc_s32fc_x2_rotator_32fc, + QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotator2puppet_32fc, + volk_32fc_s32fc_x2_rotator2_32fc, test_params_rotator)) QA(VOLK_INIT_PUPP( volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) @@ -159,14 +159,14 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply2_32fc, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_add_32f, test_params)) QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc, test_params)) QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params)) QA(VOLK_INIT_PUPP( volk_32f_s32f_mod_rangepuppet_32f, volk_32f_s32f_s32f_mod_range_32f, test_params)) diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc index 4be7b8ad9..a94d895c4 100644 --- a/lib/qa_utils.cc +++ b/lib/qa_utils.cc @@ -355,7 +355,7 @@ inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::string arch) { while (iter--) - func(buffs[0], scalar, vlen, arch.c_str()); + func(buffs[0], &scalar, vlen, arch.c_str()); } inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, @@ -366,7 +366,7 @@ inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::string arch) { while (iter--) - func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + func(buffs[0], buffs[1], &scalar, vlen, arch.c_str()); } inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, @@ -377,7 +377,7 @@ inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::string arch) { while (iter--) - func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + func(buffs[0], buffs[1], buffs[2], &scalar, vlen, arch.c_str()); } template diff --git a/lib/qa_utils.h b/lib/qa_utils.h index 40f549cfd..a65677203 100644 --- a/lib/qa_utils.h +++ b/lib/qa_utils.h @@ -199,11 +199,11 @@ typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char* typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*); typedef void (*volk_fn_1arg_s32fc)( void*, - lv_32fc_t, + lv_32fc_t*, unsigned int, const char*); // one input vector, one scalar float input -typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t*, unsigned int, const char*); typedef void (*volk_fn_3arg_s32fc)( - void*, void*, void*, lv_32fc_t, unsigned int, const char*); + void*, void*, void*, lv_32fc_t*, unsigned int, const char*); #endif // VOLK_QA_UTILS_H diff --git a/tmpl/volk.tmpl.h b/tmpl/volk.tmpl.h index ad5595b24..b26c542cf 100644 --- a/tmpl/volk.tmpl.h +++ b/tmpl/volk.tmpl.h @@ -63,9 +63,11 @@ VOLK_API bool volk_is_aligned(const void *ptr); // Just drop the deprecated attribute in case we are on Windows. Clang and GCC support `__attribute__`. // We just assume the compiler and the system are tight together as far as Mako templates are concerned. <% -deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8', - 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i', - 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i') +deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8', + 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i', + 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i', + 'volk_32fc_s32fc_multiply_32fc', 'volk_32fc_s32fc_x2_rotator_32fc', + 'volk_32fc_x2_s32fc_multiply_conjugate_add_32fc') from platform import system if system() == 'Windows': deprecated_kernels = ()