Skip to content

Commit

Permalink
moved function, rewrote test
Browse files Browse the repository at this point in the history
Signed-off-by: Magnus Lundmark <[email protected]>
  • Loading branch information
Ka-zam committed Oct 4, 2023
1 parent 99c806c commit fa95bec
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 54 deletions.
28 changes: 14 additions & 14 deletions kernels/volk/volk_32f_s32f_clamppuppet_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,23 @@

#include <volk/volk_32f_s32f_x2_clamp_32f.h>

#ifdef LV_HAVE_GENERIC
/*
 * Single-scalar front end for volk_32f_s32f_x2_clamp_32f_generic.
 *
 * out        - destination buffer, forwarded unchanged
 * in         - source buffer, forwarded unchanged
 * min        - lower clamp bound; the upper bound passed on is its negation
 * num_points - element count, forwarded unchanged
 */
static inline void volk_32f_s32f_clamppuppet_32f_generic(float* out,
                                                         const float* in,
                                                         const float min,
                                                         unsigned int num_points)
{
    /* The two-bound kernel needs an explicit maximum: mirror min around zero. */
    const float upper_bound = -min;

    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, upper_bound, num_points);
}
#endif

#ifdef LV_HAVE_AVX2
/*
 * Single-scalar front end for volk_32f_s32f_x2_clamp_32f_a_avx2.
 *
 * out        - destination buffer, forwarded unchanged
 * in         - source buffer, forwarded unchanged
 * min        - lower clamp bound; the upper bound passed on is -min
 * num_points - element count, forwarded unchanged
 *
 * The kernel is invoked exactly once. A preceding invocation with a fixed
 * upper bound of .5f would be wasted work (its output is fully overwritten)
 * and would alter the result whenever out and in refer to the same buffer.
 */
static inline void volk_32f_s32f_clamppuppet_32f_a_avx2(float* out,
                                                        const float* in,
                                                        const float min,
                                                        unsigned int num_points)
{
    volk_32f_s32f_x2_clamp_32f_a_avx2(out, in, min, -min, num_points);
}
#endif

Expand All @@ -28,18 +38,17 @@ static inline void volk_32f_s32f_clamppuppet_32f_a_sse4_1(float* out,
const float min,
unsigned int num_points)
{
volk_32f_s32f_x2_clamp_32f_a_sse4_1(out, in, min, .5f, num_points);
volk_32f_s32f_x2_clamp_32f_a_sse4_1(out, in, min, -min, num_points);
}
#endif


#ifdef LV_HAVE_AVX2
/*
 * Single-scalar front end for volk_32f_s32f_x2_clamp_32f_u_avx2.
 *
 * out        - destination buffer, forwarded unchanged
 * in         - source buffer, forwarded unchanged
 * min        - lower clamp bound; the upper bound passed on is -min
 * num_points - element count, forwarded unchanged
 *
 * The kernel is invoked exactly once. A preceding invocation with a fixed
 * upper bound of .5f would be wasted work (its output is fully overwritten)
 * and would alter the result whenever out and in refer to the same buffer.
 */
static inline void volk_32f_s32f_clamppuppet_32f_u_avx2(float* out,
                                                        const float* in,
                                                        const float min,
                                                        unsigned int num_points)
{
    volk_32f_s32f_x2_clamp_32f_u_avx2(out, in, min, -min, num_points);
}
#endif

Expand All @@ -49,17 +58,8 @@ static inline void volk_32f_s32f_clamppuppet_32f_u_sse4_1(float* out,
const float min,
unsigned int num_points)
{
volk_32f_s32f_x2_clamp_32f_u_sse4_1(out, in, min, .5f, num_points);
volk_32f_s32f_x2_clamp_32f_u_sse4_1(out, in, min, -min, num_points);
}
#endif

#ifdef LV_HAVE_GENERIC
/*
 * Single-scalar front end for volk_32f_s32f_x2_clamp_32f_generic.
 *
 * out        - destination buffer, forwarded unchanged
 * in         - source buffer, forwarded unchanged
 * min        - lower clamp bound; the upper bound passed on is -min
 * num_points - element count, forwarded unchanged
 *
 * The upper bound is derived from min (not a fixed .5f) so that this variant
 * produces the same output as the other clamppuppet variants in this header,
 * which all finish by clamping to [min, -min].
 */
static inline void volk_32f_s32f_clamppuppet_32f_generic(float* out,
                                                         const float* in,
                                                         const float min,
                                                         unsigned int num_points)
{
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, -min, num_points);
}
#endif
#endif /* INCLUDED_volk_32f_s32f_clamppuppet_32f_H */
74 changes: 34 additions & 40 deletions kernels/volk/volk_32f_s32f_x2_clamp_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,32 @@
*
* \endcode
*/
#include <inttypes.h>

#ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
#define INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H

#ifdef LV_HAVE_GENERIC
/*
 * Element-wise clamp of a float buffer.
 *
 * For each of the first num_points elements of in, writes to out:
 *   max      if the element is greater than max,
 *   min      otherwise, if the element is less than min,
 *   the element itself in every other case.
 *
 * out        - destination buffer, receives num_points values
 * in         - source buffer, num_points values are read
 * min        - lower bound
 * max        - upper bound (tested first)
 * num_points - number of elements to process; 0 touches nothing
 */
static inline void volk_32f_s32f_x2_clamp_32f_generic(float* out,
                                                      const float* in,
                                                      const float min,
                                                      const float max,
                                                      unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        /* Read the sample once, before the store, so in-place use is safe. */
        const float sample = in[i];
        out[i] = (sample > max) ? max : (sample < min) ? min : sample;
    }
}
#endif /* LV_HAVE_GENERIC */

#if LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(float* out,
Expand All @@ -56,8 +77,8 @@ static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(float* out,
const __m256 vmin = _mm256_set1_ps(min);
const __m256 vmax = _mm256_set1_ps(max);

uint32_t number = 0;
uint32_t eighth_points = num_points / 8;
unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 res = _mm256_load_ps(in);
__m256 max_mask = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
Expand All @@ -70,10 +91,7 @@ static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(float* out,
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = (*in > max) ? max : (*in < min) ? min : *in;
in++;
}
volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points - number);
}
#endif /* LV_HAVE_AVX2 */

Expand All @@ -88,8 +106,8 @@ static inline void volk_32f_s32f_x2_clamp_32f_a_sse4_1(float* out,
const __m128 vmin = _mm_set1_ps(min);
const __m128 vmax = _mm_set1_ps(max);

uint32_t number = 0;
uint32_t quarter_points = num_points / 4;
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
for (; number < quarter_points; number++) {
__m128 res = _mm_load_ps(in);
__m128 max_mask = _mm_cmplt_ps(vmax, res);
Expand All @@ -102,10 +120,7 @@ static inline void volk_32f_s32f_x2_clamp_32f_a_sse4_1(float* out,
}

number = quarter_points * 4;
for (; number < num_points; number++) {
*out++ = (*in > max) ? max : (*in < min) ? min : *in;
in++;
}
volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points - number);
}
#endif /* LV_HAVE_SSE4_1 */

Expand All @@ -125,8 +140,8 @@ static inline void volk_32f_s32f_x2_clamp_32f_u_avx2(float* out,
const __m256 vmin = _mm256_set1_ps(min);
const __m256 vmax = _mm256_set1_ps(max);

uint32_t number = 0;
uint32_t eighth_points = num_points / 8;
unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 res = _mm256_loadu_ps(in);
__m256 max_mask = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
Expand All @@ -139,10 +154,7 @@ static inline void volk_32f_s32f_x2_clamp_32f_u_avx2(float* out,
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = (*in > max) ? max : (*in < min) ? min : *in;
in++;
}
volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points - number);
}
#endif /* LV_HAVE_AVX2 */

Expand All @@ -157,8 +169,8 @@ static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out,
const __m128 vmin = _mm_set1_ps(min);
const __m128 vmax = _mm_set1_ps(max);

uint32_t number = 0;
uint32_t quarter_points = num_points / 4;
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
for (; number < quarter_points; number++) {
__m128 res = _mm_loadu_ps(in);
__m128 max_mask = _mm_cmplt_ps(vmax, res);
Expand All @@ -171,26 +183,8 @@ static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out,
}

number = quarter_points * 4;
for (; number < num_points; number++) {
*out++ = (*in > max) ? max : (*in < min) ? min : *in;
in++;
}
volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points - number);
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_GENERIC
/*
 * Element-wise clamp of a float buffer.
 *
 * Each of the first num_points values read from in is written to out as
 * max when it exceeds max, as min when it is below min (and does not exceed
 * max), and unchanged otherwise.
 *
 * out        - destination buffer, receives num_points values
 * in         - source buffer, num_points values are read
 * min        - lower bound
 * max        - upper bound (tested first)
 * num_points - number of elements to process; 0 touches nothing
 */
static inline void volk_32f_s32f_x2_clamp_32f_generic(float* out,
                                                      const float* in,
                                                      const float min,
                                                      const float max,
                                                      unsigned int num_points)
{
    uint32_t remaining = num_points;

    while (remaining > 0) {
        /* Fetch the sample before storing so in-place operation stays valid. */
        const float sample = *in;
        float clamped = sample;

        if (sample > max) {
            clamped = max;
        } else if (sample < min) {
            clamped = min;
        }

        *out = clamped;
        in++;
        out++;
        remaining--;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */

0 comments on commit fa95bec

Please sign in to comment.