#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

static inline void calculate_scaled_distances(float* target,
                                              const lv_32fc_t symbol,
                                              const lv_32fc_t* points,
                                              const float scalar,
                                              const unsigned int num_points)
{
    lv_32fc_t diff;
    for (unsigned int i = 0; i < num_points; ++i) {
        // scaled squared Euclidean distance: scalar * |symbol - point|^2
        diff = symbol - *points++;
        *target++ = scalar * (lv_creal(diff) * lv_creal(diff) +
                              lv_cimag(diff) * lv_cimag(diff));
    }
}
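A minimal usage sketch (not part of the original header): it scores a made-up 4-point constellation against one received symbol through the generated VOLK dispatcher. The constellation, symbol and scalar values are illustrative only; lv_cmake() comes from volk_complex.h.

    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        // Hypothetical QPSK-like constellation and received symbol, chosen only for illustration.
        lv_32fc_t constellation[4] = { lv_cmake(1.f, 1.f), lv_cmake(-1.f, 1.f),
                                       lv_cmake(-1.f, -1.f), lv_cmake(1.f, -1.f) };
        lv_32fc_t symbol = lv_cmake(0.9f, 1.1f);
        float scalar = 2.0f; /* e.g. a linear SNR estimate */
        float distances[4];

        // The dispatcher picks the best available implementation at runtime.
        volk_32fc_x2_s32f_square_dist_scalar_mult_32f(
            distances, &symbol, constellation, scalar, 4);

        for (unsigned int i = 0; i < 4; ++i)
            printf("d[%u] = %f\n", i, distances[i]);
        return 0;
    }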
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Main loop handles 8 complex points (64 bytes) per iteration.
    const unsigned int bound = num_bytes >> 6;

    // Broadcast the complex symbol into every lane; keep a 128-bit copy for the tail.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Broadcast the scalar into every lane; keep a 128-bit copy for the tail.
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Permutation that gathers the horizontal sums into contiguous order.
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        // scalar * |symbol - point|^2 for 8 points at once
        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    // Tail: 4 remaining points.
    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_store_ps(target, xmm9);
        target += 4;
    }

    // Tail: 2 remaining points.
    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_load_ps((float*)points);
        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);
        xmm10 = _mm_hadd_ps(xmm9, xmm9);
        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    // A possible last single point falls back to the scalar helper.
    calculate_scaled_distances(target, src0[0], points, scalar, num_bytes % 16 >> 3);
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // Broadcast the complex symbol and the scalar into every lane.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;

        // scalar * |symbol - point|^2 for 8 points at once
        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    // Remaining (< 8) points fall back to the scalar helper.
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * Process 4 points per loop iteration; up to 3 points may remain.
     * leftovers0 is 1 if at least 2 points are left,
     * leftovers1 is 1 if a single point is left at the end.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // Broadcast the complex symbol into both halves of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Broadcast the scalar into all 4 lanes.
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);

        // scalar * |symbol - point|^2 for 4 points at once
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_store_ps(target, xmm_result);
        target += 4;
    }

    // Tail: 2 remaining points.
    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    // A possible last single point falls back to the scalar helper.
    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_load_ps((float*)points);
        __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;

        // scalar * |symbol - point|^2 for 4 points at once
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_store_ps((float*)target, xmm_result);
        target += 4;
    }

    // Remaining (< 4) points fall back to the scalar helper.
    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, num_points);
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H */
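The _a_ kernels above expect VOLK-aligned buffers. A small sketch (illustrative only, not from the header) of obtaining such buffers with volk_malloc() before calling the dispatcher; the function name scaled_distances_aligned is hypothetical.

    #include <string.h>
    #include <volk/volk.h>

    void scaled_distances_aligned(lv_32fc_t symbol, const lv_32fc_t* constellation_src,
                                  float scalar, unsigned int num_points)
    {
        const size_t alignment = volk_get_alignment();
        lv_32fc_t* points = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        float* target = (float*)volk_malloc(num_points * sizeof(float), alignment);

        memcpy(points, constellation_src, num_points * sizeof(lv_32fc_t));

        // With aligned pointers the dispatcher can select one of the _a_ kernels;
        // otherwise it falls back to the unaligned (_u_) variants below.
        volk_32fc_x2_s32f_square_dist_scalar_mult_32f(target, &symbol, points, scalar, num_points);

        /* ... use target ... */

        volk_free(points);
        volk_free(target);
    }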
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Main loop handles 8 complex points (64 bytes) per iteration.
    const unsigned int bound = num_bytes >> 6;

    // Broadcast the complex symbol into every lane; keep a 128-bit copy for the tail.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Broadcast the scalar into every lane; keep a 128-bit copy for the tail.
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Permutation that gathers the horizontal sums into contiguous order.
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        // scalar * |symbol - point|^2 for 8 points at once
        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    // Tail: 4 remaining points.
    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_storeu_ps(target, xmm9);
        target += 4;
    }

    // Tail: 2 remaining points.
    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_loadu_ps((float*)points);
        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);
        xmm10 = _mm_hadd_ps(xmm9, xmm9);
        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    // A possible last single point falls back to the scalar helper.
    calculate_scaled_distances(target, src0[0], points, scalar, num_bytes % 16 >> 3);
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // Broadcast the complex symbol and the scalar into every lane.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;

        // scalar * |symbol - point|^2 for 8 points at once
        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    // Remaining (< 8) points fall back to the scalar helper.
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * Process 4 points per loop iteration; up to 3 points may remain.
     * leftovers0 is 1 if at least 2 points are left,
     * leftovers1 is 1 if a single point is left at the end.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // Broadcast the complex symbol into both halves of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Broadcast the scalar into all 4 lanes.
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);

        // scalar * |symbol - point|^2 for 4 points at once
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_storeu_ps(target, xmm_result);
        target += 4;
    }

    // Tail: 2 remaining points.
    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    // A possible last single point falls back to the scalar helper.
    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(
    float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_loadu_ps((float*)points);
        __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;

        // scalar * |symbol - point|^2 for 4 points at once
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_storeu_ps((float*)target, xmm_result);
        target += 4;
    }

    // Remaining (< 4) points fall back to the scalar helper.
    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#endif /* INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H */

Referenced symbols (defined in other VOLK headers):
  static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)   [volk_sse3_intrinsics.h:65]
  static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)   [volk_avx2_intrinsics.h:74]
  static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)   [volk_avx_intrinsics.h:82]
  static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)   [volk_sse_intrinsics.h:50]
  #define __VOLK_PREFETCH(addr)   [volk_common.h:53]
  float complex lv_32fc_t   [volk_complex.h:61]
  #define lv_creal(x)   [volk_complex.h:83]
  #define lv_cimag(x)   [volk_complex.h:85]
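The scaled-norm-distance helpers listed above are defined in the intrinsics headers, not on this page. As a rough, non-authoritative sketch of the operation they perform (mirroring the hadd-based tail code in the SSE3 kernels above, not necessarily the library's exact code), the SSE3 variant could look like:

    #include <pmmintrin.h>

    // Illustrative sketch only; see volk_sse3_intrinsics.h for the real definition.
    // Each __m128 holds two complex floats as (re, im, re, im); the result holds
    // scalar * |symbol - point|^2 for the four points packed in points0/points1.
    static inline __m128 scaled_norm_dist_ps_sse3_sketch(const __m128 symbols0,
                                                         const __m128 symbols1,
                                                         const __m128 points0,
                                                         const __m128 points1,
                                                         const __m128 scalar)
    {
        const __m128 diff0 = _mm_sub_ps(symbols0, points0); // component-wise differences
        const __m128 diff1 = _mm_sub_ps(symbols1, points1);
        const __m128 sq0 = _mm_mul_ps(diff0, diff0);         // squared components
        const __m128 sq1 = _mm_mul_ps(diff1, diff1);
        // horizontal add pairs re^2 + im^2 for all four points, then scale
        return _mm_mul_ps(_mm_hadd_ps(sq0, sq1), scalar);
    }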