76 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H 77 #define INCLUDED_volk_32fc_index_max_16u_a_H 86 #include <immintrin.h> 89 volk_32fc_index_max_16u_a_avx2(uint16_t* target,
lv_32fc_t* src0,
92 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
96 const uint32_t num_bytes = num_points*8;
103 __m256 xmm1, xmm2, xmm3;
104 __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
106 xmm5.
int_vec = xmmfive = _mm256_setzero_si256();
107 xmm4.int_vec = xmmfour = _mm256_setzero_si256();
108 holderf.int_vec = holder0 = _mm256_setzero_si256();
109 holderi.int_vec = holder1 = _mm256_setzero_si256();
111 int bound = num_bytes >> 6;
114 xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
115 xmm9 = _mm256_setzero_si256();
116 xmm10 = _mm256_set1_epi32(8);
117 xmm3 = _mm256_setzero_ps();
119 __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
120 for(; i < bound; ++
i) {
121 xmm1 = _mm256_load_ps((
float*)src0);
122 xmm2 = _mm256_load_ps((
float*)&src0[4]);
126 xmm1 = _mm256_mul_ps(xmm1, xmm1);
127 xmm2 = _mm256_mul_ps(xmm2, xmm2);
129 xmm1 = _mm256_hadd_ps(xmm1, xmm2);
130 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
132 xmm3 = _mm256_max_ps(xmm1, xmm3);
134 xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
135 xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
137 xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
138 xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
140 xmm9 = _mm256_add_epi32(xmm11, xmm12);
142 xmm8 = _mm256_add_epi32(xmm8, xmm10);
144 xmm10 = _mm256_set1_epi32(4);
145 if (num_bytes >> 5 & 1) {
146 xmm1 = _mm256_load_ps((
float*)src0);
150 xmm1 = _mm256_mul_ps(xmm1, xmm1);
152 xmm1 = _mm256_hadd_ps(xmm1, xmm1);
153 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
155 xmm3 = _mm256_max_ps(xmm1, xmm3);
157 xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
158 xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
160 xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
161 xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
163 xmm9 = _mm256_add_epi32(xmm11, xmm12);
165 xmm8 = _mm256_add_epi32(xmm8, xmm10);
168 idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
169 xmm10 = _mm256_set1_epi32(2);
170 if (num_bytes >> 4 & 1) {
171 xmm2 = _mm256_load_ps((
float*)src0);
176 xmm2 = _mm256_mul_ps(xmm2, xmm2);
180 xmm1 = _mm256_hadd_ps(xmm2, xmm2);
182 xmm3 = _mm256_max_ps(xmm1, xmm3);
184 xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
185 xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
187 xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
188 xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
190 xmm9 = _mm256_add_epi32(xmm11, xmm12);
192 xmm8 = _mm256_add_epi32(xmm8, xmm10);
222 _mm256_store_ps((
float*)&(holderf.f), xmm3);
223 _mm256_store_si256(&(holderi.int_vec), xmm9);
225 target[0] = holderi.i[0];
226 sq_dist = holderf.f[0];
227 target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
228 sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
229 target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
230 sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
231 target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
232 sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
233 target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
234 sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
235 target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
236 sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
237 target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
238 sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
239 target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
240 sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

/*!
 * Writes to target[0] the index of the complex sample with the largest
 * magnitude-squared among the first num_points entries of src0.
 * Aligned SSE3 kernel.
 *
 * \param target      output: single uint16_t index of the max-magnitude sample
 * \param src0        16-byte-aligned input vector of complex floats
 * \param num_points  number of samples; clamped to USHRT_MAX so the winning
 *                    index always fits in a uint16_t
 *
 * NOTE(review): reconstructed from a garbled extraction; the signature and
 * several dropped statements (src0 advances, declarations, the bit128_p
 * index shuffle, xmm1 = xmm3) were restored from the upstream VOLK kernel —
 * verify against repository history.
 */
static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
  /* Clamp so the result index is representable in a uint16_t. */
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  const uint32_t num_bytes = num_points*8; /* sizeof(lv_32fc_t) == 8 */

  union bit128 holderf;
  union bit128 holderi;
  float sq_dist = 0.0;

  union bit128 xmm5, xmm4;
  __m128 xmm1, xmm2, xmm3;
  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm_setzero_si128();
  xmm4.int_vec = xmmfour = _mm_setzero_si128();
  holderf.int_vec = holder0 = _mm_setzero_si128();
  holderi.int_vec = holder1 = _mm_setzero_si128();

  /* Main loop consumes 4 complex samples (32 bytes) per iteration. */
  int bound = num_bytes >> 5;
  int i = 0;

  xmm8 = _mm_set_epi32(3, 2, 1, 0);   /* candidate indices */
  xmm9 = _mm_setzero_si128();         /* winning indices so far */
  xmm10 = _mm_set_epi32(4, 4, 4, 4);  /* per-iteration index step */
  xmm3 = _mm_setzero_ps();            /* running max |.|^2 per lane */

  for(; i < bound; ++i) {
    xmm1 = _mm_load_ps((float*)src0);
    xmm2 = _mm_load_ps((float*)&src0[2]);

    src0 += 4;

    xmm1 = _mm_mul_ps(xmm1, xmm1);
    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm1, xmm2);   /* re^2 + im^2 per sample */

    xmm3 = _mm_max_ps(xmm1, xmm3);

    /* After the max: equal lanes take the new index, smaller keep the old. */
    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }

  /* Tail: 2 remaining complex samples (exactly one 16-byte load). */
  if (num_bytes >> 4 & 1) {
    xmm2 = _mm_load_ps((float*)src0);

    /* Duplicate the low pair of candidate indices so the duplicated hadd
     * results below compare against matching indices. */
    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    xmm8 = bit128_p(&xmm1)->int_vec;

    xmm2 = _mm_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm_hadd_ps(xmm2, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm10 = _mm_set_epi32(2, 2, 2, 2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }

  /* Tail: single remaining complex sample, handled scalar then broadcast. */
  if (num_bytes >> 3 & 1) {
    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    xmm2 = _mm_load1_ps(&sq_dist);

    xmm1 = xmm3; /* keep the pre-update max so the compares see old vs. new */

    xmm3 = _mm_max_ss(xmm3, xmm2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

    /* Operands are (old, new) here, so the roles flip versus the loop:
     * lt means the lone sample won (take xmm8), eq keeps the old winner. */
    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);
  }

  /* Horizontal reduction of the 4 surviving (value, index) pairs. */
  _mm_store_ps((float*)&(holderf.f), xmm3);
  _mm_store_si128(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}
#endif /*LV_HAVE_SSE3*/
397 #ifdef LV_HAVE_GENERIC 402 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
404 const uint32_t num_bytes = num_points*8;
412 for(; i < num_bytes >> 3; ++
i) {
415 index = sq_dist > max ?
i : index;
416 max = sq_dist > max ? sq_dist : max;
427 #ifndef INCLUDED_volk_32fc_index_max_16u_u_H 428 #define INCLUDED_volk_32fc_index_max_16u_u_H 431 #include <inttypes.h> 437 #include <immintrin.h> 440 volk_32fc_index_max_16u_u_avx2(uint16_t* target,
lv_32fc_t* src0,
443 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
447 const uint32_t num_bytes = num_points*8;
454 __m256 xmm1, xmm2, xmm3;
455 __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
457 xmm5.
int_vec = xmmfive = _mm256_setzero_si256();
458 xmm4.
int_vec = xmmfour = _mm256_setzero_si256();
459 holderf.
int_vec = holder0 = _mm256_setzero_si256();
460 holderi.
int_vec = holder1 = _mm256_setzero_si256();
462 int bound = num_bytes >> 6;
465 xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
466 xmm9 = _mm256_setzero_si256();
467 xmm10 = _mm256_set1_epi32(8);
468 xmm3 = _mm256_setzero_ps();
470 __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
471 for(; i < bound; ++
i) {
472 xmm1 = _mm256_loadu_ps((
float*)src0);
473 xmm2 = _mm256_loadu_ps((
float*)&src0[4]);
477 xmm1 = _mm256_mul_ps(xmm1, xmm1);
478 xmm2 = _mm256_mul_ps(xmm2, xmm2);
480 xmm1 = _mm256_hadd_ps(xmm1, xmm2);
481 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
483 xmm3 = _mm256_max_ps(xmm1, xmm3);
485 xmm4.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
486 xmm5.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
488 xmm11 = _mm256_and_si256(xmm8, xmm5.
int_vec);
489 xmm12 = _mm256_and_si256(xmm9, xmm4.
int_vec);
491 xmm9 = _mm256_add_epi32(xmm11, xmm12);
493 xmm8 = _mm256_add_epi32(xmm8, xmm10);
495 xmm10 = _mm256_set1_epi32(4);
496 if (num_bytes >> 5 & 1) {
497 xmm1 = _mm256_loadu_ps((
float*)src0);
501 xmm1 = _mm256_mul_ps(xmm1, xmm1);
503 xmm1 = _mm256_hadd_ps(xmm1, xmm1);
504 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
506 xmm3 = _mm256_max_ps(xmm1, xmm3);
508 xmm4.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
509 xmm5.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
511 xmm11 = _mm256_and_si256(xmm8, xmm5.
int_vec);
512 xmm12 = _mm256_and_si256(xmm9, xmm4.
int_vec);
514 xmm9 = _mm256_add_epi32(xmm11, xmm12);
516 xmm8 = _mm256_add_epi32(xmm8, xmm10);
519 idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
520 xmm10 = _mm256_set1_epi32(2);
521 if (num_bytes >> 4 & 1) {
522 xmm2 = _mm256_loadu_ps((
float*)src0);
527 xmm2 = _mm256_mul_ps(xmm2, xmm2);
531 xmm1 = _mm256_hadd_ps(xmm2, xmm2);
533 xmm3 = _mm256_max_ps(xmm1, xmm3);
535 xmm4.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
536 xmm5.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
538 xmm11 = _mm256_and_si256(xmm8, xmm5.
int_vec);
539 xmm12 = _mm256_and_si256(xmm9, xmm4.
int_vec);
541 xmm9 = _mm256_add_epi32(xmm11, xmm12);
543 xmm8 = _mm256_add_epi32(xmm8, xmm10);
546 _mm256_storeu_ps((
float*)&(holderf.
f), xmm3);
547 _mm256_storeu_si256(&(holderi.
int_vec), xmm9);
549 target[0] = holderi.
i[0];
550 sq_dist = holderf.
f[0];
551 target[0] = (holderf.
f[1] > sq_dist) ? holderi.
i[1] : target[0];
552 sq_dist = (holderf.
f[1] > sq_dist) ? holderf.
f[1] : sq_dist;
553 target[0] = (holderf.
f[2] > sq_dist) ? holderi.
i[2] : target[0];
554 sq_dist = (holderf.
f[2] > sq_dist) ? holderf.
f[2] : sq_dist;
555 target[0] = (holderf.
f[3] > sq_dist) ? holderi.
i[3] : target[0];
556 sq_dist = (holderf.
f[3] > sq_dist) ? holderf.
f[3] : sq_dist;
557 target[0] = (holderf.
f[4] > sq_dist) ? holderi.
i[4] : target[0];
558 sq_dist = (holderf.
f[4] > sq_dist) ? holderf.
f[4] : sq_dist;
559 target[0] = (holderf.
f[5] > sq_dist) ? holderi.
i[5] : target[0];
560 sq_dist = (holderf.
f[5] > sq_dist) ? holderf.
f[5] : sq_dist;
561 target[0] = (holderf.
f[6] > sq_dist) ? holderi.
i[6] : target[0];
562 sq_dist = (holderf.
f[6] > sq_dist) ? holderf.
f[6] : sq_dist;
563 target[0] = (holderf.
f[7] > sq_dist) ? holderi.
i[7] : target[0];
564 sq_dist = (holderf.
f[7] > sq_dist) ? holderf.
f[7] : sq_dist;
/* NOTE(review): The lines below are Doxygen cross-reference residue left over
 * from an HTML-to-text extraction of this header's documentation page. They
 * are not source code; they are retained verbatim inside this comment only as
 * a reference for where the project types used above are defined.
 *
 * #define bit128_p(x)
 * Definition: volk_common.h:132
 * static void volk_32fc_index_max_16u_generic(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
 * Definition: volk_32fc_index_max_16u.h:399
 * float f[8]
 * Definition: volk_common.h:122
 * __m256i int_vec
 * Definition: volk_common.h:127
 * uint32_t i[8]
 * Definition: volk_common.h:121
 * __m128i int_vec
 * Definition: volk_common.h:113
 * static void volk_32fc_index_max_16u_a_sse3(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
 * Definition: volk_32fc_index_max_16u.h:251
 * for i
 * Definition: volk_config_fixed.tmpl.h:25
 * #define bit256_p(x)
 * Definition: volk_common.h:133
 * Definition: volk_common.h:118
 * __m128 float_vec
 * Definition: volk_common.h:109
 * float complex lv_32fc_t
 * Definition: volk_complex.h:61
 * __m256 float_vec
 * Definition: volk_common.h:126
 * float f[4]
 * Definition: volk_common.h:105
 * #define lv_creal(x)
 * Definition: volk_complex.h:83
 * Definition: volk_common.h:101
 * #define lv_cimag(x)
 * Definition: volk_complex.h:85
 * uint32_t i[4]
 * Definition: volk_common.h:104
 */