#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
                                   float* src0,
                                   float* center_point_array,
                                   float* cutoff,
                                   unsigned int num_points)
{
  float result = 0.0f;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;
  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
  xmm9 = _mm_setzero_ps();
  xmm1 = _mm_setzero_ps();
  xmm0 = _mm_load1_ps(&center_point_array[0]);
  xmm6 = _mm_load1_ps(&center_point_array[1]);
  xmm7 = _mm_load1_ps(&center_point_array[2]);
  xmm8 = _mm_load1_ps(&center_point_array[3]);
  xmm10 = _mm_load1_ps(cutoff);
  int bound = num_points / 8;
  int leftovers = num_points - 8 * bound;
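  // Each iteration handles 8 floats as two groups of 4, accumulating into
  // two independent registers (xmm9 and xmm1) so consecutive iterations do
  // not serialize on a single add dependency chain.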
  int i = 0;
  for(; i < bound; ++i) {
    xmm2 = _mm_load_ps(src0);
    xmm2 = _mm_max_ps(xmm10, xmm2);  // x = max(x, cutoff)
    xmm3 = _mm_mul_ps(xmm2, xmm2);   // x^2
    xmm4 = _mm_mul_ps(xmm2, xmm3);   // x^3
    xmm5 = _mm_mul_ps(xmm3, xmm3);   // x^4

    xmm2 = _mm_mul_ps(xmm2, xmm0);   // c0 * x
    xmm3 = _mm_mul_ps(xmm3, xmm6);   // c1 * x^2
    xmm4 = _mm_mul_ps(xmm4, xmm7);   // c2 * x^3
    xmm5 = _mm_mul_ps(xmm5, xmm8);   // c3 * x^4

    xmm2 = _mm_add_ps(xmm2, xmm3);
    xmm3 = _mm_add_ps(xmm4, xmm5);

    src0 += 4;

    xmm9 = _mm_add_ps(xmm2, xmm9);
    xmm9 = _mm_add_ps(xmm3, xmm9);
    xmm2 = _mm_load_ps(src0);
    xmm2 = _mm_max_ps(xmm10, xmm2);
    xmm3 = _mm_mul_ps(xmm2, xmm2);
    xmm4 = _mm_mul_ps(xmm2, xmm3);
    xmm5 = _mm_mul_ps(xmm3, xmm3);

    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm3 = _mm_mul_ps(xmm3, xmm6);
    xmm4 = _mm_mul_ps(xmm4, xmm7);
    xmm5 = _mm_mul_ps(xmm5, xmm8);

    xmm2 = _mm_add_ps(xmm2, xmm3);
    xmm3 = _mm_add_ps(xmm4, xmm5);

    src0 += 4;

    xmm1 = _mm_add_ps(xmm2, xmm1);
    xmm1 = _mm_add_ps(xmm3, xmm1);
  }
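  // Merge the two accumulators, then reduce the 8 partial sums to a single
  // scalar with three horizontal adds; the result lands in lane 0.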
  xmm2 = _mm_hadd_ps(xmm9, xmm1);
  xmm3 = _mm_hadd_ps(xmm2, xmm2);
  xmm4 = _mm_hadd_ps(xmm3, xmm3);

  _mm_store_ss(&result, xmm4);
  for(i = 0; i < leftovers; ++i) {
    fst = src0[i];
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }
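  // The constant coefficient c4 contributes once per input point, so it is
  // added after the loops as num_points * c4 instead of inside them.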
  result += (float)(num_points)*center_point_array[4];

  *target = result;
}

#endif /*LV_HAVE_SSE3*/
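/*
 * Usage sketch (hypothetical values, not from this file). Each kernel
 * computes
 *   *target = sum_i( c0*x_i + c1*x_i^2 + c2*x_i^3 + c3*x_i^4 ) + N*c4
 * with every x_i clamped below by *cutoff:
 *
 *   float coeffs[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 2.0f };
 *   __VOLK_ATTR_ALIGNED(16) float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
 *   float cutoff = 0.0f;
 *   float out;
 *   volk_32f_x3_sum_of_poly_32f_a_sse3(&out, x, coeffs, &cutoff, 8);
 */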
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                       float* src0,
                                       float* center_point_array,
                                       float* cutoff,
                                       unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;
  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();
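  // With FMA available, c0*x + c1*x^2 and c2*x^3 + c3*x^4 each fold into a
  // single fused multiply-add, saving two instructions per iteration over
  // the plain AVX kernel below.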
  unsigned int i;
  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_load_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);  // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);  // x^3
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);  // x^4

    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);    // c1 * x^2
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);    // c3 * x^4

    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);  // c0*x   + c1*x^2
    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);  // c2*x^3 + c3*x^4

    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
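  // _mm256_hadd_ps adds within 128-bit lanes, so a single pass leaves the
  // four pairwise sums at indices 0, 1, 4 and 5 of the stored vector.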
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                  float* src0,
                                  float* center_point_array,
                                  float* cutoff,
                                  unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;
  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();
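  // Without FMA, each polynomial term takes a separate multiply and add;
  // otherwise this loop matches the FMA kernel above.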
  unsigned int i;
  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_load_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);  // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);  // x^3
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);  // x^4

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points)*center_point_array[4];
}

#endif // LV_HAVE_AVX


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                    float* src0,
                                    float* center_point_array,
                                    float* cutoff,
                                    unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;

  float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  unsigned int i = 0;
  unsigned int k = 0;
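  // Eight independent partial sums keep several adds in flight and mirror
  // the accumulation order of the SIMD kernels above.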
  for(i = 0; i < eighth_points; ++i) {
    for(k = 0; k < 8; ++k) {
      fst = *src0++;
      fst = MAX(fst, *cutoff);
      sq = fst * fst;
      thrd = fst * sq;
      frth = sq * sq;
      result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
      result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
    }
  }
  for(k = 0; k < 8; k += 2)
    result[k] = result[k] + result[k + 1];

  *target = result[0] + result[2] + result[4] + result[6];
  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points)*center_point_array[4];
}

#endif /*LV_HAVE_GENERIC*/


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
  unsigned int i;
  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x2_t cutoff_vector;
  float32x2x2_t x_low, x_high;
  float32x4_t x_qvector, c_qvector, cpa_qvector;
  float accumulator;
  float res_accumulators[4];
  c_qvector = vld1q_f32(zero);
  // splat the cutoff across a 2-lane vector
  cutoff_vector = vdup_n_f32(*cutoff);
  // load the four polynomial coefficients [c0, c1, c2, c3]
  cpa_qvector = vld1q_f32(center_point_array);
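  // Strategy: for each scalar x, build the 4-lane vector [x, x^2, x^3, x^4]
  // and multiply-accumulate it against [c0, c1, c2, c3] with one vmlaq_f32.
  // One point is processed per iteration, but all four terms in parallel.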
  for(i = 0; i < num_points; ++i) {
    // duplicate the scalar into both lanes
    x_to_1 = vdup_n_f32(*src0++);

    x_to_1 = vmax_f32(x_to_1, cutoff_vector);  // x
    x_to_2 = vmul_f32(x_to_1, x_to_1);         // x^2
    x_to_3 = vmul_f32(x_to_2, x_to_1);         // x^3
    x_to_4 = vmul_f32(x_to_3, x_to_1);         // x^4

    // interleave so val[0] holds [x, x^2] and [x^3, x^4] respectively
    x_low = vzip_f32(x_to_1, x_to_2);
    x_high = vzip_f32(x_to_3, x_to_4);

    // combine into the quad vector [x, x^2, x^3, x^4]
    x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);

    // c_qvector += [c0*x, c1*x^2, c2*x^3, c3*x^4]
    c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
  }
  // reduce the four per-term accumulators to a scalar
  vst1q_f32(res_accumulators, c_qvector);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON

static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
  unsigned int i;
  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

  float accumulator;
  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
  accumulator1_vec = vld1q_f32(zero);
  accumulator2_vec = vld1q_f32(zero);
  accumulator3_vec = vld1q_f32(zero);
  accumulator4_vec = vld1q_f32(zero);
  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
  // splat the cutoff and the four polynomial coefficients
  cutoff_vector = vdupq_n_f32(*cutoff);
  cpa_0 = vdupq_n_f32(center_point_array[0]);
  cpa_1 = vdupq_n_f32(center_point_array[1]);
  cpa_2 = vdupq_n_f32(center_point_array[2]);
  cpa_3 = vdupq_n_f32(center_point_array[3]);
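  // "Vertical" variant: four points per iteration, with a separate
  // accumulator vector for each power of x; the four accumulators are only
  // folded together after the loop.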
  for(i = 0; i < num_points / 4; ++i) {
    x_to_1 = vld1q_f32(src0);
    x_to_1 = vmaxq_f32(x_to_1, cutoff_vector);  // x
    x_to_2 = vmulq_f32(x_to_1, x_to_1);         // x^2
    x_to_3 = vmulq_f32(x_to_2, x_to_1);         // x^3
    x_to_4 = vmulq_f32(x_to_3, x_to_1);         // x^4
    x_to_1 = vmulq_f32(x_to_1, cpa_0);
    x_to_2 = vmulq_f32(x_to_2, cpa_1);
    x_to_3 = vmulq_f32(x_to_3, cpa_2);
    x_to_4 = vmulq_f32(x_to_4, cpa_3);
    accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
    accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
    accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
    accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

    src0 += 4;
  }
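  // fold the four per-power accumulator vectors into one before the
  // horizontal reduction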
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
  vst1q_f32(res_accumulators, accumulator1_vec);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  for(i = 4 * (num_points / 4); i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    accumulator += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
  }
  *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /*LV_HAVE_NEON*/

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                      float* src0,
                                      float* center_point_array,
                                      float* cutoff,
                                      unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;
  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();
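  // Identical to the aligned AVX+FMA kernel above except for the unaligned
  // loads and stores.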
  unsigned int i;
  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_loadu_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);  // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);  // x^3
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);  // x^4

    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);    // c1 * x^2
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);    // c3 * x^4

    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);  // c0*x   + c1*x^2
    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);  // c2*x^3 + c3*x^4

    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_storeu_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points)*center_point_array[4];
}

#endif // LV_HAVE_AVX && LV_HAVE_FMA


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                  float* src0,
                                  float* center_point_array,
                                  float* cutoff,
                                  unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;
  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();
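  // Identical to the aligned AVX kernel above except for the unaligned
  // loads and stores.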
  unsigned int i;
  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_loadu_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);  // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);  // x^3
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);  // x^4

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_storeu_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points)*center_point_array[4];
}

#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/