#ifndef INCLUDED_volk_32f_acos_32f_a_H
#define INCLUDED_volk_32f_acos_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Element-wise arccosine of a float vector (AVX2 + FMA, aligned access).
 *
 * Eight floats are processed per iteration with aligned 256-bit loads and
 * stores; the remaining (num_points % 8) elements go through acosf().
 *
 * The vector path uses acos(a) = atan(sqrt((1 + a) * (1 - a)) / a), adding pi
 * when a is negative. The arctangent is obtained by reducing its argument
 * twice with x -> x + sqrt(1 + x^2) and then evaluating an ACOS_TERMS-term
 * alternating power series in Horner form.
 *
 * \param bVector    output buffer, written with aligned stores
 * \param aVector    input buffer, read with aligned loads
 * \param num_points number of floats in each buffer
 */
static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j; /* j is signed: the series loop counts down to zero */

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal; /* keep the raw input; its sign selects the +pi fix-up below */
        /* aVal <- sqrt((1 + a) * (1 - a)) / a, the argument handed to atan */
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        /* z <- |z|: subtract 2*z in the lanes where z is negative */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x <- 1/z where z < 1, otherwise z (so x >= 1 in every lane) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two argument reductions, then invert to get a small series argument */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j * x^(2j) / (2j + 1) */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* undo the two reductions: the angle was quartered, so scale by 4 */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* where z > 1 the series ran on 1/z: y <- pi/2 - y */
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        /* restore the sign of the atan argument */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        /* negative inputs: add pi */
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail; acosf keeps single precision like the SSE/generic kernels */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


/* NOTE(review): the guard macro for this kernel is not visible in the listing;
 * LV_HAVE_AVX is inferred from the sibling guards - confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise arccosine of a float vector (AVX without FMA, aligned access).
 *
 * Same algorithm as volk_32f_acos_32f_a_avx2_fma, with each fused
 * multiply-add written as a separate multiply and add.
 *
 * \param bVector    output buffer, written with aligned stores
 * \param aVector    input buffer, read with aligned loads
 * \param num_points number of floats in each buffer
 */
static inline void volk_32f_acos_32f_a_avx(float* bVector,
                                           const float* aVector,
                                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal; /* raw input, used for the +pi fix-up */
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        /* z <- |z| */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x <- max(z, 1/z) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of the alternating series */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* where z > 1: y <- pi/2 - y */
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail for the last (num_points % 8) elements */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Element-wise arccosine of a float vector (SSE4.1 guard, aligned access).
 *
 * Same algorithm as volk_32f_acos_32f_a_avx2_fma on 128-bit registers:
 * four floats per iteration, then a scalar acosf() tail.
 *
 * \param bVector    output buffer, written with aligned stores
 * \param aVector    input buffer, read with aligned loads
 * \param num_points number of floats in each buffer
 */
static inline void volk_32f_acos_32f_a_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        d = aVal; /* raw input, used for the +pi fix-up */
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        /* z <- |z| */
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        /* x <- max(z, 1/z) */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);

        /* Horner evaluation of the alternating series */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        /* where z > 1: y <- pi/2 - y */
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine = _mm_sub_ps(arccosine,
                               _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_store_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    /* scalar tail for the last (num_points % 4) elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_acos_32f_a_H */
#ifndef INCLUDED_volk_32f_acos_32f_u_H
#define INCLUDED_volk_32f_acos_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Element-wise arccosine of a float vector (AVX2 + FMA, unaligned access).
 *
 * Eight floats are processed per iteration with unaligned 256-bit loads and
 * stores; the remaining (num_points % 8) elements go through acosf().
 *
 * The vector path uses acos(a) = atan(sqrt((1 + a) * (1 - a)) / a), adding pi
 * when a is negative. The arctangent is obtained by reducing its argument
 * twice with x -> x + sqrt(1 + x^2) and then evaluating an ACOS_TERMS-term
 * alternating power series in Horner form.
 *
 * \param bVector    output buffer
 * \param aVector    input buffer
 * \param num_points number of floats in each buffer
 */
static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j; /* j is signed: the series loop counts down to zero */

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal; /* keep the raw input; its sign selects the +pi fix-up below */
        /* aVal <- sqrt((1 + a) * (1 - a)) / a, the argument handed to atan */
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        /* z <- |z|: subtract 2*z in the lanes where z is negative */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x <- 1/z where z < 1, otherwise z (so x >= 1 in every lane) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two argument reductions, then invert to get a small series argument */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j * x^(2j) / (2j + 1) */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* undo the two reductions: the angle was quartered, so scale by 4 */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* where z > 1 the series ran on 1/z: y <- pi/2 - y */
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        /* restore the sign of the atan argument */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        /* negative inputs: add pi */
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail; acosf keeps single precision like the SSE/generic kernels */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


/* NOTE(review): the guard macro for this kernel is not visible in the listing;
 * LV_HAVE_AVX is inferred from the sibling guards - confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise arccosine of a float vector (AVX without FMA, unaligned access).
 *
 * Same algorithm as volk_32f_acos_32f_u_avx2_fma, with each fused
 * multiply-add written as a separate multiply and add.
 *
 * \param bVector    output buffer
 * \param aVector    input buffer
 * \param num_points number of floats in each buffer
 */
static inline void volk_32f_acos_32f_u_avx(float* bVector,
                                           const float* aVector,
                                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal; /* raw input, used for the +pi fix-up */
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        /* z <- |z| */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x <- max(z, 1/z) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of the alternating series */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* where z > 1: y <- pi/2 - y */
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail for the last (num_points % 8) elements */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Element-wise arccosine of a float vector (SSE4.1 guard, unaligned access).
 *
 * Same algorithm as volk_32f_acos_32f_u_avx2_fma on 128-bit registers:
 * four floats per iteration, then a scalar acosf() tail.
 *
 * \param bVector    output buffer
 * \param aVector    input buffer
 * \param num_points number of floats in each buffer
 */
static inline void volk_32f_acos_32f_u_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        d = aVal; /* raw input, used for the +pi fix-up */
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        /* z <- |z| */
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        /* x <- max(z, 1/z) */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);

        /* Horner evaluation of the alternating series */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        /* where z > 1: y <- pi/2 - y */
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine = _mm_sub_ps(arccosine,
                               _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_storeu_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    /* scalar tail for the last (num_points % 4) elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */


#ifdef LV_HAVE_GENERIC

/*!
 * \brief Element-wise arccosine of a float vector using acosf() on every element.
 *
 * \param bVector    output buffer
 * \param aVector    input buffer
 * \param num_points number of floats in each buffer
 */
static inline void volk_32f_acos_32f_generic(float* bVector,
                                             const float* aVector,
                                             unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_acos_32f_u_H */
/*
 * Cross-references carried over from the generated source listing:
 *
 *   static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_acos_32f.h:344
 *   for i
 *       Definition: volk_config_fixed.tmpl.h:25
 *   static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_acos_32f.h:148
 *   #define ACOS_TERMS
 *       Definition: volk_32f_acos_32f.h:75
 *   static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_acos_32f.h:470
 */