77 #ifndef INCLUDED_volk_32f_asin_32f_a_H 78 #define INCLUDED_volk_32f_asin_32f_a_H 80 #if LV_HAVE_AVX2 && LV_HAVE_FMA 81 #include <immintrin.h> 84 volk_32f_asin_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
90 unsigned int eighthPoints = num_points / 8;
93 __m256 aVal, pio2, x, y, z, arcsine;
94 __m256 fzeroes, fones, ftwos, ffours, condition;
96 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
97 fzeroes = _mm256_setzero_ps();
98 fones = _mm256_set1_ps(1.0);
99 ftwos = _mm256_set1_ps(2.0);
100 ffours = _mm256_set1_ps(4.0);
102 for(;number < eighthPoints; number++){
103 aVal = _mm256_load_ps(aPtr);
104 aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
106 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
107 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
108 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
109 x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
111 for(i = 0; i < 2; i++){
112 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
114 x = _mm256_div_ps(fones, x);
117 y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
120 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
121 condition = _mm256_cmp_ps(z, fones,_CMP_GT_OS);
123 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
125 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
126 arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
128 _mm256_store_ps(bPtr, arcsine);
133 number = eighthPoints * 8;
134 for(;number < num_points; number++){
135 *bPtr++ = asin(*aPtr++);
/* NOTE(review): the LV_HAVE_AVX guard is inferred from the kernel name and
 * the guards used by the sibling kernels -- confirm against upstream. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * Aligned AVX kernel: writes the arcsine of each element of aVector into
 * bVector.
 *
 * bVector     output buffer, num_points floats; accessed with aligned
 *             256-bit stores
 * aVector     input buffer, num_points floats; accessed with aligned
 *             256-bit loads
 * num_points  number of elements to process
 *
 * Eight elements are handled per vector iteration; the remaining
 * (num_points % 8) elements go through asinf().
 *
 * NOTE(review): `inline` and the series loop bounds were reconstructed; the
 * visible listing omitted those lines -- confirm against the upstream header.
 */
static inline void
volk_32f_asin_32f_a_avx(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps((float)(3.14159265358979323846 / 2));
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0f);
    ftwos = _mm256_set1_ps(2.0f);
    ffours = _mm256_set1_ps(4.0f);

    for (; number < eighthPoints; number++) {
        /* t = a / sqrt((1 + a) * (1 - a)); the rest of the loop evaluates
         * an arctangent of t. */
        aVal = _mm256_load_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));

        /* z = |t|: lanes with t < 0 have 2 * t subtracted. */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));

        /* x = 1 / z where z < 1, otherwise x = z. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two argument-reduction steps x <- x + sqrt(1 + x * x), then invert. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j * x^(2j) / (2j + 1).
         * NOTE(review): bounds assume ASIN_TERMS is the term count. */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps((float)(pow(-1, j) / (2 * j + 1))));
        }

        /* Multiply by x, and by 4 to undo the two reduction steps. */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));

        /* Lanes with z > 1 become pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        /* Restore the sign: lanes with t < 0 are negated. */
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; asinf keeps the computation in single precision like the
     * SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */
203 #ifdef LV_HAVE_SSE4_1 204 #include <smmintrin.h> 207 volk_32f_asin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
209 float* bPtr = bVector;
210 const float* aPtr = aVector;
212 unsigned int number = 0;
213 unsigned int quarterPoints = num_points / 4;
216 __m128 aVal, pio2, x, y, z, arcsine;
217 __m128 fzeroes, fones, ftwos, ffours, condition;
219 pio2 = _mm_set1_ps(3.14159265358979323846/2);
220 fzeroes = _mm_setzero_ps();
221 fones = _mm_set1_ps(1.0);
222 ftwos = _mm_set1_ps(2.0);
223 ffours = _mm_set1_ps(4.0);
225 for(;number < quarterPoints; number++){
226 aVal = _mm_load_ps(aPtr);
227 aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
229 condition = _mm_cmplt_ps(z, fzeroes);
230 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
231 condition = _mm_cmplt_ps(z, fones);
232 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
234 for(i = 0; i < 2; i++){
235 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
237 x = _mm_div_ps(fones, x);
240 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
243 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
244 condition = _mm_cmpgt_ps(z, fones);
246 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
248 condition = _mm_cmplt_ps(aVal, fzeroes);
249 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
251 _mm_store_ps(bPtr, arcsine);
256 number = quarterPoints * 4;
257 for(;number < num_points; number++){
258 *bPtr++ = asinf(*aPtr++);
266 #ifndef INCLUDED_volk_32f_asin_32f_u_H 267 #define INCLUDED_volk_32f_asin_32f_u_H 269 #if LV_HAVE_AVX2 && LV_HAVE_FMA 270 #include <immintrin.h> 273 volk_32f_asin_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
275 float* bPtr = bVector;
276 const float* aPtr = aVector;
278 unsigned int number = 0;
279 unsigned int eighthPoints = num_points / 8;
282 __m256 aVal, pio2, x, y, z, arcsine;
283 __m256 fzeroes, fones, ftwos, ffours, condition;
285 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
286 fzeroes = _mm256_setzero_ps();
287 fones = _mm256_set1_ps(1.0);
288 ftwos = _mm256_set1_ps(2.0);
289 ffours = _mm256_set1_ps(4.0);
291 for(;number < eighthPoints; number++){
292 aVal = _mm256_loadu_ps(aPtr);
293 aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
295 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
296 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
297 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
298 x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
300 for(i = 0; i < 2; i++){
301 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
303 x = _mm256_div_ps(fones, x);
306 y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
309 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
310 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
312 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
314 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
315 arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
317 _mm256_storeu_ps(bPtr, arcsine);
322 number = eighthPoints * 8;
323 for(;number < num_points; number++){
324 *bPtr++ = asin(*aPtr++);
/* NOTE(review): the LV_HAVE_AVX guard is inferred from the kernel name and
 * the guards used by the sibling kernels -- confirm against upstream. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * Unaligned AVX kernel: writes the arcsine of each element of aVector into
 * bVector.
 *
 * bVector     output buffer, num_points floats; accessed with unaligned
 *             256-bit stores
 * aVector     input buffer, num_points floats; accessed with unaligned
 *             256-bit loads
 * num_points  number of elements to process
 *
 * Eight elements are handled per vector iteration; the remaining
 * (num_points % 8) elements go through asinf().
 *
 * NOTE(review): `inline` and the series loop bounds were reconstructed; the
 * visible listing omitted those lines -- confirm against the upstream header.
 */
static inline void
volk_32f_asin_32f_u_avx(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps((float)(3.14159265358979323846 / 2));
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0f);
    ftwos = _mm256_set1_ps(2.0f);
    ffours = _mm256_set1_ps(4.0f);

    for (; number < eighthPoints; number++) {
        /* t = a / sqrt((1 + a) * (1 - a)); the rest of the loop evaluates
         * an arctangent of t. */
        aVal = _mm256_loadu_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));

        /* z = |t|: lanes with t < 0 have 2 * t subtracted. */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));

        /* x = 1 / z where z < 1, otherwise x = z. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two argument-reduction steps x <- x + sqrt(1 + x * x), then invert. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j * x^(2j) / (2j + 1).
         * NOTE(review): bounds assume ASIN_TERMS is the term count. */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps((float)(pow(-1, j) / (2 * j + 1))));
        }

        /* Multiply by x, and by 4 to undo the two reduction steps. */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));

        /* Lanes with z > 1 become pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        /* Restore the sign: lanes with t < 0 are negated. */
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; asinf keeps the computation in single precision like the
     * SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * Unaligned SSE4.1 kernel: writes the arcsine of each element of aVector
 * into bVector.
 *
 * bVector     output buffer, num_points floats; accessed with unaligned
 *             128-bit stores
 * aVector     input buffer, num_points floats; accessed with unaligned
 *             128-bit loads
 * num_points  number of elements to process
 *
 * Four elements are handled per vector iteration; the remaining
 * (num_points % 4) elements go through asinf().
 *
 * NOTE(review): `inline` and the series loop bounds were reconstructed; the
 * visible listing omitted those lines -- confirm against the upstream header.
 */
static inline void
volk_32f_asin_32f_u_sse4_1(float* bVector,
                           const float* aVector,
                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps((float)(3.14159265358979323846 / 2));
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0f);
    ftwos = _mm_set1_ps(2.0f);
    ffours = _mm_set1_ps(4.0f);

    for (; number < quarterPoints; number++) {
        /* t = a / sqrt((1 + a) * (1 - a)); the rest of the loop evaluates
         * an arctangent of t. */
        aVal = _mm_loadu_ps(aPtr);
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));

        /* z = |t|: lanes with t < 0 have 2 * t subtracted. */
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));

        /* x = 1 / z where z < 1, otherwise x = z. */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* Two argument-reduction steps x <- x + sqrt(1 + x * x), then invert. */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j * x^(2j) / (2j + 1).
         * NOTE(review): bounds assume ASIN_TERMS is the term count. */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps((float)(pow(-1, j) / (2 * j + 1))));
        }

        /* Multiply by x, and by 4 to undo the two reduction steps. */
        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));

        /* Lanes with z > 1 become pi/2 - y. */
        condition = _mm_cmpgt_ps(z, fones);
        y = _mm_add_ps(y,
                       _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));

        /* Restore the sign: lanes with t < 0 are negated. */
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine =
            _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_storeu_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */
454 #ifdef LV_HAVE_GENERIC 459 float* bPtr = bVector;
460 const float* aPtr = aVector;
461 unsigned int number = 0;
463 for(number = 0; number < num_points; number++){
464 *bPtr++ = asinf(*aPtr++);
static void volk_32f_asin_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:457
#define ASIN_TERMS
Definition: volk_32f_asin_32f.h:75
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:335
static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:146