Vector Optimized Library of Kernels 2.2
Architecture-tuned implementations of math kernels
volk_32f_acos_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#include <stdio.h>
#include <math.h>
#include <inttypes.h>

/* Number of Taylor-series terms to evaluate; increase this for more accuracy. */
#define ACOS_TERMS 2

#ifndef INCLUDED_volk_32f_acos_32f_a_H
#define INCLUDED_volk_32f_acos_32f_a_H

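/*
 * Editorial note (not part of the original VOLK source): the kernels below
 * compute the arccosine through the identity
 *
 *     acos(a) = atan( sqrt(1 - a*a) / a ),  plus pi when a < 0.
 *
 * The arctangent is approximated by first reducing the argument with a
 * reciprocal step and two cotangent half-angle steps (x <- x + sqrt(1 + x*x)),
 * then evaluating the truncated Taylor series
 *
 *     atan(x) ~= sum_{j=0}^{ACOS_TERMS-1} (-1)^j * x^(2j+1) / (2j + 1)
 *
 * with Horner's rule, and finally multiplying by 4 to undo the two half-angle
 * steps. The scalar sketch below mirrors that scheme for reference; the name
 * acos_series_ref is illustrative only and is not a VOLK entry point.
 */
static inline float acos_series_ref(float a)
{
    float t, z, x, y;
    int i, j;

    t = sqrtf((1.0f + a) * (1.0f - a)) / a; /* tan(acos(a)); like the kernels, undefined at a == 0 */
    z = fabsf(t);
    x = (z < 1.0f) ? 1.0f / z : z;          /* reduce to an argument >= 1 */
    for (i = 0; i < 2; i++)                 /* two cotangent half-angle reductions */
        x += sqrtf(1.0f + x * x);
    x = 1.0f / x;
    y = 0.0f;
    for (j = ACOS_TERMS - 1; j >= 0; j--)   /* Horner evaluation of the truncated series */
        y = y * (x * x) + powf(-1.0f, (float)j) / (2 * j + 1);
    y *= x * 4.0f;                          /* undo the two half-angle reductions */
    if (z > 1.0f)                           /* complement when no reciprocal was taken */
        y = 1.57079632679489661923f - y;
    if (t < 0.0f)                           /* restore the sign of the tangent */
        y = -y;
    if (a < 0.0f)                           /* acos(a) lies in (pi/2, pi] when a < 0 */
        y += 3.14159265358979323846f;
    return y;
}
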
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal;
        /* aVal becomes tan(acos(a)) = sqrt(1 - a*a) / a */
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
        z = aVal;
        /* z = |aVal|; work with the reciprocal when z < 1 so the argument is >= 1 */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two cotangent half-angle reductions, then take the reciprocal */
        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated arctangent series */
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        /* restore the sign of the tangent, then add pi for negative inputs */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        d = aVal;
        aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_store_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_acos_32f_a_H */

#ifndef INCLUDED_volk_32f_acos_32f_u_H
#define INCLUDED_volk_32f_acos_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_storeu_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_acos_32f_u_H */
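
/*
 * Editorial usage sketch (not part of the original file): applications
 * normally call the generated VOLK dispatcher volk_32f_acos_32f(), which
 * selects one of the implementations above at runtime, rather than calling
 * an architecture-specific variant directly. The helpers volk_malloc,
 * volk_free and volk_get_alignment come from the VOLK runtime (volk.h);
 * N and the input values here are illustrative.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   float* in  = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *
 *   for (unsigned int i = 0; i < N; i++)
 *       in[i] = 2.0f * ((float)i / (float)(N - 1)) - 1.0f;   // values in [-1, 1]
 *
 *   volk_32f_acos_32f(out, in, N);                           // out[i] = acos(in[i])
 *
 *   volk_free(in);
 *   volk_free(out);
 */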