Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator_32fc.h
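This kernel rotates a block of complex floats by a fixed phase increment per sample, i.e. out[n] = in[n] * phase * phase_inc^n, writing the running phasor back through *phase so consecutive calls stay phase-continuous, and renormalizing it periodically to cancel floating-point drift. A minimal usage sketch follows; it assumes the VOLK-generated dispatcher of the same name (pulled in via <volk/volk.h>) and a hypothetical shift_block() helper.

#include <math.h>
#include <volk/volk.h>

/* Frequency-shift one block by -0.1 rad/sample. The phase argument carries the
 * rotator state across calls and should start out as lv_cmake(1.f, 0.f). */
static void shift_block(lv_32fc_t* out, const lv_32fc_t* in,
                        unsigned int num_points, lv_32fc_t* phase)
{
    const float rad_per_sample = -0.1f;
    /* phase_inc must be a unit-magnitude phasor; the kernel only renormalizes *phase. */
    const lv_32fc_t phase_inc = lv_cmake(cosf(rad_per_sample), sinf(rad_per_sample));
    volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, phase, num_points);
}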
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
80 #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
81 #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
82 
83 
84 #include <volk/volk_complex.h>
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include <math.h>
88 #define ROTATOR_RELOAD 512
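// ROTATOR_RELOAD sets how many samples are processed between renormalizations of the
// running phase; renormalizing every 512 samples keeps the phasor's magnitude from
// drifting away from 1.0 due to accumulated floating-point rounding.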
89 
90 
91 #ifdef LV_HAVE_GENERIC
92 
93 static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
94  unsigned int i = 0;
95  int j = 0;
96  for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
97  for(j = 0; j < ROTATOR_RELOAD; ++j) {
98  *outVector++ = *inVector++ * (*phase);
99  (*phase) *= phase_inc;
100  }
101 
102  (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
103  }
104  for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
105  *outVector++ = *inVector++ * (*phase);
106  (*phase) *= phase_inc;
107  }
108  if(i){
109  // Make sure we normalize the phase on every call!
110  (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
111  }
112 }
113 
114 #endif /* LV_HAVE_GENERIC */
115 
116 
117 #ifdef LV_HAVE_NEON
118 #include <arm_neon.h>
120 
121 static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
122 
123 {
124  lv_32fc_t* outputVectorPtr = outVector;
125  const lv_32fc_t* inputVectorPtr = inVector;
126  lv_32fc_t incr = 1;
127  lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)};
128  float32x4x2_t input_vec;
129  float32x4x2_t output_vec;
130 
131  unsigned int i = 0, j = 0;
132  const unsigned int quarter_points = num_points / 4;
133 
134  for(i = 0; i < 4; ++i) {
135  phasePtr[i] *= incr;
136  incr *= (phase_inc);
137  }
138 
139  // Note that incr was advanced to phase_inc^4 in the previous loop, so each SIMD iteration moves all four lanes forward by four samples
140  const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr};
141  const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr);
142  float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr);
143 
144  for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) {
145  for(j = 0; j < ROTATOR_RELOAD; j++) {
146  input_vec = vld2q_f32((float*) inputVectorPtr);
147  // Prefetch the next batch; it speeds things up
148  __VOLK_PREFETCH(inputVectorPtr+4);
149  // Rotate
150  output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
151  // Increase phase
152  phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
153  // Store output
154  vst2q_f32((float*)outputVectorPtr, output_vec);
155 
156  outputVectorPtr+=4;
157  inputVectorPtr+=4;
158  }
159  // normalize phase so magnitude doesn't grow because of
160  // floating point rounding error
161  const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
162  const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
163  // Multiply complex with real
164  phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
165  phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
166  }
167 
168  for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
169  input_vec = vld2q_f32((float*) inputVectorPtr);
170  // Prefetch next one, speeds things up
171  __VOLK_PREFETCH(inputVectorPtr+4);
172  // Rotate
173  output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
174  // Increase phase
175  phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
176  // Store output
177  vst2q_f32((float*)outputVectorPtr, output_vec);
178 
179  outputVectorPtr+=4;
180  inputVectorPtr+=4;
181  }
182  // i != 0 means the remainder loop above ran at least once
183  if (i) {
184  // normalize phase so magnitude doesn't grow because of
185  // floating point rounding error
186  const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
187  const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
188  // Multiply complex with real
189  phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
190  phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
191  }
192  // Store current phase
193  vst2q_f32((float*)phasePtr, phase_vec);
194 
195  // Deal with the rest
196  for(i = 0; i < num_points % 4; i++) {
197  *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
198  phasePtr[0] *= (phase_inc);
199  }
200 
201  // Save the phase so it stays continuous across calls to this function
202  (*phase) = phasePtr[0];
203 }
204 
205 #endif /* LV_HAVE_NEON */
206 
207 
208 #ifdef LV_HAVE_SSE4_1
209 #include <smmintrin.h>
210 
211 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
212  lv_32fc_t* cPtr = outVector;
213  const lv_32fc_t* aPtr = inVector;
214  lv_32fc_t incr = 1;
215  lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
216 
217  unsigned int i, j = 0;
218 
219  for(i = 0; i < 2; ++i) {
220  phase_Ptr[i] *= incr;
221  incr *= (phase_inc);
222  }
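 // phase_Ptr now holds {phase, phase*phase_inc} and incr == phase_inc^2, so each
 // two-sample SIMD iteration below advances both lanes by two samples.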
223 
224  /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
225  printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
226  printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
227  __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
228 
229  phase_Val = _mm_loadu_ps((float*)phase_Ptr);
230  inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
231 
232  const unsigned int halfPoints = num_points / 2;
233 
234 
235  for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
236  for(j = 0; j < ROTATOR_RELOAD; ++j) {
237 
238  aVal = _mm_load_ps((float*)aPtr);
239 
240  yl = _mm_moveldup_ps(phase_Val);
241  yh = _mm_movehdup_ps(phase_Val);
242  ylp = _mm_moveldup_ps(inc_Val);
243  yhp = _mm_movehdup_ps(inc_Val);
244 
245  tmp1 = _mm_mul_ps(aVal, yl);
246  tmp1p = _mm_mul_ps(phase_Val, ylp);
247 
248  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
249  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
250  tmp2 = _mm_mul_ps(aVal, yh);
251  tmp2p = _mm_mul_ps(phase_Val, yhp);
252 
253  z = _mm_addsub_ps(tmp1, tmp2);
254  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
255 
256  _mm_store_ps((float*)cPtr, z);
257 
258  aPtr += 2;
259  cPtr += 2;
260  }
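 // Renormalize the two phasors: square the components, hadd forms |phase|^2 for each
 // complex lane, the 0xD8 shuffle lines each magnitude up with its re/im pair, and the
 // divide by the square root restores unit magnitude.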
261  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
262  tmp2 = _mm_hadd_ps(tmp1, tmp1);
263  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
264  tmp2 = _mm_sqrt_ps(tmp1);
265  phase_Val = _mm_div_ps(phase_Val, tmp2);
266  }
267  for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
268  aVal = _mm_load_ps((float*)aPtr);
269 
270  yl = _mm_moveldup_ps(phase_Val);
271  yh = _mm_movehdup_ps(phase_Val);
272  ylp = _mm_moveldup_ps(inc_Val);
273  yhp = _mm_movehdup_ps(inc_Val);
274 
275  tmp1 = _mm_mul_ps(aVal, yl);
276 
277  tmp1p = _mm_mul_ps(phase_Val, ylp);
278 
279  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
280  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
281  tmp2 = _mm_mul_ps(aVal, yh);
282  tmp2p = _mm_mul_ps(phase_Val, yhp);
283 
284  z = _mm_addsub_ps(tmp1, tmp2);
285  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
286 
287  _mm_store_ps((float*)cPtr, z);
288 
289  aPtr += 2;
290  cPtr += 2;
291  }
292  if (i) {
293  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
294  tmp2 = _mm_hadd_ps(tmp1, tmp1);
295  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
296  tmp2 = _mm_sqrt_ps(tmp1);
297  phase_Val = _mm_div_ps(phase_Val, tmp2);
298  }
299 
300  _mm_storeu_ps((float*)phase_Ptr, phase_Val);
301  if (num_points & 1) {
302  *cPtr++ = *aPtr++ * phase_Ptr[0];
303  phase_Ptr[0] *= (phase_inc);
304  }
305 
306  (*phase) = phase_Ptr[0];
307 
308 }
309 
310 #endif /* LV_HAVE_SSE4_1 for aligned */
311 
312 
313 #ifdef LV_HAVE_SSE4_1
314 #include <smmintrin.h>
315 
316 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
317  lv_32fc_t* cPtr = outVector;
318  const lv_32fc_t* aPtr = inVector;
319  lv_32fc_t incr = 1;
320  lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
321 
322  unsigned int i, j = 0;
323 
324  for(i = 0; i < 2; ++i) {
325  phase_Ptr[i] *= incr;
326  incr *= (phase_inc);
327  }
328 
329  /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
330  printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
331  printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
332  __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
333 
334  phase_Val = _mm_loadu_ps((float*)phase_Ptr);
335  inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
336 
337  const unsigned int halfPoints = num_points / 2;
338 
339 
340  for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
341  for(j = 0; j < ROTATOR_RELOAD; ++j) {
342 
343  aVal = _mm_loadu_ps((float*)aPtr);
344 
345  yl = _mm_moveldup_ps(phase_Val);
346  yh = _mm_movehdup_ps(phase_Val);
347  ylp = _mm_moveldup_ps(inc_Val);
348  yhp = _mm_movehdup_ps(inc_Val);
349 
350  tmp1 = _mm_mul_ps(aVal, yl);
351  tmp1p = _mm_mul_ps(phase_Val, ylp);
352 
353  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
354  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
355  tmp2 = _mm_mul_ps(aVal, yh);
356  tmp2p = _mm_mul_ps(phase_Val, yhp);
357 
358  z = _mm_addsub_ps(tmp1, tmp2);
359  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
360 
361  _mm_storeu_ps((float*)cPtr, z);
362 
363  aPtr += 2;
364  cPtr += 2;
365  }
366  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
367  tmp2 = _mm_hadd_ps(tmp1, tmp1);
368  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
369  tmp2 = _mm_sqrt_ps(tmp1);
370  phase_Val = _mm_div_ps(phase_Val, tmp2);
371  }
372  for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
373  aVal = _mm_loadu_ps((float*)aPtr);
374 
375  yl = _mm_moveldup_ps(phase_Val);
376  yh = _mm_movehdup_ps(phase_Val);
377  ylp = _mm_moveldup_ps(inc_Val);
378  yhp = _mm_movehdup_ps(inc_Val);
379 
380  tmp1 = _mm_mul_ps(aVal, yl);
381 
382  tmp1p = _mm_mul_ps(phase_Val, ylp);
383 
384  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
385  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
386  tmp2 = _mm_mul_ps(aVal, yh);
387  tmp2p = _mm_mul_ps(phase_Val, yhp);
388 
389  z = _mm_addsub_ps(tmp1, tmp2);
390  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
391 
392  _mm_storeu_ps((float*)cPtr, z);
393 
394  aPtr += 2;
395  cPtr += 2;
396  }
397  if (i) {
398  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
399  tmp2 = _mm_hadd_ps(tmp1, tmp1);
400  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
401  tmp2 = _mm_sqrt_ps(tmp1);
402  phase_Val = _mm_div_ps(phase_Val, tmp2);
403  }
404 
405  _mm_storeu_ps((float*)phase_Ptr, phase_Val);
406  if (num_points & 1) {
407  *cPtr++ = *aPtr++ * phase_Ptr[0];
408  phase_Ptr[0] *= (phase_inc);
409  }
410 
411  (*phase) = phase_Ptr[0];
412 
413 }
414 
415 #endif /* LV_HAVE_SSE4_1 */
416 
417 
418 #ifdef LV_HAVE_AVX
419 #include <immintrin.h>
421 
422 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
423  lv_32fc_t* cPtr = outVector;
424  const lv_32fc_t* aPtr = inVector;
425  lv_32fc_t incr = lv_cmake(1.0, 0.0);
426  lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
427 
428  unsigned int i, j = 0;
429 
430  for(i = 0; i < 4; ++i) {
431  phase_Ptr[i] *= incr;
432  incr *= (phase_inc);
433  }
434 
435  __m256 aVal, phase_Val, z;
436 
437  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
438 
439  const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
440  lv_cimag(incr), lv_creal(incr),
441  lv_cimag(incr), lv_creal(incr),
442  lv_cimag(incr), lv_creal(incr));
443 
444  const unsigned int fourthPoints = num_points / 4;
445 
446  for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
447  for(j = 0; j < ROTATOR_RELOAD; ++j) {
448 
449  aVal = _mm256_load_ps((float*)aPtr);
450 
451  z = _mm256_complexmul_ps(aVal, phase_Val);
452  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
453 
454  _mm256_store_ps((float*)cPtr, z);
455 
456  aPtr += 4;
457  cPtr += 4;
458  }
459  phase_Val = _mm256_normalize_ps(phase_Val);
460  }
461 
462  for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
463  aVal = _mm256_load_ps((float*)aPtr);
464 
465  z = _mm256_complexmul_ps(aVal, phase_Val);
466  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
467 
468  _mm256_store_ps((float*)cPtr, z);
469 
470  aPtr += 4;
471  cPtr += 4;
472  }
473  if (i) {
474  phase_Val = _mm256_normalize_ps(phase_Val);
475  }
476 
477  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
478  (*phase) = phase_Ptr[0];
479  volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
480 }
481 
482 #endif /* LV_HAVE_AVX for aligned */
483 
484 
485 #ifdef LV_HAVE_AVX
486 #include <immintrin.h>
488 
489 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
490  lv_32fc_t* cPtr = outVector;
491  const lv_32fc_t* aPtr = inVector;
492  lv_32fc_t incr = lv_cmake(1.0, 0.0);
493  lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
494 
495  unsigned int i, j = 0;
496 
497  for(i = 0; i < 4; ++i) {
498  phase_Ptr[i] *= incr;
499  incr *= (phase_inc);
500  }
501 
502  __m256 aVal, phase_Val, z;
503 
504  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
505 
506  const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
507  lv_cimag(incr), lv_creal(incr),
508  lv_cimag(incr), lv_creal(incr),
509  lv_cimag(incr), lv_creal(incr));
510 
511  const unsigned int fourthPoints = num_points / 4;
512 
513  for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); ++i) {
514  for(j = 0; j < ROTATOR_RELOAD; ++j) {
515 
516  aVal = _mm256_loadu_ps((float*)aPtr);
517 
518  z = _mm256_complexmul_ps(aVal, phase_Val);
519  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
520 
521  _mm256_storeu_ps((float*)cPtr, z);
522 
523  aPtr += 4;
524  cPtr += 4;
525  }
526  phase_Val = _mm256_normalize_ps(phase_Val);
527 
528  }
529 
530  for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
531  aVal = _mm256_loadu_ps((float*)aPtr);
532 
533  z = _mm256_complexmul_ps(aVal, phase_Val);
534  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
535 
536  _mm256_storeu_ps((float*)cPtr, z);
537 
538  aPtr += 4;
539  cPtr += 4;
540  }
541  if (i) {
542  phase_Val = _mm256_normalize_ps(phase_Val);
543  }
544 
545  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
546  (*phase) = phase_Ptr[0];
547  volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
548 }
549 
550 #endif /* LV_HAVE_AVX */
551 
552 #if LV_HAVE_AVX && LV_HAVE_FMA
553 #include <immintrin.h>
554 
555 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
556  lv_32fc_t* cPtr = outVector;
557  const lv_32fc_t* aPtr = inVector;
558  lv_32fc_t incr = 1;
559  __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
560 
561  unsigned int i, j = 0;
562 
563  for(i = 0; i < 4; ++i) {
564  phase_Ptr[i] *= incr;
565  incr *= (phase_inc);
566  }
567 
568  __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
569 
570  phase_Val = _mm256_load_ps((float*)phase_Ptr);
571  inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
572  const unsigned int fourthPoints = num_points / 4;
573 
574  for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
575  for(j = 0; j < ROTATOR_RELOAD; ++j) {
576 
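 // Complex multiply via FMA: yl/yh duplicate the real/imag parts of the phase (and
 // increment) vectors, the 0xB1 shuffle swaps re/im in the other operand, and
 // fmaddsub fuses the multiply with the final subtract/add to give
 // (re*re - im*im, im*re + re*im) per complex element.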
577  aVal = _mm256_load_ps((float*)aPtr);
578 
579  yl = _mm256_moveldup_ps(phase_Val);
580  yh = _mm256_movehdup_ps(phase_Val);
581  ylp = _mm256_moveldup_ps(inc_Val);
582  yhp = _mm256_movehdup_ps(inc_Val);
583 
584  tmp1 = aVal;
585  tmp1p = phase_Val;
586 
587  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
588  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
589  tmp2 = _mm256_mul_ps(aVal, yh);
590  tmp2p = _mm256_mul_ps(phase_Val, yhp);
591 
592  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
593  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
594 
595  _mm256_store_ps((float*)cPtr, z);
596 
597  aPtr += 4;
598  cPtr += 4;
599  }
600  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
601  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
602  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
603  tmp2 = _mm256_sqrt_ps(tmp1);
604  phase_Val = _mm256_div_ps(phase_Val, tmp2);
605  }
606  for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
607  aVal = _mm256_load_ps((float*)aPtr);
608 
609  yl = _mm256_moveldup_ps(phase_Val);
610  yh = _mm256_movehdup_ps(phase_Val);
611  ylp = _mm256_moveldup_ps(inc_Val);
612  yhp = _mm256_movehdup_ps(inc_Val);
613 
614  tmp1 = aVal;
615  tmp1p = phase_Val;
616 
617  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
618  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
619  tmp2 = _mm256_mul_ps(aVal, yh);
620  tmp2p = _mm256_mul_ps(phase_Val, yhp);
621 
622  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
623  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
624 
625  _mm256_store_ps((float*)cPtr, z);
626 
627  aPtr += 4;
628  cPtr += 4;
629  }
630  if (i) {
631  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
632  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
633  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
634  tmp2 = _mm256_sqrt_ps(tmp1);
635  phase_Val = _mm256_div_ps(phase_Val, tmp2);
636  }
637 
638  _mm256_store_ps((float*)phase_Ptr, phase_Val);
639  for(i = 0; i < num_points%4; ++i) {
640  *cPtr++ = *aPtr++ * phase_Ptr[0];
641  phase_Ptr[0] *= (phase_inc);
642  }
643 
644  (*phase) = phase_Ptr[0];
645 
646 }
647 
648 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
649 
650 #if LV_HAVE_AVX && LV_HAVE_FMA
651 #include <immintrin.h>
652 
653 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
654  lv_32fc_t* cPtr = outVector;
655  const lv_32fc_t* aPtr = inVector;
656  lv_32fc_t incr = 1;
657  lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
658 
659  unsigned int i, j = 0;
660 
661  for(i = 0; i < 4; ++i) {
662  phase_Ptr[i] *= incr;
663  incr *= (phase_inc);
664  }
665 
666  __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
667 
668  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
669  inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
670  const unsigned int fourthPoints = num_points / 4;
671 
672  for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
673  for(j = 0; j < ROTATOR_RELOAD; ++j) {
674 
675  aVal = _mm256_loadu_ps((float*)aPtr);
676 
677  yl = _mm256_moveldup_ps(phase_Val);
678  yh = _mm256_movehdup_ps(phase_Val);
679  ylp = _mm256_moveldup_ps(inc_Val);
680  yhp = _mm256_movehdup_ps(inc_Val);
681 
682  tmp1 = aVal;
683  tmp1p = phase_Val;
684 
685  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
686  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
687  tmp2 = _mm256_mul_ps(aVal, yh);
688  tmp2p = _mm256_mul_ps(phase_Val, yhp);
689 
690  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
691  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
692 
693  _mm256_storeu_ps((float*)cPtr, z);
694 
695  aPtr += 4;
696  cPtr += 4;
697  }
698  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
699  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
700  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
701  tmp2 = _mm256_sqrt_ps(tmp1);
702  phase_Val = _mm256_div_ps(phase_Val, tmp2);
703  }
704  for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
705  aVal = _mm256_loadu_ps((float*)aPtr);
706 
707  yl = _mm256_moveldup_ps(phase_Val);
708  yh = _mm256_movehdup_ps(phase_Val);
709  ylp = _mm256_moveldup_ps(inc_Val);
710  yhp = _mm256_movehdup_ps(inc_Val);
711 
712  tmp1 = aVal;
713  tmp1p = phase_Val;
714 
715  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
716  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
717  tmp2 = _mm256_mul_ps(aVal, yh);
718  tmp2p = _mm256_mul_ps(phase_Val, yhp);
719 
720  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
721  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
722 
723  _mm256_storeu_ps((float*)cPtr, z);
724 
725  aPtr += 4;
726  cPtr += 4;
727  }
728  if (i) {
729  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
730  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
731  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
732  tmp2 = _mm256_sqrt_ps(tmp1);
733  phase_Val = _mm256_div_ps(phase_Val, tmp2);
734  }
735 
736  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
737  for(i = 0; i < num_points%4; ++i) {
738  *cPtr++ = *aPtr++ * phase_Ptr[0];
739  phase_Ptr[0] *= (phase_inc);
740  }
741 
742  (*phase) = phase_Ptr[0];
743 
744 }
745 
746 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
747 
748 #endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */