volk_32fc_index_max_32u.h

/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

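/*!
 * \page volk_32fc_index_max_32u
 *
 * \b Overview
 *
 * Returns the index of the complex sample with the largest squared
 * magnitude, i.e. the i that maximizes
 * lv_creal(src0[i])^2 + lv_cimag(src0[i])^2.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
 * \endcode
 *
 * \b Example
 * A minimal usage sketch, assuming the standard VOLK helpers
 * volk_get_alignment(), volk_malloc() and volk_free():
 * \code
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 *   uint32_t* max_index = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
 *
 *   // ... fill in[0..N-1] with complex samples ...
 *
 *   volk_32fc_index_max_32u(max_index, in, N);
 *   printf("strongest sample at index %u\n", *max_index);
 *
 *   volk_free(in);
 *   volk_free(max_index);
 * \endcode
 */
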
#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H

#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  int bound = num_bytes >> 6;   // number of full 8-sample (64-byte) blocks
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // per-lane sample indices
  xmm9 = _mm256_setzero_si256();                   // indices of the running maxima
  xmm10 = _mm256_set1_epi32(8);                    // index increment per iteration
  xmm3 = _mm256_setzero_ps();                      // running per-lane maxima
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); // undoes hadd's lane interleave

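  /*
   * Vectorized argmax: each of the 8 lanes tracks its own running maximum
   * (xmm3) and the sample index it came from (xmm9). After comparing the
   * new magnitudes against the updated maxima, the EQ mask selects lanes
   * where the new value won (their index comes from xmm8) and the LT mask
   * selects lanes where the old maximum survives (their index is kept from
   * xmm9); OR-ing the two disjoint masked index sets, here via add,
   * rebuilds xmm9. The 8 lane maxima are reduced to one winner at the end.
   */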
  for(; i < bound; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);
    xmm2 = _mm256_load_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);   // re^2, im^2 pairs for samples 0-3
    xmm2 = _mm256_mul_ps(xmm2, xmm2);   // re^2, im^2 pairs for samples 4-7

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);  // re^2 + im^2 = |z|^2, per 128-bit lane
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); // restore ascending sample order

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); // old maximum kept
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); // new value won

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  xmm10 = _mm256_set1_epi32(4);
  if (num_bytes >> 5 & 1) {  // at least 4 samples remain
    xmm1 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  if (num_bytes >> 4 & 1) {  // at least 2 samples remain
    // Full 256-bit load; only the first two complex samples are meaningful here
    xmm2 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;  // replicate the two remaining indices across lanes

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  // Note: a trailing odd sample (num_points % 2 == 1) is not examined here.

  _mm256_store_ps((float*)&(holderf.f), xmm3);
  _mm256_store_si256(&(holderi.int_vec), xmm9);

  // Reduce the 8 per-lane maxima to the single largest value and its index
  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;

  union bit128 holderf;
  union bit128 holderi;
  float sq_dist = 0.0;

  union bit128 xmm5, xmm4;
  __m128 xmm1, xmm2, xmm3;
  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm_setzero_si128();
  xmm4.int_vec = xmmfour = _mm_setzero_si128();
  holderf.int_vec = holder0 = _mm_setzero_si128();
  holderi.int_vec = holder1 = _mm_setzero_si128();

  int bound = num_bytes >> 5;   // number of full 4-sample (32-byte) blocks
  int i = 0;

  xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
  xmm9 = _mm_setzero_si128();
  xmm10 = _mm_set_epi32(4, 4, 4, 4);
  xmm3 = _mm_setzero_ps();

  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);

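  /*
   * Same masked index-tracking scheme as the AVX2 version, with four lanes:
   * _mm_hadd_ps on the squared loads leaves [|z0|^2, |z1|^2, |z2|^2, |z3|^2]
   * already in ascending sample order, so no lane permute is needed here.
   */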
  for(; i < bound; ++i) {
    xmm1 = _mm_load_ps((float*)src0);
    xmm2 = _mm_load_ps((float*)&src0[2]);

    src0 += 4;

    xmm1 = _mm_mul_ps(xmm1, xmm1);
    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm1, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);

    //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
  }

  if (num_bytes >> 4 & 1) {
    xmm2 = _mm_load_ps((float*)src0);

    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    xmm8 = bit128_p(&xmm1)->int_vec;

    xmm2 = _mm_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm_hadd_ps(xmm2, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm10 = _mm_set_epi32(2, 2, 2, 2); //load1_ps((float*)&init[2]);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
    //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
  }

  if (num_bytes >> 3 & 1) {
    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);

    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    xmm2 = _mm_load1_ps(&sq_dist);

    xmm1 = xmm3;

    xmm3 = _mm_max_ss(xmm3, xmm2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);
  }

  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);

  _mm_store_ps((float*)&(holderf.f), xmm3);
  _mm_store_si128(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;

  /*
  float placeholder = 0.0;
  uint32_t temp0, temp1;
  uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
  uint32_t l0 = g0 ^ 1;

  uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
  uint32_t l1 = g1 ^ 1;

  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];

  g0 = (sq_dist > placeholder);
  l0 = g0 ^ 1;
  target[0] = g0 * temp0 + l0 * temp1;
  */
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
                                uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;

  float sq_dist = 0.0;
  float max = 0.0;
  uint32_t index = 0;

  uint32_t i = 0;

  for(; i < num_bytes >> 3; ++i) {  // num_bytes >> 3 == num_points
    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

    // strict > keeps the earliest index among equal maxima
    index = sq_dist > max ? i : index;
    max = sq_dist > max ? sq_dist : max;
  }
  target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/


#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

// Identical to volk_32fc_index_max_32u_a_avx2 above, except that the loads
// and stores are unaligned, so src0 need not be 32-byte aligned.
static inline void
volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  int bound = num_bytes >> 6;
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  xmm9 = _mm256_setzero_si256();
  xmm10 = _mm256_set1_epi32(8);
  xmm3 = _mm256_setzero_ps();
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

  for(; i < bound; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);
    xmm2 = _mm256_loadu_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  xmm10 = _mm256_set1_epi32(4);
  if (num_bytes >> 5 & 1) {
    xmm1 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  if (num_bytes >> 4 & 1) {
    xmm2 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
  _mm256_storeu_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
  unsigned int number = 0;
  const uint32_t quarter_points = num_points / 4;
  const lv_32fc_t* src0Ptr = src0;

  uint32_t indices[4] = {0, 1, 2, 3};
  const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
  uint32x4_t vec_indices = vld1q_u32(indices);
  uint32x4_t vec_max_indices = vec_indices;

  if(num_points)
  {
    // Seed the running maximum with the squared magnitude of the first
    // sample; an implicit complex-to-float conversion would keep only
    // the real part, which can be negative.
    const float re0 = lv_creal(*src0Ptr);
    const float im0 = lv_cimag(*src0Ptr);
    float max = re0 * re0 + im0 * im0;
    uint32_t index = 0;

    float32x4_t vec_max = vdupq_n_f32(max);

    for(; number < quarter_points; number++)
    {
      // Load complex and compute magnitude squared
      const float32x4_t vec_mag2 = _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
      // Advance first, then prefetch: __VOLK_PREFETCH may expand to nothing,
      // which must not swallow the pointer increment.
      src0Ptr += 4;
      __VOLK_PREFETCH(src0Ptr);
      // a > b?
      const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
      vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
      vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
      vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
    }
    uint32_t tmp_max_indices[4];
    float tmp_max[4];
    vst1q_u32(tmp_max_indices, vec_max_indices);
    vst1q_f32(tmp_max, vec_max);

    for (int i = 0; i < 4; i++) {
      if (tmp_max[i] > max) {
        max = tmp_max[i];
        index = tmp_max_indices[i];
      }
    }

    // Deal with the rest
    for(number = quarter_points * 4; number < num_points; number++)
    {
      const float re = lv_creal(*src0Ptr);
      const float im = lv_cimag(*src0Ptr);
      const float sq_dist = re*re + im*im;
      if (sq_dist > max) {
        max = sq_dist;
        index = number;
      }
      src0Ptr++;
    }
    *target = index;
  }
}

#endif /*LV_HAVE_NEON*/

#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/