/*
 * Aligned AVX variant of the "index of the maximum float" kernel.  The
 * cross-reference index at the end of this listing places
 * volk_32f_index_max_16u_a_avx at line 84, immediately before this body.
 *
 * NOTE(review): this excerpt omits lines.  The function signature, the
 * declarations of `max`, `index`, `number`, `maxValuesBuffer` and
 * `maxIndexesBuffer`, the tail-loop body and the closing braces are not
 * visible here, so their exact types and initial values must be confirmed
 * against the full header.
 */
71 #ifndef INCLUDED_volk_32f_index_max_16u_a_H 72 #define INCLUDED_volk_32f_index_max_16u_a_H 81 #include <immintrin.h> 87 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* The clamp above limits the scan to USHRT_MAX points because the result is
 * written out as a uint16_t. */
/* Number of complete 8-float groups handled by the vector loop. */
90 const uint32_t eighthPoints = num_points / 8;
92 float* inputPtr = (
float*)src0;
/* Indexes are tracked as floats, one per lane.  With num_points clamped to
 * USHRT_MAX every index is far below 2^24 and therefore exactly
 * representable in a float. */
94 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps takes the highest lane first, so lane 0 starts at -8 and
 * lane 7 at -1; the first add inside the loop turns these into 0..7. */
95 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* Running per-lane maximum, seeded with `max`, and its index, seeded with 0. */
99 __m256 maxValues = _mm256_set1_ps(max);
100 __m256 maxValuesIndex = _mm256_setzero_ps();
101 __m256 compareResults;
102 __m256 currentValues;
/* Vector pass: each lane keeps the largest value it has seen and the index
 * at which that value occurred. */
107 for(;number < eighthPoints; number++){
/* Aligned load: inputPtr must be 32-byte aligned here. */
109 currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
110 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than (ordered compare, false when either operand is NaN),
 * so on equal values a lane keeps the index it recorded first. */
112 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Lanes whose compare mask is set take the new index/value; the other lanes
 * keep their previous contents. */
114 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
115 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane results so they can be reduced with scalar code.  These
 * are aligned stores; presumably both buffers are declared 32-byte aligned
 * on a line not shown here -- confirm. */
119 _mm256_store_ps(maxValuesBuffer, maxValues);
120 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction across the 8 lanes. */
122 for(number = 0; number < 8; number++){
123 if(maxValuesBuffer[number] > max){
124 index = maxIndexesBuffer[number];
125 max = maxValuesBuffer[number];
126 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
127 if (index > maxIndexesBuffer[number])
128 index = maxIndexesBuffer[number];
/* Scalar tail: resume at the first element the vector loop did not cover.
 * The body of the `if` below is not visible in this excerpt. */
132 number = eighthPoints * 8;
133 for(;number < num_points; number++){
134 if(src0[number] > max){
/* Publish the winning index, narrowed to 16 bits. */
139 target[0] = (uint16_t)index;
/*
 * Aligned SSE4.1 variant of the "index of the maximum float" kernel,
 * compiled only when LV_HAVE_SSE4_1 is defined.  It processes 4 floats per
 * iteration and writes the index of the largest value to target[0].
 *
 * NOTE(review): this excerpt omits lines.  The return type, the num_points
 * parameter, the declarations of `max`, `index`, `number`,
 * `maxValuesBuffer` and `maxIndexesBuffer`, the tail-loop body and the
 * closing braces / #endif are not visible here; confirm them against the
 * full header.
 */
144 #ifdef LV_HAVE_SSE4_1 145 #include <smmintrin.h> 148 volk_32f_index_max_16u_a_sse4_1(uint16_t* target,
const float* src0,
/* Limit the scan to USHRT_MAX points because the result is written out as a
 * uint16_t. */
151 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Number of complete 4-float groups handled by the vector loop. */
154 const uint32_t quarterPoints = num_points / 4;
156 float* inputPtr = (
float*)src0;
/* Indexes are tracked as floats, one per lane; after the clamp above they
 * stay well inside the range a float represents exactly. */
158 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps takes the highest lane first, so lane 0 starts at -4 and lane 3
 * at -1; the first add inside the loop turns these into 0..3. */
159 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Running per-lane maximum, seeded with `max`, and its index, seeded with 0. */
163 __m128 maxValues = _mm_set1_ps(max);
164 __m128 maxValuesIndex = _mm_setzero_ps();
165 __m128 compareResults;
166 __m128 currentValues;
/* Vector pass: each lane keeps the largest value it has seen and the index
 * at which that value occurred. */
171 for(;number < quarterPoints; number++){
/* Aligned load: inputPtr must be 16-byte aligned here. */
173 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
174 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than, so on equal values a lane keeps its earlier index. */
176 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Lanes whose compare mask is set take the new index/value; the other lanes
 * keep their previous contents. */
178 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
179 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane results for the scalar reduction.  These are aligned
 * stores; presumably both buffers are declared 16-byte aligned on a line
 * not shown here -- confirm. */
183 _mm_store_ps(maxValuesBuffer, maxValues);
184 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction across the 4 lanes. */
186 for(number = 0; number < 4; number++){
187 if(maxValuesBuffer[number] > max){
188 index = maxIndexesBuffer[number];
189 max = maxValuesBuffer[number];
190 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
191 if (index > maxIndexesBuffer[number])
192 index = maxIndexesBuffer[number];
/* Scalar tail: resume at the first element the vector loop did not cover.
 * The body of the `if` below is not visible in this excerpt. */
196 number = quarterPoints * 4;
197 for(;number < num_points; number++){
198 if(src0[number] > max){
/* Publish the winning index, narrowed to 16 bits. */
203 target[0] = (uint16_t)index;
/*
 * Aligned SSE variant of the "index of the maximum float" kernel.  The
 * cross-reference index at the end of this listing places
 * volk_32f_index_max_16u_a_sse at line 214, immediately before this body.
 * It processes 4 floats per iteration.
 *
 * NOTE(review): this excerpt omits lines.  The function signature, the
 * declarations of `max`, `index`, `number`, `maxValuesBuffer` and
 * `maxIndexesBuffer`, the tail-loop body and the closing braces are not
 * visible here; confirm them against the full header.
 */
211 #include <xmmintrin.h> 217 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* The clamp above limits the scan to USHRT_MAX points because the result is
 * written out as a uint16_t. */
/* Number of complete 4-float groups handled by the vector loop. */
220 const uint32_t quarterPoints = num_points / 4;
222 float* inputPtr = (
float*)src0;
/* Indexes are tracked as floats, one per lane; after the clamp they stay
 * well inside the range a float represents exactly. */
224 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps takes the highest lane first, so lane 0 starts at -4 and lane 3
 * at -1; the first add inside the loop turns these into 0..3. */
225 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Running per-lane maximum, seeded with `max`, and its index, seeded with 0. */
229 __m128 maxValues = _mm_set1_ps(max);
230 __m128 maxValuesIndex = _mm_setzero_ps();
231 __m128 compareResults;
232 __m128 currentValues;
/* Vector pass: each lane keeps the largest value it has seen and the index
 * at which that value occurred. */
237 for(;number < quarterPoints; number++){
/* Aligned load: inputPtr must be 16-byte aligned here. */
239 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
240 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than, so on equal values a lane keeps its earlier index. */
242 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Per-lane select built from and/andnot/or instead of the blend intrinsic
 * used by the SSE4.1 variant: (mask & new) | (~mask & old). */
244 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
245 _mm_andnot_ps(compareResults, maxValuesIndex));
246 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
247 _mm_andnot_ps(compareResults, maxValues));
/* Spill the per-lane results for the scalar reduction.  These are aligned
 * stores; presumably both buffers are declared 16-byte aligned on a line
 * not shown here -- confirm. */
251 _mm_store_ps(maxValuesBuffer, maxValues);
252 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction across the 4 lanes. */
254 for(number = 0; number < 4; number++){
255 if(maxValuesBuffer[number] > max){
256 index = maxIndexesBuffer[number];
257 max = maxValuesBuffer[number];
258 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
259 if (index > maxIndexesBuffer[number])
260 index = maxIndexesBuffer[number];
/* Scalar tail: resume at the first element the vector loop did not cover.
 * The body of the `if` below is not visible in this excerpt. */
264 number = quarterPoints * 4;
265 for(;number < num_points; number++){
266 if(src0[number] > max){
/* Publish the winning index, narrowed to 16 bits. */
271 target[0] = (uint16_t)index;
/*
 * Generic (non-intrinsic) variant, compiled only when LV_HAVE_GENERIC is
 * defined.  The cross-reference index at the end of this listing places
 * volk_32f_index_max_16u_generic at line 280, immediately before this body.
 *
 * NOTE(review): only the clamp and the loop header are visible in this
 * excerpt; the signature, the declaration and initial value of `i`, the loop
 * body and the closing braces / #endif must be read from the full header.
 */
277 #ifdef LV_HAVE_GENERIC 283 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Same USHRT_MAX clamp as the SIMD variants in this listing.  The loop below
 * walks the remaining points one at a time. */
290 for(; i < num_points; ++
i) {
/*
 * Unaligned AVX variant of the "index of the maximum float" kernel.  The
 * cross-reference index at the end of this listing places
 * volk_32f_index_max_16u_u_avx at line 319, immediately before this body.
 * It uses unaligned loads and stores (loadu/storeu), so neither the input
 * pointer nor the spill buffers need 32-byte alignment.
 *
 * NOTE(review): this excerpt omits lines.  The function signature, the
 * declarations of `max`, `index`, `number`, `maxValuesBuffer` and
 * `maxIndexesBuffer`, the tail-loop body and the closing braces are not
 * visible here; confirm them against the full header.
 */
306 #ifndef INCLUDED_volk_32f_index_max_16u_u_H 307 #define INCLUDED_volk_32f_index_max_16u_u_H 311 #include <inttypes.h> 316 #include <immintrin.h> 322 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* The clamp above limits the scan to USHRT_MAX points because the result is
 * written out as a uint16_t. */
/* Number of complete 8-float groups handled by the vector loop. */
325 const uint32_t eighthPoints = num_points / 8;
327 float* inputPtr = (
float*)src0;
/* Indexes are tracked as floats, one per lane; after the clamp they stay
 * well inside the range a float represents exactly. */
329 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps takes the highest lane first, so lane 0 starts at -8 and
 * lane 7 at -1; the first add inside the loop turns these into 0..7. */
330 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* Running per-lane maximum, seeded with `max`, and its index, seeded with 0. */
334 __m256 maxValues = _mm256_set1_ps(max);
335 __m256 maxValuesIndex = _mm256_setzero_ps();
336 __m256 compareResults;
337 __m256 currentValues;
/* Vector pass: each lane keeps the largest value it has seen and the index
 * at which that value occurred. */
342 for(;number < eighthPoints; number++){
/* Unaligned load of the next 8 floats. */
344 currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
345 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than (ordered compare, false when either operand is NaN),
 * so on equal values a lane keeps the index it recorded first. */
347 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Lanes whose compare mask is set take the new index/value; the other lanes
 * keep their previous contents. */
349 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
350 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane results so they can be reduced with scalar code. */
354 _mm256_storeu_ps(maxValuesBuffer, maxValues);
355 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction across the 8 lanes. */
357 for(number = 0; number < 8; number++){
358 if(maxValuesBuffer[number] > max){
359 index = maxIndexesBuffer[number];
360 max = maxValuesBuffer[number];
361 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
362 if (index > maxIndexesBuffer[number])
363 index = maxIndexesBuffer[number];
/* Scalar tail: resume at the first element the vector loop did not cover.
 * The body of the `if` below is not visible in this excerpt. */
367 number = eighthPoints * 8;
368 for(;number < num_points; number++){
369 if(src0[number] > max){
/* Publish the winning index, narrowed to 16 bits. */
374 target[0] = (uint16_t)index;
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:280
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:319
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:214
for i
Definition: volk_config_fixed.tmpl.h:25
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:47
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:84