/* Include guard for the aligned kernels. LOG_POLY_DEGREE (6 here) is tested by
 * the x86 SIMD kernels below to pick the polynomial used for the mantissa term. */
92 #ifndef INCLUDED_volk_32f_log2_32f_a_H 93 #define INCLUDED_volk_32f_log2_32f_a_H 100 #define LOG_POLY_DEGREE 6 104 float const result = log2f(f);
/* log2f_non_ieee: an infinite log2f result is replaced by 127.0f carrying the
 * sign of that infinity; every other result (NaN included) is returned as is. */
105 return isinf(result) ? copysignf(127.0f, result) : result;
/* Generic (scalar) kernel, compiled only when LV_HAVE_GENERIC is defined.
 * Walks the input and output through local cursors for all num_points elements.
 * NOTE(review): the loop body is not visible in this view; presumably it stores
 * log2f_non_ieee(*aPtr++) through bPtr -- confirm against the full file. */
108 #ifdef LV_HAVE_GENERIC 113 float* bPtr = bVector;
114 const float* aPtr = aVector;
115 unsigned int number = 0;
117 for(number = 0; number < num_points; number++)
/* POLYn_FMAAVX2(x, c0, ..., cn) evaluates c0 + c1*x + ... + cn*x^n on eight
 * packed floats with Horner's scheme, one fused multiply-add per degree.
 *
 * volk_32f_log2_32f_a_avx2_fma: AVX2 + FMA log2 approximation, eight floats per
 * iteration. It uses _mm256_load_ps / _mm256_store_ps, so both buffers must meet
 * the alignment those intrinsics require. */
122 #if LV_HAVE_AVX2 && LV_HAVE_FMA 123 #include <immintrin.h> 125 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) 126 #define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) 127 #define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) 128 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) 129 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) 130 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) 133 volk_32f_log2_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
135 float* bPtr = bVector;
136 const float* aPtr = aVector;
138 unsigned int number = 0;
/* Number of complete 8-float vectors in the input. */
139 const unsigned int eighthPoints = num_points / 8;
141 __m256 aVal, bVal, mantissa, frac, leadingOne;
144 for(;number < eighthPoints; number++){
146 aVal = _mm256_load_ps(aPtr);
147 bias = _mm256_set1_epi32(127);
148 leadingOne = _mm256_set1_ps(1.0f);
/* Isolate the biased exponent field (mask 0x7f800000), shift it down by 23 and
 * subtract the bias of 127. The mask drops the sign bit, so the sign of the
 * input does not influence the result. */
149 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
/* Integer part of the result: the unbiased exponent converted to float. */
150 bVal = _mm256_cvtepi32_ps(exp);
/* Keep the 23 mantissa bits and OR in the bit pattern of 1.0f, so every lane of
 * frac lies in [1, 2). */
153 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Polynomial in frac selected at compile time. The POLY macro is one degree
 * lower than LOG_POLY_DEGREE because the multiply by (frac - 1) below adds one. */
155 #if LOG_POLY_DEGREE == 6 156 mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
157 #elif LOG_POLY_DEGREE == 5 158 mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
159 #elif LOG_POLY_DEGREE == 4 160 mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
161 #elif LOG_POLY_DEGREE == 3 162 mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = exponent + mantissa * (frac - 1), computed as one fused multiply-add. */
167 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
168 _mm256_store_ps(bPtr, bVal);
/* Index of the first element the vector loop did not cover.
 * NOTE(review): the code that processes the remaining elements is not visible
 * in this view. */
174 number = eighthPoints * 8;
/* POLYn_AVX2(x, c0, ..., cn) evaluates c0 + c1*x + ... + cn*x^n on eight packed
 * floats with Horner's scheme, using a separate multiply and add per degree
 * (no FMA).
 *
 * volk_32f_log2_32f_a_avx2: AVX2 log2 approximation, eight floats per iteration,
 * with aligned loads and stores (_mm256_load_ps / _mm256_store_ps). */
181 #include <immintrin.h> 183 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) 184 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) 185 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) 186 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) 187 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) 188 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) 191 volk_32f_log2_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
193 float* bPtr = bVector;
194 const float* aPtr = aVector;
196 unsigned int number = 0;
/* Count of whole 8-float groups. */
197 const unsigned int eighthPoints = num_points / 8;
199 __m256 aVal, bVal, mantissa, frac, leadingOne;
202 for(;number < eighthPoints; number++){
204 aVal = _mm256_load_ps(aPtr);
205 bias = _mm256_set1_epi32(127);
206 leadingOne = _mm256_set1_ps(1.0f);
/* Unbiased exponent per lane: mask the exponent field, shift right by 23,
 * subtract 127. The sign bit is excluded by the mask. */
207 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
/* The exponent, as a float, is the integer part of the logarithm. */
208 bVal = _mm256_cvtepi32_ps(exp);
/* Mantissa bits combined with the bit pattern of 1.0f: frac is in [1, 2). */
211 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Compile-time choice of polynomial; its degree is LOG_POLY_DEGREE - 1 because
 * the product with (frac - 1) below contributes the last degree. */
213 #if LOG_POLY_DEGREE == 6 214 mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
215 #elif LOG_POLY_DEGREE == 5 216 mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
217 #elif LOG_POLY_DEGREE == 4 218 mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
219 #elif LOG_POLY_DEGREE == 3 220 mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = mantissa * (frac - 1) + exponent, as separate multiply and add. */
225 bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
226 _mm256_store_ps(bPtr, bVal);
/* First index not handled by the vector loop.
 * NOTE(review): the remainder handling is not visible in this view. */
232 number = eighthPoints * 8;
/* POLYn(x, c0, ..., cn) evaluates c0 + c1*x + ... + cn*x^n on four packed floats
 * with Horner's scheme (multiply followed by add).
 *
 * volk_32f_log2_32f_a_sse4_1: log2 approximation on four floats per iteration,
 * compiled when LV_HAVE_SSE4_1 is defined, using aligned _mm_load_ps /
 * _mm_store_ps. */
238 #ifdef LV_HAVE_SSE4_1 239 #include <smmintrin.h> 241 #define POLY0(x, c0) _mm_set1_ps(c0) 242 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) 243 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) 244 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) 245 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) 246 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) 249 volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
251 float* bPtr = bVector;
252 const float* aPtr = aVector;
254 unsigned int number = 0;
/* Number of complete 4-float vectors in the input. */
255 const unsigned int quarterPoints = num_points / 4;
257 __m128 aVal, bVal, mantissa, frac, leadingOne;
260 for(;number < quarterPoints; number++){
262 aVal = _mm_load_ps(aPtr);
263 bias = _mm_set1_epi32(127);
264 leadingOne = _mm_set1_ps(1.0f);
/* Biased exponent field -> unbiased integer exponent (mask, shift by 23,
 * subtract 127); the sign bit is not part of the mask. */
265 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
/* Integer part of the result. */
266 bVal = _mm_cvtepi32_ps(exp);
/* Mantissa bits with the exponent of 1.0f ORed in, giving frac in [1, 2). */
269 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
/* Polynomial in frac chosen by LOG_POLY_DEGREE; one degree lower than the
 * setting because of the later multiply by (frac - 1). */
271 #if LOG_POLY_DEGREE == 6 272 mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
273 #elif LOG_POLY_DEGREE == 5 274 mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
275 #elif LOG_POLY_DEGREE == 4 276 mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
277 #elif LOG_POLY_DEGREE == 3 278 mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = exponent + mantissa * (frac - 1). */
283 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
284 _mm_store_ps(bPtr, bVal);
/* First index not covered by the 4-wide loop.
 * NOTE(review): the remainder handling is not visible in this view. */
290 number = quarterPoints * 4;
/* VLOG2Q_NEON_PREAMBLE declares the constants VLOG2Q_NEON_F32 relies on: `one`
 * (numerically 0x00800000, i.e. bit 23, the implicit leading one of the
 * significand), polynomial coefficients p0..p6, the exponent and significand
 * masks, and the exponent bias 127.
 *
 * VLOG2Q_NEON_F32(log2_approx, aval) takes four float bit patterns in the
 * int32x4_t aval. It masks out exponent and significand, restores the leading
 * one, converts the significand s from fixed point with 23 fractional bits to
 * float, and sets
 *   log2_approx = unbiased exponent + p0 + p1*s + p2*s^2 + ... + p6*s^6,
 * building the powers of s with explicit multiplications. */
297 #include <arm_neon.h> 300 #define VLOG2Q_NEON_PREAMBLE() \ 301 int32x4_t one = vdupq_n_s32(0x000800000); \ 303 float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ 304 float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ 305 float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ 306 float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ 307 float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ 308 float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ 309 float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ 310 int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ 311 int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ 312 int32x4_t exp_bias = vdupq_n_s32(127); 315 #define VLOG2Q_NEON_F32(log2_approx, aval) \ 316 int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ 317 int32x4_t significand_i = vandq_s32(aval, sig_mask); \ 318 exponent_i = vshrq_n_s32(exponent_i, 23); \ 323 significand_i = vorrq_s32(one, significand_i); \ 324 float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \ 326 exponent_i = vsubq_s32(exponent_i, exp_bias); \ 327 float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ 331 log2_approx = vaddq_f32(exponent_f, p0); \ 332 float32x4_t tmp1 = vmulq_f32(significand_f, p1); \ 333 log2_approx = vaddq_f32(log2_approx, tmp1); \ 334 float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \ 335 tmp1 = vmulq_f32(sig_2, p2); \ 336 log2_approx = vaddq_f32(log2_approx, tmp1); \ 338 float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \ 339 tmp1 = vmulq_f32(sig_3, p3); \ 340 log2_approx = vaddq_f32(log2_approx, tmp1); \ 341 float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \ 342 tmp1 = vmulq_f32(sig_4, p4); \ 343 log2_approx = vaddq_f32(log2_approx, tmp1); \ 344 float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \ 345 tmp1 = vmulq_f32(sig_5, p5); \ 346 log2_approx = vaddq_f32(log2_approx, tmp1); \ 347 float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \ 348 tmp1 = vmulq_f32(sig_6, p6); \ 349 log2_approx = vaddq_f32(log2_approx, tmp1); 354 float* bPtr = bVector;
355 const float* aPtr = aVector;
/* Number of complete 4-float vectors in the input. */
357 const unsigned int quarterPoints = num_points / 4;
360 float32x4_t log2_approx;
371 for(number = 0; number < quarterPoints; ++number){
/* Load four input floats as raw 32-bit integers via a pointer cast.
 * NOTE(review): the statements between this load and the store below
 * (presumably the VLOG2Q_NEON_F32 invocation) are not visible in this view. */
373 aval = vld1q_s32((
int*)aPtr);
377 vst1q_f32(bPtr, log2_approx);
/* First index not covered by the 4-wide loop; remainder handling not visible. */
383 number = quarterPoints * 4;
/* Include guard for the unaligned kernels.
 * volk_32f_log2_32f_u_generic: scalar path, compiled when LV_HAVE_GENERIC is
 * defined; writes one log2f result per input for all num_points elements. */
392 #ifndef INCLUDED_volk_32f_log2_32f_u_H 393 #define INCLUDED_volk_32f_log2_32f_u_H 396 #ifdef LV_HAVE_GENERIC 401 float* bPtr = bVector;
402 const float* aPtr = aVector;
403 unsigned int number = 0;
405 for(number = 0; number < num_points; number++){
406 float const result = log2f(*aPtr++);
/* NOTE(review): every infinite result is stored as -127.0f, including the +inf
 * produced by an infinite input, whereas log2f_non_ieee keeps the sign through
 * copysignf(127.0f, result) -- confirm which behaviour is intended. */
407 *bPtr++ = isinf(result) ? -127.0f : result;
/* POLYn(x, c0, ..., cn): Horner evaluation of c0 + c1*x + ... + cn*x^n on four
 * packed floats (multiply then add).
 *
 * volk_32f_log2_32f_u_sse4_1: log2 approximation on four floats per iteration,
 * compiled when LV_HAVE_SSE4_1 is defined. It uses _mm_loadu_ps / _mm_storeu_ps,
 * the unaligned load/store intrinsics. */
414 #ifdef LV_HAVE_SSE4_1 415 #include <smmintrin.h> 417 #define POLY0(x, c0) _mm_set1_ps(c0) 418 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) 419 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) 420 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) 421 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) 422 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) 425 volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
427 float* bPtr = bVector;
428 const float* aPtr = aVector;
430 unsigned int number = 0;
/* Count of whole 4-float groups. */
431 const unsigned int quarterPoints = num_points / 4;
433 __m128 aVal, bVal, mantissa, frac, leadingOne;
436 for(;number < quarterPoints; number++){
438 aVal = _mm_loadu_ps(aPtr);
439 bias = _mm_set1_epi32(127);
440 leadingOne = _mm_set1_ps(1.0f);
/* Unbiased exponent per lane: mask 0x7f800000, shift right by 23, subtract 127.
 * The sign bit is outside the mask and is ignored. */
441 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
/* Exponent converted to float: the integer part of the logarithm. */
442 bVal = _mm_cvtepi32_ps(exp);
/* 23 mantissa bits ORed with the bit pattern of 1.0f, so frac is in [1, 2). */
445 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
/* Polynomial selected by LOG_POLY_DEGREE; the macro degree is one less because
 * (frac - 1) is multiplied in afterwards. */
447 #if LOG_POLY_DEGREE == 6 448 mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
449 #elif LOG_POLY_DEGREE == 5 450 mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
451 #elif LOG_POLY_DEGREE == 4 452 mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
453 #elif LOG_POLY_DEGREE == 3 454 mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = exponent + mantissa * (frac - 1). */
459 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
460 _mm_storeu_ps(bPtr, bVal);
/* First index the 4-wide loop did not reach.
 * NOTE(review): the remainder handling is not visible in this view. */
466 number = quarterPoints * 4;
/* POLYn_FMAAVX2(x, c0, ..., cn): Horner evaluation of c0 + c1*x + ... + cn*x^n
 * on eight packed floats, one fused multiply-add per degree.
 *
 * volk_32f_log2_32f_u_avx2_fma: AVX2 + FMA log2 approximation, eight floats per
 * iteration, using the unaligned _mm256_loadu_ps / _mm256_storeu_ps. */
472 #if LV_HAVE_AVX2 && LV_HAVE_FMA 473 #include <immintrin.h> 475 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) 476 #define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) 477 #define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) 478 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) 479 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) 480 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) 483 volk_32f_log2_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
485 float* bPtr = bVector;
486 const float* aPtr = aVector;
488 unsigned int number = 0;
/* Count of whole 8-float groups. */
489 const unsigned int eighthPoints = num_points / 8;
491 __m256 aVal, bVal, mantissa, frac, leadingOne;
494 for(;number < eighthPoints; number++){
496 aVal = _mm256_loadu_ps(aPtr);
497 bias = _mm256_set1_epi32(127);
498 leadingOne = _mm256_set1_ps(1.0f);
/* Mask the exponent field, shift it down by 23 and remove the bias of 127; the
 * sign bit is excluded by the mask. */
499 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
/* Unbiased exponent as float: the integer part of the logarithm. */
500 bVal = _mm256_cvtepi32_ps(exp);
/* Mantissa bits ORed with the bit pattern of 1.0f, so frac is in [1, 2). */
503 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Compile-time polynomial choice; degree is LOG_POLY_DEGREE - 1 since the
 * multiply by (frac - 1) below supplies the remaining degree. */
505 #if LOG_POLY_DEGREE == 6 506 mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
507 #elif LOG_POLY_DEGREE == 5 508 mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
509 #elif LOG_POLY_DEGREE == 4 510 mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
511 #elif LOG_POLY_DEGREE == 3 512 mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = mantissa * (frac - 1) + exponent in a single fused multiply-add. */
517 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
518 _mm256_storeu_ps(bPtr, bVal);
/* First index the 8-wide loop did not reach.
 * NOTE(review): the remainder handling is not visible in this view. */
524 number = eighthPoints * 8;
/* POLYn_AVX2(x, c0, ..., cn): Horner evaluation of c0 + c1*x + ... + cn*x^n on
 * eight packed floats with separate multiply and add (no FMA).
 *
 * volk_32f_log2_32f_u_avx2: AVX2 log2 approximation, eight floats per iteration,
 * using the unaligned _mm256_loadu_ps / _mm256_storeu_ps. */
531 #include <immintrin.h> 533 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) 534 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) 535 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) 536 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) 537 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) 538 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) 541 volk_32f_log2_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
543 float* bPtr = bVector;
544 const float* aPtr = aVector;
546 unsigned int number = 0;
/* Number of complete 8-float vectors in the input. */
547 const unsigned int eighthPoints = num_points / 8;
549 __m256 aVal, bVal, mantissa, frac, leadingOne;
552 for(;number < eighthPoints; number++){
554 aVal = _mm256_loadu_ps(aPtr);
555 bias = _mm256_set1_epi32(127);
556 leadingOne = _mm256_set1_ps(1.0f);
/* Biased exponent field -> unbiased integer exponent (mask, shift by 23,
 * subtract 127); the mask leaves out the sign bit. */
557 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
/* Integer part of the result. */
558 bVal = _mm256_cvtepi32_ps(exp);
/* Mantissa bits with the exponent of 1.0f ORed in, giving frac in [1, 2). */
561 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Polynomial in frac chosen by LOG_POLY_DEGREE; one degree lower than the
 * setting because of the later multiply by (frac - 1). */
563 #if LOG_POLY_DEGREE == 6 564 mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
565 #elif LOG_POLY_DEGREE == 5 566 mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
567 #elif LOG_POLY_DEGREE == 4 568 mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
569 #elif LOG_POLY_DEGREE == 3 570 mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = mantissa * (frac - 1) + exponent, as separate multiply and add. */
575 bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
576 _mm256_storeu_ps(bPtr, bVal);
/* First index not covered by the 8-wide loop.
 * NOTE(review): the remainder handling is not visible in this view. */
582 number = eighthPoints * 8;
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:111
static void volk_32f_log2_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:399
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition: volk_32f_log2_32f.h:315
static float log2f_non_ieee(float f)
Definition: volk_32f_log2_32f.h:103
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:352
#define VLOG2Q_NEON_PREAMBLE()
Definition: volk_32f_log2_32f.h:300