84 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H 85 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H 93 #ifdef LV_HAVE_GENERIC 99 unsigned int number = num_points;
102 while (number >= 8) {
103 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
104 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
105 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
106 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
107 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
108 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
109 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
110 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
115 while (number-- > 0) {
116 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
123 #include <immintrin.h> 127 unsigned int number = 0;
129 const unsigned int quarterPoints = num_points / 4;
130 unsigned int isodd = num_points & 3;
133 lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
140 s = _mm256_loadu_ps((
float*)v_scalar);
142 for(;number < quarterPoints; number++) {
143 x = _mm256_loadu_ps((
float*)b);
144 y = _mm256_loadu_ps((
float*)a);
146 z = _mm256_add_ps(y, z);
147 _mm256_storeu_ps((
float*)c,z);
154 for(i = num_points-isodd; i < num_points; i++) {
155 *c++ = (*a++) +
lv_conj(*b++) * scalar;
162 #include <pmmintrin.h> 166 unsigned int number = 0;
167 const unsigned int halfPoints = num_points / 2;
170 lv_32fc_t v_scalar[2] = {scalar, scalar};
177 s = _mm_loadu_ps((
float*)v_scalar);
179 for(;number < halfPoints; number++){
180 x = _mm_loadu_ps((
float*)b);
181 y = _mm_loadu_ps((
float*)a);
183 z = _mm_add_ps(y, z);
184 _mm_storeu_ps((
float*)c,z);
191 if((num_points % 2) != 0) {
192 *c = *a +
lv_conj(*b) * scalar;
199 #include <immintrin.h> 203 unsigned int number = 0;
205 const unsigned int quarterPoints = num_points / 4;
206 unsigned int isodd = num_points & 3;
209 lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
216 s = _mm256_load_ps((
float*)v_scalar);
218 for(;number < quarterPoints; number++) {
219 x = _mm256_load_ps((
float*)b);
220 y = _mm256_load_ps((
float*)a);
222 z = _mm256_add_ps(y, z);
223 _mm256_store_ps((
float*)c,z);
230 for(i = num_points-isodd; i < num_points; i++) {
231 *c++ = (*a++) +
lv_conj(*b++) * scalar;
238 #include <pmmintrin.h> 242 unsigned int number = 0;
243 const unsigned int halfPoints = num_points / 2;
246 lv_32fc_t v_scalar[2] = {scalar, scalar};
253 s = _mm_load_ps((
float*)v_scalar);
255 for(;number < halfPoints; number++){
256 x = _mm_load_ps((
float*)b);
257 y = _mm_load_ps((
float*)a);
259 z = _mm_add_ps(y, z);
260 _mm_store_ps((
float*)c,z);
267 if((num_points % 2) != 0) {
268 *c = *a +
lv_conj(*b) * scalar;
275 #include <arm_neon.h> 281 unsigned int number = num_points;
282 unsigned int quarter_points = num_points / 4;
284 float32x4x2_t a_val, b_val, c_val, scalar_val;
285 float32x4x2_t tmp_val;
287 scalar_val.val[0] = vld1q_dup_f32((
const float*)&scalar);
288 scalar_val.val[1] = vld1q_dup_f32(((
const float*)&scalar) + 1);
290 for(number = 0; number < quarter_points; ++number) {
291 a_val = vld2q_f32((
float*)aPtr);
292 b_val = vld2q_f32((
float*)bPtr);
293 b_val.val[1] = vnegq_f32(b_val.val[1]);
297 tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
298 tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
300 tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
301 tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
303 c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
304 c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
306 vst2q_f32((
float*)cPtr, c_val);
313 for(number = quarter_points*4; number < num_points; number++){
314 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:202
static __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:51
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:241
#define lv_conj(x)
Definition: volk_complex.h:87
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:126
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:165
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:45
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:277
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:95
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:53
for i
Definition: volk_config_fixed.tmpl.h:25
float complex lv_32fc_t
Definition: volk_complex.h:61