volk_8u_x2_encodeframepolar_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

/*
 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>

static inline unsigned int
log2_of_power_of_2(unsigned int val){
  // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
  static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
                                   0xFF00FF00, 0xFFFF0000};

  unsigned int res = (val & b[0]) != 0;
  res |= ((val & b[4]) != 0) << 4;
  res |= ((val & b[3]) != 0) << 3;
  res |= ((val & b[2]) != 0) << 2;
  res |= ((val & b[1]) != 0) << 1;
  return res;
}
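
/*
 * Each mask in b[] tests one bit of the exponent, so the ORs above assemble
 * log2(val) bit by bit; for example log2_of_power_of_2(16) == 4 and
 * log2_of_power_of_2(1024) == 10. The result is only meaningful when val is
 * a power of two.
 */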

static inline void
encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
                         const unsigned int num_branches, const unsigned int frame_half)
{
  unsigned int branch, bit;
  for(branch = 0; branch < num_branches; ++branch){
    for(bit = 0; bit < frame_half; ++bit){
      *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
      *(frame_ptr + frame_half) = *(temp_ptr + 1);
      ++frame_ptr;
      temp_ptr += 2;
    }
    frame_ptr += frame_half;
  }
}
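
/*
 * One stage of the polar encoding butterfly: within each branch, output i of
 * the lower half is temp[2*i] ^ temp[2*i + 1] and output i of the upper half
 * is temp[2*i + 1]. For example, with num_branches == 1 and frame_half == 2,
 * the input {t0, t1, t2, t3} becomes {t0 ^ t1, t2 ^ t3, t1, t3}.
 */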

static inline void
volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  unsigned int stage = log2_of_power_of_2(frame_size);
  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;

  while(stage){
    // encode stage
    encodepolar_single_stage(frame, temp, num_branches, frame_half);
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    // update all the parameters.
    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }
}
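
/*
 * Minimal usage sketch (illustrative only; see 'volk_8u_x3_encodepolar_8u_x2.h'
 * for the authoritative documentation). frame_size must be a power of two and
 * both buffers hold frame_size unpacked bits, one bit per byte:
 *
 *   unsigned char temp[8]  = {1, 0, 1, 1, 0, 0, 1, 0}; // input bits
 *   unsigned char frame[8];                            // encoded output
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 8);
 *
 * The encoded frame ends up in 'frame'; 'temp' is used as scratch space and is
 * overwritten along the way.
 */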

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with.
  __m128i r_frame0, r_temp0, shifted;

  {
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
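
    /*
     * mask_stage1 keeps only the even-indexed bytes, so the shift-by-one /
     * and / xor below turns byte 2*i into temp[2*i] ^ temp[2*i + 1] while
     * byte 2*i + 1 keeps temp[2*i + 1]. shuffle_separate then gathers the XOR
     * results into the low half of the register and the pass-through values
     * into the high half, ready for the lower and upper halves of the output.
     */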

    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 each branch has 32 elements, and higher stages are larger still.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 16){
          r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;

          shifted = _mm_srli_si128(r_temp0, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp0 = _mm_xor_si128(shifted, r_temp0);
          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm_srli_si128(r_temp1, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp1 = _mm_xor_si128(shifted, r_temp1);
          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
          _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);

          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
          _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 16;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 16 elements (one bit per byte).
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch first chunk
  __VOLK_PREFETCH(temp_ptr);

  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
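
  /*
   * Each 16-byte chunk below is one complete 16-element branch. After the
   * bit-reversal shuffle, every shift/and/xor triple performs one butterfly
   * stage entirely in-register: the byte shift (8, 4, 2, then 1) lines each
   * element up with its partner, the mask restricts the update to the
   * positions that receive the XOR, and the remaining positions pass through
   * unchanged.
   */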
  for(branch = 0; branch < num_branches; ++branch){
    r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);

    // prefetch next chunk
    temp_ptr += 16;
    __VOLK_PREFETCH(temp_ptr);

    // shuffle once for bit-reversal.
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    // store result of chunk.
    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
    frame_ptr += 16;
  }
}

#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
  // get some SIMD registers to play with.
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;
  {
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
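
    /*
     * Note: 256-bit byte shifts and 64-bit unpacks operate within each 128-bit
     * lane, so _mm256_permute4x64_epi64(..., 0xd8) below reorders the 64-bit
     * blocks (0, 2, 1, 3) to undo that lane split before storing. Branches
     * with only 16 elements per half take the 128-bit fallback path instead,
     * since a full 256-bit load would cross into the next branch.
     */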
    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 each branch has 32 elements, and higher stages are larger still.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 32){
          if((frame_half - bit) < 32) // only 16 elements left in this half, not 32; use 128-bit operations
          {
            r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr);
            temp_ptr += 16;
            r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr);
            temp_ptr += 16;

            shifted2 = _mm_srli_si128(r_temp2, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp2 = _mm_xor_si128(shifted2, r_temp2);
            r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

            shifted2 = _mm_srli_si128(r_temp3, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp3 = _mm_xor_si128(shifted2, r_temp3);
            r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

            r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
            _mm_storeu_si128((__m128i*) frame_ptr, r_frame2);

            r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
            _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
            frame_ptr += 16;
            break;
          }
          r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr);
          temp_ptr += 32;
          r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr);
          temp_ptr += 32;

          shifted = _mm256_srli_si256(r_temp0, 1); // byte-shift within each 128-bit lane
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm256_srli_si256(r_temp1, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

          _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);

          _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 32;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 32 elements (one bit per byte).
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch first chunk
  __VOLK_PREFETCH(temp_ptr);

  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
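
  /*
   * Here each 256-bit register covers two adjacent 16-element branches, one
   * per 128-bit lane, which is why the loop below runs num_branches/2 times.
   * The per-lane byte shifts then apply the same shift/and/xor stage sequence
   * as the SSSE3 tail, just on both branches at once.
   */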
  for(branch = 0; branch < num_branches/2; ++branch){
    r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);

    // prefetch next chunk
    temp_ptr += 32;
    __VOLK_PREFETCH(temp_ptr);

    // shuffle once for bit-reversal.
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm256_srli_si256(r_temp0, 8); // byte-shift within each 128-bit lane
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    // store result of chunk.
    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
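
/*
 * The aligned (_a_) kernels below mirror the unaligned variants above; the
 * only difference is that they use aligned loads and stores, so 'frame' and
 * 'temp' must be suitably aligned (e.g. buffers obtained from volk_malloc()).
 */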
static inline void
volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with.
  __m128i r_frame0, r_temp0, shifted;

  {
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 each branch has 32 elements, and higher stages are larger still.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 16){
          r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;

          shifted = _mm_srli_si128(r_temp0, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp0 = _mm_xor_si128(shifted, r_temp0);
          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm_srli_si128(r_temp1, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp1 = _mm_xor_si128(shifted, r_temp1);
          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
          _mm_store_si128((__m128i*) frame_ptr, r_frame0);

          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
          _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 16;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 16 elements (one bit per byte).
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch first chunk
  __VOLK_PREFETCH(temp_ptr);

  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

  for(branch = 0; branch < num_branches; ++branch){
    r_temp0 = _mm_load_si128((__m128i*) temp_ptr);

    // prefetch next chunk
    temp_ptr += 16;
    __VOLK_PREFETCH(temp_ptr);

    // shuffle once for bit-reversal.
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    // store result of chunk.
    _mm_store_si128((__m128i*)frame_ptr, r_frame0);
    frame_ptr += 16;
  }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
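
/*
 * As with the SSSE3 pair, this aligned AVX2 kernel matches
 * volk_8u_x2_encodeframepolar_8u_u_avx2 except for its aligned 256-bit loads
 * and stores, which require 32-byte aligned buffers.
 */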
static inline void
volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
  // get some SIMD registers to play with.
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;
  {
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 each branch has 32 elements, and higher stages are larger still.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 32){
          if((frame_half - bit) < 32) // only 16 elements left in this half, not 32; use 128-bit operations
          {
            r_temp2 = _mm_load_si128((__m128i *) temp_ptr);
            temp_ptr += 16;
            r_temp3 = _mm_load_si128((__m128i *) temp_ptr);
            temp_ptr += 16;

            shifted2 = _mm_srli_si128(r_temp2, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp2 = _mm_xor_si128(shifted2, r_temp2);
            r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

            shifted2 = _mm_srli_si128(r_temp3, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp3 = _mm_xor_si128(shifted2, r_temp3);
            r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

            r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
            _mm_store_si128((__m128i*) frame_ptr, r_frame2);

            r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
            _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
            frame_ptr += 16;
            break;
          }
          r_temp0 = _mm256_load_si256((__m256i *) temp_ptr);
          temp_ptr += 32;
          r_temp1 = _mm256_load_si256((__m256i *) temp_ptr);
          temp_ptr += 32;

          shifted = _mm256_srli_si256(r_temp0, 1); // byte-shift within each 128-bit lane
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm256_srli_si256(r_temp1, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

          _mm256_store_si256((__m256i*) frame_ptr, r_frame0);

          _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 32;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 32 elements (one bit per byte).
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch first chunk.
  __VOLK_PREFETCH(temp_ptr);

  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

  for(branch = 0; branch < num_branches/2; ++branch){
    r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);

    // prefetch next chunk
    temp_ptr += 32;
    __VOLK_PREFETCH(temp_ptr);

    // shuffle once for bit-reversal.
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm256_srli_si256(r_temp0, 8); // byte-shift within each 128-bit lane
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    // store result of chunk.
    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */