Vector Optimized Library of Kernels  2.3
Architecture-tuned implementations of math kernels
volk_32u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
66 #ifndef INCLUDED_volk_32u_byteswap_u_H
67 #define INCLUDED_volk_32u_byteswap_u_H
68 
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #if LV_HAVE_AVX2
73 #include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words; accessed with unaligned AVX2
 *              loads and stores, so no particular alignment is needed.
 * num_points - number of 32-bit words to process.
 *
 * Full groups of eight words are handled with one byte shuffle each; the
 * remaining (num_points % 8) words are swapped with shifts and masks.
 */
static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
{
    enum { WORDS_PER_VECTOR = 8 };

    /* Byte indices that reverse each group of four consecutive bytes. */
    const uint8_t reverseIdx[32] = { 3,  2,  1,  0,  7,  6,  5,  4,
                                     11, 10, 9,  8,  15, 14, 13, 12,
                                     19, 18, 17, 16, 23, 22, 21, 20,
                                     27, 26, 25, 24, 31, 30, 29, 28 };
    const __m256i shuffleCtl = _mm256_loadu_si256((const __m256i*)reverseIdx);

    uint32_t* cursor = intsToSwap;
    unsigned int vectorsLeft = num_points / WORDS_PER_VECTOR;
    unsigned int tailLeft = num_points % WORDS_PER_VECTOR;

    /* The data is swapped in place, so the same address is loaded and stored. */
    for (; vectorsLeft != 0; --vectorsLeft) {
        const __m256i words = _mm256_loadu_si256((const __m256i*)cursor);
        _mm256_storeu_si256((__m256i*)cursor, _mm256_shuffle_epi8(words, shuffleCtl));
        cursor += WORDS_PER_VECTOR;
    }
    _mm256_zeroupper();

    /* Scalar clean-up for the words that did not fill a whole vector. */
    for (; tailLeft != 0; --tailLeft) {
        const uint32_t w = *cursor;
        *cursor++ = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                    ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
111 #endif /* LV_HAVE_AVX2 */
112 
113 
114 #ifdef LV_HAVE_SSE2
115 #include <emmintrin.h>
116 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words; accessed with unaligned SSE2
 *              loads and stores, so no particular alignment is needed.
 * num_points - number of 32-bit words to process.
 *
 * SSE2 has no byte shuffle, so each group of four words is rebuilt from
 * four shifted copies that are masked and OR-ed together. The remaining
 * (num_points % 4) words are swapped one at a time.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    const __m128i keepUpperMiddle = _mm_set1_epi32(0x00FF0000);
    const __m128i keepLowerMiddle = _mm_set1_epi32(0x0000FF00);

    uint32_t* cursor = intsToSwap;
    unsigned int quadsLeft = num_points / 4;
    unsigned int tailLeft = num_points % 4;

    /* The data is swapped in place, so the same address is loaded and stored. */
    for (; quadsLeft != 0; --quadsLeft) {
        const __m128i words = _mm_loadu_si128((const __m128i*)cursor);

        /* Outer bytes: shifting by 24 leaves exactly one byte, no mask needed. */
        const __m128i outer =
            _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
        /* Inner bytes: shifting by 8 drags a neighbour along, so mask it off. */
        const __m128i movedUp = _mm_and_si128(_mm_slli_epi32(words, 8), keepUpperMiddle);
        const __m128i movedDown =
            _mm_and_si128(_mm_srli_epi32(words, 8), keepLowerMiddle);

        _mm_storeu_si128((__m128i*)cursor,
                         _mm_or_si128(_mm_or_si128(outer, movedUp), movedDown));
        cursor += 4;
    }

    /* Scalar clean-up for the words that did not fill a whole vector. */
    for (; tailLeft != 0; --tailLeft) {
        const uint32_t w = *cursor;
        *cursor++ = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                    ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
156 #endif /* LV_HAVE_SSE2 */
157 
158 
159 #ifdef LV_HAVE_NEON
160 #include <arm_neon.h>
161 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words, overwritten with the result.
 * num_points - number of 32-bit words to process.
 *
 * Eight words (32 bytes) are handled per iteration with a 4-register
 * table lookup; the remaining (num_points % 8) words are swapped one at
 * a time with shifts and masks.
 */
static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
{
    /*
     * Table indices for vtbl4_u8, packed least-significant byte first.
     * vld4_u8 de-interleaves the 32 input bytes, so table entry 8*k + j is
     * byte k of word j. Picking 24+j, 16+j, 8+j, j therefore emits word j
     * with its bytes reversed. Each constant covers two words:
     *   words 0,1: {24, 16, 8, 0, 25, 17, 9, 1}
     *   words 2,3: {26, 18, 10, 2, 27, 19, 11, 3}
     *   words 4,5: {28, 20, 12, 4, 29, 21, 13, 5}
     *   words 6,7: {30, 22, 14, 6, 31, 23, 15, 7}
     */
    const uint8x8_t pick01 = vcreate_u8(0x0109111900081018ULL);
    const uint8x8_t pick23 = vcreate_u8(0x030B131B020A121AULL);
    const uint8x8_t pick45 = vcreate_u8(0x050D151D040C141CULL);
    const uint8x8_t pick67 = vcreate_u8(0x070F171F060E161EULL);

    uint32_t* cursor = intsToSwap;
    unsigned int blocksLeft = num_points / 8;
    unsigned int tailLeft = num_points % 8;

    for (; blocksLeft != 0; --blocksLeft) {
        uint8_t* bytes = (uint8_t*)cursor;
        const uint8x8x4_t planes = vld4_u8(bytes);

        /* All four lookups read the loaded table before anything is stored. */
        const uint8x8_t out01 = vtbl4_u8(planes, pick01);
        const uint8x8_t out23 = vtbl4_u8(planes, pick23);
        const uint8x8_t out45 = vtbl4_u8(planes, pick45);
        const uint8x8_t out67 = vtbl4_u8(planes, pick67);

        vst1_u8(bytes, out01);
        vst1_u8(bytes + 8, out23);
        vst1_u8(bytes + 16, out45);
        vst1_u8(bytes + 24, out67);

        cursor += 8;
    }

    /* Scalar clean-up for the words that did not fill a whole block. */
    for (; tailLeft != 0; --tailLeft) {
        const uint32_t w = *cursor;
        *cursor++ = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                    ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
208 #endif /* LV_HAVE_NEON */
209 
210 #ifdef LV_HAVE_NEONV8
211 #include <arm_neon.h>
212 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words, overwritten with the result.
 * num_points - number of 32-bit words to process.
 *
 * Each iteration handles eight words as two 16-byte halves, each reversed
 * with a single-register table lookup. The remaining (num_points % 8)
 * words are swapped one at a time with shifts and masks.
 */
static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
{
    /* Byte indices that reverse each group of four consecutive bytes. */
    const uint8x16_t reverseIdx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

    uint32_t* cursor = intsToSwap;
    unsigned int blocksLeft = num_points / 8;
    unsigned int tailLeft = num_points % 8;

    for (; blocksLeft != 0; --blocksLeft) {
        uint8_t* firstHalf = (uint8_t*)cursor;
        uint8_t* secondHalf = (uint8_t*)(cursor + 4);

        /* Hint at the block that the next iteration will touch. */
        __VOLK_PREFETCH(cursor + 8);

        vst1q_u8(firstHalf, vqtbl1q_u8(vld1q_u8(firstHalf), reverseIdx));
        vst1q_u8(secondHalf, vqtbl1q_u8(vld1q_u8(secondHalf), reverseIdx));

        cursor += 8;
    }

    /* Scalar clean-up for the words that did not fill a whole block. */
    for (; tailLeft != 0; --tailLeft) {
        const uint32_t w = *cursor;
        *cursor++ = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                    ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
243 #endif /* LV_HAVE_NEONV8 */
244 
245 
246 #ifdef LV_HAVE_GENERIC
247 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words, overwritten with the result.
 * num_points - number of 32-bit words to process; 0 leaves the buffer
 *              untouched.
 *
 * Portable implementation that uses only shifts and masks.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const uint32_t w = intsToSwap[i];

        /* Move each byte to its mirrored position and merge the four parts. */
        intsToSwap[i] = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                        ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
263 #endif /* LV_HAVE_GENERIC */
264 
265 
266 #endif /* INCLUDED_volk_32u_byteswap_u_H */
267 #ifndef INCLUDED_volk_32u_byteswap_a_H
268 #define INCLUDED_volk_32u_byteswap_a_H
269 
270 #include <inttypes.h>
271 #include <stdio.h>
272 
273 
274 #if LV_HAVE_AVX2
275 #include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words; accessed with the aligned AVX2
 *              load/store intrinsics, so it must be 32-byte aligned.
 * num_points - number of 32-bit words to process.
 *
 * Full groups of eight words are handled with one byte shuffle each; the
 * remaining (num_points % 8) words are swapped with shifts and masks.
 */
static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
{
    enum { WORDS_PER_VECTOR = 8 };

    /* Byte indices that reverse each group of four consecutive bytes. */
    const uint8_t reverseIdx[32] = { 3,  2,  1,  0,  7,  6,  5,  4,
                                     11, 10, 9,  8,  15, 14, 13, 12,
                                     19, 18, 17, 16, 23, 22, 21, 20,
                                     27, 26, 25, 24, 31, 30, 29, 28 };
    /* The local table has no alignment guarantee, hence the unaligned load. */
    const __m256i shuffleCtl = _mm256_loadu_si256((const __m256i*)reverseIdx);

    uint32_t* cursor = intsToSwap;
    unsigned int vectorsLeft = num_points / WORDS_PER_VECTOR;
    unsigned int tailLeft = num_points % WORDS_PER_VECTOR;

    /* The data is swapped in place, so the same address is loaded and stored. */
    for (; vectorsLeft != 0; --vectorsLeft) {
        const __m256i words = _mm256_load_si256((const __m256i*)cursor);
        _mm256_store_si256((__m256i*)cursor, _mm256_shuffle_epi8(words, shuffleCtl));
        cursor += WORDS_PER_VECTOR;
    }
    _mm256_zeroupper();

    /* Scalar clean-up for the words that did not fill a whole vector. */
    for (; tailLeft != 0; --tailLeft) {
        const uint32_t w = *cursor;
        *cursor++ = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                    ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
313 #endif /* LV_HAVE_AVX2 */
314 
315 
316 #ifdef LV_HAVE_SSE2
317 #include <emmintrin.h>
318 
319 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words; accessed with the aligned SSE2
 *              load/store intrinsics, so it must be 16-byte aligned.
 * num_points - number of 32-bit words to process.
 *
 * SSE2 has no byte shuffle, so each group of four words is rebuilt from
 * four shifted copies that are masked and OR-ed together. The remaining
 * (num_points % 4) words are swapped one at a time.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    const __m128i keepUpperMiddle = _mm_set1_epi32(0x00FF0000);
    const __m128i keepLowerMiddle = _mm_set1_epi32(0x0000FF00);

    uint32_t* cursor = intsToSwap;
    unsigned int quadsLeft = num_points / 4;
    unsigned int tailLeft = num_points % 4;

    /* The data is swapped in place, so the same address is loaded and stored. */
    for (; quadsLeft != 0; --quadsLeft) {
        const __m128i words = _mm_load_si128((const __m128i*)cursor);

        /* Outer bytes: shifting by 24 leaves exactly one byte, no mask needed. */
        const __m128i outer =
            _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
        /* Inner bytes: shifting by 8 drags a neighbour along, so mask it off. */
        const __m128i movedUp = _mm_and_si128(_mm_slli_epi32(words, 8), keepUpperMiddle);
        const __m128i movedDown =
            _mm_and_si128(_mm_srli_epi32(words, 8), keepLowerMiddle);

        _mm_store_si128((__m128i*)cursor,
                        _mm_or_si128(_mm_or_si128(outer, movedUp), movedDown));
        cursor += 4;
    }

    /* Scalar clean-up for the words that did not fill a whole vector. */
    for (; tailLeft != 0; --tailLeft) {
        const uint32_t w = *cursor;
        *cursor++ = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                    ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
359 #endif /* LV_HAVE_SSE2 */
360 
361 
362 #ifdef LV_HAVE_GENERIC
363 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap - buffer of num_points words, overwritten with the result.
 * num_points - number of 32-bit words to process; 0 leaves the buffer
 *              untouched.
 *
 * Portable implementation under the aligned-kernel name. It performs only
 * ordinary 32-bit loads and stores, using shifts and masks for the swap.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const uint32_t w = intsToSwap[i];

        /* Move each byte to its mirrored position and merge the four parts. */
        intsToSwap[i] = ((w << 24) & 0xff000000) | ((w << 8) & 0x00ff0000) |
                        ((w >> 8) & 0x0000ff00) | ((w >> 24) & 0x000000ff);
    }
}
379 #endif /* LV_HAVE_GENERIC */
380 
381 
382 #endif /* INCLUDED_volk_32u_byteswap_a_H */
volk_32u_byteswap_u_sse2
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:117
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
volk_32u_byteswap_a_sse2
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:320
volk_32u_byteswap_neon
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:162
volk_32u_byteswap_a_generic
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:364
volk_32u_byteswap_generic
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:248