Vector Optimized Library of Kernels  2.3
Architecture-tuned implementations of math kernels
volk_32f_index_max_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
65 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
66 #define INCLUDED_volk_32f_index_max_32u_a_H
67 
68 #include <inttypes.h>
69 #include <stdio.h>
70 #include <volk/volk_common.h>
71 
72 #ifdef LV_HAVE_SSE4_1
73 #include <smmintrin.h>
74 
75 static inline void
76 volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
77 {
78  if (num_points > 0) {
79  uint32_t number = 0;
80  const uint32_t quarterPoints = num_points / 4;
81 
82  float* inputPtr = (float*)src0;
83 
84  __m128 indexIncrementValues = _mm_set1_ps(4);
85  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
86 
87  float max = src0[0];
88  float index = 0;
89  __m128 maxValues = _mm_set1_ps(max);
90  __m128 maxValuesIndex = _mm_setzero_ps();
91  __m128 compareResults;
92  __m128 currentValues;
93 
94  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
95  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
96 
97  for (; number < quarterPoints; number++) {
98 
99  currentValues = _mm_load_ps(inputPtr);
100  inputPtr += 4;
101  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
102 
103  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
104 
105  maxValuesIndex =
106  _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
107  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
108  }
109 
110  // Calculate the largest value from the remaining 4 points
111  _mm_store_ps(maxValuesBuffer, maxValues);
112  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
113 
114  for (number = 0; number < 4; number++) {
115  if (maxValuesBuffer[number] > max) {
116  index = maxIndexesBuffer[number];
117  max = maxValuesBuffer[number];
118  } else if (maxValuesBuffer[number] == max) {
119  if (index > maxIndexesBuffer[number])
120  index = maxIndexesBuffer[number];
121  }
122  }
123 
124  number = quarterPoints * 4;
125  for (; number < num_points; number++) {
126  if (src0[number] > max) {
127  index = number;
128  max = src0[number];
129  }
130  }
131  target[0] = (uint32_t)index;
132  }
133 }
134 
135 #endif /*LV_HAVE_SSE4_1*/
136 
137 
138 #ifdef LV_HAVE_SSE
139 
140 #include <xmmintrin.h>
141 
142 static inline void
143 volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
144 {
145  if (num_points > 0) {
146  uint32_t number = 0;
147  const uint32_t quarterPoints = num_points / 4;
148 
149  float* inputPtr = (float*)src0;
150 
151  __m128 indexIncrementValues = _mm_set1_ps(4);
152  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
153 
154  float max = src0[0];
155  float index = 0;
156  __m128 maxValues = _mm_set1_ps(max);
157  __m128 maxValuesIndex = _mm_setzero_ps();
158  __m128 compareResults;
159  __m128 currentValues;
160 
161  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
162  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
163 
164  for (; number < quarterPoints; number++) {
165 
166  currentValues = _mm_load_ps(inputPtr);
167  inputPtr += 4;
168  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
169 
170  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
171 
172  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
173  _mm_andnot_ps(compareResults, maxValuesIndex));
174 
175  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
176  _mm_andnot_ps(compareResults, maxValues));
177  }
178 
179  // Calculate the largest value from the remaining 4 points
180  _mm_store_ps(maxValuesBuffer, maxValues);
181  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
182 
183  for (number = 0; number < 4; number++) {
184  if (maxValuesBuffer[number] > max) {
185  index = maxIndexesBuffer[number];
186  max = maxValuesBuffer[number];
187  } else if (maxValuesBuffer[number] == max) {
188  if (index > maxIndexesBuffer[number])
189  index = maxIndexesBuffer[number];
190  }
191  }
192 
193  number = quarterPoints * 4;
194  for (; number < num_points; number++) {
195  if (src0[number] > max) {
196  index = number;
197  max = src0[number];
198  }
199  }
200  target[0] = (uint32_t)index;
201  }
202 }
203 
204 #endif /*LV_HAVE_SSE*/
205 
206 
207 #ifdef LV_HAVE_AVX
208 #include <immintrin.h>
209 
210 static inline void
211 volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
212 {
213  if (num_points > 0) {
214  uint32_t number = 0;
215  const uint32_t quarterPoints = num_points / 8;
216 
217  float* inputPtr = (float*)src0;
218 
219  __m256 indexIncrementValues = _mm256_set1_ps(8);
220  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
221 
222  float max = src0[0];
223  float index = 0;
224  __m256 maxValues = _mm256_set1_ps(max);
225  __m256 maxValuesIndex = _mm256_setzero_ps();
226  __m256 compareResults;
227  __m256 currentValues;
228 
229  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
230  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
231 
232  for (; number < quarterPoints; number++) {
233  currentValues = _mm256_load_ps(inputPtr);
234  inputPtr += 8;
235  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
236  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
237  maxValuesIndex =
238  _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
239  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
240  }
241 
242  // Calculate the largest value from the remaining 8 points
243  _mm256_store_ps(maxValuesBuffer, maxValues);
244  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
245 
246  for (number = 0; number < 8; number++) {
247  if (maxValuesBuffer[number] > max) {
248  index = maxIndexesBuffer[number];
249  max = maxValuesBuffer[number];
250  } else if (maxValuesBuffer[number] == max) {
251  if (index > maxIndexesBuffer[number])
252  index = maxIndexesBuffer[number];
253  }
254  }
255 
256  number = quarterPoints * 8;
257  for (; number < num_points; number++) {
258  if (src0[number] > max) {
259  index = number;
260  max = src0[number];
261  }
262  }
263  target[0] = (uint32_t)index;
264  }
265 }
266 
267 #endif /*LV_HAVE_AVX*/
268 
269 
270 #ifdef LV_HAVE_NEON
271 #include <arm_neon.h>
272 
/*
 * Finds the index of the largest value in src0 (NEON).
 *
 * target     - receives the index of the maximum in target[0]; left untouched
 *              when num_points is 0.
 * src0       - input samples.
 * num_points - number of floats in src0.
 *
 * When the maximum occurs more than once, the lowest index is reported.
 */
static inline void
volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points > 0) {
        uint32_t number = 0;
        const uint32_t quarterPoints = num_points / 4;

        const float* inputPtr = src0;

        /* Lane indexes are kept as 32-bit integers. Counting in float and
         * converting is exact only below 2^24 and would report wrong
         * positions for longer inputs. Lane k of the first vector is src0[k]. */
        const uint32_t initialIndexes[4] = { 0, 1, 2, 3 };
        const uint32x4_t indexIncrementValues = vdupq_n_u32(4);
        uint32x4_t currentIndexes = vld1q_u32(initialIndexes);

        float max = src0[0];
        uint32_t index = 0;
        float32x4_t maxValues = vdupq_n_f32(max);
        uint32x4_t maxValuesIndex = vmovq_n_u32(0);
        uint32x4_t compareResults;
        float32x4_t currentValues;

        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
        __VOLK_ATTR_ALIGNED(16) uint32_t maxIndexesBuffer[4];

        for (; number < quarterPoints; number++) {
            currentValues = vld1q_f32(inputPtr);
            inputPtr += 4;
            /* Mask is set where the running maximum is retained (current <= max). */
            compareResults = vcleq_f32(currentValues, maxValues);
            /* Keep the old index under the mask, take the current one elsewhere. */
            maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
                                       vbicq_u32(currentIndexes, compareResults));
            maxValues = vmaxq_f32(currentValues, maxValues);
            currentIndexes = vaddq_u32(currentIndexes, indexIncrementValues);
        }

        /* Reduce the four per-lane maxima to a single (max, index) pair. */
        vst1q_f32(maxValuesBuffer, maxValues);
        vst1q_u32(maxIndexesBuffer, maxValuesIndex);
        for (number = 0; number < 4; number++) {
            if (maxValuesBuffer[number] > max) {
                index = maxIndexesBuffer[number];
                max = maxValuesBuffer[number];
            } else if (maxValuesBuffer[number] == max) {
                /* Equal maxima in different lanes: keep the lowest index. */
                if (index > maxIndexesBuffer[number])
                    index = maxIndexesBuffer[number];
            }
        }

        /* Scalar pass over the points that did not fill a whole vector. */
        for (number = quarterPoints * 4; number < num_points; number++) {
            if (src0[number] > max) {
                index = number;
                max = src0[number];
            }
        }
        target[0] = index;
    }
}
331 
332 #endif /*LV_HAVE_NEON*/
333 
334 
335 #ifdef LV_HAVE_GENERIC
336 
/*
 * Portable reference kernel: writes the position of the largest element of
 * src0 into target[0].
 *
 * target     - output; target[0] is written only when num_points is non-zero.
 * src0       - input samples.
 * num_points - number of floats in src0.
 *
 * The comparison is strict, so the first occurrence of the maximum wins.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
    uint32_t bestPos;
    uint32_t pos;
    float bestVal;

    /* Nothing to scan: leave the output as the caller provided it. */
    if (num_points == 0) {
        return;
    }

    bestPos = 0;
    bestVal = src0[0];

    pos = 1;
    while (pos < num_points) {
        const float candidate = src0[pos];
        if (candidate > bestVal) {
            bestVal = candidate;
            bestPos = pos;
        }
        ++pos;
    }

    *target = bestPos;
}
355 
356 #endif /*LV_HAVE_GENERIC*/
357 
358 
359 #endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
360 
361 
362 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
363 #define INCLUDED_volk_32f_index_max_32u_u_H
364 
365 #include <inttypes.h>
366 #include <stdio.h>
367 #include <volk/volk_common.h>
368 
369 
370 #ifdef LV_HAVE_AVX
371 #include <immintrin.h>
372 
373 static inline void
374 volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
375 {
376  if (num_points > 0) {
377  uint32_t number = 0;
378  const uint32_t quarterPoints = num_points / 8;
379 
380  float* inputPtr = (float*)src0;
381 
382  __m256 indexIncrementValues = _mm256_set1_ps(8);
383  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
384 
385  float max = src0[0];
386  float index = 0;
387  __m256 maxValues = _mm256_set1_ps(max);
388  __m256 maxValuesIndex = _mm256_setzero_ps();
389  __m256 compareResults;
390  __m256 currentValues;
391 
392  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
393  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
394 
395  for (; number < quarterPoints; number++) {
396  currentValues = _mm256_loadu_ps(inputPtr);
397  inputPtr += 8;
398  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
399  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
400  maxValuesIndex =
401  _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
402  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
403  }
404 
405  // Calculate the largest value from the remaining 8 points
406  _mm256_store_ps(maxValuesBuffer, maxValues);
407  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
408 
409  for (number = 0; number < 8; number++) {
410  if (maxValuesBuffer[number] > max) {
411  index = maxIndexesBuffer[number];
412  max = maxValuesBuffer[number];
413  } else if (maxValuesBuffer[number] == max) {
414  if (index > maxIndexesBuffer[number])
415  index = maxIndexesBuffer[number];
416  }
417  }
418 
419  number = quarterPoints * 8;
420  for (; number < num_points; number++) {
421  if (src0[number] > max) {
422  index = number;
423  max = src0[number];
424  }
425  }
426  target[0] = (uint32_t)index;
427  }
428 }
429 
430 #endif /*LV_HAVE_AVX*/
431 
432 
433 #ifdef LV_HAVE_SSE4_1
434 #include <smmintrin.h>
435 
436 static inline void
437 volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
438 {
439  if (num_points > 0) {
440  uint32_t number = 0;
441  const uint32_t quarterPoints = num_points / 4;
442 
443  float* inputPtr = (float*)src0;
444 
445  __m128 indexIncrementValues = _mm_set1_ps(4);
446  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
447 
448  float max = src0[0];
449  float index = 0;
450  __m128 maxValues = _mm_set1_ps(max);
451  __m128 maxValuesIndex = _mm_setzero_ps();
452  __m128 compareResults;
453  __m128 currentValues;
454 
455  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
456  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
457 
458  for (; number < quarterPoints; number++) {
459  currentValues = _mm_loadu_ps(inputPtr);
460  inputPtr += 4;
461  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
462  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
463  maxValuesIndex =
464  _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
465  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
466  }
467 
468  // Calculate the largest value from the remaining 4 points
469  _mm_store_ps(maxValuesBuffer, maxValues);
470  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
471 
472  for (number = 0; number < 4; number++) {
473  if (maxValuesBuffer[number] > max) {
474  index = maxIndexesBuffer[number];
475  max = maxValuesBuffer[number];
476  } else if (maxValuesBuffer[number] == max) {
477  if (index > maxIndexesBuffer[number])
478  index = maxIndexesBuffer[number];
479  }
480  }
481 
482  number = quarterPoints * 4;
483  for (; number < num_points; number++) {
484  if (src0[number] > max) {
485  index = number;
486  max = src0[number];
487  }
488  }
489  target[0] = (uint32_t)index;
490  }
491 }
492 
493 #endif /*LV_HAVE_SSE4_1*/
494 
495 #ifdef LV_HAVE_SSE
496 #include <xmmintrin.h>
497 
498 static inline void
499 volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
500 {
501  if (num_points > 0) {
502  uint32_t number = 0;
503  const uint32_t quarterPoints = num_points / 4;
504 
505  float* inputPtr = (float*)src0;
506 
507  __m128 indexIncrementValues = _mm_set1_ps(4);
508  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
509 
510  float max = src0[0];
511  float index = 0;
512  __m128 maxValues = _mm_set1_ps(max);
513  __m128 maxValuesIndex = _mm_setzero_ps();
514  __m128 compareResults;
515  __m128 currentValues;
516 
517  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
518  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
519 
520  for (; number < quarterPoints; number++) {
521  currentValues = _mm_loadu_ps(inputPtr);
522  inputPtr += 4;
523  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
524  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
525  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
526  _mm_andnot_ps(compareResults, maxValuesIndex));
527  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
528  _mm_andnot_ps(compareResults, maxValues));
529  }
530 
531  // Calculate the largest value from the remaining 4 points
532  _mm_store_ps(maxValuesBuffer, maxValues);
533  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
534 
535  for (number = 0; number < 4; number++) {
536  if (maxValuesBuffer[number] > max) {
537  index = maxIndexesBuffer[number];
538  max = maxValuesBuffer[number];
539  } else if (maxValuesBuffer[number] == max) {
540  if (index > maxIndexesBuffer[number])
541  index = maxIndexesBuffer[number];
542  }
543  }
544 
545  number = quarterPoints * 4;
546  for (; number < num_points; number++) {
547  if (src0[number] > max) {
548  index = number;
549  max = src0[number];
550  }
551  }
552  target[0] = (uint32_t)index;
553  }
554 }
555 
556 #endif /*LV_HAVE_SSE*/
557 
558 #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
volk_32f_index_max_32u_u_avx
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:374
volk_32f_index_max_32u_u_sse
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:499
volk_32f_index_max_32u_a_sse
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:143
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
i
for i
Definition: volk_config_fixed.tmpl.h:25
volk_common.h
volk_32f_index_max_32u_a_avx
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:211
volk_32f_index_max_32u_generic
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:338
volk_32f_index_max_32u_neon
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:274