65 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
66 #define INCLUDED_volk_32f_index_max_32u_a_H
73 #include <smmintrin.h>
/*
 * Aligned SSE4.1 kernel (fragment): scans src0 for its largest value and
 * stores the index associated with it in target[0].
 *
 * target     - output; only target[0] is written by the visible code.
 * src0       - input floats; read with aligned 16-byte loads.
 * num_points - number of floats in src0.
 *
 * NOTE(review): this excerpt is incomplete. The return type, braces, the
 * declarations of max, index, number, currentValues and the two scalar
 * buffers, and the tail-loop body are on lines that are not shown here.
 */
76 volk_32f_index_max_32u_a_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
/* Number of complete 4-float groups the vector loop can consume. */
80 const uint32_t quarterPoints = num_points / 4;
/* Mutable cursor over the (const) input, used by the vector loads below. */
82 float* inputPtr = (
float*)src0;
/*
 * Element indexes are carried as floats, one per lane. _mm_set_ps lists the
 * highest lane first, so the lanes start at -4,-3,-2,-1 (lowest first) and
 * the first +4 inside the loop turns them into 0,1,2,3.
 * NOTE(review): floats represent integers exactly only up to 2^24 - confirm
 * callers never need larger indexes.
 */
84 __m128 indexIncrementValues = _mm_set1_ps(4);
85 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Every lane starts from the scalar max (its initial value is set on a line
 * not visible in this excerpt). */
89 __m128 maxValues = _mm_set1_ps(max);
90 __m128 maxValuesIndex = _mm_setzero_ps();
91 __m128 compareResults;
/* Vector loop: one iteration per group of four floats. */
97 for (; number < quarterPoints; number++) {
/* Aligned load: inputPtr must be 16-byte aligned. The pointer advance is
 * presumably on a line missing from this excerpt - TODO confirm. */
99 currentValues = _mm_load_ps(inputPtr);
101 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: all-ones where the new value is strictly greater than the
 * running per-lane maximum, so an earlier equal value keeps its index. */
103 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Per-lane select of the new index where the mask is set.
 * NOTE(review): the left-hand side of this expression is not visible here
 * (presumably "maxValuesIndex =" on the preceding original line). */
106 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
107 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the four per-lane maxima and their indexes to scalar buffers.
 * These are aligned stores; the buffer declarations are not visible here. */
111 _mm_store_ps(maxValuesBuffer, maxValues);
112 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the four lanes: take a strictly larger value,
 * and on an exact tie keep the smaller index. */
114 for (number = 0; number < 4; number++) {
115 if (maxValuesBuffer[number] > max) {
116 index = maxIndexesBuffer[number];
117 max = maxValuesBuffer[number];
118 }
else if (maxValuesBuffer[number] == max) {
119 if (index > maxIndexesBuffer[number])
120 index = maxIndexesBuffer[number];
/* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
124 number = quarterPoints * 4;
125 for (; number < num_points; number++) {
126 if (src0[number] > max) {
/* Publish the winning index as an unsigned 32-bit integer. */
131 target[0] = (uint32_t)index;
140 #include <xmmintrin.h>
/*
 * Aligned SSE kernel body (fragment): finds the largest float in src0 and
 * writes the index tracked for it to target[0].
 *
 * NOTE(review): the function header, braces, the declarations of max, index,
 * number and the scalar buffers, and the tail-loop body are not part of this
 * excerpt.
 */
/* All visible work is guarded so that an empty input is left untouched. */
145 if (num_points > 0) {
/* Count of whole 4-float groups processed with vector instructions. */
147 const uint32_t quarterPoints = num_points / 4;
/* Non-const cursor over the input for the vector loads. */
149 float* inputPtr = (
float*)src0;
/*
 * Lane indexes are kept as floats. Lowest lane first they start at
 * -4,-3,-2,-1, so the first +4 in the loop yields 0,1,2,3.
 * NOTE(review): float indexes are exact only up to 2^24 - confirm this is
 * acceptable for the expected num_points.
 */
151 __m128 indexIncrementValues = _mm_set1_ps(4);
152 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Seed every lane with the scalar max (initialised on a line not shown). */
156 __m128 maxValues = _mm_set1_ps(max);
157 __m128 maxValuesIndex = _mm_setzero_ps();
158 __m128 compareResults;
159 __m128 currentValues;
/* Vector loop over the whole 4-float groups. */
164 for (; number < quarterPoints; number++) {
/* Aligned load: inputPtr has to be 16-byte aligned. The pointer advance is
 * presumably on a line missing from this excerpt - TODO confirm. */
166 currentValues = _mm_load_ps(inputPtr);
168 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* All-ones in each lane whose new value is strictly greater than the
 * running maximum of that lane. */
170 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Bitwise select written as (mask & new) | (~mask & old): take the new
 * index / value where the mask is set, keep the old one elsewhere. */
172 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
173 _mm_andnot_ps(compareResults, maxValuesIndex));
175 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
176 _mm_andnot_ps(compareResults, maxValues));
/* Aligned stores of the per-lane results into scalar buffers (declared on
 * lines that are not visible here). */
180 _mm_store_ps(maxValuesBuffer, maxValues);
181 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the four lanes to one result; equal maxima resolve to the
 * smaller index. */
183 for (number = 0; number < 4; number++) {
184 if (maxValuesBuffer[number] > max) {
185 index = maxIndexesBuffer[number];
186 max = maxValuesBuffer[number];
187 }
else if (maxValuesBuffer[number] == max) {
188 if (index > maxIndexesBuffer[number])
189 index = maxIndexesBuffer[number];
/* Scalar pass over the trailing num_points % 4 elements. */
193 number = quarterPoints * 4;
194 for (; number < num_points; number++) {
195 if (src0[number] > max) {
/* Convert the tracked index to uint32_t and hand it back to the caller. */
200 target[0] = (uint32_t)index;
208 #include <immintrin.h>
/*
 * Aligned AVX kernel body (fragment): locates the largest float in src0,
 * eight elements per vector step, and stores its tracked index in target[0].
 *
 * NOTE(review): the function header, braces, the declarations of max, index,
 * number and the scalar buffers, and the tail-loop body are not part of this
 * excerpt.
 */
/* Nothing visible runs for an empty input. */
213 if (num_points > 0) {
/* Despite its name, this is the number of whole 8-float groups. */
215 const uint32_t quarterPoints = num_points / 8;
/* Non-const cursor over the input for the vector loads. */
217 float* inputPtr = (
float*)src0;
/*
 * Lane indexes are kept as floats. Lowest lane first they start at
 * -8..-1, so the first +8 in the loop yields 0..7.
 * NOTE(review): float indexes are exact only up to 2^24 - confirm this is
 * acceptable for the expected num_points.
 */
219 __m256 indexIncrementValues = _mm256_set1_ps(8);
220 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Seed every lane with the scalar max (initialised on a line not shown). */
224 __m256 maxValues = _mm256_set1_ps(max);
225 __m256 maxValuesIndex = _mm256_setzero_ps();
226 __m256 compareResults;
227 __m256 currentValues;
/* Vector loop over the whole 8-float groups. */
232 for (; number < quarterPoints; number++) {
/* Aligned load: inputPtr has to be 32-byte aligned. The pointer advance is
 * presumably on a line missing from this excerpt - TODO confirm. */
233 currentValues = _mm256_load_ps(inputPtr);
235 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Ordered greater-than: mask is set only where the new value strictly
 * exceeds the running per-lane maximum. */
236 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Per-lane select of the new index where the mask is set.
 * NOTE(review): the left-hand side of this expression is not visible here
 * (presumably "maxValuesIndex =" on the preceding original line). */
238 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
239 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Aligned stores of the eight per-lane results into scalar buffers
 * (declared on lines that are not visible here). */
243 _mm256_store_ps(maxValuesBuffer, maxValues);
244 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the eight lanes to one result; equal maxima resolve to the
 * smaller index. */
246 for (number = 0; number < 8; number++) {
247 if (maxValuesBuffer[number] > max) {
248 index = maxIndexesBuffer[number];
249 max = maxValuesBuffer[number];
250 }
else if (maxValuesBuffer[number] == max) {
251 if (index > maxIndexesBuffer[number])
252 index = maxIndexesBuffer[number];
/* Scalar pass over the trailing num_points % 8 elements. */
256 number = quarterPoints * 8;
257 for (; number < num_points; number++) {
258 if (src0[number] > max) {
/* Convert the tracked index to uint32_t and hand it back to the caller. */
263 target[0] = (uint32_t)index;
271 #include <arm_neon.h>
/*
 * NEON kernel body (fragment): finds the largest float in src0, four
 * elements per vector step, and stores its tracked index in target[0].
 *
 * NOTE(review): the function header, braces, the declarations of max, index,
 * number and the scalar buffers, and the tail-loop body are not part of this
 * excerpt.
 */
/* Nothing visible runs for an empty input. */
276 if (num_points > 0) {
/* Count of whole 4-float groups processed with vector instructions. */
278 const uint32_t quarterPoints = num_points / 4;
/* Non-const cursor over the input for the vector loads. */
280 float* inputPtr = (
float*)src0;
281 float32x4_t indexIncrementValues = vdupq_n_f32(4);
/* Lane indexes start at -4..-1 so the first +4 in the loop yields 0..3. */
283 float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
284 float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
/* Seed every lane with the scalar max (initialised on a line not shown). */
288 float32x4_t maxValues = vdupq_n_f32(max);
289 uint32x4_t maxValuesIndex = vmovq_n_u32(0);
290 uint32x4_t compareResults;
291 uint32x4_t currentIndexes_u;
292 float32x4_t currentValues;
/* Vector loop over the whole 4-float groups. */
297 for (; number < quarterPoints; number++) {
/* The pointer advance is presumably on a line missing from this excerpt -
 * TODO confirm. */
298 currentValues = vld1q_f32(inputPtr);
300 currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
/* Integer copy of the lane indexes so they can be mixed with the mask. */
301 currentIndexes_u = vcvtq_u32_f32(currentIndexes);
/* Mask is set where the new value is NOT a new maximum (current <= max). */
302 compareResults = vcleq_f32(currentValues, maxValues);
/* (mask & old index) | (new index & ~mask): keep the old index where the
 * mask is set, take the new index where the value strictly increased. */
303 maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
304 vbicq_u32(currentIndexes_u, compareResults));
305 maxValues = vmaxq_f32(currentValues, maxValues);
/* Spill the per-lane maxima and (converted back to float) indexes to scalar
 * buffers declared on lines that are not visible here. */
309 vst1q_f32(maxValuesBuffer, maxValues);
310 vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
/* Reduce the four lanes to one result; equal maxima resolve to the
 * smaller index. */
311 for (number = 0; number < 4; number++) {
312 if (maxValuesBuffer[number] > max) {
313 index = maxIndexesBuffer[number];
314 max = maxValuesBuffer[number];
315 }
/* Compare against the spilled scalar copy rather than subscripting the
 * float32x4_t register: vector subscripting is a compiler extension, and
 * the buffer holds the same values (stored from maxValues just above). */
else if (maxValuesBuffer[number] == max) {
316 if (index > maxIndexesBuffer[number])
317 index = maxIndexesBuffer[number];
/* Scalar pass over the trailing num_points % 4 elements. */
321 number = quarterPoints * 4;
322 for (; number < num_points; number++) {
323 if (src0[number] > max) {
/* Convert the tracked index to uint32_t and hand it back to the caller. */
328 target[0] = (uint32_t)index;
335 #ifdef LV_HAVE_GENERIC
/*
 * Portable (LV_HAVE_GENERIC) kernel body, fragment only.
 * NOTE(review): the function header, the declaration and initial value of i,
 * and the loop body are not part of this excerpt.
 */
/* Nothing visible runs for an empty input. */
340 if (num_points > 0) {
/* Scalar scan up to num_points; i is initialised on a line not shown. */
346 for (;
i < num_points; ++
i) {
362 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
363 #define INCLUDED_volk_32f_index_max_32u_u_H
365 #include <inttypes.h>
371 #include <immintrin.h>
/*
 * Unaligned AVX kernel body (fragment): locates the largest float in src0,
 * eight elements per vector step, and stores its tracked index in target[0].
 *
 * NOTE(review): the function header, braces, the declarations of max, index,
 * number and the scalar buffers, and the tail-loop body are not part of this
 * excerpt.
 */
/* Nothing visible runs for an empty input. */
376 if (num_points > 0) {
/* Despite its name, this is the number of whole 8-float groups. */
378 const uint32_t quarterPoints = num_points / 8;
/* Non-const cursor over the input for the vector loads. */
380 float* inputPtr = (
float*)src0;
/*
 * Lane indexes are kept as floats. Lowest lane first they start at
 * -8..-1, so the first +8 in the loop yields 0..7.
 * NOTE(review): float indexes are exact only up to 2^24 - confirm this is
 * acceptable for the expected num_points.
 */
382 __m256 indexIncrementValues = _mm256_set1_ps(8);
383 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Seed every lane with the scalar max (initialised on a line not shown). */
387 __m256 maxValues = _mm256_set1_ps(max);
388 __m256 maxValuesIndex = _mm256_setzero_ps();
389 __m256 compareResults;
390 __m256 currentValues;
/* Vector loop over the whole 8-float groups. */
395 for (; number < quarterPoints; number++) {
/* Unaligned load: src0 needs no particular alignment here. The pointer
 * advance is presumably on a line missing from this excerpt - TODO confirm. */
396 currentValues = _mm256_loadu_ps(inputPtr);
398 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Ordered greater-than: mask is set only where the new value strictly
 * exceeds the running per-lane maximum. */
399 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Per-lane select of the new index where the mask is set.
 * NOTE(review): the left-hand side of this expression is not visible here
 * (presumably "maxValuesIndex =" on the preceding original line). */
401 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
402 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* NOTE(review): these are ALIGNED stores even in the unaligned kernel; they
 * target local buffers whose declarations are not visible - confirm those
 * buffers are 32-byte aligned. */
406 _mm256_store_ps(maxValuesBuffer, maxValues);
407 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the eight lanes to one result; equal maxima resolve to the
 * smaller index. */
409 for (number = 0; number < 8; number++) {
410 if (maxValuesBuffer[number] > max) {
411 index = maxIndexesBuffer[number];
412 max = maxValuesBuffer[number];
413 }
else if (maxValuesBuffer[number] == max) {
414 if (index > maxIndexesBuffer[number])
415 index = maxIndexesBuffer[number];
/* Scalar pass over the trailing num_points % 8 elements. */
419 number = quarterPoints * 8;
420 for (; number < num_points; number++) {
421 if (src0[number] > max) {
/* Convert the tracked index to uint32_t and hand it back to the caller. */
426 target[0] = (uint32_t)index;
433 #ifdef LV_HAVE_SSE4_1
434 #include <smmintrin.h>
/*
 * Unaligned SSE4.1 kernel (fragment): scans src0 for its largest value and
 * stores the index associated with it in target[0].
 *
 * target     - output; only target[0] is written by the visible code.
 * src0       - input floats; read with unaligned 16-byte loads.
 * num_points - number of floats in src0; nothing visible runs when it is 0.
 *
 * NOTE(review): this excerpt is incomplete. The return type, braces, the
 * declarations of max, index, number and the two scalar buffers, and the
 * tail-loop body are on lines that are not shown here.
 */
437 volk_32f_index_max_32u_u_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
439 if (num_points > 0) {
/* Number of complete 4-float groups the vector loop can consume. */
441 const uint32_t quarterPoints = num_points / 4;
/* Mutable cursor over the (const) input, used by the vector loads below. */
443 float* inputPtr = (
float*)src0;
/*
 * Element indexes are carried as floats, one per lane. _mm_set_ps lists the
 * highest lane first, so the lanes start at -4,-3,-2,-1 (lowest first) and
 * the first +4 inside the loop turns them into 0,1,2,3.
 * NOTE(review): floats represent integers exactly only up to 2^24 - confirm
 * callers never need larger indexes.
 */
445 __m128 indexIncrementValues = _mm_set1_ps(4);
446 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Every lane starts from the scalar max (its initial value is set on a line
 * not visible in this excerpt). */
450 __m128 maxValues = _mm_set1_ps(max);
451 __m128 maxValuesIndex = _mm_setzero_ps();
452 __m128 compareResults;
453 __m128 currentValues;
/* Vector loop: one iteration per group of four floats. */
458 for (; number < quarterPoints; number++) {
/* Unaligned load: src0 needs no particular alignment here. The pointer
 * advance is presumably on a line missing from this excerpt - TODO confirm. */
459 currentValues = _mm_loadu_ps(inputPtr);
461 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: all-ones where the new value is strictly greater than the
 * running per-lane maximum, so an earlier equal value keeps its index. */
462 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Per-lane select of the new index where the mask is set.
 * NOTE(review): the left-hand side of this expression is not visible here
 * (presumably "maxValuesIndex =" on the preceding original line). */
464 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
465 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
/* NOTE(review): aligned stores into local buffers whose declarations are
 * not visible - confirm those buffers are 16-byte aligned. */
469 _mm_store_ps(maxValuesBuffer, maxValues);
470 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the four lanes: take a strictly larger value,
 * and on an exact tie keep the smaller index. */
472 for (number = 0; number < 4; number++) {
473 if (maxValuesBuffer[number] > max) {
474 index = maxIndexesBuffer[number];
475 max = maxValuesBuffer[number];
476 }
else if (maxValuesBuffer[number] == max) {
477 if (index > maxIndexesBuffer[number])
478 index = maxIndexesBuffer[number];
/* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
482 number = quarterPoints * 4;
483 for (; number < num_points; number++) {
484 if (src0[number] > max) {
/* Publish the winning index as an unsigned 32-bit integer. */
489 target[0] = (uint32_t)index;
496 #include <xmmintrin.h>
/*
 * Unaligned SSE kernel body (fragment): finds the largest float in src0 and
 * writes the index tracked for it to target[0].
 *
 * NOTE(review): the function header, braces, the declarations of max, index,
 * number and the scalar buffers, and the tail-loop body are not part of this
 * excerpt.
 */
/* All visible work is guarded so that an empty input is left untouched. */
501 if (num_points > 0) {
/* Count of whole 4-float groups processed with vector instructions. */
503 const uint32_t quarterPoints = num_points / 4;
/* Non-const cursor over the input for the vector loads. */
505 float* inputPtr = (
float*)src0;
/*
 * Lane indexes are kept as floats. Lowest lane first they start at
 * -4,-3,-2,-1, so the first +4 in the loop yields 0,1,2,3.
 * NOTE(review): float indexes are exact only up to 2^24 - confirm this is
 * acceptable for the expected num_points.
 */
507 __m128 indexIncrementValues = _mm_set1_ps(4);
508 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Seed every lane with the scalar max (initialised on a line not shown). */
512 __m128 maxValues = _mm_set1_ps(max);
513 __m128 maxValuesIndex = _mm_setzero_ps();
514 __m128 compareResults;
515 __m128 currentValues;
/* Vector loop over the whole 4-float groups. */
520 for (; number < quarterPoints; number++) {
/* Unaligned load: src0 needs no particular alignment here. The pointer
 * advance is presumably on a line missing from this excerpt - TODO confirm. */
521 currentValues = _mm_loadu_ps(inputPtr);
523 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* All-ones in each lane whose new value is strictly greater than the
 * running maximum of that lane. */
524 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Bitwise select written as (mask & new) | (~mask & old): take the new
 * index / value where the mask is set, keep the old one elsewhere. */
525 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
526 _mm_andnot_ps(compareResults, maxValuesIndex));
527 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
528 _mm_andnot_ps(compareResults, maxValues));
/* NOTE(review): aligned stores into local buffers whose declarations are
 * not visible - confirm those buffers are 16-byte aligned. */
532 _mm_store_ps(maxValuesBuffer, maxValues);
533 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the four lanes to one result; equal maxima resolve to the
 * smaller index. */
535 for (number = 0; number < 4; number++) {
536 if (maxValuesBuffer[number] > max) {
537 index = maxIndexesBuffer[number];
538 max = maxValuesBuffer[number];
539 }
else if (maxValuesBuffer[number] == max) {
540 if (index > maxIndexesBuffer[number])
541 index = maxIndexesBuffer[number];
/* Scalar pass over the trailing num_points % 4 elements. */
545 number = quarterPoints * 4;
546 for (; number < num_points; number++) {
547 if (src0[number] > max) {
/* Convert the tracked index to uint32_t and hand it back to the caller. */
552 target[0] = (uint32_t)index;