54 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
55 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
61 #include <immintrin.h>
/**
 * Convert 16-bit signed integers to floats and scale each one by 1/scalar
 * (AVX2 kernel using unaligned loads and stores).
 *
 * Full groups of eight samples are sign-extended to 32 bits, converted to
 * float and multiplied by a precomputed reciprocal of `scalar`.  The
 * remaining `num_points % 8` samples are converted one at a time and divided
 * by `scalar` directly.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16 samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    /* Broadcast the reciprocal once so the vector loop multiplies, not divides. */
    const __m256 invScalar = _mm256_set1_ps((float)(1.0 / scalar));
    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        /* Load eight int16 samples (128 bits), no alignment requirement. */
        const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Sign-extend to eight int32 lanes, then convert to float and scale. */
        const __m256i inputVal2 = _mm256_cvtepi16_epi32(inputVal);
        __m256 ret = _mm256_cvtepi32_ps(inputVal2);
        ret = _mm256_mul_ps(ret, invScalar);

        _mm256_storeu_ps(outputVectorPtr, ret);

        /* Advance both cursors to the next group of eight samples. */
        outputVectorPtr += 8;
        inputPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
104 #include <immintrin.h>
// Remainder of a parameter list and the body of an int16 -> float conversion
// kernel built from 128-bit integer operations plus one 256-bit store.
// NOTE(review): the function name and the `outputVector` / `scalar`
// parameter declarations are not visible at this point -- presumably this is
// the unaligned AVX variant; confirm against the full header.
107 const int16_t* inputVector,
109 unsigned int num_points)
111 unsigned int number = 0;
// Number of complete groups of eight samples handled by the vector loop.
112 const unsigned int eighthPoints = num_points / 8;
114 float* outputVectorPtr = outputVector;
// Reciprocal broadcast to all four lanes so the loop multiplies instead of dividing.
115 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
116 int16_t* inputPtr = (int16_t*)inputVector;
117 __m128i inputVal, inputVal2;
// Zero vector used only as the initial destination for the lane inserts below.
120 __m256 dummy = _mm256_setzero_ps();
122 for (; number < eighthPoints; number++) {
// Unaligned load of eight int16 samples.
126 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
// Shift right by 8 bytes so the upper four samples sit in the low half.
129 inputVal2 = _mm_srli_si128(inputVal, 8);
// Sign-extend the low four int16 values of each register to int32.
132 inputVal = _mm_cvtepi16_epi32(inputVal);
133 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
// Samples 0..3: convert to float, scale, place in the low 128-bit lane.
135 ret = _mm_cvtepi32_ps(inputVal);
136 ret = _mm_mul_ps(ret, invScalar);
137 output = _mm256_insertf128_ps(dummy, ret, 0);
// Samples 4..7: convert to float, scale, place in the high 128-bit lane.
139 ret = _mm_cvtepi32_ps(inputVal2);
140 ret = _mm_mul_ps(ret, invScalar);
141 output = _mm256_insertf128_ps(output, ret, 1);
// Unaligned store of the eight scaled floats.
143 _mm256_storeu_ps(outputVectorPtr, output);
145 outputVectorPtr += 8;
// NOTE(review): no advance of `inputPtr` is visible in this loop -- confirm
// it is present in the full source.
// Scalar tail: the remaining num_points % 8 samples are divided by `scalar` directly.
150 number = eighthPoints * 8;
151 for (; number < num_points; number++) {
152 outputVector[number] = ((float)(inputVector[number])) / scalar;
157 #ifdef LV_HAVE_SSE4_1
158 #include <smmintrin.h>
/**
 * Convert 16-bit signed integers to floats and scale each one by 1/scalar
 * (SSE4.1 kernel using unaligned loads and stores).
 *
 * Full groups of eight samples are split into two halves of four; each half
 * is sign-extended to 32 bits, converted to float and multiplied by a
 * precomputed reciprocal of `scalar`.  The remaining `num_points % 8`
 * samples are converted one at a time and divided by `scalar` directly.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16 samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    /* Broadcast the reciprocal once so the vector loop multiplies, not divides. */
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        /* Load eight int16 samples, no alignment requirement. */
        const __m128i packed = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Sign-extend samples 0..3 directly; shift by 8 bytes to reach 4..7. */
        const __m128i lowInts = _mm_cvtepi16_epi32(packed);
        const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));

        __m128 ret = _mm_cvtepi32_ps(lowInts);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        ret = _mm_cvtepi32_ps(highInts);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Move on to the next group of eight input samples. */
        inputPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole group. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
209 #include <xmmintrin.h>
// Remainder of a parameter list and the body of an int16 -> float conversion
// kernel that processes four samples per iteration with SSE float operations.
// NOTE(review): the function name and the `outputVector` / `scalar`
// parameter declarations are not visible at this point -- presumably this is
// the unaligned SSE variant; confirm against the full header.
212 const int16_t* inputVector,
214 unsigned int num_points)
216 unsigned int number = 0;
// Number of complete groups of four samples handled by the vector loop.
217 const unsigned int quarterPoints = num_points / 4;
219 float* outputVectorPtr = outputVector;
// Reciprocal broadcast to all four lanes so the loop multiplies instead of dividing.
220 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
221 int16_t* inputPtr = (int16_t*)inputVector;
224 for (; number < quarterPoints; number++) {
// The int -> float conversion is done by scalar casts; _mm_set_ps takes its
// arguments highest lane first, so inputPtr[0] ends up in lane 0.
225 ret = _mm_set_ps((
float)(inputPtr[3]),
226 (
float)(inputPtr[2]),
227 (
float)(inputPtr[1]),
228 (
float)(inputPtr[0]));
// Scale the four floats and write them out with an unaligned store.
230 ret = _mm_mul_ps(ret, invScalar);
231 _mm_storeu_ps(outputVectorPtr, ret);
234 outputVectorPtr += 4;
// NOTE(review): no advance of `inputPtr` is visible in this loop -- confirm
// it is present in the full source.
// Scalar tail: the remaining num_points % 4 samples are divided by `scalar` directly.
237 number = quarterPoints * 4;
238 for (; number < num_points; number++) {
239 outputVector[number] = (float)(inputVector[number]) / scalar;
244 #ifdef LV_HAVE_GENERIC
// Remainder of a parameter list and the body of the portable (no SIMD)
// int16 -> float conversion: one cast and one division per sample.
// NOTE(review): the function name and the `outputVector` / `scalar`
// parameter declarations are not visible at this point -- confirm against
// the full header.
247 const int16_t* inputVector,
249 unsigned int num_points)
251 float* outputVectorPtr = outputVector;
252 const int16_t* inputVectorPtr = inputVector;
253 unsigned int number = 0;
// Walk both buffers in lock-step, writing input / scalar for each of the
// num_points samples.
255 for (number = 0; number < num_points; number++) {
256 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
262 #include <arm_neon.h>
// Remainder of a parameter list and the body of an int16 -> float conversion
// kernel written with ARM NEON intrinsics, eight samples per iteration.
// NOTE(review): the function name, the `outputVector` / `scalar` parameter
// declarations and the declaration of `input16` are not visible at this
// point -- confirm against the full header.
265 const int16_t* inputVector,
267 unsigned int num_points)
269 float* outputPtr = outputVector;
270 const int16_t* inputPtr = inputVector;
271 unsigned int number = 0;
// Number of complete groups of eight samples handled by the vector loop.
272 unsigned int eighth_points = num_points / 8;
275 int32x4_t input32_0, input32_1;
276 float32x4_t input_float_0, input_float_1;
277 float32x4x2_t output_float;
278 float32x4_t inv_scale;
// Reciprocal broadcast to all four lanes so the loop multiplies instead of dividing.
280 inv_scale = vdupq_n_f32(1.0 / scalar);
286 for (number = 0; number < eighth_points; number++) {
// De-interleaving load: val[0] receives the even-indexed samples and val[1]
// the odd-indexed ones.
287 input16 = vld2_s16(inputPtr);
// Widen each group of four int16 values to int32.
289 input32_0 = vmovl_s16(input16.val[0]);
290 input32_1 = vmovl_s16(input16.val[1]);
// Convert to float and scale.
292 input_float_0 = vcvtq_f32_s32(input32_0);
293 input_float_1 = vcvtq_f32_s32(input32_1);
294 output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
295 output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
// Interleaving store: undoes the de-interleave of the load, so the eight
// results are written in their original sample order.
296 vst2q_f32(outputPtr, output_float);
// NOTE(review): no advance of `inputPtr` / `outputPtr` is visible in the
// loop above; the tail below relies on both pointing just past the last
// vectorised sample -- confirm in the full source.
301 for (number = eighth_points * 8; number < num_points; number++) {
302 *outputPtr++ = ((float)(*inputPtr++)) / scalar;
309 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
310 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
312 #include <inttypes.h>
316 #include <immintrin.h>
/**
 * Convert 16-bit signed integers to floats and scale each one by 1/scalar
 * (AVX2 kernel using aligned loads and stores).
 *
 * The vector loop reads with `_mm_load_si128` and writes with
 * `_mm256_store_ps`, so `inputVector` must be 16-byte aligned and
 * `outputVector` 32-byte aligned.  Full groups of eight samples are
 * sign-extended to 32 bits, converted to float and multiplied by a
 * precomputed reciprocal of `scalar`.  The remaining `num_points % 8`
 * samples are converted one at a time and divided by `scalar` directly.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16 samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    /* Broadcast the reciprocal once so the vector loop multiplies, not divides. */
    const __m256 invScalar = _mm256_set1_ps((float)(1.0 / scalar));
    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        /* Aligned load of eight int16 samples (128 bits). */
        const __m128i inputVal = _mm_load_si128((const __m128i*)inputPtr);

        /* Sign-extend to eight int32 lanes, then convert to float and scale. */
        const __m256i inputVal2 = _mm256_cvtepi16_epi32(inputVal);
        __m256 ret = _mm256_cvtepi32_ps(inputVal2);
        ret = _mm256_mul_ps(ret, invScalar);

        _mm256_store_ps(outputVectorPtr, ret);

        /* Advance both cursors to the next group of eight samples. */
        outputVectorPtr += 8;
        inputPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
359 #include <immintrin.h>
// Remainder of a parameter list and the body of an int16 -> float conversion
// kernel built from 128-bit integer operations plus one aligned 256-bit store.
// NOTE(review): the function name and the `outputVector` / `scalar`
// parameter declarations are not visible at this point -- presumably this is
// the aligned AVX variant; confirm against the full header.
362 const int16_t* inputVector,
364 unsigned int num_points)
366 unsigned int number = 0;
// Number of complete groups of eight samples handled by the vector loop.
367 const unsigned int eighthPoints = num_points / 8;
369 float* outputVectorPtr = outputVector;
// Reciprocal broadcast to all four lanes so the loop multiplies instead of dividing.
370 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
371 int16_t* inputPtr = (int16_t*)inputVector;
372 __m128i inputVal, inputVal2;
// Zero vector used only as the initial destination for the lane inserts below.
375 __m256 dummy = _mm256_setzero_ps();
377 for (; number < eighthPoints; number++) {
// Aligned load of eight int16 samples (requires a 16-byte aligned address).
381 inputVal = _mm_load_si128((__m128i*)inputPtr);
// Shift right by 8 bytes so the upper four samples sit in the low half.
384 inputVal2 = _mm_srli_si128(inputVal, 8);
// Sign-extend the low four int16 values of each register to int32.
387 inputVal = _mm_cvtepi16_epi32(inputVal);
388 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
// Samples 0..3: convert to float, scale, place in the low 128-bit lane.
390 ret = _mm_cvtepi32_ps(inputVal);
391 ret = _mm_mul_ps(ret, invScalar);
392 output = _mm256_insertf128_ps(dummy, ret, 0);
// Samples 4..7: convert to float, scale, place in the high 128-bit lane.
394 ret = _mm_cvtepi32_ps(inputVal2);
395 ret = _mm_mul_ps(ret, invScalar);
396 output = _mm256_insertf128_ps(output, ret, 1);
// Aligned store of the eight scaled floats (requires a 32-byte aligned address).
398 _mm256_store_ps(outputVectorPtr, output);
400 outputVectorPtr += 8;
// NOTE(review): no advance of `inputPtr` is visible in this loop -- confirm
// it is present in the full source.
// Scalar tail: the remaining num_points % 8 samples are divided by `scalar` directly.
405 number = eighthPoints * 8;
406 for (; number < num_points; number++) {
407 outputVector[number] = ((float)(inputVector[number])) / scalar;
412 #ifdef LV_HAVE_SSE4_1
413 #include <smmintrin.h>
/**
 * Convert 16-bit signed integers to floats and scale each one by 1/scalar
 * (SSE4.1 kernel, "a" variant).
 *
 * The vector loop uses the unaligned load/store intrinsics, so this function
 * does not itself impose an alignment requirement on either buffer.  Full
 * groups of eight samples are split into two halves of four; each half is
 * sign-extended to 32 bits, converted to float and multiplied by a
 * precomputed reciprocal of `scalar`.  The remaining `num_points % 8`
 * samples are converted one at a time and divided by `scalar` directly.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16 samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    /* Broadcast the reciprocal once so the vector loop multiplies, not divides. */
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        /* Load eight int16 samples. */
        const __m128i packed = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Sign-extend samples 0..3 directly; shift by 8 bytes to reach 4..7. */
        const __m128i lowInts = _mm_cvtepi16_epi32(packed);
        const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));

        __m128 ret = _mm_cvtepi32_ps(lowInts);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        ret = _mm_cvtepi32_ps(highInts);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Move on to the next group of eight input samples. */
        inputPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole group. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
464 #include <xmmintrin.h>
// Remainder of a parameter list and the body of an int16 -> float conversion
// kernel that processes four samples per iteration with SSE float operations.
// NOTE(review): the function name and the `outputVector` / `scalar`
// parameter declarations are not visible at this point -- presumably this is
// the aligned SSE variant; confirm against the full header.
467 const int16_t* inputVector,
469 unsigned int num_points)
471 unsigned int number = 0;
// Number of complete groups of four samples handled by the vector loop.
472 const unsigned int quarterPoints = num_points / 4;
474 float* outputVectorPtr = outputVector;
// Reciprocal broadcast to all four lanes so the loop multiplies instead of dividing.
475 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
476 int16_t* inputPtr = (int16_t*)inputVector;
479 for (; number < quarterPoints; number++) {
// The int -> float conversion is done by scalar casts; _mm_set_ps takes its
// arguments highest lane first, so inputPtr[0] ends up in lane 0.
480 ret = _mm_set_ps((
float)(inputPtr[3]),
481 (
float)(inputPtr[2]),
482 (
float)(inputPtr[1]),
483 (
float)(inputPtr[0]));
// Scale the four floats; note the store used here is the unaligned form.
485 ret = _mm_mul_ps(ret, invScalar);
486 _mm_storeu_ps(outputVectorPtr, ret);
489 outputVectorPtr += 4;
// NOTE(review): no advance of `inputPtr` is visible in this loop -- confirm
// it is present in the full source.
// Scalar tail: the remaining num_points % 4 samples are divided by `scalar` directly.
492 number = quarterPoints * 4;
493 for (; number < num_points; number++) {
494 outputVector[number] = (float)(inputVector[number]) / scalar;
499 #ifdef LV_HAVE_GENERIC
// Remainder of a parameter list and the body of the portable (no SIMD)
// int16 -> float conversion: one cast and one division per sample.
// NOTE(review): the function name and the `outputVector` / `scalar`
// parameter declarations are not visible at this point -- confirm against
// the full header.
502 const int16_t* inputVector,
504 unsigned int num_points)
506 float* outputVectorPtr = outputVector;
507 const int16_t* inputVectorPtr = inputVector;
508 unsigned int number = 0;
// Walk both buffers in lock-step, writing input / scalar for each of the
// num_points samples.
510 for (number = 0; number < num_points; number++) {
511 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;