#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
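
/*!
 * Adds src0 element-wise to each of src1..src4, producing four outputs:
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * All buffers hold num_points 16-bit samples; the aligned ("_a") SSE2
 * variant expects 16-byte-aligned pointers.
 *
 * Illustrative usage sketch (a hypothetical caller, not part of this
 * header; it invokes the generic implementation below directly, whereas
 * a real application would normally go through the dispatcher generated
 * in volk.h):
 *
 *   short s0[64], s1[64], s2[64], s3[64], s4[64];
 *   short t0[64], t1[64], t2[64], t3[64];
 *   // ... fill s0..s4 with samples ...
 *   volk_16i_x5_add_quad_16i_x4_generic(t0, t1, t2, t3,
 *                                       s0, s1, s2, s3, s4, 64);
 */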
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
    __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
        *p_src3, *p_src4;
    p_target0 = (__m128i*)target0;
    p_target1 = (__m128i*)target1;
    p_target2 = (__m128i*)target2;
    p_target3 = (__m128i*)target3;

    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;
    p_src4 = (__m128i*)src4;
    int i = 0;

    int bound = (num_bytes >> 4);         /* full 8-sample (16-byte) chunks */
    int leftovers = (num_bytes >> 1) & 7; /* samples after the last full chunk */

    for (; i < bound; ++i) {
        xmm0 = _mm_load_si128(p_src0);
        xmm1 = _mm_load_si128(p_src1);
        xmm2 = _mm_load_si128(p_src2);
        xmm3 = _mm_load_si128(p_src3);
        xmm4 = _mm_load_si128(p_src4);
        /* xmm0 (src0) is the operand shared by all four sums */
        xmm1 = _mm_add_epi16(xmm0, xmm1);
        xmm2 = _mm_add_epi16(xmm0, xmm2);
        xmm3 = _mm_add_epi16(xmm0, xmm3);
        xmm4 = _mm_add_epi16(xmm0, xmm4);
        _mm_store_si128(p_target0, xmm1);
        _mm_store_si128(p_target1, xmm2);
        _mm_store_si128(p_target2, xmm3);
        _mm_store_si128(p_target3, xmm4);

        /* advance every pointer by one 128-bit chunk (8 samples) */
        p_src0 += 1;
        p_src1 += 1;
        p_src2 += 1;
        p_src3 += 1;
        p_src4 += 1;

        p_target0 += 1;
        p_target1 += 1;
        p_target2 += 1;
        p_target3 += 1;
    }
    /* finish the remaining samples with scalar code */
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /*LV_HAVE_SSE2*/
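
/* NEON version: processes eight 16-bit samples per iteration using
 * 128-bit vector adds, then finishes any tail with scalar code. */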
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned int number = 0;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
    int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
    for (number = 0; number < eighth_points; ++number) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        src4_vec = vld1q_s16(src4);
        target0_vec = vaddq_s16(src0_vec, src1_vec);
        target1_vec = vaddq_s16(src0_vec, src2_vec);
        target2_vec = vaddq_s16(src0_vec, src3_vec);
        target3_vec = vaddq_s16(src0_vec, src4_vec);
        vst1q_s16(target0, target0_vec);
        vst1q_s16(target1, target1_vec);
        vst1q_s16(target2, target2_vec);
        vst1q_s16(target3, target3_vec);

        /* advance every pointer past the eight samples just processed */
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        src4 += 8;

        target0 += 8;
        target1 += 8;
        target2 += 8;
        target3 += 8;
    }
    /* finish remaining samples; src0 is advanced only on the last line
     * because it feeds all four sums */
    for (number = eighth_points * 8; number < num_points; ++number) {
        *target0++ = *src0 + *src1++;
        *target1++ = *src0 + *src2++;
        *target2++ = *src0 + *src3++;
        *target3++ = *src0++ + *src4++;
    }
}
#endif /*LV_HAVE_NEON*/
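
/* Plain C fallback, used when no SIMD implementation is available. */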
#ifdef LV_HAVE_GENERIC

static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1; /* num_bytes >> 1 == num_points */

    for (i = 0; i < bound; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/