#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#include <volk/volk_complex.h>
#include <math.h>
#define ROTATOR_RELOAD 512
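/* ROTATOR_RELOAD sets how many samples are processed between renormalizations
   of the running phase.  Each sample multiplies the phase by phase_inc, so
   rounding error slowly pulls its magnitude away from 1; dividing by the
   magnitude every 512 samples keeps the rotator numerically stable. */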
#ifdef LV_HAVE_GENERIC
/* Rotate each input sample by a running phase that advances by phase_inc per
   sample; *phase is updated so consecutive calls stay phase-continuous. */
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points){
    unsigned int i, j;
    for(i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        /* renormalize with the complex magnitude, not the integer abs() */
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for(i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
}
#endif /* LV_HAVE_GENERIC */
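/* Illustrative use (names and values are hypothetical, not part of this header):
   to mix a buffer down by f_shift Hz at sample rate fs, a caller would do
   roughly

       lv_32fc_t phase     = lv_cmake(1.0f, 0.0f);
       lv_32fc_t phase_inc = lv_cmake(cosf(-2.0f * M_PI * f_shift / fs),
                                      sinf(-2.0f * M_PI * f_shift / fs));
       volk_32fc_s32fc_x2_rotator_32fc_a_generic(out, in, phase_inc, &phase, num_points);

   phase carries over between calls, so successive buffers remain phase-continuous. */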
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    /* stagger the two lanes: lane k starts at (*phase) * phase_inc^k, and
       incr ends up as phase_inc^2, the per-iteration vector increment */
    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;
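    /* Each pass of the loops below multiplies two complex samples by the two
       current phases, and the two phases by inc_Val, using the SSE3
       interleaved-complex trick: moveldup/movehdup broadcast the real and
       imaginary parts, shuffle 0xB1 swaps re/im within each lane, and
       addsub combines the partial products into (ac - bd, ad + bc). */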
    for(i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }

        /* renormalize phase_Val: build |phase|^2 duplicated across each complex
           lane, take the square root, and divide so the phase is scaled by
           1/|phase| rather than 1/|phase|^2 */
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp1 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp1);
    }
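    /* Process the sample pairs left over after whole ROTATOR_RELOAD blocks;
       this stretch is short enough that no further renormalization is needed. */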
    for(i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);

    /* handle an odd final sample in scalar code, then return the phase */
    for(i = 0; i < num_points % 2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */
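/* Note: the _a kernels use aligned loads and stores (_mm_load_ps/_mm_store_ps
   and the _mm256 equivalents below), so inVector and outVector must be 16-byte
   aligned for the SSE4.1 version and 32-byte aligned for the AVX version. */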
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    /* stagger the four lanes; incr ends up as phase_inc^4 */
    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
    zeros = _mm256_set1_ps(0.0);
    negated = _mm256_set1_ps(-1.0);

    const unsigned int fourthPoints = num_points / 4;
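    /* inc_Val holds phase_inc^4 replicated across all four complex lanes in
       (re, im) interleaved order; multiplying phase_Val by it advances every
       lane by four samples per iteration, mirroring the SSE4.1 kernel above. */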
    for(i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }

        /* renormalize phase_Val: |phase|^2 per lane, square root, then divide
           so each lane is scaled by 1/|phase| */
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp1 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp1);
    }
    /* remaining four-sample groups after the whole ROTATOR_RELOAD blocks */
    for(i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);

    /* handle the last num_points % 4 samples in scalar code, then return the phase */
    for(i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */