Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: GPL-3.0-or-later
8 */
9
59#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
60#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
61
62#include <float.h>
63#include <inttypes.h>
64#include <volk/volk_complex.h>
65
66
67#ifdef LV_HAVE_GENERIC
68
69static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
70 const lv_32fc_t* aVector,
71 const lv_32fc_t* bVector,
72 unsigned int num_points)
73{
74 lv_32fc_t* cPtr = cVector;
75 const lv_32fc_t* aPtr = aVector;
76 const lv_32fc_t* bPtr = bVector;
77
78 for (unsigned int number = 0; number < num_points; number++) {
79 *cPtr++ = (*aPtr++) / (*bPtr++);
80 }
81}
82#endif /* LV_HAVE_GENERIC */
83
84
85#ifdef LV_HAVE_SSE3
86#include <pmmintrin.h>
88
89static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
90 const lv_32fc_t* numeratorVector,
91 const lv_32fc_t* denumeratorVector,
92 unsigned int num_points)
93{
94 /*
95 * we'll do the "classical"
96 * a a b*
97 * --- = -------
98 * b |b|^2
99 * */
100 unsigned int number = 0;
101 const unsigned int quarterPoints = num_points / 4;
102
103 __m128 num01, num23, den01, den23, norm, result;
104 lv_32fc_t* c = cVector;
105 const lv_32fc_t* a = numeratorVector;
106 const lv_32fc_t* b = denumeratorVector;
107
108 for (; number < quarterPoints; number++) {
109 num01 = _mm_loadu_ps((float*)a); // first pair
110 den01 = _mm_loadu_ps((float*)b); // first pair
111 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
112 a += 2;
113 b += 2;
114
115 num23 = _mm_loadu_ps((float*)a); // second pair
116 den23 = _mm_loadu_ps((float*)b); // second pair
117 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
118 a += 2;
119 b += 2;
120
121 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
122 den01 = _mm_unpacklo_ps(norm, norm);
123 den23 = _mm_unpackhi_ps(norm, norm);
124
125 result = _mm_div_ps(num01, den01);
126 _mm_storeu_ps((float*)c, result); // Store the results back into the C container
127 c += 2;
128 result = _mm_div_ps(num23, den23);
129 _mm_storeu_ps((float*)c, result); // Store the results back into the C container
130 c += 2;
131 }
132
133 number *= 4;
134 for (; number < num_points; number++) {
135 *c = (*a) / (*b);
136 a++;
137 b++;
138 c++;
139 }
140}
141#endif /* LV_HAVE_SSE3 */
142
143
144#ifdef LV_HAVE_AVX
145#include <immintrin.h>
147
148static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
149 const lv_32fc_t* numeratorVector,
150 const lv_32fc_t* denumeratorVector,
151 unsigned int num_points)
152{
153 /*
154 * we'll do the "classical"
155 * a a b*
156 * --- = -------
157 * b |b|^2
158 * */
159 unsigned int number = 0;
160 const unsigned int quarterPoints = num_points / 4;
161
162 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
163 lv_32fc_t* c = cVector;
164 const lv_32fc_t* a = numeratorVector;
165 const lv_32fc_t* b = denumeratorVector;
166
167 for (; number < quarterPoints; number++) {
168 num = _mm256_loadu_ps(
169 (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
170 denum = _mm256_loadu_ps(
171 (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
172 mul_conj = _mm256_complexconjugatemul_ps(num, denum);
173 sq = _mm256_mul_ps(denum, denum); // Square the values
174 mag_sq_un = _mm256_hadd_ps(
175 sq, sq); // obtain the actual squared magnitude, although out of order
176 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
177 // best guide I found on using these functions:
178 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
179 div = _mm256_div_ps(mul_conj, mag_sq);
180
181 _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
182
183 a += 4;
184 b += 4;
185 c += 4;
186 }
187
188 number = quarterPoints * 4;
189
190 for (; number < num_points; number++) {
191 *c++ = (*a++) / (*b++);
192 }
193}
194#endif /* LV_HAVE_AVX */
195
196
197#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
198
199
200#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
201#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
202
203#include <float.h>
204#include <inttypes.h>
205#include <stdio.h>
206#include <volk/volk_complex.h>
207
208#ifdef LV_HAVE_SSE3
209#include <pmmintrin.h>
211
212static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
213 const lv_32fc_t* numeratorVector,
214 const lv_32fc_t* denumeratorVector,
215 unsigned int num_points)
216{
217 /*
218 * we'll do the "classical"
219 * a a b*
220 * --- = -------
221 * b |b|^2
222 * */
223 unsigned int number = 0;
224 const unsigned int quarterPoints = num_points / 4;
225
226 __m128 num01, num23, den01, den23, norm, result;
227 lv_32fc_t* c = cVector;
228 const lv_32fc_t* a = numeratorVector;
229 const lv_32fc_t* b = denumeratorVector;
230
231 for (; number < quarterPoints; number++) {
232 num01 = _mm_load_ps((float*)a); // first pair
233 den01 = _mm_load_ps((float*)b); // first pair
234 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
235 a += 2;
236 b += 2;
237
238 num23 = _mm_load_ps((float*)a); // second pair
239 den23 = _mm_load_ps((float*)b); // second pair
240 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
241 a += 2;
242 b += 2;
243
244 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
245
246 den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
247 den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
248
249 result = _mm_div_ps(num01, den01);
250 _mm_store_ps((float*)c, result); // Store the results back into the C container
251 c += 2;
252 result = _mm_div_ps(num23, den23);
253 _mm_store_ps((float*)c, result); // Store the results back into the C container
254 c += 2;
255 }
256
257 number *= 4;
258 for (; number < num_points; number++) {
259 *c = (*a) / (*b);
260 a++;
261 b++;
262 c++;
263 }
264}
265#endif /* LV_HAVE_SSE */
266
267#ifdef LV_HAVE_AVX
268#include <immintrin.h>
270
271static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
272 const lv_32fc_t* numeratorVector,
273 const lv_32fc_t* denumeratorVector,
274 unsigned int num_points)
275{
276 /*
277 * Guide to AVX intrisics:
278 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
279 *
280 * we'll do the "classical"
281 * a a b*
282 * --- = -------
283 * b |b|^2
284 *
285 */
286 lv_32fc_t* c = cVector;
287 const lv_32fc_t* a = numeratorVector;
288 const lv_32fc_t* b = denumeratorVector;
289
290 const unsigned int eigthPoints = num_points / 8;
291
292 __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
293
294 for (unsigned int number = 0; number < eigthPoints; number++) {
295 // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
296 num01 = _mm256_load_ps((float*)a);
297 denum01 = _mm256_load_ps((float*)b);
298
299 num01 = _mm256_complexconjugatemul_ps(num01, denum01);
300 a += 4;
301 b += 4;
302
303 num23 = _mm256_load_ps((float*)a);
304 denum23 = _mm256_load_ps((float*)b);
305 num23 = _mm256_complexconjugatemul_ps(num23, denum23);
306 a += 4;
307 b += 4;
308
309 complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
310 _mm256_mul_ps(denum23, denum23));
311
312 denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
313 denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
314
315 result0 = _mm256_div_ps(num01, denum01);
316 result1 = _mm256_div_ps(num23, denum23);
317
318 _mm256_store_ps((float*)c, result0);
319 c += 4;
320 _mm256_store_ps((float*)c, result1);
321 c += 4;
322 }
323
324 volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
325}
326#endif /* LV_HAVE_AVX */
327
328
329#ifdef LV_HAVE_NEON
330#include <arm_neon.h>
331
332static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
333 const lv_32fc_t* aVector,
334 const lv_32fc_t* bVector,
335 unsigned int num_points)
336{
337 lv_32fc_t* cPtr = cVector;
338 const lv_32fc_t* aPtr = aVector;
339 const lv_32fc_t* bPtr = bVector;
340
341 float32x4x2_t aVal, bVal, cVal;
342 float32x4_t bAbs, bAbsInv;
343
344 const unsigned int quarterPoints = num_points / 4;
345 unsigned int number = 0;
346 for (; number < quarterPoints; number++) {
347 aVal = vld2q_f32((const float*)(aPtr));
348 bVal = vld2q_f32((const float*)(bPtr));
349 aPtr += 4;
350 bPtr += 4;
351 __VOLK_PREFETCH(aPtr + 4);
352 __VOLK_PREFETCH(bPtr + 4);
353
354 bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
355 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
356
357 bAbsInv = vrecpeq_f32(bAbs);
358 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
359 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
360
361 cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
362 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
363 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
364
365 cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
366 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
367 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
368
369 vst2q_f32((float*)(cPtr), cVal);
370 cPtr += 4;
371 }
372
373 for (number = quarterPoints * 4; number < num_points; number++) {
374 *cPtr++ = (*aPtr++) / (*bPtr++);
375 }
376}
377#endif /* LV_HAVE_NEON */
378
379
380#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */