Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: GPL-3.0-or-later
8 */
9
60#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61#define INCLUDED_volk_32f_s32f_convert_8i_u_H
62
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
65
66static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
67{
68 float min_val = INT8_MIN;
69 float max_val = INT8_MAX;
70 if (in > max_val) {
71 *out = (int8_t)(max_val);
72 } else if (in < min_val) {
73 *out = (int8_t)(min_val);
74 } else {
75 *out = (int8_t)(rintf(in));
76 }
77}
78
79#ifdef LV_HAVE_AVX2
80#include <immintrin.h>
81
/*
 * AVX2 kernel, unaligned loads/stores.
 *
 * Scales each of the num_points floats in inputVector by scalar, clamps the
 * product to [INT8_MIN, INT8_MAX], converts it to an integer and writes it to
 * outputVector as int8_t. 32 points are handled per vector iteration; the
 * remaining num_points % 32 points go through the scalar helper.
 */
static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 32;

    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m256 scale = _mm256_set1_ps(scalar);
    const __m256 lower = _mm256_set1_ps((float)INT8_MIN);
    const __m256 upper = _mm256_set1_ps((float)INT8_MAX);

    for (unsigned int block = 0; block < num_blocks; block++) {
        /* Load four groups of 8 floats and apply the scale factor. */
        __m256 f0 = _mm256_mul_ps(_mm256_loadu_ps(src), scale);
        __m256 f1 = _mm256_mul_ps(_mm256_loadu_ps(src + 8), scale);
        __m256 f2 = _mm256_mul_ps(_mm256_loadu_ps(src + 16), scale);
        __m256 f3 = _mm256_mul_ps(_mm256_loadu_ps(src + 24), scale);

        /* Clamp to the int8_t range before converting. */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upper), lower);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upper), lower);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upper), lower);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upper), lower);

        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /*
         * 32-bit -> 16-bit. 0xD8 (binary 11 01 10 00) reorders the 64-bit
         * quadwords as 0, 2, 1, 3 after each pack; the 256-bit pack works on
         * each 128-bit half separately, so this restores sample order.
         */
        __m256i words_lo = _mm256_packs_epi32(i0, i1);
        words_lo = _mm256_permute4x64_epi64(words_lo, 0xD8);
        __m256i words_hi = _mm256_packs_epi32(i2, i3);
        words_hi = _mm256_permute4x64_epi64(words_hi, 0xD8);

        /* 16-bit -> 8-bit, followed by the same quadword reorder. */
        __m256i bytes = _mm256_packs_epi16(words_lo, words_hi);
        bytes = _mm256_permute4x64_epi64(bytes, 0xD8);

        _mm256_storeu_si256((__m256i*)dst, bytes);

        src += 32;
        dst += 32;
    }

    /* Scalar tail for the points that do not fill a whole 32-point block. */
    for (unsigned int i = num_blocks * 32; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
147
148#endif /* LV_HAVE_AVX2 */
149
150
151#ifdef LV_HAVE_SSE2
152#include <emmintrin.h>
153
/*
 * SSE2 kernel, unaligned loads/stores.
 *
 * Scales each of the num_points floats in inputVector by scalar, clamps the
 * product to [INT8_MIN, INT8_MAX], converts it to an integer and writes it to
 * outputVector as int8_t. 16 points are handled per vector iteration; the
 * remaining num_points % 16 points go through the scalar helper.
 */
static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;

    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m128 scale = _mm_set_ps1(scalar);
    const __m128 lower = _mm_set_ps1((float)INT8_MIN);
    const __m128 upper = _mm_set_ps1((float)INT8_MAX);

    for (unsigned int block = 0; block < num_blocks; block++) {
        /* Load four groups of 4 floats and apply the scale factor. */
        __m128 f0 = _mm_mul_ps(_mm_loadu_ps(src), scale);
        __m128 f1 = _mm_mul_ps(_mm_loadu_ps(src + 4), scale);
        __m128 f2 = _mm_mul_ps(_mm_loadu_ps(src + 8), scale);
        __m128 f3 = _mm_mul_ps(_mm_loadu_ps(src + 12), scale);

        /* Clamp to the int8_t range before converting. */
        f0 = _mm_max_ps(_mm_min_ps(f0, upper), lower);
        f1 = _mm_max_ps(_mm_min_ps(f1, upper), lower);
        f2 = _mm_max_ps(_mm_min_ps(f2, upper), lower);
        f3 = _mm_max_ps(_mm_min_ps(f3, upper), lower);

        const __m128i i0 = _mm_cvtps_epi32(f0);
        const __m128i i1 = _mm_cvtps_epi32(f1);
        const __m128i i2 = _mm_cvtps_epi32(f2);
        const __m128i i3 = _mm_cvtps_epi32(f3);

        /* Narrow 32-bit -> 16-bit -> 8-bit. */
        const __m128i words_lo = _mm_packs_epi32(i0, i1);
        const __m128i words_hi = _mm_packs_epi32(i2, i3);
        const __m128i bytes = _mm_packs_epi16(words_lo, words_hi);

        _mm_storeu_si128((__m128i*)dst, bytes);

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the points that do not fill a whole 16-point block. */
    for (unsigned int i = num_blocks * 16; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
215
216#endif /* LV_HAVE_SSE2 */
217
218
219#ifdef LV_HAVE_SSE
220#include <xmmintrin.h>
221
222static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
223 const float* inputVector,
224 const float scalar,
225 unsigned int num_points)
226{
227 unsigned int number = 0;
228 size_t inner_loop;
229
230 const unsigned int quarterPoints = num_points / 4;
231
232 const float* inputVectorPtr = (const float*)inputVector;
233 int8_t* outputVectorPtr = outputVector;
234
235 float min_val = INT8_MIN;
236 float max_val = INT8_MAX;
237 float r;
238
239 __m128 vScalar = _mm_set_ps1(scalar);
240 __m128 ret;
241 __m128 vmin_val = _mm_set_ps1(min_val);
242 __m128 vmax_val = _mm_set_ps1(max_val);
243
244 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
245
246 for (; number < quarterPoints; number++) {
247 ret = _mm_loadu_ps(inputVectorPtr);
248 inputVectorPtr += 4;
249
250 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
251
252 _mm_store_ps(outputFloatBuffer, ret);
253 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
254 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
255 }
256 }
257
258 number = quarterPoints * 4;
259 for (; number < num_points; number++) {
260 r = inputVector[number] * scalar;
261 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
262 }
263}
264
265#endif /* LV_HAVE_SSE */
266
267
268#ifdef LV_HAVE_GENERIC
269
/*
 * Portable reference kernel.
 *
 * For each of the num_points input floats: multiply by scalar and hand the
 * product to volk_32f_s32f_convert_8i_single(), which writes the int8_t
 * result into the matching slot of outputVector.
 */
static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
284
285#endif /* LV_HAVE_GENERIC */
286
287
288#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
289#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
290#define INCLUDED_volk_32f_s32f_convert_8i_a_H
291
292#include <inttypes.h>
293#include <stdio.h>
294#include <volk/volk_common.h>
295
296#ifdef LV_HAVE_AVX2
297#include <immintrin.h>
298
/*
 * AVX2 kernel, aligned loads/stores.
 *
 * Same computation as the unaligned AVX2 kernel: scale by scalar, clamp to
 * [INT8_MIN, INT8_MAX], convert to integer, narrow to int8_t. It uses
 * _mm256_load_ps / _mm256_store_si256, so inputVector and outputVector must
 * satisfy the alignment those intrinsics require. 32 points are handled per
 * vector iteration; the remaining num_points % 32 points go through the
 * scalar helper.
 */
static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 32;

    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m256 scale = _mm256_set1_ps(scalar);
    const __m256 lower = _mm256_set1_ps((float)INT8_MIN);
    const __m256 upper = _mm256_set1_ps((float)INT8_MAX);

    for (unsigned int block = 0; block < num_blocks; block++) {
        /* Load four groups of 8 floats and apply the scale factor. */
        __m256 f0 = _mm256_mul_ps(_mm256_load_ps(src), scale);
        __m256 f1 = _mm256_mul_ps(_mm256_load_ps(src + 8), scale);
        __m256 f2 = _mm256_mul_ps(_mm256_load_ps(src + 16), scale);
        __m256 f3 = _mm256_mul_ps(_mm256_load_ps(src + 24), scale);

        /* Clamp to the int8_t range before converting. */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upper), lower);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upper), lower);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upper), lower);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upper), lower);

        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /*
         * 32-bit -> 16-bit. 0xD8 (binary 11 01 10 00) reorders the 64-bit
         * quadwords as 0, 2, 1, 3 after each pack; the 256-bit pack works on
         * each 128-bit half separately, so this restores sample order.
         */
        __m256i words_lo = _mm256_packs_epi32(i0, i1);
        words_lo = _mm256_permute4x64_epi64(words_lo, 0xD8);
        __m256i words_hi = _mm256_packs_epi32(i2, i3);
        words_hi = _mm256_permute4x64_epi64(words_hi, 0xD8);

        /* 16-bit -> 8-bit, followed by the same quadword reorder. */
        __m256i bytes = _mm256_packs_epi16(words_lo, words_hi);
        bytes = _mm256_permute4x64_epi64(bytes, 0xD8);

        _mm256_store_si256((__m256i*)dst, bytes);

        src += 32;
        dst += 32;
    }

    /* Scalar tail for the points that do not fill a whole 32-point block. */
    for (unsigned int i = num_blocks * 32; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
364
365#endif /* LV_HAVE_AVX2 */
366
367
368#ifdef LV_HAVE_SSE2
369#include <emmintrin.h>
370
/*
 * SSE2 kernel, aligned loads/stores.
 *
 * Same computation as the unaligned SSE2 kernel: scale by scalar, clamp to
 * [INT8_MIN, INT8_MAX], convert to integer, narrow to int8_t. It uses
 * _mm_load_ps / _mm_store_si128, so inputVector and outputVector must satisfy
 * the alignment those intrinsics require. 16 points are handled per vector
 * iteration; the remaining num_points % 16 points go through the scalar
 * helper.
 */
static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;

    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m128 scale = _mm_set_ps1(scalar);
    const __m128 lower = _mm_set_ps1((float)INT8_MIN);
    const __m128 upper = _mm_set_ps1((float)INT8_MAX);

    for (unsigned int block = 0; block < num_blocks; block++) {
        /* Load four groups of 4 floats and apply the scale factor. */
        __m128 f0 = _mm_mul_ps(_mm_load_ps(src), scale);
        __m128 f1 = _mm_mul_ps(_mm_load_ps(src + 4), scale);
        __m128 f2 = _mm_mul_ps(_mm_load_ps(src + 8), scale);
        __m128 f3 = _mm_mul_ps(_mm_load_ps(src + 12), scale);

        /* Clamp to the int8_t range before converting. */
        f0 = _mm_max_ps(_mm_min_ps(f0, upper), lower);
        f1 = _mm_max_ps(_mm_min_ps(f1, upper), lower);
        f2 = _mm_max_ps(_mm_min_ps(f2, upper), lower);
        f3 = _mm_max_ps(_mm_min_ps(f3, upper), lower);

        const __m128i i0 = _mm_cvtps_epi32(f0);
        const __m128i i1 = _mm_cvtps_epi32(f1);
        const __m128i i2 = _mm_cvtps_epi32(f2);
        const __m128i i3 = _mm_cvtps_epi32(f3);

        /* Narrow 32-bit -> 16-bit -> 8-bit. */
        const __m128i words_lo = _mm_packs_epi32(i0, i1);
        const __m128i words_hi = _mm_packs_epi32(i2, i3);
        const __m128i bytes = _mm_packs_epi16(words_lo, words_hi);

        _mm_store_si128((__m128i*)dst, bytes);

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the points that do not fill a whole 16-point block. */
    for (unsigned int i = num_blocks * 16; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
432#endif /* LV_HAVE_SSE2 */
433
434
435#ifdef LV_HAVE_SSE
436#include <xmmintrin.h>
437
438static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
439 const float* inputVector,
440 const float scalar,
441 unsigned int num_points)
442{
443 unsigned int number = 0;
444 size_t inner_loop;
445
446 const unsigned int quarterPoints = num_points / 4;
447
448 const float* inputVectorPtr = (const float*)inputVector;
449
450 float min_val = INT8_MIN;
451 float max_val = INT8_MAX;
452 float r;
453
454 int8_t* outputVectorPtr = outputVector;
455 __m128 vScalar = _mm_set_ps1(scalar);
456 __m128 ret;
457 __m128 vmin_val = _mm_set_ps1(min_val);
458 __m128 vmax_val = _mm_set_ps1(max_val);
459
460 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
461
462 for (; number < quarterPoints; number++) {
463 ret = _mm_load_ps(inputVectorPtr);
464 inputVectorPtr += 4;
465
466 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
467
468 _mm_store_ps(outputFloatBuffer, ret);
469 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
470 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
471 }
472 }
473
474 number = quarterPoints * 4;
475 for (; number < num_points; number++) {
476 r = inputVector[number] * scalar;
477 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
478 }
479}
480
481#endif /* LV_HAVE_SSE */
482
483
484#ifdef LV_HAVE_GENERIC
485
/*
 * Portable reference kernel (aligned-API variant).
 *
 * For each of the num_points input floats: multiply by scalar and hand the
 * product to volk_32f_s32f_convert_8i_single(), which writes the int8_t
 * result into the matching slot of outputVector. Nothing in this function
 * depends on pointer alignment.
 */
static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
500
501#endif /* LV_HAVE_GENERIC */
502
503
504#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */