Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_16i_max_star_horizontal_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: GPL-3.0-or-later
8 */
9
40#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
41#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
42
43#include <volk/volk_common.h>
44
45#include <inttypes.h>
46#include <stdio.h>
47
48
49#ifdef LV_HAVE_SSSE3
50
51#include <emmintrin.h>
52#include <tmmintrin.h>
53#include <xmmintrin.h>
54
/*
 * SSSE3 kernel: for every adjacent pair (src0[2k], src0[2k + 1]) write one
 * element to target[k].  The first element of the pair is kept unless the
 * 16-bit wrapped difference src0[2k] - src0[2k + 1] is negative, in which
 * case the second element is kept.
 *
 * target     - output buffer; receives one int16_t per input pair.
 * src0       - input buffer of num_points int16_t samples.
 * num_points - number of INPUT samples.  The scalar tail steps two samples at
 *              a time, so an odd count reads one sample past num_points.
 *
 * Aligned loads/stores are used on src0 and target, so both pointers must be
 * 16-byte aligned.
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    /* pshufb controls that gather the even-indexed 16-bit lanes of a vector
     * into the low (mask0) or high (mask1) 64 bits; 0xff bytes produce zero. */
    static const uint8_t shufmask0[16] = { 0x00, 0x01, 0x04, 0x05, 0x08, 0x09,
                                           0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0xff, 0xff };
    static const uint8_t shufmask1[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0x00, 0x01, 0x04, 0x05,
                                           0x08, 0x09, 0x0c, 0x0d };
    /* Byte offset (+2) that moves a shuffle index from the even lane to the
     * odd lane of the same pair, restricted to the low / high 64 bits. */
    static const uint8_t andmask0[16] = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x00, 0x00 };
    static const uint8_t andmask1[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x02, 0x02 };

    /* The tables are plain byte arrays with no alignment guarantee, so they
     * are fetched with unaligned loads. */
    const __m128i even_to_lo = _mm_loadu_si128((const __m128i*)shufmask0);
    const __m128i even_to_hi = _mm_loadu_si128((const __m128i*)shufmask1);
    const __m128i odd_step_lo = _mm_loadu_si128((const __m128i*)andmask0);
    const __m128i odd_step_hi = _mm_loadu_si128((const __m128i*)andmask1);
    const __m128i zero = _mm_setzero_si128();

    /* Derived straight from num_points (no byte-count round trip that could
     * wrap): full 16-sample passes, an optional 8-sample pass, scalar rest. */
    const unsigned int bound = num_points >> 4;
    const unsigned int intermediate = (num_points >> 3) & 1;
    const unsigned int leftovers = num_points & 7;
    const unsigned int tail_start = (bound << 4) + (intermediate << 3);

    const __m128i* p_src0 = (const __m128i*)src0;
    __m128i* p_target = (__m128i*)target;
    unsigned int i;

    for (i = 0; i < bound; ++i) {
        __m128i first = _mm_load_si128(p_src0);
        __m128i second = _mm_load_si128(p_src0 + 1);
        p_src0 += 2;

        /* Pairwise differences: lanes 0-3 from `first`, lanes 4-7 from `second`. */
        const __m128i diff = _mm_hsub_epi16(first, second);
        /* All-ones in every lane whose difference is negative. */
        const __m128i negative = _mm_cmpgt_epi16(zero, diff);

        /* Where the difference is negative, bump the shuffle index by two
         * bytes so the odd element of the pair is selected instead. */
        const __m128i ctrl_lo =
            _mm_add_epi8(_mm_and_si128(negative, odd_step_lo), even_to_lo);
        const __m128i ctrl_hi =
            _mm_add_epi8(_mm_and_si128(negative, odd_step_hi), even_to_hi);

        first = _mm_shuffle_epi8(first, ctrl_lo);
        second = _mm_shuffle_epi8(second, ctrl_hi);

        /* The two shuffled halves occupy disjoint lanes; adding merges them. */
        _mm_store_si128(p_target, _mm_add_epi16(first, second));
        p_target += 1;
    }

    if (intermediate) {
        __m128i first = _mm_load_si128(p_src0);

        /* Only the low four differences are used, so the second operand is
         * simply zero rather than a stale or uninitialised register. */
        const __m128i diff = _mm_hsub_epi16(first, zero);
        const __m128i negative = _mm_cmpgt_epi16(zero, diff);
        const __m128i ctrl_lo =
            _mm_add_epi8(_mm_and_si128(negative, odd_step_lo), even_to_lo);

        first = _mm_shuffle_epi8(first, ctrl_lo);

        /* Four results = 8 bytes: store just the low 64 bits. */
        _mm_storel_epi64(p_target, first);
    }

    /* Scalar tail for the samples the vector passes did not cover. */
    for (i = tail_start; i < tail_start + leftovers; i += 2) {
        target[i >> 1] =
            ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}
153
154#endif /*LV_HAVE_SSSE3*/
155
156#ifdef LV_HAVE_NEON
157
158#include <arm_neon.h>
/*
 * NEON kernel: for every adjacent pair (src0[2k], src0[2k + 1]) write one
 * element to target[k].  The first element of the pair is kept unless the
 * 16-bit wrapped difference src0[2k] - src0[2k + 1] is negative, in which
 * case the second element is kept.
 *
 * target     - output buffer; receives one int16_t per input pair.
 * src0       - input buffer of num_points int16_t samples.
 * num_points - number of INPUT samples.  The scalar tail steps two samples at
 *              a time, so an odd count reads one sample past num_points.
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    /* Each vector pass consumes 16 input samples and emits 8 results. */
    const unsigned int sixteenth_points = num_points / 16;
    const int16x8_t zeros = vdupq_n_s16(0);
    unsigned int number;

    for (number = 0; number < sixteenth_points; ++number) {
        /* De-interleaving load: val[0] holds the even samples, val[1] the odd. */
        const int16x8x2_t input_vec = vld2q_s16(src0);
        const int16x8_t diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
        /* All-ones in lanes where the first element of the pair is kept. */
        const uint16x8_t keep_first = vcgeq_s16(diff, zeros);
        /* Bitwise select replaces the mask/mask/add sequence and avoids
         * casting between vector types. */
        const int16x8_t max_vec =
            vbslq_s16(keep_first, input_vec.val[0], input_vec.val[1]);

        vst1q_s16(target, max_vec);
        src0 += 16;
        target += 8;
    }

    /* src0/target were advanced above, so the tail indexes from zero. */
    for (number = 0; number < num_points % 16; number += 2) {
        target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
                                  ? src0[number]
                                  : src0[number + 1];
    }
}
190#endif /* LV_HAVE_NEON */
191
192#ifdef LV_HAVE_NEONV7
/*
 * Prototype only: this header declares the kernel but does not define it.
 * NOTE(review): presumably implemented in a separate ARMv7 NEON assembly
 * source and expected to follow the same (target, src0, num_points) contract
 * as the other kernels in this file -- confirm against that source.
 */
193extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
194 int16_t* src0,
195 unsigned int num_points);
196#endif /* LV_HAVE_NEONV7 */
197
198#ifdef LV_HAVE_GENERIC
/*
 * Portable kernel: for every adjacent pair (src0[2k], src0[2k + 1]) write one
 * element to target[k].  The first element of the pair is kept when the
 * 16-bit wrapped difference src0[2k] - src0[2k + 1] is positive; otherwise
 * the second element is kept.
 *
 * target     - output buffer; receives one int16_t per input pair.
 * src0       - input buffer of num_points int16_t samples.
 * num_points - number of INPUT samples.  A trailing unpaired sample is still
 *              processed as a pair, which reads one sample past num_points,
 *              so callers should pass an even count.
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    /* Pair count taken directly from num_points: no multiply-then-shift
     * round trip that could wrap, and no signed/unsigned counter mix. */
    const unsigned int num_pairs = num_points / 2 + (num_points & 1);
    unsigned int pair;

    for (pair = 0; pair < num_pairs; ++pair) {
        const int16_t first = src0[2 * pair];
        const int16_t second = src0[2 * pair + 1];
        /* The difference is deliberately truncated to 16 bits before the
         * sign test, so the comparison is modular rather than a plain max. */
        const int16_t diff = (int16_t)(first - second);

        target[pair] = (diff > 0) ? first : second;
    }
}
213
214#endif /*LV_HAVE_GENERIC*/
215
216#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/