libosmocore  0.9.6.311-c977
Osmocom core library
conv_acc_sse_impl.h
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16 accumulated
 * sums. Inputs are all packed 16-bit integers in three 128-bit XMM registers.
 * Two intermediate registers are used and results are set in the upper 4
 * registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}

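/* For reference, a scalar sketch of one 16-bit lane of the butterfly above
 * (illustrative only; it omits the saturating arithmetic performed by the
 * SSE intrinsics). With path metrics pm0, pm1 and branch metric bm:
 *
 *	a = pm0 + bm;  b = pm1 - bm;
 *	c = pm0 - bm;  d = pm1 + bm;
 *	new_pm0 = (a > b) ? a : b;		(selected path metric 0)
 *	dec0    = (a >= b) ? 0xffff : 0;	(path selection 0)
 *	new_pm1 = (c > d) ? c : d;		(selected path metric 1)
 *	dec1    = (c >= d) ? 0xffff : 0;	(path selection 1)
 *
 * This is the classic Viterbi add-compare-select step, performed on eight
 * lanes at once.
 */
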
/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used with
 * the lower 2 as input and upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}

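/* The shuffle mask above sorts the even-indexed 16-bit elements of each
 * register into its low 64 bits and the odd-indexed elements into its high
 * 64 bits (_mm_set_epi8() takes its byte arguments from the most significant
 * position down), so the two unpack operations amount to the scalar sketch
 * below (illustrative only), where in[] holds the 16 interleaved values of
 * both input registers:
 *
 *	for (i = 0; i < 8; i++) {
 *		even[i] = in[2 * i];		(low output register)
 *		odd[i]  = in[2 * i + 1];	(high output register)
 *	}
 */
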
/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used with the
 * lower 8 as input and upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
			    M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4   - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}

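/* As a scalar sketch (illustrative only), each branch metric is the
 * correlation of the received soft symbol pair with one expected trellis
 * output pair, the expected outputs being stored as +1/-1:
 *
 *	bm = out[0] * val[0] + out[1] * val[1];
 *
 * _mm_sign_epi16() applies the sign of the expected output to the received
 * value and _mm_hadds_epi16() performs the pairwise summation.
 */
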
/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4   - Expanded and packed 16-bit input value
 *
 * Output:
 * M5   - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}

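/* The second _mm_hadds_epi16() stage reduces each group of four products to
 * a single metric; as a scalar sketch (illustrative only), per group of four
 * expected outputs:
 *
 *	bm = out[0] * val[0] + out[1] * val[1]
 *	   + out[2] * val[2] + out[3] * val[3];
 *
 * For N = 3 the unused fourth soft value is zero-padded and drops out of
 * the sum.
 */
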
/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif

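/* The fallback path is a shuffle-and-min reduction that halves the number of
 * candidate lanes at each step (8 -> 4 -> 2 -> 1). As a scalar sketch
 * (illustrative only) it computes:
 *
 *	min = m[0];
 *	for (i = 1; i < 8; i++)
 *		if (m[i] < min)
 *			min = m[i];
 *
 * leaving the minimum in the lowest 16-bit lane. Unlike _mm_minpos_epu16(),
 * it does not recover the index of the minimum, which is not needed here.
 */
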
/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}

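/* Normalization shifts all metrics by a common offset, so the relative
 * ordering of paths is unchanged while the accumulated sums are kept away
 * from the signed 16-bit saturation limit. A scalar sketch (illustrative
 * only) over the 16 state metrics:
 *
 *	min = sums[0];
 *	for (i = 1; i < 16; i++)
 *		if (sums[i] < min)
 *			min = sums[i];
 *	for (i = 0; i < 16; i++)
 *		sums[i] -= min;
 */
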
/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate 16-state
 * trellis. 8 butterflies are computed. Accumulated path sums are not
 * preserved; they are read from and written back to the same memory
 * location. Normalize the sums if required.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

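/* A hypothetical calling pattern for the kernel above (a sketch only; the
 * array names are placeholders, not the actual decoder state): one call per
 * received symbol pair, with 16 path decisions written per step and the
 * accumulated sums updated in place.
 *
 *	for (i = 0; i < len; i++)
 *		_sse_metrics_k5_n2(&vals[2 * i], outputs, sums,
 *				   &paths[16 * i], norm);
 */
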
/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is read
 * four 16-bit values at a time, and extra values should be set to zero for
 * rates other than 1/4. Normally only rates 1/3 and 1/4 are used, as there
 * is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate 64-state
 * trellis. 32 butterfly operations are computed. Deinterleaving the path
 * metrics requires the full SSE register file, so the sums are separated
 * before the branch metrics are computed to avoid register spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Deinterleave
 * the path metrics before computing branch metrics, as in the half rate
 * case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input sequence */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}