/* Octo-Butterfly:
 * Perform eight add-compare-select butterflies on the packed 16-bit path
 * metrics in M0 and M1 using the branch metrics in M2. Accumulated path
 * metrics are returned in M2 and M4, path decisions in M3 and M1.
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
        M3 = _mm_adds_epi16(M0, M2); \
        M4 = _mm_subs_epi16(M1, M2); \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_adds_epi16(M1, M2); \
        M2 = _mm_max_epi16(M3, M4); \
        M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
        M4 = _mm_max_epi16(M0, M1); \
        M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
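
/* Two-lane deinterleaving, K = 5:
 * Split the 16 packed 16-bit path metrics in M0 and M1 into even and odd
 * indexed lanes. The byte shuffle mask and one intermediate register are
 * used; even metrics are returned in M2 and odd metrics in M3.
 */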
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M2); \
        M1 = _mm_shuffle_epi8(M1, M2); \
        M2 = _mm_unpacklo_epi64(M0, M1); \
        M3 = _mm_unpackhi_epi64(M0, M1); \
}
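
/* Two-lane deinterleaving, K = 7:
 * Split the 64 packed 16-bit path metrics in M0 - M7 into even and odd
 * indexed lanes. Even metrics are returned in M8, M10, M12 and M14, odd
 * metrics in M9, M11, M13 and M15; M8 initially carries the shuffle mask.
 */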
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
                M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
        M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M8); \
        M1 = _mm_shuffle_epi8(M1, M8); \
        M2 = _mm_shuffle_epi8(M2, M8); \
        M3 = _mm_shuffle_epi8(M3, M8); \
        M4 = _mm_shuffle_epi8(M4, M8); \
        M5 = _mm_shuffle_epi8(M5, M8); \
        M6 = _mm_shuffle_epi8(M6, M8); \
        M7 = _mm_shuffle_epi8(M7, M8); \
        M8 = _mm_unpacklo_epi64(M0, M1); \
        M9 = _mm_unpackhi_epi64(M0, M1); \
        M10 = _mm_unpacklo_epi64(M2, M3); \
        M11 = _mm_unpackhi_epi64(M2, M3); \
        M12 = _mm_unpacklo_epi64(M4, M5); \
        M13 = _mm_unpackhi_epi64(M4, M5); \
        M14 = _mm_unpacklo_epi64(M6, M7); \
        M15 = _mm_unpackhi_epi64(M6, M7); \
}
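
/* Branch metrics, N = 2:
 * Compute 16 branch metrics from the trellis outputs in M0 - M3 and the
 * input symbols in M4. The received symbols are sign-adjusted by the
 * corresponding trellis outputs and then pairwise summed with a saturating
 * horizontal add. The packed results are returned in M6 and M7.
 */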
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M6 = _mm_hadds_epi16(M0, M1); \
        M7 = _mm_hadds_epi16(M2, M3); \
}
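
/* Branch metrics, N = 4:
 * Compute 8 branch metrics from the trellis outputs in M0 - M3 and the
 * input symbols in M4. Two rounds of saturating horizontal adds collapse
 * each group of 4 sign-adjusted symbols into a single metric; the packed
 * results are returned in M5.
 */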
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M0 = _mm_hadds_epi16(M0, M1); \
        M1 = _mm_hadds_epi16(M2, M3); \
        M5 = _mm_hadds_epi16(M0, M1); \
}
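
/* Horizontal minimum:
 * Reduce the 8 packed 16-bit values in M0 to their minimum, which ends up
 * in the lowest lane of M0. The SSE 4.1 PHMINPOSUW instruction is used when
 * the runtime flag sse41_supported is set; otherwise a shuffle-and-compare
 * reduction is performed with M1 as scratch.
 */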
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
        if (sse41_supported) { \
                M0 = _mm_minpos_epu16(M0); \
        } else { \
                M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
                M0 = _mm_min_epi16(M0, M1); \
        } \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
        M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
        M0 = _mm_min_epi16(M0, M1); \
}
#endif
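
/* Normalization, K = 5:
 * Subtract the smallest path metric from all 16 metrics in M0 and M1 to
 * keep the accumulated sums within the 16-bit range. M2 and M3 are scratch
 * registers. SSE_BROADCAST(), which replicates the minimum to all lanes, is
 * assumed to be defined by the source file that includes this header.
 */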
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_min_epi16(M0, M1); \
        SSE_MINPOS(M2, M3) \
        SSE_BROADCAST(M2) \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_subs_epi16(M1, M2); \
}
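
/* Normalization, K = 7:
 * Subtract the smallest path metric from all 64 metrics in M0 - M7. The
 * minimum is found by a tree of pairwise minimums followed by SSE_MINPOS()
 * and a broadcast; M8 - M11 are scratch registers.
 */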
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
        M8 = _mm_min_epi16(M0, M1); \
        M9 = _mm_min_epi16(M2, M3); \
        M10 = _mm_min_epi16(M4, M5); \
        M11 = _mm_min_epi16(M6, M7); \
        M8 = _mm_min_epi16(M8, M9); \
        M10 = _mm_min_epi16(M10, M11); \
        M8 = _mm_min_epi16(M8, M10); \
        SSE_MINPOS(M8, M9) \
        SSE_BROADCAST(M8) \
        M0 = _mm_subs_epi16(M0, M8); \
        M1 = _mm_subs_epi16(M1, M8); \
        M2 = _mm_subs_epi16(M2, M8); \
        M3 = _mm_subs_epi16(M3, M8); \
        M4 = _mm_subs_epi16(M4, M8); \
        M5 = _mm_subs_epi16(M5, M8); \
        M6 = _mm_subs_epi16(M6, M8); \
        M7 = _mm_subs_epi16(M7, M8); \
}
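
/* Combined BMU/PMU, K = 5, N = 2:
 * Compute the branch metrics for a rate 1/2 code, run the add-compare-select
 * butterflies for all 16 states and store the updated path metrics and path
 * decisions. All buffers are accessed with aligned loads and stores, so they
 * are assumed to be 16-byte aligned. Normalization runs only when requested
 * through the norm argument.
 */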
static __always_inline void _sse_metrics_k5_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load the input symbols */
        m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load the trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);

        /* (BMU) Compute branch metrics */
        m0 = _mm_sign_epi16(m2, m0);
        m1 = _mm_sign_epi16(m2, m1);
        m2 = _mm_hadds_epi16(m0, m1);

        /* (PMU) Load accumulated path metrics and deinterleave */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}
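
/* Combined BMU/PMU, K = 5, N = 4:
 * Same as the N = 2 case, except that the branch metrics are computed over
 * 4 output symbols per branch with SSE_BRANCH_METRIC_N4().
 */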
static __always_inline void _sse_metrics_k5_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load the input symbols */
        m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load the trellis outputs and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);
        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

        /* (PMU) Load accumulated path metrics and deinterleave */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}
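
/* Combined BMU/PMU, K = 7, N = 2:
 * Compute the branch metrics for a rate 1/2 code and run the
 * add-compare-select butterflies for all 64 states. Path decisions are
 * stored as each group of butterflies completes; the updated path metrics
 * are stored at the end, after optional normalization.
 */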
static __always_inline void _sse_metrics_k7_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
                m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load the input symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load the trellis outputs and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                        m7, m11, m0, m8, m9, m10)

        /* (PMU) Store accumulated path metrics */
        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}
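
/* Combined BMU/PMU, K = 7, N = 4:
 * Same as the K = 7, N = 2 case, but with branch metrics computed over 4
 * output symbols per branch using SSE_BRANCH_METRIC_N4().
 */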
static __always_inline void _sse_metrics_k7_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7;
        __m128i m8, m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load the input symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load the trellis outputs and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

        m0 = _mm_load_si128((__m128i *) &out[64]);
        m1 = _mm_load_si128((__m128i *) &out[72]);
        m2 = _mm_load_si128((__m128i *) &out[80]);
        m3 = _mm_load_si128((__m128i *) &out[88]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

        m0 = _mm_load_si128((__m128i *) &out[96]);
        m1 = _mm_load_si128((__m128i *) &out[104]);
        m2 = _mm_load_si128((__m128i *) &out[112]);
        m3 = _mm_load_si128((__m128i *) &out[120]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                        m7, m11, m0, m8, m9, m10)

        /* (PMU) Store accumulated path metrics */
        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}