extern int sse41_supported;
/* Octo-Viterbi butterfly: add/subtract the branch metrics M2 to the path
 * metric pairs M0/M1, select the surviving sums into M2/M4 and record the
 * path decisions in M3/M1 (all packed 16-bit integers).
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
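/*
 * Illustrative sketch, not part of the original file: one way the butterfly
 * macro could be exercised on a single pair of metric registers. The buffer
 * names pm0/pm1/bm/decisions are hypothetical and assumed 16-byte aligned,
 * matching the aligned loads used by the real metric functions below.
 */
__always_inline static void _example_sse_butterfly(int16_t *pm0, int16_t *pm1,
	const int16_t *bm, int16_t *decisions)
{
	__m128i m0, m1, m2, m3, m4;

	m0 = _mm_load_si128((__m128i *) pm0);		/* path metrics 0 */
	m1 = _mm_load_si128((__m128i *) pm1);		/* path metrics 1 */
	m2 = _mm_load_si128((__m128i *) bm);		/* branch metrics */

	SSE_BUTTERFLY(m0, m1, m2, m3, m4)

	_mm_store_si128((__m128i *) pm0, m2);		/* surviving sums 0 */
	_mm_store_si128((__m128i *) pm1, m4);		/* surviving sums 1 */
	_mm_store_si128((__m128i *) &decisions[0], m3);	/* selections 0 */
	_mm_store_si128((__m128i *) &decisions[8], m1);	/* selections 1 */
}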
/* Byte shuffle mask: gather even 16-bit elements into the low 64 bits and
 * odd 16-bit elements into the high 64 bits of a register.
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
/* Two-lane deinterleaving for K = 5: split the even- and odd-indexed 16-bit
 * elements of M0/M1 and regroup them into M2 (even) and M3 (odd).
 */
#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
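/*
 * Reference only, not part of the original file: a scalar equivalent of the
 * K = 5 deinterleave, shown for clarity. Even-indexed 16-bit elements of the
 * two inputs are gathered into `even`, odd-indexed ones into `odd`; the
 * function and array names are hypothetical.
 */
__always_inline static void _example_deinterleave_k5(const int16_t *in0,
	const int16_t *in1, int16_t *even, int16_t *odd)
{
	int i;

	for (i = 0; i < 4; i++) {
		even[i] = in0[2 * i];
		even[i + 4] = in1[2 * i];
		odd[i] = in0[2 * i + 1];
		odd[i + 4] = in1[2 * i + 1];
	}
}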
/* Two-lane deinterleaving for K = 7: same even/odd split as the K = 5
 * variant, applied to the eight registers M0-M7 with results in M8-M15.
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
			    M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}
/* Branch metrics for N = 2: correlate the received values in M4 with the
 * expected trellis outputs in M0-M3 (_mm_sign_epi16) and pairwise-add the
 * products into M6/M7 with saturation.
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}
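/*
 * Reference only, not part of the original file: the per-state computation
 * that the N = 2 branch metric macro vectorizes, assuming expected trellis
 * outputs of -1, 0 or +1 (the convention implied by _mm_sign_epi16). The
 * vector version additionally saturates the sum to 16 bits; the function
 * name is hypothetical.
 */
__always_inline static int16_t _example_branch_metric_n2(const int16_t *val,
	const int16_t *out)
{
	return (int16_t) (val[0] * out[0] + val[1] * out[1]);
}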
/* Branch metrics for N = 4: as above, but two rounds of saturated pairwise
 * adds reduce the four products per state to a single metric in M5.
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}
/* Horizontal minimum: reduce the 8 packed 16-bit metrics in M0 so that the
 * minimum ends up in the low element. Uses _mm_minpos_epu16 (PHMINPOSUW)
 * when SSE4.1 is available at runtime, otherwise a shuffle/min cascade
 * with M1 as scratch.
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif
/* Normalize K = 5 path metrics: find the minimum of the 16 accumulated
 * metrics in M0/M1, broadcast it to every lane and subtract it so the sums
 * stay within 16-bit range. M2/M3 are scratch registers.
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	M2 = _mm_unpacklo_epi16(M2, M2); \
	M2 = _mm_unpacklo_epi32(M2, M2); \
	M2 = _mm_unpacklo_epi64(M2, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}

/* Normalize K = 7 path metrics: same reduction over the 64 metrics in
 * M0-M7, using M8-M11 as scratch.
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	M8 = _mm_unpacklo_epi16(M8, M8); \
	M8 = _mm_unpacklo_epi32(M8, M8); \
	M8 = _mm_unpacklo_epi64(M8, M8); \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}
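/*
 * Reference only, not part of the original file: the scalar effect of the
 * normalization macros, i.e. subtracting the current minimum from every
 * accumulated metric so the 16-bit sums do not saturate. `num` would be 16
 * for K = 5 and 64 for K = 7; the function name is hypothetical.
 */
__always_inline static void _example_normalize(int16_t *sums, int num)
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < num; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < num; i++)
		sums[i] -= min;
}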
/* One Viterbi time step for K = 5, N = 2 over 16 states: branch metric
 * unit (BMU) followed by path metric unit (PMU).
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input symbols */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	/* (PMU) Normalization */
	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
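/*
 * Illustrative sketch, not part of the original file: how a caller might
 * wrap _sse_metrics_k5_n2 for one time step. The wrapper name is
 * hypothetical. For N = 2 the two soft symbols are duplicated so that every
 * adjacent lane pair of the 8-byte _mm_loaddup_pd load holds (seq[0],
 * seq[1]) and lines up with one state's output pair in `out`.
 */
__always_inline static void _example_call_k5_n2(const int16_t *seq,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	const int16_t val[4] = { seq[0], seq[1], seq[0], seq[1] };

	_sse_metrics_k5_n2(val, out, sums, paths, norm);
}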
/* One Viterbi time step for K = 5, N = 4 over 16 states with four outputs
 * per branch.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input symbols */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	/* (PMU) Normalization */
	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
/* One Viterbi time step for K = 7, N = 2 over 64 states.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even/odd register halves */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	/* (PMU) Normalization */
	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
/* One Viterbi time step for K = 7, N = 4 over 64 states with four outputs
 * per branch.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even/odd register halves */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	/* (PMU) Normalization */
	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}