OLD | NEW |
1 // Copyright 2011 Google Inc. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // This code is licensed under the same terms as WebM: | 3 // This code is licensed under the same terms as WebM: |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // Software License Agreement: http://www.webmproject.org/license/software/ |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ |
6 // ----------------------------------------------------------------------------- | 6 // ----------------------------------------------------------------------------- |
7 // | 7 // |
8 // SSE2 version of some decoding functions (idct, loop filtering). | 8 // SSE2 version of some decoding functions (idct, loop filtering). |
9 // | 9 // |
10 // Author: somnath@google.com (Somnath Banerjee) | 10 // Author: somnath@google.com (Somnath Banerjee) |
11 // cduvivier@google.com (Christian Duvivier) | 11 // cduvivier@google.com (Christian Duvivier) |
12 | 12 |
13 #if defined(__SSE2__) || defined(_MSC_VER) | 13 #include "./dsp.h" |
| 14 |
| 15 #if defined(WEBP_USE_SSE2) |
14 | 16 |
15 #include <emmintrin.h> | 17 #include <emmintrin.h> |
16 #include "../dec/vp8i.h" | 18 #include "../dec/vp8i.h" |
17 | 19 |
18 #if defined(__cplusplus) || defined(c_plusplus) | 20 #if defined(__cplusplus) || defined(c_plusplus) |
19 extern "C" { | 21 extern "C" { |
20 #endif | 22 #endif |
21 | 23 |
22 //------------------------------------------------------------------------------ | 24 //------------------------------------------------------------------------------ |
23 // Transforms (Paragraph 14.4) | 25 // Transforms (Paragraph 14.4) |
(...skipping 310 matching lines...) |
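Note: the header hunk above replaces the inline compiler check
(__SSE2__ || _MSC_VER) with the WEBP_USE_SSE2 macro centralized in
"./dsp.h", and the hunks below replace the C99 keyword "inline" (which
some MSVC versions reject when compiling C) with WEBP_INLINE from the same
header. dsp.h is not part of this diff; a plausible sketch of the
definitions it provides, assuming they mirror the checks being removed:

  // Hypothetical reconstruction of the dsp.h macros this change relies on;
  // the real definitions live in src/dsp/dsp.h and may differ in detail.
  #if defined(__SSE2__) || defined(_MSC_VER)
  #define WEBP_USE_SSE2
  #endif

  #if !defined(__cplusplus) && defined(_MSC_VER)
  #define WEBP_INLINE __forceinline   // MSVC C mode lacks 'inline'
  #else
  #define WEBP_INLINE inline
  #endif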
334 | 336 |
335 t1 = _mm_set1_epi8(thresh); | 337 t1 = _mm_set1_epi8(thresh); |
336 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh | 338 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh |
337 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); | 339 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); |
338 } | 340 } |
339 | 341 |
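Note: the two lines above implement an unsigned "x <= thresh" byte test.
SSE2 has no unsigned byte comparison, so NeedsFilter() uses saturating
subtraction: _mm_subs_epu8(x, t) is zero exactly when x <= t, and the
_mm_cmpeq_epi8 against zero turns that into a 0xff/0x00 per-lane mask. A
standalone illustration (main() and the sample values are mine, not part
of the library):

  #include <emmintrin.h>
  #include <stdio.h>

  int main(void) {
    const __m128i x = _mm_setr_epi8(0, 5, 40, 127, (char)128, (char)200,
                                    (char)255, 10, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i t = _mm_set1_epi8(40);
    const __m128i d = _mm_subs_epu8(x, t);   // saturates to 0 where x <= 40
    const __m128i le = _mm_cmpeq_epi8(d, _mm_setzero_si128());
    printf("0x%04x\n", _mm_movemask_epi8(le));   // prints 0xff87
    return 0;
  }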
340 //------------------------------------------------------------------------------ | 342 //------------------------------------------------------------------------------ |
341 // Edge filtering functions | 343 // Edge filtering functions |
342 | 344 |
343 // Applies filter on 2 pixels (p0 and q0) | 345 // Applies filter on 2 pixels (p0 and q0) |
344 static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, | 346 static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, |
345 const __m128i* q1, int thresh) { | 347 const __m128i* q1, int thresh) { |
346 __m128i a, mask; | 348 __m128i a, mask; |
347 const __m128i sign_bit = _mm_set1_epi8(0x80); | 349 const __m128i sign_bit = _mm_set1_epi8(0x80); |
348 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); | 350 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); |
349 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); | 351 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); |
350 | 352 |
351 NeedsFilter(p1, p0, q0, q1, thresh, &mask); | 353 NeedsFilter(p1, p0, q0, q1, thresh, &mask); |
352 | 354 |
353 // convert to signed values | 355 // convert to signed values |
354 FLIP_SIGN_BIT2(*p0, *q0); | 356 FLIP_SIGN_BIT2(*p0, *q0); |
355 | 357 |
356 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); | 358 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); |
357 a = _mm_and_si128(a, mask); // mask filter values we don't care about | 359 a = _mm_and_si128(a, mask); // mask filter values we don't care about |
358 DO_SIMPLE_FILTER(*p0, *q0, a); | 360 DO_SIMPLE_FILTER(*p0, *q0, a); |
359 | 361 |
360 // unoffset | 362 // unoffset |
361 FLIP_SIGN_BIT2(*p0, *q0); | 363 FLIP_SIGN_BIT2(*p0, *q0); |
362 } | 364 } |
363 | 365 |
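Note: the FLIP_SIGN_BIT macros used in DoFilter2() above (and in the
filters below) XOR each byte with 0x80, mapping unsigned pixels [0,255]
onto signed [-128,127] so that _mm_adds_epi8/_mm_subs_epi8 saturate exactly
at the pixel-range boundaries; XORing again undoes the offset. A
standalone illustration (sample values are mine):

  #include <emmintrin.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const __m128i sign_bit = _mm_set1_epi8((char)0x80);
    __m128i px = _mm_set1_epi8((char)250);    // a bright pixel
    const __m128i delta = _mm_set1_epi8(20);  // filter adjustment
    px = _mm_xor_si128(px, sign_bit);         // to the signed domain
    px = _mm_adds_epi8(px, delta);            // saturates at +127
    px = _mm_xor_si128(px, sign_bit);         // back to unsigned
    printf("%u\n", (uint8_t)_mm_cvtsi128_si32(px));  // 255, not 14 (wraparound)
    return 0;
  }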
364 // Applies filter on 4 pixels (p1, p0, q0 and q1) | 366 // Applies filter on 4 pixels (p1, p0, q0 and q1) |
365 static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1, | 367 static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0, |
366 const __m128i* mask, int hev_thresh) { | 368 __m128i* q0, __m128i* q1, |
| 369 const __m128i* mask, int hev_thresh) { |
367 __m128i not_hev; | 370 __m128i not_hev; |
368 __m128i t1, t2, t3; | 371 __m128i t1, t2, t3; |
369 const __m128i sign_bit = _mm_set1_epi8(0x80); | 372 const __m128i sign_bit = _mm_set1_epi8(0x80); |
370 | 373 |
371 // compute hev mask | 374 // compute hev mask |
372 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); | 375 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); |
373 | 376 |
374 // convert to signed values | 377 // convert to signed values |
375 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 378 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
376 | 379 |
(...skipping 24 matching lines...) |
401 | 404 |
402 t3 = _mm_and_si128(not_hev, t3); // if !hev | 405 t3 = _mm_and_si128(not_hev, t3); // if !hev |
403 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 | 406 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 |
404 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 | 407 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 |
405 | 408 |
406 // unoffset | 409 // unoffset |
407 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 410 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
408 } | 411 } |
409 | 412 |
410 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) | 413 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) |
411 static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, | 414 static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, |
412 __m128i* q0, __m128i* q1, __m128i *q2, | 415 __m128i* q0, __m128i* q1, __m128i *q2, |
413 const __m128i* mask, int hev_thresh) { | 416 const __m128i* mask, int hev_thresh) { |
414 __m128i a, not_hev; | 417 __m128i a, not_hev; |
415 const __m128i sign_bit = _mm_set1_epi8(0x80); | 418 const __m128i sign_bit = _mm_set1_epi8(0x80); |
416 | 419 |
417 // compute hev mask | 420 // compute hev mask |
418 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); | 421 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); |
419 | 422 |
420 // convert to signed values | 423 // convert to signed values |
421 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 424 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
422 FLIP_SIGN_BIT2(*p2, *q2); | 425 FLIP_SIGN_BIT2(*p2, *q2); |
423 | 426 |
(...skipping 35 matching lines...) |
459 | 462 |
460 // unoffset | 463 // unoffset |
461 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 464 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
462 FLIP_SIGN_BIT2(*p2, *q2); | 465 FLIP_SIGN_BIT2(*p2, *q2); |
463 } | 466 } |
464 | 467 |
465 // reads 8 rows across a vertical edge. | 468 // reads 8 rows across a vertical edge. |
466 // | 469 // |
467 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into | 470 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into |
468 // two Load4x4() to avoid code duplication. | 471 // two Load4x4() to avoid code duplication. |
469 static inline void Load8x4(const uint8_t* b, int stride, | 472 static WEBP_INLINE void Load8x4(const uint8_t* b, int stride, |
470 __m128i* p, __m128i* q) { | 473 __m128i* p, __m128i* q) { |
471 __m128i t1, t2; | 474 __m128i t1, t2; |
472 | 475 |
473 // Load 0th, 1st, 4th and 5th rows | 476 // Load 0th, 1st, 4th and 5th rows |
474 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00 | 477 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00 |
475 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10 | 478 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10 |
476 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40 | 479 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40 |
477 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50 | 480 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50 |
478 | 481 |
479 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00 | 482 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00 |
480 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10 | 483 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10 |
(...skipping 18 matching lines...) |
499 r0 = t1; | 502 r0 = t1; |
500 t1 = _mm_unpacklo_epi16(t1, t2); | 503 t1 = _mm_unpacklo_epi16(t1, t2); |
501 t2 = _mm_unpackhi_epi16(r0, t2); | 504 t2 = _mm_unpackhi_epi16(r0, t2); |
502 | 505 |
503 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 | 506 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 |
504 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 | 507 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
505 *p = _mm_unpacklo_epi32(t1, t2); | 508 *p = _mm_unpacklo_epi32(t1, t2); |
506 *q = _mm_unpackhi_epi32(t1, t2); | 509 *q = _mm_unpackhi_epi32(t1, t2); |
507 } | 510 } |
508 | 511 |
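Note: Load8x4() above is an 8x4 byte transpose built entirely from
interleaves, widening at each stage (unpack bytes, then 16-bit words, then
32-bit dwords). The same pattern in its minimal 4x4 form, as a sketch
(LoadRow4/Transpose4x4 are illustrative helpers, not library code):

  #include <emmintrin.h>
  #include <stdint.h>
  #include <string.h>

  static __m128i LoadRow4(const uint8_t* p) {
    int32_t v;
    memcpy(&v, p, 4);                // alias-safe unaligned 4-byte load
    return _mm_cvtsi32_si128(v);
  }

  static void Transpose4x4(const uint8_t* in, int stride, uint8_t out[16]) {
    const __m128i r0 = LoadRow4(in + 0 * stride);
    const __m128i r1 = LoadRow4(in + 1 * stride);
    const __m128i r2 = LoadRow4(in + 2 * stride);
    const __m128i r3 = LoadRow4(in + 3 * stride);
    const __m128i a = _mm_unpacklo_epi8(r0, r1);  // 00 10 01 11 02 12 03 13
    const __m128i b = _mm_unpacklo_epi8(r2, r3);  // 20 30 21 31 22 32 23 33
    const __m128i c = _mm_unpacklo_epi16(a, b);   // 00 10 20 30 01 11 21 31 ...
    _mm_storeu_si128((__m128i*)out, c);   // row i of out = column i of in
  }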
509 static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride, | 512 static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8, |
510 __m128i* p1, __m128i* p0, | 513 int stride, |
511 __m128i* q0, __m128i* q1) { | 514 __m128i* p1, __m128i* p0, |
| 515 __m128i* q0, __m128i* q1) { |
512 __m128i t1, t2; | 516 __m128i t1, t2; |
513 // Assume the pixels around the edge (|) are numbered as follows | 517 // Assume the pixels around the edge (|) are numbered as follows |
514 // 00 01 | 02 03 | 518 // 00 01 | 02 03 |
515 // 10 11 | 12 13 | 519 // 10 11 | 12 13 |
516 // ... | ... | 520 // ... | ... |
517 // e0 e1 | e2 e3 | 521 // e0 e1 | e2 e3 |
518 // f0 f1 | f2 f3 | 522 // f0 f1 | f2 f3 |
519 // | 523 // |
520 // r0 is pointing to the 0th row (00) | 524 // r0 is pointing to the 0th row (00) |
521 // r8 is pointing to the 8th row (80) | 525 // r8 is pointing to the 8th row (80) |
(...skipping 11 matching lines...) |
533 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 | 537 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 |
534 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 | 538 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 |
535 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 | 539 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
536 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 | 540 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
537 *p1 = _mm_unpacklo_epi64(t1, *p0); | 541 *p1 = _mm_unpacklo_epi64(t1, *p0); |
538 *p0 = _mm_unpackhi_epi64(t1, *p0); | 542 *p0 = _mm_unpackhi_epi64(t1, *p0); |
539 *q0 = _mm_unpacklo_epi64(t2, *q1); | 543 *q0 = _mm_unpacklo_epi64(t2, *q1); |
540 *q1 = _mm_unpackhi_epi64(t2, *q1); | 544 *q1 = _mm_unpackhi_epi64(t2, *q1); |
541 } | 545 } |
542 | 546 |
543 static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) { | 547 static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) { |
544 int i; | 548 int i; |
545 for (i = 0; i < 4; ++i, dst += stride) { | 549 for (i = 0; i < 4; ++i, dst += stride) { |
546 *((int32_t*)dst) = _mm_cvtsi128_si32(*x); | 550 *((int32_t*)dst) = _mm_cvtsi128_si32(*x); |
547 *x = _mm_srli_si128(*x, 4); | 551 *x = _mm_srli_si128(*x, 4); |
548 } | 552 } |
549 } | 553 } |
550 | 554 |
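Note: the *((int32_t*)dst) store in Store4x4() above (and the matching
*((int*)&b[...]) loads in Load8x4()) lean on x86's tolerance for unaligned
access and on the compiler not exploiting strict aliasing. A strictly
portable variant of the same routine would funnel each 4-byte store through
memcpy, which optimizing compilers typically lower to a single mov
(sketch, not library code):

  #include <emmintrin.h>
  #include <stdint.h>
  #include <string.h>

  static void Store4x4Portable(__m128i* x, uint8_t* dst, int stride) {
    int i;
    for (i = 0; i < 4; ++i, dst += stride) {
      const int32_t v = _mm_cvtsi128_si32(*x);  // low 4 bytes = next row
      memcpy(dst, &v, 4);                       // alias-safe unaligned store
      *x = _mm_srli_si128(*x, 4);               // shift the next row down
    }
  }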
551 // Transpose back and store | 555 // Transpose back and store |
552 static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1, | 556 static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride, |
553 __m128i* p0, __m128i* q0, __m128i* q1) { | 557 __m128i* p1, __m128i* p0, |
| 558 __m128i* q0, __m128i* q1) { |
554 __m128i t1; | 559 __m128i t1; |
555 | 560 |
556 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 | 561 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 |
557 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 | 562 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 |
558 t1 = *p0; | 563 t1 = *p0; |
559 *p0 = _mm_unpacklo_epi8(*p1, t1); | 564 *p0 = _mm_unpacklo_epi8(*p1, t1); |
560 *p1 = _mm_unpackhi_epi8(*p1, t1); | 565 *p1 = _mm_unpackhi_epi8(*p1, t1); |
561 | 566 |
562 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 | 567 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 |
563 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 | 568 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 |
(...skipping 324 matching lines...) |
888 VP8SimpleVFilter16 = SimpleVFilter16SSE2; | 893 VP8SimpleVFilter16 = SimpleVFilter16SSE2; |
889 VP8SimpleHFilter16 = SimpleHFilter16SSE2; | 894 VP8SimpleHFilter16 = SimpleHFilter16SSE2; |
890 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; | 895 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; |
891 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; | 896 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; |
892 } | 897 } |
893 | 898 |
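Note: the assignments above are the tail of this file's init hook: the
decoder calls the loop filters through global function pointers, and an
SSE2 build swaps in these implementations at startup. The shape of that
dispatch, with hypothetical stub names standing in for the real C
fallbacks and CPU probe:

  #include <stdint.h>

  typedef void (*SimpleFilterFunc)(uint8_t* p, int stride, int thresh);

  static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
    (void)p; (void)stride; (void)thresh;   // stub for the portable version
  }
  static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
    (void)p; (void)stride; (void)thresh;   // stub for the SSE2 version
  }

  SimpleFilterFunc VP8SimpleVFilter16;     // hook the decoder calls through

  void VP8DspInit(int cpu_has_sse2) {      // signature is illustrative
    VP8SimpleVFilter16 = SimpleVFilter16_C;       // portable default
    if (cpu_has_sse2) {
      VP8SimpleVFilter16 = SimpleVFilter16SSE2;   // SSE2 override
    }
  }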
894 #if defined(__cplusplus) || defined(c_plusplus) | 899 #if defined(__cplusplus) || defined(c_plusplus) |
895 } // extern "C" | 900 } // extern "C" |
896 #endif | 901 #endif |
897 | 902 |
898 #endif //__SSE2__ || _MSC_VER | 903 #endif // WEBP_USE_SSE2 |