| OLD | NEW |
| 1 // Copyright 2011 Google Inc. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // This code is licensed under the same terms as WebM: | 3 // This code is licensed under the same terms as WebM: |
| 4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // Software License Agreement: http://www.webmproject.org/license/software/ |
| 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ |
| 6 // ----------------------------------------------------------------------------- | 6 // ----------------------------------------------------------------------------- |
| 7 // | 7 // |
| 8 // SSE2 version of some decoding functions (idct, loop filtering). | 8 // SSE2 version of some decoding functions (idct, loop filtering). |
| 9 // | 9 // |
| 10 // Author: somnath@google.com (Somnath Banerjee) | 10 // Author: somnath@google.com (Somnath Banerjee) |
| 11 // cduvivier@google.com (Christian Duvivier) | 11 // cduvivier@google.com (Christian Duvivier) |
| 12 | 12 |
| 13 #if defined(__SSE2__) || defined(_MSC_VER) | 13 #include "./dsp.h" |
| | 14 |
| | 15 #if defined(WEBP_USE_SSE2) |
| 14 | 16 |
| 15 #include <emmintrin.h> | 17 #include <emmintrin.h> |
| 16 #include "../dec/vp8i.h" | 18 #include "../dec/vp8i.h" |
| 17 | 19 |
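Note: the guard now keys off `WEBP_USE_SSE2`, which the shared `./dsp.h` header is expected to derive from the same compiler probes the old guard spelled out inline. A plausible excerpt of the relevant part of `dsp.h` (an assumption; the header itself is not part of this diff):

```c
/* Hypothetical excerpt of ./dsp.h -- not shown in this diff. The platform
 * probe moves into one shared header, and WEBP_INLINE replaces the bare
 * `inline` keyword for compilers that predate C99. */
#if defined(__SSE2__) || defined(_MSC_VER)
#define WEBP_USE_SSE2
#endif

#ifndef WEBP_INLINE
# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
#  define WEBP_INLINE inline
# else
#  define WEBP_INLINE    /* compiler-specific spelling, e.g. __forceinline */
# endif
#endif
```

Centralizing the probe means every SIMD file toggles on the same condition, and the `static inline` to `static WEBP_INLINE` churn throughout the rest of this patch is the mechanical consequence.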
| 18 #if defined(__cplusplus) || defined(c_plusplus) | 20 #if defined(__cplusplus) || defined(c_plusplus) |
| 19 extern "C" { | 21 extern "C" { |
| 20 #endif | 22 #endif |
| 21 | 23 |
| 22 //------------------------------------------------------------------------------ | 24 //------------------------------------------------------------------------------ |
| 23 // Transforms (Paragraph 14.4) | 25 // Transforms (Paragraph 14.4) |
| (...skipping 310 matching lines...) |
| 334 | 336 |
| 335 t1 = _mm_set1_epi8(thresh); | 337 t1 = _mm_set1_epi8(thresh); |
| 336 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh | 338 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh |
| 337 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); | 339 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); |
| 338 } | 340 } |
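Note: the two lines above implement an unsigned byte-wise `x <= thresh` test, for which SSE2 has no direct instruction. A minimal standalone sketch of the idiom (the helper name is hypothetical, not from the patch):

```c
#include <emmintrin.h>

/* Returns 0xFF in each byte lane where x[i] <= t[i], else 0x00.
 * _mm_subs_epu8 saturates at zero, so the difference is zero exactly
 * when x[i] <= t[i]; comparing against zero converts that into a mask. */
static __m128i LessEqU8(__m128i x, __m128i t) {
  const __m128i diff = _mm_subs_epu8(x, t);
  return _mm_cmpeq_epi8(diff, _mm_setzero_si128());
}
```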
| 339 | 341 |
| 340 //------------------------------------------------------------------------------ | 342 //------------------------------------------------------------------------------ |
| 341 // Edge filtering functions | 343 // Edge filtering functions |
| 342 | 344 |
| 343 // Applies filter on 2 pixels (p0 and q0) | 345 // Applies filter on 2 pixels (p0 and q0) |
| 344 static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, | 346 static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, |
| 345 const __m128i* q1, int thresh) { | 347 const __m128i* q1, int thresh) { |
| 346 __m128i a, mask; | 348 __m128i a, mask; |
| 347 const __m128i sign_bit = _mm_set1_epi8(0x80); | 349 const __m128i sign_bit = _mm_set1_epi8(0x80); |
| 348 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); | 350 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); |
| 349 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); | 351 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); |
| 350 | 352 |
| 351 NeedsFilter(p1, p0, q0, q1, thresh, &mask); | 353 NeedsFilter(p1, p0, q0, q1, thresh, &mask); |
| 352 | 354 |
| 353 // convert to signed values | 355 // convert to signed values |
| 354 FLIP_SIGN_BIT2(*p0, *q0); | 356 FLIP_SIGN_BIT2(*p0, *q0); |
| 355 | 357 |
| 356 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); | 358 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); |
| 357 a = _mm_and_si128(a, mask); // mask filter values we don't care about | 359 a = _mm_and_si128(a, mask); // mask filter values we don't care about |
| 358 DO_SIMPLE_FILTER(*p0, *q0, a); | 360 DO_SIMPLE_FILTER(*p0, *q0, a); |
| 359 | 361 |
| 360 // unoffset | 362 // unoffset |
| 361 FLIP_SIGN_BIT2(*p0, *q0); | 363 FLIP_SIGN_BIT2(*p0, *q0); |
| 362 } | 364 } |
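Note: the FLIP_SIGN_BIT macros bracket the arithmetic because SSE2 provides saturating add/subtract for *signed* bytes while the pixels are unsigned. XOR with 0x80 re-biases [0, 255] onto [-128, 127], the filter delta is applied with signed saturation, and a second XOR undoes the bias. A self-contained illustration (helper name is hypothetical):

```c
#include <emmintrin.h>

/* Adds a signed, saturating per-byte delta to unsigned pixels by
 * round-tripping through the signed domain, mirroring what the
 * FLIP_SIGN_BIT2/4 macros do around DO_SIMPLE_FILTER above. */
static __m128i AddSaturatedDelta(__m128i pixels, __m128i delta) {
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i s = _mm_xor_si128(pixels, sign_bit);  /* [0,255] -> [-128,127] */
  const __m128i r = _mm_adds_epi8(s, delta);          /* signed saturation */
  return _mm_xor_si128(r, sign_bit);                  /* back to unsigned */
}
```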
| 363 | 365 |
| 364 // Applies filter on 4 pixels (p1, p0, q0 and q1) | 366 // Applies filter on 4 pixels (p1, p0, q0 and q1) |
| 365 static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1, | 367 static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0, |
| 366 const __m128i* mask, int hev_thresh) { | 368 __m128i* q0, __m128i* q1, |
| | 369 const __m128i* mask, int hev_thresh) { |
| 367 __m128i not_hev; | 370 __m128i not_hev; |
| 368 __m128i t1, t2, t3; | 371 __m128i t1, t2, t3; |
| 369 const __m128i sign_bit = _mm_set1_epi8(0x80); | 372 const __m128i sign_bit = _mm_set1_epi8(0x80); |
| 370 | 373 |
| 371 // compute hev mask | 374 // compute hev mask |
| 372 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); | 375 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); |
| 373 | 376 |
| 374 // convert to signed values | 377 // convert to signed values |
| 375 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 378 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 376 | 379 |
| (...skipping 24 matching lines...) |
| 401 | 404 |
| 402 t3 = _mm_and_si128(not_hev, t3); // if !hev | 405 t3 = _mm_and_si128(not_hev, t3); // if !hev |
| 403 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 | 406 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 |
| 404 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 | 407 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 |
| 405 | 408 |
| 406 // unoffset | 409 // unoffset |
| 407 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 410 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 408 } | 411 } |
| 409 | 412 |
| 410 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) | 413 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) |
| 411 static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, | 414 static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, |
| 412 __m128i* q0, __m128i* q1, __m128i *q2, | 415 __m128i* q0, __m128i* q1, __m128i *q2, |
| 413 const __m128i* mask, int hev_thresh) { | 416 const __m128i* mask, int hev_thresh) { |
| 414 __m128i a, not_hev; | 417 __m128i a, not_hev; |
| 415 const __m128i sign_bit = _mm_set1_epi8(0x80); | 418 const __m128i sign_bit = _mm_set1_epi8(0x80); |
| 416 | 419 |
| 417 // compute hev mask | 420 // compute hev mask |
| 418 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); | 421 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); |
| 419 | 422 |
| 420 // convert to signed values | 423 // convert to signed values |
| 421 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 424 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 422 FLIP_SIGN_BIT2(*p2, *q2); | 425 FLIP_SIGN_BIT2(*p2, *q2); |
| 423 | 426 |
| (...skipping 35 matching lines...) |
| 459 | 462 |
| 460 // unoffset | 463 // unoffset |
| 461 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 464 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 462 FLIP_SIGN_BIT2(*p2, *q2); | 465 FLIP_SIGN_BIT2(*p2, *q2); |
| 463 } | 466 } |
| 464 | 467 |
| 465 // reads 8 rows across a vertical edge. | 468 // reads 8 rows across a vertical edge. |
| 466 // | 469 // |
| 467 // TODO(somnath): Investigate _mm_shuffle*; also see if this can be | 470 // TODO(somnath): Investigate _mm_shuffle*; also see if this can be |
| 468 // broken into two Load4x4() calls to avoid code duplication. | 471 // broken into two Load4x4() calls to avoid code duplication. |
| 469 static inline void Load8x4(const uint8_t* b, int stride, | 472 static WEBP_INLINE void Load8x4(const uint8_t* b, int stride, |
| 470 __m128i* p, __m128i* q) { | 473 __m128i* p, __m128i* q) { |
| 471 __m128i t1, t2; | 474 __m128i t1, t2; |
| 472 | 475 |
| 473 // Load 0th, 1st, 4th and 5th rows | 476 // Load 0th, 1st, 4th and 5th rows |
| 474 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00 | 477 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00 |
| 475 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10 | 478 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10 |
| 476 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40 | 479 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40 |
| 477 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50 | 480 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50 |
| 478 | 481 |
| 479 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00 | 482 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00 |
| 480 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10 | 483 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10 |
| (...skipping 18 matching lines...) |
| 499 r0 = t1; | 502 r0 = t1; |
| 500 t1 = _mm_unpacklo_epi16(t1, t2); | 503 t1 = _mm_unpacklo_epi16(t1, t2); |
| 501 t2 = _mm_unpackhi_epi16(r0, t2); | 504 t2 = _mm_unpackhi_epi16(r0, t2); |
| 502 | 505 |
| 503 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 | 506 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 |
| 504 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 | 507 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
| 505 *p = _mm_unpacklo_epi32(t1, t2); | 508 *p = _mm_unpacklo_epi32(t1, t2); |
| 506 *q = _mm_unpackhi_epi32(t1, t2); | 509 *q = _mm_unpackhi_epi32(t1, t2); |
| 507 } | 510 } |
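Note: the transpose in Load8x4 is built entirely from unpack instructions. The key property in isolation (values chosen only for illustration; the function name is hypothetical):

```c
#include <emmintrin.h>

/* Demonstrates the interleave primitive the transpose is built from:
 * unpacking two rows makes bytes from the same column adjacent, and
 * repeating the unpack at 16- and 32-bit granularity, as Load8x4 does,
 * turns rows into columns. */
static __m128i ZipRowsDemo(void) {
  const __m128i row0 = _mm_cvtsi32_si128(0x03020100);  /* bytes 03 02 01 00 */
  const __m128i row1 = _mm_cvtsi32_si128(0x13121110);  /* bytes 13 12 11 10 */
  /* low 8 bytes of the result: 13 03 12 02 11 01 10 00 */
  return _mm_unpacklo_epi8(row0, row1);
}
```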
| 508 | 511 |
| 509 static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride, | 512 static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8, |
| 510 __m128i* p1, __m128i* p0, | 513 int stride, |
| 511 __m128i* q0, __m128i* q1) { | 514 __m128i* p1, __m128i* p0, |
| | 515 __m128i* q0, __m128i* q1) { |
| 512 __m128i t1, t2; | 516 __m128i t1, t2; |
| 513 // Assume the pixels around the edge (|) are numbered as follows | 517 // Assume the pixels around the edge (|) are numbered as follows |
| 514 // 00 01 | 02 03 | 518 // 00 01 | 02 03 |
| 515 // 10 11 | 12 13 | 519 // 10 11 | 12 13 |
| 516 // ... | ... | 520 // ... | ... |
| 517 // e0 e1 | e2 e3 | 521 // e0 e1 | e2 e3 |
| 518 // f0 f1 | f2 f3 | 522 // f0 f1 | f2 f3 |
| 519 // | 523 // |
| 520 // r0 is pointing to the 0th row (00) | 524 // r0 is pointing to the 0th row (00) |
| 521 // r8 is pointing to the 8th row (80) | 525 // r8 is pointing to the 8th row (80) |
| (...skipping 11 matching lines...) |
| 533 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 | 537 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 |
| 534 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 | 538 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 |
| 535 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 | 539 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
| 536 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 | 540 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
| 537 *p1 = _mm_unpacklo_epi64(t1, *p0); | 541 *p1 = _mm_unpacklo_epi64(t1, *p0); |
| 538 *p0 = _mm_unpackhi_epi64(t1, *p0); | 542 *p0 = _mm_unpackhi_epi64(t1, *p0); |
| 539 *q0 = _mm_unpacklo_epi64(t2, *q1); | 543 *q0 = _mm_unpacklo_epi64(t2, *q1); |
| 540 *q1 = _mm_unpackhi_epi64(t2, *q1); | 544 *q1 = _mm_unpackhi_epi64(t2, *q1); |
| 541 } | 545 } |
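Note: reading the lane comments above, after each 8-row half is transposed separately, the low quadword of `t1` holds column 0 of rows 0..7 and the low quadword of `*p0` holds column 0 of rows 8..15, so `_mm_unpacklo_epi64` concatenates them into one full 16-pixel column while `_mm_unpackhi_epi64` yields the neighboring column (and likewise for `q0`/`q1`). This is why the function takes two row pointers, `r0` and `r8`, rather than one.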
| 542 | 546 |
| 543 static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) { | 547 static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) { |
| 544 int i; | 548 int i; |
| 545 for (i = 0; i < 4; ++i, dst += stride) { | 549 for (i = 0; i < 4; ++i, dst += stride) { |
| 546 *((int32_t*)dst) = _mm_cvtsi128_si32(*x); | 550 *((int32_t*)dst) = _mm_cvtsi128_si32(*x); |
| 547 *x = _mm_srli_si128(*x, 4); | 551 *x = _mm_srli_si128(*x, 4); |
| 548 } | 552 } |
| 549 } | 553 } |
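Note: one subtlety in Store4x4: `_mm_srli_si128(*x, 4)` shifts the register right by 4 *bytes* (not bits), so each iteration exposes the next 4-pixel row in the low 32 bits before `_mm_cvtsi128_si32` extracts it. If the `int32_t*` cast ever raises strict-aliasing or alignment concerns, an equivalent store would be (a fragment-level sketch, not part of the patch):

```c
/* Inside the loop body, replacing the cast-based store: */
const int32_t v = _mm_cvtsi128_si32(*x);
memcpy(dst, &v, sizeof(v));   /* requires <string.h> */
```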
| 550 | 554 |
| 551 // Transpose back and store | 555 // Transpose back and store |
| 552 static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1, | 556 static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride, |
| 553 __m128i* p0, __m128i* q0, __m128i* q1) { | 557 __m128i* p1, __m128i* p0, |
| | 558 __m128i* q0, __m128i* q1) { |
| 554 __m128i t1; | 559 __m128i t1; |
| 555 | 560 |
| 556 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 | 561 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 |
| 557 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 | 562 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 |
| 558 t1 = *p0; | 563 t1 = *p0; |
| 559 *p0 = _mm_unpacklo_epi8(*p1, t1); | 564 *p0 = _mm_unpacklo_epi8(*p1, t1); |
| 560 *p1 = _mm_unpackhi_epi8(*p1, t1); | 565 *p1 = _mm_unpackhi_epi8(*p1, t1); |
| 561 | 566 |
| 562 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 | 567 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 |
| 563 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 | 568 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 |
| (...skipping 324 matching lines...) |
| 888 VP8SimpleVFilter16 = SimpleVFilter16SSE2; | 893 VP8SimpleVFilter16 = SimpleVFilter16SSE2; |
| 889 VP8SimpleHFilter16 = SimpleHFilter16SSE2; | 894 VP8SimpleHFilter16 = SimpleHFilter16SSE2; |
| 890 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; | 895 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; |
| 891 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; | 896 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; |
| 892 } | 897 } |
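Note: these function-pointer assignments presumably take effect only when the generic dsp init detects SSE2 at runtime. A plausible caller-side sketch, assuming `VP8GetCPUInfo`/`kSSE2` follow libwebp's usual CPU-detection pattern (not shown in this diff):

```c
/* Plausible wiring in the generic dsp init -- an assumption, not part
 * of this diff: portable C fallbacks are installed first, then the
 * SSE2 versions override them only if runtime detection succeeds. */
void VP8DspInit(void) {
  /* ... install portable C implementations ... */
#if defined(WEBP_USE_SSE2)
  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) {
    VP8DspInitSSE2();
  }
#endif
}
```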
| 893 | 898 |
| 894 #if defined(__cplusplus) || defined(c_plusplus) | 899 #if defined(__cplusplus) || defined(c_plusplus) |
| 895 } // extern "C" | 900 } // extern "C" |
| 896 #endif | 901 #endif |
| 897 | 902 |
| 898 #endif //__SSE2__ || _MSC_VER | 903 #endif // WEBP_USE_SSE2 |