Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Side by Side Diff: third_party/libwebp/dsp/dec_sse2.c

Issue 10832153: libwebp: update snapshot to v0.2.0-rc1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD NEW
1 // Copyright 2011 Google Inc. 1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // 2 //
3 // This code is licensed under the same terms as WebM: 3 // This code is licensed under the same terms as WebM:
4 // Software License Agreement: http://www.webmproject.org/license/software/ 4 // Software License Agreement: http://www.webmproject.org/license/software/
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/
6 // ----------------------------------------------------------------------------- 6 // -----------------------------------------------------------------------------
7 // 7 //
8 // SSE2 version of some decoding functions (idct, loop filtering). 8 // SSE2 version of some decoding functions (idct, loop filtering).
9 // 9 //
10 // Author: somnath@google.com (Somnath Banerjee) 10 // Author: somnath@google.com (Somnath Banerjee)
11 // cduvivier@google.com (Christian Duvivier) 11 // cduvivier@google.com (Christian Duvivier)
12 12
13 #if defined(__SSE2__) || defined(_MSC_VER) 13 #include "./dsp.h"
14
15 #if defined(WEBP_USE_SSE2)
14 16
15 #include <emmintrin.h> 17 #include <emmintrin.h>
16 #include "../dec/vp8i.h" 18 #include "../dec/vp8i.h"
17 19
18 #if defined(__cplusplus) || defined(c_plusplus) 20 #if defined(__cplusplus) || defined(c_plusplus)
19 extern "C" { 21 extern "C" {
20 #endif 22 #endif
21 23
22 //------------------------------------------------------------------------------ 24 //------------------------------------------------------------------------------
23 // Transforms (Paragraph 14.4) 25 // Transforms (Paragraph 14.4)
(...skipping 310 matching lines...) Expand 10 before | Expand all | Expand 10 after
334 336
335 t1 = _mm_set1_epi8(thresh); 337 t1 = _mm_set1_epi8(thresh);
336 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh 338 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh
337 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); 339 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
338 } 340 }
339 341
340 //------------------------------------------------------------------------------ 342 //------------------------------------------------------------------------------
341 // Edge filtering functions 343 // Edge filtering functions
342 344
343 // Applies filter on 2 pixels (p0 and q0) 345 // Applies filter on 2 pixels (p0 and q0)
344 static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, 346 static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
345 const __m128i* q1, int thresh) { 347 const __m128i* q1, int thresh) {
346 __m128i a, mask; 348 __m128i a, mask;
347 const __m128i sign_bit = _mm_set1_epi8(0x80); 349 const __m128i sign_bit = _mm_set1_epi8(0x80);
348 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); 350 const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
349 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); 351 const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
350 352
351 NeedsFilter(p1, p0, q0, q1, thresh, &mask); 353 NeedsFilter(p1, p0, q0, q1, thresh, &mask);
352 354
353 // convert to signed values 355 // convert to signed values
354 FLIP_SIGN_BIT2(*p0, *q0); 356 FLIP_SIGN_BIT2(*p0, *q0);
355 357
356 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); 358 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a);
357 a = _mm_and_si128(a, mask); // mask filter values we don't care about 359 a = _mm_and_si128(a, mask); // mask filter values we don't care about
358 DO_SIMPLE_FILTER(*p0, *q0, a); 360 DO_SIMPLE_FILTER(*p0, *q0, a);
359 361
360 // unoffset 362 // unoffset
361 FLIP_SIGN_BIT2(*p0, *q0); 363 FLIP_SIGN_BIT2(*p0, *q0);
362 } 364 }
363 365
364 // Applies filter on 4 pixels (p1, p0, q0 and q1) 366 // Applies filter on 4 pixels (p1, p0, q0 and q1)
365 static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1, 367 static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
366 const __m128i* mask, int hev_thresh) { 368 __m128i* q0, __m128i* q1,
369 const __m128i* mask, int hev_thresh) {
367 __m128i not_hev; 370 __m128i not_hev;
368 __m128i t1, t2, t3; 371 __m128i t1, t2, t3;
369 const __m128i sign_bit = _mm_set1_epi8(0x80); 372 const __m128i sign_bit = _mm_set1_epi8(0x80);
370 373
371 // compute hev mask 374 // compute hev mask
372 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); 375 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
373 376
374 // convert to signed values 377 // convert to signed values
375 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); 378 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
376 379
(...skipping 24 matching lines...) Expand all
401 404
402 t3 = _mm_and_si128(not_hev, t3); // if !hev 405 t3 = _mm_and_si128(not_hev, t3); // if !hev
403 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 406 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3
404 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 407 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3
405 408
406 // unoffset 409 // unoffset
407 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); 410 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
408 } 411 }
409 412
410 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) 413 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
411 static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, 414 static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
412 __m128i* q0, __m128i* q1, __m128i *q2, 415 __m128i* q0, __m128i* q1, __m128i *q2,
413 const __m128i* mask, int hev_thresh) { 416 const __m128i* mask, int hev_thresh) {
414 __m128i a, not_hev; 417 __m128i a, not_hev;
415 const __m128i sign_bit = _mm_set1_epi8(0x80); 418 const __m128i sign_bit = _mm_set1_epi8(0x80);
416 419
417 // compute hev mask 420 // compute hev mask
418 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); 421 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
419 422
420 // convert to signed values 423 // convert to signed values
421 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); 424 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
422 FLIP_SIGN_BIT2(*p2, *q2); 425 FLIP_SIGN_BIT2(*p2, *q2);
423 426
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
459 462
460 // unoffset 463 // unoffset
461 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); 464 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
462 FLIP_SIGN_BIT2(*p2, *q2); 465 FLIP_SIGN_BIT2(*p2, *q2);
463 } 466 }
464 467
465 // reads 8 rows across a vertical edge. 468 // reads 8 rows across a vertical edge.
466 // 469 //
467 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into 470 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
468 // two Load4x4() to avoid code duplication. 471 // two Load4x4() to avoid code duplication.
469 static inline void Load8x4(const uint8_t* b, int stride, 472 static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
470 __m128i* p, __m128i* q) { 473 __m128i* p, __m128i* q) {
471 __m128i t1, t2; 474 __m128i t1, t2;
472 475
473 // Load 0th, 1st, 4th and 5th rows 476 // Load 0th, 1st, 4th and 5th rows
474 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00 477 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00
475 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10 478 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10
476 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40 479 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40
477 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50 480 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50
478 481
479 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00 482 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00
480 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10 483 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10
(...skipping 18 matching lines...) Expand all
499 r0 = t1; 502 r0 = t1;
500 t1 = _mm_unpacklo_epi16(t1, t2); 503 t1 = _mm_unpacklo_epi16(t1, t2);
501 t2 = _mm_unpackhi_epi16(r0, t2); 504 t2 = _mm_unpackhi_epi16(r0, t2);
502 505
503 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 506 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
504 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 507 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
505 *p = _mm_unpacklo_epi32(t1, t2); 508 *p = _mm_unpacklo_epi32(t1, t2);
506 *q = _mm_unpackhi_epi32(t1, t2); 509 *q = _mm_unpackhi_epi32(t1, t2);
507 } 510 }
508 511
509 static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride, 512 static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
510 __m128i* p1, __m128i* p0, 513 int stride,
511 __m128i* q0, __m128i* q1) { 514 __m128i* p1, __m128i* p0,
515 __m128i* q0, __m128i* q1) {
512 __m128i t1, t2; 516 __m128i t1, t2;
513 // Assume the pixels around the edge (|) are numbered as follows 517 // Assume the pixels around the edge (|) are numbered as follows
514 // 00 01 | 02 03 518 // 00 01 | 02 03
515 // 10 11 | 12 13 519 // 10 11 | 12 13
516 // ... | ... 520 // ... | ...
517 // e0 e1 | e2 e3 521 // e0 e1 | e2 e3
518 // f0 f1 | f2 f3 522 // f0 f1 | f2 f3
519 // 523 //
520 // r0 is pointing to the 0th row (00) 524 // r0 is pointing to the 0th row (00)
521 // r8 is pointing to the 8th row (80) 525 // r8 is pointing to the 8th row (80)
(...skipping 11 matching lines...) Expand all
533 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 537 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
534 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 538 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
535 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 539 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
536 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 540 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
537 *p1 = _mm_unpacklo_epi64(t1, *p0); 541 *p1 = _mm_unpacklo_epi64(t1, *p0);
538 *p0 = _mm_unpackhi_epi64(t1, *p0); 542 *p0 = _mm_unpackhi_epi64(t1, *p0);
539 *q0 = _mm_unpacklo_epi64(t2, *q1); 543 *q0 = _mm_unpacklo_epi64(t2, *q1);
540 *q1 = _mm_unpackhi_epi64(t2, *q1); 544 *q1 = _mm_unpackhi_epi64(t2, *q1);
541 } 545 }
542 546
543 static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) { 547 static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
544 int i; 548 int i;
545 for (i = 0; i < 4; ++i, dst += stride) { 549 for (i = 0; i < 4; ++i, dst += stride) {
546 *((int32_t*)dst) = _mm_cvtsi128_si32(*x); 550 *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
547 *x = _mm_srli_si128(*x, 4); 551 *x = _mm_srli_si128(*x, 4);
548 } 552 }
549 } 553 }
550 554
551 // Transpose back and store 555 // Transpose back and store
552 static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1, 556 static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
553 __m128i* p0, __m128i* q0, __m128i* q1) { 557 __m128i* p1, __m128i* p0,
558 __m128i* q0, __m128i* q1) {
554 __m128i t1; 559 __m128i t1;
555 560
556 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 561 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
557 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 562 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
558 t1 = *p0; 563 t1 = *p0;
559 *p0 = _mm_unpacklo_epi8(*p1, t1); 564 *p0 = _mm_unpacklo_epi8(*p1, t1);
560 *p1 = _mm_unpackhi_epi8(*p1, t1); 565 *p1 = _mm_unpackhi_epi8(*p1, t1);
561 566
562 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 567 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
563 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 568 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
(...skipping 324 matching lines...) Expand 10 before | Expand all | Expand 10 after
888 VP8SimpleVFilter16 = SimpleVFilter16SSE2; 893 VP8SimpleVFilter16 = SimpleVFilter16SSE2;
889 VP8SimpleHFilter16 = SimpleHFilter16SSE2; 894 VP8SimpleHFilter16 = SimpleHFilter16SSE2;
890 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; 895 VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
891 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; 896 VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
892 } 897 }
893 898
894 #if defined(__cplusplus) || defined(c_plusplus) 899 #if defined(__cplusplus) || defined(c_plusplus)
895 } // extern "C" 900 } // extern "C"
896 #endif 901 #endif
897 902
898 #endif //__SSE2__ || _MSC_VER 903 #endif // WEBP_USE_SSE2
OLD NEW

Powered by Google App Engine
This is Rietveld 408576698