| OLD | NEW |
| 1 // Copyright 2011 Google Inc. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // This code is licensed under the same terms as WebM: | 3 // This code is licensed under the same terms as WebM: |
| 4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // Software License Agreement: http://www.webmproject.org/license/software/ |
| 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ |
| 6 // ----------------------------------------------------------------------------- | 6 // ----------------------------------------------------------------------------- |
| 7 // | 7 // |
| 8 // SSE2 version of some decoding functions (idct, loop filtering). | 8 // SSE2 version of some decoding functions (idct, loop filtering). |
| 9 // | 9 // |
| 10 // Author: somnath@google.com (Somnath Banerjee) | 10 // Author: somnath@google.com (Somnath Banerjee) |
| 11 // cduvivier@google.com (Christian Duvivier) | 11 // cduvivier@google.com (Christian Duvivier) |
| 12 | 12 |
| 13 #if defined(__SSE2__) || defined(_MSC_VER) | 13 #include "./dsp.h" |
| | 14 |
| | 15 #if defined(WEBP_USE_SSE2) |
| 14 | 16 |
| 15 #include <emmintrin.h> | 17 #include <emmintrin.h> |
| 16 #include "../dec/vp8i.h" | 18 #include "../dec/vp8i.h" |
| 17 | 19 |
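Note: the guard now keys off `WEBP_USE_SSE2`, which the shared `./dsp.h` header is expected to derive from the same compiler probes the old guard spelled out inline. A plausible excerpt of the relevant part of `dsp.h` (an assumption; the header itself is not part of this diff):

```c
/* Hypothetical excerpt of ./dsp.h -- not shown in this diff. The platform
 * probe moves into one shared header, and WEBP_INLINE replaces the bare
 * `inline` keyword for compilers that predate C99. */
#if defined(__SSE2__) || defined(_MSC_VER)
#define WEBP_USE_SSE2
#endif

#ifndef WEBP_INLINE
# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
#  define WEBP_INLINE inline
# else
#  define WEBP_INLINE    /* compiler-specific spelling, e.g. __forceinline */
# endif
#endif
```

Centralizing the probe means every SIMD file toggles on the same condition, and the `static inline` to `static WEBP_INLINE` churn throughout the rest of this patch is the mechanical consequence.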
| 18 #if defined(__cplusplus) || defined(c_plusplus) | 20 #if defined(__cplusplus) || defined(c_plusplus) |
| 19 extern "C" { | 21 extern "C" { |
| 20 #endif | 22 #endif |
| 21 | 23 |
| 22 //------------------------------------------------------------------------------ | 24 //------------------------------------------------------------------------------ |
| 23 // Transforms (Paragraph 14.4) | 25 // Transforms (Paragraph 14.4) |
| (...skipping 310 matching lines...) |
| 334 | 336 |
| 335 t1 = _mm_set1_epi8(thresh); | 337 t1 = _mm_set1_epi8(thresh); |
| 336 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh | 338 *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh |
| 337 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); | 339 *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); |
| 338 } | 340 } |
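Note: the two lines above implement an unsigned byte-wise `x <= thresh` test, for which SSE2 has no direct instruction. A minimal standalone sketch of the idiom (the helper name is hypothetical, not from the patch):

```c
#include <emmintrin.h>

/* Returns 0xFF in each byte lane where x[i] <= t[i], else 0x00.
 * _mm_subs_epu8 saturates at zero, so the difference is zero exactly
 * when x[i] <= t[i]; comparing against zero converts that into a mask. */
static __m128i LessEqU8(__m128i x, __m128i t) {
  const __m128i diff = _mm_subs_epu8(x, t);
  return _mm_cmpeq_epi8(diff, _mm_setzero_si128());
}
```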
| 339 | 341 |
| 340 //------------------------------------------------------------------------------ | 342 //------------------------------------------------------------------------------ |
| 341 // Edge filtering functions | 343 // Edge filtering functions |
| 342 | 344 |
| 343 // Applies filter on 2 pixels (p0 and q0) | 345 // Applies filter on 2 pixels (p0 and q0) |
| 344 static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, | 346 static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, |
| 345 const __m128i* q1, int thresh) { | 347 const __m128i* q1, int thresh) { |
| 346 __m128i a, mask; | 348 __m128i a, mask; |
| 347 const __m128i sign_bit = _mm_set1_epi8(0x80); | 349 const __m128i sign_bit = _mm_set1_epi8(0x80); |
| 348 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); | 350 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); |
| 349 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); | 351 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); |
| 350 | 352 |
| 351 NeedsFilter(p1, p0, q0, q1, thresh, &mask); | 353 NeedsFilter(p1, p0, q0, q1, thresh, &mask); |
| 352 | 354 |
| 353 // convert to signed values | 355 // convert to signed values |
| 354 FLIP_SIGN_BIT2(*p0, *q0); | 356 FLIP_SIGN_BIT2(*p0, *q0); |
| 355 | 357 |
| 356 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); | 358 GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); |
| 357 a = _mm_and_si128(a, mask); // mask filter values we don't care about | 359 a = _mm_and_si128(a, mask); // mask filter values we don't care about |
| 358 DO_SIMPLE_FILTER(*p0, *q0, a); | 360 DO_SIMPLE_FILTER(*p0, *q0, a); |
| 359 | 361 |
| 360 // unoffset | 362 // unoffset |
| 361 FLIP_SIGN_BIT2(*p0, *q0); | 363 FLIP_SIGN_BIT2(*p0, *q0); |
| 362 } | 364 } |
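Note: the FLIP_SIGN_BIT macros bracket the arithmetic because SSE2 provides saturating add/subtract for *signed* bytes while the pixels are unsigned. XOR with 0x80 re-biases [0, 255] onto [-128, 127], the filter delta is applied with signed saturation, and a second XOR undoes the bias. A self-contained illustration (helper name is hypothetical):

```c
#include <emmintrin.h>

/* Adds a signed, saturating per-byte delta to unsigned pixels by
 * round-tripping through the signed domain, mirroring what the
 * FLIP_SIGN_BIT2/4 macros do around DO_SIMPLE_FILTER above. */
static __m128i AddSaturatedDelta(__m128i pixels, __m128i delta) {
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i s = _mm_xor_si128(pixels, sign_bit);  /* [0,255] -> [-128,127] */
  const __m128i r = _mm_adds_epi8(s, delta);          /* signed saturation */
  return _mm_xor_si128(r, sign_bit);                  /* back to unsigned */
}
```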
| 363 | 365 |
| 364 // Applies filter on 4 pixels (p1, p0, q0 and q1) | 366 // Applies filter on 4 pixels (p1, p0, q0 and q1) |
| 365 static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1, | 367 static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0, |
| 366 const __m128i* mask, int hev_thresh) { | 368 __m128i* q0, __m128i* q1, |
| | 369 const __m128i* mask, int hev_thresh) { |
| 367 __m128i not_hev; | 370 __m128i not_hev; |
| 368 __m128i t1, t2, t3; | 371 __m128i t1, t2, t3; |
| 369 const __m128i sign_bit = _mm_set1_epi8(0x80); | 372 const __m128i sign_bit = _mm_set1_epi8(0x80); |
| 370 | 373 |
| 371 // compute hev mask | 374 // compute hev mask |
| 372 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); | 375 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); |
| 373 | 376 |
| 374 // convert to signed values | 377 // convert to signed values |
| 375 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 378 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 376 | 379 |
| (...skipping 24 matching lines...) |
| 401 | 404 |
| 402 t3 = _mm_and_si128(not_hev, t3); // if !hev | 405 t3 = _mm_and_si128(not_hev, t3); // if !hev |
| 403 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 | 406 *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 |
| 404 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 | 407 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 |
| 405 | 408 |
| 406 // unoffset | 409 // unoffset |
| 407 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 410 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 408 } | 411 } |
| 409 | 412 |
| 410 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) | 413 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) |
| 411 static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, | 414 static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, |
| 412 __m128i* q0, __m128i* q1, __m128i *q2, | 415 __m128i* q0, __m128i* q1, __m128i *q2, |
| 413 const __m128i* mask, int hev_thresh) { | 416 const __m128i* mask, int hev_thresh) { |
| 414 __m128i a, not_hev; | 417 __m128i a, not_hev; |
| 415 const __m128i sign_bit = _mm_set1_epi8(0x80); | 418 const __m128i sign_bit = _mm_set1_epi8(0x80); |
| 416 | 419 |
| 417 // compute hev mask | 420 // compute hev mask |
| 418 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); | 421 GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); |
| 419 | 422 |
| 420 // convert to signed values | 423 // convert to signed values |
| 421 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 424 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 422 FLIP_SIGN_BIT2(*p2, *q2); | 425 FLIP_SIGN_BIT2(*p2, *q2); |
| 423 | 426 |
| (...skipping 35 matching lines...) |
| 459 | 462 |
| 460 // unoffset | 463 // unoffset |
| 461 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); | 464 FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); |
| 462 FLIP_SIGN_BIT2(*p2, *q2); | 465 FLIP_SIGN_BIT2(*p2, *q2); |
| 463 } | 466 } |
| 464 | 467 |
| 465 // reads 8 rows across a vertical edge. | 468 // reads 8 rows across a vertical edge. |
| 466 // | 469 // |
| 467 // TODO(somnath): Investigate _mm_shuffle*; also see if this can be | 470 // TODO(somnath): Investigate _mm_shuffle*; also see if this can be |
| 468 // broken into two Load4x4() calls to avoid code duplication. | 471 // broken into two Load4x4() calls to avoid code duplication. |
| 469 static inline void Load8x4(const uint8_t* b, int stride, | 472 static WEBP_INLINE void Load8x4(const uint8_t* b, int stride, |
| 470 __m128i* p, __m128i* q) { | 473 __m128i* p, __m128i* q) { |
| 471 __m128i t1, t2; | 474 __m128i t1, t2; |
| 472 | 475 |
| 473 // Load 0th, 1st, 4th and 5th rows | 476 // Load 0th, 1st, 4th and 5th rows |
| 474 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00 | 477 __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00 |
| 475 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10 | 478 __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10 |
| 476 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40 | 479 __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40 |
| 477 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50 | 480 __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50 |
| 478 | 481 |
| 479 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00 | 482 r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00 |
| 480 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10 | 483 r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10 |
| (...skipping 18 matching lines...) |
| 499 r0 = t1; | 502 r0 = t1; |
| 500 t1 = _mm_unpacklo_epi16(t1, t2); | 503 t1 = _mm_unpacklo_epi16(t1, t2); |
| 501 t2 = _mm_unpackhi_epi16(r0, t2); | 504 t2 = _mm_unpackhi_epi16(r0, t2); |
| 502 | 505 |
| 503 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 | 506 // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 |
| 504 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 | 507 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
| 505 *p = _mm_unpacklo_epi32(t1, t2); | 508 *p = _mm_unpacklo_epi32(t1, t2); |
| 506 *q = _mm_unpackhi_epi32(t1, t2); | 509 *q = _mm_unpackhi_epi32(t1, t2); |
| 507 } | 510 } |
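Note: the transpose in Load8x4 is built entirely from unpack instructions. The key property in isolation (values chosen only for illustration; the function name is hypothetical):

```c
#include <emmintrin.h>

/* Demonstrates the interleave primitive the transpose is built from:
 * unpacking two rows makes bytes from the same column adjacent, and
 * repeating the unpack at 16- and 32-bit granularity, as Load8x4 does,
 * turns rows into columns. */
static __m128i ZipRowsDemo(void) {
  const __m128i row0 = _mm_cvtsi32_si128(0x03020100);  /* bytes 03 02 01 00 */
  const __m128i row1 = _mm_cvtsi32_si128(0x13121110);  /* bytes 13 12 11 10 */
  /* low 8 bytes of the result: 13 03 12 02 11 01 10 00 */
  return _mm_unpacklo_epi8(row0, row1);
}
```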
| 508 | 511 |
| 509 static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride, | 512 static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8, |
| 510 __m128i* p1, __m128i* p0, | 513 int stride, |
| 511 __m128i* q0, __m128i* q1) { | 514 __m128i* p1, __m128i* p0, |
| | 515 __m128i* q0, __m128i* q1) { |
| 512 __m128i t1, t2; | 516 __m128i t1, t2; |
| 513 // Assume the pixels around the edge (|) are numbered as follows | 517 // Assume the pixels around the edge (|) are numbered as follows |
| 514 // 00 01 | 02 03 | 518 // 00 01 | 02 03 |
| 515 // 10 11 | 12 13 | 519 // 10 11 | 12 13 |
| 516 // ... | ... | 520 // ... | ... |
| 517 // e0 e1 | e2 e3 | 521 // e0 e1 | e2 e3 |
| 518 // f0 f1 | f2 f3 | 522 // f0 f1 | f2 f3 |
| 519 // | 523 // |
| 520 // r0 is pointing to the 0th row (00) | 524 // r0 is pointing to the 0th row (00) |
| 521 // r8 is pointing to the 8th row (80) | 525 // r8 is pointing to the 8th row (80) |
| (...skipping 11 matching lines...) |
| 533 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 | 537 // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 |
| 534 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 | 538 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 |
| 535 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 | 539 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
| 536 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 | 540 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
| 537 *p1 = _mm_unpacklo_epi64(t1, *p0); | 541 *p1 = _mm_unpacklo_epi64(t1, *p0); |
| 538 *p0 = _mm_unpackhi_epi64(t1, *p0); | 542 *p0 = _mm_unpackhi_epi64(t1, *p0); |
| 539 *q0 = _mm_unpacklo_epi64(t2, *q1); | 543 *q0 = _mm_unpacklo_epi64(t2, *q1); |
| 540 *q1 = _mm_unpackhi_epi64(t2, *q1); | 544 *q1 = _mm_unpackhi_epi64(t2, *q1); |
| 541 } | 545 } |
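Note: reading the lane comments above, after each 8-row half is transposed separately, the low quadword of `t1` holds column 0 of rows 0..7 and the low quadword of `*p0` holds column 0 of rows 8..15, so `_mm_unpacklo_epi64` concatenates them into one full 16-pixel column while `_mm_unpackhi_epi64` yields the neighboring column (and likewise for `q0`/`q1`). This is why the function takes two row pointers, `r0` and `r8`, rather than one.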
| 542 | 546 |
| 543 static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) { | 547 static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) { |
| 544 int i; | 548 int i; |
| 545 for (i = 0; i < 4; ++i, dst += stride) { | 549 for (i = 0; i < 4; ++i, dst += stride) { |
| 546 *((int32_t*)dst) = _mm_cvtsi128_si32(*x); | 550 *((int32_t*)dst) = _mm_cvtsi128_si32(*x); |
| 547 *x = _mm_srli_si128(*x, 4); | 551 *x = _mm_srli_si128(*x, 4); |
| 548 } | 552 } |
| 549 } | 553 } |
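Note: one subtlety in Store4x4: `_mm_srli_si128(*x, 4)` shifts the register right by 4 *bytes* (not bits), so each iteration exposes the next 4-pixel row in the low 32 bits before `_mm_cvtsi128_si32` extracts it. If the `int32_t*` cast ever raises strict-aliasing or alignment concerns, an equivalent store would be (a fragment-level sketch, not part of the patch):

```c
/* Inside the loop body, replacing the cast-based store: */
const int32_t v = _mm_cvtsi128_si32(*x);
memcpy(dst, &v, sizeof(v));   /* requires <string.h> */
```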
| 550 | 554 |
| 551 // Transpose back and store | 555 // Transpose back and store |
| 552 static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1, | 556 static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride, |
| 553 __m128i* p0, __m128i* q0, __m128i* q1) { | 557 __m128i* p1, __m128i* p0, |
| | 558 __m128i* q0, __m128i* q1) { |
| 554 __m128i t1; | 559 __m128i t1; |
| 555 | 560 |
| 556 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 | 561 // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 |
| 557 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 | 562 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 |
| 558 t1 = *p0; | 563 t1 = *p0; |
| 559 *p0 = _mm_unpacklo_epi8(*p1, t1); | 564 *p0 = _mm_unpacklo_epi8(*p1, t1); |
| 560 *p1 = _mm_unpackhi_epi8(*p1, t1); | 565 *p1 = _mm_unpackhi_epi8(*p1, t1); |
| 561 | 566 |
| 562 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 | 567 // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 |
| 563 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 | 568 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 |
| (...skipping 324 matching lines...) |
| 888 VP8SimpleVFilter16 = SimpleVFilter16SSE2; | 893 VP8SimpleVFilter16 = SimpleVFilter16SSE2; |
| 889 VP8SimpleHFilter16 = SimpleHFilter16SSE2; | 894 VP8SimpleHFilter16 = SimpleHFilter16SSE2; |
| 890 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; | 895 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; |
| 891 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; | 896 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; |
| 892 } | 897 } |
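Note: these function-pointer assignments presumably take effect only when the generic dsp init detects SSE2 at runtime. A plausible caller-side sketch, assuming `VP8GetCPUInfo`/`kSSE2` follow libwebp's usual CPU-detection pattern (not shown in this diff):

```c
/* Plausible wiring in the generic dsp init -- an assumption, not part
 * of this diff: portable C fallbacks are installed first, then the
 * SSE2 versions override them only if runtime detection succeeds. */
void VP8DspInit(void) {
  /* ... install portable C implementations ... */
#if defined(WEBP_USE_SSE2)
  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) {
    VP8DspInitSSE2();
  }
#endif
}
```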
| 893 | 898 |
| 894 #if defined(__cplusplus) || defined(c_plusplus) | 899 #if defined(__cplusplus) || defined(c_plusplus) |
| 895 } // extern "C" | 900 } // extern "C" |
| 896 #endif | 901 #endif |
| 897 | 902 |
| 898 #endif //__SSE2__ || _MSC_VER | 903 #endif // WEBP_USE_SSE2 |