| Index: third_party/libwebp/dsp/dec_sse2.c
|
| diff --git a/third_party/libwebp/dsp/dec_sse2.c b/third_party/libwebp/dsp/dec_sse2.c
|
| index 6be94678ef717aa342dc80d7433a70a0db44f7fd..150c559f132f45798a08fd5313a0a7a1f68662b5 100644
|
| --- a/third_party/libwebp/dsp/dec_sse2.c
|
| +++ b/third_party/libwebp/dsp/dec_sse2.c
|
| @@ -14,12 +14,12 @@
|
|
|
| #include "./dsp.h"
|
|
|
| -#if defined(__cplusplus) || defined(c_plusplus)
|
| -extern "C" {
|
| -#endif
|
| -
|
| #if defined(WEBP_USE_SSE2)
|
|
|
| +// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
|
| +// one it seems => disable it by default. Uncomment the following to enable:
|
| +// #define USE_TRANSFORM_AC3
|
| +
|
| #include <emmintrin.h>
|
| #include "../dec/vp8i.h"
|
|
|
| @@ -201,16 +201,16 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
|
| __m128i dst0, dst1, dst2, dst3;
|
| if (do_two) {
|
| // Load eight bytes/pixels per line.
|
| - dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
|
| - dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
|
| - dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
|
| - dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
|
| + dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
|
| + dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
|
| + dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
|
| + dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
|
| } else {
|
| // Load four bytes/pixels per line.
|
| - dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
|
| - dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
|
| - dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
|
| - dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
|
| + dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
|
| + dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
|
| + dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
|
| + dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
|
| }
|
| // Convert to 16b.
|
| dst0 = _mm_unpacklo_epi8(dst0, zero);
|
| @@ -230,20 +230,66 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
|
| // Store the results.
|
| if (do_two) {
|
| // Store eight bytes/pixels per line.
|
| - _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
|
| - _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
|
| - _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
|
| - _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
|
| + _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
|
| + _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
|
| + _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
|
| + _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
|
| } else {
|
| // Store four bytes/pixels per line.
|
| - *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
|
| - *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
|
| - *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
|
| - *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
|
| + *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
|
| + *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
|
| + *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
|
| + *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
|
| }
|
| }
|
| }
|
|
|
| +#if defined(USE_TRANSFORM_AC3)
|
| +#define MUL(a, b) (((a) * (b)) >> 16)
|
| +static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
|
| + static const int kC1 = 20091 + (1 << 16);
|
| + static const int kC2 = 35468;
|
| + const __m128i A = _mm_set1_epi16(in[0] + 4);
|
| + const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
|
| + const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
|
| + const int c1 = MUL(in[1], kC2);
|
| + const int d1 = MUL(in[1], kC1);
|
| + const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
|
| + const __m128i B = _mm_adds_epi16(A, CD);
|
| + const __m128i m0 = _mm_adds_epi16(B, d4);
|
| + const __m128i m1 = _mm_adds_epi16(B, c4);
|
| + const __m128i m2 = _mm_subs_epi16(B, c4);
|
| + const __m128i m3 = _mm_subs_epi16(B, d4);
|
| + const __m128i zero = _mm_setzero_si128();
|
| + // Load the source pixels.
|
| + __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
|
| + __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
|
| + __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
|
| + __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
|
| + // Convert to 16b.
|
| + dst0 = _mm_unpacklo_epi8(dst0, zero);
|
| + dst1 = _mm_unpacklo_epi8(dst1, zero);
|
| + dst2 = _mm_unpacklo_epi8(dst2, zero);
|
| + dst3 = _mm_unpacklo_epi8(dst3, zero);
|
| + // Add the inverse transform.
|
| + dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
|
| + dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
|
| + dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
|
| + dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
|
| + // Unsigned saturate to 8b.
|
| + dst0 = _mm_packus_epi16(dst0, dst0);
|
| + dst1 = _mm_packus_epi16(dst1, dst1);
|
| + dst2 = _mm_packus_epi16(dst2, dst2);
|
| + dst3 = _mm_packus_epi16(dst3, dst3);
|
| + // Store the results.
|
| + *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
|
| + *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
|
| + *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
|
| + *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
|
| +}
|
| +#undef MUL
|
| +#endif // USE_TRANSFORM_AC3
|
| +
|
| //------------------------------------------------------------------------------
|
| // Loop Filter (Paragraph 15)
|
|
|
| @@ -888,6 +934,9 @@ extern void VP8DspInitSSE2(void);
|
| void VP8DspInitSSE2(void) {
|
| #if defined(WEBP_USE_SSE2)
|
| VP8Transform = TransformSSE2;
|
| +#if defined(USE_TRANSFORM_AC3)
|
| + VP8TransformAC3 = TransformAC3SSE2;
|
| +#endif
|
|
|
| VP8VFilter16 = VFilter16SSE2;
|
| VP8HFilter16 = HFilter16SSE2;
|
| @@ -905,6 +954,3 @@ void VP8DspInitSSE2(void) {
|
| #endif // WEBP_USE_SSE2
|
| }
|
|
|
| -#if defined(__cplusplus) || defined(c_plusplus)
|
| -} // extern "C"
|
| -#endif
|
|
|