| Index: third_party/libwebp/dsp/lossless.c
|
| diff --git a/third_party/libwebp/dsp/lossless.c b/third_party/libwebp/dsp/lossless.c
|
| index e445924ed3f9c0e56a56fbd22898712a86cc8848..bab76d22ded2c169b7c2db47d237a642ab90471c 100644
|
| --- a/third_party/libwebp/dsp/lossless.c
|
| +++ b/third_party/libwebp/dsp/lossless.c
|
| @@ -15,14 +15,7 @@
|
|
|
| #include "./dsp.h"
|
|
|
| -// Define the following if target arch is sure to have SSE2
|
| -// #define WEBP_TARGET_HAS_SSE2
|
| -
|
| -#if defined(__cplusplus) || defined(c_plusplus)
|
| -extern "C" {
|
| -#endif
|
| -
|
| -#if defined(WEBP_TARGET_HAS_SSE2)
|
| +#if defined(WEBP_USE_SSE2)
|
| #include <emmintrin.h>
|
| #endif
|
|
|
| @@ -235,6 +228,109 @@ const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
|
| 2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
|
| };
|
|
|
| +const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
|
| + { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
|
| + { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
|
| + { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
|
| + { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
|
| + { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
|
| + {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
|
| + {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
|
| + {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
|
| + {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
|
| + {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
|
| + {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
|
| + {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
|
| + {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
|
| + {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
|
| + {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
|
| + {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
|
| + {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
|
| + {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
|
| + {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
|
| + {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| + {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
|
| +};
|
| +
|
| +const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
|
| + 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
| + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
| + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
| + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
| + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
| + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
| + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
| + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
|
| + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
|
| + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
|
| + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
|
| + 127,
|
| + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
| + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
| + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
| + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
| + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
|
| + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
|
| + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
|
| + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
|
| +};
|
| +
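/* For reference, a minimal sketch (hypothetical helper, not from this patch) of
   how the two lookup tables above are consumed on the encoder side.  It assumes
   the VP8LPrefixCode fields are named code_ and extra_bits_ as declared in
   lossless.h; values at or above PREFIX_LOOKUP_IDX_MAX take a slower, log2-based
   path that is not shown here.  For example, index 100 maps to code 13 with 5
   extra bits whose value is 3. */
static void PrefixEncodeSketch(int v, int* const code, int* const extra_bits,
                               int* const extra_bits_value) {
  if (v < PREFIX_LOOKUP_IDX_MAX) {
    const VP8LPrefixCode prefix = kPrefixEncodeCode[v];
    *code = prefix.code_;                                // prefix symbol
    *extra_bits = prefix.extra_bits_;                    // number of extra bits
    *extra_bits_value = kPrefixEncodeExtraBitsValue[v];  // their payload
  } else {
    /* Fall back to deriving the same triple from the bit length of v. */
  }
}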
|
| float VP8LFastSLog2Slow(int v) {
|
| assert(v >= LOG_LOOKUP_IDX_MAX);
|
| if (v < APPROX_LOG_MAX) {
|
| @@ -287,61 +383,6 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
|
| return Average2(Average2(a0, a1), Average2(a2, a3));
|
| }
|
|
|
| -#if defined(WEBP_TARGET_HAS_SSE2)
|
| -static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
|
| - uint32_t c2) {
|
| - const __m128i zero = _mm_setzero_si128();
|
| - const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
|
| - const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
|
| - const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
|
| - const __m128i V1 = _mm_add_epi16(C0, C1);
|
| - const __m128i V2 = _mm_sub_epi16(V1, C2);
|
| - const __m128i b = _mm_packus_epi16(V2, V2);
|
| - const uint32_t output = _mm_cvtsi128_si32(b);
|
| - return output;
|
| -}
|
| -
|
| -static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
|
| - uint32_t c2) {
|
| - const uint32_t ave = Average2(c0, c1);
|
| - const __m128i zero = _mm_setzero_si128();
|
| - const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
|
| - const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
|
| - const __m128i A1 = _mm_sub_epi16(A0, B0);
|
| - const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
|
| - const __m128i A2 = _mm_sub_epi16(A1, BgtA);
|
| - const __m128i A3 = _mm_srai_epi16(A2, 1);
|
| - const __m128i A4 = _mm_add_epi16(A0, A3);
|
| - const __m128i A5 = _mm_packus_epi16(A4, A4);
|
| - const uint32_t output = _mm_cvtsi128_si32(A5);
|
| - return output;
|
| -}
|
| -
|
| -static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
|
| - int pa_minus_pb;
|
| - const __m128i zero = _mm_setzero_si128();
|
| - const __m128i A0 = _mm_cvtsi32_si128(a);
|
| - const __m128i B0 = _mm_cvtsi32_si128(b);
|
| - const __m128i C0 = _mm_cvtsi32_si128(c);
|
| - const __m128i AC0 = _mm_subs_epu8(A0, C0);
|
| - const __m128i CA0 = _mm_subs_epu8(C0, A0);
|
| - const __m128i BC0 = _mm_subs_epu8(B0, C0);
|
| - const __m128i CB0 = _mm_subs_epu8(C0, B0);
|
| - const __m128i AC = _mm_or_si128(AC0, CA0);
|
| - const __m128i BC = _mm_or_si128(BC0, CB0);
|
| - const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|
|
| - const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|
|
| - const __m128i diff = _mm_sub_epi16(pb, pa);
|
| - {
|
| - int16_t out[8];
|
| - _mm_storeu_si128((__m128i*)out, diff);
|
| - pa_minus_pb = out[0] + out[1] + out[2] + out[3];
|
| - }
|
| - return (pa_minus_pb <= 0) ? a : b;
|
| -}
|
| -
|
| -#else
|
| -
|
| static WEBP_INLINE uint32_t Clip255(uint32_t a) {
|
| if (a < 256) {
|
| return a;
|
| @@ -396,7 +437,6 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
|
| Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff);
|
| return (pa_minus_pb <= 0) ? a : b;
|
| }
|
| -#endif
|
|
|
| //------------------------------------------------------------------------------
|
| // Predictors
|
| @@ -449,18 +489,19 @@ static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
|
| return pred;
|
| }
|
| static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
|
| - const uint32_t pred = Select(top[0], left, top[-1]);
|
| + const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
|
| return pred;
|
| }
|
| static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
|
| - const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
|
| + const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
|
| return pred;
|
| }
|
| static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
|
| - const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
|
| + const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
|
| return pred;
|
| }
|
|
|
| +// TODO(vikasa): Export the predictor array, to allow SSE2 variants.
|
| typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
|
| static const PredictorFunc kPredictors[16] = {
|
| Predictor0, Predictor1, Predictor2, Predictor3,
|
| @@ -716,21 +757,8 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
|
| }
|
| }
|
|
|
| -void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
|
| +static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
|
| int i = 0;
|
| -#if defined(WEBP_TARGET_HAS_SSE2)
|
| - const __m128i mask = _mm_set1_epi32(0x0000ff00);
|
| - for (; i + 4 < num_pixs; i += 4) {
|
| - const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
|
| - const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
|
| - const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
|
| - const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
|
| - const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
|
| - const __m128i out = _mm_sub_epi8(in, in_0g0g);
|
| - _mm_storeu_si128((__m128i*)&argb_data[i], out);
|
| - }
|
| - // fallthrough and finish off with plain-C
|
| -#endif
|
| for (; i < num_pixs; ++i) {
|
| const uint32_t argb = argb_data[i];
|
| const uint32_t green = (argb >> 8) & 0xff;
|
| @@ -742,23 +770,7 @@ void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
|
|
|
| // Add green to blue and red channels (i.e. perform the inverse transform of
|
| // 'subtract green').
|
| -static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
|
| - int y_start, int y_end, uint32_t* data) {
|
| - const int width = transform->xsize_;
|
| - const uint32_t* const data_end = data + (y_end - y_start) * width;
|
| -#if defined(WEBP_TARGET_HAS_SSE2)
|
| - const __m128i mask = _mm_set1_epi32(0x0000ff00);
|
| - for (; data + 4 < data_end; data += 4) {
|
| - const __m128i in = _mm_loadu_si128((__m128i*)data);
|
| - const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
|
| - const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
|
| - const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
|
| - const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
|
| - const __m128i out = _mm_add_epi8(in, in_0g0g);
|
| - _mm_storeu_si128((__m128i*)data, out);
|
| - }
|
| - // fallthrough and finish off with plain-C
|
| -#endif
|
| +static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
|
| while (data < data_end) {
|
| const uint32_t argb = *data;
|
| const uint32_t green = ((argb >> 8) & 0xff);
|
| @@ -1156,18 +1168,18 @@ COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
|
| void VP8LInverseTransform(const VP8LTransform* const transform,
|
| int row_start, int row_end,
|
| const uint32_t* const in, uint32_t* const out) {
|
| + const int width = transform->xsize_;
|
| assert(row_start < row_end);
|
| assert(row_end <= transform->ysize_);
|
| switch (transform->type_) {
|
| case SUBTRACT_GREEN:
|
| - AddGreenToBlueAndRed(transform, row_start, row_end, out);
|
| + VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
|
| break;
|
| case PREDICTOR_TRANSFORM:
|
| PredictorInverseTransform(transform, row_start, row_end, out);
|
| if (row_end != transform->ysize_) {
|
| // The last predicted row in this iteration will be the top-pred row
|
| // for the first row in next iteration.
|
| - const int width = transform->xsize_;
|
| memcpy(out - width, out + (row_end - row_start - 1) * width,
|
| width * sizeof(*out));
|
| }
|
| @@ -1182,7 +1194,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
|
| // Also, note that this is the only transform that applies on
|
| // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
|
| // transforms work on effective width of xsize_.
|
| - const int out_stride = (row_end - row_start) * transform->xsize_;
|
| + const int out_stride = (row_end - row_start) * width;
|
| const int in_stride = (row_end - row_start) *
|
| VP8LSubSampleSize(transform->xsize_, transform->bits_);
|
| uint32_t* const src = out + out_stride - in_stride;
|
| @@ -1382,6 +1394,139 @@ void VP8LBundleColorMap(const uint8_t* const row, int width,
|
|
|
| //------------------------------------------------------------------------------
|
|
|
| -#if defined(__cplusplus) || defined(c_plusplus)
|
| -} // extern "C"
|
| +// TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
|
| +// color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
|
| +#if defined(WEBP_USE_SSE2)
|
| +static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
|
| + uint32_t c2) {
|
| + const __m128i zero = _mm_setzero_si128();
|
| + const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
|
| + const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
|
| + const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
|
| + const __m128i V1 = _mm_add_epi16(C0, C1);
|
| + const __m128i V2 = _mm_sub_epi16(V1, C2);
|
| + const __m128i b = _mm_packus_epi16(V2, V2);
|
| + const uint32_t output = _mm_cvtsi128_si32(b);
|
| + return output;
|
| +}
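/* For reference: per channel, ClampedAddSubtractFullSSE2 computes
   clamp(c0 + c1 - c2) into [0, 255]; the unsigned-saturating pack
   (_mm_packus_epi16) performs the final clamp.  A scalar sketch of the
   single-channel operation (hypothetical helper, matching the Clip255-based
   plain-C version kept elsewhere in this file): */
static uint8_t ClampedAddSubtractFullChannelSketch(uint8_t c0, uint8_t c1,
                                                   uint8_t c2) {
  const int v = (int)c0 + (int)c1 - (int)c2;
  return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
}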
|
| +
|
| +static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
|
| + uint32_t c2) {
|
| + const uint32_t ave = Average2(c0, c1);
|
| + const __m128i zero = _mm_setzero_si128();
|
| + const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
|
| + const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
|
| + const __m128i A1 = _mm_sub_epi16(A0, B0);
|
| + const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
|
| + const __m128i A2 = _mm_sub_epi16(A1, BgtA);
|
| + const __m128i A3 = _mm_srai_epi16(A2, 1);
|
| + const __m128i A4 = _mm_add_epi16(A0, A3);
|
| + const __m128i A5 = _mm_packus_epi16(A4, A4);
|
| + const uint32_t output = _mm_cvtsi128_si32(A5);
|
| + return output;
|
| +}
|
| +
|
| +static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
|
| + int pa_minus_pb;
|
| + const __m128i zero = _mm_setzero_si128();
|
| + const __m128i A0 = _mm_cvtsi32_si128(a);
|
| + const __m128i B0 = _mm_cvtsi32_si128(b);
|
| + const __m128i C0 = _mm_cvtsi32_si128(c);
|
| + const __m128i AC0 = _mm_subs_epu8(A0, C0);
|
| + const __m128i CA0 = _mm_subs_epu8(C0, A0);
|
| + const __m128i BC0 = _mm_subs_epu8(B0, C0);
|
| + const __m128i CB0 = _mm_subs_epu8(C0, B0);
|
| + const __m128i AC = _mm_or_si128(AC0, CA0);
|
| + const __m128i BC = _mm_or_si128(BC0, CB0);
|
| + const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|
|
| + const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|
|
| + const __m128i diff = _mm_sub_epi16(pb, pa);
|
| + {
|
| + int16_t out[8];
|
| + _mm_storeu_si128((__m128i*)out, diff);
|
| + pa_minus_pb = out[0] + out[1] + out[2] + out[3];
|
| + }
|
| + return (pa_minus_pb <= 0) ? a : b;
|
| +}
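/* For reference, a scalar sketch (hypothetical helper) of the selection rule
   implemented by SelectSSE2, which this patch keeps equivalent to the plain-C
   Select() used as the default.  The pair of _mm_subs_epu8 calls OR-ed together
   yields the per-byte absolute difference, since one of the two saturating
   subtractions is always zero. */
static uint32_t SelectSketch(uint32_t a, uint32_t b, uint32_t c) {
  int sum_abs_ac = 0;  // sum over the four channels of |a_ch - c_ch|
  int sum_abs_bc = 0;  // sum over the four channels of |b_ch - c_ch|
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const int ac = (int)((a >> shift) & 0xff) - (int)((c >> shift) & 0xff);
    const int bc = (int)((b >> shift) & 0xff) - (int)((c >> shift) & 0xff);
    sum_abs_ac += (ac < 0) ? -ac : ac;
    sum_abs_bc += (bc < 0) ? -bc : bc;
  }
  // Pick 'a' when 'b' is at least as close to 'c' as 'a' is, else pick 'b'.
  return (sum_abs_bc <= sum_abs_ac) ? a : b;
}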
|
| +
|
| +static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
|
| + int i = 0;
|
| + const __m128i mask = _mm_set1_epi32(0x0000ff00);
|
| + for (; i + 4 < num_pixs; i += 4) {
|
| + const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
|
| + const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
|
| + const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
|
| + const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
|
| + const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
|
| + const __m128i out = _mm_sub_epi8(in, in_0g0g);
|
| + _mm_storeu_si128((__m128i*)&argb_data[i], out);
|
| + }
|
| + // fallthrough and finish off with plain-C
|
| + for (; i < num_pixs; ++i) {
|
| + const uint32_t argb = argb_data[i];
|
| + const uint32_t green = (argb >> 8) & 0xff;
|
| + const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
|
| + const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
|
| + argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
|
| + }
|
| +}
|
| +
|
| +static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
|
| + const __m128i mask = _mm_set1_epi32(0x0000ff00);
|
| + for (; data + 4 < data_end; data += 4) {
|
| + const __m128i in = _mm_loadu_si128((__m128i*)data);
|
| + const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
|
| + const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
|
| + const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
|
| + const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
|
| + const __m128i out = _mm_add_epi8(in, in_0g0g);
|
| + _mm_storeu_si128((__m128i*)data, out);
|
| + }
|
| + // fallthrough and finish off with plain-C
|
| + while (data < data_end) {
|
| + const uint32_t argb = *data;
|
| + const uint32_t green = ((argb >> 8) & 0xff);
|
| + uint32_t red_blue = (argb & 0x00ff00ffu);
|
| + red_blue += (green << 16) | green;
|
| + red_blue &= 0x00ff00ffu;
|
| + *data++ = (argb & 0xff00ff00u) | red_blue;
|
| + }
|
| +}
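/* A worked single-pixel trace of the green-replication trick shared by the two
   SSE2 loops above, using an arbitrary pixel argb = 0xff304050
   (A=ff R=30 G=40 B=50):
     in_00g0 = in & 0x0000ff00     -> 0x00004000   (isolate G)
     in_0g00 = in_00g0 << 8        -> 0x00400000   (G moved to the R byte)
     in_000g = in_00g0 >> 8        -> 0x00000040   (G moved to the B byte)
     in_0g0g = in_0g00 | in_000g   -> 0x00400040
   Subtract-green then does a per-byte (mod-256) subtraction, in - in_0g0g
   -> 0xfff04010, and add-green does the per-byte addition that undoes it. */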
|
| +
|
| +extern void VP8LDspInitSSE2(void);
|
| +
|
| +void VP8LDspInitSSE2(void) {
|
| + VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
|
| + VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
|
| + VP8LSelect = SelectSSE2;
|
| + VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
|
| + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
|
| +}
|
| +#endif
|
| +//------------------------------------------------------------------------------
|
| +
|
| +VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
|
| +VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
|
| +VP8LPredSelectFunc VP8LSelect;
|
| +VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
|
| +VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
|
| +
|
| +void VP8LDspInit(void) {
|
| + VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
|
| + VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
|
| + VP8LSelect = Select;
|
| + VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
|
| + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
|
| +
|
| + // If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
| + if (VP8GetCPUInfo != NULL) {
|
| +#if defined(WEBP_USE_SSE2)
|
| + if (VP8GetCPUInfo(kSSE2)) {
|
| + VP8LDspInitSSE2();
|
| + }
|
| #endif
|
| + }
|
| +}
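/* A minimal usage sketch (hypothetical caller; the "./lossless.h" include path
   and the round-trip check are illustrative, not part of this patch).  The
   function pointers must be initialized via VP8LDspInit() before use; when
   VP8GetCPUInfo reports SSE2 they are upgraded to the SSE2 variants.
   Subtract-green followed by add-green is the identity on every pixel, since
   both transforms operate modulo 256 per channel. */
#include <assert.h>
#include "./lossless.h"  // assumed to declare VP8LDspInit and the VP8L* pointers

static void SubtractGreenRoundTripSketch(void) {
  uint32_t pix = 0xff304050u;                  // A=ff R=30 G=40 B=50
  const uint32_t saved = pix;
  VP8LDspInit();                               // select plain-C or SSE2 variants
  VP8LSubtractGreenFromBlueAndRed(&pix, 1);    // -> 0xfff04010
  VP8LAddGreenToBlueAndRed(&pix, &pix + 1);    // inverse transform
  assert(pix == saved);
}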
|
| +
|
| +//------------------------------------------------------------------------------
|
| +
|