| Index: third_party/libwebp/dsp/yuv_sse2.c
|
| diff --git a/third_party/libwebp/dsp/yuv_sse2.c b/third_party/libwebp/dsp/yuv_sse2.c
|
| index e19bddff6c5d17819d602730d3e1c032c70ddc79..e33c2bbafde4714ae5157dc8c83b08fe43d023d2 100644
|
| --- a/third_party/libwebp/dsp/yuv_sse2.c
|
| +++ b/third_party/libwebp/dsp/yuv_sse2.c
|
| @@ -15,6 +15,8 @@
|
|
|
| #if defined(WEBP_USE_SSE2)
|
|
|
| +#include "./common_sse2.h"
|
| +#include <stdlib.h>
|
| #include <emmintrin.h>
|
|
|
| //-----------------------------------------------------------------------------
|
| @@ -155,30 +157,13 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
|
| _mm_storeu_si128((__m128i*)dst, rgb565);
|
| }
|
|
|
| -// Function used several times in PlanarTo24b.
|
| -// It samples the in buffer as follows: one every two unsigned char is stored
|
| -// at the beginning of the buffer, while the other half is stored at the end.
|
| -static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/,
|
| - __m128i* const out /*out[6]*/) {
|
| - const __m128i v_mask = _mm_set1_epi16(0x00ff);
|
| -
|
| - // Take one every two upper 8b values.
|
| - out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask),
|
| - _mm_and_si128(in[1], v_mask));
|
| - out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask),
|
| - _mm_and_si128(in[3], v_mask));
|
| - out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask),
|
| - _mm_and_si128(in[5], v_mask));
|
| - // Take one every two lower 8b values.
|
| - out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8));
|
| - out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8));
|
| - out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8));
|
| -}
|
| -
|
| // Pack the planar buffers
|
| // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
|
| // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
|
| -static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {
|
| +static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
|
| + __m128i* const in2, __m128i* const in3,
|
| + __m128i* const in4, __m128i* const in5,
|
| + uint8_t* const rgb) {  // in0..in5 are overwritten; rgb receives 6 x 16 = 96 bytes
|
| // The input is 6 registers of sixteen 8b but for the sake of explanation,
|
| // let's take 6 registers of four 8b values.
|
| // To pack, we will keep taking one every two 8b integer and move it
|
| @@ -191,22 +176,15 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {
|
| // Repeat the same permutations twice more:
|
| // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
|
| // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
|
| - __m128i tmp[6];
|
| - PlanarTo24bHelper(in, tmp);
|
| - PlanarTo24bHelper(tmp, in);
|
| - PlanarTo24bHelper(in, tmp);
|
| - // We need to do it two more times than the example as we have sixteen bytes.
|
| - PlanarTo24bHelper(tmp, in);
|
| - PlanarTo24bHelper(in, tmp);
|
| -
|
| - _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]);
|
| - _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]);
|
| - _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]);
|
| - _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]);
|
| - _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]);
|
| - _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]);
|
| -}
|
| -#undef MK_UINT32
|
| + VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);  // rewrites the six registers in place (presumably declared in common_sse2.h -- confirm)
|
| +
|
| + _mm_storeu_si128((__m128i*)(rgb + 0), *in0);  // unaligned stores: rgb needs no particular alignment
|
| + _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
|
| + _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
|
| + _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
|
| + _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
|
| + _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
|
| +}
|
|
|
| void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| uint8_t* dst) {
|
| @@ -265,29 +243,29 @@ void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| uint8_t* dst) {
|
| __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
| - __m128i rgb[6];
|
| + __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;  // six separate registers, each passed by address to PlanarTo24b
|
|
|
| - YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
| - YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
|
| + YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
| + YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
|
| YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
|
| YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
|
|
|
| // Cast to 8b and store as RRRRGGGGBBBB.
|
| - rgb[0] = _mm_packus_epi16(R0, R1);
|
| - rgb[1] = _mm_packus_epi16(R2, R3);
|
| - rgb[2] = _mm_packus_epi16(G0, G1);
|
| - rgb[3] = _mm_packus_epi16(G2, G3);
|
| - rgb[4] = _mm_packus_epi16(B0, B1);
|
| - rgb[5] = _mm_packus_epi16(B2, B3);
|
| + rgb0 = _mm_packus_epi16(R0, R1);  // unsigned-saturating pack of 16b lanes to 8b
|
| + rgb1 = _mm_packus_epi16(R2, R3);
|
| + rgb2 = _mm_packus_epi16(G0, G1);
|
| + rgb3 = _mm_packus_epi16(G2, G3);
|
| + rgb4 = _mm_packus_epi16(B0, B1);
|
| + rgb5 = _mm_packus_epi16(B2, B3);
|
|
|
| // Pack as RGBRGBRGBRGB.
|
| - PlanarTo24b(rgb, dst);
|
| + PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);  // writes 96 bytes (32 pixels x 3) to dst
|
| }
|
|
|
| void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| uint8_t* dst) {
|
| __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
| - __m128i bgr[6];
|
| + __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;  // six separate registers, each passed by address to PlanarTo24b
|
|
|
| YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
| YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
|
| @@ -295,15 +273,15 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
|
|
|
| // Cast to 8b and store as BBBBGGGGRRRR.
|
| - bgr[0] = _mm_packus_epi16(B0, B1);
|
| - bgr[1] = _mm_packus_epi16(B2, B3);
|
| - bgr[2] = _mm_packus_epi16(G0, G1);
|
| - bgr[3] = _mm_packus_epi16(G2, G3);
|
| - bgr[4] = _mm_packus_epi16(R0, R1);
|
| - bgr[5] = _mm_packus_epi16(R2, R3);
|
| + bgr0 = _mm_packus_epi16(B0, B1);  // unsigned-saturating pack of 16b lanes to 8b
|
| + bgr1 = _mm_packus_epi16(B2, B3);
|
| + bgr2 = _mm_packus_epi16(G0, G1);
|
| + bgr3 = _mm_packus_epi16(G2, G3);
|
| + bgr4 = _mm_packus_epi16(R0, R1);
|
| + bgr5 = _mm_packus_epi16(R2, R3);
|
|
|
| // Pack as BGRBGRBGRBGR.
|
| - PlanarTo24b(bgr, dst);
|
| + PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);  // writes 96 bytes (32 pixels x 3) to dst
|
| }
|
|
|
| //-----------------------------------------------------------------------------
|
| @@ -377,7 +355,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| int n;
|
| for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
| __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
| - __m128i rgb[6];
|
| + __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
|
|
|
| YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
| YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
|
| @@ -385,15 +363,15 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
|
|
|
| // Cast to 8b and store as RRRRGGGGBBBB.
|
| - rgb[0] = _mm_packus_epi16(R0, R1);
|
| - rgb[1] = _mm_packus_epi16(R2, R3);
|
| - rgb[2] = _mm_packus_epi16(G0, G1);
|
| - rgb[3] = _mm_packus_epi16(G2, G3);
|
| - rgb[4] = _mm_packus_epi16(B0, B1);
|
| - rgb[5] = _mm_packus_epi16(B2, B3);
|
| + rgb0 = _mm_packus_epi16(R0, R1);
|
| + rgb1 = _mm_packus_epi16(R2, R3);
|
| + rgb2 = _mm_packus_epi16(G0, G1);
|
| + rgb3 = _mm_packus_epi16(G2, G3);
|
| + rgb4 = _mm_packus_epi16(B0, B1);
|
| + rgb5 = _mm_packus_epi16(B2, B3);
|
|
|
| // Pack as RGBRGBRGBRGB.
|
| - PlanarTo24b(rgb, dst);
|
| + PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
|
|
|
| y += 32;
|
| u += 16;
|
| @@ -413,7 +391,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| int n;
|
| for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
| __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
| - __m128i bgr[6];
|
| + __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
|
|
|
| YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
| YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
|
| @@ -421,15 +399,15 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
| YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
|
|
|
| // Cast to 8b and store as BBBBGGGGRRRR.
|
| - bgr[0] = _mm_packus_epi16(B0, B1);
|
| - bgr[1] = _mm_packus_epi16(B2, B3);
|
| - bgr[2] = _mm_packus_epi16(G0, G1);
|
| - bgr[3] = _mm_packus_epi16(G2, G3);
|
| - bgr[4] = _mm_packus_epi16(R0, R1);
|
| - bgr[5] = _mm_packus_epi16(R2, R3);
|
| + bgr0 = _mm_packus_epi16(B0, B1);
|
| + bgr1 = _mm_packus_epi16(B2, B3);
|
| + bgr2 = _mm_packus_epi16(G0, G1);
|
| + bgr3 = _mm_packus_epi16(G2, G3);
|
| + bgr4 = _mm_packus_epi16(R0, R1);
|
| + bgr5 = _mm_packus_epi16(R2, R3);
|
|
|
| // Pack as BGRBGRBGRBGR.
|
| - PlanarTo24b(bgr, dst);
|
| + PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
|
|
|
| y += 32;
|
| u += 16;
|
| @@ -499,25 +477,19 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
|
|
|
| // Convert 8 packed ARGB to r[], g[], b[]
|
| static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
|
| - __m128i* const r,
|
| - __m128i* const g,
|
| - __m128i* const b) {
|
| + __m128i* const rgb /*out[6]*/) {
|
| const __m128i zero = _mm_setzero_si128();
|
| - const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0
|
| - const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4
|
| - // column-wise transpose
|
| - const __m128i A0 = _mm_unpacklo_epi8(in0, in1);
|
| - const __m128i A1 = _mm_unpackhi_epi8(in0, in1);
|
| - const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
|
| - const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
|
| - // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
|
| - // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
|
| - const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
|
| - const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
|
| - // store 16b
|
| - *r = _mm_unpacklo_epi8(C1, zero);
|
| - *g = _mm_unpackhi_epi8(C0, zero);
|
| - *b = _mm_unpacklo_epi8(C0, zero);
|
| + __m128i a0 = LOAD_16(argb + 0);  // four loads at argb + 0/4/8/12: 16 pixels per call (callers step i by 16)
|
| + __m128i a1 = LOAD_16(argb + 4);
|
| + __m128i a2 = LOAD_16(argb + 8);
|
| + __m128i a3 = LOAD_16(argb + 12);
|
| + VP8L32bToPlanar(&a0, &a1, &a2, &a3);  // in-place de-interleave; a0 (presumably alpha -- confirm) is not used below
|
| + rgb[0] = _mm_unpacklo_epi8(a1, zero);  // zero-extend 8b -> 16b; callers pass rgb[0..1] as the r argument
|
| + rgb[1] = _mm_unpackhi_epi8(a1, zero);
|
| + rgb[2] = _mm_unpacklo_epi8(a2, zero);  // callers pass rgb[2..3] as the g argument
|
| + rgb[3] = _mm_unpackhi_epi8(a2, zero);
|
| + rgb[4] = _mm_unpacklo_epi8(a3, zero);  // callers pass rgb[4..5] as the b argument
|
| + rgb[5] = _mm_unpackhi_epi8(a3, zero);
|
| }
|
|
|
| // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
|
| @@ -649,11 +621,10 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
|
| const int max_width = width & ~15;
|
| int i;
|
| for (i = 0; i < max_width; i += 16) {
|
| - __m128i r, g, b, Y0, Y1;
|
| - RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b);
|
| - ConvertRGBToY(&r, &g, &b, &Y0);
|
| - RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b);
|
| - ConvertRGBToY(&r, &g, &b, &Y1);
|
| + __m128i Y0, Y1, rgb[6];
|
| + RGB32PackedToPlanar(&argb[i], rgb);
|
| + ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
|
| + ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
|
| STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
|
| }
|
| for (; i < width; ++i) { // left-over
|
| @@ -678,20 +649,18 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
| const int max_width = src_width & ~31;
|
| int i;
|
| for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
|
| - __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1;
|
| - RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0);
|
| - RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1);
|
| - HorizontalAddPack(&r0, &r1, &r0);
|
| - HorizontalAddPack(&g0, &g1, &g0);
|
| - HorizontalAddPack(&b0, &b1, &b0);
|
| - ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);
|
| -
|
| - RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0);
|
| - RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1);
|
| - HorizontalAddPack(&r0, &r1, &r0);
|
| - HorizontalAddPack(&g0, &g1, &g0);
|
| - HorizontalAddPack(&b0, &b1, &b0);
|
| - ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);
|
| + __m128i rgb[6], U0, V0, U1, V1;
|
| + RGB32PackedToPlanar(&argb[i], rgb);
|
| + HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
|
| + HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
|
| + HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
|
| + ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
|
| +
|
| + RGB32PackedToPlanar(&argb[i + 16], rgb);
|
| + HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
|
| + HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
|
| + HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
|
| + ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
|
|
|
| U0 = _mm_packus_epi16(U0, U1);
|
| V0 = _mm_packus_epi16(V0, V1);
|
| @@ -767,9 +736,128 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
|
| WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
|
| }
|
|
|
| +//------------------------------------------------------------------------------
|
| +
|
| +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
|
| +static uint16_t clip_y(int v) {
|
| + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;  // saturate v to [0, MAX_Y]
|
| +}
|
| +
|
| +static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
|
| + uint16_t* dst, int len) {  // dst[i] = clip(dst[i] + ref[i] - src[i]); returns sum of |ref[i] - src[i]|
|
| + uint64_t diff = 0;
|
| + uint32_t tmp[4];
|
| + int i;
|
| + const __m128i zero = _mm_setzero_si128();
|
| + const __m128i max = _mm_set1_epi16(MAX_Y);
|
| + const __m128i one = _mm_set1_epi16(1);
|
| + __m128i sum = zero;
|
| +
|
| + for (i = 0; i + 8 <= len; i += 8) {
|
| + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
|
| + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
|
| + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
|
| + const __m128i D = _mm_sub_epi16(A, B); // diff_y
|
| + const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
|
| + const __m128i F = _mm_add_epi16(C, D); // new_y
|
| + const __m128i G = _mm_or_si128(E, one); // -1 or 1
|
| + const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);  // new_y clipped to [0, MAX_Y]
|
| + const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
|
| + _mm_storeu_si128((__m128i*)(dst + i), H);
|
| + sum = _mm_add_epi32(sum, I);  // four 32b partial sums
|
| + }
|
| + _mm_storeu_si128((__m128i*)tmp, sum);
|
| + diff = (uint64_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];  // add in 64b: a 32b horizontal sum could wrap
|
| + for (; i < len; ++i) {  // scalar tail for the remaining len % 8 samples
|
| + const int diff_y = ref[i] - src[i];
|
| + const int new_y = (int)dst[i] + diff_y;
|
| + dst[i] = clip_y(new_y);
|
| + diff += (uint64_t)abs(diff_y);
|
| + }
|
| + return diff;
|
| +}
|
| +
|
| +static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
|
| + int16_t* dst, int len) {  // dst[i] += ref[i] - src[i] for i in [0, len); no clipping is applied
|
| + int i = 0;
|
| + for (i = 0; i + 8 <= len; i += 8) {  // 8 samples per iteration
|
| + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
|
| + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
|
| + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
|
| + const __m128i D = _mm_sub_epi16(A, B); // diff_uv
|
| + const __m128i E = _mm_add_epi16(C, D); // new_uv
|
| + _mm_storeu_si128((__m128i*)(dst + i), E);
|
| + }
|
| + for (; i < len; ++i) {  // scalar tail for the remaining len % 8 samples
|
| + const int diff_uv = ref[i] - src[i];
|
| + dst[i] += diff_uv;
|
| + }
|
| +}
|
| +
|
| +static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
|
| + const uint16_t* best_y, uint16_t* out) {  // reads A[0..len] and B[0..len]; reads best_y and writes out over [0, 2 * len)
|
| + int i;
|
| + const __m128i kCst8 = _mm_set1_epi16(8);
|
| + const __m128i max = _mm_set1_epi16(MAX_Y);
|
| + const __m128i zero = _mm_setzero_si128();
|
| + for (i = 0; i + 8 <= len; i += 8) {  // 8 input samples -> 16 output samples per iteration
|
| + const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
|
| + const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
|
| + const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
|
| + const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
|
| + const __m128i a0b1 = _mm_add_epi16(a0, b1);
|
| + const __m128i a1b0 = _mm_add_epi16(a1, b0);
|
| + const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
|
| + const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
|
| + const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
|
| + const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
|
| + const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);  // (3*(A0+B1) + A1+B0 + 8) >> 3
|
| + const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);  // (3*(A1+B0) + A0+B1 + 8) >> 3
|
| + const __m128i d0 = _mm_add_epi16(c1, a0);
|
| + const __m128i d1 = _mm_add_epi16(c0, a1);
|
| + const __m128i e0 = _mm_srai_epi16(d0, 1);  // == (9*A0 + 3*A1 + 3*B0 + B1 + 8) >> 4, barring 16b wrap-around
|
| + const __m128i e1 = _mm_srai_epi16(d1, 1);  // == (9*A1 + 3*A0 + 3*B1 + B0 + 8) >> 4, barring 16b wrap-around
|
| + const __m128i f0 = _mm_unpacklo_epi16(e0, e1);  // interleave so outputs alternate e0, e1 like v0, v1 in the tail
|
| + const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
|
| + const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
|
| + const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
|
| + const __m128i h0 = _mm_add_epi16(g0, f0);
|
| + const __m128i h1 = _mm_add_epi16(g1, f1);
|
| + const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);  // clip to [0, MAX_Y]
|
| + const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
|
| + _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
|
| + _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
|
| + }
|
| + for (; i < len; ++i) {
|
| + // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
|
| + // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
|
| + // We reuse the common sub-expressions.
|
| + const int a0b1 = A[i + 0] + B[i + 1];
|
| + const int a1b0 = A[i + 1] + B[i + 0];
|
| + const int a0a1b0b1 = a0b1 + a1b0 + 8;
|
| + const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
|
| + const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
|
| + out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
|
| + out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
|
| + }
|
| +}
|
| +
|
| +#undef MAX_Y
|
| +
|
| +//------------------------------------------------------------------------------
|
| +
|
| +extern void WebPInitSharpYUVSSE2(void);
|
| +
|
| +WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {  // points the three WebPSharpYUV* hooks at the SSE2 versions
|
| + WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
|
| + WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
|
| + WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
|
| +}
|
| +
|
| #else // !WEBP_USE_SSE2
|
|
|
| WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
|
| WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
|
| +WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
|
|
|
| #endif // WEBP_USE_SSE2
|
|
|