third_party/libwebp/dsp/yuv_sse2.c - Issue 2651883004: libwebp-0.6.0-rc1

Unified Diff: third_party/libwebp/dsp/yuv_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/libwebp/dsp/yuv_sse2.c

diff --git a/third_party/libwebp/dsp/yuv_sse2.c b/third_party/libwebp/dsp/yuv_sse2.c

index e19bddff6c5d17819d602730d3e1c032c70ddc79..e33c2bbafde4714ae5157dc8c83b08fe43d023d2 100644

--- a/third_party/libwebp/dsp/yuv_sse2.c

+++ b/third_party/libwebp/dsp/yuv_sse2.c

@@ -15,6 +15,8 @@

#if defined(WEBP_USE_SSE2)

+#include "./common_sse2.h"

+#include <stdlib.h>

#include <emmintrin.h>

//-----------------------------------------------------------------------------

@@ -155,30 +157,13 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,

_mm_storeu_si128((__m128i*)dst, rgb565);

}

-// Function used several times in PlanarTo24b.

-// It samples the in buffer as follows: one every two unsigned char is stored

-// at the beginning of the buffer, while the other half is stored at the end.

-static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/,

- __m128i* const out /*out[6]*/) {

- const __m128i v_mask = _mm_set1_epi16(0x00ff);

- // Take one every two upper 8b values.

- out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask),

- _mm_and_si128(in[1], v_mask));

- out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask),

- _mm_and_si128(in[3], v_mask));

- out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask),

- _mm_and_si128(in[5], v_mask));

- // Take one every two lower 8b values.

- out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8));

- out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8));

- out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8));

// Pack the planar buffers

// rrrr... rrrr... gggg... gggg... bbbb... bbbb....

// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...

-static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {

+static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,

+ __m128i* const in2, __m128i* const in3,

+ __m128i* const in4, __m128i* const in5,

+ uint8_t* const rgb) {

// The input is 6 registers of sixteen 8b but for the sake of explanation,

// let's take 6 registers of four 8b values.

// To pack, we will keep taking every other 8b integer and move it

@@ -191,22 +176,15 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {

// Repeat the same permutations twice more:

- __m128i tmp[6];

- PlanarTo24bHelper(in, tmp);

- PlanarTo24bHelper(tmp, in);

- PlanarTo24bHelper(in, tmp);

- // We need to do it two more times than the example as we have sixteen bytes.

- PlanarTo24bHelper(tmp, in);

- PlanarTo24bHelper(in, tmp);

- _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]);

- _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]);

- _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]);

- _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]);

- _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]);

- _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]);

-#undef MK_UINT32

+ VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);

+ _mm_storeu_si128((__m128i*)(rgb + 0), *in0);

+ _mm_storeu_si128((__m128i*)(rgb + 16), *in1);

+ _mm_storeu_si128((__m128i*)(rgb + 32), *in2);

+ _mm_storeu_si128((__m128i*)(rgb + 48), *in3);

+ _mm_storeu_si128((__m128i*)(rgb + 64), *in4);

+ _mm_storeu_si128((__m128i*)(rgb + 80), *in5);

void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,

uint8_t* dst) {

@@ -265,29 +243,29 @@ void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,

void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,

uint8_t* dst) {

__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

- __m128i rgb[6];

+ __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

- YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

- YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);

+ YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

+ YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);

YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);

YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);

// Cast to 8b and store as RRRRGGGGBBBB.

- rgb[0] = _mm_packus_epi16(R0, R1);

- rgb[1] = _mm_packus_epi16(R2, R3);

- rgb[2] = _mm_packus_epi16(G0, G1);

- rgb[3] = _mm_packus_epi16(G2, G3);

- rgb[4] = _mm_packus_epi16(B0, B1);

- rgb[5] = _mm_packus_epi16(B2, B3);

+ rgb0 = _mm_packus_epi16(R0, R1);

+ rgb1 = _mm_packus_epi16(R2, R3);

+ rgb2 = _mm_packus_epi16(G0, G1);

+ rgb3 = _mm_packus_epi16(G2, G3);

+ rgb4 = _mm_packus_epi16(B0, B1);

+ rgb5 = _mm_packus_epi16(B2, B3);

// Pack as RGBRGBRGBRGB.

- PlanarTo24b(rgb, dst);

+ PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);

}

void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,

uint8_t* dst) {

__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

- __m128i bgr[6];

+ __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);

@@ -295,15 +273,15 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,

YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);

// Cast to 8b and store as BBBBGGGGRRRR.

- bgr[0] = _mm_packus_epi16(B0, B1);

- bgr[1] = _mm_packus_epi16(B2, B3);

- bgr[2] = _mm_packus_epi16(G0, G1);

- bgr[3] = _mm_packus_epi16(G2, G3);

- bgr[4] = _mm_packus_epi16(R0, R1);

- bgr[5] = _mm_packus_epi16(R2, R3);

+ bgr0 = _mm_packus_epi16(B0, B1);

+ bgr1 = _mm_packus_epi16(B2, B3);

+ bgr2 = _mm_packus_epi16(G0, G1);

+ bgr3 = _mm_packus_epi16(G2, G3);

+ bgr4 = _mm_packus_epi16(R0, R1);

+ bgr5= _mm_packus_epi16(R2, R3);

// Pack as BGRBGRBGRBGR.

- PlanarTo24b(bgr, dst);

+ PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);

}

//-----------------------------------------------------------------------------

@@ -377,7 +355,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,

int n;

for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {

__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

- __m128i rgb[6];

+ __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);

@@ -385,15 +363,15 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,

YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);

// Cast to 8b and store as RRRRGGGGBBBB.

- rgb[0] = _mm_packus_epi16(R0, R1);

- rgb[1] = _mm_packus_epi16(R2, R3);

- rgb[2] = _mm_packus_epi16(G0, G1);

- rgb[3] = _mm_packus_epi16(G2, G3);

- rgb[4] = _mm_packus_epi16(B0, B1);

- rgb[5] = _mm_packus_epi16(B2, B3);

+ rgb0 = _mm_packus_epi16(R0, R1);

+ rgb1 = _mm_packus_epi16(R2, R3);

+ rgb2 = _mm_packus_epi16(G0, G1);

+ rgb3 = _mm_packus_epi16(G2, G3);

+ rgb4 = _mm_packus_epi16(B0, B1);

+ rgb5 = _mm_packus_epi16(B2, B3);

// Pack as RGBRGBRGBRGB.

- PlanarTo24b(rgb, dst);

+ PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);

y += 32;

u += 16;

@@ -413,7 +391,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,

int n;

for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {

__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

- __m128i bgr[6];

+ __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);

YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);

@@ -421,15 +399,15 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,

YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);

// Cast to 8b and store as BBBBGGGGRRRR.

- bgr[0] = _mm_packus_epi16(B0, B1);

- bgr[1] = _mm_packus_epi16(B2, B3);

- bgr[2] = _mm_packus_epi16(G0, G1);

- bgr[3] = _mm_packus_epi16(G2, G3);

- bgr[4] = _mm_packus_epi16(R0, R1);

- bgr[5] = _mm_packus_epi16(R2, R3);

+ bgr0 = _mm_packus_epi16(B0, B1);

+ bgr1 = _mm_packus_epi16(B2, B3);

+ bgr2 = _mm_packus_epi16(G0, G1);

+ bgr3 = _mm_packus_epi16(G2, G3);

+ bgr4 = _mm_packus_epi16(R0, R1);

+ bgr5 = _mm_packus_epi16(R2, R3);

// Pack as BGRBGRBGRBGR.

- PlanarTo24b(bgr, dst);

+ PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);

y += 32;

u += 16;

@@ -499,25 +477,19 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,

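// Convert 16 packed ARGB (four LOAD_16 loads) to 16b planar values:
// rgb[0..1] = r, rgb[2..3] = g, rgb[4..5] = b (low half, then high half).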

static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,

- __m128i* const r,

- __m128i* const g,

- __m128i* const b) {

+ __m128i* const rgb /*out[6]*/) {

const __m128i zero = _mm_setzero_si128();

- const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0

- const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4

- // column-wise transpose

- const __m128i A0 = _mm_unpacklo_epi8(in0, in1);

- const __m128i A1 = _mm_unpackhi_epi8(in0, in1);

- const __m128i B0 = _mm_unpacklo_epi8(A0, A1);

- const __m128i B1 = _mm_unpackhi_epi8(A0, A1);

- // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0

- // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0

- const __m128i C0 = _mm_unpacklo_epi8(B0, B1);

- const __m128i C1 = _mm_unpackhi_epi8(B0, B1);

- // store 16b

- *r = _mm_unpacklo_epi8(C1, zero);

- *g = _mm_unpackhi_epi8(C0, zero);

- *b = _mm_unpacklo_epi8(C0, zero);

+ __m128i a0 = LOAD_16(argb + 0);

+ __m128i a1 = LOAD_16(argb + 4);

+ __m128i a2 = LOAD_16(argb + 8);

+ __m128i a3 = LOAD_16(argb + 12);

+ VP8L32bToPlanar(&a0, &a1, &a2, &a3);

+ rgb[0] = _mm_unpacklo_epi8(a1, zero);

+ rgb[1] = _mm_unpackhi_epi8(a1, zero);

+ rgb[2] = _mm_unpacklo_epi8(a2, zero);

+ rgb[3] = _mm_unpackhi_epi8(a2, zero);

+ rgb[4] = _mm_unpacklo_epi8(a3, zero);

+ rgb[5] = _mm_unpackhi_epi8(a3, zero);

}

// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX

@@ -649,11 +621,10 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {

const int max_width = width & ~15;

int i;

for (i = 0; i < max_width; i += 16) {

- __m128i r, g, b, Y0, Y1;

- RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b);

- ConvertRGBToY(&r, &g, &b, &Y0);

- RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b);

- ConvertRGBToY(&r, &g, &b, &Y1);

+ __m128i Y0, Y1, rgb[6];

+ RGB32PackedToPlanar(&argb[i], rgb);

+ ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);

+ ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);

STORE_16(_mm_packus_epi16(Y0, Y1), y + i);

}

for (; i < width; ++i) { // left-over

@@ -678,20 +649,18 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,

const int max_width = src_width & ~31;

int i;

for (i = 0; i < max_width; i += 32, u += 16, v += 16) {

- __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1;

- RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0);

- RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1);

- HorizontalAddPack(&r0, &r1, &r0);

- HorizontalAddPack(&g0, &g1, &g0);

- HorizontalAddPack(&b0, &b1, &b0);

- ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);

- RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0);

- RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1);

- HorizontalAddPack(&r0, &r1, &r0);

- HorizontalAddPack(&g0, &g1, &g0);

- HorizontalAddPack(&b0, &b1, &b0);

- ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);

+ __m128i rgb[6], U0, V0, U1, V1;

+ RGB32PackedToPlanar(&argb[i], rgb);

+ HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);

+ HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);

+ HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);

+ ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);

+ RGB32PackedToPlanar(&argb[i + 16], rgb);

+ HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);

+ HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);

+ HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);

+ ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);

U0 = _mm_packus_epi16(U0, U1);

V0 = _mm_packus_epi16(V0, V1);

@@ -767,9 +736,128 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {

WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;

}

+//------------------------------------------------------------------------------

+#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic

+static uint16_t clip_y(int v) {

+ return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;

+static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,

+ uint16_t* dst, int len) {

+ uint64_t diff = 0;

+ uint32_t tmp[4];

+ int i;

+ const __m128i zero = _mm_setzero_si128();

+ const __m128i max = _mm_set1_epi16(MAX_Y);

+ const __m128i one = _mm_set1_epi16(1);

+ __m128i sum = zero;

+ for (i = 0; i + 8 <= len; i += 8) {

+ const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));

+ const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));

+ const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));

+ const __m128i D = _mm_sub_epi16(A, B); // diff_y

+ const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)

+ const __m128i F = _mm_add_epi16(C, D); // new_y

+ const __m128i G = _mm_or_si128(E, one); // -1 or 1

+ const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);

+ const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))

+ _mm_storeu_si128((__m128i*)(dst + i), H);

+ sum = _mm_add_epi32(sum, I);

+ }

+ _mm_storeu_si128((__m128i*)tmp, sum);

+ diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];

+ for (; i < len; ++i) {

+ const int diff_y = ref[i] - src[i];

+ const int new_y = (int)dst[i] + diff_y;

+ dst[i] = clip_y(new_y);

+ diff += (uint64_t)abs(diff_y);

+ }

+ return diff;

+static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,

+ int16_t* dst, int len) {

+ int i = 0;

+ for (i = 0; i + 8 <= len; i += 8) {

+ const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));

+ const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));

+ const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));

+ const __m128i D = _mm_sub_epi16(A, B); // diff_uv

+ const __m128i E = _mm_add_epi16(C, D); // new_uv

+ _mm_storeu_si128((__m128i*)(dst + i), E);

+ }

+ for (; i < len; ++i) {

+ const int diff_uv = ref[i] - src[i];

+ dst[i] += diff_uv;

+ }

+static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,

+ const uint16_t* best_y, uint16_t* out) {

+ int i;

+ const __m128i kCst8 = _mm_set1_epi16(8);

+ const __m128i max = _mm_set1_epi16(MAX_Y);

+ const __m128i zero = _mm_setzero_si128();

+ for (i = 0; i + 8 <= len; i += 8) {

+ const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));

+ const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));

+ const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));

+ const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));

+ const __m128i a0b1 = _mm_add_epi16(a0, b1);

+ const __m128i a1b0 = _mm_add_epi16(a1, b0);

+ const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1

+ const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);

+ const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)

+ const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)

+ const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);

+ const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);

+ const __m128i d0 = _mm_add_epi16(c1, a0);

+ const __m128i d1 = _mm_add_epi16(c0, a1);

+ const __m128i e0 = _mm_srai_epi16(d0, 1);

+ const __m128i e1 = _mm_srai_epi16(d1, 1);

+ const __m128i f0 = _mm_unpacklo_epi16(e0, e1);

+ const __m128i f1 = _mm_unpackhi_epi16(e0, e1);

+ const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));

+ const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));

+ const __m128i h0 = _mm_add_epi16(g0, f0);

+ const __m128i h1 = _mm_add_epi16(g1, f1);

+ const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);

+ const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);

+ _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);

+ _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);

+ }

+ for (; i < len; ++i) {

+ // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4

+ // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4

+ // We reuse the common sub-expressions.

+ const int a0b1 = A[i + 0] + B[i + 1];

+ const int a1b0 = A[i + 1] + B[i + 0];

+ const int a0a1b0b1 = a0b1 + a1b0 + 8;

+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;

+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;

+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);

+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);

+ }

+#undef MAX_Y

+//------------------------------------------------------------------------------

+extern void WebPInitSharpYUVSSE2(void);

+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {

+ WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;

+ WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;

+ WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;

#else // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)

WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)

+WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)

#endif // WEBP_USE_SSE2

« no previous file with comments | « third_party/libwebp/dsp/yuv.c ('k') | third_party/libwebp/enc/alpha.c » ('j') | no next file with comments »