| Index: third_party/libwebp/dsp/alpha_processing_sse2.c
|
| diff --git a/third_party/libwebp/dsp/alpha_processing_sse2.c b/third_party/libwebp/dsp/alpha_processing_sse2.c
|
| index 5acb481dcddbb7d71c0008d45f56d3cafbd8bb6d..83dc559fac4c0e048bd4285a4f56e0a91ae34516 100644
|
| --- a/third_party/libwebp/dsp/alpha_processing_sse2.c
|
| +++ b/third_party/libwebp/dsp/alpha_processing_sse2.c
|
| @@ -150,46 +150,46 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
|
| #define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
|
|
|
| // We can't use a 'const int' for the SHUFFLE value, because it has to be an
|
| -// immediate in the _mm_shufflexx_epi16() instruction. We really a macro here.
|
| -#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do { \
|
| - const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX)); \
|
| - const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); \
|
| - const __m128i alpha0 = _mm_and_si128(argb1, MASK); \
|
| - const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE); \
|
| - const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE); \
|
| - /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */ \
|
| - const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT); \
|
| - const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT); \
|
| - const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); \
|
| - const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); \
|
| - const __m128i argb4 = _mm_adds_epu16(argb2, argb3); \
|
| - const __m128i argb5 = _mm_srli_epi16(argb4, 7); \
|
| - const __m128i argb6 = _mm_or_si128(argb5, alpha0); \
|
| - const __m128i argb7 = _mm_packus_epi16(argb6, zero); \
|
| - _mm_storel_epi64((__m128i*)&(RGBX), argb7); \
|
| +// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
|
| +// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16-bit
|
| +// value.
|
| +#define APPLY_ALPHA(RGBX, SHUFFLE) do { \
|
| + const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \
|
| + const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \
|
| + const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \
|
| + const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \
|
| + const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \
|
| + const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
|
| + const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
|
| + const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
|
| + const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
|
| + /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \
|
| + const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \
|
| + const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \
|
| + const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \
|
| + const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \
|
| + const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \
|
| + const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \
|
| + const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi); \
|
| + _mm_storeu_si128((__m128i*)&(RGBX), A3); \
|
| } while (0)
|
|
|
| -static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
|
| - int w, int h, int stride) {
|
| +static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
|
| + int w, int h, int stride) {
|
| const __m128i zero = _mm_setzero_si128();
|
| - const int kSpan = 2;
|
| - const int w2 = w & ~(kSpan - 1);
|
| + const __m128i kMult = _mm_set1_epi16(0x8081u);
|
| + const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
|
| + const int kSpan = 4;
|
| while (h-- > 0) {
|
| uint32_t* const rgbx = (uint32_t*)rgba;
|
| int i;
|
| if (!alpha_first) {
|
| - const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
|
| - const __m128i kMult =
|
| - _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
|
| - for (i = 0; i < w2; i += kSpan) {
|
| - APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
|
| + for (i = 0; i + kSpan <= w; i += kSpan) {
|
| + APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
|
| }
|
| } else {
|
| - const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
|
| - const __m128i kMult =
|
| - _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
|
| - for (i = 0; i < w2; i += kSpan) {
|
| - APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
|
| + for (i = 0; i + kSpan <= w; i += kSpan) {
|
| + APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
|
| }
|
| }
|
| // Finish with left-overs.
|
| @@ -213,64 +213,51 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
|
| // -----------------------------------------------------------------------------
|
| // Apply alpha value to rows
|
|
|
| -// We use: kINV255 = (1 << 24) / 255 = 0x010101
|
| -// So: a * kINV255 = (a << 16) | [(a << 8) | a]
|
| -// -> _mm_mulhi_epu16() takes care of the (a<<16) part,
|
| -// and _mm_mullo_epu16(a * 0x0101,...) takes care of the "(a << 8) | a" one.
|
| -
|
| -static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
|
| +static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
|
| int x = 0;
|
| if (!inverse) {
|
| const int kSpan = 2;
|
| const __m128i zero = _mm_setzero_si128();
|
| - const __m128i kRound =
|
| - _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
|
| - const __m128i kMult =
|
| - _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
|
| - const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
|
| - const int w2 = width & ~(kSpan - 1);
|
| - for (x = 0; x < w2; x += kSpan) {
|
| - const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
|
| - const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
|
| - const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
|
| - const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
|
| - const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
|
| - const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
|
| - const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
|
| - const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
|
| - const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
|
| - const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
|
| - const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
|
| - const __m128i argb6 = _mm_srli_epi16(argb5, 8);
|
| - const __m128i argb7 = _mm_packus_epi16(argb6, zero);
|
| - _mm_storel_epi64((__m128i*)&ptr[x], argb7);
|
| + const __m128i k128 = _mm_set1_epi16(128);
|
| + const __m128i kMult = _mm_set1_epi16(0x0101);
|
| + const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
|
| + for (x = 0; x + kSpan <= width; x += kSpan) {
|
| + // To compute 'result = (int)(a * v / 255. + .5)', we use:
|
| + // tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
|
| + const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
|
| + const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
|
| + const __m128i A2 = _mm_or_si128(A1, kMask);
|
| + const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
|
| + const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
|
| + // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
|
| + const __m128i A5 = _mm_mullo_epi16(A4, A1);
|
| + const __m128i A6 = _mm_add_epi16(A5, k128);
|
| + const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
|
| + const __m128i A10 = _mm_packus_epi16(A7, zero);
|
| + _mm_storel_epi64((__m128i*)&ptr[x], A10);
|
| }
|
| }
|
| width -= x;
|
| if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
|
| }
|
|
|
| -static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
|
| - int width, int inverse) {
|
| +static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
|
| + int width, int inverse) {
|
| int x = 0;
|
| if (!inverse) {
|
| - const int kSpan = 8;
|
| const __m128i zero = _mm_setzero_si128();
|
| - const __m128i kRound = _mm_set1_epi16(1 << 7);
|
| - const int w2 = width & ~(kSpan - 1);
|
| - for (x = 0; x < w2; x += kSpan) {
|
| + const __m128i k128 = _mm_set1_epi16(128);
|
| + const __m128i kMult = _mm_set1_epi16(0x0101);
|
| + for (x = 0; x + 8 <= width; x += 8) {
|
| const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
|
| + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
|
| const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
|
| - const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
|
| - const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
|
| - const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
|
| - const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
|
| - const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
|
| - const __m128i v4 = _mm_adds_epu16(v2, v3);
|
| - const __m128i v5 = _mm_adds_epu16(v4, kRound);
|
| - const __m128i v6 = _mm_srli_epi16(v5, 8);
|
| - const __m128i v7 = _mm_packus_epi16(v6, zero);
|
| - _mm_storel_epi64((__m128i*)&ptr[x], v7);
|
| + const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
|
| + const __m128i v2 = _mm_mullo_epi16(v1, a1);
|
| + const __m128i v3 = _mm_add_epi16(v2, k128);
|
| + const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
|
| + const __m128i v5 = _mm_packus_epi16(v4, zero);
|
| + _mm_storel_epi64((__m128i*)&ptr[x], v5);
|
| }
|
| }
|
| width -= x;
|
| @@ -283,9 +270,9 @@ static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
|
| extern void WebPInitAlphaProcessingSSE2(void);
|
|
|
| WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
|
| - WebPMultARGBRow = MultARGBRow;
|
| - WebPMultRow = MultRow;
|
| - WebPApplyAlphaMultiply = ApplyAlphaMultiply;
|
| + WebPMultARGBRow = MultARGBRow_SSE2;
|
| + WebPMultRow = MultRow_SSE2;
|
| + WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
|
| WebPDispatchAlpha = DispatchAlpha;
|
| WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
|
| WebPExtractAlpha = ExtractAlpha;
|
|
|