| Index: src/opts/SkBlitRow_opts_SSE2.cpp
|
| diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
|
| index 7b9c0438357b4a7602115ffb040bdc699011f992..e830c5fa06ce3a7e1ecd176e6072dea2271028da 100644
|
| --- a/src/opts/SkBlitRow_opts_SSE2.cpp
|
| +++ b/src/opts/SkBlitRow_opts_SSE2.cpp
|
| @@ -74,6 +74,7 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
| return;
|
| }
|
|
|
| +#ifdef SK_USE_ACCURATE_BLENDING
|
| if (count >= 4) {
|
| SkASSERT(((size_t)dst & 0x03) == 0);
|
| while (((size_t)dst & 0x0F) != 0) {
|
| @@ -85,7 +86,6 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
|
|
| const __m128i *s = reinterpret_cast<const __m128i*>(src);
|
| __m128i *d = reinterpret_cast<__m128i*>(dst);
|
| -#ifdef SK_USE_ACCURATE_BLENDING
|
| __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
|
| __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
|
| __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
|
| @@ -134,51 +134,6 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
| d++;
|
| count -= 4;
|
| }
|
| -#else
|
| - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
|
| - __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
|
| - while (count >= 4) {
|
| - // Load 4 pixels
|
| - __m128i src_pixel = _mm_loadu_si128(s);
|
| - __m128i dst_pixel = _mm_load_si128(d);
|
| -
|
| - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
|
| - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
|
| -
|
| - // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
|
| - __m128i alpha = _mm_srli_epi16(src_pixel, 8);
|
| -
|
| - // (a0, a0, a1, a1, a2, g2, a3, g3)
|
| - alpha = _mm_shufflehi_epi16(alpha, 0xF5);
|
| -
|
| - // (a0, a0, a1, a1, a2, a2, a3, a3)
|
| - alpha = _mm_shufflelo_epi16(alpha, 0xF5);
|
| -
|
| - // Subtract alphas from 256, to get 1..256
|
| - alpha = _mm_sub_epi16(c_256, alpha);
|
| -
|
| - // Multiply by red and blue by src alpha.
|
| - dst_rb = _mm_mullo_epi16(dst_rb, alpha);
|
| - // Multiply by alpha and green by src alpha.
|
| - dst_ag = _mm_mullo_epi16(dst_ag, alpha);
|
| -
|
| - // Divide by 256.
|
| - dst_rb = _mm_srli_epi16(dst_rb, 8);
|
| -
|
| - // Mask out high bits (already in the right place)
|
| - dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
|
| -
|
| - // Combine back into RGBA.
|
| - dst_pixel = _mm_or_si128(dst_rb, dst_ag);
|
| -
|
| - // Add result
|
| - __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
|
| - _mm_store_si128(d, result);
|
| - s++;
|
| - d++;
|
| - count -= 4;
|
| - }
|
| -#endif
|
| src = reinterpret_cast<const SkPMColor*>(s);
|
| dst = reinterpret_cast<SkPMColor*>(d);
|
| }
|
| @@ -189,6 +144,51 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
| dst++;
|
| count--;
|
| }
|
| +#else
|
| + int count16 = count / 16;
|
| + __m128i* dst4 = (__m128i*)dst;
|
| + const __m128i* src4 = (const __m128i*)src;
|
| +
|
| + for (int i = 0; i < count16 * 4; i += 4) {
|
| + // Load 16 source pixels.
|
| + __m128i s0 = _mm_loadu_si128(src4+i+0),
|
| + s1 = _mm_loadu_si128(src4+i+1),
|
| + s2 = _mm_loadu_si128(src4+i+2),
|
| + s3 = _mm_loadu_si128(src4+i+3);
|
| +
|
| + const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
|
| + const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
|
| + __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
|
| + if (0xffff == _mm_movemask_epi8(cmp)) {
|
| + // All 16 source pixels are fully transparent. There's nothing to do!
|
| + continue;
|
| + }
|
| + const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
|
| + cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
|
| + if (0xffff == _mm_movemask_epi8(cmp)) {
|
| + // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
|
| + _mm_storeu_si128(dst4+i+0, s0);
|
| + _mm_storeu_si128(dst4+i+1, s1);
|
| + _mm_storeu_si128(dst4+i+2, s2);
|
| + _mm_storeu_si128(dst4+i+3, s3);
|
| + continue;
|
| + }
|
| + // The general slow case: do the blend for all 16 pixels.
|
| + _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
|
| + _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
|
| + _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
|
| + _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
|
| + }
|
| +
|
| + // Wrap up the last <= 15 pixels.
|
| + SkASSERT(count - (count16*16) <= 15);
|
| + for (int i = count16*16; i < count; i++) {
|
| + // This check is not really necessary, but it prevents pointless autovectorization.
|
| + if (src[i] & 0xFF000000) {
|
| + dst[i] = SkPMSrcOver(src[i], dst[i]);
|
| + }
|
| + }
|
| +#endif
|
| }
|
|
|
| void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
|
|