Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(310)

Unified Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 886403002: Optimize SSE2 opaque blend (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/opts/SkBlitRow_opts_SSE2.cpp
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 7b9c0438357b4a7602115ffb040bdc699011f992..e830c5fa06ce3a7e1ecd176e6072dea2271028da 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -74,6 +74,7 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
return;
}
+#ifdef SK_USE_ACCURATE_BLENDING
if (count >= 4) {
SkASSERT(((size_t)dst & 0x03) == 0);
while (((size_t)dst & 0x0F) != 0) {
@@ -85,7 +86,6 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const __m128i *s = reinterpret_cast<const __m128i*>(src);
__m128i *d = reinterpret_cast<__m128i*>(dst);
-#ifdef SK_USE_ACCURATE_BLENDING
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
__m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
__m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
@@ -134,51 +134,6 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
d++;
count -= 4;
}
-#else
- __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
- __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
- while (count >= 4) {
- // Load 4 pixels
- __m128i src_pixel = _mm_loadu_si128(s);
- __m128i dst_pixel = _mm_load_si128(d);
-
- __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
- __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
-
- // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
- __m128i alpha = _mm_srli_epi16(src_pixel, 8);
-
- // (a0, a0, a1, a1, a2, g2, a3, g3)
- alpha = _mm_shufflehi_epi16(alpha, 0xF5);
-
- // (a0, a0, a1, a1, a2, a2, a3, a3)
- alpha = _mm_shufflelo_epi16(alpha, 0xF5);
-
- // Subtract alphas from 256, to get 1..256
- alpha = _mm_sub_epi16(c_256, alpha);
-
- // Multiply by red and blue by src alpha.
- dst_rb = _mm_mullo_epi16(dst_rb, alpha);
- // Multiply by alpha and green by src alpha.
- dst_ag = _mm_mullo_epi16(dst_ag, alpha);
-
- // Divide by 256.
- dst_rb = _mm_srli_epi16(dst_rb, 8);
-
- // Mask out high bits (already in the right place)
- dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
-
- // Combine back into RGBA.
- dst_pixel = _mm_or_si128(dst_rb, dst_ag);
-
- // Add result
- __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
- _mm_store_si128(d, result);
- s++;
- d++;
- count -= 4;
- }
-#endif
src = reinterpret_cast<const SkPMColor*>(s);
dst = reinterpret_cast<SkPMColor*>(d);
}
@@ -189,6 +144,51 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
dst++;
count--;
}
+#else
+ int count16 = count / 16;
+ __m128i* dst4 = (__m128i*)dst;
+ const __m128i* src4 = (const __m128i*)src;
+
+ for (int i = 0; i < count16 * 4; i += 4) {
+ // Load 16 source pixels.
+ __m128i s0 = _mm_loadu_si128(src4+i+0),
+ s1 = _mm_loadu_si128(src4+i+1),
+ s2 = _mm_loadu_si128(src4+i+2),
+ s3 = _mm_loadu_si128(src4+i+3);
+
+ const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
+ const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+ __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
+ if (0xffff == _mm_movemask_epi8(cmp)) {
+ // All 16 source pixels are fully transparent. There's nothing to do!
+ continue;
+ }
+ const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+ cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
+ if (0xffff == _mm_movemask_epi8(cmp)) {
+ // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
+ _mm_storeu_si128(dst4+i+0, s0);
+ _mm_storeu_si128(dst4+i+1, s1);
+ _mm_storeu_si128(dst4+i+2, s2);
+ _mm_storeu_si128(dst4+i+3, s3);
+ continue;
+ }
+ // The general slow case: do the blend for all 16 pixels.
+ _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
+ _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
+ _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
+ _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
+ }
+
+ // Wrap up the last <= 15 pixels.
+ SkASSERT(count - (count16*16) <= 15);
+ for (int i = count16*16; i < count; i++) {
+ // This check is not really necessary, but it prevents pointless autovectorization.
+ if (src[i] & 0xFF000000) {
+ dst[i] = SkPMSrcOver(src[i], dst[i]);
+ }
+ }
+#endif
}
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698