Index: src/opts/SkBlend_opts.h |
diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h |
index a1067407be84a89b74b0b396d9d7a94ab1412973..93946438e59610961732766f8f030dcd9de37054 100644 |
--- a/src/opts/SkBlend_opts.h |
+++ b/src/opts/SkBlend_opts.h |
@@ -5,52 +5,233 @@ |
* found in the LICENSE file. |
*/ |
+/* |
+ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q |
+ */ |
+ |
#ifndef SkBlend_opts_DEFINED |
#define SkBlend_opts_DEFINED |
+#include "SkNx.h" |
+#include "SkPM4fPriv.h" |
+ |
namespace SK_OPTS_NS { |
-#if 0 |
+// An implementation of SrcOver from sRGB bytes to sRGB bytes, blending in linear space, that |
+// takes advantage of the observation that the 255's cancel: |
+// invA = 1 - (As / 255); |
+// |
+// R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) |
+// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2) |
+// => R = sqrt(Rs^2 + Rd^2 * invA) |
+static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { |
+ Sk4f s = srgb_to_linear(to_4f(pixel)); |
+ Sk4f d = srgb_to_linear(to_4f(*dst)); |
+ Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f); |
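+ // The + 0.5f rounds to the nearest byte value when to_4b() converts the result back to bytes. |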
+ Sk4f r = linear_to_srgb(s + d * invAlpha) + 0.5f; |
+ *dst = to_4b(r); |
+} |
-#else |
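+// Per-pixel SrcOver with the common fast paths: an opaque source pixel is copied straight |
+// through, a fully transparent one leaves the destination untouched, and only a partially |
+// transparent source pays for the linear-space blend above. |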
+static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { |
+ if ((~pixel & 0xFF000000) == 0) { |
+ *dst = pixel; |
+ } else if ((pixel & 0xFF000000) != 0) { |
+ blend_srgb_srgb_1(dst, pixel); |
+ } |
+} |
+ |
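+// Unrolled helpers so the run-based loops below can hand off two or four pixels at a time. |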
+static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) { |
+ srcover_srgb_srgb_1(dst++, *src++); |
+ srcover_srgb_srgb_1(dst, *src); |
+} |
+ |
+static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { |
+ srcover_srgb_srgb_1(dst++, *src++); |
+ srcover_srgb_srgb_1(dst++, *src++); |
+ srcover_srgb_srgb_1(dst++, *src++); |
+ srcover_srgb_srgb_1(dst, *src); |
+} |
+ |
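+// Scalar version that scans the pixels as uint64_t pairs so a single branch classifies two |
+// alphas at once: runs of opaque pairs are copied, runs of transparent pairs are skipped, and |
+// only mixed pairs fall back to the per-pixel blend. When ndst is larger than nsrc the source |
+// buffer is reused from its start, as in the other variants below. |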
+void best_non_simd_srcover_srgb_srgb( |
+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
+ uint64_t* ddst = reinterpret_cast<uint64_t*>(dst); |
- static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) { |
- switch (src >> 24) { |
- case 0x00: return; |
- case 0xff: *dst = src; return; |
+ while (ndst > 0) { |
+ int count = SkTMin(ndst, nsrc); |
+ ndst -= count; |
+ const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src); |
+ const uint64_t* end = dsrc + (count >> 1); |
+ while (dsrc < end) { |
+ if ((~*dsrc & 0xFF000000FF000000) == 0) { |
+ do { |
+ *ddst++ = *dsrc++; |
+ } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0); |
+ } else if ((*dsrc & 0xFF000000FF000000) == 0) { |
+ do { |
+ dsrc++; |
+ ddst++; |
+ } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0); |
+ } else { |
+ srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++), |
+ reinterpret_cast<const uint32_t*>(dsrc++)); |
+ } |
+ } |
+ |
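+ // An odd count leaves one trailing pixel to blend on its own. |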
+ if ((count & 1) != 0) { |
+ srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst), |
+ *reinterpret_cast<const uint32_t*>(dsrc)); |
+ } |
+ } |
+} |
+ |
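+// Reference version that always performs the full linear-space blend, with no alpha shortcuts. |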
+void brute_force_srcover_srgb_srgb( |
+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
+ while (ndst > 0) { |
+ int n = SkTMin(ndst, nsrc); |
+ |
+ for (int i = 0; i < n; i++) { |
+ blend_srgb_srgb_1(dst++, src[i]); |
} |
+ ndst -= n; |
+ } |
+} |
- Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)), |
- s = SkNx_cast<float>(Sk4b::Load(&src)); |
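+// Simple per-pixel loop using the fast-path helper; this is also the fallback used when no |
+// SSE support is compiled in (see the bottom of this file). |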
+void trivial_srcover_srgb_srgb( |
+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
+ while (ndst > 0) { |
+ int n = SkTMin(ndst, nsrc); |
- // Approximate sRGB gamma as 2.0. |
- Sk4f d_sq = d*d, |
- s_sq = s*s; |
- d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]}; |
- s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]}; |
+ for (int i = 0; i < n; i++) { |
+ srcover_srgb_srgb_1(dst++, src[i]); |
+ } |
+ ndst -= n; |
+ } |
+} |
- // SrcOver. |
- Sk4f invA = 1.0f - s[3]*(1/255.0f); |
- d = s + d * invA; |
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
- // Re-apply approximate sRGB gamma. |
- Sk4f d_sqrt = d.sqrt(); |
- d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]}; |
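+ // Unaligned SSE loads and stores, since the pixel pointers are not assumed to be 16-byte aligned. |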
+ static inline __m128i load(const uint32_t* p) { |
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
+ } |
- SkNx_cast<uint8_t>(d).store(dst); |
+ static inline void store(uint32_t* p, __m128i v) { |
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); |
} |
- static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
- while (ndst > 0) { |
- int n = SkTMin(ndst, nsrc); |
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
+ |
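+ // SSE 4.1 version: PTEST against the alpha mask classifies each group of four pixels as |
+ // all opaque (testc), all transparent (testz), or mixed (testnzc), and each case stays in |
+ // its own tight loop for as long as the classification holds. |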
+ void srcover_srgb_srgb( |
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
+ const __m128i alphaMask = _mm_set1_epi32(0xFF000000); |
+ while (ndst > 0) { |
+ int count = SkTMin(ndst, nsrc); |
+ ndst -= count; |
+ const uint32_t* src = srcStart; |
+ const uint32_t* end = src + (count & ~3); |
- for (int i = 0; i < n; i++) { |
- srcover_srgb_srgb_1(dst++, src[i]); |
+ while (src < end) { |
+ __m128i pixels = load(src); |
+ if (_mm_testc_si128(pixels, alphaMask)) { |
+ do { |
+ store(dst, pixels); |
+ dst += 4; |
+ src += 4; |
+ } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); |
+ } else if (_mm_testz_si128(pixels, alphaMask)) { |
+ do { |
+ dst += 4; |
+ src += 4; |
+ } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); |
+ } else { |
+ do { |
+ srcover_srgb_srgb_4(dst, src); |
+ dst += 4; |
+ src += 4; |
+ } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); |
+ } |
+ } |
+ |
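+ // Blend the zero to three leftover pixels one at a time. |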
+ count = count & 3; |
+ while (count-- > 0) { |
+ srcover_srgb_srgb_1(dst++, *src++); |
+ } |
} |
- ndst -= n; |
} |
+ #else |
+ // SSE2 versions |
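+ // SSE2 has no PTEST, so these checks mask out the alpha bytes, compare against zero, and |
+ // use _mm_movemask_epi8: a mask of 0xFFFF means the test held for all four pixels in the group. |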
+ static inline bool check_opaque_alphas(__m128i pixels) { |
+ int mask = |
+ _mm_movemask_epi8( |
+ _mm_cmpeq_epi32( |
+ _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), |
+ _mm_setzero_si128())); |
+ return mask == 0xFFFF; |
+ } |
+ |
+ static inline bool check_transparent_alphas(__m128i pixels) { |
+ int mask = |
+ _mm_movemask_epi8( |
+ _mm_cmpeq_epi32( |
+ _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), |
+ _mm_setzero_si128())); |
+ return mask == 0xFFFF; |
+ } |
+ |
+ // True when the four alphas are neither all opaque nor all transparent. |
+ static inline bool check_partial_alphas(__m128i pixels) { |
+ return !check_opaque_alphas(pixels) && !check_transparent_alphas(pixels); |
+ } |
+ |
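+ // Same run-based structure as the SSE 4.1 version above, built on the helpers just defined. |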
+ void srcover_srgb_srgb( |
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
+ while (ndst > 0) { |
+ int count = SkTMin(ndst, nsrc); |
+ ndst -= count; |
+ const uint32_t* src = srcStart; |
+ const uint32_t* end = src + (count & ~3); |
+ |
+ while (src < end) { |
+ __m128i pixels = load(src); |
+ if (check_opaque_alphas(pixels)) { |
+ do { |
+ store(dst, pixels); |
+ dst += 4; |
+ src += 4; |
+ } while (src < end && check_opaque_alphas(pixels = load(src))); |
+ } else if (check_transparent_alphas(pixels)) { |
+ const uint32_t* start = src; |
+ do { |
+ src += 4; |
+ } while (src < end && check_transparent_alphas(pixels = load(src))); |
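+ // Advance dst past the transparent run that was just skipped. |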
+ dst += src - start; |
+ } else { |
+ do { |
+ srcover_srgb_srgb_4(dst, src); |
+ dst += 4; |
+ src += 4; |
+ } while (src < end && check_partial_alphas(pixels = load(src))); |
+ } |
+ } |
+ |
+ count = count & 3; |
+ while (count-- > 0) { |
+ srcover_srgb_srgb_1(dst++, *src++); |
+ } |
+ } |
+ } |
+ #endif |
+#else |
+ |
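+ // No SSE support compiled in: fall back to the simple per-pixel loop. |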
+ void srcover_srgb_srgb( |
+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
+ trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); |
} |
- |
+ |
#endif |
} // namespace SK_OPTS_NS |