Chromium Code Reviews| Index: src/opts/SkBlend_opts.h |
| diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h |
| index a1067407be84a89b74b0b396d9d7a94ab1412973..02ec51c3376678381b0d13a39fc82c6c460c1e54 100644 |
| --- a/src/opts/SkBlend_opts.h |
| +++ b/src/opts/SkBlend_opts.h |
| @@ -5,52 +5,244 @@ |
| * found in the LICENSE file. |
| */ |
| +/* |
| +ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q |
| + */ |
| + |
| #ifndef SkBlend_opts_DEFINED |
| #define SkBlend_opts_DEFINED |
| +#include "SkNx.h" |
| + |
| namespace SK_OPTS_NS { |
| -#if 0 |
| +// Fast but approximate implementation of sRGB gamma to linear. |
| +static inline Sk4f sRGB_to_linear(Sk4f pixel) { |
|
f(malita)
2016/05/06 17:43:56
Same as SkPM4fPriv.h:srgb_to_linear() - any reason
herb_g
2016/05/06 20:57:45
Done.
|
| + Sk4f l = pixel * pixel; |
| + return Sk4f{l[0], l[1], l[2], pixel[3]}; |
| +} |
| -#else |
| +// Fast but approximate implementation of linear to sRGB gamma. |
| +static inline Sk4f linear_to_sRGB(Sk4f pixel) { |
|
f(malita)
2016/05/06 17:43:56
Same as SkPM4fPriv.h:linear_to_srgb().
herb_g
2016/05/06 20:57:45
Done.
|
| + Sk4f s = pixel.sqrt(); |
| + return Sk4f{s[0], s[1], s[2], pixel[3]}; |
| +} |
| + |
| +// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the |
| +// observation that the 255's cancel. |
| +// invA = 1 - (As / 255); |
| +// |
| +// R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) |
| +// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2) |
| +// => R = sqrt(Rs^2 + Rd^2 * invA) |
| +static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { |
| + Sk4f s = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(&pixel))); |
| + Sk4f d = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(dst))); |
| + Sk4f invAlpha = 1.0f - Sk4f{s[3]} * (1.0f / 255.0f); |
| + Sk4f r = linear_to_sRGB(s + d * invAlpha); |
| + SkNx_cast<uint8_t>(r).store(dst); |
|
f(malita)
2016/05/06 17:43:56
Can we use the SkPM4fPriv.h helpers?
to_4f(), to_
herb_g
2016/05/06 20:57:45
Done.
|
| +} |
| + |
| +static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { |
| + if ((~pixel & 0xFF000000) == 0) { |
| + *dst = pixel; |
| + } else if ((pixel & 0xFF000000) != 0) { |
| + blend_srgb_srgb_1(dst, pixel); |
| + } |
|
f(malita)
2016/05/06 17:43:56
Nit: I would use more color macros here for readab
herb_g
2016/05/06 20:57:45
I started with code similar to what you suggest, b
|
| +} |
| + |
| +static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) { |
| + srcover_srgb_srgb_1(dst++, *src++); |
| + srcover_srgb_srgb_1(dst, *src); |
| +} |
| - static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) { |
| - switch (src >> 24) { |
| - case 0x00: return; |
| - case 0xff: *dst = src; return; |
| +static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { |
| + srcover_srgb_srgb_1(dst++, *src++); |
| + srcover_srgb_srgb_1(dst++, *src++); |
| + srcover_srgb_srgb_1(dst++, *src++); |
| + srcover_srgb_srgb_1(dst, *src); |
| +} |
| + |
| +void best_non_simd_srcover_srgb_srgb( |
| + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
| + uint64_t* ddst = reinterpret_cast<uint64_t*>(dst); |
| + |
| + while (ndst >0) { |
| + int count = SkTMin(ndst, nsrc); |
| + ndst -= count; |
| + const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src); |
| + const uint64_t* end = dsrc + (count >> 1); |
| + do { |
| + if ((~*dsrc & 0xFF000000FF000000) == 0) { |
| + do { |
| + *ddst++ = *dsrc++; |
| + } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0); |
| + } else if ((*dsrc & 0xFF000000FF000000) == 0) { |
| + do { |
| + dsrc++; |
| + ddst++; |
| + } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0); |
| + } else { |
| + srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++), |
| + reinterpret_cast<const uint32_t*>(dsrc++)); |
| + } |
| + } while (dsrc < end); |
| + |
| + if ((count & 1) != 0) { |
| + srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst), |
| + *reinterpret_cast<const uint32_t*>(dsrc)); |
| } |
| + } |
| +} |
| + |
| +void brute_force_srcover_srgb_srgb( |
| + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
| + while (ndst > 0) { |
| + int n = SkTMin(ndst, nsrc); |
| + |
| + for (int i = 0; i < n; i++) { |
| + blend_srgb_srgb_1(dst++, src[i]); |
| + } |
| + ndst -= n; |
| + } |
| +} |
| - Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)), |
| - s = SkNx_cast<float>(Sk4b::Load(&src)); |
| +void trivial_srcover_srgb_srgb( |
| + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
| + while (ndst > 0) { |
| + int n = SkTMin(ndst, nsrc); |
| - // Approximate sRGB gamma as 2.0. |
| - Sk4f d_sq = d*d, |
| - s_sq = s*s; |
| - d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]}; |
| - s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]}; |
| + for (int i = 0; i < n; i++) { |
| + srcover_srgb_srgb_1(dst++, src[i]); |
| + } |
| + ndst -= n; |
| + } |
| +} |
| - // SrcOver. |
| - Sk4f invA = 1.0f - s[3]*(1/255.0f); |
| - d = s + d * invA; |
| +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| - // Re-apply approximate sRGB gamma. |
| - Sk4f d_sqrt = d.sqrt(); |
| - d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]}; |
// Unaligned load of the four 32-bit pixels starting at p.
static inline __m128i load(const uint32_t* p) {
    const __m128i* const vecPtr = reinterpret_cast<const __m128i*>(p);
    return _mm_loadu_si128(vecPtr);
}
| - SkNx_cast<uint8_t>(d).store(dst); |
// Unaligned store of the four 32-bit pixels in v to p[0..3].
static inline void store(uint32_t* p, __m128i v) {
    __m128i* const vecPtr = reinterpret_cast<__m128i*>(p);
    _mm_storeu_si128(vecPtr, v);
}
| - static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
| - while (ndst > 0) { |
| - int n = SkTMin(ndst, nsrc); |
| + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| + |
| + void srcover_srgb_srgb( |
| + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
| + const __m128i alphaMask = _mm_set1_epi32(0xFF000000); |
| + while (ndst > 0) { |
| + int count = SkTMin(ndst, nsrc); |
| + ndst -= count; |
| + const uint32_t* src = srcStart; |
| + const uint32_t* end = src + (count & ~3); |
| + |
| + while (src < end) { |
| + __m128i pixels = load(src); |
| + if (_mm_testc_si128(pixels, alphaMask)) { |
| + do { |
| + store(dst, pixels); |
| + dst += 4; |
| + src += 4; |
| + } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); |
| + } else if (_mm_testz_si128(pixels, alphaMask)) { |
| + do { |
| + dst += 4; |
| + src += 4; |
| + } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); |
| + } else { |
| + do { |
| + srcover_srgb_srgb_4(dst, src); |
| + dst += 4; |
| + src += 4; |
| + } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); |
| + } |
| + } |
| + |
| + count = count & 3; |
| + while (count-- > 0) { |
| + srcover_srgb_srgb_1(dst++, *src++); |
| + } |
| + } |
| + } |
| + #else |
// SSE2 versions

// Returns true when all four pixels have alpha == 0xFF.
static inline bool check_opaque_alphas(__m128i pixels) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    // ~pixels & alphaMask is zero in a lane exactly when that lane's alpha byte is 0xFF.
    const __m128i missingAlphaBits = _mm_andnot_si128(pixels, alphaMask);
    const int zeroLanes =
        _mm_movemask_epi8(_mm_cmpeq_epi32(missingAlphaBits, _mm_setzero_si128()));
    return zeroLanes == 0xFFFF;
}

// Returns true when all four pixels have alpha == 0x00.
static inline bool check_transparent_alphas(__m128i pixels) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    const __m128i alphaBits = _mm_and_si128(pixels, alphaMask);
    const int zeroLanes =
        _mm_movemask_epi8(_mm_cmpeq_epi32(alphaBits, _mm_setzero_si128()));
    return zeroLanes == 0xFFFF;
}

// Returns true when the group of four pixels is neither all-opaque nor all-transparent, i.e.
// the same condition _mm_testnzc_si128(pixels, alphaMask) reports on SSE4.1.
//
// The previous revision took its argument as __m128 (a float vector, which _mm_and_si128 does
// not accept) and used a shift/compare whose mask == 0xFFFF test was satisfied only when every
// alpha was zero.
static inline bool check_partial_alphas(__m128i pixels) {
    return !check_opaque_alphas(pixels) && !check_transparent_alphas(pixels);
}
| + |
| + void srcover_srgb_srgb( |
| + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |
| + while (ndst > 0) { |
| + int count = SkTMin(ndst, nsrc); |
| + ndst -= count; |
| + const uint32_t* src = srcStart; |
| + const uint32_t* end = src + (count & ~3); |
| - for (int i = 0; i < n; i++) { |
| - srcover_srgb_srgb_1(dst++, src[i]); |
| + __m128i pixels = load(src); |
| + do { |
| + if (check_opaque_alphas(pixels)) { |
| + do { |
| + store(dst, pixels); |
| + dst += 4; |
| + src += 4; |
| + } while (src < end && check_opaque_alphas(pixels = load(src))); |
| + } else if (check_transparent_alphas(pixels)) { |
| + const uint32_t* start = src; |
| + do { |
| + src += 4; |
| + } while (src < end && check_transparent_alphas(pixels = load(src))); |
| + dst += src - start; |
| + } else { |
| + do { |
| + srcover_srgb_srgb_4(dst, src); |
| + dst += 4; |
| + src += 4; |
| + } while (src < end && check_partial_alphas(pixels = load(src))); |
| + } |
| + } while (src < end); |
| + |
| + count = count & 3; |
| + while (count-- > 0) { |
| + srcover_srgb_srgb_1(dst++, *src++); |
| + } |
| } |
| - ndst -= n; |
| } |
| + #endif |
| +#else |
| + |
| + void srcover_srgb_srgb( |
| + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
| + trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); |
| } |
| - |
| + |
| #endif |
| } // namespace SK_OPTS_NS |