| Index: src/opts/SkBlend_opts.h
|
| diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h
|
| index a1067407be84a89b74b0b396d9d7a94ab1412973..93946438e59610961732766f8f030dcd9de37054 100644
|
| --- a/src/opts/SkBlend_opts.h
|
| +++ b/src/opts/SkBlend_opts.h
|
| @@ -5,52 +5,233 @@
|
| * found in the LICENSE file.
|
| */
|
|
|
| +/*
|
| +ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
|
| + */
|
| +
|
| #ifndef SkBlend_opts_DEFINED
|
| #define SkBlend_opts_DEFINED
|
|
|
| +#include "SkNx.h"
|
| +#include "SkPM4fPriv.h"
|
| +
|
| namespace SK_OPTS_NS {
|
|
|
| -#if 0
|
| +// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the
|
| +// observation that the 255's cancel.
|
| +// invA = 1 - (As / 255);
|
| +//
|
| +// R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)
|
| +// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2)
|
| +// => R = sqrt(Rs^2 + Rd^2 * invA)
|
| +// Blends one source pixel over *dst unconditionally (no opaque/transparent shortcut).

| +// Both pixels go through to_4f and srgb_to_linear, the destination is scaled by

| +// (1 - srcAlpha / 255), the sum goes back through linear_to_srgb, and the result is

| +// packed with to_4b. The source is added unscaled, so it is presumably premultiplied.

| +static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {

| + Sk4f s = srgb_to_linear(to_4f(pixel));

| + Sk4f d = srgb_to_linear(to_4f(*dst));

| + Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f);

| + // The 0.5f bias presumably rounds to nearest, assuming to_4b truncates -- TODO confirm.

| + Sk4f r = linear_to_srgb(s + d * invAlpha) + 0.5f;

| + *dst = to_4b(r);

| +}
|
|
|
| -#else
|
| +// SrcOver for a single pixel, keyed on the top byte (bits 24-31) of `pixel`:

| +//   0xFF  -> *dst is overwritten with `pixel`;

| +//   0x00  -> *dst is left untouched;

| +//   other -> the pixel is blended with blend_srgb_srgb_1.

| +static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {

| + if ((~pixel & 0xFF000000) == 0) {

| + *dst = pixel;

| + } else if ((pixel & 0xFF000000) != 0) {

| + blend_srgb_srgb_1(dst, pixel);

| + }

| +}
|
| +
|
| +// Applies srcover_srgb_srgb_1 to two consecutive pixels: src[0..1] over dst[0..1].

| +static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) {

| + srcover_srgb_srgb_1(dst++, *src++);

| + srcover_srgb_srgb_1(dst, *src);

| +}
|
| +
|
| +// Applies srcover_srgb_srgb_1 to four consecutive pixels: src[0..3] over dst[0..3].

| +static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {

| + srcover_srgb_srgb_1(dst++, *src++);

| + srcover_srgb_srgb_1(dst++, *src++);

| + srcover_srgb_srgb_1(dst++, *src++);

| + srcover_srgb_srgb_1(dst, *src);

| +}
|
| +
|
| +// Non-SIMD SrcOver that classifies source pixels in pairs.

| +//

| +// dst: ndst destination pixels, updated in place.

| +// src: nsrc source pixels; the run is re-applied from its start until all ndst

| +//      destination pixels have been covered.

| +//

| +// Runs of pairs whose alphas are both 0xFF are copied, runs whose alphas are both 0 are

| +// skipped, and any other pair goes through srcover_srgb_srgb_2. An odd trailing pixel

| +// goes through srcover_srgb_srgb_1.

| +//

| +// Pixels are read and written as uint32_t rather than through a uint64_t alias, so there

| +// is no alignment or aliasing requirement beyond uint32_t. A run shorter than two pixels

| +// never touches a second pixel, and dst is advanced past the trailing odd pixel so that

| +// repeated runs stay contiguous.

| +void best_non_simd_srcover_srgb_srgb(

| +    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

| +    const uint32_t kAlphaMask = 0xFF000000;

|

|

|

| - static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {

| - switch (src >> 24) {

| - case 0x00: return;

| - case 0xff: *dst = src; return;

| +    if (nsrc <= 0) {

| +        return;  // Nothing to repeat; the loop below could never make progress.

| +    }

| +    while (ndst > 0) {

| +        int count = SkTMin(ndst, nsrc);

| +        ndst -= count;

| +        const uint32_t* s = src;

| +        // Only whole pairs are handled by the pair loop.

| +        const uint32_t* const end = s + (count & ~1);

| +        while (s < end) {

| +            if ((s[0] & s[1] & kAlphaMask) == kAlphaMask) {

| +                // Both opaque: copy for as long as the streak lasts.

| +                do {

| +                    dst[0] = s[0];

| +                    dst[1] = s[1];

| +                    dst += 2;

| +                    s += 2;

| +                } while (s < end && (s[0] & s[1] & kAlphaMask) == kAlphaMask);

| +            } else if (((s[0] | s[1]) & kAlphaMask) == 0) {

| +                // Both transparent: dst is unchanged, just step over the streak.

| +                do {

| +                    dst += 2;

| +                    s += 2;

| +                } while (s < end && ((s[0] | s[1]) & kAlphaMask) == 0);

| +            } else {

| +                srcover_srgb_srgb_2(dst, s);

| +                dst += 2;

| +                s += 2;

| +            }

| +        }

| +

| +        if ((count & 1) != 0) {

| +            srcover_srgb_srgb_1(dst++, *s);

| +        }

| +    }

| +}
|
| +
|
| +// Reference SrcOver: every destination pixel is blended with blend_srgb_srgb_1,

| +// with no shortcut for opaque or transparent source pixels.

| +// The nsrc source pixels are re-applied from src[0] until ndst pixels are covered.

| +// NOTE(review): with nsrc <= 0, n is never positive and the loop does not terminate.

| +void brute_force_srcover_srgb_srgb(

| + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

| + while (ndst > 0) {

| + int n = SkTMin(ndst, nsrc);

| +

| + for (int i = 0; i < n; i++) {

| + blend_srgb_srgb_1(dst++, src[i]);

| }

| + ndst -= n;

| + }

| +}
|
|
|
| - Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)),
|
| - s = SkNx_cast<float>(Sk4b::Load(&src));
|
| +// Pixel-at-a-time SrcOver: each destination pixel goes through srcover_srgb_srgb_1.

| +// The nsrc source pixels are re-applied from src[0] until ndst pixels are covered.

| +// NOTE(review): with nsrc <= 0, n is never positive and the loop does not terminate.

| +void trivial_srcover_srgb_srgb(

| + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

| + while (ndst > 0) {

| + int n = SkTMin(ndst, nsrc);

|

|

|

| - // Approximate sRGB gamma as 2.0.

| - Sk4f d_sq = d*d,

| - s_sq = s*s;

| - d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]};

| - s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]};

| + for (int i = 0; i < n; i++) {

| + srcover_srgb_srgb_1(dst++, src[i]);

| + }

| + ndst -= n;

| + }

| +}
|
|
|
| - // SrcOver.
|
| - Sk4f invA = 1.0f - s[3]*(1/255.0f);
|
| - d = s + d * invA;
|
| +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
|
|
| - // Re-apply approximate sRGB gamma.
|
| - Sk4f d_sqrt = d.sqrt();
|
| - d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]};
|
| + // Loads four consecutive pixels (16 bytes) starting at p; p need not be 16-byte aligned.

| + static inline __m128i load(const uint32_t* p) {

| + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));

| + }
|
|
|
| - SkNx_cast<uint8_t>(d).store(dst);
|
| + // Stores the four pixels in v (16 bytes) starting at p; p need not be 16-byte aligned.

| + static inline void store(uint32_t* p, __m128i v) {

| + _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);

| }
|
|
|
| - static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
|
| - while (ndst > 0) {
|
| - int n = SkTMin(ndst, nsrc);
|
| + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
| +
|
| + // SSE4.1 SrcOver. Source pixels are examined four at a time and handled in streaks:

| + // all-opaque groups are stored directly, all-transparent groups are skipped, and

| + // every other group goes through srcover_srgb_srgb_4. The count & 3 leftover pixels

| + // go through srcover_srgb_srgb_1.

| + // The nsrc pixels at srcStart are re-applied from the start until ndst is covered.

| + // NOTE(review): with nsrc <= 0, count is never positive and the loop does not terminate.

| + void srcover_srgb_srgb(

| + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {

| + const __m128i alphaMask = _mm_set1_epi32(0xFF000000);

| + while (ndst > 0) {

| + int count = SkTMin(ndst, nsrc);

| + ndst -= count;

| + const uint32_t* src = srcStart;

| + const uint32_t* end = src + (count & ~3);

|

|

|

| - for (int i = 0; i < n; i++) {

| - srcover_srgb_srgb_1(dst++, src[i]);

| + while (src < end) {

| + __m128i pixels = load(src);

| + // testc: every alpha bit of the mask is set in pixels, i.e. all four are opaque.

| + if (_mm_testc_si128(pixels, alphaMask)) {

| + do {

| + store(dst, pixels);

| + dst += 4;

| + src += 4;

| + } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask));

| + // testz: no alpha bit is set, i.e. all four are transparent.

| + } else if (_mm_testz_si128(pixels, alphaMask)) {

| + do {

| + dst += 4;

| + src += 4;

| + } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask));

| + } else {

| + // testnzc: the group is neither all opaque nor all transparent.

| + do {

| + srcover_srgb_srgb_4(dst, src);

| + dst += 4;

| + src += 4;

| + } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask));

| + }

| + }

| +

| + count = count & 3;

| + while (count-- > 0) {

| + srcover_srgb_srgb_1(dst++, *src++);

| + }

| }

| - ndst -= n;

| }
|
| + #else
|
| + // SSE2 versions
|
| + // Returns true when the top byte of all four 32-bit pixels is 0xFF.

| + // (~pixels & 0xFF000000) is zero in a lane exactly when that lane's top byte is 0xFF;

| + // the movemask is 0xFFFF only if all four lanes compared equal to zero.

| + static inline bool check_opaque_alphas(__m128i pixels) {

| + int mask =

| + _mm_movemask_epi8(

| + _mm_cmpeq_epi32(

| + _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)),

| + _mm_setzero_si128()));

| + return mask == 0xFFFF;

| + }
|
| +
|
| + // Returns true when the top byte of all four 32-bit pixels is 0x00.

| + // (pixels & 0xFF000000) is zero in a lane exactly when that lane's top byte is 0;

| + // the movemask is 0xFFFF only if all four lanes compared equal to zero.

| + static inline bool check_transparent_alphas(__m128i pixels) {

| + int mask =

| + _mm_movemask_epi8(

| + _mm_cmpeq_epi32(

| + _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)),

| + _mm_setzero_si128()));

| + return mask == 0xFFFF;

| + }
|
| +
|
| +    // Returns true when the group of four pixels is neither entirely opaque (all top

| +    // bytes 0xFF) nor entirely transparent (all top bytes 0x00), i.e. at least one pixel

| +    // still needs the per-pixel SrcOver path. This is the same condition the SSE4.1

| +    // path tests with _mm_testnzc_si128(pixels, alphaMask).

| +    //

| +    // The earlier shift-and-compare form was true only when every alpha was 0, which

| +    // duplicated check_transparent_alphas and ended each blend streak after one group.

| +    static inline bool check_partial_alphas(__m128i pixels) {

| +        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);

| +        const __m128i alphas = _mm_and_si128(pixels, alphaMask);

| +        // 0xFFFF when every lane's alpha is 0xFF.

| +        const int opaque = _mm_movemask_epi8(_mm_cmpeq_epi32(alphas, alphaMask));

| +        // 0xFFFF when every lane's alpha is 0x00.

| +        const int transparent =

| +                _mm_movemask_epi8(_mm_cmpeq_epi32(alphas, _mm_setzero_si128()));

| +        return opaque != 0xFFFF && transparent != 0xFFFF;

| +    }
|
| +
|
| +    // SSE2 SrcOver. Source pixels are examined four at a time and handled in streaks:

| +    // all-opaque groups are stored directly, all-transparent groups are skipped, and

| +    // every other group goes through srcover_srgb_srgb_4. The count & 3 leftover pixels

| +    // go through srcover_srgb_srgb_1.

| +    //

| +    // dst:      ndst destination pixels, updated in place.

| +    // srcStart: nsrc source pixels, re-applied from the start until ndst is covered.

| +    //

| +    // The vector loop is entered only when the run holds at least one full group of

| +    // four, so a run shorter than four pixels is neither loaded as 16 bytes nor written

| +    // past its end.

| +    void srcover_srgb_srgb(

| +        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {

| +        if (nsrc <= 0) {

| +            return;  // Nothing to repeat; the loop below could never make progress.

| +        }

| +        while (ndst > 0) {

| +            int count = SkTMin(ndst, nsrc);

| +            ndst -= count;

| +            const uint32_t* src = srcStart;

| +            const uint32_t* end = src + (count & ~3);

| +

| +            if (src < end) {

| +                // `pixels` always holds the group at `src`: each streak loop reloads

| +                // it before testing, so the outer loop never has to load again.

| +                __m128i pixels = load(src);

| +                do {

| +                    if (check_opaque_alphas(pixels)) {

| +                        do {

| +                            store(dst, pixels);

| +                            dst += 4;

| +                            src += 4;

| +                        } while (src < end && check_opaque_alphas(pixels = load(src)));

| +                    } else if (check_transparent_alphas(pixels)) {

| +                        const uint32_t* start = src;

| +                        do {

| +                            src += 4;

| +                        } while (src < end && check_transparent_alphas(pixels = load(src)));

| +                        // dst is untouched by the skipped pixels; just step over them.

| +                        dst += src - start;

| +                    } else {

| +                        do {

| +                            srcover_srgb_srgb_4(dst, src);

| +                            dst += 4;

| +                            src += 4;

| +                        } while (src < end && check_partial_alphas(pixels = load(src)));

| +                    }

| +                } while (src < end);

| +            }

| +

| +            count = count & 3;

| +            while (count-- > 0) {

| +                srcover_srgb_srgb_1(dst++, *src++);

| +            }

| +        }

| +    }
|
| + #endif
|
| +#else
|
| +
|
| + // Fallback used when SK_CPU_SSE_LEVEL is below SSE2: forwards unchanged to

| + // trivial_srcover_srgb_srgb.

| + void srcover_srgb_srgb(

| + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

| + trivial_srcover_srgb_srgb(dst, src, ndst, nsrc);

| }
|
| -
|
| +
|
| #endif
|
|
|
| } // namespace SK_OPTS_NS
|
|
|