src/opts/SkBlend_opts.h - Issue 1939513002: Add specialized sRGB blitter for SkOpts

Unified Diff: src/opts/SkBlend_opts.h

Issue 1939513002: Add specialized sRGB blitter for SkOpts (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Sync and remove unneeded. Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkBlend_opts.h

diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h

index a1067407be84a89b74b0b396d9d7a94ab1412973..02ec51c3376678381b0d13a39fc82c6c460c1e54 100644

--- a/src/opts/SkBlend_opts.h

+++ b/src/opts/SkBlend_opts.h

@@ -5,52 +5,244 @@

* found in the LICENSE file.

+/*

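+Build, then run the unit tests and the benchmark (fish shell syntax; use "&&" instead of "; and" in bash):

+ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q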

+ */

#ifndef SkBlend_opts_DEFINED

#define SkBlend_opts_DEFINED

+#include "SkNx.h"

namespace SK_OPTS_NS {

-#if 0

+// Fast but approximate implementation of sRGB gamma to linear.

+static inline Sk4f sRGB_to_linear(Sk4f pixel) {

f(malita) 2016/05/06 17:43:56 Same as SkPM4fPriv.h:srgb_to_linear() - any reason

herb_g 2016/05/06 20:57:45 Done.

+ Sk4f l = pixel * pixel;

+ return Sk4f{l[0], l[1], l[2], pixel[3]};

-#else

+// Fast but approximate implementation of linear to sRGB gamma.

+static inline Sk4f linear_to_sRGB(Sk4f pixel) {

f(malita) 2016/05/06 17:43:56 Same as SkPM4fPriv.h:linear_to_srgb().

herb_g 2016/05/06 20:57:45 Done.

+ Sk4f s = pixel.sqrt();

+ return Sk4f{s[0], s[1], s[2], pixel[3]};

+// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the

+// observation that the 255's cancel.

+// invA = 1 - (As / 255);

+//

+// R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)

+// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2)

+// => R = sqrt(Rs^2 + Rd^2 * invA)

+static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {

+ Sk4f s = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(&pixel)));

+ Sk4f d = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(dst)));

+ Sk4f invAlpha = 1.0f - Sk4f{s[3]} * (1.0f / 255.0f);

+ Sk4f r = linear_to_sRGB(s + d * invAlpha);

+ SkNx_cast<uint8_t>(r).store(dst);

f(malita) 2016/05/06 17:43:56 Can we use the SkPM4fPriv.h helpers? to_4f(), to_

herb_g 2016/05/06 20:57:45 Done.

+static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {

+ if ((~pixel & 0xFF000000) == 0) {

+ *dst = pixel;

+ } else if ((pixel & 0xFF000000) != 0) {

+ blend_srgb_srgb_1(dst, pixel);

+ }

f(malita) 2016/05/06 17:43:56 Nit: I would use more color macros here for readab

herb_g 2016/05/06 20:57:45 I started with code similar to what you suggest, b

+static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) {

+ srcover_srgb_srgb_1(dst++, *src++);

+ srcover_srgb_srgb_1(dst, *src);

- static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {

- switch (src >> 24) {

- case 0x00: return;

- case 0xff: *dst = src; return;

+// SrcOver four consecutive pixels from src onto dst, one pixel at a time.

+// Callers advance both pointers by 4 after each call, so all four pixels must be

+// handled here.

+static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {

+    for (int i = 0; i < 4; i++) {

+        srcover_srgb_srgb_1(dst + i, src[i]);

+    }

+}

+void best_non_simd_srcover_srgb_srgb(

+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

+ uint64_t* ddst = reinterpret_cast<uint64_t*>(dst);

+ while (ndst >0) {

+ int count = SkTMin(ndst, nsrc);

+ ndst -= count;

+ const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src);

+ const uint64_t* end = dsrc + (count >> 1);

+ do {

+ if ((~*dsrc & 0xFF000000FF000000) == 0) {

+ do {

+ *ddst++ = *dsrc++;

+ } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0);

+ } else if ((*dsrc & 0xFF000000FF000000) == 0) {

+ do {

+ dsrc++;

+ ddst++;

+ } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0);

+ } else {

+ srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++),

+ reinterpret_cast<const uint32_t*>(dsrc++));

+ }

+ } while (dsrc < end);

+ if ((count & 1) != 0) {

+ srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst),

+ *reinterpret_cast<const uint32_t*>(dsrc));

}

+ }

+void brute_force_srcover_srgb_srgb(

+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

+ while (ndst > 0) {

+ int n = SkTMin(ndst, nsrc);

+ for (int i = 0; i < n; i++) {

+ blend_srgb_srgb_1(dst++, src[i]);

+ }

+ ndst -= n;

+ }

- Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)),

- s = SkNx_cast<float>(Sk4b::Load(&src));

+void trivial_srcover_srgb_srgb(

+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

+ while (ndst > 0) {

+ int n = SkTMin(ndst, nsrc);

- // Approximate sRGB gamma as 2.0.

- Sk4f d_sq = d*d,

- s_sq = s*s;

- d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]};

- s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]};

+ for (int i = 0; i < n; i++) {

+ srcover_srgb_srgb_1(dst++, src[i]);

+ }

+ ndst -= n;

+ }

- // SrcOver.

- Sk4f invA = 1.0f - s[3]*(1/255.0f);

- d = s + d * invA;

+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

- // Re-apply approximate sRGB gamma.

- Sk4f d_sqrt = d.sqrt();

- d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]};

+ static inline __m128i load(const uint32_t* p) {  // Reads 16 bytes (four 32-bit pixels) starting at p.

+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));  // loadu: p need not be 16-byte aligned.

+ }

- SkNx_cast<uint8_t>(d).store(dst);

+ static inline void store(uint32_t* p, __m128i v) {  // Writes the 16 bytes of v (four 32-bit pixels) to p.

+ _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);  // storeu: p need not be 16-byte aligned.

}

- static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

- while (ndst > 0) {

- int n = SkTMin(ndst, nsrc);

+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

+ void srcover_srgb_srgb(

+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {

+ const __m128i alphaMask = _mm_set1_epi32(0xFF000000);

+ while (ndst > 0) {

+ int count = SkTMin(ndst, nsrc);

+ ndst -= count;

+ const uint32_t* src = srcStart;

+ const uint32_t* end = src + (count & ~3);

+ while (src < end) {

+ __m128i pixels = load(src);

+ if (_mm_testc_si128(pixels, alphaMask)) {

+ do {

+ store(dst, pixels);

+ dst += 4;

+ src += 4;

+ } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask));

+ } else if (_mm_testz_si128(pixels, alphaMask)) {

+ do {

+ dst += 4;

+ src += 4;

+ } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask));

+ } else {

+ do {

+ srcover_srgb_srgb_4(dst, src);

+ dst += 4;

+ src += 4;

+ } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask));

+ }

+ count = count & 3;

+ while (count-- > 0) {

+ srcover_srgb_srgb_1(dst++, *src++);

+ }

+ #else

+ // SSE2 versions

+ static inline bool check_opaque_alphas(__m128i pixels) {  // True iff the top (alpha) byte of all four 32-bit lanes is 0xFF.

+ int mask =

+ _mm_movemask_epi8(  // Collects one bit per byte of the comparison result.

+ _mm_cmpeq_epi32(  // A lane becomes all-ones where the masked value equals zero.

+ _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)),  // ~pixels & 0xFF000000: zero only where alpha == 0xFF.

+ _mm_setzero_si128()));

+ return mask == 0xFFFF;  // All 16 bits set means every lane matched.

+ }

+ static inline bool check_transparent_alphas(__m128i pixels) {  // True iff the top (alpha) byte of all four 32-bit lanes is 0x00.

+ int mask =

+ _mm_movemask_epi8(  // Collects one bit per byte of the comparison result.

+ _mm_cmpeq_epi32(  // A lane becomes all-ones where the masked value equals zero.

+ _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)),  // pixels & 0xFF000000: zero only where alpha == 0x00.

+ _mm_setzero_si128()));

+ return mask == 0xFFFF;  // All 16 bits set means every lane matched.

+ }

+ // Returns true when the four alphas are mixed: not all 0x00 and not all 0xFF.

+ // This is the SSE2 equivalent of _mm_testnzc_si128(pixels, alphaMask), which the

+ // SSE4.1 path uses to decide whether to stay in its per-pixel blend loop.

+ // pixels holds four 32-bit pixels with alpha in the top byte of each lane.

+ static inline bool check_partial_alphas(__m128i pixels) {

+     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);

+     const __m128i zero = _mm_setzero_si128();

+     // 0xFFFF when (pixel & alphaMask) == 0 in every lane, i.e. all transparent.

+     int transparent =

+         _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_and_si128(pixels, alphaMask), zero));

+     // 0xFFFF when (~pixel & alphaMask) == 0 in every lane, i.e. all opaque.

+     int opaque =

+         _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_andnot_si128(pixels, alphaMask), zero));

+     return transparent != 0xFFFF && opaque != 0xFFFF;

+ }

+ void srcover_srgb_srgb(

+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {

+ while (ndst > 0) {

+ int count = SkTMin(ndst, nsrc);

+ ndst -= count;

+ const uint32_t* src = srcStart;

+ const uint32_t* end = src + (count & ~3);

- for (int i = 0; i < n; i++) {

- srcover_srgb_srgb_1(dst++, src[i]);

+ __m128i pixels = load(src);

+ do {

+ if (check_opaque_alphas(pixels)) {

+ do {

+ store(dst, pixels);

+ dst += 4;

+ src += 4;

+ } while (src < end && check_opaque_alphas(pixels = load(src)));

+ } else if (check_transparent_alphas(pixels)) {

+ const uint32_t* start = src;

+ do {

+ src += 4;

+ } while (src < end && check_transparent_alphas(pixels = load(src)));

+ dst += src - start;

+ } else {

+ do {

+ srcover_srgb_srgb_4(dst, src);

+ dst += 4;

+ src += 4;

+ } while (src < end && check_partial_alphas(pixels = load(src)));

+ }

+ } while (src < end);

+ count = count & 3;

+ while (count-- > 0) {

+ srcover_srgb_srgb_1(dst++, *src++);

+ }

}

- ndst -= n;

}

+ #endif

+#else

+ void srcover_srgb_srgb(  // Variant for builds below SK_CPU_SSE_LEVEL_SSE2 (the outer #else branch).

+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {

+ trivial_srcover_srgb_srgb(dst, src, ndst, nsrc);  // No SIMD here: forward all arguments unchanged.

}

#endif

} // namespace SK_OPTS_NS

« bench/SkBlend_optsBench.cpp ('K') | « resources/iconstrip.png ('k') | src/opts/SkOpts_sse41.cpp » ('j') | tests/SkBlend_optsTest.cpp » ('J')