Index: src/opts/SkBlend_opts.h
diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h
index 93946438e59610961732766f8f030dcd9de37054..a1067407be84a89b74b0b396d9d7a94ab1412973 100644
--- a/src/opts/SkBlend_opts.h
+++ b/src/opts/SkBlend_opts.h
@@ -5,233 +5,52 @@
  * found in the LICENSE file.
  */
-/*
-ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
- */
-
 #ifndef SkBlend_opts_DEFINED
 #define SkBlend_opts_DEFINED
-#include "SkNx.h"
-#include "SkPM4fPriv.h"
-
 namespace SK_OPTS_NS {
-// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the
-// observation that the 255's cancel.
-//    invA = 1 - (As / 255);
-//
-//    R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)
-// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2)
-// => R = sqrt(Rs^2 + Rd^2 * invA)
-static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
-    Sk4f s = srgb_to_linear(to_4f(pixel));
-    Sk4f d = srgb_to_linear(to_4f(*dst));
-    Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f);
-    Sk4f r = linear_to_srgb(s + d * invAlpha) + 0.5f;
-    *dst = to_4b(r);
-}
+#if 0
-static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
-    if ((~pixel & 0xFF000000) == 0) {
-        *dst = pixel;
-    } else if ((pixel & 0xFF000000) != 0) {
-        blend_srgb_srgb_1(dst, pixel);
+#else
+
+    static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
+        switch (src >> 24) {
+            case 0x00: return;
+            case 0xff: *dst = src; return;
+        }
+
+        Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)),
+             s = SkNx_cast<float>(Sk4b::Load(&src));
+
+        // Approximate sRGB gamma as 2.0.
+        Sk4f d_sq = d*d,
+             s_sq = s*s;
+        d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]};
+        s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]};
+
+        // SrcOver.
+        Sk4f invA = 1.0f - s[3]*(1/255.0f);
+        d = s + d * invA;
+
+        // Re-apply approximate sRGB gamma.
+        Sk4f d_sqrt = d.sqrt();
+        d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]};
+
+        SkNx_cast<uint8_t>(d).store(dst);
     }
-}
-static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) {
-    srcover_srgb_srgb_1(dst++, *src++);
-    srcover_srgb_srgb_1(dst, *src);
-}
+    static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
+        while (ndst > 0) {
+            int n = SkTMin(ndst, nsrc);
-static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
-    srcover_srgb_srgb_1(dst++, *src++);
-    srcover_srgb_srgb_1(dst++, *src++);
-    srcover_srgb_srgb_1(dst++, *src++);
-    srcover_srgb_srgb_1(dst, *src);
-}
-
-void best_non_simd_srcover_srgb_srgb(
-    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
-    uint64_t* ddst = reinterpret_cast<uint64_t*>(dst);
-
-    while (ndst >0) {
-        int count = SkTMin(ndst, nsrc);
-        ndst -= count;
-        const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src);
-        const uint64_t* end = dsrc + (count >> 1);
-        do {
-            if ((~*dsrc & 0xFF000000FF000000) == 0) {
-                do {
-                    *ddst++ = *dsrc++;
-                } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0);
-            } else if ((*dsrc & 0xFF000000FF000000) == 0) {
-                do {
-                    dsrc++;
-                    ddst++;
-                } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0);
-            } else {
-                srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++),
-                                    reinterpret_cast<const uint32_t*>(dsrc++));
+            for (int i = 0; i < n; i++) {
+                srcover_srgb_srgb_1(dst++, src[i]);
             }
-        } while (dsrc < end);
-
-        if ((count & 1) != 0) {
-            srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst),
-                                *reinterpret_cast<const uint32_t*>(dsrc));
+            ndst -= n;
         }
     }
-}
-
-void brute_force_srcover_srgb_srgb(
-    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
-    while (ndst > 0) {
-        int n = SkTMin(ndst, nsrc);
-
-        for (int i = 0; i < n; i++) {
-            blend_srgb_srgb_1(dst++, src[i]);
-        }
-        ndst -= n;
-    }
-}
-
-void trivial_srcover_srgb_srgb(
-    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
-    while (ndst > 0) {
-        int n = SkTMin(ndst, nsrc);
-
-        for (int i = 0; i < n; i++) {
-            srcover_srgb_srgb_1(dst++, src[i]);
-        }
-        ndst -= n;
-    }
-}
-
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-
-    static inline __m128i load(const uint32_t* p) {
-        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
-    }
-
-    static inline void store(uint32_t* p, __m128i v) {
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
-    }
-
-    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
-
-        void srcover_srgb_srgb(
-            uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
-            const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
-            while (ndst > 0) {
-                int count = SkTMin(ndst, nsrc);
-                ndst -= count;
-                const uint32_t* src = srcStart;
-                const uint32_t* end = src + (count & ~3);
-
-                while (src < end) {
-                    __m128i pixels = load(src);
-                    if (_mm_testc_si128(pixels, alphaMask)) {
-                        do {
-                            store(dst, pixels);
-                            dst += 4;
-                            src += 4;
-                        } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask));
-                    } else if (_mm_testz_si128(pixels, alphaMask)) {
-                        do {
-                            dst += 4;
-                            src += 4;
-                        } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask));
-                    } else {
-                        do {
-                            srcover_srgb_srgb_4(dst, src);
-                            dst += 4;
-                            src += 4;
-                        } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask));
-                    }
-                }
-
-                count = count & 3;
-                while (count-- > 0) {
-                    srcover_srgb_srgb_1(dst++, *src++);
-                }
-            }
-        }
-    #else
-        // SSE2 versions
-        static inline bool check_opaque_alphas(__m128i pixels) {
-            int mask =
-                _mm_movemask_epi8(
-                    _mm_cmpeq_epi32(
-                        _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)),
-                        _mm_setzero_si128()));
-            return mask == 0xFFFF;
-        }
-
-        static inline bool check_transparent_alphas(__m128i pixels) {
-            int mask =
-                _mm_movemask_epi8(
-                    _mm_cmpeq_epi32(
-                        _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)),
-                        _mm_setzero_si128()));
-            return mask == 0xFFFF;
-        }
-
-        static inline bool check_partial_alphas(__m128i pixels) {
-            __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000));
-            int mask =
-                _mm_movemask_epi8(
-                    _mm_cmpeq_epi8(
-                        _mm_srai_epi32(alphas, 8),
-                        alphas));
-            return mask == 0xFFFF;
-        }
-
-        void srcover_srgb_srgb(
-            uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
-            while (ndst > 0) {
-                int count = SkTMin(ndst, nsrc);
-                ndst -= count;
-                const uint32_t* src = srcStart;
-                const uint32_t* end = src + (count & ~3);
-
-                __m128i pixels = load(src);
-                do {
-                    if (check_opaque_alphas(pixels)) {
-                        do {
-                            store(dst, pixels);
-                            dst += 4;
-                            src += 4;
-                        } while (src < end && check_opaque_alphas(pixels = load(src)));
-                    } else if (check_transparent_alphas(pixels)) {
-                        const uint32_t* start = src;
-                        do {
-                            src += 4;
-                        } while (src < end && check_transparent_alphas(pixels = load(src)));
-                        dst += src - start;
-                    } else {
-                        do {
-                            srcover_srgb_srgb_4(dst, src);
-                            dst += 4;
-                            src += 4;
-                        } while (src < end && check_partial_alphas(pixels = load(src)));
-                    }
-                } while (src < end);
-
-                count = count & 3;
-                while (count-- > 0) {
-                    srcover_srgb_srgb_1(dst++, *src++);
-                }
-            }
-        }
-    #endif
-#else
-
-    void srcover_srgb_srgb(
-        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
-        trivial_srcover_srgb_srgb(dst, src, ndst, nsrc);
-    }
-
+
 #endif
 } // namespace SK_OPTS_NS
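The replacement per-pixel routine in this patch treats the sRGB transfer curve as a plain gamma of 2.0: it squares the color bytes into an approximately linear space, applies SrcOver (result = s + d * (1 - sa/255)), and takes a square root on the way back out, while alpha is blended without the gamma step. For reference only (not part of the patch), here is a scalar C++ sketch of that same math, with a hypothetical function name and assuming premultiplied 8888 pixels whose alpha sits in the top byte:

#include <cmath>
#include <cstdint>

// Scalar reference sketch (illustrative, not in the patch) of srcover_srgb_srgb_1:
// square the color bytes to approximate sRGB -> linear, SrcOver, then sqrt back.
// Assumes premultiplied 8888 pixels with alpha in bits 24-31.
static uint32_t srcover_gamma2_sketch(uint32_t dst, uint32_t src) {
    uint32_t sa = src >> 24;
    if (sa == 0x00) { return dst; }   // fully transparent source: dst unchanged
    if (sa == 0xff) { return src; }   // fully opaque source: dst replaced

    float invA = 1.0f - sa * (1.0f / 255.0f);
    uint32_t out = 0;
    for (int c = 0; c < 4; c++) {
        float s = float((src >> (8 * c)) & 0xff),
              d = float((dst >> (8 * c)) & 0xff);
        float r = (c == 3)
                ? s + d * invA                      // alpha: plain SrcOver
                : std::sqrt(s * s + d * d * invA);  // color: SrcOver in squared space
        out |= uint32_t(r) << (8 * c);              // truncate back to a byte (rounding
                                                    // behavior is an assumption here)
    }
    return out;
}

With premultiplied inputs each channel result stays within 0..255, so no extra clamp is needed; the SkNx version relies on the same property.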
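The new outer srcover_srgb_srgb keeps the contract of the removed loop helpers: src points at nsrc pixels that are reused until ndst destination pixels have been blended, so a short source span tiles across a longer destination row. A hypothetical call site, only to illustrate that tiling (array names, values, and sizes are invented, and it presumes this header is included with SK_OPTS_NS defined, as in Skia's opts setup):

#include <cstdint>

void tile_example() {
    // Four premultiplied 8888 source pixels (channel layout per the build's SkPMColor order).
    uint32_t pattern[4] = { 0x80402010, 0x80402010, 0x80402010, 0x80402010 };
    uint32_t row[256]   = { 0 };

    // Blends pattern[0..3] over row[0..3], then over row[4..7], and so on,
    // until all 256 destination pixels have been written.
    SK_OPTS_NS::srcover_srgb_srgb(row, pattern, /*ndst=*/256, /*nsrc=*/4);
}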