| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 /* | 8 /* |
| 9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q | 9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q |
| 10 */ | 10 */ |
| 11 | 11 |
| 12 #ifndef SkBlend_opts_DEFINED | 12 #ifndef SkBlend_opts_DEFINED |
| 13 #define SkBlend_opts_DEFINED | 13 #define SkBlend_opts_DEFINED |
| 14 | 14 |
| 15 #include "SkNx.h" | 15 #include "SkNx.h" |
| 16 #include "SkPM4fPriv.h" | 16 #include "SkPM4fPriv.h" |
| 17 | 17 |
| 18 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 18 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 19 #include <immintrin.h> | 19 #include <immintrin.h> |
| 20 #endif | 20 #endif |
| 21 | 21 |
| 22 namespace SK_OPTS_NS { | 22 namespace SK_OPTS_NS { |
| 23 | 23 |
| 24 // An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the | 24 static inline void srcover_srgb8888_srgb_1(uint32_t* dst, const uint32_t pixel) { |
| 25 // observation that the 255's cancel. | |
| 26 // invA = 1 - (As / 255); | |
| 27 // | |
| 28 // R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) | |
| 29 // => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2) | |
| 30 // => R = sqrt(Rs^2 + Rd^2 * invA) | |
| 31 static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { | |
| 32 Sk4f s = srgb_to_linear(to_4f(pixel)); | |
| 33 Sk4f d = srgb_to_linear(to_4f(*dst)); | |
| 34 Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f); | |
| 35 Sk4f r = linear_to_srgb(s + d * invAlpha) + 0.5f; | |
| 36 *dst = to_4b(r); | |
| 37 } | |
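Aside: the old-side comment's derivation, restated as a scalar sketch. This assumes the gamma-2.0 approximation of sRGB and premultiplied inputs; srcover_channel_gamma2 is a hypothetical helper, not part of this file.

    #include <math.h>
    #include <stdint.h>

    // One premultiplied color channel of SrcOver under linear ~= (v/255)^2:
    // 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) simplifies to
    // sqrt(Rs^2 + Rd^2 * invA) because the 255's cancel.
    static inline uint8_t srcover_channel_gamma2(uint8_t Rs, uint8_t Rd, uint8_t As) {
        float invA = 1.0f - As * (1.0f / 255.0f);
        return (uint8_t)(sqrtf((float)Rs * Rs + (float)Rd * Rd * invA) + 0.5f);
    }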
| 38 | |
| 39 static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { | |
| 40 if ((~pixel & 0xFF000000) == 0) { | 25 if ((~pixel & 0xFF000000) == 0) { |
| 41 *dst = pixel; | 26 *dst = pixel; |
| 42 } else if ((pixel & 0xFF000000) != 0) { | 27 } else if ((pixel & 0xFF000000) != 0) { |
| 43 blend_srgb_srgb_1(dst, pixel); | 28 srcover_blend_srgb8888_srgb_1(dst, srgb_to_linear(to_4f(pixel))); |
| 44 } | 29 } |
| 45 } | 30 } |
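Aside: the two mask tests above, spelled out with illustrative pixel values (alpha lives in the top byte):

    // pixel = 0xFFxxxxxx: (~pixel & 0xFF000000) == 0 -> source opaque, copy it over
    // pixel = 0x80xxxxxx: ( pixel & 0xFF000000) != 0 -> translucent, take the blend path
    // pixel = 0x00xxxxxx: neither test fires         -> source transparent, dst left alone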
| 46 | 31 |
| 47 static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) { | |
| 48 srcover_srgb_srgb_1(dst++, *src++); | |
| 49 srcover_srgb_srgb_1(dst, *src); | |
| 50 } | |
| 51 | |
| 52 static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { | 32 static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { |
| 53 srcover_srgb_srgb_1(dst++, *src++); | 33 srcover_srgb8888_srgb_1(dst++, *src++); |
| 54 srcover_srgb_srgb_1(dst++, *src++); | 34 srcover_srgb8888_srgb_1(dst++, *src++); |
| 55 srcover_srgb_srgb_1(dst++, *src++); | 35 srcover_srgb8888_srgb_1(dst++, *src++); |
| 56 srcover_srgb_srgb_1(dst, *src); | 36 srcover_srgb8888_srgb_1(dst, *src); |
| 57 } | |
| 58 | |
| 59 void best_non_simd_srcover_srgb_srgb( | |
| 60 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
| 61 uint64_t* ddst = reinterpret_cast<uint64_t*>(dst); | |
| 62 | |
| 63 while (ndst > 0) { | |
| 64 int count = SkTMin(ndst, nsrc); | |
| 65 ndst -= count; | |
| 66 const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src); | |
| 67 const uint64_t* end = dsrc + (count >> 1); | |
| 68 do { | |
| 69 if ((~*dsrc & 0xFF000000FF000000) == 0) { | |
| 70 do { | |
| 71 *ddst++ = *dsrc++; | |
| 72 } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0); | |
| 73 } else if ((*dsrc & 0xFF000000FF000000) == 0) { | |
| 74 do { | |
| 75 dsrc++; | |
| 76 ddst++; | |
| 77 } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0); | |
| 78 } else { | |
| 79 srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++), | |
| 80 reinterpret_cast<const uint32_t*>(dsrc++)); | |
| 81 } | |
| 82 } while (dsrc < end); | |
| 83 | |
| 84 if ((count & 1) != 0) { | |
| 85 srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst), | |
| 86 *reinterpret_cast<const uint32_t*>(dsrc)); | |
| 87 } | |
| 88 } | |
| 89 } | |
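Aside: the 64-bit loads above classify two pixels at a time; the mask logic, spelled out (alpha sits in the top byte of each 32-bit half):

    // (~v & 0xFF000000FF000000) == 0 -> both alphas 0xFF: copy the opaque run
    // ( v & 0xFF000000FF000000) == 0 -> both alphas 0x00: skip the transparent run
    // otherwise                      -> mixed pair: srcover_srgb_srgb_2 blends it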
| 90 | |
| 91 void brute_force_srcover_srgb_srgb( | |
| 92 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
| 93 while (ndst > 0) { | |
| 94 int n = SkTMin(ndst, nsrc); | |
| 95 | |
| 96 for (int i = 0; i < n; i++) { | |
| 97 blend_srgb_srgb_1(dst++, src[i]); | |
| 98 } | |
| 99 ndst -= n; | |
| 100 } | |
| 101 } | |
| 102 | |
| 103 void trivial_srcover_srgb_srgb( | |
| 104 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
| 105 while (ndst > 0) { | |
| 106 int n = SkTMin(ndst, nsrc); | |
| 107 | |
| 108 for (int i = 0; i < n; i++) { | |
| 109 srcover_srgb_srgb_1(dst++, src[i]); | |
| 110 } | |
| 111 ndst -= n; | |
| 112 } | |
| 113 } | 37 } |
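Aside: all three loops tile the source, replaying the src strip from its start whenever ndst exceeds nsrc. A usage sketch with hypothetical buffers (assumed initialized before the call):

    uint32_t dstRow[256];   // destination pixels
    uint32_t srcStrip[64];  // repeating source strip
    // ... fill dstRow and srcStrip ...
    trivial_srcover_srgb_srgb(dstRow, srcStrip, 256, 64);  // strip applied 4 times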
| 114 | 38 |
| 115 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 39 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 116 | 40 |
| 117 static inline __m128i load(const uint32_t* p) { | 41 static inline __m128i load(const uint32_t* p) { |
| 118 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); | 42 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
| 119 } | 43 } |
| 120 | 44 |
| 121 static inline void store(uint32_t* p, __m128i v) { | 45 static inline void store(uint32_t* p, __m128i v) { |
| 122 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); | 46 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); |
| (...skipping 33 matching lines...) |
| 156 srcover_srgb_srgb_4(dst, dst + delta); | 80 srcover_srgb_srgb_4(dst, dst + delta); |
| 157 dst += 4; | 81 dst += 4; |
| 158 } while (dst < end | 82 } while (dst < end |
| 159          && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask)); | 83          && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask)); |
| 160 src += dst - start; | 84 src += dst - start; |
| 161 } | 85 } |
| 162 } | 86 } |
| 163 | 87 |
| 164 count = count & 3; | 88 count = count & 3; |
| 165 while (count-- > 0) { | 89 while (count-- > 0) { |
| 166 srcover_srgb_srgb_1(dst++, *src++); | 90 srcover_srgb8888_srgb_1(dst++, *src++); |
| 167 } | 91 } |
| 168 } | 92 } |
| 169 } | 93 } |
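Aside: the loop condition above leans on SSE4.1 PTEST; a hedged sketch of what it computes (check_mixed_alphas is a hypothetical name, not from this file):

    #include <smmintrin.h>

    // Nonzero iff (pixels & mask) != 0 AND (~pixels & mask) != 0, i.e. the four
    // alpha bytes are neither all 0x00 nor all 0xFF, so the group still needs
    // the general blend path.
    static inline int check_mixed_alphas(__m128i pixels) {
        const __m128i alphaMask = _mm_set1_epi32((int)0xFF000000);
        return _mm_testnzc_si128(pixels, alphaMask);
    }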
| 170 #else | 94 #else |
| 171 // SSE2 versions | 95 // SSE2 versions |
| 172 | 96 |
| 173 // Note: In the next three comparisons a group of 4 pixels is converted to a group of | 97 // Note: In the next three comparisons a group of 4 pixels is converted to a group of |
| 174 // "signed" pixels because SSE2 does not have an unsigned comparison. | 98 // "signed" pixels because SSE2 does not have an unsigned comparison. |
| 175 // Make it so that we can use the signed comparison operators by biasing | 99 // Make it so that we can use the signed comparison operators by biasing |
| 176 // 0x00xxxxxx to 0x80xxxxxx, which is the smallest value, and biasing 0xffxxxxxx to | 100 // 0x00xxxxxx to 0x80xxxxxx, which is the smallest value, and biasing 0xffxxxxxx to |
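Aside: a minimal sketch of the bias trick this comment describes (as_signed_pixels is a hypothetical name; SSE2 intrinsics only):

    #include <emmintrin.h>

    // Flip the sign bit of each 32-bit pixel: 0x00xxxxxx -> 0x80xxxxxx (most
    // negative) and 0xFFxxxxxx -> 0x7Fxxxxxx (most positive), so SSE2's signed
    // compares order pixels by their unsigned alpha byte.
    static inline __m128i as_signed_pixels(__m128i pixels) {
        return _mm_xor_si128(pixels, _mm_set1_epi32((int)0x80000000));
    }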
| (...skipping 51 matching lines...) |
| 228 do { | 152 do { |
| 229 srcover_srgb_srgb_4(dst, dst + delta); | 153 srcover_srgb_srgb_4(dst, dst + delta); |
| 230 dst += 4; | 154 dst += 4; |
| 231             } while (dst < end && check_partial_alphas(pixels = load(dst + delta))); | 155             } while (dst < end && check_partial_alphas(pixels = load(dst + delta))); |
| 232 src += dst - start; | 156 src += dst - start; |
| 233 } | 157 } |
| 234 } while (dst < end); | 158 } while (dst < end); |
| 235 | 159 |
| 236 count = count & 3; | 160 count = count & 3; |
| 237 while (count-- > 0) { | 161 while (count-- > 0) { |
| 238 srcover_srgb_srgb_1(dst++, *src++); | 162 srcover_srgb8888_srgb_1(dst++, *src++); |
| 239 } | 163 } |
| 240 } | 164 } |
| 241 } | 165 } |
| 242 #endif | 166 #endif |
| 243 #else | 167 #else |
| 244 | 168 |
| 245 void srcover_srgb_srgb( | 169 void srcover_srgb_srgb( |
| 246 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | 170 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
| 247 trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); | 171 while (ndst > 0) { |
| 172 int n = SkTMin(ndst, nsrc); |
| 173 |
| 174 for (int i = 0; i < n; i++) { |
| 175 srcover_srgb8888_srgb_1(dst++, src[i]); |
| 176 } |
| 177 ndst -= n; |
| 178 } |
| 248 } | 179 } |
| 249 | 180 |
| 250 #endif | 181 #endif |
| 251 | 182 |
| 252 } // namespace SK_OPTS_NS | 183 } // namespace SK_OPTS_NS |
| 253 | 184 |
| 254 #endif // SkBlend_opts_DEFINED | 185 #endif // SkBlend_opts_DEFINED |