Chromium Code Reviews

Side by Side Diff: src/opts/SkBlend_opts.h

Issue 1939513002: Add specialized sRGB blitter for SkOpts (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Sync and remove unneeded. Created 4 years, 7 months ago
/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/*
    ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
*/

#ifndef SkBlend_opts_DEFINED
#define SkBlend_opts_DEFINED

#include "SkNx.h"

namespace SK_OPTS_NS {

// Fast but approximate implementation of sRGB gamma to linear.
static inline Sk4f sRGB_to_linear(Sk4f pixel) {
f(malita) 2016/05/06 17:43:56 Same as SkPM4fPriv.h:srgb_to_linear() - any reason
herb_g 2016/05/06 20:57:45 Done.
    Sk4f l = pixel * pixel;
    return Sk4f{l[0], l[1], l[2], pixel[3]};
}

// Fast but approximate implementation of linear to sRGB gamma.
static inline Sk4f linear_to_sRGB(Sk4f pixel) {
f(malita) 2016/05/06 17:43:56 Same as SkPM4fPriv.h:linear_to_srgb().
herb_g 2016/05/06 20:57:45 Done.
    Sk4f s = pixel.sqrt();
    return Sk4f{s[0], s[1], s[2], pixel[3]};
}

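Editor's note (not part of the patch): the two helpers above approximate the sRGB transfer curve with a plain gamma of 2.0 — square to linearize, square-root to re-encode — and leave alpha untouched. For comparison, the exact per-channel sRGB conversion is the standard piecewise function; a minimal sketch, with hypothetical helper names, operating on a single channel in [0, 1]:

#include <cmath>

static inline float exact_srgb_to_linear(float s) {
    return s <= 0.04045f ? s / 12.92f
                         : std::pow((s + 0.055f) / 1.055f, 2.4f);
}

static inline float exact_linear_to_srgb(float l) {
    return l <= 0.0031308f ? 12.92f * l
                           : 1.055f * std::pow(l, 1.0f / 2.4f) - 0.055f;
}
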
// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the
// observation that the 255's cancel.
//     invA = 1 - (As / 255);
//
//     R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)
//  => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2)
//  => R = sqrt(Rs^2 + Rd^2 * invA)
static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
    Sk4f s = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(&pixel)));
    Sk4f d = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(dst)));
    Sk4f invAlpha = 1.0f - Sk4f{s[3]} * (1.0f / 255.0f);
    Sk4f r = linear_to_sRGB(s + d * invAlpha);
    SkNx_cast<uint8_t>(r).store(dst);
f(malita) 2016/05/06 17:43:56 Can we use the SkPM4fPriv.h helpers? to_4f(), to_
herb_g 2016/05/06 20:57:45 Done.
}

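Editor's note: written out, the cancellation the comment above relies on is (channels stay in byte scale 0-255 throughout, so nothing ever needs to be normalized to [0, 1]):

$$
R \;=\; 255\sqrt{\left(\tfrac{R_s}{255}\right)^{2} + \left(\tfrac{R_d}{255}\right)^{2}\left(1 - \tfrac{A_s}{255}\right)}
  \;=\; \tfrac{255}{255}\sqrt{R_s^{2} + R_d^{2}\,\mathrm{invA}}
  \;=\; \sqrt{R_s^{2} + R_d^{2}\,\mathrm{invA}},
\qquad \mathrm{invA} = 1 - \tfrac{A_s}{255}.
$$
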
static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
    if ((~pixel & 0xFF000000) == 0) {
        *dst = pixel;
    } else if ((pixel & 0xFF000000) != 0) {
        blend_srgb_srgb_1(dst, pixel);
    }
f(malita) 2016/05/06 17:43:56 Nit: I would use more color macros here for readability
herb_g 2016/05/06 20:57:45 I started with code similar to what you suggest, b
}

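Editor's sketch (hypothetical, not part of the patch): the two bit tests above inspect the alpha byte in place — (~pixel & 0xFF000000) == 0 means alpha is 0xFF, and (pixel & 0xFF000000) != 0 means alpha is nonzero. An equivalent formulation with an explicit alpha extraction, along the lines of the macro-based version discussed in the review comments, would be:

static inline void srcover_srgb_srgb_1_alt(uint32_t* dst, const uint32_t pixel) {
    const uint32_t alpha = pixel >> 24;   // 8888 pixel with alpha in the top byte
    if (alpha == 0xFF) {
        *dst = pixel;                     // opaque source: plain copy
    } else if (alpha != 0x00) {
        blend_srgb_srgb_1(dst, pixel);    // translucent source: full blend
    }
    // fully transparent source: leave dst untouched
}
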
static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) {
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst, *src);
}

static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst, *src);
}

void best_non_simd_srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    uint64_t* ddst = reinterpret_cast<uint64_t*>(dst);

    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src);
        const uint64_t* end = dsrc + (count >> 1);
        do {
            if ((~*dsrc & 0xFF000000FF000000) == 0) {
                do {
                    *ddst++ = *dsrc++;
                } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0);
            } else if ((*dsrc & 0xFF000000FF000000) == 0) {
                do {
                    dsrc++;
                    ddst++;
                } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0);
            } else {
                srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++),
                                    reinterpret_cast<const uint32_t*>(dsrc++));
            }
        } while (dsrc < end);

        if ((count & 1) != 0) {
            srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst),
                                *reinterpret_cast<const uint32_t*>(dsrc));
        }
    }
}

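Editor's note: the loop above walks two pixels at a time through a uint64_t view of the buffers, so a single mask test classifies a pair of alpha bytes at once. A minimal sketch of the two predicates it uses, with hypothetical helper names:

static inline bool both_opaque(uint64_t two_pixels) {
    // Both alpha bytes are 0xFF, so the pair can be copied straight through.
    return (~two_pixels & 0xFF000000FF000000ULL) == 0;
}

static inline bool both_transparent(uint64_t two_pixels) {
    // Both alpha bytes are 0x00, so the pair leaves the destination unchanged.
    return (two_pixels & 0xFF000000FF000000ULL) == 0;
}
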
void brute_force_srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    while (ndst > 0) {
        int n = SkTMin(ndst, nsrc);

        for (int i = 0; i < n; i++) {
            blend_srgb_srgb_1(dst++, src[i]);
        }
        ndst -= n;
    }
}

void trivial_srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    while (ndst > 0) {
        int n = SkTMin(ndst, nsrc);

        for (int i = 0; i < n; i++) {
            srcover_srgb_srgb_1(dst++, src[i]);
        }
        ndst -= n;
    }
}

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

static inline __m128i load(const uint32_t* p) {
    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
}

static inline void store(uint32_t* p, __m128i v) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
}

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint32_t* src = srcStart;
        const uint32_t* end = src + (count & ~3);

        while (src < end) {
            __m128i pixels = load(src);
            if (_mm_testc_si128(pixels, alphaMask)) {
                do {
                    store(dst, pixels);
                    dst += 4;
                    src += 4;
                } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask));
            } else if (_mm_testz_si128(pixels, alphaMask)) {
                do {
                    dst += 4;
                    src += 4;
                } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask));
            } else {
                do {
                    srcover_srgb_srgb_4(dst, src);
                    dst += 4;
                    src += 4;
                } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask));
            }
        }

        count = count & 3;
        while (count-- > 0) {
            srcover_srgb_srgb_1(dst++, *src++);
        }
    }
}
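Editor's note: the SSE4.1 branch classifies four pixels per PTEST. _mm_testc_si128(pixels, alphaMask) is nonzero when (~pixels & alphaMask) == 0, i.e. all four alpha bytes are 0xFF; _mm_testz_si128(pixels, alphaMask) is nonzero when (pixels & alphaMask) == 0, i.e. all four are 0x00; _mm_testnzc_si128 covers the mixed case. A scalar sketch of the first two predicates, with hypothetical helper names:

static inline bool all_opaque_scalar(const uint32_t px[4]) {
    // Equivalent to _mm_testc_si128(pixels, alphaMask) != 0: every alpha byte is 0xFF.
    return ((px[0] & px[1] & px[2] & px[3]) & 0xFF000000u) == 0xFF000000u;
}

static inline bool all_transparent_scalar(const uint32_t px[4]) {
    // Equivalent to _mm_testz_si128(pixels, alphaMask) != 0: every alpha byte is 0x00.
    return ((px[0] | px[1] | px[2] | px[3]) & 0xFF000000u) == 0;
}
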
#else
// SSE2 versions
static inline bool check_opaque_alphas(__m128i pixels) {
    int mask =
        _mm_movemask_epi8(
            _mm_cmpeq_epi32(
                _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)),
                _mm_setzero_si128()));
    return mask == 0xFFFF;
}

static inline bool check_transparent_alphas(__m128i pixels) {
    int mask =
        _mm_movemask_epi8(
            _mm_cmpeq_epi32(
                _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)),
                _mm_setzero_si128()));
    return mask == 0xFFFF;
}

static inline bool check_partial_alphas(__m128i pixels) {
    __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000));
    int mask =
        _mm_movemask_epi8(
            _mm_cmpeq_epi8(
                _mm_srai_epi32(alphas, 8),
                alphas));
    return mask == 0xFFFF;
}

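Editor's note: without PTEST, the SSE2 helpers emulate the whole-register tests with a compare plus movemask: each lane that satisfies the condition compares equal (all-ones) and contributes four set bits to the 16-bit movemask, so a result of 0xFFFF means all four lanes passed. A scalar sketch of check_opaque_alphas, with a hypothetical helper name:

static inline bool check_opaque_alphas_scalar(const uint32_t px[4]) {
    for (int i = 0; i < 4; i++) {
        if ((~px[i] & 0xFF000000u) != 0) {   // this lane's alpha byte is not 0xFF
            return false;
        }
    }
    return true;
}
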
void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint32_t* src = srcStart;
        const uint32_t* end = src + (count & ~3);

        __m128i pixels = load(src);
        do {
            if (check_opaque_alphas(pixels)) {
                do {
                    store(dst, pixels);
                    dst += 4;
                    src += 4;
                } while (src < end && check_opaque_alphas(pixels = load(src)));
            } else if (check_transparent_alphas(pixels)) {
                const uint32_t* start = src;
                do {
                    src += 4;
                } while (src < end && check_transparent_alphas(pixels = load(src)));
                dst += src - start;
            } else {
                do {
                    srcover_srgb_srgb_4(dst, src);
                    dst += 4;
                    src += 4;
                } while (src < end && check_partial_alphas(pixels = load(src)));
            }
        } while (src < end);

        count = count & 3;
        while (count-- > 0) {
            srcover_srgb_srgb_1(dst++, *src++);
        }
    }
}
#endif
#else

void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    trivial_srcover_srgb_srgb(dst, src, ndst, nsrc);
}

#endif

}  // namespace SK_OPTS_NS

#endif  // SkBlend_opts_DEFINED

OLD (removed by this patch): the non-SSE branch previously carried its own scalar implementation, guarded by "#if 0 ... #else ... #endif":

static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
    switch (src >> 24) {
        case 0x00: return;
        case 0xff: *dst = src; return;
    }

    Sk4f d = SkNx_cast<float>(Sk4b::Load(dst)),
         s = SkNx_cast<float>(Sk4b::Load(&src));

    // Approximate sRGB gamma as 2.0.
    Sk4f d_sq = d*d,
         s_sq = s*s;
    d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]};
    s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]};

    // SrcOver.
    Sk4f invA = 1.0f - s[3]*(1/255.0f);
    d = s + d * invA;

    // Re-apply approximate sRGB gamma.
    Sk4f d_sqrt = d.sqrt();
    d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]};

    SkNx_cast<uint8_t>(d).store(dst);
}

static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    while (ndst > 0) {
        int n = SkTMin(ndst, nsrc);

        for (int i = 0; i < n; i++) {
            srcover_srgb_srgb_1(dst++, src[i]);
        }
        ndst -= n;
    }
}
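Editor's sketch (hypothetical, not part of the patch): how the new srcover_srgb_srgb entry point is driven. ndst is the number of destination pixels to write and nsrc the length of the source span, which the loops above repeat across the destination when ndst is larger:

#include <cstdint>

void example_blit() {
    uint32_t dst[8] = {0};                   // destination: sRGB 8888 pixels
    const uint32_t src[4] = {0xFF336699,     // opaque source pixel
                             0x80FFFFFF,     // half-covered white
                             0x00000000,     // fully transparent
                             0xFF000000};    // opaque black
    // Blit 8 destination pixels from a 4-pixel source span (the span repeats).
    SK_OPTS_NS::srcover_srgb_srgb(dst, src, 8, 4);
}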
