Chromium Code Reviews

Side by Side Diff: src/core/SkXfermode4f.cpp

Issue 1813263002: custom ssse3 srcover_n_srgb_bw, about 1.8x faster (Closed)
Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: undo Created 4 years, 9 months ago
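For context, the new fast path in this patch specializes the plain src-over blend (no per-pixel coverage) for sRGB destinations, handling four pixels per loop iteration with SSSE3. The following scalar sketch shows the per-channel math the SIMD loop computes, assuming premultiplied linear-float source values and the same gamma-as-2 approximation the patch uses; the helper name is illustrative, not Skia API:

    #include <cstdint>
    #include <cmath>

    // Illustrative only: one color channel of src-over onto an sRGB-encoded byte.
    //   s  = premultiplied linear source channel in [0,1]
    //   sa = source alpha in [0,1]
    //   d8 = sRGB-encoded destination byte
    static uint8_t srcover_srgb_channel(float s, float sa, uint8_t d8) {
        float d = d8 * (1 / 255.0f);
        d = d * d;                                        // approx sRGB -> linear (gamma ~= 2)
        float r = s + d * (1.0f - sa);                    // src-over with premultiplied src
        return (uint8_t)(std::sqrt(r) * 255.0f + 0.5f);   // linear -> approx sRGB, round
    }

The alpha channel skips the square/square-root steps, since alpha is stored linearly; the SIMD code below does the rounding with _mm_cvtps_epi32 rather than an explicit +0.5.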
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkPM4fPriv.h" 8 #include "SkPM4fPriv.h"
9 #include "SkUtils.h" 9 #include "SkUtils.h"
10 #include "SkXfermode.h" 10 #include "SkXfermode.h"
(...skipping 160 matching lines...)
171 if (D == kLinear_Dst) { 171 if (D == kLinear_Dst) {
172 // operate in bias-255 space for src and dst 172 // operate in bias-255 space for src and dst
173 const Sk4f& s4_255 = s4 * Sk4f(255); 173 const Sk4f& s4_255 = s4 * Sk4f(255);
174 while (count >= 4) { 174 while (count >= 4) {
175 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f); 175 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f);
176 Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f); 176 Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f);
177 Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f); 177 Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f);
178 Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f); 178 Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f);
179 Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f); 179 Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f);
180 Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3); 180 Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3);
181 181
182 dst += 4; 182 dst += 4;
183 aa += 4; 183 aa += 4;
184 count -= 4; 184 count -= 4;
185 } 185 }
186 } else { // kSRGB 186 } else { // kSRGB
187 while (count >= 4) { 187 while (count >= 4) {
188 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f); 188 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f);
189 189
190 /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps 190 /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps
191 * it would be faster (and possibly allow more code sharing with kLinear) to 191 * it would be faster (and possibly allow more code sharing with kLinear) to
192 * stay in that space. 192 * stay in that space.
193 */ 193 */
194 Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4[0])); 194 Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4[0]));
195 Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4[1])); 195 Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4[1]));
196 Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4[2])); 196 Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4[2]));
197 Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4[3])); 197 Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4[3]));
198 Sk4f_ToBytes((uint8_t*)dst, 198 Sk4f_ToBytes((uint8_t*)dst,
199 linear_unit_to_srgb_255f(r0), 199 linear_unit_to_srgb_255f(r0),
200 linear_unit_to_srgb_255f(r1), 200 linear_unit_to_srgb_255f(r1),
201 linear_unit_to_srgb_255f(r2), 201 linear_unit_to_srgb_255f(r2),
202 linear_unit_to_srgb_255f(r3)); 202 linear_unit_to_srgb_255f(r3));
203 203
204 dst += 4; 204 dst += 4;
205 aa += 4; 205 aa += 4;
206 count -= 4; 206 count -= 4;
207 } 207 }
208 } 208 }
209 for (int i = 0; i < count; ++i) { 209 for (int i = 0; i < count; ++i) {
210 unsigned a = aa[i]; 210 unsigned a = aa[i];
211 Sk4f d4 = load_dst<D>(dst[i]); 211 Sk4f d4 = load_dst<D>(dst[i]);
212 dst[i] = store_dst<D>(lerp(s4, d4, a)); 212 dst[i] = store_dst<D>(lerp(s4, d4, a));
213 } 213 }
(...skipping 12 matching lines...)
226 /////////////////////////////////////////////////////////////////////////////////////////////////// 226 ///////////////////////////////////////////////////////////////////////////////////////////////////
227 227
228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} 228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {}
229 229
230 const SkXfermode::D32Proc gProcs_Dst[] = { 230 const SkXfermode::D32Proc gProcs_Dst[] = {
231 dst, dst, dst, dst, dst, dst, dst, dst, 231 dst, dst, dst, dst, dst, dst, dst, dst,
232 }; 232 };
233 233
234 /////////////////////////////////////////////////////////////////////////////////////////////////// 234 ///////////////////////////////////////////////////////////////////////////////////////////////////
235 235
236
237 static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) {
238 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // For _mm_shuffle_epi8
239 while (count >= 4) {
240 // Load 4 sRGB RGBA/BGRA 8888 dst pixels.
241 // We'll write most of this as if they're RGBA, and just swizzle the src pixels to match.
242 __m128i d4 = _mm_loadu_si128((const __m128i*)dst);
243
244 // Transpose into planar and convert each plane to float.
245 auto _ = ~0; // Shuffles in a zero byte.
246 auto dr = _mm_cvtepi32_ps(
247 _mm_shuffle_epi8(d4, _mm_setr_epi8(0,_,_,_, 4,_,_,_, 8,_,_,_,12,_,_,_)));
248 auto dg = _mm_cvtepi32_ps(
249 _mm_shuffle_epi8(d4, _mm_setr_epi8(1,_,_,_, 5,_,_,_, 9,_,_,_,13,_,_,_)));
250 auto db = _mm_cvtepi32_ps(
251 _mm_shuffle_epi8(d4, _mm_setr_epi8(2,_,_,_, 6,_,_,_,10,_,_,_,14,_,_,_)));
252 auto da = _mm_cvtepi32_ps(
253 _mm_shuffle_epi8(d4, _mm_setr_epi8(3,_,_,_, 7,_,_,_,11,_,_,_,15,_,_,_)));
254
255 // Scale to [0,1].
256 dr = _mm_mul_ps(dr, _mm_set1_ps(1/255.0f));
257 dg = _mm_mul_ps(dg, _mm_set1_ps(1/255.0f));
258 db = _mm_mul_ps(db, _mm_set1_ps(1/255.0f));
259 da = _mm_mul_ps(da, _mm_set1_ps(1/255.0f));
260
261 // Apply approximate sRGB gamma correction to convert to linear (as if gamma were 2).
262 dr = _mm_mul_ps(dr, dr);
263 dg = _mm_mul_ps(dg, dg);
264 db = _mm_mul_ps(db, db);
265
266 // Load 4 linear float src pixels.
267 auto s0 = _mm_loadu_ps(src[0].fVec),
268 s1 = _mm_loadu_ps(src[1].fVec),
269 s2 = _mm_loadu_ps(src[2].fVec),
270 s3 = _mm_loadu_ps(src[3].fVec);
271
272 // Transpose src pixels to planar too, and give the registers better names.
273 _MM_TRANSPOSE4_PS(s0, s1, s2, s3);
274 auto sr = s0,
275 sg = s1,
276 sb = s2,
277 sa = s3;
278
279 // Match color order with destination, if necessary.
280 #if defined(SK_PMCOLOR_IS_BGRA)
281 SkTSwap(sr, sb);
282 #endif
283
284 // Now, the meat of what we wanted to do... perform the srcover blend.
285 auto invSA = _mm_sub_ps(_mm_set1_ps(1), sa);
286 auto r = _mm_add_ps(sr, _mm_mul_ps(dr, invSA)),
287 g = _mm_add_ps(sg, _mm_mul_ps(dg, invSA)),
288 b = _mm_add_ps(sb, _mm_mul_ps(db, invSA)),
289 a = _mm_add_ps(sa, _mm_mul_ps(da, invSA));
290
291 // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2.
292 r = _mm_mul_ps(_mm_sqrt_ps(r), _mm_set1_ps(255));
293 g = _mm_mul_ps(_mm_sqrt_ps(g), _mm_set1_ps(255));
294 b = _mm_mul_ps(_mm_sqrt_ps(b), _mm_set1_ps(255));
295 a = _mm_mul_ps( (a), _mm_set1_ps(255));
296
297 // Convert to int (with rounding) and pack back down to planar 8-bit.
298 __m128i x = _mm_packus_epi16(_mm_packus_epi16(_mm_cvtps_epi32(r), _mm_cvtps_epi32(g)),
299                              _mm_packus_epi16(_mm_cvtps_epi32(b), _mm_cvtps_epi32(a)));
300
301 // Transpose back to interlaced RGBA and write back to dst.
302 x = _mm_shuffle_epi8(x, _mm_setr_epi8(0, 4, 8, 12,
303 1, 5, 9, 13,
304 2, 6, 10, 14,
305 3, 7, 11, 15));
306 _mm_storeu_si128((__m128i*)dst, x);
307
308 count -= 4;
309 dst += 4;
310 src += 4;
311 }
312 #endif
313 // This should look just like the non-specialized case in srcover_n.
314 for (int i = 0; i < count; ++i) {
315 Sk4f s4 = src[i].to4f_pmorder();
316 Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]);
317 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
318 dst[i] = store_dst<kSRGB_Dst>(r4);
319 }
320 }
321
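As a rough sanity check of the gamma-as-2 shortcut in the function above (illustrative numbers, not taken from the CL): blending a premultiplied source channel s = 0.25 with source alpha sa = 0.5 over a destination byte of 128 gives d ≈ (128/255)² ≈ 0.252 in approximated linear space, r = 0.25 + 0.252 × (1 − 0.5) ≈ 0.376, and √0.376 × 255 ≈ 156 after re-encoding. Doing the same blend with the exact sRGB transfer curve would land at roughly 161, so the approximation costs a few levels of precision per channel in exchange for the roughly 1.8x speedup cited in the issue title.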
236 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], 322 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[],
237 const SkPM4f src[], int count, const SkAlpha aa[]) { 323 const SkPM4f src[], int count, const SkAlpha aa[]) {
238 if (aa) { 324 if (aa) {
239 for (int i = 0; i < count; ++i) { 325 for (int i = 0; i < count; ++i) {
240 unsigned a = aa[i]; 326 unsigned a = aa[i];
241 if (0 == a) { 327 if (0 == a) {
242 continue; 328 continue;
243 } 329 }
244 Sk4f s4 = src[i].to4f_pmorder(); 330 Sk4f s4 = src[i].to4f_pmorder();
245 Sk4f d4 = load_dst<D>(dst[i]); 331 Sk4f d4 = load_dst<D>(dst[i]);
246 if (a != 0xFF) { 332 if (a != 0xFF) {
247 s4 = scale_by_coverage(s4, a); 333 s4 = scale_by_coverage(s4, a);
248 } 334 }
249 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); 335 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
250 dst[i] = store_dst<D>(r4); 336 dst[i] = store_dst<D>(r4);
251 } 337 }
252 } else { 338 } else {
253 for (int i = 0; i < count; ++i) { 339 if (D == kSRGB_Dst) {
254 Sk4f s4 = src[i].to4f_pmorder(); 340 srcover_n_srgb_bw(dst, src, count);
255 Sk4f d4 = load_dst<D>(dst[i]); 341 } else {
256 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); 342 for (int i = 0; i < count; ++i) {
257 dst[i] = store_dst<D>(r4); 343 Sk4f s4 = src[i].to4f_pmorder();
344 Sk4f d4 = load_dst<D>(dst[i]);
345 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
346 dst[i] = store_dst<D>(r4);
347 }
258 } 348 }
259 } 349 }
260 } 350 }
261 351
262 static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[], 352 static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[],
263 const SkPM4f* src, int count, const SkAlpha aa[]) { 353 const SkPM4f* src, int count, const SkAlpha aa[]) {
264 const Sk4f s4 = src->to4f_pmorder(); 354 const Sk4f s4 = src->to4f_pmorder();
265 const Sk4f dst_scale = Sk4f(1 - get_alpha(s4)); 355 const Sk4f dst_scale = Sk4f(1 - get_alpha(s4));
266 356
267 if (aa) { 357 if (aa) {
268 for (int i = 0; i < count; ++i) { 358 for (int i = 0; i < count; ++i) {
269 unsigned a = aa[i]; 359 unsigned a = aa[i];
270 if (0 == a) { 360 if (0 == a) {
271 continue; 361 continue;
272 } 362 }
273 Sk4f d4 = Sk4f_fromL32(dst[i]); 363 Sk4f d4 = Sk4f_fromL32(dst[i]);
274 Sk4f r4; 364 Sk4f r4;
275 if (a != 0xFF) { 365 if (a != 0xFF) {
276 Sk4f s4_aa = scale_by_coverage(s4, a); 366 Sk4f s4_aa = scale_by_coverage(s4, a);
(...skipping 112 matching lines...)
389 Sk4i rgbi = Sk4i(SkGetPackedR16(rgb), SkGetPackedG16(rgb), SkGetPackedB16(rgb), 0); 479 Sk4i rgbi = Sk4i(SkGetPackedR16(rgb), SkGetPackedG16(rgb), SkGetPackedB16(rgb), 0);
390 #else 480 #else
391 Sk4i rgbi = Sk4i(SkGetPackedB16(rgb), SkGetPackedG16(rgb), SkGetPackedR16(rgb), 0); 481 Sk4i rgbi = Sk4i(SkGetPackedB16(rgb), SkGetPackedG16(rgb), SkGetPackedR16(rgb), 0);
392 #endif 482 #endif
393 return SkNx_cast<float>(rgbi) * Sk4f(1.0f/31, 1.0f/63, 1.0f/31, 0); 483 return SkNx_cast<float>(rgbi) * Sk4f(1.0f/31, 1.0f/63, 1.0f/31, 0);
394 } 484 }
395 485
396 template <DstType D> 486 template <DstType D>
397 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) { 487 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) {
398 const Sk4f s4 = Sk4f::Load(src->fVec); 488 const Sk4f s4 = Sk4f::Load(src->fVec);
399 489
400 if (D == kLinear_Dst) { 490 if (D == kLinear_Dst) {
401 // operate in bias-255 space for src and dst 491 // operate in bias-255 space for src and dst
402 const Sk4f s4bias = s4 * Sk4f(255); 492 const Sk4f s4bias = s4 * Sk4f(255);
403 for (int i = 0; i < count; ++i) { 493 for (int i = 0; i < count; ++i) {
404 uint16_t rgb = lcd[i]; 494 uint16_t rgb = lcd[i];
405 if (0 == rgb) { 495 if (0 == rgb) {
406 continue; 496 continue;
407 } 497 }
408 Sk4f d4bias = to_4f(dst[i]); 498 Sk4f d4bias = to_4f(dst[i]);
409 dst[i] = to_4b(lerp(s4bias, d4bias, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT); 499 dst[i] = to_4b(lerp(s4bias, d4bias, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT);
(...skipping 62 matching lines...)
472 562
473 const LCD32Proc procs[] = { 563 const LCD32Proc procs[] = {
474 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, 564 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>,
475 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, 565 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>,
476 566
477 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, 567 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>,
478 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, 568 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>,
479 }; 569 };
480 return procs[flags]; 570 return procs[flags];
481 } 571 }
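A standalone sketch of the SSSE3 deinterleave step used at the top of srcover_n_srgb_bw, in case it helps when reviewing the shuffle masks: _mm_shuffle_epi8 treats any index byte with its high bit set as "write a zero", so each mask below pulls one channel of four RGBA pixels into the low byte of each 32-bit lane, ready for _mm_cvtepi32_ps. This is only an illustration of the technique, not code from the CL:

    #include <immintrin.h>   // SSSE3 (_mm_shuffle_epi8) and SSE2 intrinsics
    #include <cstdint>

    // Illustrative only: split four interleaved RGBA8888 pixels into float planes.
    static void deinterleave_rgba(const uint32_t px[4],
                                  __m128* r, __m128* g, __m128* b, __m128* a) {
        __m128i v = _mm_loadu_si128((const __m128i*)px);
        const char z = ~0;  // any index with the high bit set shuffles in a zero byte
        *r = _mm_cvtepi32_ps(_mm_shuffle_epi8(v, _mm_setr_epi8(0,z,z,z, 4,z,z,z,  8,z,z,z, 12,z,z,z)));
        *g = _mm_cvtepi32_ps(_mm_shuffle_epi8(v, _mm_setr_epi8(1,z,z,z, 5,z,z,z,  9,z,z,z, 13,z,z,z)));
        *b = _mm_cvtepi32_ps(_mm_shuffle_epi8(v, _mm_setr_epi8(2,z,z,z, 6,z,z,z, 10,z,z,z, 14,z,z,z)));
        *a = _mm_cvtepi32_ps(_mm_shuffle_epi8(v, _mm_setr_epi8(3,z,z,z, 7,z,z,z, 11,z,z,z, 15,z,z,z)));
    }

Reversing the trick, two rounds of _mm_packus_epi16 followed by one more _mm_shuffle_epi8, is what the end of the loop in the patch does to get back to interleaved bytes.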