OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 /* | |
9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q | |
10 */ | |
11 | |
8 #ifndef SkBlend_opts_DEFINED | 12 #ifndef SkBlend_opts_DEFINED |
9 #define SkBlend_opts_DEFINED | 13 #define SkBlend_opts_DEFINED |
10 | 14 |
15 #include "SkNx.h" | |
16 | |
11 namespace SK_OPTS_NS { | 17 namespace SK_OPTS_NS { |
12 | 18 |
13 #if 0 | 19 // Fast but approximate implementation of sRGB gamma to linear. |
14 | 20 static inline Sk4f sRGB_to_linear(Sk4f pixel) { |
malita 2016/05/06 17:43:56: Same as SkPM4fPriv.h:srgb_to_linear() - any reason ...
herb_g 2016/05/06 20:57:45: Done.
| |
21 Sk4f l = pixel * pixel; | |
22 return Sk4f{l[0], l[1], l[2], pixel[3]}; | |
23 } | |
24 | |
25 // Fast but approximate implementation of linear to sRGB gamma. | |
26 static inline Sk4f linear_to_sRGB(Sk4f pixel) { | |
malita 2016/05/06 17:43:56: Same as SkPM4fPriv.h:linear_to_srgb().
herb_g 2016/05/06 20:57:45: Done.
| |
27 Sk4f s = pixel.sqrt(); | |
28 return Sk4f{s[0], s[1], s[2], pixel[3]}; | |
29 } | |
30 | |
31 // An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the | |
32 // observation that the 255's cancel. | |
33 // invA = 1 - (As / 255); | |
34 // | |
35 // R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) | |
36 // => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2) | |
37 // => R = sqrt(Rs^2 + Rd^2 * invA) | |
38 static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { | |
39 Sk4f s = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(&pixel))); | |
40 Sk4f d = sRGB_to_linear(SkNx_cast<float>(Sk4b::Load(dst))); | |
41 Sk4f invAlpha = 1.0f - Sk4f{s[3]} * (1.0f / 255.0f); | |
42 Sk4f r = linear_to_sRGB(s + d * invAlpha); | |
43 SkNx_cast<uint8_t>(r).store(dst); | |
malita 2016/05/06 17:43:56: Can we use the SkPM4fPriv.h helpers? to_4f(), to_...
herb_g 2016/05/06 20:57:45: Done.
| |
44 } | |
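To spell out the cancellation that the comment above relies on, here is a hedged scalar reference for a single color channel, bytes in and bytes out (ref_srcover_channel is a hypothetical name, not part of the CL):

    #include <cmath>

    // Normalized form:  R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)
    // Pulling the 1/255^2 out of the sqrt as 1/255 cancels the leading 255:
    //                   R = sqrt(Rs^2 + Rd^2 * invA)
    static float ref_srcover_channel(float Rs, float Rd, float As) {
        float invA = 1.0f - As * (1.0f / 255.0f);
        return std::sqrt(Rs * Rs + Rd * Rd * invA);
    }

blend_srgb_srgb_1 does the same thing for all three color channels at once with Sk4f; the alpha lane skips the square/sqrt and is blended the usual way as As + Ad * invA.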
45 | |
46 static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { | |
47 if ((~pixel & 0xFF000000) == 0) { | |
48 *dst = pixel; | |
49 } else if ((pixel & 0xFF000000) != 0) { | |
50 blend_srgb_srgb_1(dst, pixel); | |
51 } | |
malita 2016/05/06 17:43:56: Nit: I would use more color macros here for readability ...
herb_g 2016/05/06 20:57:45: I started with code similar to what you suggest, b...
| |
52 } | |
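For context on the nit above: with SkColorPriv.h's SkGetPackedA32 (which assumes the usual SK_A32_SHIFT of 24, the same assumption the hard-coded 0xFF000000 makes), the two tests read as plain alpha checks. A hedged sketch of what the suggestion might look like, not the code the CL landed:

    #include "SkColorPriv.h"

    static inline void srcover_srgb_srgb_1_macros(uint32_t* dst, const uint32_t pixel) {
        unsigned alpha = SkGetPackedA32(pixel);
        if (alpha == 0xFF) {
            *dst = pixel;                    // fully opaque source: plain copy
        } else if (alpha != 0) {
            blend_srgb_srgb_1(dst, pixel);   // partial coverage: do the real blend
        }                                    // alpha == 0: destination left untouched
    }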
53 | |
54 static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) { | |
55 srcover_srgb_srgb_1(dst++, *src++); | |
56 srcover_srgb_srgb_1(dst, *src); | |
57 } | |
58 | |
59 static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { | |
60 srcover_srgb_srgb_1(dst++, *src++); | |
61 srcover_srgb_srgb_1(dst++, *src++); | |
62 srcover_srgb_srgb_1(dst++, *src++); | |
63 srcover_srgb_srgb_1(dst, *src); | |
64 } | |
65 | |
66 void best_non_simd_srcover_srgb_srgb( | |
67 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
68 uint64_t* ddst = reinterpret_cast<uint64_t*>(dst); | |
69 | |
70     while (ndst > 0) { | |
71 int count = SkTMin(ndst, nsrc); | |
72 ndst -= count; | |
73 const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src); | |
74 const uint64_t* end = dsrc + (count >> 1); | |
75 do { | |
76 if ((~*dsrc & 0xFF000000FF000000) == 0) { | |
77 do { | |
78 *ddst++ = *dsrc++; | |
79 } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0); | |
80 } else if ((*dsrc & 0xFF000000FF000000) == 0) { | |
81 do { | |
82 dsrc++; | |
83 ddst++; | |
84 } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0); | |
85 } else { | |
86 srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++), | |
87 reinterpret_cast<const uint32_t*>(dsrc++)); | |
88 } | |
89 } while (dsrc < end); | |
90 | |
91 if ((count & 1) != 0) { | |
92 srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst), | |
93 *reinterpret_cast<const uint32_t*>(dsrc)); | |
94 } | |
95 } | |
96 } | |
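The 64-bit constant used above is just the 32-bit alpha mask 0xFF000000 replicated for two pixels packed into one uint64_t, so a single AND (or AND-NOT) classifies a pixel pair at a time. A hedged restatement with hypothetical helper names:

    static inline bool pair_is_opaque(uint64_t twoPixels) {
        return (~twoPixels & 0xFF000000FF000000ULL) == 0;   // both alpha bytes are 0xFF
    }
    static inline bool pair_is_transparent(uint64_t twoPixels) {
        return (twoPixels & 0xFF000000FF000000ULL) == 0;    // both alpha bytes are 0x00
    }

Opaque pairs are copied straight through, transparent pairs only advance the pointers, and anything mixed falls back to srcover_srgb_srgb_2.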
97 | |
98 void brute_force_srcover_srgb_srgb( | |
99 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
100 while (ndst > 0) { | |
101 int n = SkTMin(ndst, nsrc); | |
102 | |
103 for (int i = 0; i < n; i++) { | |
104 blend_srgb_srgb_1(dst++, src[i]); | |
105 } | |
106 ndst -= n; | |
107 } | |
108 } | |
109 | |
110 void trivial_srcover_srgb_srgb( | |
111 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
112 while (ndst > 0) { | |
113 int n = SkTMin(ndst, nsrc); | |
114 | |
115 for (int i = 0; i < n; i++) { | |
116 srcover_srgb_srgb_1(dst++, src[i]); | |
117 } | |
118 ndst -= n; | |
119 } | |
120 } | |
121 | |
122 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
123 | |
124 static inline __m128i load(const uint32_t* p) { | |
125 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); | |
126 } | |
127 | |
128 static inline void store(uint32_t* p, __m128i v) { | |
129 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); | |
130 } | |
131 | |
132 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | |
133 | |
134 void srcover_srgb_srgb( | |
135         uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { | |
136 const __m128i alphaMask = _mm_set1_epi32(0xFF000000); | |
137 while (ndst > 0) { | |
138 int count = SkTMin(ndst, nsrc); | |
139 ndst -= count; | |
140 const uint32_t* src = srcStart; | |
141 const uint32_t* end = src + (count & ~3); | |
142 | |
143 while (src < end) { | |
144 __m128i pixels = load(src); | |
145 if (_mm_testc_si128(pixels, alphaMask)) { | |
146 do { | |
147 store(dst, pixels); | |
148 dst += 4; | |
149 src += 4; | |
150                 } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); | |
151 } else if (_mm_testz_si128(pixels, alphaMask)) { | |
152 do { | |
153 dst += 4; | |
154 src += 4; | |
155                 } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); | |
156 } else { | |
157 do { | |
158 srcover_srgb_srgb_4(dst, src); | |
159 dst += 4; | |
160 src += 4; | |
161                 } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); | |
162 } | |
163 } | |
164 | |
165 count = count & 3; | |
166 while (count-- > 0) { | |
167 srcover_srgb_srgb_1(dst++, *src++); | |
168 } | |
169 } | |
170 } | |
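The SSE4.1 path keys off the PTEST family: _mm_testc_si128(v, m) is 1 when every bit of m is also set in v, _mm_testz_si128(v, m) is 1 when v & m is all zero, and _mm_testnzc_si128(v, m) is 1 only when neither holds. With m set to the alpha mask, that classifies four pixels per iteration. A small illustration (classify_alphas is a hypothetical helper, not part of the CL):

    #include <smmintrin.h>   // SSE4.1

    enum class AlphaClass { Opaque, Transparent, Mixed };

    static AlphaClass classify_alphas(__m128i pixels) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        if (_mm_testc_si128(pixels, alphaMask)) { return AlphaClass::Opaque;      }  // all alphas 0xFF
        if (_mm_testz_si128(pixels, alphaMask)) { return AlphaClass::Transparent; }  // all alphas 0x00
        return AlphaClass::Mixed;                                                    // at least one partial alpha
    }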
171 #else | |
172 // SSE2 versions | |
173 static inline bool check_opaque_alphas(__m128i pixels) { | |
174 int mask = | |
175 _mm_movemask_epi8( | |
176 _mm_cmpeq_epi32( | |
177 _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), | |
178 _mm_setzero_si128())); | |
179 return mask == 0xFFFF; | |
180 } | |
181 | |
182 static inline bool check_transparent_alphas(__m128i pixels) { | |
183 int mask = | |
184 _mm_movemask_epi8( | |
185 _mm_cmpeq_epi32( | |
186 _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), | |
187 _mm_setzero_si128())); | |
188 return mask == 0xFFFF; | |
189 } | |
190 | |
191     static inline bool check_partial_alphas(__m128i pixels) { | |
192 __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)); | |
193 int mask = | |
194 _mm_movemask_epi8( | |
195 _mm_cmpeq_epi8( | |
196 _mm_srai_epi32(alphas, 8), | |
197 alphas)); | |
198 return mask == 0xFFFF; | |
199 } | |
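SSE2 has no PTEST, so the three checks above emulate it: build a per-lane compare whose result is all-ones where the lane matches, then _mm_movemask_epi8 collapses the top bit of each of the 16 bytes into an int, and 0xFFFF means every byte matched. A hedged restatement of the opaque test with the intermediate steps named (not part of the CL):

    #include <emmintrin.h>   // SSE2

    static inline bool all_alphas_opaque(__m128i pixels) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        // ~pixels & alphaMask is zero in a lane exactly when that lane's alpha byte is 0xFF.
        __m128i missingAlphaBits = _mm_andnot_si128(pixels, alphaMask);
        __m128i laneIsOpaque     = _mm_cmpeq_epi32(missingAlphaBits, _mm_setzero_si128());
        return _mm_movemask_epi8(laneIsOpaque) == 0xFFFF;    // all 16 bytes compared equal
    }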
200 | |
201 void srcover_srgb_srgb( | |
202         uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { | |
203 while (ndst > 0) { | |
204 int count = SkTMin(ndst, nsrc); | |
205 ndst -= count; | |
206 const uint32_t* src = srcStart; | |
207 const uint32_t* end = src + (count & ~3); | |
208 | |
209 __m128i pixels = load(src); | |
210 do { | |
211 if (check_opaque_alphas(pixels)) { | |
212 do { | |
213 store(dst, pixels); | |
214 dst += 4; | |
215 src += 4; | |
216                 } while (src < end && check_opaque_alphas(pixels = load(src))); | |
217 } else if (check_transparent_alphas(pixels)) { | |
218 const uint32_t* start = src; | |
219 do { | |
220 src += 4; | |
221 } while (src < end && check_transparent_alphas(pixels = load(src))); | |
222 dst += src - start; | |
223 } else { | |
224 do { | |
225 srcover_srgb_srgb_4(dst, src); | |
226 dst += 4; | |
227 src += 4; | |
228                 } while (src < end && check_partial_alphas(pixels = load(src))); | |
229 } | |
230 } while (src < end); | |
231 | |
232 count = count & 3; | |
233 while (count-- > 0) { | |
234 srcover_srgb_srgb_1(dst++, *src++); | |
235 } | |
236 } | |
237 } | |
238 #endif | |
15 #else | 239 #else |
16 | 240 |
17 static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) { | 241 void srcover_srgb_srgb( |
18 switch (src >> 24) { | 242 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
19 case 0x00: return; | 243 trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); |
20 case 0xff: *dst = src; return; | 244 } |
21 } | 245 |
22 | |
23 Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)), | |
24 s = SkNx_cast<float>(Sk4b::Load(&src)); | |
25 | |
26 // Approximate sRGB gamma as 2.0. | |
27 Sk4f d_sq = d*d, | |
28 s_sq = s*s; | |
29 d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]}; | |
30 s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]}; | |
31 | |
32 // SrcOver. | |
33 Sk4f invA = 1.0f - s[3]*(1/255.0f); | |
34 d = s + d * invA; | |
35 | |
36 // Re-apply approximate sRGB gamma. | |
37 Sk4f d_sqrt = d.sqrt(); | |
38 d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]}; | |
39 | |
40 SkNx_cast<uint8_t>(d).store(dst); | |
41 } | |
42 | |
43 static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
44 while (ndst > 0) { | |
45 int n = SkTMin(ndst, nsrc); | |
46 | |
47 for (int i = 0; i < n; i++) { | |
48 srcover_srgb_srgb_1(dst++, src[i]); | |
49 } | |
50 ndst -= n; | |
51 } | |
52 } | |
53 | |
54 #endif | 246 #endif |
55 | 247 |
56 } // namespace SK_OPTS_NS | 248 } // namespace SK_OPTS_NS |
57 | 249 |
58 #endif//SkBlend_opts_DEFINED | 250 #endif//SkBlend_opts_DEFINED |
OLD | NEW |