OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 /* | |
9  ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q |                  |
10 */ | |
11 | |
8 #ifndef SkBlend_opts_DEFINED | 12 #ifndef SkBlend_opts_DEFINED |
9 #define SkBlend_opts_DEFINED | 13 #define SkBlend_opts_DEFINED |
10 | 14 |
15 #include "SkNx.h" | |
16 #include "SkPM4fPriv.h" | |
17 | |
11 namespace SK_OPTS_NS { | 18 namespace SK_OPTS_NS { |
12 | 19 |
13     #if 0 | 20 // An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the |
14 | 21 // observation that the 255's cancel. |
22 // invA = 1 - (As / 255); | |
23 // | |
24 // R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) | |
25 // => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2) | |
26 // => R = sqrt(Rs^2 + Rd^2 * invA) | |
27 static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { | |
28 Sk4f s = srgb_to_linear(to_4f(pixel)); | |
29 Sk4f d = srgb_to_linear(to_4f(*dst)); | |
30 Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f); | |
31 Sk4f r = linear_to_srgb(s + d * invAlpha); | |
32 *dst = to_4b(r); | |
33                 //SkNx_cast<uint8_t>(r).store(dst); |                  |
    f(malita) 2016/05/06 21:04:00: nit: missed a comment bit here.
    herb_g 2016/05/06 21:19:50: Done.
34 } | |
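For reference, a minimal scalar sketch (not part of this patch) of the cancellation described in the comment above, using the gamma-2.0 approximation from that derivation rather than the exact sRGB transfer that srgb_to_linear()/linear_to_srgb() apply; blend_channel_gamma2 is a hypothetical name, and premultiplied channel values are assumed as in the code above:

    #include <cmath>
    #include <cstdint>

    // One premultiplied color channel, blended in byte scale under the
    // gamma-2.0 approximation (square to linearize, sqrt to re-encode).
    // The 255's cancel, so no /255 or *255 is needed around the squaring
    // and the square root.
    static inline uint8_t blend_channel_gamma2(uint8_t Rs, uint8_t Rd, uint8_t As) {
        float invA = 1.0f - As * (1.0f / 255.0f);
        float r = std::sqrt((float)Rs * Rs + (float)Rd * Rd * invA);
        return (uint8_t)(r < 255.0f ? r + 0.5f : 255.0f);
    }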
35 | |
36 static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { | |
37 if ((~pixel & 0xFF000000) == 0) { | |
38 *dst = pixel; | |
39 } else if ((pixel & 0xFF000000) != 0) { | |
40 blend_srgb_srgb_1(dst, pixel); | |
41 } | |
42 } | |
43 | |
44 static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) { | |
45 srcover_srgb_srgb_1(dst++, *src++); | |
46 srcover_srgb_srgb_1(dst, *src); | |
47 } | |
48 | |
49 static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { | |
50 srcover_srgb_srgb_1(dst++, *src++); | |
51 srcover_srgb_srgb_1(dst++, *src++); | |
52 srcover_srgb_srgb_1(dst++, *src++); | |
53 srcover_srgb_srgb_1(dst, *src); | |
54 } | |
55 | |
56 void best_non_simd_srcover_srgb_srgb( | |
57 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
58 uint64_t* ddst = reinterpret_cast<uint64_t*>(dst); | |
59 | |
60         while (ndst > 0) { |                  |
61 int count = SkTMin(ndst, nsrc); | |
62 ndst -= count; | |
63 const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src); | |
64 const uint64_t* end = dsrc + (count >> 1); | |
65 do { | |
66 if ((~*dsrc & 0xFF000000FF000000) == 0) { | |
67 do { | |
68 *ddst++ = *dsrc++; | |
69 } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0); | |
70 } else if ((*dsrc & 0xFF000000FF000000) == 0) { | |
71 do { | |
72 dsrc++; | |
73 ddst++; | |
74 } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0); | |
75 } else { | |
76 srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++), | |
77 reinterpret_cast<const uint32_t*>(dsrc++)); | |
78 } | |
79 } while (dsrc < end); | |
80 | |
81 if ((count & 1) != 0) { | |
82 srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst), | |
83 *reinterpret_cast<const uint32_t*>(dsrc)); | |
84 } | |
85 } | |
86 } | |
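A small standalone sketch (not part of this patch) of the two-pixel alpha test used above: two 32-bit pixels packed into one uint64_t, with both alpha bytes selected by 0xFF000000FF000000; both_opaque and both_transparent are hypothetical names:

    #include <cstdint>

    // Both alpha bytes of two packed 32-bit pixels live under this mask.
    constexpr uint64_t kBothAlphas = 0xFF000000FF000000ULL;

    static inline bool both_opaque(uint64_t twoPixels) {
        // ~twoPixels clears an alpha bit only where it was set, so a zero
        // result means both alphas were 0xFF.
        return (~twoPixels & kBothAlphas) == 0;
    }

    static inline bool both_transparent(uint64_t twoPixels) {
        // All alpha bits clear: neither source pixel contributes anything.
        return (twoPixels & kBothAlphas) == 0;
    }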
87 | |
88 void brute_force_srcover_srgb_srgb( | |
89 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
90 while (ndst > 0) { | |
91 int n = SkTMin(ndst, nsrc); | |
92 | |
93 for (int i = 0; i < n; i++) { | |
94 blend_srgb_srgb_1(dst++, src[i]); | |
95 } | |
96 ndst -= n; | |
97 } | |
98 } | |
99 | |
100 void trivial_srcover_srgb_srgb( | |
101 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { | |
102 while (ndst > 0) { | |
103 int n = SkTMin(ndst, nsrc); | |
104 | |
105 for (int i = 0; i < n; i++) { | |
106 srcover_srgb_srgb_1(dst++, src[i]); | |
107 } | |
108 ndst -= n; | |
109 } | |
110 } | |
111 | |
112 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
113 | |
114 static inline __m128i load(const uint32_t* p) { | |
115 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); | |
116 } | |
117 | |
118 static inline void store(uint32_t* p, __m128i v) { | |
119 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); | |
120 } | |
121 | |
122 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | |
123 | |
124 void srcover_srgb_srgb( | |
125         uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |                  |
126 const __m128i alphaMask = _mm_set1_epi32(0xFF000000); | |
127 while (ndst > 0) { | |
128 int count = SkTMin(ndst, nsrc); | |
129 ndst -= count; | |
130 const uint32_t* src = srcStart; | |
131 const uint32_t* end = src + (count & ~3); | |
132 | |
133 while (src < end) { | |
134 __m128i pixels = load(src); | |
135 if (_mm_testc_si128(pixels, alphaMask)) { | |
136 do { | |
137 store(dst, pixels); | |
138 dst += 4; | |
139 src += 4; | |
140                     } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); |                  |
141 } else if (_mm_testz_si128(pixels, alphaMask)) { | |
142 do { | |
143 dst += 4; | |
144 src += 4; | |
145                     } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); |                  |
146 } else { | |
147 do { | |
148 srcover_srgb_srgb_4(dst, src); | |
149 dst += 4; | |
150 src += 4; | |
151                     } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); |                  |
152 } | |
153 } | |
154 | |
155 count = count & 3; | |
156 while (count-- > 0) { | |
157 srcover_srgb_srgb_1(dst++, *src++); | |
158 } | |
159 } | |
160 } | |
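For reference, a minimal sketch (not part of this patch) of the SSE4.1 classification driving the three loops above: _mm_testc_si128(pixels, mask) is 1 when every bit of mask is set in pixels (all four alphas 0xFF), _mm_testz_si128 is 1 when (pixels & mask) is zero (all four alphas 0x00), and _mm_testnzc_si128 covers the mixed case; classify_four is a hypothetical name:

    #include <smmintrin.h>

    enum class AlphaClass { kAllOpaque, kAllTransparent, kMixed };

    static inline AlphaClass classify_four(__m128i fourPixels) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        if (_mm_testc_si128(fourPixels, alphaMask)) {   // every alpha bit set
            return AlphaClass::kAllOpaque;
        }
        if (_mm_testz_si128(fourPixels, alphaMask)) {   // every alpha bit clear
            return AlphaClass::kAllTransparent;
        }
        return AlphaClass::kMixed;                      // some alpha bits set, some clear
    }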
161 #else | |
162 // SSE2 versions | |
163 static inline bool check_opaque_alphas(__m128i pixels) { | |
164 int mask = | |
165 _mm_movemask_epi8( | |
166 _mm_cmpeq_epi32( | |
167 _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), | |
168 _mm_setzero_si128())); | |
169 return mask == 0xFFFF; | |
170 } | |
171 | |
172 static inline bool check_transparent_alphas(__m128i pixels) { | |
173 int mask = | |
174 _mm_movemask_epi8( | |
175 _mm_cmpeq_epi32( | |
176 _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), | |
177 _mm_setzero_si128())); | |
178 return mask == 0xFFFF; | |
179 } | |
180 | |
181         static inline bool check_partial_alphas(__m128i pixels) { |                  |
182 __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)); | |
183 int mask = | |
184 _mm_movemask_epi8( | |
185 _mm_cmpeq_epi8( | |
186 _mm_srai_epi32(alphas, 8), | |
187 alphas)); | |
188 return mask == 0xFFFF; | |
189 } | |
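A quick usage sketch (not part of this patch), assuming the SSE2 helpers above are visible in the translation unit: _mm_movemask_epi8 gathers one bit per byte, so a result of 0xFFFF means the comparison held in every byte of the 128-bit register; four_pixels and smoke_test_alpha_checks are hypothetical names:

    #include <emmintrin.h>
    #include <cassert>
    #include <cstdint>

    static inline __m128i four_pixels(uint32_t p) { return _mm_set1_epi32((int)p); }

    static void smoke_test_alpha_checks() {
        assert( check_opaque_alphas     (four_pixels(0xFF112233)));   // alpha 0xFF in all lanes
        assert(!check_opaque_alphas     (four_pixels(0x80112233)));
        assert( check_transparent_alphas(four_pixels(0x00112233)));   // alpha 0x00 in all lanes
        assert(!check_transparent_alphas(four_pixels(0x01112233)));
    }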
190 | |
191 void srcover_srgb_srgb( | |
192         uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { |                  |
193 while (ndst > 0) { | |
194 int count = SkTMin(ndst, nsrc); | |
195 ndst -= count; | |
196 const uint32_t* src = srcStart; | |
197 const uint32_t* end = src + (count & ~3); | |
198 | |
199 __m128i pixels = load(src); | |
200 do { | |
201 if (check_opaque_alphas(pixels)) { | |
202 do { | |
203 store(dst, pixels); | |
204 dst += 4; | |
205 src += 4; | |
206                     } while (src < end && check_opaque_alphas(pixels = load(src))); |                  |
207 } else if (check_transparent_alphas(pixels)) { | |
208 const uint32_t* start = src; | |
209 do { | |
210 src += 4; | |
211                     } while (src < end && check_transparent_alphas(pixels = load(src))); |                  |
212 dst += src - start; | |
213 } else { | |
214 do { | |
215 srcover_srgb_srgb_4(dst, src); | |
216 dst += 4; | |
217 src += 4; | |
218                     } while (src < end && check_partial_alphas(pixels = load(src))); |                  |
219 } | |
220 } while (src < end); | |
221 | |
222 count = count & 3; | |
223 while (count-- > 0) { | |
224 srcover_srgb_srgb_1(dst++, *src++); | |
225 } | |
226 } | |
227 } | |
228 #endif | |
15 #else | 229 #else |
16 | 230 |
17 static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) { | 231 void srcover_srgb_srgb( |
18 switch (src >> 24) { | 232 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |
19 case 0x00: return; | 233 trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); |
20 case 0xff: *dst = src; return; | 234 } |
21 } | 235 |
22 | |
23 Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)), | |
24 s = SkNx_cast<float>(Sk4b::Load(&src)); | |
25 | |
26 // Approximate sRGB gamma as 2.0. | |
27 Sk4f d_sq = d*d, | |
28 s_sq = s*s; | |
29 d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]}; | |
30 s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]}; | |
31 | |
32 // SrcOver. | |
33 Sk4f invA = 1.0f - s[3]*(1/255.0f); | |
34 d = s + d * invA; | |
35 | |
36 // Re-apply approximate sRGB gamma. | |
37 Sk4f d_sqrt = d.sqrt(); | |
38 d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]}; | |
39 | |
40 SkNx_cast<uint8_t>(d).store(dst); | |
41 } | |
42 | |
43     static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { |                  |
44 while (ndst > 0) { | |
45 int n = SkTMin(ndst, nsrc); | |
46 | |
47 for (int i = 0; i < n; i++) { | |
48 srcover_srgb_srgb_1(dst++, src[i]); | |
49 } | |
50 ndst -= n; | |
51 } | |
52 } | |
53 | |
54 #endif | 236 #endif |
55 | 237 |
56 } // namespace SK_OPTS_NS | 238 } // namespace SK_OPTS_NS |
57 | 239 |
58 #endif//SkBlend_opts_DEFINED | 240 #endif//SkBlend_opts_DEFINED |