src/opts/SkColor_opts_SSE2.h - Issue 2097883002: revise row blits to keep intermediate precision so that color is preserved when blended against its…

Side by Side Diff: src/opts/SkColor_opts_SSE2.h

Issue 2097883002: revise row blits to keep intermediate precision so that color is preserved when blended against its… (Closed) Base URL: https://skia.googlesource.com/skia@master

Patch Set: guard more changes with SK_SUPPORT_LEGACY_BROKEN_LERP Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 The Android Open Source Project	2 * Copyright 2014 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkColor_opts_SSE2_DEFINED	8 #ifndef SkColor_opts_SSE2_DEFINED

9 #define SkColor_opts_SSE2_DEFINED	9 #define SkColor_opts_SSE2_DEFINED

10	10

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
73 __m128i rb = _mm_and_si128(mask, c);	73 __m128i rb = _mm_and_si128(mask, c);

74 rb = _mm_mulhi_epu16(rb, s);	74 rb = _mm_mulhi_epu16(rb, s);

75	75

76 __m128i ag = _mm_andnot_si128(mask, c);	76 __m128i ag = _mm_andnot_si128(mask, c);

77 ag = _mm_mulhi_epu16(ag, s); // Alpha and green values are in the higher byte of each word.	77 ag = _mm_mulhi_epu16(ag, s); // Alpha and green values are in the higher byte of each word.

78 ag = _mm_andnot_si128(mask, ag);	78 ag = _mm_andnot_si128(mask, ag);

79	79

80 return _mm_or_si128(rb, ag);	80 return _mm_or_si128(rb, ag);

81 }	81 }

82	82

	83 // Portable version SkFastFourByteInterp256 is in SkColorPriv.h.

	84 static inline __m128i SkFastFourByteInterp256_SSE2(const __m128i& src, const __m128i& dst, const unsigned src_scale) {

	85 // Computes dst + (((src - dst)*src_scale)>>8)

	86 const __m128i mask = _mm_set1_epi32(0x00FF00FF);

	87

	88 // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.

	89 __m128i src_rb = _mm_and_si128(mask, src);

	90 __m128i src_ag = _mm_srli_epi16(src, 8);

	91 __m128i dst_rb = _mm_and_si128(mask, dst);

	92 __m128i dst_ag = _mm_srli_epi16(dst, 8);

	93

	94 // Compute scaled differences.

	95 __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);

	96 __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);

	97 __m128i s = _mm_set1_epi16(src_scale);

	98 diff_rb = _mm_mullo_epi16(diff_rb, s);

	99 diff_ag = _mm_mullo_epi16(diff_ag, s);

	100

	101 // Pack the differences back together.

	102 diff_rb = _mm_srli_epi16(diff_rb, 8);

	103 diff_ag = _mm_andnot_si128(mask, diff_ag);

	104 __m128i diff = _mm_or_si128(diff_rb, diff_ag);

	105

	106 // Add difference to destination.

	107 return _mm_add_epi8(dst, diff);

	108 }

	109

	110 // Portable version SkPMLerp is in SkColorPriv.h

	111 static inline __m128i SkPMLerp_SSE2(const __m128i& src, const __m128i& dst, const unsigned scale) {

	112 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP

	113 return _mm_add_epi8(SkAlphaMulQ_SSE2(src, scale), SkAlphaMulQ_SSE2(dst, 256 - scale));

	114 #else

	115 return SkFastFourByteInterp256_SSE2(src, dst, scale);

	116 #endif

	117 }

	118

83 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {	119 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {

84 #if SK_A32_SHIFT == 24 // It's very common (universal?) that alpha is the top byte.	120 #if SK_A32_SHIFT == 24 // It's very common (universal?) that alpha is the top byte.

85 return _mm_srli_epi32(src, 24); // You'd hope the compiler would remove the left shift then,	121 return _mm_srli_epi32(src, 24); // You'd hope the compiler would remove the left shift then,

86 #else // but I've seen Clang just do a dumb left shift of zero. :(	122 #else // but I've seen Clang just do a dumb left shift of zero. :(

87 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));	123 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));

88 return _mm_srli_epi32(a, 24);	124 return _mm_srli_epi32(a, 24);

89 #endif	125 #endif

90 }	126 }

91	127

92 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {	128 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {

(...skipping 113 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
206 return d_pixel;	242 return d_pixel;

207 }	243 }

208	244

209 // Portable version is SkPMSrcOver in SkColorPriv.h.	245 // Portable version is SkPMSrcOver in SkColorPriv.h.

210 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {	246 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {

211 return _mm_add_epi32(src,	247 return _mm_add_epi32(src,

212 SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),	248 SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),

213 SkGetPackedA32_SSE2(src))));	249 SkGetPackedA32_SSE2(src))));

214 }	250 }

215	251

216 // Portable version is SkBlendARGB32 in SkColorPriv.h.

217 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,

218 const __m128i& aa) {

219 __m128i src_scale = SkAlpha255To256_SSE2(aa);

220 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))

221 __m128i dst_scale = SkGetPackedA32_SSE2(src);

222 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);

223 dst_scale = _mm_srli_epi16(dst_scale, 8);

224 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);

225

226 __m128i result = SkAlphaMulQ_SSE2(src, src_scale);

227 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));

228 }

229

230 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.	252 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.

231 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,	253 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,

232 const unsigned aa) {	254 const unsigned aa) {

233 unsigned alpha = SkAlpha255To256(aa);	255 unsigned alpha = SkAlpha255To256(aa);

234 __m128i src_scale = _mm_set1_epi32(alpha);	256 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP

235 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))	257 __m128i src_scale = _mm_set1_epi32(alpha);

	258 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))

	259 __m128i dst_scale = SkGetPackedA32_SSE2(src);

	260 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);

	261 dst_scale = _mm_srli_epi16(dst_scale, 8);

	262 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);

	263

	264 __m128i result = SkAlphaMulQ_SSE2(src, alpha);

	265 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));

	266 #else

	267 __m128i src_scale = _mm_set1_epi16(alpha);

	268 // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)

236 __m128i dst_scale = SkGetPackedA32_SSE2(src);	269 __m128i dst_scale = SkGetPackedA32_SSE2(src);

	270 // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.

237 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);	271 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);

238 dst_scale = _mm_srli_epi16(dst_scale, 8);	272 dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);

239 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);	273 dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));

	274 dst_scale = _mm_srli_epi32(dst_scale, 8);

	275 // Duplicate scales into 2x16-bit pattern per pixel.

	276 dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));

	277 dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));

240	278

241 __m128i result = SkAlphaMulQ_SSE2(src, alpha);	279 const __m128i mask = _mm_set1_epi32(0x00FF00FF);

242 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));	280

	281 // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.

	282 __m128i src_rb = _mm_and_si128(mask, src);

	283 __m128i src_ag = _mm_srli_epi16(src, 8);

	284 __m128i dst_rb = _mm_and_si128(mask, dst);

	285 __m128i dst_ag = _mm_srli_epi16(dst, 8);

	286

	287 // Scale them.

	288 src_rb = _mm_mullo_epi16(src_rb, src_scale);

	289 src_ag = _mm_mullo_epi16(src_ag, src_scale);

	290 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);

	291 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);

	292

	293 // Add the scaled source and destination.

	294 dst_rb = _mm_add_epi16(src_rb, dst_rb);

	295 dst_ag = _mm_add_epi16(src_ag, dst_ag);

	296

	297 // Unsplay the halves back together.

	298 dst_rb = _mm_srli_epi16(dst_rb, 8);

	299 dst_ag = _mm_andnot_si128(mask, dst_ag);

	300 return _mm_or_si128(dst_rb, dst_ag);

	301 #endif

243 }	302 }

244	303

245 #undef ASSERT_EQ	304 #undef ASSERT_EQ

246 #endif // SkColor_opts_SSE2_DEFINED	305 #endif // SkColor_opts_SSE2_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkBlitRow_opts_mips_dsp.cpp ('k') | no next file » | no next file with comments »