Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(297)

Side by Side Diff: src/opts/SkColor_opts_SSE2.h

Issue 2097883002: revise row blits to keep intermediate precision so that color is preserved when blended against its… (Closed) Base URL: https://skia.googlesource.com/skia@master
Patch Set: guard more changes with SK_SUPPORT_LEGACY_BROKEN_LERP Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkBlitRow_opts_mips_dsp.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2014 The Android Open Source Project 2 * Copyright 2014 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkColor_opts_SSE2_DEFINED 8 #ifndef SkColor_opts_SSE2_DEFINED
9 #define SkColor_opts_SSE2_DEFINED 9 #define SkColor_opts_SSE2_DEFINED
10 10
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
73 __m128i rb = _mm_and_si128(mask, c); 73 __m128i rb = _mm_and_si128(mask, c);
74 rb = _mm_mulhi_epu16(rb, s); 74 rb = _mm_mulhi_epu16(rb, s);
75 75
76 __m128i ag = _mm_andnot_si128(mask, c); 76 __m128i ag = _mm_andnot_si128(mask, c);
77 ag = _mm_mulhi_epu16(ag, s); // Alpha and green values are in the higher byte of each word. 77 ag = _mm_mulhi_epu16(ag, s); // Alpha and green values are in the higher byte of each word.
78 ag = _mm_andnot_si128(mask, ag); 78 ag = _mm_andnot_si128(mask, ag);
79 79
80 return _mm_or_si128(rb, ag); 80 return _mm_or_si128(rb, ag);
81 } 81 }
82 82
83 // Portable version SkFastFourByteInterp256 is in SkColorPriv.h.
84 static inline __m128i SkFastFourByteInterp256_SSE2(const __m128i& src, const __m128i& dst, const unsigned src_scale) {
85 // Computes dst + (((src - dst)*src_scale)>>8)
86 const __m128i mask = _mm_set1_epi32(0x00FF00FF);
87
88 // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
89 __m128i src_rb = _mm_and_si128(mask, src);
90 __m128i src_ag = _mm_srli_epi16(src, 8);
91 __m128i dst_rb = _mm_and_si128(mask, dst);
92 __m128i dst_ag = _mm_srli_epi16(dst, 8);
93
94 // Compute scaled differences.
95 __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
96 __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
97 __m128i s = _mm_set1_epi16(src_scale);
98 diff_rb = _mm_mullo_epi16(diff_rb, s);
99 diff_ag = _mm_mullo_epi16(diff_ag, s);
100
101 // Pack the differences back together.
102 diff_rb = _mm_srli_epi16(diff_rb, 8);
103 diff_ag = _mm_andnot_si128(mask, diff_ag);
104 __m128i diff = _mm_or_si128(diff_rb, diff_ag);
105
106 // Add difference to destination.
107 return _mm_add_epi8(dst, diff);
108 }
109
110 // Portable version SkPMLerp is in SkColorPriv.h
111 static inline __m128i SkPMLerp_SSE2(const __m128i& src, const __m128i& dst, const unsigned scale) {
112 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
113 return _mm_add_epi8(SkAlphaMulQ_SSE2(src, scale), SkAlphaMulQ_SSE2(dst, 256 - scale));
114 #else
115 return SkFastFourByteInterp256_SSE2(src, dst, scale);
116 #endif
117 }
118
83 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) { 119 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
84 #if SK_A32_SHIFT == 24 // It's very common (universal?) that alpha is the top byte. 120 #if SK_A32_SHIFT == 24 // It's very common (universal?) that alpha is the top byte.
85 return _mm_srli_epi32(src, 24); // You'd hope the compiler would remove the left shift then, 121 return _mm_srli_epi32(src, 24); // You'd hope the compiler would remove the left shift then,
86 #else // but I've seen Clang just do a dumb left shift of zero. :( 122 #else // but I've seen Clang just do a dumb left shift of zero. :(
87 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT)); 123 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));
88 return _mm_srli_epi32(a, 24); 124 return _mm_srli_epi32(a, 24);
89 #endif 125 #endif
90 } 126 }
91 127
92 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) { 128 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after
206 return d_pixel; 242 return d_pixel;
207 } 243 }
208 244
209 // Portable version is SkPMSrcOver in SkColorPriv.h. 245 // Portable version is SkPMSrcOver in SkColorPriv.h.
210 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) { 246 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
211 return _mm_add_epi32(src, 247 return _mm_add_epi32(src,
212 SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256), 248 SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
213 SkGetPackedA32_SSE2(src)))); 249 SkGetPackedA32_SSE2(src))));
214 } 250 }
215 251
216 // Portable version is SkBlendARGB32 in SkColorPriv.h.
217 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
218 const __m128i& aa) {
219 __m128i src_scale = SkAlpha255To256_SSE2(aa);
220 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
221 __m128i dst_scale = SkGetPackedA32_SSE2(src);
222 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
223 dst_scale = _mm_srli_epi16(dst_scale, 8);
224 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
225
226 __m128i result = SkAlphaMulQ_SSE2(src, src_scale);
227 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
228 }
229
230 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor. 252 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.
231 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst, 253 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
232 const unsigned aa) { 254 const unsigned aa) {
233 unsigned alpha = SkAlpha255To256(aa); 255 unsigned alpha = SkAlpha255To256(aa);
234 __m128i src_scale = _mm_set1_epi32(alpha); 256 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
235 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale)) 257 __m128i src_scale = _mm_set1_epi32(alpha);
258 // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
259 __m128i dst_scale = SkGetPackedA32_SSE2(src);
260 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
261 dst_scale = _mm_srli_epi16(dst_scale, 8);
262 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
263
264 __m128i result = SkAlphaMulQ_SSE2(src, alpha);
265 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
266 #else
267 __m128i src_scale = _mm_set1_epi16(alpha);
268 // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
236 __m128i dst_scale = SkGetPackedA32_SSE2(src); 269 __m128i dst_scale = SkGetPackedA32_SSE2(src);
270 // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
237 dst_scale = _mm_mullo_epi16(dst_scale, src_scale); 271 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
238 dst_scale = _mm_srli_epi16(dst_scale, 8); 272 dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
239 dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale); 273 dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
274 dst_scale = _mm_srli_epi32(dst_scale, 8);
275 // Duplicate scales into 2x16-bit pattern per pixel.
276 dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
277 dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
240 278
241 __m128i result = SkAlphaMulQ_SSE2(src, alpha); 279 const __m128i mask = _mm_set1_epi32(0x00FF00FF);
242 return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale)); 280
281 // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
282 __m128i src_rb = _mm_and_si128(mask, src);
283 __m128i src_ag = _mm_srli_epi16(src, 8);
284 __m128i dst_rb = _mm_and_si128(mask, dst);
285 __m128i dst_ag = _mm_srli_epi16(dst, 8);
286
287 // Scale them.
288 src_rb = _mm_mullo_epi16(src_rb, src_scale);
289 src_ag = _mm_mullo_epi16(src_ag, src_scale);
290 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
291 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
292
293 // Add the scaled source and destination.
294 dst_rb = _mm_add_epi16(src_rb, dst_rb);
295 dst_ag = _mm_add_epi16(src_ag, dst_ag);
296
297 // Unsplay the halves back together.
298 dst_rb = _mm_srli_epi16(dst_rb, 8);
299 dst_ag = _mm_andnot_si128(mask, dst_ag);
300 return _mm_or_si128(dst_rb, dst_ag);
301 #endif
243 } 302 }
244 303
245 #undef ASSERT_EQ 304 #undef ASSERT_EQ
246 #endif // SkColor_opts_SSE2_DEFINED 305 #endif // SkColor_opts_SSE2_DEFINED
OLDNEW
« no previous file with comments | « src/opts/SkBlitRow_opts_mips_dsp.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698