OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkPM4fPriv.h" | 8 #include "SkPM4fPriv.h" |
9 #include "SkUtils.h" | 9 #include "SkUtils.h" |
10 #include "SkXfermode.h" | 10 #include "SkXfermode.h" |
| 11 #include "Sk4x4f.h" |
11 | 12 |
12 static SkPM4f rgba_to_pmcolor_order(const SkPM4f& x) { | 13 static SkPM4f rgba_to_pmcolor_order(const SkPM4f& x) { |
13 #ifdef SK_PMCOLOR_IS_BGRA | 14 #ifdef SK_PMCOLOR_IS_BGRA |
14 return {{ x.fVec[2], x.fVec[1], x.fVec[0], x.fVec[3] }}; | 15 return {{ x.fVec[2], x.fVec[1], x.fVec[0], x.fVec[3] }}; |
15 #else | 16 #else |
16 return x; | 17 return x; |
17 #endif | 18 #endif |
18 } | 19 } |
19 | 20 |
20 enum DstType { | 21 enum DstType { |
(...skipping 207 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, co
nst SkAlpha aa[]) {} | 229 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, co
nst SkAlpha aa[]) {} |
229 | 230 |
230 const SkXfermode::D32Proc gProcs_Dst[] = { | 231 const SkXfermode::D32Proc gProcs_Dst[] = { |
231 dst, dst, dst, dst, dst, dst, dst, dst, | 232 dst, dst, dst, dst, dst, dst, dst, dst, |
232 }; | 233 }; |
233 | 234 |
234 ////////////////////////////////////////////////////////////////////////////////
/////////////////// | 235 ////////////////////////////////////////////////////////////////////////////////
/////////////////// |
235 | 236 |
236 | 237 |
237 static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) { | 238 static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) { |
238 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // For _mm_shuffle_epi8 | |
239 while (count >= 4) { | 239 while (count >= 4) { |
240 // Load 4 sRGB RGBA/BGRA 8888 dst pixels. | 240 // Load 4 sRGB RGBA/BGRA 8888 dst pixels. |
241 // We'll write most of this as if they're RGBA, and just swizzle the src
pixels to match. | 241 // We'll write most of this as if they're RGBA, and just swizzle the src
pixels to match. |
242 __m128i d4 = _mm_loadu_si128((const __m128i*)dst); | 242 auto d = Sk4x4f::Transpose((const uint8_t*)dst); |
243 | |
244 // Transpose into planar and convert each plane to float. | |
245 auto _ = ~0; // Shuffles in a zero byte. | |
246 auto dr = _mm_cvtepi32_ps( | |
247 _mm_shuffle_epi8(d4, _mm_setr_epi8(0,_,_,_, 4,_,_,_, 8,_,_,_,12,
_,_,_))); | |
248 auto dg = _mm_cvtepi32_ps( | |
249 _mm_shuffle_epi8(d4, _mm_setr_epi8(1,_,_,_, 5,_,_,_, 9,_,_,_,13,
_,_,_))); | |
250 auto db = _mm_cvtepi32_ps( | |
251 _mm_shuffle_epi8(d4, _mm_setr_epi8(2,_,_,_, 6,_,_,_,10,_,_,_,14,
_,_,_))); | |
252 auto da = _mm_cvtepi32_ps( | |
253 _mm_shuffle_epi8(d4, _mm_setr_epi8(3,_,_,_, 7,_,_,_,11,_,_,_,15,
_,_,_))); | |
254 | 243 |
255 // Scale to [0,1]. | 244 // Scale to [0,1]. |
256 dr = _mm_mul_ps(dr, _mm_set1_ps(1/255.0f)); | 245 d.r *= 1/255.0f; |
257 dg = _mm_mul_ps(dg, _mm_set1_ps(1/255.0f)); | 246 d.g *= 1/255.0f; |
258 db = _mm_mul_ps(db, _mm_set1_ps(1/255.0f)); | 247 d.b *= 1/255.0f; |
259 da = _mm_mul_ps(da, _mm_set1_ps(1/255.0f)); | 248 d.a *= 1/255.0f; |
260 | 249 |
261 // Apply approximate sRGB gamma correction to convert to linear (as if g
amma were 2). | 250 // Apply approximate sRGB gamma correction to convert to linear (as if g
amma were 2). |
262 dr = _mm_mul_ps(dr, dr); | 251 d.r *= d.r; |
263 dg = _mm_mul_ps(dg, dg); | 252 d.g *= d.g; |
264 db = _mm_mul_ps(db, db); | 253 d.b *= d.b; |
265 | 254 |
266 // Load 4 linear float src pixels. | 255 // Load 4 linear float src pixels. |
267 auto s0 = _mm_loadu_ps(src[0].fVec), | 256 auto s = Sk4x4f::Transpose(src->fVec); |
268 s1 = _mm_loadu_ps(src[1].fVec), | |
269 s2 = _mm_loadu_ps(src[2].fVec), | |
270 s3 = _mm_loadu_ps(src[3].fVec); | |
271 | |
272 // Transpose src pixels to planar too, and give the registers better nam
es. | |
273 _MM_TRANSPOSE4_PS(s0, s1, s2, s3); | |
274 auto sr = s0, | |
275 sg = s1, | |
276 sb = s2, | |
277 sa = s3; | |
278 | 257 |
279 // Match color order with destination, if necessary. | 258 // Match color order with destination, if necessary. |
280 #if defined(SK_PMCOLOR_IS_BGRA) | 259 #if defined(SK_PMCOLOR_IS_BGRA) |
281 SkTSwap(sr, sb); | 260 SkTSwap(s.r, s.b); |
282 #endif | 261 #endif |
283 | 262 |
284 // Now, the meat of what we wanted to do... perform the srcover blend. | 263 // Now, the meat of what we wanted to do... perform the srcover blend. |
285 auto invSA = _mm_sub_ps(_mm_set1_ps(1), sa); | 264 auto invSA = 1.0f - s.a; |
286 auto r = _mm_add_ps(sr, _mm_mul_ps(dr, invSA)), | 265 auto r = s.r + d.r * invSA, |
287 g = _mm_add_ps(sg, _mm_mul_ps(dg, invSA)), | 266 g = s.g + d.g * invSA, |
288 b = _mm_add_ps(sb, _mm_mul_ps(db, invSA)), | 267 b = s.b + d.b * invSA, |
289 a = _mm_add_ps(sa, _mm_mul_ps(da, invSA)); | 268 a = s.a + d.a * invSA; |
290 | 269 |
291 // Convert back to sRGB and [0,255], again approximating sRGB as gamma =
= 2. | 270 // Convert back to sRGB and [0,255], again approximating sRGB as gamma =
= 2. |
292 r = _mm_mul_ps(_mm_sqrt_ps(r), _mm_set1_ps(255)); | 271 r = r.sqrt() * 255.0f + 0.5f; |
293 g = _mm_mul_ps(_mm_sqrt_ps(g), _mm_set1_ps(255)); | 272 g = g.sqrt() * 255.0f + 0.5f; |
294 b = _mm_mul_ps(_mm_sqrt_ps(b), _mm_set1_ps(255)); | 273 b = b.sqrt() * 255.0f + 0.5f; |
295 a = _mm_mul_ps( (a), _mm_set1_ps(255)); | 274 a = a * 255.0f + 0.5f; |
296 | 275 |
297 // Convert to int (with rounding) and pack back down to planar 8-bit. | 276 Sk4x4f{r,g,b,a}.transpose((uint8_t*)dst); |
298 __m128i x = _mm_packus_epi16(_mm_packus_epi16(_mm_cvtps_epi32(r), _mm_cv
tps_epi32(g)), | |
299 _mm_packus_epi16(_mm_cvtps_epi32(b), _mm_cv
tps_epi32(a))); | |
300 | |
301 // Transpose back to interlaced RGBA and write back to dst. | |
302 x = _mm_shuffle_epi8(x, _mm_setr_epi8(0, 4, 8, 12, | |
303 1, 5, 9, 13, | |
304 2, 6, 10, 14, | |
305 3, 7, 11, 15)); | |
306 _mm_storeu_si128((__m128i*)dst, x); | |
307 | 277 |
308 count -= 4; | 278 count -= 4; |
309 dst += 4; | 279 dst += 4; |
310 src += 4; | 280 src += 4; |
311 } | 281 } |
312 #endif | 282 |
313 // This should look just like the non-specialized case in srcover_n. | 283 // This should look just like the non-specialized case in srcover_n. |
314 for (int i = 0; i < count; ++i) { | 284 for (int i = 0; i < count; ++i) { |
315 Sk4f s4 = src[i].to4f_pmorder(); | 285 Sk4f s4 = src[i].to4f_pmorder(); |
316 Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]); | 286 Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]); |
317 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); | 287 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
318 dst[i] = store_dst<kSRGB_Dst>(r4); | 288 dst[i] = store_dst<kSRGB_Dst>(r4); |
319 } | 289 } |
320 } | 290 } |
321 | 291 |
322 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], | 292 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], |
(...skipping 239 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
562 | 532 |
563 const LCD32Proc procs[] = { | 533 const LCD32Proc procs[] = { |
564 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, | 534 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, |
565 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, | 535 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, |
566 | 536 |
567 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, | 537 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, |
568 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, | 538 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, |
569 }; | 539 }; |
570 return procs[flags]; | 540 return procs[flags]; |
571 } | 541 } |
OLD | NEW |