| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkPM4fPriv.h" | 8 #include "SkPM4fPriv.h" |
| 9 #include "SkUtils.h" | 9 #include "SkUtils.h" |
| 10 #include "SkXfermode.h" | 10 #include "SkXfermode.h" |
| | 11 #include "Sk4x4f.h" |
| 11 | 12 |
| 12 static SkPM4f rgba_to_pmcolor_order(const SkPM4f& x) { | 13 static SkPM4f rgba_to_pmcolor_order(const SkPM4f& x) { |
| 13 #ifdef SK_PMCOLOR_IS_BGRA | 14 #ifdef SK_PMCOLOR_IS_BGRA |
| 14 return {{ x.fVec[2], x.fVec[1], x.fVec[0], x.fVec[3] }}; | 15 return {{ x.fVec[2], x.fVec[1], x.fVec[0], x.fVec[3] }}; |
| 15 #else | 16 #else |
| 16 return x; | 17 return x; |
| 17 #endif | 18 #endif |
| 18 } | 19 } |
| 19 | 20 |
| 20 enum DstType { | 21 enum DstType { |
| (...skipping 207 matching lines...) |
| 228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} | 229 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} |
| 229 | 230 |
| 230 const SkXfermode::D32Proc gProcs_Dst[] = { | 231 const SkXfermode::D32Proc gProcs_Dst[] = { |
| 231 dst, dst, dst, dst, dst, dst, dst, dst, | 232 dst, dst, dst, dst, dst, dst, dst, dst, |
| 232 }; | 233 }; |
| 233 | 234 |
| 234 /////////////////////////////////////////////////////////////////////////////////////////////////// | 235 /////////////////////////////////////////////////////////////////////////////////////////////////// |
| 235 | 236 |
| 236 | 237 |
| 237 static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) { | 238 static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) { |
| 238 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // For _mm_shuffle_epi8 | |
| 239 while (count >= 4) { | 239 while (count >= 4) { |
| 240 // Load 4 sRGB RGBA/BGRA 8888 dst pixels. | 240 // Load 4 sRGB RGBA/BGRA 8888 dst pixels. |
| 241 // We'll write most of this as if they're RGBA, and just swizzle the src pixels to match. | 241 // We'll write most of this as if they're RGBA, and just swizzle the src pixels to match. |
| 242 __m128i d4 = _mm_loadu_si128((const __m128i*)dst); | 242 auto d = Sk4x4f::Transpose((const uint8_t*)dst); |
| 243 | |
| 244 // Transpose into planar and convert each plane to float. | |
| 245 auto _ = ~0; // Shuffles in a zero byte. | |
| 246 auto dr = _mm_cvtepi32_ps( | |
| 247 _mm_shuffle_epi8(d4, _mm_setr_epi8(0,_,_,_, 4,_,_,_, 8,_,_,_,12,_,_,_))); | |
| 248 auto dg = _mm_cvtepi32_ps( | |
| 249 _mm_shuffle_epi8(d4, _mm_setr_epi8(1,_,_,_, 5,_,_,_, 9,_,_,_,13,_,_,_))); | |
| 250 auto db = _mm_cvtepi32_ps( | |
| 251 _mm_shuffle_epi8(d4, _mm_setr_epi8(2,_,_,_, 6,_,_,_,10,_,_,_,14,_,_,_))); | |
| 252 auto da = _mm_cvtepi32_ps( | |
| 253 _mm_shuffle_epi8(d4, _mm_setr_epi8(3,_,_,_, 7,_,_,_,11,_,_,_,15,_,_,_))); | |
| 254 | 243 |
| 255 // Scale to [0,1]. | 244 // Scale to [0,1]. |
| 256 dr = _mm_mul_ps(dr, _mm_set1_ps(1/255.0f)); | 245 d.r *= 1/255.0f; |
| 257 dg = _mm_mul_ps(dg, _mm_set1_ps(1/255.0f)); | 246 d.g *= 1/255.0f; |
| 258 db = _mm_mul_ps(db, _mm_set1_ps(1/255.0f)); | 247 d.b *= 1/255.0f; |
| 259 da = _mm_mul_ps(da, _mm_set1_ps(1/255.0f)); | 248 d.a *= 1/255.0f; |
| 260 | 249 |
| 261 // Apply approximate sRGB gamma correction to convert to linear (as if gamma were 2). | 250 // Apply approximate sRGB gamma correction to convert to linear (as if gamma were 2). |
| 262 dr = _mm_mul_ps(dr, dr); | 251 d.r *= d.r; |
| 263 dg = _mm_mul_ps(dg, dg); | 252 d.g *= d.g; |
| 264 db = _mm_mul_ps(db, db); | 253 d.b *= d.b; |
| 265 | 254 |
| 266 // Load 4 linear float src pixels. | 255 // Load 4 linear float src pixels. |
| 267 auto s0 = _mm_loadu_ps(src[0].fVec), | 256 auto s = Sk4x4f::Transpose(src->fVec); |
| 268 s1 = _mm_loadu_ps(src[1].fVec), | |
| 269 s2 = _mm_loadu_ps(src[2].fVec), | |
| 270 s3 = _mm_loadu_ps(src[3].fVec); | |
| 271 | |
| 272 // Transpose src pixels to planar too, and give the registers better names. | |
| 273 _MM_TRANSPOSE4_PS(s0, s1, s2, s3); | |
| 274 auto sr = s0, | |
| 275 sg = s1, | |
| 276 sb = s2, | |
| 277 sa = s3; | |
| 278 | 257 |
| 279 // Match color order with destination, if necessary. | 258 // Match color order with destination, if necessary. |
| 280 #if defined(SK_PMCOLOR_IS_BGRA) | 259 #if defined(SK_PMCOLOR_IS_BGRA) |
| 281 SkTSwap(sr, sb); | 260 SkTSwap(s.r, s.b); |
| 282 #endif | 261 #endif |
| 283 | 262 |
| 284 // Now, the meat of what we wanted to do... perform the srcover blend. | 263 // Now, the meat of what we wanted to do... perform the srcover blend. |
| 285 auto invSA = _mm_sub_ps(_mm_set1_ps(1), sa); | 264 auto invSA = 1.0f - s.a; |
| 286 auto r = _mm_add_ps(sr, _mm_mul_ps(dr, invSA)), | 265 auto r = s.r + d.r * invSA, |
| 287 g = _mm_add_ps(sg, _mm_mul_ps(dg, invSA)), | 266 g = s.g + d.g * invSA, |
| 288 b = _mm_add_ps(sb, _mm_mul_ps(db, invSA)), | 267 b = s.b + d.b * invSA, |
| 289 a = _mm_add_ps(sa, _mm_mul_ps(da, invSA)); | 268 a = s.a + d.a * invSA; |
| 290 | 269 |
| 291 // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2. | 270 // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2. |
| 292 r = _mm_mul_ps(_mm_sqrt_ps(r), _mm_set1_ps(255)); | 271 r = r.sqrt() * 255.0f + 0.5f; |
| 293 g = _mm_mul_ps(_mm_sqrt_ps(g), _mm_set1_ps(255)); | 272 g = g.sqrt() * 255.0f + 0.5f; |
| 294 b = _mm_mul_ps(_mm_sqrt_ps(b), _mm_set1_ps(255)); | 273 b = b.sqrt() * 255.0f + 0.5f; |
| 295 a = _mm_mul_ps( (a), _mm_set1_ps(255)); | 274 a = a * 255.0f + 0.5f; |
| 296 | 275 |
| 297 // Convert to int (with rounding) and pack back down to planar 8-bit. | 276 Sk4x4f{r,g,b,a}.transpose((uint8_t*)dst); |
| 298 __m128i x = _mm_packus_epi16(_mm_packus_epi16(_mm_cvtps_epi32(r), _mm_cvtps_epi32(g)), | |
| 299 _mm_packus_epi16(_mm_cvtps_epi32(b), _mm_cvtps_epi32(a))); | |
| 300 | |
| 301 // Transpose back to interlaced RGBA and write back to dst. | |
| 302 x = _mm_shuffle_epi8(x, _mm_setr_epi8(0, 4, 8, 12, | |
| 303 1, 5, 9, 13, | |
| 304 2, 6, 10, 14, | |
| 305 3, 7, 11, 15)); | |
| 306 _mm_storeu_si128((__m128i*)dst, x); | |
| 307 | 277 |
| 308 count -= 4; | 278 count -= 4; |
| 309 dst += 4; | 279 dst += 4; |
| 310 src += 4; | 280 src += 4; |
| 311 } | 281 } |
| 312 #endif | 282 |
| 313 // This should look just like the non-specialized case in srcover_n. | 283 // This should look just like the non-specialized case in srcover_n. |
| 314 for (int i = 0; i < count; ++i) { | 284 for (int i = 0; i < count; ++i) { |
| 315 Sk4f s4 = src[i].to4f_pmorder(); | 285 Sk4f s4 = src[i].to4f_pmorder(); |
| 316 Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]); | 286 Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]); |
| 317 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); | 287 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
| 318 dst[i] = store_dst<kSRGB_Dst>(r4); | 288 dst[i] = store_dst<kSRGB_Dst>(r4); |
| 319 } | 289 } |
| 320 } | 290 } |
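
(Note: the NEW column above replaces the hand-rolled SSSE3 shuffles with Sk4x4f, pulled in by the new Sk4x4f.h include at the top of this diff. For readers without that header handy, here is a minimal portable sketch of just the subset the new code calls: planar r, g, b, a channels held as Sk4f, Transpose() to de-interleave four pixels, and transpose() to re-interleave them. The field and method shapes below are inferred from the call sites in this diff, not copied from Skia's header, which supplies SSE/NEON-specialized versions.)

    // Illustrative sketch only; assumes Sk4f's 4-float constructor and store().
    struct Sk4x4f {
        Sk4f r, g, b, a;

        // De-interleave four RGBA float pixels into planar channels.
        static Sk4x4f Transpose(const float fs[16]) {
            return { Sk4f(fs[0], fs[4], fs[ 8], fs[12]),
                     Sk4f(fs[1], fs[5], fs[ 9], fs[13]),
                     Sk4f(fs[2], fs[6], fs[10], fs[14]),
                     Sk4f(fs[3], fs[7], fs[11], fs[15]) };
        }

        // Same, but widening four interleaved 8888 pixels from bytes.
        static Sk4x4f Transpose(const uint8_t bs[16]) {
            float fs[16];
            for (int i = 0; i < 16; i++) { fs[i] = bs[i]; }
            return Transpose(fs);
        }

        // Re-interleave planar channels into four 8888 pixels, truncating to bytes.
        void transpose(uint8_t bs[16]) const {
            float fs[16];
            r.store(fs + 0); g.store(fs + 4); b.store(fs + 8); a.store(fs + 12);
            for (int i = 0; i < 4; i++) {
                bs[4*i + 0] = (uint8_t)fs[ 0 + i];
                bs[4*i + 1] = (uint8_t)fs[ 4 + i];
                bs[4*i + 2] = (uint8_t)fs[ 8 + i];
                bs[4*i + 3] = (uint8_t)fs[12 + i];
            }
        }
    };

Because the blend above adds 0.5f to each channel before calling transpose(), plain truncation in the sketch still rounds to nearest, matching the old _mm_cvtps_epi32 path.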
| 321 | 291 |
| 322 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], | 292 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], |
| (...skipping 239 matching lines...) |
| 562 | 532 |
| 563 const LCD32Proc procs[] = { | 533 const LCD32Proc procs[] = { |
| 564 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, | 534 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, |
| 565 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, | 535 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, |
| 566 | 536 |
| 567 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, | 537 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, |
| 568 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, | 538 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, |
| 569 }; | 539 }; |
| 570 return procs[flags]; | 540 return procs[flags]; |
| 571 } | 541 } |
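
(Note: as a cross-check of what both columns of srcover_n_srgb_bw compute, the per-pixel math can be written in scalar form. This is an expository sketch, not code from this CL; it assumes RGBA byte order, skipping the SK_PMCOLOR_IS_BGRA swizzle, and a valid premultiplied src, so the blended result stays in [0,1] and needs no clamp.)

    #include <cmath>
    #include <cstdint>

    // One pixel of srcover into an sRGB 8888 dst, approximating sRGB as gamma == 2.
    static void srcover_srgb_px(uint8_t dst[4], const float src[4]) {
        float sa = src[3];
        for (int c = 0; c < 3; c++) {
            float d = dst[c] * (1/255.0f);              // [0,255] -> [0,1]
            d *= d;                                     // approx sRGB -> linear
            float blended = src[c] + d * (1.0f - sa);   // srcover
            dst[c] = (uint8_t)(std::sqrt(blended) * 255.0f + 0.5f);  // back to sRGB, round
        }
        float da = dst[3] * (1/255.0f);                 // alpha is not gamma-encoded
        dst[3] = (uint8_t)((sa + da * (1.0f - sa)) * 255.0f + 0.5f);
    }

For example, src = {0, 0, 0, 0.5f} over dst = {255, 255, 255, 255}: each color channel linearizes to 1.0, blends to 0.5, and stores back as sqrt(0.5) * 255 + 0.5, i.e. 180.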