OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkPM4fPriv.h" | 8 #include "SkPM4fPriv.h" |
9 #include "SkUtils.h" | 9 #include "SkUtils.h" |
10 #include "SkXfermode.h" | 10 #include "SkXfermode.h" |
(...skipping 160 matching lines...)
171 if (D == kLinear_Dst) { | 171 if (D == kLinear_Dst) { |
172 // operate in bias-255 space for src and dst | 172 // operate in bias-255 space for src and dst |
173 const Sk4f& s4_255 = s4 * Sk4f(255); | 173 const Sk4f& s4_255 = s4 * Sk4f(255); |
174 while (count >= 4) { | 174 while (count >= 4) { |
175 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f); | 175 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f); |
176 Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f); | 176 Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f); |
177 Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f); | 177 Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f); |
178 Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f); | 178 Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f); |
179 Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f); | 179 Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f); |
180 Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3); | 180 Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3); |
181 | 181 |
182 dst += 4; | 182 dst += 4; |
183 aa += 4; | 183 aa += 4; |
184 count -= 4; | 184 count -= 4; |
185 } | 185 } |
186 } else { // kSRGB | 186 } else { // kSRGB |
187 while (count >= 4) { | 187 while (count >= 4) { |
188 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f); | 188 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f); |
189 | 189 |
190 /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps | 190 /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps |
191 * it would be faster (and possibly allow more code sharing with kLinear) to | 191 * it would be faster (and possibly allow more code sharing with kLinear) to |
192 * stay in that space. | 192 * stay in that space. |
193 */ | 193 */ |
194 Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4[0])); | 194 Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4[0])); |
195 Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4[1])); | 195 Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4[1])); |
196 Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4[2])); | 196 Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4[2])); |
197 Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4[3])); | 197 Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4[3])); |
198 Sk4f_ToBytes((uint8_t*)dst, | 198 Sk4f_ToBytes((uint8_t*)dst, |
199 linear_unit_to_srgb_255f(r0), | 199 linear_unit_to_srgb_255f(r0), |
200 linear_unit_to_srgb_255f(r1), | 200 linear_unit_to_srgb_255f(r1), |
201 linear_unit_to_srgb_255f(r2), | 201 linear_unit_to_srgb_255f(r2), |
202 linear_unit_to_srgb_255f(r3)); | 202 linear_unit_to_srgb_255f(r3)); |
203 | 203 |
204 dst += 4; | 204 dst += 4; |
205 aa += 4; | 205 aa += 4; |
206 count -= 4; | 206 count -= 4; |
207 } | 207 } |
208 } | 208 } |
209 for (int i = 0; i < count; ++i) { | 209 for (int i = 0; i < count; ++i) { |
210 unsigned a = aa[i]; | 210 unsigned a = aa[i]; |
211 Sk4f d4 = load_dst<D>(dst[i]); | 211 Sk4f d4 = load_dst<D>(dst[i]); |
212 dst[i] = store_dst<D>(lerp(s4, d4, a)); | 212 dst[i] = store_dst<D>(lerp(s4, d4, a)); |
213 } | 213 } |
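
Both unrolled loops above and the scalar tail compute the same per-pixel coverage lerp; only the working space differs (bias-255 linear vs. unit-float sRGB, where the + Sk4f(0.5f) in the linear path rounds before the float-to-byte truncation in Sk4f_ToBytes). A minimal scalar sketch of that blend, assuming unit-range float pixels; lerp_pixel is a hypothetical helper, not part of this CL:

    // r = s*aa + d*(1 - aa) per channel, with 8-bit coverage mapped to [0,1].
    // Sketch only; the loops above do this four pixels at a time with Sk4f.
    static void lerp_pixel(float r[4], const float s[4], const float d[4], uint8_t aa8) {
        const float aa = aa8 * (1 / 255.0f);
        for (int c = 0; c < 4; ++c) {
            r[c] = s[c] * aa + d[c] * (1 - aa);
        }
    }
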
(...skipping 12 matching lines...)
226 /////////////////////////////////////////////////////////////////////////////////////////////////// | 226 /////////////////////////////////////////////////////////////////////////////////////////////////// |
227 | 227 |
228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} | 228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} |
229 | 229 |
230 const SkXfermode::D32Proc gProcs_Dst[] = { | 230 const SkXfermode::D32Proc gProcs_Dst[] = { |
231 dst, dst, dst, dst, dst, dst, dst, dst, | 231 dst, dst, dst, dst, dst, dst, dst, dst, |
232 }; | 232 }; |
233 | 233 |
234 /////////////////////////////////////////////////////////////////////////////////////////////////// | 234 /////////////////////////////////////////////////////////////////////////////////////////////////// |
235 | 235 |
| 236 |
| 237 static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) { |
| 238 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // For _mm_shuffle_epi8 |
| 239 while (count >= 4) { |
| 240 // Load 4 sRGB RGBA/BGRA 8888 dst pixels. |
| 241 // We'll write most of this as if they're RGBA, and just swizzle the src pixels to match. |
| 242 __m128i d4 = _mm_loadu_si128((const __m128i*)dst); |
| 243 |
| 244 // Transpose into planar and convert each plane to float. |
| 245 auto _ = ~0; // Shuffles in a zero byte. |
| 246 auto dr = _mm_cvtepi32_ps( |
| 247 _mm_shuffle_epi8(d4, _mm_setr_epi8(0,_,_,_, 4,_,_,_, 8,_,_,_,12,_,_,_))); |
| 248 auto dg = _mm_cvtepi32_ps( |
| 249 _mm_shuffle_epi8(d4, _mm_setr_epi8(1,_,_,_, 5,_,_,_, 9,_,_,_,13,_,_,_))); |
| 250 auto db = _mm_cvtepi32_ps( |
| 251 _mm_shuffle_epi8(d4, _mm_setr_epi8(2,_,_,_, 6,_,_,_,10,_,_,_,14,_,_,_))); |
| 252 auto da = _mm_cvtepi32_ps( |
| 253 _mm_shuffle_epi8(d4, _mm_setr_epi8(3,_,_,_, 7,_,_,_,11,_,_,_,15,_,_,_))); |
| 254 |
| 255 // Scale to [0,1]. |
| 256 dr = _mm_mul_ps(dr, _mm_set1_ps(1/255.0f)); |
| 257 dg = _mm_mul_ps(dg, _mm_set1_ps(1/255.0f)); |
| 258 db = _mm_mul_ps(db, _mm_set1_ps(1/255.0f)); |
| 259 da = _mm_mul_ps(da, _mm_set1_ps(1/255.0f)); |
| 260 |
| 261 // Apply approximate sRGB gamma correction to convert to linear (as if gamma were 2). |
| 262 dr = _mm_mul_ps(dr, dr); |
| 263 dg = _mm_mul_ps(dg, dg); |
| 264 db = _mm_mul_ps(db, db); |
| 265 |
| 266 // Load 4 linear float src pixels. |
| 267 auto s0 = _mm_loadu_ps(src[0].fVec), |
| 268 s1 = _mm_loadu_ps(src[1].fVec), |
| 269 s2 = _mm_loadu_ps(src[2].fVec), |
| 270 s3 = _mm_loadu_ps(src[3].fVec); |
| 271 |
| 272 // Transpose src pixels to planar too, and give the registers better names. |
| 273 _MM_TRANSPOSE4_PS(s0, s1, s2, s3); |
| 274 auto sr = s0, |
| 275 sg = s1, |
| 276 sb = s2, |
| 277 sa = s3; |
| 278 |
| 279 // Match color order with destination, if necessary. |
| 280 #if defined(SK_PMCOLOR_IS_BGRA) |
| 281 SkTSwap(sr, sb); |
| 282 #endif |
| 283 |
| 284 // Now, the meat of what we wanted to do... perform the srcover blend. |
| 285 auto invSA = _mm_sub_ps(_mm_set1_ps(1), sa); |
| 286 auto r = _mm_add_ps(sr, _mm_mul_ps(dr, invSA)), |
| 287 g = _mm_add_ps(sg, _mm_mul_ps(dg, invSA)), |
| 288 b = _mm_add_ps(sb, _mm_mul_ps(db, invSA)), |
| 289 a = _mm_add_ps(sa, _mm_mul_ps(da, invSA)); |
| 290 |
| 291 // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2. |
| 292 r = _mm_mul_ps(_mm_sqrt_ps(r), _mm_set1_ps(255)); |
| 293 g = _mm_mul_ps(_mm_sqrt_ps(g), _mm_set1_ps(255)); |
| 294 b = _mm_mul_ps(_mm_sqrt_ps(b), _mm_set1_ps(255)); |
| 295 a = _mm_mul_ps( (a), _mm_set1_ps(255)); |
| 296 |
| 297 // Convert to int (with rounding) and pack back down to planar 8-bit. |
| 298 __m128i x = _mm_packus_epi16(_mm_packus_epi16(_mm_cvtps_epi32(r), _mm_cvtps_epi32(g)), |
| 299 _mm_packus_epi16(_mm_cvtps_epi32(b), _mm_cvtps_epi32(a))); |
| 300 |
| 301 // Transpose back to interlaced RGBA and write back to dst. |
| 302 x = _mm_shuffle_epi8(x, _mm_setr_epi8(0, 4, 8, 12, |
| 303 1, 5, 9, 13, |
| 304 2, 6, 10, 14, |
| 305 3, 7, 11, 15)); |
| 306 _mm_storeu_si128((__m128i*)dst, x); |
| 307 |
| 308 count -= 4; |
| 309 dst += 4; |
| 310 src += 4; |
| 311 } |
| 312 #endif |
| 313 // This should look just like the non-specialized case in srcover_n. |
| 314 for (int i = 0; i < count; ++i) { |
| 315 Sk4f s4 = src[i].to4f_pmorder(); |
| 316 Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]); |
| 317 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
| 318 dst[i] = store_dst<kSRGB_Dst>(r4); |
| 319 } |
| 320 } |
| 321 |
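
For review, this is the per-channel math the SSSE3 block above vectorizes, with the sRGB transfer curve approximated as gamma 2 (decode by squaring, re-encode by square root; alpha is blended without the curve, matching the lone non-sqrt line). A sketch under those assumptions; srcover_channel_g2 is a hypothetical helper, not part of this CL:

    #include <math.h>    // sqrtf
    #include <stdint.h>

    // d_linear = (d/255)^2             decode dst, gamma == 2
    // r        = s + d_linear*(1 - sa) srcover in linear space
    // byte     = round(sqrt(r) * 255)  re-encode
    static uint8_t srcover_channel_g2(float s, float sa, uint8_t d8) {
        float d = (d8 * (1 / 255.0f)) * (d8 * (1 / 255.0f));
        float r = s + d * (1 - sa);
        return (uint8_t)(sqrtf(r) * 255.0f + 0.5f);
    }
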
236 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], | 322 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], |
237 const SkPM4f src[], int count, const SkAlpha aa[]) { | 323 const SkPM4f src[], int count, const SkAlpha aa[]) { |
238 if (aa) { | 324 if (aa) { |
239 for (int i = 0; i < count; ++i) { | 325 for (int i = 0; i < count; ++i) { |
240 unsigned a = aa[i]; | 326 unsigned a = aa[i]; |
241 if (0 == a) { | 327 if (0 == a) { |
242 continue; | 328 continue; |
243 } | 329 } |
244 Sk4f s4 = src[i].to4f_pmorder(); | 330 Sk4f s4 = src[i].to4f_pmorder(); |
245 Sk4f d4 = load_dst<D>(dst[i]); | 331 Sk4f d4 = load_dst<D>(dst[i]); |
246 if (a != 0xFF) { | 332 if (a != 0xFF) { |
247 s4 = scale_by_coverage(s4, a); | 333 s4 = scale_by_coverage(s4, a); |
248 } | 334 } |
249 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); | 335 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
250 dst[i] = store_dst<D>(r4); | 336 dst[i] = store_dst<D>(r4); |
251 } | 337 } |
252 } else { | 338 } else { |
253 for (int i = 0; i < count; ++i) { | 339 if (D == kSRGB_Dst) { |
254 Sk4f s4 = src[i].to4f_pmorder(); | 340 srcover_n_srgb_bw(dst, src, count); |
255 Sk4f d4 = load_dst<D>(dst[i]); | 341 } else { |
256 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); | 342 for (int i = 0; i < count; ++i) { |
257 dst[i] = store_dst<D>(r4); | 343 Sk4f s4 = src[i].to4f_pmorder(); |
| 344 Sk4f d4 = load_dst<D>(dst[i]); |
| 345 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
| 346 dst[i] = store_dst<D>(r4); |
| 347 } |
258 } | 348 } |
259 } | 349 } |
260 } | 350 } |
261 | 351 |
262 static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[], | 352 static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[], |
263 const SkPM4f* src, int count, const SkAlpha aa[]) { | 353 const SkPM4f* src, int count, const SkAlpha aa[]) { |
264 const Sk4f s4 = src->to4f_pmorder(); | 354 const Sk4f s4 = src->to4f_pmorder(); |
265 const Sk4f dst_scale = Sk4f(1 - get_alpha(s4)); | 355 const Sk4f dst_scale = Sk4f(1 - get_alpha(s4)); |
266 | 356 |
267 if (aa) { | 357 if (aa) { |
268 for (int i = 0; i < count; ++i) { | 358 for (int i = 0; i < count; ++i) { |
269 unsigned a = aa[i]; | 359 unsigned a = aa[i]; |
270 if (0 == a) { | 360 if (0 == a) { |
271 continue; | 361 continue; |
272 } | 362 } |
273 Sk4f d4 = Sk4f_fromL32(dst[i]); | 363 Sk4f d4 = Sk4f_fromL32(dst[i]); |
274 Sk4f r4; | 364 Sk4f r4; |
275 if (a != 0xFF) { | 365 if (a != 0xFF) { |
276 Sk4f s4_aa = scale_by_coverage(s4, a); | 366 Sk4f s4_aa = scale_by_coverage(s4, a); |
(...skipping 112 matching lines...)
389 Sk4i rgbi = Sk4i(SkGetPackedR16(rgb), SkGetPackedG16(rgb), SkGetPackedB16(rgb), 0); | 479 Sk4i rgbi = Sk4i(SkGetPackedR16(rgb), SkGetPackedG16(rgb), SkGetPackedB16(rgb), 0); |
390 #else | 480 #else |
391 Sk4i rgbi = Sk4i(SkGetPackedB16(rgb), SkGetPackedG16(rgb), SkGetPackedR16(rgb), 0); | 481 Sk4i rgbi = Sk4i(SkGetPackedB16(rgb), SkGetPackedG16(rgb), SkGetPackedR16(rgb), 0); |
392 #endif | 482 #endif |
393 return SkNx_cast<float>(rgbi) * Sk4f(1.0f/31, 1.0f/63, 1.0f/31, 0); | 483 return SkNx_cast<float>(rgbi) * Sk4f(1.0f/31, 1.0f/63, 1.0f/31, 0); |
394 } | 484 } |
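
lcd16_to_unit_4f above expands one 565 LCD coverage value into per-subpixel coverage in [0,1], normalizing each field by its maximum (31, 63, 31); the #else branch only swaps the R and B lanes for BGRA. A scalar equivalent, assuming the standard 565 bit layout that SkGetPackedR16/G16/B16 decode; lcd16_to_unit is a hypothetical name:

    #include <stdint.h>

    // e.g. rgb == 0xFFFF -> (1, 1, 1, 0); rgb == 0x07E0 (green only) -> (0, 1, 0, 0).
    static void lcd16_to_unit(uint16_t rgb, float out[4]) {
        out[0] = ((rgb >> 11) & 31) * (1 / 31.0f);  // R: bits 11-15
        out[1] = ((rgb >>  5) & 63) * (1 / 63.0f);  // G: bits 5-10
        out[2] = ( rgb        & 31) * (1 / 31.0f);  // B: bits 0-4
        out[3] = 0;                                 // alpha lane unused here
    }
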
395 | 485 |
396 template <DstType D> | 486 template <DstType D> |
397 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) { | 487 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) { |
398 const Sk4f s4 = Sk4f::Load(src->fVec); | 488 const Sk4f s4 = Sk4f::Load(src->fVec); |
399 | 489 |
400 if (D == kLinear_Dst) { | 490 if (D == kLinear_Dst) { |
401 // operate in bias-255 space for src and dst | 491 // operate in bias-255 space for src and dst |
402 const Sk4f s4bias = s4 * Sk4f(255); | 492 const Sk4f s4bias = s4 * Sk4f(255); |
403 for (int i = 0; i < count; ++i) { | 493 for (int i = 0; i < count; ++i) { |
404 uint16_t rgb = lcd[i]; | 494 uint16_t rgb = lcd[i]; |
405 if (0 == rgb) { | 495 if (0 == rgb) { |
406 continue; | 496 continue; |
407 } | 497 } |
408 Sk4f d4bias = to_4f(dst[i]); | 498 Sk4f d4bias = to_4f(dst[i]); |
409 dst[i] = to_4b(lerp(s4bias, d4bias, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT); | 499 dst[i] = to_4b(lerp(s4bias, d4bias, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT); |
(...skipping 62 matching lines...)
472 | 562 |
473 const LCD32Proc procs[] = { | 563 const LCD32Proc procs[] = { |
474 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, | 564 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, |
475 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, | 565 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, |
476 | 566 |
477 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, | 567 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, |
478 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, | 568 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, |
479 }; | 569 }; |
480 return procs[flags]; | 570 return procs[flags]; |
481 } | 571 } |
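
procs[flags] indexes the eight-entry table directly, so the table order implies the flag layout below. This is an inference from this snippet alone (the real flag constants are defined outside this diff):

    // flags & 1 : 0 -> srcover_*_lcd        1 -> src_*_lcd
    // flags & 2 : 0 -> *_n_lcd (src array)  1 -> *_1_lcd (single src)
    // flags & 4 : 0 -> kSRGB_Dst            1 -> kLinear_Dst
    // e.g. flags == 6 selects srcover_1_lcd<kLinear_Dst>.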