Index: src/core/SkXfermode4f.cpp |
diff --git a/src/core/SkXfermode4f.cpp b/src/core/SkXfermode4f.cpp |
index 0ec3ec15f79e2e26e857ff3ce4cd357a66b79927..9aba0da04f3d63f341df5a6740f5d7c7a5f364fc 100644 |
--- a/src/core/SkXfermode4f.cpp |
+++ b/src/core/SkXfermode4f.cpp |
@@ -189,26 +189,60 @@ template <DstType D> void src_n(const SkXfermode::PM4fState& state, uint32_t dst |
} |
} |
+static Sk4f lerp(const Sk4f& src, const Sk4f& dst, const Sk4f& src_scale) { |
+ return dst + (src - dst) * src_scale; |
+} |
+ |
template <DstType D> void src_1(const SkXfermode::PM4fState& state, uint32_t dst[], |
const SkPM4f& src, int count, const SkAlpha aa[]) { |
- const Sk4f r4 = Sk4f::Load(src.fVec); // src always overrides dst |
- const uint32_t r32 = store_dst<D>(r4); |
+ const Sk4f s4 = Sk4f::Load(src.fVec); |
if (aa) { |
- for (int i = 0; i < count; ++i) { |
- unsigned a = aa[i]; |
- if (0 == a) { |
- continue; |
+ if (D == kLinear_Dst) { |
+ // operate in bias-255 space for src and dst |
+ const Sk4f& s4_255 = s4 * Sk4f(255); |
+ while (count >= 4) { |
+ Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f); |
+ Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4.kth<0>())) + Sk4f(0.5f); |
+ Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4.kth<1>())) + Sk4f(0.5f); |
+ Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4.kth<2>())) + Sk4f(0.5f); |
+ Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4.kth<3>())) + Sk4f(0.5f); |
+ Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3); |
+ |
+ dst += 4; |
+ aa += 4; |
+ count -= 4; |
} |
- if (a != 0xFF) { |
- Sk4f d4 = load_dst<D>(dst[i]); |
- dst[i] = store_dst<D>(lerp(r4, d4, a)); |
- } else { |
- dst[i] = r32; |
+ } else { // kSRGB |
+ while (count >= 4) { |
+ Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f); |
+ |
+ /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps |
+ * it would be faster (and possibly allow more code sharing with kLinear) to |
+ * stay in that space. |
+ */ |
+ Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4.kth<0>())); |
+ Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4.kth<1>())); |
+ Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4.kth<2>())); |
+ Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4.kth<3>())); |
+ Sk4f_ToBytes((uint8_t*)dst, |
+ linear_unit_to_srgb_255f(r0), |
+ linear_unit_to_srgb_255f(r1), |
+ linear_unit_to_srgb_255f(r2), |
+ linear_unit_to_srgb_255f(r3)); |
+ |
+ dst += 4; |
+ aa += 4; |
+ count -= 4; |
} |
} |
+ for (int i = 0; i < count; ++i) { |
+ unsigned a = aa[i]; |
+ Sk4f d4 = load_dst<D>(dst[i]); |
+ dst[i] = store_dst<D>(lerp(s4, d4, a)); |
+ } |
} else { |
- sk_memset32(dst, r32, count); |
+ sk_memset32(dst, store_dst<D>(s4), count); |
} |
} |