Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 3cb5a92f86b1d9310a47a62af9582099c6bc770b..7998a8951771423c34722c2284dc58394a75e983 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -909,7 +909,12 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
// Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+ vdst_wide += vsrc_wide;
+ vres = vshrn_n_u16(vdst_wide, 8);
+#endif
// Store
vst1_u32(dst, vreinterpret_u32_u8(vres));
@@ -931,7 +936,12 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
vsrc_wide = vmovl_u8(vsrc);
vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+ vdst_wide += vsrc_wide;
+ vres = vshrn_n_u16(vdst_wide, 8);
+#endif
// Store
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
@@ -943,7 +953,7 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
- SkASSERT(255 >= alpha);
+ SkASSERT(255 > alpha);
if (count <= 0) {
return;
@@ -963,9 +973,7 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
// Calc dst_scale
dst_scale = vget_lane_u8(vsrc, 3);
- dst_scale *= alpha256;
- dst_scale >>= 8;
- dst_scale = 256 - dst_scale;
+ dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
// Process src
vsrc_wide = vmovl_u8(vsrc);
@@ -976,7 +984,12 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
// Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+ vdst_wide += vsrc_wide;
+ vres = vshrn_n_u16(vdst_wide, 8);
+#endif
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
dst++;
@@ -1007,9 +1020,20 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
// Calc dst_scale
vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
vdst_scale = vmovl_u8(vsrc_alphas);
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
vdst_scale *= vsrc_scale;
vdst_scale = vshrq_n_u16(vdst_scale, 8);
vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
+#else
+ // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
+ // A 16-bit lane would overflow if we used 0xFFFF here,
+ // so use an approximation with 0xFF00 that is off by 1,
+ // and add back 1 after to get the correct value.
+ // This is valid if alpha256 <= 255.
+ vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
+ vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
+ vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
+#endif
// Process src
vsrc_wide = vmovl_u8(vsrc);
@@ -1020,7 +1044,12 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
vdst_wide *= vdst_scale;
// Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+ vdst_wide += vsrc_wide;
+ vres = vshrn_n_u16(vdst_wide, 8);
+#endif
vst1_u32(dst, vreinterpret_u32_u8(vres));