Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 3cb5a92f86b1d9310a47a62af9582099c6bc770b..7998a8951771423c34722c2284dc58394a75e983 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -909,7 +909,12 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

         // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif

         // Store
         vst1_u32(dst, vreinterpret_u32_u8(vres));
@@ -931,7 +936,12 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
         vsrc_wide = vmovl_u8(vsrc);
         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif

         // Store
         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
@@ -943,7 +953,7 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

-    SkASSERT(255 >= alpha);
+    SkASSERT(255 > alpha);

     if (count <= 0) {
         return;
@@ -963,9 +973,7 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

         // Calc dst_scale
         dst_scale = vget_lane_u8(vsrc, 3);
-        dst_scale *= alpha256;
-        dst_scale >>= 8;
-        dst_scale = 256 - dst_scale;
+        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

         // Process src
         vsrc_wide = vmovl_u8(vsrc);
@@ -976,7 +984,12 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

         // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif

         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
         dst++;
@@ -1007,9 +1020,20 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
         // Calc dst_scale
         vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
         vdst_scale = vmovl_u8(vsrc_alphas);
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vdst_scale *= vsrc_scale;
         vdst_scale = vshrq_n_u16(vdst_scale, 8);
         vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
+#else
+        // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
+        // A 16-bit lane would overflow if we used 0xFFFF here,
+        // so use an approximation with 0xFF00 that is off by 1,
+        // and add back 1 after to get the correct value.
+        // This is valid if alpha256 <= 255.
+        vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
+        vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
+        vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
+#endif

         // Process src
         vsrc_wide = vmovl_u8(vsrc);
@@ -1020,7 +1044,12 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
         vdst_wide *= vdst_scale;

         // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif

         vst1_u32(dst, vreinterpret_u32_u8(vres));

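Note on what the non-legacy paths compute, with a scalar sketch in C++. The helper names below are mine, and the scalar form of SkAlphaMulInv256 is my reading of its definition in SkColorPriv.h, so treat this as an illustration of the patch rather than code quoted from it.

The legacy "Combine" narrows each 16-bit product to 8 bits and then adds, so both terms truncate and the sum can come up one short; with scales 128/128 a 255/255 pair blends to 127 + 127 = 254 instead of 255, which is the visible symptom of the broken lerp. The fixed path adds the wide products first and narrows once; in the S32_Blend case the two scales sum to 256, so a channel sum tops out at 255 * 256 = 0xFF00 and the wide add cannot overflow a 16-bit lane. The vmlsq/vsraq sequence exists because SkAlphaMulInv256's intermediate, p + (p >> 8) with p = 0xFFFF - a * s, can reach 0xFFFF + 0xFF and would wrap in a 16-bit lane; starting the multiply-subtract from 0xFF00 keeps every intermediate within 16 bits, lands one low after the shifts, and the final shift-accumulate repairs that by starting from 1. The sketch checks the 0xFF00 trick exhaustively against the scalar reference for a, s in [0, 255], the range the "valid if alpha256 <= 255" comment guards:

#include <cassert>
#include <cstdint>

// Legacy "Combine": narrow each product to 8 bits, then add.
// Each term truncates, so the sum can be one below the true lerp.
static uint8_t lerp_broken(uint8_t d, uint8_t s, unsigned dscale, unsigned sscale) {
    return uint8_t(((d * dscale) >> 8) + ((s * sscale) >> 8));
}

// Fixed "Combine": add the wide products, then narrow once,
// matching vdst_wide += vsrc_wide; vres = vshrn_n_u16(vdst_wide, 8).
static uint8_t lerp_fixed(uint8_t d, uint8_t s, unsigned dscale, unsigned sscale) {
    return uint8_t((d * dscale + s * sscale) >> 8);
}

// Scalar model of the three NEON instructions in the last #else block.
static uint16_t alpha_mul_inv256_model(uint16_t a, uint16_t s) {
    uint16_t d = uint16_t(0xFF00 - a * s);  // vmlsq_u16(vdupq_n_u16(0xFF00), a, s)
    d = uint16_t(d + (d >> 8));             // vsraq_n_u16(d, d, 8); peaks at exactly 0xFFFF
    return uint16_t(1 + (d >> 8));          // vsraq_n_u16(vdupq_n_u16(1), d, 8)
}

int main() {
    // The endpoint error the fix removes: white over white at 128/128.
    assert(lerp_broken(255, 255, 128, 128) == 254);
    assert(lerp_fixed(255, 255, 128, 128) == 255);

    // The 0xFF00 approximation matches the (assumed) scalar
    // SkAlphaMulInv256 for every a, s in [0, 255].
    for (unsigned a = 0; a <= 255; a++) {
        for (unsigned s = 0; s <= 255; s++) {
            unsigned p = 0xFFFF - a * s;        // 32-bit scalar, so no wrap here
            unsigned ref = (p + (p >> 8)) >> 8;
            assert(alpha_mul_inv256_model(uint16_t(a), uint16_t(s)) == ref);
        }
    }
    return 0;
}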