Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 22785be6104fbd53d103484036c59f4324c831c7..00086c37898915d25ed463df09fdb05970346efd 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -517,6 +517,178 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
     }
 }
+void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
+                                          const SkPMColor* SK_RESTRICT src,
+                                          int count, U8CPU alpha) {
+    SkASSERT(255 == alpha);
+
+    if (count <= 0)
+        return;
+
+    /* Use these to check if src is transparent or opaque */
+    const unsigned int ALPHA_OPAQ  = 0xFF000000;
+    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
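+    /* With SK_A32_SHIFT == 24 the alpha byte is the most significant byte,
+     * so a pixel is fully transparent iff its value is <= 0x00FFFFFF and
+     * fully opaque iff its value is >= 0xFF000000; a single unsigned
+     * compare classifies a pixel without any masking or shifting. */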
+
+#define UNROLL 4
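+    /* Stop the vector loops UNROLL + 1 pixels early: each iteration loads
+     * four pixels and then peeks at src[0..1] after advancing, so the extra
+     * margin keeps every read in bounds. TAIL finishes the remainder. */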
+    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
+    const SkPMColor* SK_RESTRICT src_temp = src;
+
+    /* set up the NEON variables */
+    uint8x8_t alpha_mask;
+    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
+    alpha_mask = vld1_u8(alpha_mask_setup);
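+    /* vtbl1_u8 with this index table replicates byte 3 (the alpha of the
+     * first pixel) across lanes 0-3 and byte 7 (the alpha of the second
+     * pixel) across lanes 4-7, giving one alpha per color channel. */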
+ |
+ uint8x8_t src_raw, dst_raw, dst_final; |
+ uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |
+ uint8x8_t dst_cooked; |
+ uint16x8_t dst_wide; |
+ uint8x8_t alpha_narrow; |
+ uint16x8_t alpha_wide; |
+ |
+    /* choose the first processing state from the first pixel's alpha */
+    if (src >= src_end)
+        goto TAIL;
+    if (*src <= ALPHA_TRANS)
+        goto ALPHA_0;
+    if (*src >= ALPHA_OPAQ)
+        goto ALPHA_255;
+    /* fall-thru */
+
+ALPHA_1_TO_254:
+    do {
+        /* get the source */
+        src_raw   = vreinterpret_u8_u32(vld1_u32(src));
+        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src + 2));
+
+        /* get and hold the dst too */
+        dst_raw   = vreinterpret_u8_u32(vld1_u32(dst));
+        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst + 2));
+
+        /* get the alphas spread out properly */
+        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
+        /* compute the inverse scale: 256 - alpha.  This folds
+         * SkAlpha255To256(255 - alpha) == (255 - alpha) + 1 into a
+         * single widening subtract from 256. */
+        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
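+        /* e.g. src alpha 0x80 gives scale 128, so a dst channel of 200
+         * becomes (200 * 128) >> 8 == 100 before src is added back in */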
+
+        /* spread the dest */
+        dst_wide = vmovl_u8(dst_raw);
+
+        /* alpha mul the dest */
+        dst_wide   = vmulq_u16(dst_wide, alpha_wide);
+        dst_cooked = vshrn_n_u16(dst_wide, 8);
+
+        /* sum -- ignoring any byte lane overflows */
+        dst_final = vadd_u8(src_raw, dst_cooked);
+
+        /* second pair of pixels: same inverse scale, 256 - alpha */
+        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
+        alpha_wide   = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
+
+        /* spread the dest */
+        dst_wide = vmovl_u8(dst_raw_2);
+
+        /* alpha mul the dest */
+        dst_wide   = vmulq_u16(dst_wide, alpha_wide);
+        dst_cooked = vshrn_n_u16(dst_wide, 8);
+
+        /* sum -- ignoring any byte lane overflows */
+        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
+
+        vst1_u32(dst,     vreinterpret_u32_u8(dst_final));
+        vst1_u32(dst + 2, vreinterpret_u32_u8(dst_final_2));
+
+        src += UNROLL;
+        dst += UNROLL;
+
+        /* if the next two pixels are both fully transparent or both fully
+           opaque, switch to the specialized loops */
+        if ((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS)
+                || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
+            break;
+
+    } while (src < src_end);
+
+    if (src >= src_end)
+        goto TAIL;
+
+    if (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
+        goto ALPHA_255;
+
+    /* fall-thru */
+
+ALPHA_0:
+
+    /* In this state the current alpha is 0 and we optimize for the
+       following pixels also being transparent. */
+    src_temp = src;  // defer updating dst until we know how far we skipped
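+    /* unrolled scan: each pass skips up to four transparent pixels; the
+     * pre-increment steps past the pixel already known to be transparent
+     * before testing the next one */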
+    do {
+        if (*(++src) > ALPHA_TRANS)
+            break;
+        if (*(++src) > ALPHA_TRANS)
+            break;
+        if (*(++src) > ALPHA_TRANS)
+            break;
+        if (*(++src) > ALPHA_TRANS)
+            break;
+    } while (src < src_end);
+
+    dst += (src - src_temp);
+
+    /* no longer alpha 0, so determine where to go next */
+    if (src >= src_end)
+        goto TAIL;
+    if (*src >= ALPHA_OPAQ)
+        goto ALPHA_255;
+    else
+        goto ALPHA_1_TO_254;
+
+ALPHA_255:
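+    /* the bitwise AND of four pixels keeps 0xFF in the alpha byte only
+     * if all four alphas are 0xFF, so one compare tests the whole group */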
+    while ((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
+        dst[0] = src[0];
+        dst[1] = src[1];
+        dst[2] = src[2];
+        dst[3] = src[3];
+        src += UNROLL;
+        dst += UNROLL;
+        if (src >= src_end)
+            goto TAIL;
+    }
+
+    // copy up to three more leading opaque pixels
+    if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
+        if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
+            if (*src >= ALPHA_OPAQ) { *dst++ = *src++; }
+        }
+    }
+
+    if (src >= src_end)
+        goto TAIL;
+    if (*src <= ALPHA_TRANS)
+        goto ALPHA_0;
+    else
+        goto ALPHA_1_TO_254;
+
+TAIL:
+    /* process any residual pixels one at a time */
+    src_end += UNROLL + 1;  // restore the true end of the row
+    while (src != src_end) {
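+        /* premultiplied colors: alpha 0 implies an all-zero pixel, so any
+         * nonzero value still needs to be copied or blended */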
+        if (*src != 0) {
+            if (*src >= ALPHA_OPAQ) {
+                *dst = *src;
+            } else {
+                *dst = SkPMSrcOver(*src, *dst);
+            }
+        }
+        src++;
+        dst++;
+    }
+
+#undef UNROLL
+    return;
+}
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
@@ -1107,6 +1279,20 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = {
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     NULL,                       // S32_Opaque,
     S32_Blend_BlitRow32_neon,   // S32_Blend,
-    S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
+    /*
+     * We have two choices for S32A_Opaque procs. One of them reads the src
+     * alpha value and attempts to optimize accordingly. The optimization is
+     * sensitive to the source content and is not a win in all cases. For
+     * example, if there are many transitions between the alpha states, the
+     * performance will almost certainly be worse. However, for many common
+     * cases the performance is equivalent to or better than the standard
+     * case, where we do not inspect the src alpha.
+     */
+#if SK_A32_SHIFT == 24
+    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
+    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
+#else
+    S32A_Opaque_BlitRow32_neon,             // S32A_Opaque,
+#endif
     S32A_Blend_BlitRow32_arm    // S32A_Blend
 };