Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 95bd229486291542ea278a36076e498d6315cf26..3cb5a92f86b1d9310a47a62af9582099c6bc770b 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -871,282 +871,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
     }
 }
-void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-
-    SkASSERT(255 == alpha);
-    if (count > 0) {
-
-
-        uint8x8_t alpha_mask;
-
-        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-        alpha_mask = vld1_u8(alpha_mask_setup);
-
-        /* do the NEON unrolled code */
-#define UNROLL 4
-        while (count >= UNROLL) {
-            uint8x8_t src_raw, dst_raw, dst_final;
-            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-
-            /* The two prefetches below may make the code slighlty
-             * slower for small values of count but are worth having
-             * in the general case.
-             */
-            __builtin_prefetch(src+32);
-            __builtin_prefetch(dst+32);
-
-            /* get the source */
-            src_raw = vreinterpret_u8_u32(vld1_u32(src));
-#if UNROLL > 2
-            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-#endif
-
-            /* get and hold the dst too */
-            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-#if UNROLL > 2
-            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-#endif
-
-            /* 1st and 2nd bits of the unrolling */
-            {
-                uint8x8_t dst_cooked;
-                uint16x8_t dst_wide;
-                uint8x8_t alpha_narrow;
-                uint16x8_t alpha_wide;
-
-                /* get the alphas spread out properly */
-                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-                /* spread the dest */
-                dst_wide = vmovl_u8(dst_raw);
-
-                /* alpha mul the dest */
-                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-                dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-                /* sum -- ignoring any byte lane overflows */
-                dst_final = vadd_u8(src_raw, dst_cooked);
-            }
-
-#if UNROLL > 2
-            /* the 3rd and 4th bits of our unrolling */
-            {
-                uint8x8_t dst_cooked;
-                uint16x8_t dst_wide;
-                uint8x8_t alpha_narrow;
-                uint16x8_t alpha_wide;
-
-                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-                /* spread the dest */
-                dst_wide = vmovl_u8(dst_raw_2);
-
-                /* alpha mul the dest */
-                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-                dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-                /* sum -- ignoring any byte lane overflows */
-                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-            }
-#endif
-
-            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-#if UNROLL > 2
-            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-#endif
-
-            src += UNROLL;
-            dst += UNROLL;
-            count -= UNROLL;
-        }
-#undef UNROLL
-
-        /* do any residual iterations */
-        while (--count >= 0) {
-            *dst = SkPMSrcOver(*src, *dst);
-            src += 1;
-            dst += 1;
-        }
-    }
-}
-
-void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
-                                          const SkPMColor* SK_RESTRICT src,
-                                          int count, U8CPU alpha) {
-    SkASSERT(255 == alpha);
-
-    if (count <= 0)
-        return;
-
-    /* Use these to check if src is transparent or opaque */
-    const unsigned int ALPHA_OPAQ = 0xFF000000;
-    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
-
-#define UNROLL 4
-    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
-    const SkPMColor* SK_RESTRICT src_temp = src;
-
-    /* set up the NEON variables */
-    uint8x8_t alpha_mask;
-    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-    alpha_mask = vld1_u8(alpha_mask_setup);
-
-    uint8x8_t src_raw, dst_raw, dst_final;
-    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-    uint8x8_t dst_cooked;
-    uint16x8_t dst_wide;
-    uint8x8_t alpha_narrow;
-    uint16x8_t alpha_wide;
-
-    /* choose the first processing type */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    /* fall-thru */
-
-ALPHA_1_TO_254:
-    do {
-
-        /* get the source */
-        src_raw = vreinterpret_u8_u32(vld1_u32(src));
-        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-
-        /* get and hold the dst too */
-        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-
-
-        /* get the alphas spread out properly */
-        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final = vadd_u8(src_raw, dst_cooked);
-
-        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw_2);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-
-        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-
-        src += UNROLL;
-        dst += UNROLL;
-
-        /* if 2 of the next pixels aren't between 1 and 254
-           it might make sense to go to the optimized loops */
-        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
-            break;
-
-    } while(src < src_end);
-
-    if (src >= src_end)
-        goto TAIL;
-
-    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
-        goto ALPHA_255;
-
-    /*fall-thru*/
-
-ALPHA_0:
-
-    /*In this state, we know the current alpha is 0 and
-      we optimize for the next alpha also being zero. */
-    src_temp = src; //so we don't have to increment dst every time
-    do {
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-    } while(src < src_end);
-
-    dst += (src - src_temp);
-
-    /* no longer alpha 0, so determine where to go next. */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    else
-        goto ALPHA_1_TO_254;
-
-ALPHA_255:
-    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
-        dst[0]=src[0];
-        dst[1]=src[1];
-        dst[2]=src[2];
-        dst[3]=src[3];
-        src+=UNROLL;
-        dst+=UNROLL;
-        if(src >= src_end)
-            goto TAIL;
-    }
-
-    //Handle remainder.
-    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
-        }
-    }
-
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    else
-        goto ALPHA_1_TO_254;
-
-TAIL:
-    /* do any residual iterations */
-    src_end += UNROLL + 1; //goto the real end
-    while(src != src_end) {
-        if( *src != 0 ) {
-            if( *src >= ALPHA_OPAQ ) {
-                *dst = *src;
-            }
-            else {
-                *dst = SkPMSrcOver(*src, *dst);
-            }
-        }
-        src++;
-        dst++;
-    }
-
-#undef UNROLL
-    return;
-}
-
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
  */
@@ -1561,21 +1285,7 @@ const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     nullptr, // S32_Opaque,
     S32_Blend_BlitRow32_neon, // S32_Blend,
-    /*
-     * We have two choices for S32A_Opaque procs. The one reads the src alpha
-     * value and attempts to optimize accordingly. The optimization is
-     * sensitive to the source content and is not a win in all cases. For
-     * example, if there are a lot of transitions between the alpha states,
-     * the performance will almost certainly be worse. However, for many
-     * common cases the performance is equivalent or better than the standard
-     * case where we do not inspect the src alpha.
-     */
-#if SK_A32_SHIFT == 24
-    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
-    S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
-#else
-    S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
-#endif
+    nullptr, // Ported to SkOpts
 #ifdef SK_CPU_ARM32
     S32A_Blend_BlitRow32_neon // S32A_Blend
 #else
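
Editor's note on the deleted kernels: both removed routines vectorize the same per-pixel premultiplied src-over blend that their residual loops express with SkPMSrcOver, namely scaling each destination channel by (256 - srcAlpha) and adding the source (the vsubw_u8(vdupq_n_u16(256), alpha) / vshrn_n_u16(..., 8) / vadd_u8 sequence above). The sketch below is a minimal scalar restatement of that math for reference only; the function names are invented for illustration, it assumes alpha lives in the top byte (the SK_A32_SHIFT == 24 case noted in the removed comment), and it is not the SkOpts implementation this change migrates to.

#include <cstdint>

// Hypothetical scalar equivalent of the removed NEON kernels: each 8-bit
// channel becomes src + ((dst * (256 - srcAlpha)) >> 8).
static inline uint32_t srcover_premul(uint32_t src, uint32_t dst) {
    uint32_t scale = 256 - (src >> 24);  // alpha assumed in bits 24-31

    // R and B: multiply both 8-bit lanes at once inside 16-bit fields,
    // then shift back down and re-mask.
    uint32_t rb = (((dst & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;

    // A and G: multiply in place and keep the high byte of each 16-bit
    // lane, which folds the >>8 and the shift back into position into
    // a single mask.
    uint32_t ag = (((dst >> 8) & 0x00FF00FF) * scale) & 0xFF00FF00;

    // Sum; no lane overflow occurs for valid premultiplied input.
    return src + (rb | ag);
}

// Reference loop matching the residual-iteration loops in the deleted code.
void blit_row_srcover(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = srcover_premul(src[i], dst[i]);
    }
}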