| Index: src/opts/SkBlitRow_opts_arm_neon.cpp
|
| diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| index 95bd229486291542ea278a36076e498d6315cf26..3cb5a92f86b1d9310a47a62af9582099c6bc770b 100644
|
| --- a/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| @@ -871,282 +871,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
|
| }
|
| }
|
|
|
| -void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
|
| - const SkPMColor* SK_RESTRICT src,
|
| - int count, U8CPU alpha) {
|
| -
|
| - SkASSERT(255 == alpha);
|
| - if (count > 0) {
|
| -
|
| -
|
| - uint8x8_t alpha_mask;
|
| -
|
| - static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
|
| - alpha_mask = vld1_u8(alpha_mask_setup);
|
| -
|
| - /* do the NEON unrolled code */
|
| -#define UNROLL 4
|
| - while (count >= UNROLL) {
|
| - uint8x8_t src_raw, dst_raw, dst_final;
|
| - uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
|
| -
|
| - /* The two prefetches below may make the code slighlty
|
| - * slower for small values of count but are worth having
|
| - * in the general case.
|
| - */
|
| - __builtin_prefetch(src+32);
|
| - __builtin_prefetch(dst+32);
|
| -
|
| - /* get the source */
|
| - src_raw = vreinterpret_u8_u32(vld1_u32(src));
|
| -#if UNROLL > 2
|
| - src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
|
| -#endif
|
| -
|
| - /* get and hold the dst too */
|
| - dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
|
| -#if UNROLL > 2
|
| - dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
|
| -#endif
|
| -
|
| - /* 1st and 2nd bits of the unrolling */
|
| - {
|
| - uint8x8_t dst_cooked;
|
| - uint16x8_t dst_wide;
|
| - uint8x8_t alpha_narrow;
|
| - uint16x8_t alpha_wide;
|
| -
|
| - /* get the alphas spread out properly */
|
| - alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
|
| - alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
|
| -
|
| - /* spread the dest */
|
| - dst_wide = vmovl_u8(dst_raw);
|
| -
|
| - /* alpha mul the dest */
|
| - dst_wide = vmulq_u16 (dst_wide, alpha_wide);
|
| - dst_cooked = vshrn_n_u16(dst_wide, 8);
|
| -
|
| - /* sum -- ignoring any byte lane overflows */
|
| - dst_final = vadd_u8(src_raw, dst_cooked);
|
| - }
|
| -
|
| -#if UNROLL > 2
|
| - /* the 3rd and 4th bits of our unrolling */
|
| - {
|
| - uint8x8_t dst_cooked;
|
| - uint16x8_t dst_wide;
|
| - uint8x8_t alpha_narrow;
|
| - uint16x8_t alpha_wide;
|
| -
|
| - alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
|
| - alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
|
| -
|
| - /* spread the dest */
|
| - dst_wide = vmovl_u8(dst_raw_2);
|
| -
|
| - /* alpha mul the dest */
|
| - dst_wide = vmulq_u16 (dst_wide, alpha_wide);
|
| - dst_cooked = vshrn_n_u16(dst_wide, 8);
|
| -
|
| - /* sum -- ignoring any byte lane overflows */
|
| - dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
|
| - }
|
| -#endif
|
| -
|
| - vst1_u32(dst, vreinterpret_u32_u8(dst_final));
|
| -#if UNROLL > 2
|
| - vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
|
| -#endif
|
| -
|
| - src += UNROLL;
|
| - dst += UNROLL;
|
| - count -= UNROLL;
|
| - }
|
| -#undef UNROLL
|
| -
|
| - /* do any residual iterations */
|
| - while (--count >= 0) {
|
| - *dst = SkPMSrcOver(*src, *dst);
|
| - src += 1;
|
| - dst += 1;
|
| - }
|
| - }
|
| -}
|
| -
|
| -void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
|
| - const SkPMColor* SK_RESTRICT src,
|
| - int count, U8CPU alpha) {
|
| - SkASSERT(255 == alpha);
|
| -
|
| - if (count <= 0)
|
| - return;
|
| -
|
| - /* Use these to check if src is transparent or opaque */
|
| - const unsigned int ALPHA_OPAQ = 0xFF000000;
|
| - const unsigned int ALPHA_TRANS = 0x00FFFFFF;
|
| -
|
| -#define UNROLL 4
|
| - const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
|
| - const SkPMColor* SK_RESTRICT src_temp = src;
|
| -
|
| - /* set up the NEON variables */
|
| - uint8x8_t alpha_mask;
|
| - static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
|
| - alpha_mask = vld1_u8(alpha_mask_setup);
|
| -
|
| - uint8x8_t src_raw, dst_raw, dst_final;
|
| - uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
|
| - uint8x8_t dst_cooked;
|
| - uint16x8_t dst_wide;
|
| - uint8x8_t alpha_narrow;
|
| - uint16x8_t alpha_wide;
|
| -
|
| - /* choose the first processing type */
|
| - if( src >= src_end)
|
| - goto TAIL;
|
| - if(*src <= ALPHA_TRANS)
|
| - goto ALPHA_0;
|
| - if(*src >= ALPHA_OPAQ)
|
| - goto ALPHA_255;
|
| - /* fall-thru */
|
| -
|
| -ALPHA_1_TO_254:
|
| - do {
|
| -
|
| - /* get the source */
|
| - src_raw = vreinterpret_u8_u32(vld1_u32(src));
|
| - src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
|
| -
|
| - /* get and hold the dst too */
|
| - dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
|
| - dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
|
| -
|
| -
|
| - /* get the alphas spread out properly */
|
| - alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
|
| - /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
|
| - /* we collapsed (255-a)+1 ... */
|
| - alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
|
| -
|
| - /* spread the dest */
|
| - dst_wide = vmovl_u8(dst_raw);
|
| -
|
| - /* alpha mul the dest */
|
| - dst_wide = vmulq_u16 (dst_wide, alpha_wide);
|
| - dst_cooked = vshrn_n_u16(dst_wide, 8);
|
| -
|
| - /* sum -- ignoring any byte lane overflows */
|
| - dst_final = vadd_u8(src_raw, dst_cooked);
|
| -
|
| - alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
|
| - /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
|
| - /* we collapsed (255-a)+1 ... */
|
| - alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
|
| -
|
| - /* spread the dest */
|
| - dst_wide = vmovl_u8(dst_raw_2);
|
| -
|
| - /* alpha mul the dest */
|
| - dst_wide = vmulq_u16 (dst_wide, alpha_wide);
|
| - dst_cooked = vshrn_n_u16(dst_wide, 8);
|
| -
|
| - /* sum -- ignoring any byte lane overflows */
|
| - dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
|
| -
|
| - vst1_u32(dst, vreinterpret_u32_u8(dst_final));
|
| - vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
|
| -
|
| - src += UNROLL;
|
| - dst += UNROLL;
|
| -
|
| - /* if 2 of the next pixels aren't between 1 and 254
|
| - it might make sense to go to the optimized loops */
|
| - if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
|
| - break;
|
| -
|
| - } while(src < src_end);
|
| -
|
| - if (src >= src_end)
|
| - goto TAIL;
|
| -
|
| - if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
|
| - goto ALPHA_255;
|
| -
|
| - /*fall-thru*/
|
| -
|
| -ALPHA_0:
|
| -
|
| - /*In this state, we know the current alpha is 0 and
|
| - we optimize for the next alpha also being zero. */
|
| - src_temp = src; //so we don't have to increment dst every time
|
| - do {
|
| - if(*(++src) > ALPHA_TRANS)
|
| - break;
|
| - if(*(++src) > ALPHA_TRANS)
|
| - break;
|
| - if(*(++src) > ALPHA_TRANS)
|
| - break;
|
| - if(*(++src) > ALPHA_TRANS)
|
| - break;
|
| - } while(src < src_end);
|
| -
|
| - dst += (src - src_temp);
|
| -
|
| - /* no longer alpha 0, so determine where to go next. */
|
| - if( src >= src_end)
|
| - goto TAIL;
|
| - if(*src >= ALPHA_OPAQ)
|
| - goto ALPHA_255;
|
| - else
|
| - goto ALPHA_1_TO_254;
|
| -
|
| -ALPHA_255:
|
| - while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
|
| - dst[0]=src[0];
|
| - dst[1]=src[1];
|
| - dst[2]=src[2];
|
| - dst[3]=src[3];
|
| - src+=UNROLL;
|
| - dst+=UNROLL;
|
| - if(src >= src_end)
|
| - goto TAIL;
|
| - }
|
| -
|
| - //Handle remainder.
|
| - if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
|
| - if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
|
| - if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
|
| - }
|
| - }
|
| -
|
| - if( src >= src_end)
|
| - goto TAIL;
|
| - if(*src <= ALPHA_TRANS)
|
| - goto ALPHA_0;
|
| - else
|
| - goto ALPHA_1_TO_254;
|
| -
|
| -TAIL:
|
| - /* do any residual iterations */
|
| - src_end += UNROLL + 1; //goto the real end
|
| - while(src != src_end) {
|
| - if( *src != 0 ) {
|
| - if( *src >= ALPHA_OPAQ ) {
|
| - *dst = *src;
|
| - }
|
| - else {
|
| - *dst = SkPMSrcOver(*src, *dst);
|
| - }
|
| - }
|
| - src++;
|
| - dst++;
|
| - }
|
| -
|
| -#undef UNROLL
|
| - return;
|
| -}
|
| -
|
| /* Neon version of S32_Blend_BlitRow32()
|
| * portable version is in src/core/SkBlitRow_D32.cpp
|
| */
|
| @@ -1561,21 +1285,7 @@ const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
|
| const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
|
| nullptr, // S32_Opaque,
|
| S32_Blend_BlitRow32_neon, // S32_Blend,
|
| - /*
|
| - * We have two choices for S32A_Opaque procs. The one reads the src alpha
|
| - * value and attempts to optimize accordingly. The optimization is
|
| - * sensitive to the source content and is not a win in all cases. For
|
| - * example, if there are a lot of transitions between the alpha states,
|
| - * the performance will almost certainly be worse. However, for many
|
| - * common cases the performance is equivalent or better than the standard
|
| - * case where we do not inspect the src alpha.
|
| - */
|
| -#if SK_A32_SHIFT == 24
|
| - // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
|
| - S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
|
| -#else
|
| - S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
|
| -#endif
|
| + nullptr, // S32A_Opaque (ported to SkOpts)
|
| #ifdef SK_CPU_ARM32
|
| S32A_Blend_BlitRow32_neon // S32A_Blend
|
| #else
|
|
|