Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 22785be6104fbd53d103484036c59f4324c831c7..00086c37898915d25ed463df09fdb05970346efd 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -517,6 +517,178 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
     }
 }
+void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
+                                          const SkPMColor* SK_RESTRICT src,
+                                          int count, U8CPU alpha) {
+    SkASSERT(255 == alpha);
+
+    if (count <= 0)
+        return;
+
+    /* Use these to check if src is transparent or opaque */
+    const unsigned int ALPHA_OPAQ  = 0xFF000000;
+    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
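+    /* With SK_A32_SHIFT == 24 the alpha byte is the most significant byte,
+     * so a pixel is fully transparent iff its value is <= 0x00FFFFFF and
+     * fully opaque iff its value is >= 0xFF000000; a single unsigned
+     * compare classifies a pixel without any masking or shifting. */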
+
+#define UNROLL 4
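+    /* Stop the vector loops UNROLL + 1 pixels early: each iteration loads
+     * four pixels and then peeks at src[0..1] after advancing, so the extra
+     * margin keeps every read in bounds. TAIL finishes the remainder. */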
+    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
+    const SkPMColor* SK_RESTRICT src_temp = src;
+
+    /* set up the NEON variables */
+    uint8x8_t alpha_mask;
+    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
+    alpha_mask = vld1_u8(alpha_mask_setup);
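+    /* vtbl1_u8 with this index table replicates byte 3 (the alpha of the
+     * first pixel) across lanes 0-3 and byte 7 (the alpha of the second
+     * pixel) across lanes 4-7, giving one alpha per color channel. */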
+ |
+ uint8x8_t src_raw, dst_raw, dst_final; |
+ uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |
+ uint8x8_t dst_cooked; |
+ uint16x8_t dst_wide; |
+ uint8x8_t alpha_narrow; |
+ uint16x8_t alpha_wide; |
+ |
+    /* choose the first processing state from the first pixel's alpha */
+    if (src >= src_end)
+        goto TAIL;
+    if (*src <= ALPHA_TRANS)
+        goto ALPHA_0;
+    if (*src >= ALPHA_OPAQ)
+        goto ALPHA_255;
+    /* fall-thru */
+
+ALPHA_1_TO_254:
+    do {
+        /* get the source */
+        src_raw   = vreinterpret_u8_u32(vld1_u32(src));
+        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src + 2));
+
+        /* get and hold the dst too */
+        dst_raw   = vreinterpret_u8_u32(vld1_u32(dst));
+        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst + 2));
+
+        /* get the alphas spread out properly */
+        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
+        /* compute the inverse scale: 256 - alpha.  This folds
+         * SkAlpha255To256(255 - alpha) == (255 - alpha) + 1 into a
+         * single widening subtract from 256. */
+        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
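+        /* e.g. src alpha 0x80 gives scale 128, so a dst channel of 200
+         * becomes (200 * 128) >> 8 == 100 before src is added back in */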
+
+        /* spread the dest */
+        dst_wide = vmovl_u8(dst_raw);
+
+        /* alpha mul the dest */
+        dst_wide   = vmulq_u16(dst_wide, alpha_wide);
+        dst_cooked = vshrn_n_u16(dst_wide, 8);
+
+        /* sum -- ignoring any byte lane overflows */
+        dst_final = vadd_u8(src_raw, dst_cooked);
+
+        /* second pair of pixels: same inverse scale, 256 - alpha */
+        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
+        alpha_wide   = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
+
+        /* spread the dest */
+        dst_wide = vmovl_u8(dst_raw_2);
+
+        /* alpha mul the dest */
+        dst_wide   = vmulq_u16(dst_wide, alpha_wide);
+        dst_cooked = vshrn_n_u16(dst_wide, 8);
+
+        /* sum -- ignoring any byte lane overflows */
+        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
+
+        vst1_u32(dst,     vreinterpret_u32_u8(dst_final));
+        vst1_u32(dst + 2, vreinterpret_u32_u8(dst_final_2));
+
+        src += UNROLL;
+        dst += UNROLL;
+
+        /* if the next two pixels are both fully transparent or both fully
+           opaque, switch to the specialized loops */
+        if ((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS)
+                || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
+            break;
+
+    } while (src < src_end);
+
+    if (src >= src_end)
+        goto TAIL;
+
+    if (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
+        goto ALPHA_255;
+
+    /* fall-thru */
+
+ALPHA_0:
+
+    /* In this state the current alpha is 0 and we optimize for the
+       following pixels also being transparent. */
+    src_temp = src;  // defer updating dst until we know how far we skipped
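+    /* unrolled scan: each pass skips up to four transparent pixels; the
+     * pre-increment steps past the pixel already known to be transparent
+     * before testing the next one */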
+    do {
+        if (*(++src) > ALPHA_TRANS)
+            break;
+        if (*(++src) > ALPHA_TRANS)
+            break;
+        if (*(++src) > ALPHA_TRANS)
+            break;
+        if (*(++src) > ALPHA_TRANS)
+            break;
+    } while (src < src_end);
+
+    dst += (src - src_temp);
+
+    /* no longer alpha 0, so determine where to go next */
+    if (src >= src_end)
+        goto TAIL;
+    if (*src >= ALPHA_OPAQ)
+        goto ALPHA_255;
+    else
+        goto ALPHA_1_TO_254;
+
+ALPHA_255:
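+    /* the bitwise AND of four pixels keeps 0xFF in the alpha byte only
+     * if all four alphas are 0xFF, so one compare tests the whole group */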
+    while ((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
+        dst[0] = src[0];
+        dst[1] = src[1];
+        dst[2] = src[2];
+        dst[3] = src[3];
+        src += UNROLL;
+        dst += UNROLL;
+        if (src >= src_end)
+            goto TAIL;
+    }
+
+    // copy up to three more leading opaque pixels
+    if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
+        if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
+            if (*src >= ALPHA_OPAQ) { *dst++ = *src++; }
+        }
+    }
+
+    if (src >= src_end)
+        goto TAIL;
+    if (*src <= ALPHA_TRANS)
+        goto ALPHA_0;
+    else
+        goto ALPHA_1_TO_254;
+
+TAIL:
+    /* process any residual pixels one at a time */
+    src_end += UNROLL + 1;  // restore the true end of the row
+    while (src != src_end) {
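+        /* premultiplied colors: alpha 0 implies an all-zero pixel, so any
+         * nonzero value still needs to be copied or blended */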
+        if (*src != 0) {
+            if (*src >= ALPHA_OPAQ) {
+                *dst = *src;
+            } else {
+                *dst = SkPMSrcOver(*src, *dst);
+            }
+        }
+        src++;
+        dst++;
+    }
+
+#undef UNROLL
+    return;
+}
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
@@ -1107,6 +1279,20 @@ const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = {
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     NULL,                       // S32_Opaque,
     S32_Blend_BlitRow32_neon,   // S32_Blend,
-    S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
+    /*
+     * We have two choices for S32A_Opaque procs. One of them reads the src
+     * alpha value and attempts to optimize accordingly. The optimization is
+     * sensitive to the source content and is not a win in all cases. For
+     * example, if there are many transitions between the alpha states, the
+     * performance will almost certainly be worse. However, for many common
+     * cases the performance is equivalent to or better than the standard
+     * case, where we do not inspect the src alpha.
+     */
+#if SK_A32_SHIFT == 24
+    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
+    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
+#else
+    S32A_Opaque_BlitRow32_neon,             // S32A_Opaque,
+#endif
     S32A_Blend_BlitRow32_arm    // S32A_Blend
 };