Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 95bd229486291542ea278a36076e498d6315cf26..3cb5a92f86b1d9310a47a62af9582099c6bc770b 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -871,282 +871,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
     }
 }
-void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-
-    SkASSERT(255 == alpha);
-    if (count > 0) {
-
-
-        uint8x8_t alpha_mask;
-
-        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-        alpha_mask = vld1_u8(alpha_mask_setup);
-
-        /* do the NEON unrolled code */
-#define UNROLL 4
-        while (count >= UNROLL) {
-            uint8x8_t src_raw, dst_raw, dst_final;
-            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-
-            /* The two prefetches below may make the code slighlty
-             * slower for small values of count but are worth having
-             * in the general case.
-             */
-            __builtin_prefetch(src+32);
-            __builtin_prefetch(dst+32);
-
-            /* get the source */
-            src_raw = vreinterpret_u8_u32(vld1_u32(src));
-#if UNROLL > 2
-            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-#endif
-
-            /* get and hold the dst too */
-            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-#if UNROLL > 2
-            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-#endif
-
-            /* 1st and 2nd bits of the unrolling */
-            {
-                uint8x8_t dst_cooked;
-                uint16x8_t dst_wide;
-                uint8x8_t alpha_narrow;
-                uint16x8_t alpha_wide;
-
-                /* get the alphas spread out properly */
-                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-                /* spread the dest */
-                dst_wide = vmovl_u8(dst_raw);
-
-                /* alpha mul the dest */
-                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-                dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-                /* sum -- ignoring any byte lane overflows */
-                dst_final = vadd_u8(src_raw, dst_cooked);
-            }
-
-#if UNROLL > 2
-            /* the 3rd and 4th bits of our unrolling */
-            {
-                uint8x8_t dst_cooked;
-                uint16x8_t dst_wide;
-                uint8x8_t alpha_narrow;
-                uint16x8_t alpha_wide;
-
-                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-                /* spread the dest */
-                dst_wide = vmovl_u8(dst_raw_2);
-
-                /* alpha mul the dest */
-                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-                dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-                /* sum -- ignoring any byte lane overflows */
-                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-            }
-#endif
-
-            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-#if UNROLL > 2
-            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-#endif
-
-            src += UNROLL;
-            dst += UNROLL;
-            count -= UNROLL;
-        }
-#undef UNROLL
-
-        /* do any residual iterations */
-        while (--count >= 0) {
-            *dst = SkPMSrcOver(*src, *dst);
-            src += 1;
-            dst += 1;
-        }
-    }
-}
-
-void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
-                                          const SkPMColor* SK_RESTRICT src,
-                                          int count, U8CPU alpha) {
-    SkASSERT(255 == alpha);
-
-    if (count <= 0)
-        return;
-
-    /* Use these to check if src is transparent or opaque */
-    const unsigned int ALPHA_OPAQ = 0xFF000000;
-    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
-
-#define UNROLL 4
-    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
-    const SkPMColor* SK_RESTRICT src_temp = src;
-
-    /* set up the NEON variables */
-    uint8x8_t alpha_mask;
-    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-    alpha_mask = vld1_u8(alpha_mask_setup);
-
-    uint8x8_t src_raw, dst_raw, dst_final;
-    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-    uint8x8_t dst_cooked;
-    uint16x8_t dst_wide;
-    uint8x8_t alpha_narrow;
-    uint16x8_t alpha_wide;
-
-    /* choose the first processing type */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    /* fall-thru */
-
-ALPHA_1_TO_254:
-    do {
-
-        /* get the source */
-        src_raw = vreinterpret_u8_u32(vld1_u32(src));
-        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-
-        /* get and hold the dst too */
-        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-
-
-        /* get the alphas spread out properly */
-        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final = vadd_u8(src_raw, dst_cooked);
-
-        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw_2);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-
-        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-
-        src += UNROLL;
-        dst += UNROLL;
-
-        /* if 2 of the next pixels aren't between 1 and 254
-           it might make sense to go to the optimized loops */
-        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
-            break;
-
-    } while(src < src_end);
-
-    if (src >= src_end)
-        goto TAIL;
-
-    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
-        goto ALPHA_255;
-
-    /*fall-thru*/
-
-ALPHA_0:
-
-    /*In this state, we know the current alpha is 0 and
-      we optimize for the next alpha also being zero. */
-    src_temp = src; //so we don't have to increment dst every time
-    do {
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-    } while(src < src_end);
-
-    dst += (src - src_temp);
-
-    /* no longer alpha 0, so determine where to go next. */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    else
-        goto ALPHA_1_TO_254;
-
-ALPHA_255:
-    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
-        dst[0]=src[0];
-        dst[1]=src[1];
-        dst[2]=src[2];
-        dst[3]=src[3];
-        src+=UNROLL;
-        dst+=UNROLL;
-        if(src >= src_end)
-            goto TAIL;
-    }
-
-    //Handle remainder.
-    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
-        }
-    }
-
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    else
-        goto ALPHA_1_TO_254;
-
-TAIL:
-    /* do any residual iterations */
-    src_end += UNROLL + 1; //goto the real end
-    while(src != src_end) {
-        if( *src != 0 ) {
-            if( *src >= ALPHA_OPAQ ) {
-                *dst = *src;
-            }
-            else {
-                *dst = SkPMSrcOver(*src, *dst);
-            }
-        }
-        src++;
-        dst++;
-    }
-
-#undef UNROLL
-    return;
-}
-
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
  */
@@ -1561,21 +1285,7 @@ const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     nullptr, // S32_Opaque,
     S32_Blend_BlitRow32_neon, // S32_Blend,
-    /*
-     * We have two choices for S32A_Opaque procs. The one reads the src alpha
-     * value and attempts to optimize accordingly. The optimization is
-     * sensitive to the source content and is not a win in all cases. For
-     * example, if there are a lot of transitions between the alpha states,
-     * the performance will almost certainly be worse. However, for many
-     * common cases the performance is equivalent or better than the standard
-     * case where we do not inspect the src alpha.
-     */
-#if SK_A32_SHIFT == 24
-    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
-    S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
-#else
-    S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
-#endif
+    nullptr, // Ported to SkOpts
 #ifdef SK_CPU_ARM32
     S32A_Blend_BlitRow32_neon // S32A_Blend
 #else
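
Editor's note on the deleted kernels: both removed routines vectorize the same per-pixel premultiplied src-over blend that their residual loops express with SkPMSrcOver, namely scaling each destination channel by (256 - srcAlpha) and adding the source (the vsubw_u8(vdupq_n_u16(256), alpha) / vshrn_n_u16(..., 8) / vadd_u8 sequence above). The sketch below is a minimal scalar restatement of that math for reference only; the function names are invented for illustration, it assumes alpha lives in the top byte (the SK_A32_SHIFT == 24 case noted in the removed comment), and it is not the SkOpts implementation this change migrates to.

#include <cstdint>

// Hypothetical scalar equivalent of the removed NEON kernels: each 8-bit
// channel becomes src + ((dst * (256 - srcAlpha)) >> 8).
static inline uint32_t srcover_premul(uint32_t src, uint32_t dst) {
    uint32_t scale = 256 - (src >> 24);  // alpha assumed in bits 24-31

    // R and B: multiply both 8-bit lanes at once inside 16-bit fields,
    // then shift back down and re-mask.
    uint32_t rb = (((dst & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;

    // A and G: multiply in place and keep the high byte of each 16-bit
    // lane, which folds the >>8 and the shift back into position into
    // a single mask.
    uint32_t ag = (((dst >> 8) & 0x00FF00FF) * scale) & 0xFF00FF00;

    // Sum; no lane overflow occurs for valid premultiplied input.
    return src + (rb | ag);
}

// Reference loop matching the residual-iteration loops in the deleted code.
void blit_row_srcover(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = srcover_premul(src[i], dst[i]);
    }
}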