src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 13060004: Partial reapply of r5364 minus the non-neon code path.

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 13060004: Partial reapply of r5364 minus the non-neon code path. (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm.h"	8 #include "SkBlitRow_opts_arm.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 499 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
510	510

511 /* do any residual iterations */	511 /* do any residual iterations */

512 while (--count >= 0) {	512 while (--count >= 0) {

513 *dst = SkPMSrcOver(*src, *dst);	513 *dst = SkPMSrcOver(*src, *dst);

514 src += 1;	514 src += 1;

515 dst += 1;	515 dst += 1;

516 }	516 }

517 }	517 }

518 }	518 }

519	519

	520 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,

	521 const SkPMColor* SK_RESTRICT src,

	522 int count, U8CPU alpha) {

	523 SkASSERT(255 == alpha);

	524

	525 if (count <= 0)

	526 return;

	527

	528 /* Use these to check if src is transparent or opaque */

	529 const unsigned int ALPHA_OPAQ = 0xFF000000;

	530 const unsigned int ALPHA_TRANS = 0x00FFFFFF;

	531

	532 #define UNROLL 4

	533 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);

	534 const SkPMColor* SK_RESTRICT src_temp = src;

	535

	536 /* set up the NEON variables */

	537 uint8x8_t alpha_mask;

	538 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};

	539 alpha_mask = vld1_u8(alpha_mask_setup);

	540

	541 uint8x8_t src_raw, dst_raw, dst_final;

	542 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

	543 uint8x8_t dst_cooked;

	544 uint16x8_t dst_wide;

	545 uint8x8_t alpha_narrow;

	546 uint16x8_t alpha_wide;

	547

	548 /* choose the first processing type */

	549 if( src >= src_end)

	550 goto TAIL;

	551 if(*src <= ALPHA_TRANS)

	552 goto ALPHA_0;

	553 if(*src >= ALPHA_OPAQ)

	554 goto ALPHA_255;

	555 /* fall-thru */

	556

	557 ALPHA_1_TO_254:

	558 do {

	559

	560 /* get the source */

	561 src_raw = vreinterpret_u8_u32(vld1_u32(src));

	562 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

	563

	564 /* get and hold the dst too */

	565 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));

	566 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

	567

	568

	569 /* get the alphas spread out properly */

	570 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);

	571 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */

	572 /* we collapsed (255-a)+1 ... */

	573 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

	574

	575 /* spread the dest */

	576 dst_wide = vmovl_u8(dst_raw);

	577

	578 /* alpha mul the dest */

	579 dst_wide = vmulq_u16 (dst_wide, alpha_wide);

	580 dst_cooked = vshrn_n_u16(dst_wide, 8);

	581

	582 /* sum -- ignoring any byte lane overflows */

	583 dst_final = vadd_u8(src_raw, dst_cooked);

	584

	585 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);

	586 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */

	587 /* we collapsed (255-a)+1 ... */

	588 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

	589

	590 /* spread the dest */

	591 dst_wide = vmovl_u8(dst_raw_2);

	592

	593 /* alpha mul the dest */

	594 dst_wide = vmulq_u16 (dst_wide, alpha_wide);

	595 dst_cooked = vshrn_n_u16(dst_wide, 8);

	596

	597 /* sum -- ignoring any byte lane overflows */

	598 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

	599

	600 vst1_u32(dst, vreinterpret_u32_u8(dst_final));

	601 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

	602

	603 src += UNROLL;

	604 dst += UNROLL;

	605

	606 /* if 2 of the next pixels aren't between 1 and 254

	607 it might make sense to go to the optimized loops */

	608 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))

	609 break;

	610

	611 } while(src < src_end);

	612

	613 if (src >= src_end)

	614 goto TAIL;

	615

	616 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)

	617 goto ALPHA_255;

	618

	619 /*fall-thru*/

	620

	621 ALPHA_0:

	622

	623 /*In this state, we know the current alpha is 0 and

	624 we optimize for the next alpha also being zero. */

	625 src_temp = src; //so we don't have to increment dst every time

	626 do {

	627 if(*(++src) > ALPHA_TRANS)

	628 break;

	629 if(*(++src) > ALPHA_TRANS)

	630 break;

	631 if(*(++src) > ALPHA_TRANS)

	632 break;

	633 if(*(++src) > ALPHA_TRANS)

	634 break;

	635 } while(src < src_end);

	636

	637 dst += (src - src_temp);

	638

	639 /* no longer alpha 0, so determine where to go next. */

	640 if( src >= src_end)

	641 goto TAIL;

	642 if(*src >= ALPHA_OPAQ)

	643 goto ALPHA_255;

	644 else

	645 goto ALPHA_1_TO_254;

	646

	647 ALPHA_255:

	648 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {

	649 dst[0]=src[0];

	650 dst[1]=src[1];

	651 dst[2]=src[2];

	652 dst[3]=src[3];

	653 src+=UNROLL;

	654 dst+=UNROLL;

	655 if(src >= src_end)

	656 goto TAIL;

	657 }

	658

	659 //Handle remainder.

	660 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;

	661 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;

	662 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }

	663 }

	664 }

	665

	666 if( src >= src_end)

	667 goto TAIL;

	668 if(*src <= ALPHA_TRANS)

	669 goto ALPHA_0;

	670 else

	671 goto ALPHA_1_TO_254;

	672

	673 TAIL:

	674 /* do any residual iterations */

	675 src_end += UNROLL + 1; //goto the real end

	676 while(src != src_end) {

	677 if( *src != 0 ) {

	678 if( *src >= ALPHA_OPAQ ) {

	679 *dst = *src;

	680 }

	681 else {

	682 *dst = SkPMSrcOver(*src, *dst);

	683 }

	684 }

	685 src++;

	686 dst++;

	687 }

	688

	689 #undef UNROLL

	690 return;

	691 }

520	692

521 /* Neon version of S32_Blend_BlitRow32()	693 /* Neon version of S32_Blend_BlitRow32()

522 * portable version is in src/core/SkBlitRow_D32.cpp	694 * portable version is in src/core/SkBlitRow_D32.cpp

523 */	695 */

524 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,	696 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

525 const SkPMColor* SK_RESTRICT src,	697 const SkPMColor* SK_RESTRICT src,

526 int count, U8CPU alpha) {	698 int count, U8CPU alpha) {

527 SkASSERT(alpha <= 255);	699 SkASSERT(alpha <= 255);

528 if (count > 0) {	700 if (count > 0) {

529 uint16_t src_scale = SkAlpha255To256(alpha);	701 uint16_t src_scale = SkAlpha255To256(alpha);

(...skipping 570 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1100 // dither	1272 // dither

1101 NULL, // S32_D4444_Opaque_Dither,	1273 NULL, // S32_D4444_Opaque_Dither,

1102 NULL, // S32_D4444_Blend_Dither,	1274 NULL, // S32_D4444_Blend_Dither,

1103 NULL, // S32A_D4444_Opaque_Dither,	1275 NULL, // S32A_D4444_Opaque_Dither,

1104 NULL, // S32A_D4444_Blend_Dither	1276 NULL, // S32A_D4444_Blend_Dither

1105 };	1277 };

1106	1278

1107 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {	1279 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {

1108 NULL, // S32_Opaque,	1280 NULL, // S32_Opaque,

1109 S32_Blend_BlitRow32_neon, // S32_Blend,	1281 S32_Blend_BlitRow32_neon, // S32_Blend,

1110 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1282 /*

	1283 * We have two choices for S32A_Opaque procs. The one reads the src alpha

	1284 * value and attempts to optimize accordingly. The optimization is

	1285 * sensitive to the source content and is not a win in all cases. For

	1286 * example, if there are a lot of transitions between the alpha states,

	1287 * the performance will almost certainly be worse. However, for many

	1288 * common cases the performance is equivalent or better than the standard

	1289 * case where we do not inspect the src alpha.

	1290 */

	1291 #if SK_A32_SHIFT == 24

	1292 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

	1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

	1294 #else

	1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

	1296 #endif

1111 S32A_Blend_BlitRow32_arm // S32A_Blend	1297 S32A_Blend_BlitRow32_arm // S32A_Blend

1112 };	1298 };

OLD	NEW

« no previous file with comments | « bench/BitmapBench.cpp ('k') | no next file » | no next file with comments »