OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 679 matching lines...) | |
690 return; | 690 return; |
691 } | 691 } |
692 | 692 |
693 /* Neon version of S32_Blend_BlitRow32() | 693 /* Neon version of S32_Blend_BlitRow32() |
694 * portable version is in src/core/SkBlitRow_D32.cpp | 694 * portable version is in src/core/SkBlitRow_D32.cpp |
695 */ | 695 */ |
696 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 696 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
697 const SkPMColor* SK_RESTRICT src, | 697 const SkPMColor* SK_RESTRICT src, |
698 int count, U8CPU alpha) { | 698 int count, U8CPU alpha) { |
699 SkASSERT(alpha <= 255); | 699 SkASSERT(alpha <= 255); |
700 | |
700 if (count > 0) { | 701 if (count > 0) { |
701 uint16_t src_scale = SkAlpha255To256(alpha); | 702 uint16_t src_scale = SkAlpha255To256(alpha); |
702 uint16_t dst_scale = 256 - src_scale; | 703 uint16_t dst_scale = 256 - src_scale; |
703 | 704 |
704 /* run them N at a time through the NEON unit */ | 705 while (count >= 2) { |
705 /* note that each 1 is 4 bytes, each treated exactly the same, | 706 uint8x8_t vsrc, vdst, vres; |
706 * so we can work under that guise. We *do* know that the src&dst | 707 uint16x8_t vsrc_wide, vdst_wide; |
707 * will be 32-bit aligned quantities, so we can specify that on | |
708 * the load/store ops and do a neon 'reinterpret' to get us to | |
709 * byte-sized (pun intended) pieces that we widen/multiply/shift | |
710 * we're limited at 128 bits in the wide ops, which is 8x16bits | |
711 * or a pair of 32 bit src/dsts. | |
712 */ | |
713 /* we *could* manually unroll this loop so that we load 128 bits | |
714 * (as a pair of 64s) from each of src and dst, processing them | |
715 * in pieces. This might give us a little better management of | |
716 * the memory latency, but my initial attempts here did not | |
717 * produce an instruction stream that looked all that nice. | |
718 */ | |
719 #define UNROLL 2 | |
720 while (count >= UNROLL) { | |
721 uint8x8_t src_raw, dst_raw, dst_final; | |
722 uint16x8_t src_wide, dst_wide; | |
723 | 708 |
724 /* get 64 bits of src, widen it, multiply by src_scale */ | 709 //__builtin_prefetch(src+32); |
djsollen 2013/07/15 12:52:22
can you document in the code what you put in your CL description?
kevin.petit.not.used.account 2013/07/15 13:55:59
I'm not sure I understand. There's already a comment.
| |
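One way to keep the prefetch win without the small-count penalty discussed here would be to gate the prefetches on the remaining count. This is only a sketch under the thresholds quoted in the new code comment below (win above 64 pixels on an A9, ~10% loss at count = 4), not what this CL does; __builtin_prefetch is the GCC/Clang builtin used in the commented-out lines.

    // Sketch: issue the prefetches only for long rows, reusing the same
    // +32 distance as the commented-out lines. The 64-pixel threshold is
    // assumed from the A9 numbers in the code comment, not tuned here.
    if (count > 64) {
        __builtin_prefetch(src + 32);
        __builtin_prefetch(dst + 32);
    }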
725 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | 710 //__builtin_prefetch(dst+32); |
726 src_wide = vmovl_u8(src_raw); | |
727 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ | |
728 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); | |
729 | 711 |
730 /* ditto with dst */ | 712 /* The above commented prefetches are a big win for count |
731 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | 713 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4. |
732 dst_wide = vmovl_u8(dst_raw); | 714 * They also hurt a little (<5%) on an A15 |
715 */ | |
733 | 716 |
734 /* combine add with dst multiply into mul-accumulate */ | 717 // Load |
735 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); | 718 vsrc = vreinterpret_u8_u32(vld1_u32(src)); |
719 vdst = vreinterpret_u8_u32(vld1_u32(dst)); | |
736 | 720 |
737 dst_final = vshrn_n_u16(dst_wide, 8); | 721 // Process src |
738 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | 722 vsrc_wide = vmovl_u8(vsrc); |
723 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | |
739 | 724 |
740 src += UNROLL; | 725 // Process dst |
741 dst += UNROLL; | 726 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
742 count -= UNROLL; | |
743 } | |
744 /* RBE: well, i don't like how gcc manages src/dst across the above | |
745 * loop it's constantly calculating src+bias, dst+bias and it only | |
746 * adjusts the real ones when we leave the loop. Not sure why | |
747 * it's "hoisting down" (hoisting implies above in my lexicon ;)) | |
748 * the adjustments to src/dst/count, but it does... | |
749 * (might be SSA-style internal logic... | |
750 */ | |
751 | 727 |
752 #if UNROLL == 2 | 728 // Combine |
753 if (count == 1) { | 729 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
754 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | |
755 } | |
756 #else | |
757 if (count > 0) { | |
758 do { | |
759 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | |
760 src += 1; | |
761 dst += 1; | |
762 } while (--count > 0); | |
763 } | |
764 #endif | |
765 | 730 |
766 #undef UNROLL | 731 // Store |
732 vst1_u32(dst, vreinterpret_u32_u8(vres)); | |
733 | |
734 src += 2; | |
735 dst += 2; | |
736 count -= 2; | |
737 } | |
738 | |
739 if (count == 1) { | |
740 uint8x8_t vsrc, vdst, vres; | |
741 uint16x8_t vsrc_wide, vdst_wide; | |
742 | |
743 // Load | |
744 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); | |
745 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); | |
746 | |
747 // Process | |
748 vsrc_wide = vmovl_u8(vsrc); | |
749 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | |
750 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | |
751 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | |
752 | |
753 // Store | |
754 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | |
755 } | |
767 } | 756 } |
768 } | 757 } |
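For reference, the per-pixel math is the same in the old UNROLL loop, the new NEON pair/tail code, and the portable path. A minimal scalar sketch using the SkAlpha255To256 and SkAlphaMulQ helpers that already appear above (blend_one itself is a hypothetical name, written out only to show the arithmetic):

    // One blended pixel: scale all four channels of src by alpha/256 and
    // of dst by (256 - alpha)/256, then sum. SkAlphaMulQ multiplies each
    // 8-bit channel of a packed 32-bit color by scale and shifts down by 8.
    static inline SkPMColor blend_one(SkPMColor src, SkPMColor dst, U8CPU alpha) {
        uint16_t src_scale = SkAlpha255To256(alpha);  // 0..256
        uint16_t dst_scale = 256 - src_scale;
        return SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale);
    }

The NEON loop computes exactly this for two pixels at a time: vmovl_u8 plus vmulq_u16 does the src half in 16-bit lanes, vmull_u8 fuses the widen and multiply for the dst half, and the pair of vshrn_n_u16(..., 8) narrows is the final >> 8.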
769 | 758 |
770 /////////////////////////////////////////////////////////////////////////////// | 759 /////////////////////////////////////////////////////////////////////////////// |
771 | 760 |
772 #undef DEBUG_OPAQUE_DITHER | 761 #undef DEBUG_OPAQUE_DITHER |
773 | 762 |
774 #if defined(DEBUG_OPAQUE_DITHER) | 763 #if defined(DEBUG_OPAQUE_DITHER) |
775 static void showme8(char *str, void *p, int len) | 764 static void showme8(char *str, void *p, int len) |
776 { | 765 { |
(...skipping 512 matching lines...) | |
1289 * case where we do not inspect the src alpha. | 1278 * case where we do not inspect the src alpha. |
1290 */ | 1279 */ |
1291 #if SK_A32_SHIFT == 24 | 1280 #if SK_A32_SHIFT == 24 |
1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1281 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1282 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1294 #else | 1283 #else |
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1284 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1296 #endif | 1285 #endif |
1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1286 S32A_Blend_BlitRow32_arm // S32A_Blend |
1298 }; | 1287 }; |
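The SK_A32_SHIFT == 24 guard above matters because the src-alpha proc inspects each pixel's alpha by byte position. A small illustrative check, assuming only what the guard states (SkGetPackedA32 is Skia's existing accessor; is_opaque is a hypothetical helper):

    // Valid only when alpha occupies bits 24-31 of a packed SkPMColor,
    // i.e. SK_A32_SHIFT == 24, so the top byte of the word is the alpha.
    static inline bool is_opaque(SkPMColor c) {
        return SkGetPackedA32(c) == 0xFF;   // (c >> 24) & 0xFF == 0xFF here
    }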