Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 748 matching lines...) | |
| 759 return; | 759 return; |
| 760 } | 760 } |
| 761 | 761 |
| 762 /* Neon version of S32_Blend_BlitRow32() | 762 /* Neon version of S32_Blend_BlitRow32() |
| 763 * portable version is in src/core/SkBlitRow_D32.cpp | 763 * portable version is in src/core/SkBlitRow_D32.cpp |
| 764 */ | 764 */ |
| 765 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 765 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 766 const SkPMColor* SK_RESTRICT src, | 766 const SkPMColor* SK_RESTRICT src, |
| 767 int count, U8CPU alpha) { | 767 int count, U8CPU alpha) { |
| 768 SkASSERT(alpha <= 255); | 768 SkASSERT(alpha <= 255); |
| 769 | |
| 769 if (count > 0) { | 770 if (count > 0) { |

djsollen 2014/02/13 17:30:15: instead of nesting the remainder of this method I […]
kevin.petit 2014/02/13 17:40:19: Done.
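(For context, the early-return shape being suggested would look roughly like the sketch below. This is a minimal illustration, not the committed patch; the signature and assertion come from the diff, and the loop body is elided.)

```cpp
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    // Bail out early so the blend loops below need no extra nesting level.
    if (count <= 0) {
        return;
    }
    // ... NEON blend loops follow at function scope ...
}
```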
| 770 uint16_t src_scale = SkAlpha255To256(alpha); | 771 uint16_t src_scale = SkAlpha255To256(alpha); |
| 771 uint16_t dst_scale = 256 - src_scale; | 772 uint16_t dst_scale = 256 - src_scale; |
| 772 | 773 |
| 773 /* run them N at a time through the NEON unit */ | 774 while (count >= 2) { |
| 774 /* note that each 1 is 4 bytes, each treated exactly the same, | 775 uint8x8_t vsrc, vdst, vres; |
| 775 * so we can work under that guise. We *do* know that the src&dst | 776 uint16x8_t vsrc_wide, vdst_wide; |
| 776 * will be 32-bit aligned quantities, so we can specify that on | |
| 777 * the load/store ops and do a neon 'reinterpret' to get us to | |
| 778 * byte-sized (pun intended) pieces that we widen/multiply/shift | |
| 779 * we're limited at 128 bits in the wide ops, which is 8x16bits | |
| 780 * or a pair of 32 bit src/dsts. | |
| 781 */ | |
| 782 /* we *could* manually unroll this loop so that we load 128 bits | |
| 783 * (as a pair of 64s) from each of src and dst, processing them | |
| 784 * in pieces. This might give us a little better management of | |
| 785 * the memory latency, but my initial attempts here did not | |
| 786 * produce an instruction stream that looked all that nice. | |
| 787 */ | |
| 788 #define UNROLL 2 | |
| 789 while (count >= UNROLL) { | |
| 790 uint8x8_t src_raw, dst_raw, dst_final; | |
| 791 uint16x8_t src_wide, dst_wide; | |
| 792 | 777 |
| 793 /* get 64 bits of src, widen it, multiply by src_scale */ | 778 //__builtin_prefetch(src+32); |
| 794 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | 779 //__builtin_prefetch(dst+32); |
| 795 src_wide = vmovl_u8(src_raw); | |
| 796 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ | |
| 797 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); | |
| 798 | 780 |
| 799 /* ditto with dst */ | 781 /* The above commented prefetches are a big win for count |

djsollen 2014/02/13 17:30:15: Move this comment to be directly above the prefetch […]
kevin.petit 2014/02/13 17:40:19: Done.
| 800 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | 782 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4. |
| 801 dst_wide = vmovl_u8(dst_raw); | 783 * They also hurt a little (<5%) on an A15 |
| 784 */ | |
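(The measurements quoted in that comment suggest one possible refinement that is not part of this patch: gate the prefetches on `count`. A hypothetical sketch, with the threshold taken from the A9 break-even figure above:)

```cpp
// Hypothetical, not in the patch: only prefetch when the run is long
// enough to amortize the cost. The 64 threshold is the break-even point
// quoted for the A9 (Pandaboard) in the comment above.
if (count > 64) {
    __builtin_prefetch(src + 32);
    __builtin_prefetch(dst + 32);
}
```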
| 802 | 785 |
| 803 /* combine add with dst multiply into mul-accumulate */ | 786 // Load |
| 804 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); | 787 vsrc = vreinterpret_u8_u32(vld1_u32(src)); |
| 788 vdst = vreinterpret_u8_u32(vld1_u32(dst)); | |
| 805 | 789 |
| 806 dst_final = vshrn_n_u16(dst_wide, 8); | 790 // Process src |
| 807 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); | 791 vsrc_wide = vmovl_u8(vsrc); |
| 792 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | |
| 808 | 793 |
| 809 src += UNROLL; | 794 // Process dst |
| 810 dst += UNROLL; | 795 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
| 811 count -= UNROLL; | |
| 812 } | |
| 813 /* RBE: well, i don't like how gcc manages src/dst across the above | |
| 814 * loop it's constantly calculating src+bias, dst+bias and it only | |
| 815 * adjusts the real ones when we leave the loop. Not sure why | |
| 816 * it's "hoisting down" (hoisting implies above in my lexicon ;)) | |
| 817 * the adjustments to src/dst/count, but it does... | |
| 818 * (might be SSA-style internal logic... | |
| 819 */ | |
| 820 | 796 |
| 821 #if UNROLL == 2 | 797 // Combine |
| 822 if (count == 1) { | 798 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 823 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | |
| 824 } | |
| 825 #else | |
| 826 if (count > 0) { | |
| 827 do { | |
| 828 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | |
| 829 src += 1; | |
| 830 dst += 1; | |
| 831 } while (--count > 0); | |
| 832 } | |
| 833 #endif | |
| 834 | 799 |
| 835 #undef UNROLL | 800 // Store |
| 801 vst1_u32(dst, vreinterpret_u32_u8(vres)); | |
| 802 | |
| 803 src += 2; | |
| 804 dst += 2; | |
| 805 count -= 2; | |
| 806 } | |
| 807 | |
| 808 if (count == 1) { | |
| 809 uint8x8_t vsrc, vdst, vres; | |
| 810 uint16x8_t vsrc_wide, vdst_wide; | |
| 811 | |
| 812 // Load | |
| 813 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); | |
| 814 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); | |
| 815 | |
| 816 // Process | |
| 817 vsrc_wide = vmovl_u8(vsrc); | |
| 818 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | |
| 819 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | |
| 820 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | |
| 821 | |
| 822 // Store | |
| 823 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | |
| 824 } | |
| 836 } | 825 } |
| 837 } | 826 } |
| 838 | 827 |
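(As a cross-check on the intrinsics, here is a plain-C model of what the new loop computes for one pixel: each 8-bit channel becomes `((s * src_scale) >> 8) + ((d * dst_scale) >> 8)`, which is also what the old code's `SkAlphaMulQ` remainder path produced. A sketch only; the helper name is hypothetical.)

```cpp
#include <stdint.h>

// Hypothetical scalar model of the per-pixel math in the NEON loop above:
// each 8-bit channel is blended as ((s * src_scale) >> 8) + ((d * dst_scale) >> 8),
// with src_scale = SkAlpha255To256(alpha) in 1..256 and dst_scale = 256 - src_scale.
static inline uint32_t blend32_model(uint32_t src, uint32_t dst,
                                     uint16_t src_scale, uint16_t dst_scale) {
    uint32_t result = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t s = (src >> shift) & 0xFF;
        uint32_t d = (dst >> shift) & 0xFF;
        uint32_t c = ((s * src_scale) >> 8) + ((d * dst_scale) >> 8);
        result |= (c & 0xFF) << shift;
    }
    return result;
}
```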
| 839 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 828 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 840 const SkPMColor* SK_RESTRICT src, | 829 const SkPMColor* SK_RESTRICT src, |
| 841 int count, U8CPU alpha) { | 830 int count, U8CPU alpha) { |
| 842 | 831 |
| 843 SkASSERT(255 >= alpha); | 832 SkASSERT(255 >= alpha); |
| 844 | 833 |
| 845 if (count <= 0) { | 834 if (count <= 0) { |
| (...skipping 588 matching lines...) | |
| 1434 * case where we do not inspect the src alpha. | 1423 * case where we do not inspect the src alpha. |
| 1435 */ | 1424 */ |
| 1436 #if SK_A32_SHIFT == 24 | 1425 #if SK_A32_SHIFT == 24 |
| 1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1426 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
| 1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1427 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1439 #else | 1428 #else |
| 1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1429 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1441 #endif | 1430 #endif |
| 1442 S32A_Blend_BlitRow32_neon // S32A_Blend | 1431 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1443 }; | 1432 }; |