src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 158973002: ARM Skia NEON patches - 12 - S32_Blend

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 158973002: ARM Skia NEON patches - 12 - S32_Blend (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Rebase Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 758 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
769 return;	769 return;

770 }	770 }

771	771

772 /* Neon version of S32_Blend_BlitRow32()	772 /* Neon version of S32_Blend_BlitRow32()

773 * portable version is in src/core/SkBlitRow_D32.cpp	773 * portable version is in src/core/SkBlitRow_D32.cpp

774 */	774 */

775 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,	775 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

776 const SkPMColor* SK_RESTRICT src,	776 const SkPMColor* SK_RESTRICT src,

777 int count, U8CPU alpha) {	777 int count, U8CPU alpha) {

778 SkASSERT(alpha <= 255);	778 SkASSERT(alpha <= 255);

779 if (count > 0) {

780 uint16_t src_scale = SkAlpha255To256(alpha);

781 uint16_t dst_scale = 256 - src_scale;

782	779

783 /* run them N at a time through the NEON unit */	780 if (count <= 0) {

784 /* note that each 1 is 4 bytes, each treated exactly the same,	781 return;

785 * so we can work under that guise. We do know that the src&dst	782 }

786 * will be 32-bit aligned quantities, so we can specify that on

787 * the load/store ops and do a neon 'reinterpret' to get us to

788 * byte-sized (pun intended) pieces that we widen/multiply/shift

789 * we're limited at 128 bits in the wide ops, which is 8x16bits

790 * or a pair of 32 bit src/dsts.

791 */

792 /* we could manually unroll this loop so that we load 128 bits

793 * (as a pair of 64s) from each of src and dst, processing them

794 * in pieces. This might give us a little better management of

795 * the memory latency, but my initial attempts here did not

796 * produce an instruction stream that looked all that nice.

797 */

798 #define UNROLL 2

799 while (count >= UNROLL) {

800 uint8x8_t src_raw, dst_raw, dst_final;

801 uint16x8_t src_wide, dst_wide;

802	783

803 /* get 64 bits of src, widen it, multiply by src_scale */	784 uint16_t src_scale = SkAlpha255To256(alpha);

804 src_raw = vreinterpret_u8_u32(vld1_u32(src));	785 uint16_t dst_scale = 256 - src_scale;

805 src_wide = vmovl_u8(src_raw);

806 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */

807 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

808	786

809 /* ditto with dst */	787 while (count >= 2) {

810 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));	788 uint8x8_t vsrc, vdst, vres;

811 dst_wide = vmovl_u8(dst_raw);	789 uint16x8_t vsrc_wide, vdst_wide;

812	790

813 /* combine add with dst multiply into mul-accumulate */	791 /* These commented prefetches are a big win for count

814 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));	792 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.

	793 * They also hurt a little (<5%) on an A15

	794 */

	795 //__builtin_prefetch(src+32);

	796 //__builtin_prefetch(dst+32);

815	797

816 dst_final = vshrn_n_u16(dst_wide, 8);	798 // Load

817 vst1_u32(dst, vreinterpret_u32_u8(dst_final));	799 vsrc = vreinterpret_u8_u32(vld1_u32(src));

	800 vdst = vreinterpret_u8_u32(vld1_u32(dst));

818	801

819 src += UNROLL;	802 // Process src

820 dst += UNROLL;	803 vsrc_wide = vmovl_u8(vsrc);

821 count -= UNROLL;	804 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

	805

	806 // Process dst

	807 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

	808

	809 // Combine

	810 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	811

	812 // Store

	813 vst1_u32(dst, vreinterpret_u32_u8(vres));

	814

	815 src += 2;

	816 dst += 2;

	817 count -= 2;

822 }	818 }

823 /* RBE: well, i don't like how gcc manages src/dst across the above

824 * loop it's constantly calculating src+bias, dst+bias and it only

825 * adjusts the real ones when we leave the loop. Not sure why

826 * it's "hoisting down" (hoisting implies above in my lexicon ;))

827 * the adjustments to src/dst/count, but it does...

828 * (might be SSA-style internal logic...

829 */

830	819

831 #if UNROLL == 2

832 if (count == 1) {	820 if (count == 1) {

833 dst = SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(*dst, dst_scale);	821 uint8x8_t vsrc, vdst, vres;

834 }	822 uint16x8_t vsrc_wide, vdst_wide;

835 #else

836 if (count > 0) {

837 do {

838 dst = SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(*dst, dst_scal e);

839 src += 1;

840 dst += 1;

841 } while (--count > 0);

842 }

843 #endif

844	823

845 #undef UNROLL	824 // Load

	825 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));

	826 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

	827

	828 // Process

	829 vsrc_wide = vmovl_u8(vsrc);

	830 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

	831 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

	832 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	833

	834 // Store

	835 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);

846 }	836 }

847 }	837 }

848	838

849 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,	839 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

850 const SkPMColor* SK_RESTRICT src,	840 const SkPMColor* SK_RESTRICT src,

851 int count, U8CPU alpha) {	841 int count, U8CPU alpha) {

852	842

853 SkASSERT(255 >= alpha);	843 SkASSERT(255 >= alpha);

854	844

855 if (count <= 0) {	845 if (count <= 0) {

(...skipping 588 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1444 * case where we do not inspect the src alpha.	1434 * case where we do not inspect the src alpha.

1445 */	1435 */

1446 #if SK_A32_SHIFT == 24	1436 #if SK_A32_SHIFT == 24

1447 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1448 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1449 #else	1439 #else

1450 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1451 #endif	1441 #endif

1452 S32A_Blend_BlitRow32_neon // S32A_Blend	1442 S32A_Blend_BlitRow32_neon // S32A_Blend

1453 };	1443 };

OLD	NEW

« expectations/gm/ignored-tests.txt ('K') | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »