Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(5)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 175433002: Revert of ARM Skia NEON patches - 12 - S32_Blend (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 758 matching lines...) Expand 10 before | Expand all | Expand 10 after
769 return; 769 return;
770 } 770 }
771 771
772 /* Neon version of S32_Blend_BlitRow32() 772 /* Neon version of S32_Blend_BlitRow32()
773 * portable version is in src/core/SkBlitRow_D32.cpp 773 * portable version is in src/core/SkBlitRow_D32.cpp
774 */ 774 */
775 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 775 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
776 const SkPMColor* SK_RESTRICT src, 776 const SkPMColor* SK_RESTRICT src,
777 int count, U8CPU alpha) { 777 int count, U8CPU alpha) {
778 SkASSERT(alpha <= 255); 778 SkASSERT(alpha <= 255);
779 if (count > 0) {
780 uint16_t src_scale = SkAlpha255To256(alpha);
781 uint16_t dst_scale = 256 - src_scale;
779 782
780 if (count <= 0) { 783 /* run them N at a time through the NEON unit */
781 return; 784 /* note that each 1 is 4 bytes, each treated exactly the same,
785 * so we can work under that guise. We *do* know that the src&dst
786 * will be 32-bit aligned quantities, so we can specify that on
787 * the load/store ops and do a neon 'reinterpret' to get us to
788 * byte-sized (pun intended) pieces that we widen/multiply/shift
789 * we're limited at 128 bits in the wide ops, which is 8x16bits
790 * or a pair of 32 bit src/dsts.
791 */
792 /* we *could* manually unroll this loop so that we load 128 bits
793 * (as a pair of 64s) from each of src and dst, processing them
794 * in pieces. This might give us a little better management of
795 * the memory latency, but my initial attempts here did not
796 * produce an instruction stream that looked all that nice.
797 */
798 #define UNROLL 2
799 while (count >= UNROLL) {
800 uint8x8_t src_raw, dst_raw, dst_final;
801 uint16x8_t src_wide, dst_wide;
802
803 /* get 64 bits of src, widen it, multiply by src_scale */
804 src_raw = vreinterpret_u8_u32(vld1_u32(src));
805 src_wide = vmovl_u8(src_raw);
806 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
807 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
808
809 /* ditto with dst */
810 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
811 dst_wide = vmovl_u8(dst_raw);
812
813 /* combine add with dst multiply into mul-accumulate */
814 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
815
816 dst_final = vshrn_n_u16(dst_wide, 8);
817 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
818
819 src += UNROLL;
820 dst += UNROLL;
821 count -= UNROLL;
782 } 822 }
823 /* RBE: well, i don't like how gcc manages src/dst across the above
824 * loop it's constantly calculating src+bias, dst+bias and it only
825 * adjusts the real ones when we leave the loop. Not sure why
826 * it's "hoisting down" (hoisting implies above in my lexicon ;))
827 * the adjustments to src/dst/count, but it does...
828 * (might be SSA-style internal logic...
829 */
783 830
784 uint16_t src_scale = SkAlpha255To256(alpha); 831 #if UNROLL == 2
785 uint16_t dst_scale = 256 - src_scale; 832 if (count == 1) {
833 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
834 }
835 #else
836 if (count > 0) {
837 do {
838 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scal e);
839 src += 1;
840 dst += 1;
841 } while (--count > 0);
842 }
843 #endif
786 844
787 while (count >= 2) { 845 #undef UNROLL
788 uint8x8_t vsrc, vdst, vres;
789 uint16x8_t vsrc_wide, vdst_wide;
790
791 /* These commented prefetches are a big win for count
792 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
793 * They also hurt a little (<5%) on an A15
794 */
795 //__builtin_prefetch(src+32);
796 //__builtin_prefetch(dst+32);
797
798 // Load
799 vsrc = vreinterpret_u8_u32(vld1_u32(src));
800 vdst = vreinterpret_u8_u32(vld1_u32(dst));
801
802 // Process src
803 vsrc_wide = vmovl_u8(vsrc);
804 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
805
806 // Process dst
807 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
808
809 // Combine
810 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
811
812 // Store
813 vst1_u32(dst, vreinterpret_u32_u8(vres));
814
815 src += 2;
816 dst += 2;
817 count -= 2;
818 }
819
820 if (count == 1) {
821 uint8x8_t vsrc, vdst, vres;
822 uint16x8_t vsrc_wide, vdst_wide;
823
824 // Load
825 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
826 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
827
828 // Process
829 vsrc_wide = vmovl_u8(vsrc);
830 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
831 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
832 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
833
834 // Store
835 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
836 } 846 }
837 } 847 }
838 848
839 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 849 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
840 const SkPMColor* SK_RESTRICT src, 850 const SkPMColor* SK_RESTRICT src,
841 int count, U8CPU alpha) { 851 int count, U8CPU alpha) {
842 852
843 SkASSERT(255 >= alpha); 853 SkASSERT(255 >= alpha);
844 854
845 if (count <= 0) { 855 if (count <= 0) {
(...skipping 588 matching lines...) Expand 10 before | Expand all | Expand 10 after
1434 * case where we do not inspect the src alpha. 1444 * case where we do not inspect the src alpha.
1435 */ 1445 */
1436 #if SK_A32_SHIFT == 24 1446 #if SK_A32_SHIFT == 24
1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1447 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1448 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1439 #else 1449 #else
1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1450 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1441 #endif 1451 #endif
1442 S32A_Blend_BlitRow32_neon // S32A_Blend 1452 S32A_Blend_BlitRow32_neon // S32A_Blend
1443 }; 1453 };
OLDNEW
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698