Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(333)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 18173012: ARM Skia NEON patches - 12 - S32_Blend (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm.h" 8 #include "SkBlitRow_opts_arm.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 679 matching lines...) Expand 10 before | Expand all | Expand 10 after
690 return; 690 return;
691 } 691 }
692 692
693 /* Neon version of S32_Blend_BlitRow32() 693 /* Neon version of S32_Blend_BlitRow32()
694 * portable version is in src/core/SkBlitRow_D32.cpp 694 * portable version is in src/core/SkBlitRow_D32.cpp
695 */ 695 */
696 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 696 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
697 const SkPMColor* SK_RESTRICT src, 697 const SkPMColor* SK_RESTRICT src,
698 int count, U8CPU alpha) { 698 int count, U8CPU alpha) {
699 SkASSERT(alpha <= 255); 699 SkASSERT(alpha <= 255);
700
700 if (count > 0) { 701 if (count > 0) {
701 uint16_t src_scale = SkAlpha255To256(alpha); 702 uint16_t src_scale = SkAlpha255To256(alpha);
702 uint16_t dst_scale = 256 - src_scale; 703 uint16_t dst_scale = 256 - src_scale;
703 704
704 /* run them N at a time through the NEON unit */ 705 while (count >= 2) {
705 /* note that each 1 is 4 bytes, each treated exactly the same, 706 uint8x8_t vsrc, vdst, vres;
706 * so we can work under that guise. We *do* know that the src&dst 707 uint16x8_t vsrc_wide, vdst_wide;
707 * will be 32-bit aligned quantities, so we can specify that on
708 * the load/store ops and do a neon 'reinterpret' to get us to
709 * byte-sized (pun intended) pieces that we widen/multiply/shift
710 * we're limited at 128 bits in the wide ops, which is 8x16bits
711 * or a pair of 32 bit src/dsts.
712 */
713 /* we *could* manually unroll this loop so that we load 128 bits
714 * (as a pair of 64s) from each of src and dst, processing them
715 * in pieces. This might give us a little better management of
716 * the memory latency, but my initial attempts here did not
717 * produce an instruction stream that looked all that nice.
718 */
719 #define UNROLL 2
720 while (count >= UNROLL) {
721 uint8x8_t src_raw, dst_raw, dst_final;
722 uint16x8_t src_wide, dst_wide;
723 708
724 /* get 64 bits of src, widen it, multiply by src_scale */ 709 //__builtin_prefetch(src+32);
djsollen 2013/07/15 12:52:22 can you document in the code what you put in your
kevin.petit.not.used.account 2013/07/15 13:55:59 I'm not sure I understand. There's already a comme
725 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 710 //__builtin_prefetch(dst+32);
726 src_wide = vmovl_u8(src_raw);
727 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
728 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
729 711
730 /* ditto with dst */ 712 /* The above commented prefetches are a big win for count
731 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 713 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
732 dst_wide = vmovl_u8(dst_raw); 714 * They also hurt a little (<5%) on an A15
715 */
733 716
734 /* combine add with dst multiply into mul-accumulate */ 717 // Load
735 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); 718 vsrc = vreinterpret_u8_u32(vld1_u32(src));
719 vdst = vreinterpret_u8_u32(vld1_u32(dst));
736 720
737 dst_final = vshrn_n_u16(dst_wide, 8); 721 // Process src
738 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 722 vsrc_wide = vmovl_u8(vsrc);
723 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
739 724
740 src += UNROLL; 725 // Process dst
741 dst += UNROLL; 726 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
742 count -= UNROLL;
743 }
744 /* RBE: well, i don't like how gcc manages src/dst across the above
745 * loop it's constantly calculating src+bias, dst+bias and it only
746 * adjusts the real ones when we leave the loop. Not sure why
747 * it's "hoisting down" (hoisting implies above in my lexicon ;))
748 * the adjustments to src/dst/count, but it does...
749 * (might be SSA-style internal logic...
750 */
751 727
752 #if UNROLL == 2 728 // Combine
753 if (count == 1) { 729 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
754 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
755 }
756 #else
757 if (count > 0) {
758 do {
759 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scal e);
760 src += 1;
761 dst += 1;
762 } while (--count > 0);
763 }
764 #endif
765 730
766 #undef UNROLL 731 // Store
732 vst1_u32(dst, vreinterpret_u32_u8(vres));
733
734 src += 2;
735 dst += 2;
736 count -= 2;
737 }
738
739 if (count == 1) {
740 uint8x8_t vsrc, vdst, vres;
741 uint16x8_t vsrc_wide, vdst_wide;
742
743 // Load
744 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vs rc), 0));
745 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vd st), 0));
746
747 // Process
748 vsrc_wide = vmovl_u8(vsrc);
749 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
750 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
751 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
752
753 // Store
754 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
755 }
767 } 756 }
768 } 757 }
769 758
770 /////////////////////////////////////////////////////////////////////////////// 759 ///////////////////////////////////////////////////////////////////////////////
771 760
772 #undef DEBUG_OPAQUE_DITHER 761 #undef DEBUG_OPAQUE_DITHER
773 762
774 #if defined(DEBUG_OPAQUE_DITHER) 763 #if defined(DEBUG_OPAQUE_DITHER)
775 static void showme8(char *str, void *p, int len) 764 static void showme8(char *str, void *p, int len)
776 { 765 {
(...skipping 512 matching lines...) Expand 10 before | Expand all | Expand 10 after
1289 * case where we do not inspect the src alpha. 1278 * case where we do not inspect the src alpha.
1290 */ 1279 */
1291 #if SK_A32_SHIFT == 24 1280 #if SK_A32_SHIFT == 24
1292 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1281 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1282 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1294 #else 1283 #else
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1284 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1296 #endif 1285 #endif
1297 S32A_Blend_BlitRow32_arm // S32A_Blend 1286 S32A_Blend_BlitRow32_arm // S32A_Blend
1298 }; 1287 };
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698