Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1992)

Side by Side Diff: src/opts/SkXfermode_opts_arm_neon.cpp

Issue 350343002: ARM Skia NEON patches - 41 - arm64: SkXfermode::xfer32 (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Add a comment Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #include "SkXfermode.h" 1 #include "SkXfermode.h"
2 #include "SkXfermode_proccoeff.h" 2 #include "SkXfermode_proccoeff.h"
3 #include "SkColorPriv.h" 3 #include "SkColorPriv.h"
4 4
5 #include <arm_neon.h> 5 #include <arm_neon.h>
6 #include "SkColor_opts_neon.h" 6 #include "SkColor_opts_neon.h"
7 #include "SkXfermode_opts_arm_neon.h" 7 #include "SkXfermode_opts_arm_neon.h"
8 8
9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)
10 10
(...skipping 730 matching lines...) Expand 10 before | Expand all | Expand 10 after
741 741
742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); 742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);
743 743
744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; 744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[];
745 745
746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer) 746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
747 : INHERITED(buffer) { 747 : INHERITED(buffer) {
748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]); 748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
749 } 749 }
750 750
751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], 751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst,
752 int count, const SkAlpha aa[]) const { 752 const SkPMColor* SK_RESTRICT src, int count ,
753 const SkAlpha* SK_RESTRICT aa) const {
753 SkASSERT(dst && src && count >= 0); 754 SkASSERT(dst && src && count >= 0);
754 755
755 SkXfermodeProc proc = this->getProc(); 756 SkXfermodeProc proc = this->getProc();
756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD ); 757 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD );
757 SkASSERT(procSIMD != NULL); 758 SkASSERT(procSIMD != NULL);
758 759
759 if (NULL == aa) { 760 if (NULL == aa) {
760 // Unrolled NEON code 761 // Unrolled NEON code
762 // We'd like to just do this (modulo a few casts):
763 // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst)));
764 // src += 8;
765 // dst += 8;
766 // but that tends to generate miserable code. Here are a bunch of faster
767 // workarounds for different architectures and compilers.
761 while (count >= 8) { 768 while (count >= 8) {
769
770 #ifdef SK_CPU_ARM32
762 uint8x8x4_t vsrc, vdst, vres; 771 uint8x8x4_t vsrc, vdst, vres;
763
764 #ifdef SK_CPU_ARM64
765 vsrc = vld4_u8((uint8_t*)src);
766 vdst = vld4_u8((uint8_t*)dst);
767 #else
768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 772 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
769 asm volatile ( 773 asm volatile (
770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" 774 "vld4.u8 %h[vsrc], [%[src]]! \t\n"
771 "vld4.u8 %h[vdst], [%[dst]] \t\n" 775 "vld4.u8 %h[vdst], [%[dst]] \t\n"
772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) 776 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)
773 : [dst] "r" (dst) 777 : [dst] "r" (dst)
774 : 778 :
775 ); 779 );
776 #else 780 #else
777 register uint8x8_t d0 asm("d0"); 781 register uint8x8_t d0 asm("d0");
(...skipping 12 matching lines...) Expand all
790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), 794 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),
791 [src] "+&r" (src) 795 [src] "+&r" (src)
792 : [dst] "r" (dst) 796 : [dst] "r" (dst)
793 : 797 :
794 ); 798 );
795 vsrc.val[0] = d0; vdst.val[0] = d4; 799 vsrc.val[0] = d0; vdst.val[0] = d4;
796 vsrc.val[1] = d1; vdst.val[1] = d5; 800 vsrc.val[1] = d1; vdst.val[1] = d5;
797 vsrc.val[2] = d2; vdst.val[2] = d6; 801 vsrc.val[2] = d2; vdst.val[2] = d6;
798 vsrc.val[3] = d3; vdst.val[3] = d7; 802 vsrc.val[3] = d3; vdst.val[3] = d7;
799 #endif 803 #endif
800 #endif // #ifdef SK_CPU_ARM64
801 804
802 vres = procSIMD(vsrc, vdst); 805 vres = procSIMD(vsrc, vdst);
803 806
804 vst4_u8((uint8_t*)dst, vres); 807 vst4_u8((uint8_t*)dst, vres);
805 808
809 dst += 8;
810
811 #else // #ifdef SK_CPU_ARM32
812
813 asm volatile (
814 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
815 "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n"
816 "blr %[proc] \t\n"
817 "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n"
818 : [src] "+&r" (src), [dst] "+&r" (dst)
819 : [proc] "r" (procSIMD)
820 : "cc", "memory",
821 /* We don't know what proc is going to clobber so we must
822 * add everything that is not callee-saved.
823 */
824 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
825 "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18",
826 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
827 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
828 "v27", "v28", "v29", "v30", "v31"
829 );
830
831 #endif // #ifdef SK_CPU_ARM32
832
806 count -= 8; 833 count -= 8;
807 dst += 8;
808 #ifdef SK_CPU_ARM64
809 src += 8;
810 #endif
811 } 834 }
812 // Leftovers 835 // Leftovers
813 for (int i = 0; i < count; i++) { 836 for (int i = 0; i < count; i++) {
814 dst[i] = proc(src[i], dst[i]); 837 dst[i] = proc(src[i], dst[i]);
815 } 838 }
816 } else { 839 } else {
817 for (int i = count - 1; i >= 0; --i) { 840 for (int i = count - 1; i >= 0; --i) {
818 unsigned a = aa[i]; 841 unsigned a = aa[i];
819 if (0 != a) { 842 if (0 != a) {
820 SkPMColor dstC = dst[i]; 843 SkPMColor dstC = dst[i];
(...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after
996 1019
997 if (procSIMD != NULL) { 1020 if (procSIMD != NULL) {
998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); 1021 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));
999 } 1022 }
1000 return NULL; 1023 return NULL;
1001 } 1024 }
1002 1025
1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { 1026 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {
1004 return gNEONXfermodeProcs1[mode]; 1027 return gNEONXfermodeProcs1[mode];
1005 } 1028 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698