OLD | NEW |
1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
4 | 4 |
5 #include <arm_neon.h> | 5 #include <arm_neon.h> |
6 #include "SkColor_opts_neon.h" | 6 #include "SkColor_opts_neon.h" |
7 #include "SkXfermode_opts_arm_neon.h" | 7 #include "SkXfermode_opts_arm_neon.h" |
8 | 8 |
9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
10 | 10 |
(...skipping 730 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
741 | 741 |
742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); | 742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); |
743 | 743 |
744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; | 744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; |
745 | 745 |
746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer) | 746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer) |
747 : INHERITED(buffer) { | 747 : INHERITED(buffer) { |
748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]); | 748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]); |
749 } | 749 } |
750 | 750 |
751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], | 751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst, |
752 int count, const SkAlpha aa[]) const { | 752 const SkPMColor* SK_RESTRICT src, int count
, |
| 753 const SkAlpha* SK_RESTRICT aa) const { |
753 SkASSERT(dst && src && count >= 0); | 754 SkASSERT(dst && src && count >= 0); |
754 | 755 |
755 SkXfermodeProc proc = this->getProc(); | 756 SkXfermodeProc proc = this->getProc(); |
756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); | 757 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); |
757 SkASSERT(procSIMD != NULL); | 758 SkASSERT(procSIMD != NULL); |
758 | 759 |
759 if (NULL == aa) { | 760 if (NULL == aa) { |
760 // Unrolled NEON code | 761 // Unrolled NEON code |
| 762 // We'd like to just do this (modulo a few casts): |
| 763 // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst))); |
| 764 // src += 8; |
| 765 // dst += 8; |
| 766 // but that tends to generate miserable code. Here are a bunch of faster |
| 767 // workarounds for different architectures and compilers. |
761 while (count >= 8) { | 768 while (count >= 8) { |
| 769 |
| 770 #ifdef SK_CPU_ARM32 |
762 uint8x8x4_t vsrc, vdst, vres; | 771 uint8x8x4_t vsrc, vdst, vres; |
763 | |
764 #ifdef SK_CPU_ARM64 | |
765 vsrc = vld4_u8((uint8_t*)src); | |
766 vdst = vld4_u8((uint8_t*)dst); | |
767 #else | |
768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 772 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
769 asm volatile ( | 773 asm volatile ( |
770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 774 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
771 "vld4.u8 %h[vdst], [%[dst]] \t\n" | 775 "vld4.u8 %h[vdst], [%[dst]] \t\n" |
772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | 776 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) |
773 : [dst] "r" (dst) | 777 : [dst] "r" (dst) |
774 : | 778 : |
775 ); | 779 ); |
776 #else | 780 #else |
777 register uint8x8_t d0 asm("d0"); | 781 register uint8x8_t d0 asm("d0"); |
(...skipping 12 matching lines...) Expand all Loading... |
790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | 794 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), |
791 [src] "+&r" (src) | 795 [src] "+&r" (src) |
792 : [dst] "r" (dst) | 796 : [dst] "r" (dst) |
793 : | 797 : |
794 ); | 798 ); |
795 vsrc.val[0] = d0; vdst.val[0] = d4; | 799 vsrc.val[0] = d0; vdst.val[0] = d4; |
796 vsrc.val[1] = d1; vdst.val[1] = d5; | 800 vsrc.val[1] = d1; vdst.val[1] = d5; |
797 vsrc.val[2] = d2; vdst.val[2] = d6; | 801 vsrc.val[2] = d2; vdst.val[2] = d6; |
798 vsrc.val[3] = d3; vdst.val[3] = d7; | 802 vsrc.val[3] = d3; vdst.val[3] = d7; |
799 #endif | 803 #endif |
800 #endif // #ifdef SK_CPU_ARM64 | |
801 | 804 |
802 vres = procSIMD(vsrc, vdst); | 805 vres = procSIMD(vsrc, vdst); |
803 | 806 |
804 vst4_u8((uint8_t*)dst, vres); | 807 vst4_u8((uint8_t*)dst, vres); |
805 | 808 |
| 809 dst += 8; |
| 810 |
| 811 #else // #ifdef SK_CPU_ARM32 |
| 812 |
| 813 asm volatile ( |
| 814 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" |
| 815 "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n" |
| 816 "blr %[proc] \t\n" |
| 817 "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n" |
| 818 : [src] "+&r" (src), [dst] "+&r" (dst) |
| 819 : [proc] "r" (procSIMD) |
| 820 : "cc", "memory", |
| 821 /* We don't know what proc is going to clobber so we must |
| 822 * add everything that is not callee-saved. |
| 823 */ |
| 824 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", |
| 825 "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", |
| 826 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", |
| 827 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", |
| 828 "v27", "v28", "v29", "v30", "v31" |
| 829 ); |
| 830 |
| 831 #endif // #ifdef SK_CPU_ARM32 |
| 832 |
806 count -= 8; | 833 count -= 8; |
807 dst += 8; | |
808 #ifdef SK_CPU_ARM64 | |
809 src += 8; | |
810 #endif | |
811 } | 834 } |
812 // Leftovers | 835 // Leftovers |
813 for (int i = 0; i < count; i++) { | 836 for (int i = 0; i < count; i++) { |
814 dst[i] = proc(src[i], dst[i]); | 837 dst[i] = proc(src[i], dst[i]); |
815 } | 838 } |
816 } else { | 839 } else { |
817 for (int i = count - 1; i >= 0; --i) { | 840 for (int i = count - 1; i >= 0; --i) { |
818 unsigned a = aa[i]; | 841 unsigned a = aa[i]; |
819 if (0 != a) { | 842 if (0 != a) { |
820 SkPMColor dstC = dst[i]; | 843 SkPMColor dstC = dst[i]; |
(...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
996 | 1019 |
997 if (procSIMD != NULL) { | 1020 if (procSIMD != NULL) { |
998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); | 1021 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); |
999 } | 1022 } |
1000 return NULL; | 1023 return NULL; |
1001 } | 1024 } |
1002 | 1025 |
1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | 1026 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { |
1004 return gNEONXfermodeProcs1[mode]; | 1027 return gNEONXfermodeProcs1[mode]; |
1005 } | 1028 } |
OLD | NEW |