| OLD | NEW |
| 1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
| 2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
| 3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
| 4 | 4 |
| 5 #include <arm_neon.h> | 5 #include <arm_neon.h> |
| 6 #include "SkColor_opts_neon.h" | 6 #include "SkColor_opts_neon.h" |
| 7 #include "SkXfermode_opts_arm_neon.h" | 7 #include "SkXfermode_opts_arm_neon.h" |
| 8 | 8 |
| 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
| 10 | 10 |
| (...skipping 730 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 741 | 741 |
| 742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); | 742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); |
| 743 | 743 |
| 744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; | 744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; |
| 745 | 745 |
| 746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer) | 746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer) |
| 747 : INHERITED(buffer) { | 747 : INHERITED(buffer) { |
| 748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]); | 748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]); |
| 749 } | 749 } |
| 750 | 750 |
| 751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], | 751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst, |
| 752 int count, const SkAlpha aa[]) const { | 752 const SkPMColor* SK_RESTRICT src, int count
, |
| 753 const SkAlpha* SK_RESTRICT aa) const { |
| 753 SkASSERT(dst && src && count >= 0); | 754 SkASSERT(dst && src && count >= 0); |
| 754 | 755 |
| 755 SkXfermodeProc proc = this->getProc(); | 756 SkXfermodeProc proc = this->getProc(); |
| 756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); | 757 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); |
| 757 SkASSERT(procSIMD != NULL); | 758 SkASSERT(procSIMD != NULL); |
| 758 | 759 |
| 759 if (NULL == aa) { | 760 if (NULL == aa) { |
| 760 // Unrolled NEON code | 761 // Unrolled NEON code |
| 762 // We'd like to just do this (modulo a few casts): |
| 763 // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst))); |
| 764 // src += 8; |
| 765 // dst += 8; |
| 766 // but that tends to generate miserable code. Here are a bunch of faster |
| 767 // workarounds for different architectures and compilers. |
| 761 while (count >= 8) { | 768 while (count >= 8) { |
| 769 |
| 770 #ifdef SK_CPU_ARM32 |
| 762 uint8x8x4_t vsrc, vdst, vres; | 771 uint8x8x4_t vsrc, vdst, vres; |
| 763 | |
| 764 #ifdef SK_CPU_ARM64 | |
| 765 vsrc = vld4_u8((uint8_t*)src); | |
| 766 vdst = vld4_u8((uint8_t*)dst); | |
| 767 #else | |
| 768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 772 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 769 asm volatile ( | 773 asm volatile ( |
| 770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 774 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
| 771 "vld4.u8 %h[vdst], [%[dst]] \t\n" | 775 "vld4.u8 %h[vdst], [%[dst]] \t\n" |
| 772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | 776 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) |
| 773 : [dst] "r" (dst) | 777 : [dst] "r" (dst) |
| 774 : | 778 : |
| 775 ); | 779 ); |
| 776 #else | 780 #else |
| 777 register uint8x8_t d0 asm("d0"); | 781 register uint8x8_t d0 asm("d0"); |
| (...skipping 12 matching lines...) Expand all Loading... |
| 790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | 794 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), |
| 791 [src] "+&r" (src) | 795 [src] "+&r" (src) |
| 792 : [dst] "r" (dst) | 796 : [dst] "r" (dst) |
| 793 : | 797 : |
| 794 ); | 798 ); |
| 795 vsrc.val[0] = d0; vdst.val[0] = d4; | 799 vsrc.val[0] = d0; vdst.val[0] = d4; |
| 796 vsrc.val[1] = d1; vdst.val[1] = d5; | 800 vsrc.val[1] = d1; vdst.val[1] = d5; |
| 797 vsrc.val[2] = d2; vdst.val[2] = d6; | 801 vsrc.val[2] = d2; vdst.val[2] = d6; |
| 798 vsrc.val[3] = d3; vdst.val[3] = d7; | 802 vsrc.val[3] = d3; vdst.val[3] = d7; |
| 799 #endif | 803 #endif |
| 800 #endif // #ifdef SK_CPU_ARM64 | |
| 801 | 804 |
| 802 vres = procSIMD(vsrc, vdst); | 805 vres = procSIMD(vsrc, vdst); |
| 803 | 806 |
| 804 vst4_u8((uint8_t*)dst, vres); | 807 vst4_u8((uint8_t*)dst, vres); |
| 805 | 808 |
| 809 dst += 8; |
| 810 |
| 811 #else // #ifdef SK_CPU_ARM32 |
| 812 |
| 813 asm volatile ( |
| 814 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" |
| 815 "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n" |
| 816 "blr %[proc] \t\n" |
| 817 "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n" |
| 818 : [src] "+&r" (src), [dst] "+&r" (dst) |
| 819 : [proc] "r" (procSIMD) |
| 820 : "cc", "memory", |
| 821 /* We don't know what proc is going to clobber so we must |
| 822 * add everything that is not callee-saved. |
| 823 */ |
| 824 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", |
| 825 "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", |
| 826 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", |
| 827 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", |
| 828 "v27", "v28", "v29", "v30", "v31" |
| 829 ); |
| 830 |
| 831 #endif // #ifdef SK_CPU_ARM32 |
| 832 |
| 806 count -= 8; | 833 count -= 8; |
| 807 dst += 8; | |
| 808 #ifdef SK_CPU_ARM64 | |
| 809 src += 8; | |
| 810 #endif | |
| 811 } | 834 } |
| 812 // Leftovers | 835 // Leftovers |
| 813 for (int i = 0; i < count; i++) { | 836 for (int i = 0; i < count; i++) { |
| 814 dst[i] = proc(src[i], dst[i]); | 837 dst[i] = proc(src[i], dst[i]); |
| 815 } | 838 } |
| 816 } else { | 839 } else { |
| 817 for (int i = count - 1; i >= 0; --i) { | 840 for (int i = count - 1; i >= 0; --i) { |
| 818 unsigned a = aa[i]; | 841 unsigned a = aa[i]; |
| 819 if (0 != a) { | 842 if (0 != a) { |
| 820 SkPMColor dstC = dst[i]; | 843 SkPMColor dstC = dst[i]; |
| (...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 996 | 1019 |
| 997 if (procSIMD != NULL) { | 1020 if (procSIMD != NULL) { |
| 998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); | 1021 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); |
| 999 } | 1022 } |
| 1000 return NULL; | 1023 return NULL; |
| 1001 } | 1024 } |
| 1002 | 1025 |
| 1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | 1026 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { |
| 1004 return gNEONXfermodeProcs1[mode]; | 1027 return gNEONXfermodeProcs1[mode]; |
| 1005 } | 1028 } |
| OLD | NEW |