Index: src/opts/SkXfermode_opts_arm_neon.cpp
diff --git a/src/opts/SkXfermode_opts_arm_neon.cpp b/src/opts/SkXfermode_opts_arm_neon.cpp
index 70e92af66bc548f9f42c0ecc70b8f8513794d096..88c179d9e827601fd72002d4dbb84167e4c84ecb 100644
--- a/src/opts/SkXfermode_opts_arm_neon.cpp
+++ b/src/opts/SkXfermode_opts_arm_neon.cpp
@@ -748,8 +748,9 @@ SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
     fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
 }
 
-void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
-                                     int count, const SkAlpha aa[]) const {
+void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst,
+                                     const SkPMColor* SK_RESTRICT src, int count,
+                                     const SkAlpha* SK_RESTRICT aa) const {
     SkASSERT(dst && src && count >= 0);
 
     SkXfermodeProc proc = this->getProc();
@@ -758,13 +759,16 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
     if (NULL == aa) {
         // Unrolled NEON code
+        // We'd like to just do this (modulo a few casts):
+        //    vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst)));
+        //    src += 8;
+        //    dst += 8;
+        // but that tends to generate miserable code. Here are a bunch of faster
+        // workarounds for different architectures and compilers.
         while (count >= 8) {
-            uint8x8x4_t vsrc, vdst, vres;
-#ifdef SK_CPU_ARM64
-            vsrc = vld4_u8((uint8_t*)src);
-            vdst = vld4_u8((uint8_t*)dst);
-#else
+#ifdef SK_CPU_ARM32
+            uint8x8x4_t vsrc, vdst, vres;
 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
             asm volatile (
                 "vld4.u8 %h[vsrc], [%[src]]! \t\n"
@@ -797,17 +801,36 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
             vsrc.val[2] = d2; vdst.val[2] = d6;
             vsrc.val[3] = d3; vdst.val[3] = d7;
 #endif
-#endif // #ifdef SK_CPU_ARM64
 
             vres = procSIMD(vsrc, vdst);
 
             vst4_u8((uint8_t*)dst, vres);
 
-            count -= 8;
             dst += 8;
-#ifdef SK_CPU_ARM64
-            src += 8;
-#endif
+
+#else // #ifdef SK_CPU_ARM32
+
+            asm volatile (
+                "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
+                "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n"
+                "blr %[proc] \t\n"
+                "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n"
+                : [src] "+&r" (src), [dst] "+&r" (dst)
+                : [proc] "r" (procSIMD)
+                : "cc", "memory",
+                  /* We don't know what proc is going to clobber so we must
+                   * add everything that is not callee-saved.
+                   */
+                  "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
+                  "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18",
+                  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+                  "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+                  "v27", "v28", "v29", "v30", "v31"
+            );
+
+#endif // #ifdef SK_CPU_ARM32
+
+            count -= 8;
         }
         // Leftovers
         for (int i = 0; i < count; i++) {
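
Note on the comment added at the top of the unrolled loop: written out in full, the "we'd like to just do this" version is a plain intrinsics loop roughly like the sketch below. This is an illustration only, not code from the patch; it assumes the same procSIMD pointer with the uint8x8x4_t (src, dst) -> uint8x8x4_t signature used elsewhere in this file, and uses the same C-style casts as the removed ARM64 lines.

    while (count >= 8) {
        // Load 8 premultiplied 32-bit pixels (32 bytes) from each buffer,
        // de-interleaving the four colour channels into separate 8-lane vectors.
        uint8x8x4_t vsrc = vld4_u8((uint8_t*)src);
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);
        // Blend, then re-interleave the result back into dst.
        vst4_u8((uint8_t*)dst, procSIMD(vsrc, vdst));
        src += 8;
        dst += 8;
        count -= 8;
    }

As the patch's comment says, this straightforward form tends to generate miserable code with the compilers targeted here, which is what motivates the per-architecture workarounds above.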
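For context on the new AArch64 branch: under the AArch64 procedure-call standard (AAPCS64), a uint8x8x4_t is a homogeneous short-vector aggregate, so the first argument to procSIMD arrives in v0-v3, the second in v4-v7, and the result is returned in v0-v3. That is why the asm loads src into v0-v3 and dst into v4-v7, calls the proc with blr, and stores the blended pixels straight from v0-v3. Because the call happens inside the asm, the compiler cannot see what procSIMD touches, so the clobber list names every register that is not callee-saved under AAPCS64 (x0-x18, v0-v7 and v16-v31; only x19-x29, sp and the low halves of v8-v15 are preserved across a call), plus "cc" and "memory" for the condition flags and the stores to dst.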