Index: bench/IndexBench.cpp |
diff --git a/bench/IndexBench.cpp b/bench/IndexBench.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..1d76ac82e28e62242407d3dfea325d425b3491dc |
--- /dev/null |
+++ b/bench/IndexBench.cpp |
@@ -0,0 +1,411 @@ |
+/* |
+ * Copyright 2016 Google Inc. |
+ * |
+ * Use of this source code is governed by a BSD-style license that can be |
+ * found in the LICENSE file. |
+ */ |
+ |
+#include "Benchmark.h" |
+#include "SkColor.h" |
+ |
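+// Expands a row of 8-bit palette indices in 'src' into 32-bit N32 pixels in |
+// 'dst' using the 256-entry color table 'ctable'. 'offset' is the starting |
+// byte offset into 'src' and 'deltaSrc' is the per-pixel source step ('bpp' |
+// is currently unused). The NEON paths below additionally assume deltaSrc == 1. |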
+static void swizzle_index_to_n32( |
+ void* dst, const uint8_t* src, int width, |
+ int bpp, int deltaSrc, int offset, const SkPMColor ctable[]) { |
+ |
+#if defined(SK_ARM_HAS_NEON) && defined(SK_CPU_ARM64)  // the inline assembly is AArch64-only |
+ src += offset; |
+ const SkPMColor* table = ctable; |
+ |
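+ // The assembly processes 16 pixels per loop iteration. Its body is unrolled |
+ // eight times, one pass per 32-entry chunk of the 256-entry color table; |
+ // each pass fills in only the lanes whose index falls in that chunk. Any |
+ // leftover pixels (width % 16) are not handled; the benchmark below always |
+ // uses a width that is a multiple of 16. |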
+ while (width >= 16) { |
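+ // Rewind to the start of the color table; the assembly advances ctable |
+ // through all 256 entries. |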
+ ctable = table; |
+ asm volatile ( |
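+ // v13 = 0xe0 == -32 in every byte lane, used to re-bias the indices |
+ // after each 32-entry pass. |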
+ "movi v13.8b, #0xe0 \t\n" |
+ |
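+ // Load 16 index bytes: v14 holds all 16, v15 gets a copy of the high 8 |
+ // so TBX can consume 8 indices at a time. |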
+ "ldr q14, [%[src]], #16 \t\n" |
+ "mov v15.d[0], v14.d[1] \t\n" |
+ |
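+ // Deinterleave 32 RGBA palette entries and regroup them into 32-entry |
+ // per-channel TBX tables: {v4,v5}=r, {v8,v9}=g, {v6,v7}=b, {v10,v11}=a. |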
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
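+ // TBX only writes lanes whose index is in range for the 32-byte table; |
+ // out-of-range lanes are left untouched and get filled by a later pass. |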
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
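+ // Subtract 32 (add 0xe0) so the next chunk of the table comes into range. |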
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
+ "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n" |
+ "ld4 {v8.16b-v11.16b}, [%[ctable]] \t\n" |
+ "mov v12.16b, v5.16b \t\n" |
+ "mov v5.16b, v8.16b \t\n" |
+ "mov v8.16b, v12.16b \t\n" |
+ "mov v12.16b, v7.16b \t\n" |
+ "mov v7.16b, v10.16b \t\n" |
+ "mov v10.16b, v12.16b \t\n" |
+ |
+ "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n" |
+ "mov v0.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n" |
+ "mov v1.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n" |
+ "mov v2.d[1], v12.d[0] \t\n" |
+ |
+ "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n" |
+ "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n" |
+ "mov v3.d[1], v12.d[0] \t\n" |
+ |
+ "add v14.8b, v14.8b, v13.8b \t\n" |
+ "add v15.8b, v15.8b, v13.8b \t\n" |
+ |
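+ // Re-interleave r,g,b,a and store 16 N32 pixels, advancing dst by 64 bytes. |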
+ "st4 {v0.16b-v3.16b}, [%[dst]], #64 \t\n" |
+ : [dst] "+&r" (dst), [src] "+&r" (src), [ctable] "+&r" (ctable) |
+ : |
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" |
+ ); |
+ width -= 16; |
+ } |
+ |
+#elif 0 |
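+// Reference version of the same lookup written with NEON intrinsics (VTBL on |
+// 32-entry per-channel tables). It is kept disabled; enabling it would also |
+// require #include <arm_neon.h>. |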
+ uint32_t* dst32 = (uint32_t*) dst; |
+ src += offset; |
+ |
+ while (width >= 16) { |
+ // Table registers |
+ uint8x16x4_t t0, t1; |
+ uint8x8x4_t tr, tg, tb, ta; |
+ |
+ // Indices into table |
+ uint8x16_t indices = vld1q_u8(src); |
+ |
+ // Pixel output registers |
+ uint8x16x4_t rgba; |
+ rgba.val[0] = vdupq_n_u8(0); |
+ rgba.val[1] = vdupq_n_u8(0); |
+ rgba.val[2] = vdupq_n_u8(0); |
+ rgba.val[3] = vdupq_n_u8(0); |
+ |
+ const uint32_t* table = ctable; |
+ const int numColors = 256; |
+ const int numColorsPerLoop = 32; |
+ for (int j = 0; j < numColors / numColorsPerLoop; j++) { |
+ // Load a separate color table for each of r, g, b, a |
+ t0 = vld4q_u8((const uint8_t*) (table + 0)); // rgba |
+ t1 = vld4q_u8((const uint8_t*) (table + 16)); // RGBA |
+ SkTSwap(t0.val[1], t1.val[0]); // rRba, gGBA |
+ SkTSwap(t0.val[3], t1.val[2]); // rRbB, gGaA |
+ tr = *(((uint8x8x4_t*) &t0) + 0); // rR |
+ tb = *(((uint8x8x4_t*) &t0) + 1); // bB |
+ tg = *(((uint8x8x4_t*) &t1) + 0); // gG |
+ ta = *(((uint8x8x4_t*) &t1) + 1); // aA |
+ |
+ // Use VTBL, then OR the results together. |
+ rgba.val[0] = vorrq_u8(rgba.val[0], |
+ vcombine_u8(vtbl4_u8(tr, *(((uint8x8_t*) &indices) + 0)), |
+ vtbl4_u8(tr, *(((uint8x8_t*) &indices) + 1)))); |
+ rgba.val[1] = vorrq_u8(rgba.val[1], |
+ vcombine_u8(vtbl4_u8(tg, *(((uint8x8_t*) &indices) + 0)), |
+ vtbl4_u8(tg, *(((uint8x8_t*) &indices) + 1)))); |
+ rgba.val[2] = vorrq_u8(rgba.val[2], |
+ vcombine_u8(vtbl4_u8(tb, *(((uint8x8_t*) &indices) + 0)), |
+ vtbl4_u8(tb, *(((uint8x8_t*) &indices) + 1)))); |
+ rgba.val[3] = vorrq_u8(rgba.val[3], |
+ vcombine_u8(vtbl4_u8(ta, *(((uint8x8_t*) &indices) + 0)), |
+ vtbl4_u8(ta, *(((uint8x8_t*) &indices) + 1)))); |
+ |
+ // Move the next set of indices into the range of the color table. Indices |
+ // that are currently in range should go out of range due to underflow. |
+ indices = vsubq_u8(indices, vdupq_n_u8(32)); |
+ table += numColorsPerLoop; |
+ } |
+ |
+ // Store output pixel values. |
+ vst4q_u8((uint8_t*) dst32, rgba); |
+ |
+ src += 16; |
+ dst32 += 16; |
+ width -= 16; |
+ } |
+#else |
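+ // Portable fallback: one color-table lookup per pixel. |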
+ src += offset; |
+ SkPMColor* dst32 = (SkPMColor*) dst; |
+ for (int x = 0; x < width; x++) { |
+ SkPMColor c = ctable[*src]; |
+ dst32[x] = c; |
+ src += deltaSrc; |
+ } |
+#endif |
+} |
+ |
+class IndexBench : public Benchmark { |
+public: |
+ IndexBench() {} |
+ |
+ bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } |
+ const char* onGetName() override { return "IndexYay"; } |
+ |
+ void onDelayedSetup() override { |
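+ // Fill the color table with arbitrary but deterministic values, and the |
+ // source row with the byte sequence 0..255 repeated four times. |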
+ fColorTable.reset(256); |
+ uint32_t* ptr = fColorTable.get(); |
+ for (int i = 0; i < 256; i++) { |
+ ptr[i] = (i * 37 - 52) * 49; |
+ } |
+ |
+ fSrc.reset(1024); |
+ uint8_t* p = fSrc.get(); |
+ for (int i = 0; i < 1024; i++) { |
+ p[i] = i % 256; |
+ } |
+ |
+ fDst.reset(1024); |
+ } |
+ |
+ void onDraw(int loops, SkCanvas*) override { |
+ while (loops --> 0) { |
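+ // width = 1024, bpp = 1, deltaSrc = 1, offset = 0. |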
+ swizzle_index_to_n32(fDst, fSrc, 1024, 1, 1, 0, fColorTable); |
+ } |
+ } |
+private: |
+ SkAutoTMalloc<uint32_t> fDst; |
+ SkAutoTMalloc<uint8_t> fSrc; |
+ SkAutoTMalloc<uint32_t> fColorTable; |
+}; |
+ |
+DEF_BENCH(return new IndexBench()); |