| Index: bench/IndexBench.cpp
|
| diff --git a/bench/IndexBench.cpp b/bench/IndexBench.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1d76ac82e28e62242407d3dfea325d425b3491dc
|
| --- /dev/null
|
| +++ b/bench/IndexBench.cpp
|
| @@ -0,0 +1,411 @@
|
| +/*
|
| + * Copyright 2016 Google Inc.
|
| + *
|
| + * Use of this source code is governed by a BSD-style license that can be
|
| + * found in the LICENSE file.
|
| + */
|
| +
|
| +#include "Benchmark.h"
|
| +#include "SkColor.h"
|
| +
|
| +static void swizzle_index_to_n32(
|
| + void* dst, const uint8_t* src, int width,
|
| + int bpp, int deltaSrc, int offset, const SkPMColor ctable[]) {
|
| +
|
| +#if defined(SK_ARM_HAS_NEON)
|
| + src += offset;
|
| + const SkPMColor* table = ctable;
|
| +
|
| + while (width >= 16) {
|
| + ctable = table;
|
| + asm volatile (
|
| + "movi v13.8b, #0xe0 \t\n"
|
| +
|
| + "ldr q14, [%[src]], #16 \t\n"
|
| + "mov v15.d[0], v14.d[1] \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]], #64 \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "ld4 {v4.16b-v7.16b}, [%[ctable]], #64 \t\n"
|
| + "ld4 {v8.16b-v11.16b}, [%[ctable]] \t\n"
|
| + "mov v12.16b, v5.16b \t\n"
|
| + "mov v5.16b, v8.16b \t\n"
|
| + "mov v8.16b, v12.16b \t\n"
|
| + "mov v12.16b, v7.16b \t\n"
|
| + "mov v7.16b, v10.16b \t\n"
|
| + "mov v10.16b, v12.16b \t\n"
|
| +
|
| + "tbx v0.8b, {v4.16b, v5.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v4.16b, v5.16b}, v15.8b \t\n"
|
| + "mov v0.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v1.8b, {v8.16b, v9.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v8.16b, v9.16b}, v15.8b \t\n"
|
| + "mov v1.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v2.8b, {v6.16b, v7.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v6.16b, v7.16b}, v15.8b \t\n"
|
| + "mov v2.d[1], v12.d[0] \t\n"
|
| +
|
| + "tbx v3.8b, {v10.16b, v11.16b}, v14.8b \t\n"
|
| + "tbx v12.8b, {v10.16b, v11.16b}, v15.8b \t\n"
|
| + "mov v3.d[1], v12.d[0] \t\n"
|
| +
|
| + "add v14.8b, v14.8b, v13.8b \t\n"
|
| + "add v15.8b, v15.8b, v13.8b \t\n"
|
| +
|
| + "st4 {v0.16b-v3.16b}, [%[dst]], #64 \t\n"
|
| + : [dst] "+&r" (dst), [src] "+&r" (src), [ctable] "+&r" (ctable)
|
| + :
|
| + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
| + );
|
| + width -= 16;
|
| + }
|
| +
|
| +#elif 0
|
| + uint32_t* dst = (uint32_t*) dstRow;
|
| + src += offset;
|
| +
|
| + while (dstWidth >= 16) {
|
| + // Table registers
|
| + uint8%[src]6x4_t t0, t1;
|
| + uint8x8x4_t tr, tg, tb, ta;
|
| +
|
| + // Indices into table
|
| + uint8x16_t indices = vld1q_u8(src);
|
| +
|
| + // Pixel output registers
|
| + uint8x16x4_t rgba;
|
| + rgba.val[0] = vdupq_n_u8(0);
|
| + rgba.val[1] = vdupq_n_u8(0);
|
| + rgba.val[2] = vdupq_n_u8(0);
|
| + rgba.val[3] = vdupq_n_u8(0);
|
| +
|
| + const uint32_t* table = ctable;
|
| + const int numColors = 256;
|
| + const int numColorsPerLoop = 32;
|
| + for (int j = 0; j < numColors / numColorsPerLoop; j++) {
|
| + // Load a separate color table for each of r, g, b, a
|
| + t0 = vld4q_u8((const uint8_t*) (table + 0)); // rgba
|
| + t1 = vld4q_u8((const uint8_t*) (table + 16)); // RGBA
|
| + SkTSwap(t0.val[1], t1.val[0]); // rRba, gGBA
|
| + SkTSwap(t0.val[3], t1.val[2]); // rRbB, gGaA
|
| + tr = *(((uint8x8x4_t*) &t0) + 0); // rR
|
| + tb = *(((uint8x8x4_t*) &t0) + 1); // bB
|
| + tg = *(((uint8x8x4_t*) &t1) + 0); // gG
|
| + ta = *(((uint8x8x4_t*) &t1) + 1); // aA
|
| +
|
| + // Use VTBL, then OR the results together.
|
| + rgba.val[0] = vorrq_u8(rgba.val[0],
|
| + vcombine_u8(vtbl4_u8(tr, *(((uint8x8_t*) &indices) + 0)),
|
| + vtbl4_u8(tr, *(((uint8x8_t*) &indices) + 1))));
|
| + rgba.val[1] = vorrq_u8(rgba.val[1],
|
| + vcombine_u8(vtbl4_u8(tg, *(((uint8x8_t*) &indices) + 0)),
|
| + vtbl4_u8(tg, *(((uint8x8_t*) &indices) + 1))));
|
| + rgba.val[2] = vorrq_u8(rgba.val[2],
|
| + vcombine_u8(vtbl4_u8(tb, *(((uint8x8_t*) &indices) + 0)),
|
| + vtbl4_u8(tb, *(((uint8x8_t*) &indices) + 1))));
|
| + rgba.val[3] = vorrq_u8(rgba.val[3],
|
| + vcombine_u8(vtbl4_u8(ta, *(((uint8x8_t*) &indices) + 0)),
|
| + vtbl4_u8(ta, *(((uint8x8_t*) &indices) + 1))));
|
| +
|
| + // Move the next set of indices into the range of the color table. Indices
|
| + // that are currently in range should go out of range due to underflow.
|
| + indices = vsubq_u8(indices, vdupq_n_u8(32));
|
| + table += numColorsPerLoop;
|
| + }
|
| +
|
| + // Store output pixel values.
|
| + vst4q_u8((uint8_t*) dst, rgba);
|
| +
|
| + src += 16;
|
| + dst += 16;
|
| + dstWidth -= 16;
|
| + }
|
| +#else
|
| + src += offset;
|
| + SkPMColor* dst = (SkPMColor*)dstRow;
|
| + for (int x = 0; x < dstWidth; x++) {
|
| + SkPMColor c = ctable[*src];
|
| + dst[x] = c;
|
| + src += deltaSrc;
|
| + }
|
| +#endif
|
| +}
|
| +
|
| +class IndexBench : public Benchmark {
|
| +public:
|
| + IndexBench() {}
|
| +
|
| + bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
|
| + const char* onGetName() override { return "IndexYay"; }
|
| +
|
| + void onDelayedSetup() override {
|
| + fColorTable.reset(256);
|
| + uint32_t* ptr = fColorTable.get();
|
| + for (int i = 0; i < 256; i++) {
|
| + ptr[i] = (i * 37 - 52) * 49;
|
| + }
|
| +
|
| + fSrc.reset(1024);
|
| + uint8_t* p = fSrc.get();
|
| + for (int i = 0; i < 1024; i++) {
|
| + p[i] = i % 256;
|
| + }
|
| +
|
| + fDst.reset(1024);
|
| + }
|
| +
|
| + void onDraw(int loops, SkCanvas*) override {
|
| + while (loops --> 0) {
|
| + swizzle_index_to_n32(fDst, fSrc, 1024, 1, 1, 0, fColorTable);
|
| + }
|
| + }
|
| +private:
|
| + SkAutoTMalloc<uint32_t> fDst;
|
| + SkAutoTMalloc<uint8_t> fSrc;
|
| + SkAutoTMalloc<uint32_t> fColorTable;
|
| +};
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +
|
// Register the benchmark with the harness.
DEF_BENCH(return new IndexBench());
|
|
|