| Index: bench/MemcpyBench.cpp
|
| diff --git a/bench/MemcpyBench.cpp b/bench/MemcpyBench.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..452bf6fdc2ccb84d22ca4134730ed526ae42ba85
|
| --- /dev/null
|
| +++ b/bench/MemcpyBench.cpp
|
| @@ -0,0 +1,154 @@
|
| +/*
|
| + * Copyright 2014 Google Inc.
|
| + *
|
| + * Use of this source code is governed by a BSD-style license that can be
|
| + * found in the LICENSE file.
|
| + */
|
| +
|
| +#include "SkBenchmark.h"
|
| +#include "SkRandom.h"
|
| +#include "SkTemplates.h"
|
| +
|
| +template <typename Memcpy32>
|
| +class Memcpy32Bench : public SkBenchmark {
|
| +public:
|
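| +    // The benchmark name encodes both the copy strategy and the word count,
| +    // e.g. "memcpy32_memcpy_100".
|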
| + explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
|
| + : fCount(count)
|
| + , fMemcpy32(memcpy32)
|
| + , fName(SkStringPrintf("%s_%d", name, count)) {}
|
| +
|
| + virtual const char* onGetName() SK_OVERRIDE {
|
| + return fName.c_str();
|
| + }
|
| +
|
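| +    // memcpy is pure CPU work, so this benchmark needs no rendering backend.
|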
| + virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
|
| + return backend == kNonRendering_Backend;
|
| + }
|
| +
|
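| +    // One-time setup outside the timed loop: allocate both buffers and fill src
| +    // with random words.
|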
| + virtual void onPreDraw() SK_OVERRIDE {
|
| + fDst.reset(fCount);
|
| + fSrc.reset(fCount);
|
| +
|
| + SkRandom rand;
|
| + for (int i = 0; i < fCount; i++) {
|
| + fSrc[i] = rand.nextU();
|
| + }
|
| + }
|
| +
|
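| +    // The timed section; the harness chooses `loops`.
|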
| + virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
|
| + for (int i = 0; i < loops; i++) {
|
| + fMemcpy32(fDst, fSrc, fCount);
|
| + }
|
| + }
|
| +
|
| +private:
|
| + SkAutoTMalloc<uint32_t> fDst, fSrc;
|
| +
|
| + int fCount;
|
| + Memcpy32 fMemcpy32;
|
| + const SkString fName;
|
| +};
|
| +
|
| +template <typename Memcpy32>
|
| +static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
|
| + return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
|
| +}
|
| +#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )
|
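| +// For example, BENCH(memcpy32_memcpy, 10) registers a benchmark named "memcpy32_memcpy_10".
|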
| +
|
| +
|
| +// Let the libc developers do what they think is best.
|
| +static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
|
| + memcpy(dst, src, sizeof(uint32_t) * count);
|
| +}
|
| +BENCH(memcpy32_memcpy, 10)
|
| +BENCH(memcpy32_memcpy, 100)
|
| +BENCH(memcpy32_memcpy, 1000)
|
| +BENCH(memcpy32_memcpy, 10000)
|
| +BENCH(memcpy32_memcpy, 100000)
|
| +
|
| +// Let the compiler's autovectorizer do what it thinks is best.
|
| +static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
|
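| +    // Note: "count --> 0" is just (count--) > 0, a word-at-a-time copy loop the
| +    // compiler is free to vectorize.
|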
| + while (count --> 0) {
|
| + *dst++ = *src++;
|
| + }
|
| +}
|
| +BENCH(memcpy32_autovectorize, 10)
|
| +BENCH(memcpy32_autovectorize, 100)
|
| +BENCH(memcpy32_autovectorize, 1000)
|
| +BENCH(memcpy32_autovectorize, 10000)
|
| +BENCH(memcpy32_autovectorize, 100000)
|
| +
|
| +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
| +#include <emmintrin.h>  // __m128i, _mm_loadu_si128(), _mm_store_si128(), _mm_storeu_si128()
|
| +
|
| +// Align dst to 16 bytes, then use aligned stores. src isn't necessarily aligned, so use unaligned loads.
|
| +static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
|
| + if (count >= 16) {
|
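| +        // dst points at uint32_t, so it is already 4-byte aligned; at most
| +        // three scalar copies are needed to reach 16-byte alignment.
|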
| + while (uintptr_t(dst) & 0xF) {
|
| + *dst++ = *src++;
|
| + count--;
|
| + }
|
| +
|
| + __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
|
| + const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
|
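| +        // Copy 16 words (64 bytes) per iteration: four unaligned loads, four aligned stores.
|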
| + while (count >= 16) {
|
| + __m128i a = _mm_loadu_si128(src128++);
|
| + __m128i b = _mm_loadu_si128(src128++);
|
| + __m128i c = _mm_loadu_si128(src128++);
|
| + __m128i d = _mm_loadu_si128(src128++);
|
| +
|
| + _mm_store_si128(dst128++, a);
|
| + _mm_store_si128(dst128++, b);
|
| + _mm_store_si128(dst128++, c);
|
| + _mm_store_si128(dst128++, d);
|
| +
|
| + count -= 16;
|
| + }
|
| +
|
| + dst = reinterpret_cast<uint32_t*>(dst128);
|
| + src = reinterpret_cast<const uint32_t*>(src128);
|
| + }
|
| +
|
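| +    // Copy whatever remains (everything, when count < 16) one word at a time.
|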
| + while (count --> 0) {
|
| + *dst++ = *src++;
|
| + }
|
| +}
|
| +BENCH(memcpy32_sse2_align, 10)
|
| +BENCH(memcpy32_sse2_align, 100)
|
| +BENCH(memcpy32_sse2_align, 1000)
|
| +BENCH(memcpy32_sse2_align, 10000)
|
| +BENCH(memcpy32_sse2_align, 100000)
|
| +
|
| +// Leave both dst and src unaligned, using unaligned stores for dst and unaligned loads for src.
|
| +static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
|
| + __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
|
| + const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
|
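| +    // Same 64-byte inner loop as above, but with unaligned stores as well as unaligned loads.
|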
| + while (count >= 16) {
|
| + __m128i a = _mm_loadu_si128(src128++);
|
| + __m128i b = _mm_loadu_si128(src128++);
|
| + __m128i c = _mm_loadu_si128(src128++);
|
| + __m128i d = _mm_loadu_si128(src128++);
|
| +
|
| + _mm_storeu_si128(dst128++, a);
|
| + _mm_storeu_si128(dst128++, b);
|
| + _mm_storeu_si128(dst128++, c);
|
| + _mm_storeu_si128(dst128++, d);
|
| +
|
| + count -= 16;
|
| + }
|
| +
|
| + dst = reinterpret_cast<uint32_t*>(dst128);
|
| + src = reinterpret_cast<const uint32_t*>(src128);
|
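| +    // Scalar tail for the last 0-15 words.
|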
| + while (count --> 0) {
|
| + *dst++ = *src++;
|
| + }
|
| +}
|
| +BENCH(memcpy32_sse2_unalign, 10)
|
| +BENCH(memcpy32_sse2_unalign, 100)
|
| +BENCH(memcpy32_sse2_unalign, 1000)
|
| +BENCH(memcpy32_sse2_unalign, 10000)
|
| +BENCH(memcpy32_sse2_unalign, 100000)
|
| +
|
| +#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
| +
|
| +#undef BENCH
|
| +
|