Index: bench/MemcpyBench.cpp
diff --git a/bench/MemcpyBench.cpp b/bench/MemcpyBench.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..452bf6fdc2ccb84d22ca4134730ed526ae42ba85
--- /dev/null
+++ b/bench/MemcpyBench.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBenchmark.h"
+#include "SkRandom.h"
+#include "SkTemplates.h"
+#include <string.h>   // memcpy
+
+template <typename Memcpy32>
+class Memcpy32Bench : public SkBenchmark {
+public:
+    explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
+        : fCount(count)
+        , fMemcpy32(memcpy32)
+        , fName(SkStringPrintf("%s_%d", name, count)) {}
+
+    virtual const char* onGetName() SK_OVERRIDE {
+        return fName.c_str();
+    }
+
+    virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
+        return backend == kNonRendering_Backend;
+    }
+
+    virtual void onPreDraw() SK_OVERRIDE {
+        fDst.reset(fCount);
+        fSrc.reset(fCount);
+
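+        // Seed the source buffer with random words; the destination is left as-is.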
+        SkRandom rand;
+        for (int i = 0; i < fCount; i++) {
+            fSrc[i] = rand.nextU();
+        }
+    }
+
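+    // The benchmark harness picks 'loops' so each timed run lasts long enough to measure.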
+    virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
+        for (int i = 0; i < loops; i++) {
+            fMemcpy32(fDst, fSrc, fCount);
+        }
+    }
+
+private:
+    SkAutoTMalloc<uint32_t> fDst, fSrc;
+
+    int fCount;
+    Memcpy32 fMemcpy32;
+    const SkString fName;
+};
+
+template <typename Memcpy32>
+static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
+    return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
+}
+#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )
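+// For example, BENCH(memcpy32_memcpy, 10) registers a benchmark named "memcpy32_memcpy_10".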
+
+
+// Let the libc developers do what they think is best.
+static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
+    memcpy(dst, src, sizeof(uint32_t) * count);
+}
+BENCH(memcpy32_memcpy, 10)
+BENCH(memcpy32_memcpy, 100)
+BENCH(memcpy32_memcpy, 1000)
+BENCH(memcpy32_memcpy, 10000)
+BENCH(memcpy32_memcpy, 100000)
+
+// Let the compiler's autovectorizer do what it thinks is best.
+static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
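+    // Note: (count --> 0) parses as (count--) > 0.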
+    while (count --> 0) {
+        *dst++ = *src++;
+    }
+}
+BENCH(memcpy32_autovectorize, 10)
+BENCH(memcpy32_autovectorize, 100)
+BENCH(memcpy32_autovectorize, 1000)
+BENCH(memcpy32_autovectorize, 10000)
+BENCH(memcpy32_autovectorize, 100000)
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+#include <emmintrin.h>  // SSE2 intrinsics: _mm_loadu_si128, _mm_store_si128, _mm_storeu_si128
+
+// Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads.
+static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
+    if (count >= 16) {
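+        // dst is 4-byte aligned, so at most three word copies reach 16-byte alignment.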
+        while (uintptr_t(dst) & 0xF) {
+            *dst++ = *src++;
+            count--;
+        }
+
+        __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
+        const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
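+        // Copy 64 bytes (16 words) per iteration: four unaligned loads, four aligned stores.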
+        while (count >= 16) {
+            __m128i a = _mm_loadu_si128(src128++);
+            __m128i b = _mm_loadu_si128(src128++);
+            __m128i c = _mm_loadu_si128(src128++);
+            __m128i d = _mm_loadu_si128(src128++);
+
+            _mm_store_si128(dst128++, a);
+            _mm_store_si128(dst128++, b);
+            _mm_store_si128(dst128++, c);
+            _mm_store_si128(dst128++, d);
+
+            count -= 16;
+        }
+
+        dst = reinterpret_cast<uint32_t*>(dst128);
+        src = reinterpret_cast<const uint32_t*>(src128);
+    }
+
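+    // Copy any remaining words one at a time.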
+    while (count --> 0) {
+        *dst++ = *src++;
+    }
+}
+BENCH(memcpy32_sse2_align, 10)
+BENCH(memcpy32_sse2_align, 100)
+BENCH(memcpy32_sse2_align, 1000)
+BENCH(memcpy32_sse2_align, 10000)
+BENCH(memcpy32_sse2_align, 100000)
+
+// Leave both dst and src unaligned: unaligned loads for src, unaligned stores for dst.
+static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
+    __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
+    const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
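+    // Copy 64 bytes (16 words) per iteration, unaligned in both directions.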
+    while (count >= 16) {
+        __m128i a = _mm_loadu_si128(src128++);
+        __m128i b = _mm_loadu_si128(src128++);
+        __m128i c = _mm_loadu_si128(src128++);
+        __m128i d = _mm_loadu_si128(src128++);
+
+        _mm_storeu_si128(dst128++, a);
+        _mm_storeu_si128(dst128++, b);
+        _mm_storeu_si128(dst128++, c);
+        _mm_storeu_si128(dst128++, d);
+
+        count -= 16;
+    }
+
+    dst = reinterpret_cast<uint32_t*>(dst128);
+    src = reinterpret_cast<const uint32_t*>(src128);
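+    // Mop up the tail one word at a time.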
+    while (count --> 0) {
+        *dst++ = *src++;
+    }
+}
+BENCH(memcpy32_sse2_unalign, 10)
+BENCH(memcpy32_sse2_unalign, 100)
+BENCH(memcpy32_sse2_unalign, 1000)
+BENCH(memcpy32_sse2_unalign, 10000)
+BENCH(memcpy32_sse2_unalign, 100000)
+
+#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+#undef BENCH
+