bench/MemcpyBench.cpp - Issue 290533002: Add Memcpy32 bench.

Side by Side Diff: bench/MemcpyBench.cpp

Issue 290533002: Add Memcpy32 bench. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: alpha Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2014 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #include "SkBenchmark.h"

	9 #include "SkRandom.h"

	10 #include "SkTemplates.h"

	11

	12 template <typename Memcpy32>

	13 class Memcpy32Bench : public SkBenchmark {

	14 public:

	15 explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)

	16 : fCount(count)

	17 , fMemcpy32(memcpy32)

	18 , fName(SkStringPrintf("%s_%d", name, count)) {}

	19

	20 virtual const char* onGetName() SK_OVERRIDE {

	21 return fName.c_str();

	22 }

	23

	24 virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {

	25 return backend == kNonRendering_Backend;

	26 }

	27

	28 virtual void onPreDraw() SK_OVERRIDE {

	29 fDst.reset(fCount);

	30 fSrc.reset(fCount);

	31

	32 SkRandom rand;

	33 for (int i = 0; i < fCount; i++) {

	34 fSrc[i] = rand.nextU();

	35 }

	36 }

	37

	38 virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {

	39 for (int i = 0; i < loops; i++) {

	40 fMemcpy32(fDst, fSrc, fCount);

	41 }

	42 }

	43

	44 private:

	45 SkAutoTMalloc<uint32_t> fDst, fSrc;

	46

	47 int fCount;

	48 Memcpy32 fMemcpy32;

	49 const SkString fName;

	50 };

	51

	52 template <typename Memcpy32>

	53 static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {

	54 return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);

	55 }

	56 #define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32 ); )

	57

	58

	// Baseline: hand the whole copy to libc's memcpy and let its authors pick the strategy.
	// Copies count uint32_t elements from src to dst.
	static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
	    const size_t byteCount = sizeof(uint32_t) * count;
	    memcpy(dst, src, byteCount);
	}

	// Registers memcpy32_memcpy at element counts 10, 100, 1000, 10000 and 100000.
	63 BENCH(memcpy32_memcpy, 10)

	64 BENCH(memcpy32_memcpy, 100)

	65 BENCH(memcpy32_memcpy, 1000)

	66 BENCH(memcpy32_memcpy, 10000)

	67 BENCH(memcpy32_memcpy, 100000)

	68

	// Let the compiler's autovectorizer do what it thinks is best.
	//
	// Copies count uint32_t elements from src to dst with a plain one-element-per-iteration
	// loop; a count <= 0 copies nothing.
	static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
	    while (count-- > 0) {
	        // Dereference both sides: copy the element, then advance each pointer.
	        *dst++ = *src++;
	    }
	}

	// Registers memcpy32_autovectorize at element counts 10, 100, 1000, 10000 and 100000.
	75 BENCH(memcpy32_autovectorize, 10)

	76 BENCH(memcpy32_autovectorize, 100)

	77 BENCH(memcpy32_autovectorize, 1000)

	78 BENCH(memcpy32_autovectorize, 10000)

	79 BENCH(memcpy32_autovectorize, 100000)

	80

	81 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

	82

	// Align dst to 16 bytes, then use aligned stores.  src isn't aligned, so use unaligned loads.
	//
	// Copies count uint32_t elements from src to dst; a count <= 0 copies nothing.
	static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
	    if (count >= 16) {
	        // Peel single elements until dst sits on a 16-byte boundary.  This assumes dst is
	        // already 4-byte aligned (otherwise stepping by 4 bytes never reaches the
	        // boundary); under that assumption at most 3 elements are peeled, so count is
	        // still >= 13 afterwards.
	        while (uintptr_t(dst) & 0xF) {
	            *dst++ = *src++;
	            count--;
	        }

	        __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
	        const __m128i* src128 = reinterpret_cast<const __m128i*>(src);

	        // Main loop: 16 elements (four 128-bit vectors) per iteration, with all four
	        // unaligned loads issued before the four aligned stores.
	        while (count >= 16) {
	            __m128i a = _mm_loadu_si128(src128++);
	            __m128i b = _mm_loadu_si128(src128++);
	            __m128i c = _mm_loadu_si128(src128++);
	            __m128i d = _mm_loadu_si128(src128++);

	            _mm_store_si128(dst128++, a);
	            _mm_store_si128(dst128++, b);
	            _mm_store_si128(dst128++, c);
	            _mm_store_si128(dst128++, d);

	            count -= 16;
	        }

	        // Carry the advanced positions back to the scalar pointers for the tail.
	        dst = reinterpret_cast<uint32_t*>(dst128);
	        src = reinterpret_cast<const uint32_t*>(src128);
	    }

	    // Tail (or the entire copy when count started below 16): one element at a time.
	    while (count-- > 0) {
	        *dst++ = *src++;
	    }
	}

	// Registers memcpy32_sse2_align at element counts 10, 100, 1000, 10000 and 100000.
	115 BENCH(memcpy32_sse2_align, 10)

	116 BENCH(memcpy32_sse2_align, 100)

	117 BENCH(memcpy32_sse2_align, 1000)

	118 BENCH(memcpy32_sse2_align, 10000)

	119 BENCH(memcpy32_sse2_align, 100000)

	120

	// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned
	// loads for src.
	//
	// Copies count uint32_t elements from src to dst; a count <= 0 copies nothing.
	static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
	    __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
	    const __m128i* src128 = reinterpret_cast<const __m128i*>(src);

	    // Main loop: 16 elements (four 128-bit vectors) per iteration, with all four loads
	    // issued before the four stores.
	    while (count >= 16) {
	        __m128i a = _mm_loadu_si128(src128++);
	        __m128i b = _mm_loadu_si128(src128++);
	        __m128i c = _mm_loadu_si128(src128++);
	        __m128i d = _mm_loadu_si128(src128++);

	        _mm_storeu_si128(dst128++, a);
	        _mm_storeu_si128(dst128++, b);
	        _mm_storeu_si128(dst128++, c);
	        _mm_storeu_si128(dst128++, d);

	        count -= 16;
	    }

	    // Carry the advanced positions back to the scalar pointers, then finish the
	    // remaining (fewer than 16) elements one at a time.
	    dst = reinterpret_cast<uint32_t*>(dst128);
	    src = reinterpret_cast<const uint32_t*>(src128);
	    while (count-- > 0) {
	        *dst++ = *src++;
	    }
	}

	// Registers memcpy32_sse2_unalign at element counts 10, 100, 1000, 10000 and 100000.
	145 BENCH(memcpy32_sse2_unalign, 10)

	146 BENCH(memcpy32_sse2_unalign, 100)

	147 BENCH(memcpy32_sse2_unalign, 1000)

	148 BENCH(memcpy32_sse2_unalign, 10000)

	149 BENCH(memcpy32_sse2_unalign, 100000)

	150

	151 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

	152

	153 #undef BENCH

	154

OLD	NEW

« no previous file with comments | « no previous file | gyp/bench.gypi » ('j') | no next file with comments »