Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(54)

Side by Side Diff: bench/MemcpyBench.cpp

Issue 290533002: Add Memcpy32 bench. (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: alpha Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | gyp/bench.gypi » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * Copyright 2014 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBenchmark.h"
9 #include "SkRandom.h"
10 #include "SkTemplates.h"
11
12 template <typename Memcpy32>
13 class Memcpy32Bench : public SkBenchmark {
14 public:
15 explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
16 : fCount(count)
17 , fMemcpy32(memcpy32)
18 , fName(SkStringPrintf("%s_%d", name, count)) {}
19
20 virtual const char* onGetName() SK_OVERRIDE {
21 return fName.c_str();
22 }
23
24 virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
25 return backend == kNonRendering_Backend;
26 }
27
28 virtual void onPreDraw() SK_OVERRIDE {
29 fDst.reset(fCount);
30 fSrc.reset(fCount);
31
32 SkRandom rand;
33 for (int i = 0; i < fCount; i++) {
34 fSrc[i] = rand.nextU();
35 }
36 }
37
38 virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
39 for (int i = 0; i < loops; i++) {
40 fMemcpy32(fDst, fSrc, fCount);
41 }
42 }
43
44 private:
45 SkAutoTMalloc<uint32_t> fDst, fSrc;
46
47 int fCount;
48 Memcpy32 fMemcpy32;
49 const SkString fName;
50 };
51
52 template <typename Memcpy32>
53 static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
54 return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
55 }
56 #define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32 ); )
57
58
// Let the libc developers do what they think is best.
//
// Copies `count` uint32_t words from src to dst with a single memcpy() call.
// A non-positive count copies nothing, which matches the loop-based routines
// in this file; without the guard a negative count would be converted to an
// enormous size_t byte length.
static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
    if (count <= 0) {
        return;
    }
    memcpy(dst, src, sizeof(uint32_t) * static_cast<size_t>(count));
}
// Instantiate the benchmark for memcpy32_memcpy with count = 10, 100, 1000,
// 10000 and 100000, where count is the number of uint32_t words per copy.
63 BENCH(memcpy32_memcpy, 10)
64 BENCH(memcpy32_memcpy, 100)
65 BENCH(memcpy32_memcpy, 1000)
66 BENCH(memcpy32_memcpy, 10000)
67 BENCH(memcpy32_memcpy, 100000)
68
// Let the compiler's autovectorizer do what it thinks is best.
//
// Plain word-at-a-time copy of `count` uint32_t values from src to dst.
// A count of zero or less copies nothing.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}
// Instantiate the benchmark for memcpy32_autovectorize with count = 10, 100,
// 1000, 10000 and 100000, where count is the number of uint32_t words per copy.
75 BENCH(memcpy32_autovectorize, 10)
76 BENCH(memcpy32_autovectorize, 100)
77 BENCH(memcpy32_autovectorize, 1000)
78 BENCH(memcpy32_autovectorize, 10000)
79 BENCH(memcpy32_autovectorize, 100000)
80
81 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
82
// Align dst to 16 bytes, then use aligned stores.  src isn't aligned, so use unaligned loads.
//
// When count >= 16: single words are copied until dst sits on a 16-byte
// boundary, then 16 words (four 128-bit vectors) are moved per iteration.
// Whatever remains (or everything, when count < 16) is copied one word at a time.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    if (count >= 16) {
        // Scalar prologue: advance until the low four address bits of dst are clear.
        for (; (uintptr_t(dst) & 0xF) != 0; --count) {
            *dst++ = *src++;
        }

        __m128i* dstVec = reinterpret_cast<__m128i*>(dst);
        const __m128i* srcVec = reinterpret_cast<const __m128i*>(src);
        for (; count >= 16; count -= 16) {
            // Four unaligned loads first, then four aligned stores.
            const __m128i v0 = _mm_loadu_si128(srcVec + 0);
            const __m128i v1 = _mm_loadu_si128(srcVec + 1);
            const __m128i v2 = _mm_loadu_si128(srcVec + 2);
            const __m128i v3 = _mm_loadu_si128(srcVec + 3);
            srcVec += 4;

            _mm_store_si128(dstVec + 0, v0);
            _mm_store_si128(dstVec + 1, v1);
            _mm_store_si128(dstVec + 2, v2);
            _mm_store_si128(dstVec + 3, v3);
            dstVec += 4;
        }

        dst = reinterpret_cast<uint32_t*>(dstVec);
        src = reinterpret_cast<const uint32_t*>(srcVec);
    }

    // Scalar tail for the remaining words.
    for (; count > 0; --count) {
        *dst++ = *src++;
    }
}
// Instantiate the benchmark for memcpy32_sse2_align with count = 10, 100,
// 1000, 10000 and 100000, where count is the number of uint32_t words per copy.
115 BENCH(memcpy32_sse2_align, 10)
116 BENCH(memcpy32_sse2_align, 100)
117 BENCH(memcpy32_sse2_align, 1000)
118 BENCH(memcpy32_sse2_align, 10000)
119 BENCH(memcpy32_sse2_align, 100000)
120
// Leave both dst and src unaligned, and so use unaligned stores for dst and
// unaligned loads for src.
//
// Moves 16 words (four 128-bit vectors) per iteration while at least 16 words
// remain, then copies the leftover words one at a time.
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    __m128i* dstVec = reinterpret_cast<__m128i*>(dst);
    const __m128i* srcVec = reinterpret_cast<const __m128i*>(src);
    for (; count >= 16; count -= 16) {
        // Four unaligned loads first, then four unaligned stores.
        const __m128i v0 = _mm_loadu_si128(srcVec + 0);
        const __m128i v1 = _mm_loadu_si128(srcVec + 1);
        const __m128i v2 = _mm_loadu_si128(srcVec + 2);
        const __m128i v3 = _mm_loadu_si128(srcVec + 3);
        srcVec += 4;

        _mm_storeu_si128(dstVec + 0, v0);
        _mm_storeu_si128(dstVec + 1, v1);
        _mm_storeu_si128(dstVec + 2, v2);
        _mm_storeu_si128(dstVec + 3, v3);
        dstVec += 4;
    }

    // Scalar tail for the remaining words.
    dst = reinterpret_cast<uint32_t*>(dstVec);
    src = reinterpret_cast<const uint32_t*>(srcVec);
    for (; count > 0; --count) {
        *dst++ = *src++;
    }
}
// Instantiate the benchmark for memcpy32_sse2_unalign with count = 10, 100,
// 1000, 10000 and 100000, where count is the number of uint32_t words per copy.
145 BENCH(memcpy32_sse2_unalign, 10)
146 BENCH(memcpy32_sse2_unalign, 100)
147 BENCH(memcpy32_sse2_unalign, 1000)
148 BENCH(memcpy32_sse2_unalign, 10000)
149 BENCH(memcpy32_sse2_unalign, 100000)
150
151 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
152
153 #undef BENCH
154
OLDNEW
« no previous file with comments | « no previous file | gyp/bench.gypi » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698