OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 Google Inc. | 2 * Copyright 2014 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "Benchmark.h" | 8 #include "Benchmark.h" |
9 #include "SkRandom.h" | 9 #include "SkRandom.h" |
10 #include "SkTemplates.h" | 10 #include "SkTemplates.h" |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Baseline: hand the entire copy to libc's memcpy and trust its tuning.
// count is a number of 32-bit values, not bytes.
static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
    // memcpy takes a byte length, so scale the value count by the element size.
    const size_t byteLength = sizeof(uint32_t) * count;
    memcpy(dst, src, byteLength);
}
64 BENCH(memcpy32_memcpy, 10) | 64 BENCH(memcpy32_memcpy, 10) |
65 BENCH(memcpy32_memcpy, 100) | 65 BENCH(memcpy32_memcpy, 100) |
66 BENCH(memcpy32_memcpy, 1000) | 66 BENCH(memcpy32_memcpy, 1000) |
67 BENCH(memcpy32_memcpy, 10000) | 67 BENCH(memcpy32_memcpy, 10000) |
68 BENCH(memcpy32_memcpy, 100000) | 68 BENCH(memcpy32_memcpy, 100000) |
69 | 69 |
// Plain element-by-element copy of count 32-bit values; any vectorization
// is left entirely to the compiler. A count of zero or less copies nothing.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
76 BENCH(memcpy32_autovectorize, 10) | |
77 BENCH(memcpy32_autovectorize, 100) | |
78 BENCH(memcpy32_autovectorize, 1000) | |
79 BENCH(memcpy32_autovectorize, 10000) | |
80 BENCH(memcpy32_autovectorize, 100000) | |
81 | |
82 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
83 | |
// Copy count 32-bit values using aligned SSE2 stores. dst is first advanced
// to a 16-byte boundary with scalar copies; src may still be unaligned after
// that, so it is always read with unaligned loads. Copies of fewer than 16
// values skip the SSE2 path and are handled entirely by the scalar tail loop.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    if (count >= 16) {
        // Scalar head: copy one value at a time until dst is 16-byte aligned.
        for (; reinterpret_cast<uintptr_t>(dst) & 0xF; --count) {
            *dst++ = *src++;
        }

        // Main loop: each pass moves 16 values (four 128-bit registers).
        // All four loads are issued before any of the stores.
        for (; count >= 16; count -= 16) {
            const __m128i* in  = reinterpret_cast<const __m128i*>(src);
            __m128i*       out = reinterpret_cast<__m128i*>(dst);

            const __m128i a = _mm_loadu_si128(in + 0);
            const __m128i b = _mm_loadu_si128(in + 1);
            const __m128i c = _mm_loadu_si128(in + 2);
            const __m128i d = _mm_loadu_si128(in + 3);

            _mm_store_si128(out + 0, a);
            _mm_store_si128(out + 1, b);
            _mm_store_si128(out + 2, c);
            _mm_store_si128(out + 3, d);

            src += 16;
            dst += 16;
        }
    }

    // Scalar tail: whatever is left (always fewer than 16 values here).
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
115 BENCH(memcpy32_sse2_align, 10) | |
116 BENCH(memcpy32_sse2_align, 100) | |
117 BENCH(memcpy32_sse2_align, 1000) | |
118 BENCH(memcpy32_sse2_align, 10000) | |
119 BENCH(memcpy32_sse2_align, 100000) | |
120 | |
// Copy count 32-bit values with SSE2 without aligning either pointer:
// unaligned loads read from src and unaligned stores write to dst.
// Anything left over after the 16-value passes is copied one value at a time.
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    // Main loop: each pass moves 16 values (four 128-bit registers).
    // All four loads are issued before any of the stores.
    for (; count >= 16; count -= 16) {
        const __m128i* in  = reinterpret_cast<const __m128i*>(src);
        __m128i*       out = reinterpret_cast<__m128i*>(dst);

        const __m128i a = _mm_loadu_si128(in + 0);
        const __m128i b = _mm_loadu_si128(in + 1);
        const __m128i c = _mm_loadu_si128(in + 2);
        const __m128i d = _mm_loadu_si128(in + 3);

        _mm_storeu_si128(out + 0, a);
        _mm_storeu_si128(out + 1, b);
        _mm_storeu_si128(out + 2, c);
        _mm_storeu_si128(out + 3, d);

        src += 16;
        dst += 16;
    }

    // Scalar tail: fewer than 16 values remain at this point.
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
145 BENCH(memcpy32_sse2_unalign, 10) | |
146 BENCH(memcpy32_sse2_unalign, 100) | |
147 BENCH(memcpy32_sse2_unalign, 1000) | |
148 BENCH(memcpy32_sse2_unalign, 10000) | |
149 BENCH(memcpy32_sse2_unalign, 100000) | |
150 | |
151 // Test our chosen best, from SkUtils.h | 70 // Test our chosen best, from SkUtils.h |
152 BENCH(sk_memcpy32, 10) | 71 BENCH(sk_memcpy32, 10) |
153 BENCH(sk_memcpy32, 100) | 72 BENCH(sk_memcpy32, 100) |
154 BENCH(sk_memcpy32, 1000) | 73 BENCH(sk_memcpy32, 1000) |
155 BENCH(sk_memcpy32, 10000) | 74 BENCH(sk_memcpy32, 10000) |
156 BENCH(sk_memcpy32, 100000) | 75 BENCH(sk_memcpy32, 100000) |
157 | 76 |
158 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
159 | |
160 #undef BENCH | 77 #undef BENCH |
OLD | NEW |