| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 Google Inc. | 2 * Copyright 2014 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "Benchmark.h" | 8 #include "Benchmark.h" |
| 9 #include "SkRandom.h" | 9 #include "SkRandom.h" |
| 10 #include "SkTemplates.h" | 10 #include "SkTemplates.h" |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 60 // Let the libc developers do what they think is best. | 60 // Let the libc developers do what they think is best. |
| 61 static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) { | 61 static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) { |
| 62 memcpy(dst, src, sizeof(uint32_t) * count); | 62 memcpy(dst, src, sizeof(uint32_t) * count); |
| 63 } | 63 } |
| 64 BENCH(memcpy32_memcpy, 10) | 64 BENCH(memcpy32_memcpy, 10) |
| 65 BENCH(memcpy32_memcpy, 100) | 65 BENCH(memcpy32_memcpy, 100) |
| 66 BENCH(memcpy32_memcpy, 1000) | 66 BENCH(memcpy32_memcpy, 1000) |
| 67 BENCH(memcpy32_memcpy, 10000) | 67 BENCH(memcpy32_memcpy, 10000) |
| 68 BENCH(memcpy32_memcpy, 100000) | 68 BENCH(memcpy32_memcpy, 100000) |
| 69 | 69 |
| 70 // Let the compiler's autovectorizer do what it thinks is best. | |
| 71 static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count
) { | |
| 72 while (count --> 0) { | |
| 73 *dst++ = *src++; | |
| 74 } | |
| 75 } | |
| 76 BENCH(memcpy32_autovectorize, 10) | |
| 77 BENCH(memcpy32_autovectorize, 100) | |
| 78 BENCH(memcpy32_autovectorize, 1000) | |
| 79 BENCH(memcpy32_autovectorize, 10000) | |
| 80 BENCH(memcpy32_autovectorize, 100000) | |
| 81 | |
| 82 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
| 83 | |
| 84 // Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads. | |
| 85 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { | |
| 86 if (count >= 16) { | |
| 87 while (uintptr_t(dst) & 0xF) { | |
| 88 *dst++ = *src++; | |
| 89 count--; | |
| 90 } | |
| 91 | |
| 92 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | |
| 93 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | |
| 94 dst += 16 * (count / 16); | |
| 95 src += 16 * (count / 16); | |
| 96 while (count >= 16) { | |
| 97 __m128i a = _mm_loadu_si128(src128++); | |
| 98 __m128i b = _mm_loadu_si128(src128++); | |
| 99 __m128i c = _mm_loadu_si128(src128++); | |
| 100 __m128i d = _mm_loadu_si128(src128++); | |
| 101 | |
| 102 _mm_store_si128(dst128++, a); | |
| 103 _mm_store_si128(dst128++, b); | |
| 104 _mm_store_si128(dst128++, c); | |
| 105 _mm_store_si128(dst128++, d); | |
| 106 | |
| 107 count -= 16; | |
| 108 } | |
| 109 } | |
| 110 | |
| 111 while (count --> 0) { | |
| 112 *dst++ = *src++; | |
| 113 } | |
| 114 } | |
| 115 BENCH(memcpy32_sse2_align, 10) | |
| 116 BENCH(memcpy32_sse2_align, 100) | |
| 117 BENCH(memcpy32_sse2_align, 1000) | |
| 118 BENCH(memcpy32_sse2_align, 10000) | |
| 119 BENCH(memcpy32_sse2_align, 100000) | |
| 120 | |
| 121 // Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src. | |
| 122 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count)
{ | |
| 123 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | |
| 124 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | |
| 125 dst += 16 * (count / 16); | |
| 126 src += 16 * (count / 16); | |
| 127 while (count >= 16) { | |
| 128 __m128i a = _mm_loadu_si128(src128++); | |
| 129 __m128i b = _mm_loadu_si128(src128++); | |
| 130 __m128i c = _mm_loadu_si128(src128++); | |
| 131 __m128i d = _mm_loadu_si128(src128++); | |
| 132 | |
| 133 _mm_storeu_si128(dst128++, a); | |
| 134 _mm_storeu_si128(dst128++, b); | |
| 135 _mm_storeu_si128(dst128++, c); | |
| 136 _mm_storeu_si128(dst128++, d); | |
| 137 | |
| 138 count -= 16; | |
| 139 } | |
| 140 | |
| 141 while (count --> 0) { | |
| 142 *dst++ = *src++; | |
| 143 } | |
| 144 } | |
| 145 BENCH(memcpy32_sse2_unalign, 10) | |
| 146 BENCH(memcpy32_sse2_unalign, 100) | |
| 147 BENCH(memcpy32_sse2_unalign, 1000) | |
| 148 BENCH(memcpy32_sse2_unalign, 10000) | |
| 149 BENCH(memcpy32_sse2_unalign, 100000) | |
| 150 | |
| 151 // Test our chosen best, from SkUtils.h | 70 // Test our chosen best, from SkUtils.h |
| 152 BENCH(sk_memcpy32, 10) | 71 BENCH(sk_memcpy32, 10) |
| 153 BENCH(sk_memcpy32, 100) | 72 BENCH(sk_memcpy32, 100) |
| 154 BENCH(sk_memcpy32, 1000) | 73 BENCH(sk_memcpy32, 1000) |
| 155 BENCH(sk_memcpy32, 10000) | 74 BENCH(sk_memcpy32, 10000) |
| 156 BENCH(sk_memcpy32, 100000) | 75 BENCH(sk_memcpy32, 100000) |
| 157 | 76 |
| 158 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
| 159 | |
| 160 #undef BENCH | 77 #undef BENCH |
| OLD | NEW |