| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 Google Inc. | 2 * Copyright 2014 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBenchmark.h" | 8 #include "SkBenchmark.h" |
| 9 #include "SkRandom.h" | 9 #include "SkRandom.h" |
| 10 #include "SkTemplates.h" | 10 #include "SkTemplates.h" |
| (...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 83 // Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads. | 83 // Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads. |
| 84 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { | 84 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { |
| 85 if (count >= 16) { | 85 if (count >= 16) { |
| 86 while (uintptr_t(dst) & 0xF) { | 86 while (uintptr_t(dst) & 0xF) { |
| 87 *dst++ = *src++; | 87 *dst++ = *src++; |
| 88 count--; | 88 count--; |
| 89 } | 89 } |
| 90 | 90 |
| 91 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | 91 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); |
| 92 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | 92 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); |
| 93 dst += 16 * (count / 16); |
| 94 src += 16 * (count / 16); |
| 93 while (count >= 16) { | 95 while (count >= 16) { |
| 94 __m128i a = _mm_loadu_si128(src128++); | 96 __m128i a = _mm_loadu_si128(src128++); |
| 95 __m128i b = _mm_loadu_si128(src128++); | 97 __m128i b = _mm_loadu_si128(src128++); |
| 96 __m128i c = _mm_loadu_si128(src128++); | 98 __m128i c = _mm_loadu_si128(src128++); |
| 97 __m128i d = _mm_loadu_si128(src128++); | 99 __m128i d = _mm_loadu_si128(src128++); |
| 98 | 100 |
| 99 _mm_store_si128(dst128++, a); | 101 _mm_store_si128(dst128++, a); |
| 100 _mm_store_si128(dst128++, b); | 102 _mm_store_si128(dst128++, b); |
| 101 _mm_store_si128(dst128++, c); | 103 _mm_store_si128(dst128++, c); |
| 102 _mm_store_si128(dst128++, d); | 104 _mm_store_si128(dst128++, d); |
| 103 | 105 |
| 104 count -= 16; | 106 count -= 16; |
| 105 } | 107 } |
| 106 | |
| 107 dst = reinterpret_cast<uint32_t*>(dst128); | |
| 108 src = reinterpret_cast<const uint32_t*>(src128); | |
| 109 } | 108 } |
| 110 | 109 |
| 111 while (count --> 0) { | 110 while (count --> 0) { |
| 112 *dst++ = *src++; | 111 *dst++ = *src++; |
| 113 } | 112 } |
| 114 } | 113 } |
| 115 BENCH(memcpy32_sse2_align, 10) | 114 BENCH(memcpy32_sse2_align, 10) |
| 116 BENCH(memcpy32_sse2_align, 100) | 115 BENCH(memcpy32_sse2_align, 100) |
| 117 BENCH(memcpy32_sse2_align, 1000) | 116 BENCH(memcpy32_sse2_align, 1000) |
| 118 BENCH(memcpy32_sse2_align, 10000) | 117 BENCH(memcpy32_sse2_align, 10000) |
| 119 BENCH(memcpy32_sse2_align, 100000) | 118 BENCH(memcpy32_sse2_align, 100000) |
| 120 | 119 |
| 121 // Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src. | 120 // Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src. |
| 122 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) { | 121 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) { |
| 123 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | 122 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); |
| 124 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | 123 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); |
| 124 dst += 16 * (count / 16); |
| 125 src += 16 * (count / 16); |
| 125 while (count >= 16) { | 126 while (count >= 16) { |
| 126 __m128i a = _mm_loadu_si128(src128++); | 127 __m128i a = _mm_loadu_si128(src128++); |
| 127 __m128i b = _mm_loadu_si128(src128++); | 128 __m128i b = _mm_loadu_si128(src128++); |
| 128 __m128i c = _mm_loadu_si128(src128++); | 129 __m128i c = _mm_loadu_si128(src128++); |
| 129 __m128i d = _mm_loadu_si128(src128++); | 130 __m128i d = _mm_loadu_si128(src128++); |
| 130 | 131 |
| 131 _mm_storeu_si128(dst128++, a); | 132 _mm_storeu_si128(dst128++, a); |
| 132 _mm_storeu_si128(dst128++, b); | 133 _mm_storeu_si128(dst128++, b); |
| 133 _mm_storeu_si128(dst128++, c); | 134 _mm_storeu_si128(dst128++, c); |
| 134 _mm_storeu_si128(dst128++, d); | 135 _mm_storeu_si128(dst128++, d); |
| 135 | 136 |
| 136 count -= 16; | 137 count -= 16; |
| 137 } | 138 } |
| 138 | 139 |
| 139 dst = reinterpret_cast<uint32_t*>(dst128); | |
| 140 src = reinterpret_cast<const uint32_t*>(src128); | |
| 141 while (count --> 0) { | 140 while (count --> 0) { |
| 142 *dst++ = *src++; | 141 *dst++ = *src++; |
| 143 } | 142 } |
| 144 } | 143 } |
| 145 // skia:2589: Crashing on ChromeOS Alex bot. TODO(mtklein): why? | 144 BENCH(memcpy32_sse2_unalign, 10) |
| 146 //BENCH(memcpy32_sse2_unalign, 10) | |
| 147 BENCH(memcpy32_sse2_unalign, 100) | 145 BENCH(memcpy32_sse2_unalign, 100) |
| 148 BENCH(memcpy32_sse2_unalign, 1000) | 146 BENCH(memcpy32_sse2_unalign, 1000) |
| 149 BENCH(memcpy32_sse2_unalign, 10000) | 147 BENCH(memcpy32_sse2_unalign, 10000) |
| 150 BENCH(memcpy32_sse2_unalign, 100000) | 148 BENCH(memcpy32_sse2_unalign, 100000) |
| 151 | 149 |
| 152 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 150 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 153 | 151 |
| 154 #undef BENCH | 152 #undef BENCH |
| OLD | NEW |