OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 Google Inc. | 2 * Copyright 2014 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBenchmark.h" | 8 #include "SkBenchmark.h" |
9 #include "SkRandom.h" | 9 #include "SkRandom.h" |
10 #include "SkTemplates.h" | 10 #include "SkTemplates.h" |
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
83 // Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use un
aligned loads. | 83 // Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use un
aligned loads. |
84 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { | 84 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { |
85 if (count >= 16) { | 85 if (count >= 16) { |
86 while (uintptr_t(dst) & 0xF) { | 86 while (uintptr_t(dst) & 0xF) { |
87 *dst++ = *src++; | 87 *dst++ = *src++; |
88 count--; | 88 count--; |
89 } | 89 } |
90 | 90 |
91 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | 91 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); |
92 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | 92 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); |
| 93 dst += 16 * (count / 16); |
| 94 src += 16 * (count / 16); |
93 while (count >= 16) { | 95 while (count >= 16) { |
94 __m128i a = _mm_loadu_si128(src128++); | 96 __m128i a = _mm_loadu_si128(src128++); |
95 __m128i b = _mm_loadu_si128(src128++); | 97 __m128i b = _mm_loadu_si128(src128++); |
96 __m128i c = _mm_loadu_si128(src128++); | 98 __m128i c = _mm_loadu_si128(src128++); |
97 __m128i d = _mm_loadu_si128(src128++); | 99 __m128i d = _mm_loadu_si128(src128++); |
98 | 100 |
99 _mm_store_si128(dst128++, a); | 101 _mm_store_si128(dst128++, a); |
100 _mm_store_si128(dst128++, b); | 102 _mm_store_si128(dst128++, b); |
101 _mm_store_si128(dst128++, c); | 103 _mm_store_si128(dst128++, c); |
102 _mm_store_si128(dst128++, d); | 104 _mm_store_si128(dst128++, d); |
103 | 105 |
104 count -= 16; | 106 count -= 16; |
105 } | 107 } |
106 | |
107 dst = reinterpret_cast<uint32_t*>(dst128); | |
108 src = reinterpret_cast<const uint32_t*>(src128); | |
109 } | 108 } |
110 | 109 |
111 while (count --> 0) { | 110 while (count --> 0) { |
112 *dst++ = *src++; | 111 *dst++ = *src++; |
113 } | 112 } |
114 } | 113 } |
115 BENCH(memcpy32_sse2_align, 10) | 114 BENCH(memcpy32_sse2_align, 10) |
116 BENCH(memcpy32_sse2_align, 100) | 115 BENCH(memcpy32_sse2_align, 100) |
117 BENCH(memcpy32_sse2_align, 1000) | 116 BENCH(memcpy32_sse2_align, 1000) |
118 BENCH(memcpy32_sse2_align, 10000) | 117 BENCH(memcpy32_sse2_align, 10000) |
119 BENCH(memcpy32_sse2_align, 100000) | 118 BENCH(memcpy32_sse2_align, 100000) |
120 | 119 |
121 // Leave both dst and src unaligned, and so use unaligned stores for dst and unal
igned loads for src. | 120 // Leave both dst and src unaligned, and so use unaligned stores for dst and unal
igned loads for src. |
122 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count)
{ | 121 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count)
{ |
123 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | 122 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); |
124 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | 123 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); |
| 124 dst += 16 * (count / 16); |
| 125 src += 16 * (count / 16); |
125 while (count >= 16) { | 126 while (count >= 16) { |
126 __m128i a = _mm_loadu_si128(src128++); | 127 __m128i a = _mm_loadu_si128(src128++); |
127 __m128i b = _mm_loadu_si128(src128++); | 128 __m128i b = _mm_loadu_si128(src128++); |
128 __m128i c = _mm_loadu_si128(src128++); | 129 __m128i c = _mm_loadu_si128(src128++); |
129 __m128i d = _mm_loadu_si128(src128++); | 130 __m128i d = _mm_loadu_si128(src128++); |
130 | 131 |
131 _mm_storeu_si128(dst128++, a); | 132 _mm_storeu_si128(dst128++, a); |
132 _mm_storeu_si128(dst128++, b); | 133 _mm_storeu_si128(dst128++, b); |
133 _mm_storeu_si128(dst128++, c); | 134 _mm_storeu_si128(dst128++, c); |
134 _mm_storeu_si128(dst128++, d); | 135 _mm_storeu_si128(dst128++, d); |
135 | 136 |
136 count -= 16; | 137 count -= 16; |
137 } | 138 } |
138 | 139 |
139 dst = reinterpret_cast<uint32_t*>(dst128); | |
140 src = reinterpret_cast<const uint32_t*>(src128); | |
141 while (count --> 0) { | 140 while (count --> 0) { |
142 *dst++ = *src++; | 141 *dst++ = *src++; |
143 } | 142 } |
144 } | 143 } |
145 // skia:2589: Crashing on ChromeOS Alex bot. TODO(mtklein): why? | 144 BENCH(memcpy32_sse2_unalign, 10) |
146 //BENCH(memcpy32_sse2_unalign, 10) | |
147 BENCH(memcpy32_sse2_unalign, 100) | 145 BENCH(memcpy32_sse2_unalign, 100) |
148 BENCH(memcpy32_sse2_unalign, 1000) | 146 BENCH(memcpy32_sse2_unalign, 1000) |
149 BENCH(memcpy32_sse2_unalign, 10000) | 147 BENCH(memcpy32_sse2_unalign, 10000) |
150 BENCH(memcpy32_sse2_unalign, 100000) | 148 BENCH(memcpy32_sse2_unalign, 100000) |
151 | 149 |
152 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 150 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
153 | 151 |
154 #undef BENCH | 152 #undef BENCH |
OLD | NEW |