Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(423)

Side by Side Diff: bench/MemcpyBench.cpp

Issue 291893008: Fix memcpy32_sse2_unalign. (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: hoist, for align too Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2014 Google Inc. 2 * Copyright 2014 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBenchmark.h" 8 #include "SkBenchmark.h"
9 #include "SkRandom.h" 9 #include "SkRandom.h"
10 #include "SkTemplates.h" 10 #include "SkTemplates.h"
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
83 // Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads. 83 // Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads.
84 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { 84 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
85 if (count >= 16) { 85 if (count >= 16) {
86 while (uintptr_t(dst) & 0xF) { 86 while (uintptr_t(dst) & 0xF) {
87 *dst++ = *src++; 87 *dst++ = *src++;
88 count--; 88 count--;
89 } 89 }
90 90
91 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); 91 __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
92 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); 92 const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
93 dst += 16 * (count / 16);
94 src += 16 * (count / 16);
93 while (count >= 16) { 95 while (count >= 16) {
94 __m128i a = _mm_loadu_si128(src128++); 96 __m128i a = _mm_loadu_si128(src128++);
95 __m128i b = _mm_loadu_si128(src128++); 97 __m128i b = _mm_loadu_si128(src128++);
96 __m128i c = _mm_loadu_si128(src128++); 98 __m128i c = _mm_loadu_si128(src128++);
97 __m128i d = _mm_loadu_si128(src128++); 99 __m128i d = _mm_loadu_si128(src128++);
98 100
99 _mm_store_si128(dst128++, a); 101 _mm_store_si128(dst128++, a);
100 _mm_store_si128(dst128++, b); 102 _mm_store_si128(dst128++, b);
101 _mm_store_si128(dst128++, c); 103 _mm_store_si128(dst128++, c);
102 _mm_store_si128(dst128++, d); 104 _mm_store_si128(dst128++, d);
103 105
104 count -= 16; 106 count -= 16;
105 } 107 }
106
107 dst = reinterpret_cast<uint32_t*>(dst128);
108 src = reinterpret_cast<const uint32_t*>(src128);
109 } 108 }
110 109
111 while (count --> 0) { 110 while (count --> 0) {
112 *dst++ = *src++; 111 *dst++ = *src++;
113 } 112 }
114 } 113 }
115 BENCH(memcpy32_sse2_align, 10) 114 BENCH(memcpy32_sse2_align, 10)
116 BENCH(memcpy32_sse2_align, 100) 115 BENCH(memcpy32_sse2_align, 100)
117 BENCH(memcpy32_sse2_align, 1000) 116 BENCH(memcpy32_sse2_align, 1000)
118 BENCH(memcpy32_sse2_align, 10000) 117 BENCH(memcpy32_sse2_align, 10000)
119 BENCH(memcpy32_sse2_align, 100000) 118 BENCH(memcpy32_sse2_align, 100000)
120 119
121 // Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src. 120 // Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src.
122 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) { 121 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
123 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); 122 __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
124 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); 123 const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
124 dst += 16 * (count / 16);
125 src += 16 * (count / 16);
125 while (count >= 16) { 126 while (count >= 16) {
126 __m128i a = _mm_loadu_si128(src128++); 127 __m128i a = _mm_loadu_si128(src128++);
127 __m128i b = _mm_loadu_si128(src128++); 128 __m128i b = _mm_loadu_si128(src128++);
128 __m128i c = _mm_loadu_si128(src128++); 129 __m128i c = _mm_loadu_si128(src128++);
129 __m128i d = _mm_loadu_si128(src128++); 130 __m128i d = _mm_loadu_si128(src128++);
130 131
131 _mm_storeu_si128(dst128++, a); 132 _mm_storeu_si128(dst128++, a);
132 _mm_storeu_si128(dst128++, b); 133 _mm_storeu_si128(dst128++, b);
133 _mm_storeu_si128(dst128++, c); 134 _mm_storeu_si128(dst128++, c);
134 _mm_storeu_si128(dst128++, d); 135 _mm_storeu_si128(dst128++, d);
135 136
136 count -= 16; 137 count -= 16;
137 } 138 }
138 139
139 dst = reinterpret_cast<uint32_t*>(dst128);
140 src = reinterpret_cast<const uint32_t*>(src128);
141 while (count --> 0) { 140 while (count --> 0) {
142 *dst++ = *src++; 141 *dst++ = *src++;
143 } 142 }
144 } 143 }
145 // skia:2589: Crashing on ChromeOS Alex bot. TODO(mtklein): why? 144 BENCH(memcpy32_sse2_unalign, 10)
146 //BENCH(memcpy32_sse2_unalign, 10)
147 BENCH(memcpy32_sse2_unalign, 100) 145 BENCH(memcpy32_sse2_unalign, 100)
148 BENCH(memcpy32_sse2_unalign, 1000) 146 BENCH(memcpy32_sse2_unalign, 1000)
149 BENCH(memcpy32_sse2_unalign, 10000) 147 BENCH(memcpy32_sse2_unalign, 10000)
150 BENCH(memcpy32_sse2_unalign, 100000) 148 BENCH(memcpy32_sse2_unalign, 100000)
151 149
152 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 150 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
153 151
154 #undef BENCH 152 #undef BENCH
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698