OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2009 The Android Open Source Project | 2 * Copyright 2009 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
9 #include "SkUtils_opts_SSE2.h" | 9 #include "SkUtils_opts_SSE2.h" |
10 | 10 |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
60 d += 4; | 60 d += 4; |
61 count -= 16; | 61 count -= 16; |
62 } | 62 } |
63 dst = reinterpret_cast<uint32_t*>(d); | 63 dst = reinterpret_cast<uint32_t*>(d); |
64 } | 64 } |
65 while (count > 0) { | 65 while (count > 0) { |
66 *dst++ = value; | 66 *dst++ = value; |
67 --count; | 67 --count; |
68 } | 68 } |
69 } | 69 } |
70 | |
/**
 * Copies `count` 32-bit values from `src` to `dst` using SSE2.
 *
 * When at least 16 values are requested and `dst` is 4-byte aligned, leading
 * values are copied one at a time until `dst` sits on a 16-byte boundary.
 * After that, 16 values (four 128-bit vectors) are moved per iteration using
 * unaligned loads and aligned stores. Whatever remains is copied one value
 * at a time.
 *
 * @param dst   Destination buffer. If it is not 4-byte aligned the whole copy
 *              is done by the scalar loop.
 * @param src   Source buffer. It is read with unaligned vector loads, so it
 *              needs no 16-byte alignment.
 * @param count Number of 32-bit values to copy; nothing is copied when
 *              count <= 0.
 */
void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count)
{
    // The alignment prologue advances dst in 4-byte steps, so it can only
    // ever land on a 16-byte boundary if dst starts out 4-byte aligned.
    // Without this check a misaligned dst would keep the prologue running
    // past the end of the buffer while driving count negative.
    const bool dstIsWordAligned = (reinterpret_cast<size_t>(dst) & 0x03) == 0;

    if (count >= 16 && dstIsWordAligned) {
        // At most three scalar copies are needed to reach a 16-byte boundary.
        while (reinterpret_cast<size_t>(dst) & 0x0F) {
            *dst++ = *src++;
            --count;
        }

        __m128i *dst128 = reinterpret_cast<__m128i*>(dst);
        const __m128i *src128 = reinterpret_cast<const __m128i*>(src);

        // The prologue may have dropped count below 16, hence the re-check.
        while (count >= 16) {
            // src is not necessarily 16-byte aligned, so use unaligned loads.
            // All four loads are issued before any store, as in the original.
            const __m128i a = _mm_loadu_si128(src128++);
            const __m128i b = _mm_loadu_si128(src128++);
            const __m128i c = _mm_loadu_si128(src128++);
            const __m128i d = _mm_loadu_si128(src128++);

            // dst128 is 16-byte aligned here, so aligned stores are valid.
            // NOTE(review): the patch author reported plain stores performing
            // better than non-temporal (_mm_stream) stores in their own
            // measurements; that has not been re-measured here.
            _mm_store_si128(dst128++, a);
            _mm_store_si128(dst128++, b);
            _mm_store_si128(dst128++, c);
            _mm_store_si128(dst128++, d);

            count -= 16;
        }

        dst = reinterpret_cast<uint32_t*>(dst128);
        src = reinterpret_cast<const uint32_t*>(src128);
    }

    // Scalar tail; also the complete copy for short or misaligned requests.
    while (count > 0) {
        *dst++ = *src++;
        --count;
    }
}
OLD | NEW |