OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2012 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "libyuv/basic_types.h" | 11 #include "libyuv/basic_types.h" |
12 #include "libyuv/row.h" | 12 #include "libyuv/row.h" |
13 | 13 |
14 #ifdef __cplusplus | 14 #ifdef __cplusplus |
15 namespace libyuv { | 15 namespace libyuv { |
16 extern "C" { | 16 extern "C" { |
17 #endif | 17 #endif |
18 | 18 |
19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) | 19 // This module is for Visual C x86. |
| 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ |
| 21 defined(_MSC_VER) && !defined(__clang__) |
20 | 22 |
21 __declspec(naked) __declspec(align(16)) | 23 __declspec(naked) |
22 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { | 24 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { |
23 __asm { | 25 __asm { |
24 mov eax, [esp + 4] // src_a | 26 mov eax, [esp + 4] // src_a |
25 mov edx, [esp + 8] // src_b | 27 mov edx, [esp + 8] // src_b |
26 mov ecx, [esp + 12] // count | 28 mov ecx, [esp + 12] // count |
27 pxor xmm0, xmm0 | 29 pxor xmm0, xmm0 |
28 pxor xmm5, xmm5 | 30 pxor xmm5, xmm5 |
29 | 31 |
30 wloop: | 32 wloop: |
31 movdqu xmm1, [eax] | 33 movdqu xmm1, [eax] |
(...skipping 20 matching lines...) |
52 paddd xmm0, xmm1 | 54 paddd xmm0, xmm1 |
53 movd eax, xmm0 | 55 movd eax, xmm0 |
54 ret | 56 ret |
55 } | 57 } |
56 } | 58 } |
57 | 59 |
58 // Visual C 2012 required for AVX2. | 60 // Visual C 2012 required for AVX2. |
59 #if _MSC_VER >= 1700 | 61 #if _MSC_VER >= 1700 |
60 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. | 62 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. |
61 #pragma warning(disable: 4752) | 63 #pragma warning(disable: 4752) |
62 __declspec(naked) __declspec(align(16)) | 64 __declspec(naked) |
63 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { | 65 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { |
64 __asm { | 66 __asm { |
65 mov eax, [esp + 4] // src_a | 67 mov eax, [esp + 4] // src_a |
66 mov edx, [esp + 8] // src_b | 68 mov edx, [esp + 8] // src_b |
67 mov ecx, [esp + 12] // count | 69 mov ecx, [esp + 12] // count |
68 vpxor ymm0, ymm0, ymm0 // sum | 70 vpxor ymm0, ymm0, ymm0 // sum |
69 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck | 71 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck |
70 sub edx, eax | 72 sub edx, eax |
71 | 73 |
72 wloop: | 74 wloop: |
(...skipping 53 matching lines...) |
126 }; | 128 }; |
127 | 129 |
128 // 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 | 130 // 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 |
129 // 44: 66 0F 38 40 DD pmulld xmm3,xmm5 | 131 // 44: 66 0F 38 40 DD pmulld xmm3,xmm5 |
130 // 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 | 132 // 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 |
131 // 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 | 133 // 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 |
132 // 83: 66 0F 38 40 CD pmulld xmm1,xmm5 | 134 // 83: 66 0F 38 40 CD pmulld xmm1,xmm5 |
133 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ | 135 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ |
134 _asm _emit 0x40 _asm _emit reg | 136 _asm _emit 0x40 _asm _emit reg |
135 | 137 |
136 __declspec(naked) __declspec(align(16)) | 138 __declspec(naked) |
137 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { | 139 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { |
138 __asm { | 140 __asm { |
139 mov eax, [esp + 4] // src | 141 mov eax, [esp + 4] // src |
140 mov ecx, [esp + 8] // count | 142 mov ecx, [esp + 8] // count |
141 movd xmm0, [esp + 12] // seed | 143 movd xmm0, [esp + 12] // seed |
142 | 144 |
143 pxor xmm7, xmm7 // constant 0 for unpck | 145 pxor xmm7, xmm7 // constant 0 for unpck |
144 movdqa xmm6, kHash16x33 | 146 movdqa xmm6, kHash16x33 |
145 | 147 |
146 wloop: | 148 wloop: |
(...skipping 30 matching lines...) |
177 sub ecx, 16 | 179 sub ecx, 16 |
178 jg wloop | 180 jg wloop |
179 | 181 |
180 movd eax, xmm0 // return hash | 182 movd eax, xmm0 // return hash |
181 ret | 183 ret |
182 } | 184 } |
183 } | 185 } |
184 | 186 |
185 // Visual C 2012 required for AVX2. | 187 // Visual C 2012 required for AVX2. |
186 #if _MSC_VER >= 1700 | 188 #if _MSC_VER >= 1700 |
187 __declspec(naked) __declspec(align(16)) | 189 __declspec(naked) |
188 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { | 190 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { |
189 __asm { | 191 __asm { |
190 mov eax, [esp + 4] // src | 192 mov eax, [esp + 4] // src |
191 mov ecx, [esp + 8] // count | 193 mov ecx, [esp + 8] // count |
192 movd xmm0, [esp + 12] // seed | 194 movd xmm0, [esp + 12] // seed |
193 movdqa xmm6, kHash16x33 | 195 movdqa xmm6, kHash16x33 |
194 | 196 |
195 wloop: | 197 wloop: |
196 vpmovzxbd xmm3, dword ptr [eax] // src[0-3] | 198 vpmovzxbd xmm3, dword ptr [eax] // src[0-3] |
197 pmulld xmm0, xmm6 // hash *= 33 ^ 16 | 199 pmulld xmm0, xmm6 // hash *= 33 ^ 16 |
(...skipping 14 matching lines...) |
212 paddd xmm1, xmm2 | 214 paddd xmm1, xmm2 |
213 paddd xmm0, xmm1 | 215 paddd xmm0, xmm1 |
214 sub ecx, 16 | 216 sub ecx, 16 |
215 jg wloop | 217 jg wloop |
216 | 218 |
217 movd eax, xmm0 // return hash | 219 movd eax, xmm0 // return hash |
218 ret | 220 ret |
219 } | 221 } |
220 } | 222 } |
221 #endif // _MSC_VER >= 1700 | 223 #endif // _MSC_VER >= 1700 |
222 | 224 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
223 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) | |
224 | 225 |
225 #ifdef __cplusplus | 226 #ifdef __cplusplus |
226 } // extern "C" | 227 } // extern "C" |
227 } // namespace libyuv | 228 } // namespace libyuv |
228 #endif | 229 #endif |
OLD | NEW |