| Index: source/libvpx/third_party/libyuv/source/compare_win.cc
|
| diff --git a/source/libvpx/third_party/libyuv/source/compare_win.cc b/source/libvpx/third_party/libyuv/source/compare_win.cc
|
| index 99831651f5f58d723dc044dd0f447bc13dcfe7a1..e99009a21dff24c63f341f9ef3344068feffaeb0 100644
|
| --- a/source/libvpx/third_party/libyuv/source/compare_win.cc
|
| +++ b/source/libvpx/third_party/libyuv/source/compare_win.cc
|
| @@ -27,13 +27,11 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
| pxor xmm0, xmm0
|
| pxor xmm5, xmm5
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm1, [eax]
|
| + movdqu xmm1, [eax]
|
| lea eax, [eax + 16]
|
| - movdqa xmm2, [edx]
|
| + movdqu xmm2, [edx]
|
| lea edx, [edx + 16]
|
| - sub ecx, 16
|
| movdqa xmm3, xmm1 // abs trick
|
| psubusb xmm1, xmm2
|
| psubusb xmm2, xmm3
|
| @@ -45,6 +43,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
| pmaddwd xmm2, xmm2
|
| paddd xmm0, xmm1
|
| paddd xmm0, xmm2
|
| + sub ecx, 16
|
| jg wloop
|
|
|
| pshufd xmm1, xmm0, 0xee
|
| @@ -70,12 +69,10 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
| vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
|
| sub edx, eax
|
|
|
| - align 4
|
| wloop:
|
| vmovdqu ymm1, [eax]
|
| vmovdqu ymm2, [eax + edx]
|
| lea eax, [eax + 32]
|
| - sub ecx, 32
|
| vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
| vpsubusb ymm2, ymm2, ymm1
|
| vpor ymm1, ymm2, ymm3
|
| @@ -85,6 +82,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
| vpmaddwd ymm1, ymm1, ymm1
|
| vpaddd ymm0, ymm0, ymm1
|
| vpaddd ymm0, ymm0, ymm2
|
| + sub ecx, 32
|
| jg wloop
|
|
|
| vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
| @@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
| pxor xmm7, xmm7 // constant 0 for unpck
|
| movdqa xmm6, kHash16x33
|
|
|
| - align 4
|
| wloop:
|
| movdqu xmm1, [eax] // src[0-15]
|
| lea eax, [eax + 16]
|
| @@ -170,7 +167,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
| pmulld(0xcd) // pmulld xmm1, xmm5
|
| paddd xmm3, xmm4 // add 16 results
|
| paddd xmm1, xmm2
|
| - sub ecx, 16
|
| paddd xmm1, xmm3
|
|
|
| pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
| @@ -178,6 +174,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
| pshufd xmm2, xmm1, 0x01
|
| paddd xmm1, xmm2
|
| paddd xmm0, xmm1
|
| + sub ecx, 16
|
| jg wloop
|
|
|
| movd eax, xmm0 // return hash
|
| @@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
| movd xmm0, [esp + 12] // seed
|
| movdqa xmm6, kHash16x33
|
|
|
| - align 4
|
| wloop:
|
| vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
|
| pmulld xmm0, xmm6 // hash *= 33 ^ 16
|
| @@ -209,13 +205,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
| pmulld xmm1, kHashMul3
|
| paddd xmm3, xmm4 // add 16 results
|
| paddd xmm1, xmm2
|
| - sub ecx, 16
|
| paddd xmm1, xmm3
|
| pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
| paddd xmm1, xmm2
|
| pshufd xmm2, xmm1, 0x01
|
| paddd xmm1, xmm2
|
| paddd xmm0, xmm1
|
| + sub ecx, 16
|
| jg wloop
|
|
|
| movd eax, xmm0 // return hash
|
|
|