| Index: source/libvpx/third_party/libyuv/source/scale_win.cc
|
| diff --git a/source/libvpx/third_party/libyuv/source/scale_win.cc b/source/libvpx/third_party/libyuv/source/scale_win.cc
|
| index 840b9738da53009f95b0a03aed3995aa9a690d0c..e0209cdec8c243d1b06dd4159c6f7c553b380798 100644
|
| --- a/source/libvpx/third_party/libyuv/source/scale_win.cc
|
| +++ b/source/libvpx/third_party/libyuv/source/scale_win.cc
|
| @@ -103,118 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| mov edx, [esp + 12] // dst_ptr
|
| mov ecx, [esp + 16] // dst_width
|
|
|
| - align 4
|
| - wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - psrlw xmm0, 8 // isolate odd pixels.
|
| - psrlw xmm1, 8
|
| - packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg wloop
|
| -
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// Blends 32x1 rectangle to 16x1.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| - uint8* dst_ptr, int dst_width) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_ptr
|
| - // src_stride
|
| - mov edx, [esp + 12] // dst_ptr
|
| - mov ecx, [esp + 16] // dst_width
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| -
|
| - align 4
|
| - wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| -
|
| - movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
| - psrlw xmm0, 8
|
| - movdqa xmm3, xmm1
|
| - psrlw xmm1, 8
|
| - pand xmm2, xmm5
|
| - pand xmm3, xmm5
|
| - pavgw xmm0, xmm2
|
| - pavgw xmm1, xmm3
|
| - packuswb xmm0, xmm1
|
| -
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg wloop
|
| -
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// Blends 32x2 rectangle to 16x1.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| - uint8* dst_ptr, int dst_width) {
|
| - __asm {
|
| - push esi
|
| - mov eax, [esp + 4 + 4] // src_ptr
|
| - mov esi, [esp + 4 + 8] // src_stride
|
| - mov edx, [esp + 4 + 12] // dst_ptr
|
| - mov ecx, [esp + 4 + 16] // dst_width
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| -
|
| - align 4
|
| - wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + esi]
|
| - movdqa xmm3, [eax + esi + 16]
|
| - lea eax, [eax + 32]
|
| - pavgb xmm0, xmm2 // average rows
|
| - pavgb xmm1, xmm3
|
| -
|
| - movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
| - psrlw xmm0, 8
|
| - movdqa xmm3, xmm1
|
| - psrlw xmm1, 8
|
| - pand xmm2, xmm5
|
| - pand xmm3, xmm5
|
| - pavgw xmm0, xmm2
|
| - pavgw xmm1, xmm3
|
| - packuswb xmm0, xmm1
|
| -
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg wloop
|
| -
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// Reads 32 pixels, throws half away and writes 16 pixels.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| -void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
|
| - ptrdiff_t src_stride,
|
| - uint8* dst_ptr, int dst_width) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_ptr
|
| - // src_stride ignored
|
| - mov edx, [esp + 12] // dst_ptr
|
| - mov ecx, [esp + 16] // dst_width
|
| -
|
| - align 4
|
| wloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -222,9 +110,9 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
|
| psrlw xmm0, 8 // isolate odd pixels.
|
| psrlw xmm1, 8
|
| packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg wloop
|
|
|
| ret
|
| @@ -234,9 +122,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
|
| // Blends 32x1 rectangle to 16x1.
|
| // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| __declspec(naked) __declspec(align(16))
|
| -void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
|
| - ptrdiff_t src_stride,
|
| - uint8* dst_ptr, int dst_width) {
|
| +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| mov eax, [esp + 4] // src_ptr
|
| // src_stride
|
| @@ -245,7 +132,6 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
|
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| psrlw xmm5, 8
|
|
|
| - align 4
|
| wloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -261,9 +147,9 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
|
| pavgw xmm1, xmm3
|
| packuswb xmm0, xmm1
|
|
|
| - sub ecx, 16
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg wloop
|
|
|
| ret
|
| @@ -273,9 +159,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
|
| // Blends 32x2 rectangle to 16x1.
|
| // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| __declspec(naked) __declspec(align(16))
|
| -void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
|
| - ptrdiff_t src_stride,
|
| - uint8* dst_ptr, int dst_width) {
|
| +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| push esi
|
| mov eax, [esp + 4 + 4] // src_ptr
|
| @@ -285,7 +170,6 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
|
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| psrlw xmm5, 8
|
|
|
| - align 4
|
| wloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -305,9 +189,9 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
|
| pavgw xmm1, xmm3
|
| packuswb xmm0, xmm1
|
|
|
| - sub ecx, 16
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg wloop
|
|
|
| pop esi
|
| @@ -329,19 +213,18 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| psrld xmm5, 24
|
| pslld xmm5, 16
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| lea eax, [eax + 32]
|
| pand xmm0, xmm5
|
| pand xmm1, xmm5
|
| packuswb xmm0, xmm1
|
| psrlw xmm0, 8
|
| packuswb xmm0, xmm0
|
| - sub ecx, 8
|
| movq qword ptr [edx], xmm0
|
| lea edx, [edx + 8]
|
| + sub ecx, 8
|
| jg wloop
|
|
|
| ret
|
| @@ -364,18 +247,17 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
|
| psrlw xmm7, 8
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + esi]
|
| - movdqa xmm3, [eax + esi + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + esi]
|
| + movdqu xmm3, [eax + esi + 16]
|
| pavgb xmm0, xmm2 // average rows
|
| pavgb xmm1, xmm3
|
| - movdqa xmm2, [eax + esi * 2]
|
| - movdqa xmm3, [eax + esi * 2 + 16]
|
| - movdqa xmm4, [eax + edi]
|
| - movdqa xmm5, [eax + edi + 16]
|
| + movdqu xmm2, [eax + esi * 2]
|
| + movdqu xmm3, [eax + esi * 2 + 16]
|
| + movdqu xmm4, [eax + edi]
|
| + movdqu xmm5, [eax + edi + 16]
|
| lea eax, [eax + 32]
|
| pavgb xmm2, xmm4
|
| pavgb xmm3, xmm5
|
| @@ -398,9 +280,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| pavgw xmm0, xmm2
|
| packuswb xmm0, xmm0
|
|
|
| - sub ecx, 8
|
| movq qword ptr [edx], xmm0
|
| lea edx, [edx + 8]
|
| + sub ecx, 8
|
| jg wloop
|
|
|
| pop edi
|
| @@ -427,10 +309,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| movdqa xmm4, kShuf1
|
| movdqa xmm5, kShuf2
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| lea eax, [eax + 32]
|
| movdqa xmm2, xmm1
|
| palignr xmm1, xmm0, 8
|
| @@ -481,10 +362,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
| movdqa xmm6, kMadd11
|
| movdqa xmm7, kRound34
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax] // pixels 0..7
|
| - movdqa xmm1, [eax + esi]
|
| + movdqu xmm0, [eax] // pixels 0..7
|
| + movdqu xmm1, [eax + esi]
|
| pavgb xmm0, xmm1
|
| pshufb xmm0, xmm2
|
| pmaddubsw xmm0, xmm5
|
| @@ -501,8 +381,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
| psrlw xmm0, 2
|
| packuswb xmm0, xmm0
|
| movq qword ptr [edx + 8], xmm0
|
| - movdqa xmm0, [eax + 16] // pixels 16..23
|
| - movdqa xmm1, [eax + esi + 16]
|
| + movdqu xmm0, [eax + 16] // pixels 16..23
|
| + movdqu xmm1, [eax + esi + 16]
|
| lea eax, [eax + 32]
|
| pavgb xmm0, xmm1
|
| pshufb xmm0, xmm4
|
| @@ -511,9 +391,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
| paddsw xmm0, xmm7
|
| psrlw xmm0, 2
|
| packuswb xmm0, xmm0
|
| - sub ecx, 24
|
| movq qword ptr [edx + 16], xmm0
|
| lea edx, [edx + 24]
|
| + sub ecx, 24
|
| jg wloop
|
|
|
| pop esi
|
| @@ -540,10 +420,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
| movdqa xmm6, kMadd11
|
| movdqa xmm7, kRound34
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax] // pixels 0..7
|
| - movdqa xmm1, [eax + esi]
|
| + movdqu xmm0, [eax] // pixels 0..7
|
| + movdqu xmm1, [eax + esi]
|
| pavgb xmm1, xmm0
|
| pavgb xmm0, xmm1
|
| pshufb xmm0, xmm2
|
| @@ -562,8 +441,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
| psrlw xmm0, 2
|
| packuswb xmm0, xmm0
|
| movq qword ptr [edx + 8], xmm0
|
| - movdqa xmm0, [eax + 16] // pixels 16..23
|
| - movdqa xmm1, [eax + esi + 16]
|
| + movdqu xmm0, [eax + 16] // pixels 16..23
|
| + movdqu xmm1, [eax + esi + 16]
|
| lea eax, [eax + 32]
|
| pavgb xmm1, xmm0
|
| pavgb xmm0, xmm1
|
| @@ -573,9 +452,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
| paddsw xmm0, xmm7
|
| psrlw xmm0, 2
|
| packuswb xmm0, xmm0
|
| - sub ecx, 24
|
| movq qword ptr [edx + 16], xmm0
|
| lea edx, [edx+24]
|
| + sub ecx, 24
|
| jg wloop
|
|
|
| pop esi
|
| @@ -597,20 +476,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| movdqa xmm4, kShuf38a
|
| movdqa xmm5, kShuf38b
|
|
|
| - align 4
|
| xloop:
|
| - movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
|
| - movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
|
| + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
|
| + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
|
| lea eax, [eax + 32]
|
| pshufb xmm0, xmm4
|
| pshufb xmm1, xmm5
|
| paddusb xmm0, xmm1
|
|
|
| - sub ecx, 12
|
| movq qword ptr [edx], xmm0 // write 12 pixels
|
| movhlps xmm1, xmm0
|
| movd [edx + 8], xmm1
|
| lea edx, [edx + 12]
|
| + sub ecx, 12
|
| jg xloop
|
|
|
| ret
|
| @@ -633,10 +511,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
| movdqa xmm4, kScaleAc33
|
| pxor xmm5, xmm5
|
|
|
| - align 4
|
| xloop:
|
| - movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
|
| - movdqa xmm6, [eax + esi]
|
| + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
|
| + movdqu xmm6, [eax + esi]
|
| movhlps xmm1, xmm0
|
| movhlps xmm7, xmm6
|
| punpcklbw xmm0, xmm5
|
| @@ -645,7 +522,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
| punpcklbw xmm7, xmm5
|
| paddusw xmm0, xmm6
|
| paddusw xmm1, xmm7
|
| - movdqa xmm6, [eax + esi * 2]
|
| + movdqu xmm6, [eax + esi * 2]
|
| lea eax, [eax + 16]
|
| movhlps xmm7, xmm6
|
| punpcklbw xmm6, xmm5
|
| @@ -671,11 +548,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
| pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
|
| packuswb xmm6, xmm6
|
|
|
| - sub ecx, 6
|
| movd [edx], xmm6 // write 6 pixels
|
| psrlq xmm6, 16
|
| movd [edx + 2], xmm6
|
| lea edx, [edx + 6]
|
| + sub ecx, 6
|
| jg xloop
|
|
|
| pop esi
|
| @@ -699,11 +576,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
| movdqa xmm4, kShufAb2
|
| movdqa xmm5, kScaleAb2
|
|
|
| - align 4
|
| xloop:
|
| - movdqa xmm0, [eax] // average 2 rows into xmm0
|
| - pavgb xmm0, [eax + esi]
|
| + movdqu xmm0, [eax] // average 2 rows into xmm0
|
| + movdqu xmm1, [eax + esi]
|
| lea eax, [eax + 16]
|
| + pavgb xmm0, xmm1
|
|
|
| movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
|
| pshufb xmm1, xmm2
|
| @@ -716,11 +593,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
| pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
|
| packuswb xmm1, xmm1
|
|
|
| - sub ecx, 6
|
| movd [edx], xmm1 // write 6 pixels
|
| psrlq xmm1, 16
|
| movd [edx + 2], xmm1
|
| lea edx, [edx + 6]
|
| + sub ecx, 6
|
| jg xloop
|
|
|
| pop esi
|
| @@ -747,10 +624,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| pxor xmm4, xmm4
|
| dec ebx
|
|
|
| - align 4
|
| xloop:
|
| // first row
|
| - movdqa xmm0, [esi]
|
| + movdqu xmm0, [esi]
|
| lea eax, [esi + edx]
|
| movdqa xmm1, xmm0
|
| punpcklbw xmm0, xmm4
|
| @@ -761,9 +637,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| je ydone
|
|
|
| // sum remaining rows
|
| - align 4
|
| yloop:
|
| - movdqa xmm2, [eax] // read 16 pixels
|
| + movdqu xmm2, [eax] // read 16 pixels
|
| lea eax, [eax + edx] // advance to next row
|
| movdqa xmm3, xmm2
|
| punpcklbw xmm2, xmm4
|
| @@ -773,10 +648,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| sub ebp, 1
|
| jg yloop
|
|
|
| - align 4
|
| ydone:
|
| - movdqa [edi], xmm0
|
| - movdqa [edi + 16], xmm1
|
| + movdqu [edi], xmm0
|
| + movdqu [edi + 16], xmm1
|
| lea edi, [edi + 32]
|
|
|
| sub ecx, 16
|
| @@ -828,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| pextrw edx, xmm2, 3 // get x1 integer. preroll
|
|
|
| // 2 Pixel loop.
|
| - align 4
|
| xloop2:
|
| movdqa xmm1, xmm2 // x0, x1 fractions.
|
| paddd xmm2, xmm3 // x += dx
|
| @@ -851,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| sub ecx, 2 // 2 pixels
|
| jge xloop2
|
|
|
| - align 4
|
| xloop29:
|
|
|
| add ecx, 2 - 1
|
| @@ -869,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| movd ebx, xmm0
|
| mov [edi], bl
|
|
|
| - align 4
|
| xloop99:
|
|
|
| pop edi
|
| @@ -889,17 +760,16 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| mov eax, [esp + 8] // src_ptr
|
| mov ecx, [esp + 12] // dst_width
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| + movdqu xmm0, [eax]
|
| lea eax, [eax + 16]
|
| movdqa xmm1, xmm0
|
| punpcklbw xmm0, xmm0
|
| punpckhbw xmm1, xmm1
|
| - sub ecx, 32
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| + movdqu [edx], xmm0
|
| + movdqu [edx + 16], xmm1
|
| lea edx, [edx + 32]
|
| + sub ecx, 32
|
| jg wloop
|
|
|
| ret
|
| @@ -918,15 +788,14 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
| mov edx, [esp + 12] // dst_argb
|
| mov ecx, [esp + 16] // dst_width
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| lea eax, [eax + 32]
|
| shufps xmm0, xmm1, 0xdd
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg wloop
|
|
|
| ret
|
| @@ -945,18 +814,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
| mov edx, [esp + 12] // dst_argb
|
| mov ecx, [esp + 16] // dst_width
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| lea eax, [eax + 32]
|
| movdqa xmm2, xmm0
|
| shufps xmm0, xmm1, 0x88 // even pixels
|
| shufps xmm2, xmm1, 0xdd // odd pixels
|
| pavgb xmm0, xmm2
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg wloop
|
|
|
| ret
|
| @@ -976,12 +844,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
| mov edx, [esp + 4 + 12] // dst_argb
|
| mov ecx, [esp + 4 + 16] // dst_width
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + esi]
|
| - movdqa xmm3, [eax + esi + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + esi]
|
| + movdqu xmm3, [eax + esi + 16]
|
| lea eax, [eax + 32]
|
| pavgb xmm0, xmm2 // average rows
|
| pavgb xmm1, xmm3
|
| @@ -989,9 +856,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
| shufps xmm0, xmm1, 0x88 // even pixels
|
| shufps xmm2, xmm1, 0xdd // odd pixels
|
| pavgb xmm0, xmm2
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg wloop
|
|
|
| pop esi
|
| @@ -1016,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
| lea ebx, [ebx * 4]
|
| lea edi, [ebx + ebx * 2]
|
|
|
| - align 4
|
| wloop:
|
| movd xmm0, [eax]
|
| movd xmm1, [eax + ebx]
|
| @@ -1026,9 +892,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
| lea eax, [eax + ebx * 4]
|
| punpckldq xmm2, xmm3
|
| punpcklqdq xmm0, xmm2
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg wloop
|
|
|
| pop edi
|
| @@ -1057,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
| lea ebx, [ebx * 4]
|
| lea edi, [ebx + ebx * 2]
|
|
|
| - align 4
|
| wloop:
|
| movq xmm0, qword ptr [eax] // row0 4 pairs
|
| movhps xmm0, qword ptr [eax + ebx]
|
| @@ -1075,9 +940,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
| shufps xmm0, xmm1, 0x88 // even pixels
|
| shufps xmm2, xmm1, 0xdd // odd pixels
|
| pavgb xmm0, xmm2
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg wloop
|
|
|
| pop edi
|
| @@ -1118,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| jl xloop49
|
|
|
| // 4 Pixel loop.
|
| - align 4
|
| xloop4:
|
| movd xmm0, [esi + eax * 4] // 1 source x0 pixels
|
| movd xmm1, [esi + edx * 4] // 1 source x1 pixels
|
| @@ -1133,12 +997,11 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
| punpckldq xmm1, xmm4 // x2 x3
|
| punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
|
| - sub ecx, 4 // 4 pixels
|
| movdqu [edi], xmm0
|
| lea edi, [edi + 16]
|
| + sub ecx, 4 // 4 pixels
|
| jge xloop4
|
|
|
| - align 4
|
| xloop49:
|
| test ecx, 2
|
| je xloop29
|
| @@ -1159,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| // 1 Pixels.
|
| movd xmm0, [esi + eax * 4] // 1 source x2 pixels
|
| movd dword ptr [edi], xmm0
|
| - align 4
|
| xloop99:
|
|
|
| pop esi
|
| @@ -1209,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
| pextrw edx, xmm2, 3 // get x1 integer. preroll
|
|
|
| // 2 Pixel loop.
|
| - align 4
|
| xloop2:
|
| movdqa xmm1, xmm2 // x0, x1 fractions.
|
| paddd xmm2, xmm3 // x += dx
|
| @@ -1229,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
| sub ecx, 2 // 2 pixels
|
| jge xloop2
|
|
|
| - align 4
|
| xloop29:
|
|
|
| add ecx, 2 - 1
|
| @@ -1246,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
| packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
|
| movd [edi], xmm0
|
|
|
| - align 4
|
| xloop99:
|
|
|
| pop edi
|
| @@ -1265,17 +1124,16 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| mov eax, [esp + 8] // src_argb
|
| mov ecx, [esp + 12] // dst_width
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| + movdqu xmm0, [eax]
|
| lea eax, [eax + 16]
|
| movdqa xmm1, xmm0
|
| punpckldq xmm0, xmm0
|
| punpckhdq xmm1, xmm1
|
| - sub ecx, 8
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| + movdqu [edx], xmm0
|
| + movdqu [edx + 16], xmm1
|
| lea edx, [edx + 32]
|
| + sub ecx, 8
|
| jg wloop
|
|
|
| ret
|
|
|