Index: source/scale_win.cc
diff --git a/source/scale_win.cc b/source/scale_win.cc
index 6930f729590dd6f4b9c64d5aa9cd8a642230c919..5ab4fa0ccc2524d78b4a939baa53dce464fe0905 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -309,7 +309,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

// Point samples 32 pixels to 8 pixels.
__declspec(naked)
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -340,7 +340,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

// Blends 32x4 rectangle to 8x1.
__declspec(naked)
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
@@ -350,42 +350,40 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ movdqa xmm5, xmm4
+ packuswb xmm4, xmm4
+ psllw xmm5, 3 // constant 0x0008

wloop:
movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add rows 0, 1
+ paddw xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
- movdqu xmm4, [eax + edi]
- movdqu xmm5, [eax + edi + 16]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 2
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + edi]
+ movdqu xmm3, [eax + edi + 16]
lea eax, [eax + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
- psrlw xmm0, 8
- pand xmm2, xmm7
- pavgw xmm0, xmm2
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 3
+ paddw xmm1, xmm3
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // + 8 for round
+ psrlw xmm0, 4 // /16 for average of 4 * 4
packuswb xmm0, xmm0
-
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
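The rewritten loop above computes the 4x4 box average with exact sums rather than the old SSE2 cascade of pavgb/pavgw averages: pmaddubsw against the 0x0101 byte constant adds each horizontal byte pair, paddw accumulates the four rows, phaddw folds adjacent column pairs into one 16-pixel sum per output, and the final + 8 / >> 4 is a single rounded divide by 16 (pmaddubsw and phaddw are SSSE3 instructions, hence the rename). A minimal scalar sketch of that arithmetic, assuming 8-bit source pixels; the helper name is illustrative and not part of the patch:

#include <stddef.h>
#include <stdint.h>

// Scalar reference for the rounded 4x4 box filter the new loop implements:
// each destination byte is the rounded mean of a 4x4 block of source bytes.
static void ScaleRowDown4Box_Reference(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += src_ptr[row * src_stride + x * 4 + col];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // + 8 for round, / 16 for 4x4
  }
}

Unlike the replaced code, which re-rounded at every pavgb/pavgw stage, this rounds once at the end, so results are not guaranteed to be bit-identical to the old SSE2 path.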
@@ -444,37 +442,41 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
- vpsrlw ymm7, ymm7, 8
+ vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
+ vpsrlw ymm4, ymm4, 15
+ vpsllw ymm5, ymm4, 3 // constant 0x0008
+ vpackuswb ymm4, ymm4, ymm4

wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
+ vpaddw ymm1, ymm1, ymm3
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
- vpavgb ymm2, ymm2, [eax + edi]
- vpavgb ymm3, ymm3, [eax + edi + 32]
- lea eax, [eax + 64]
- vpavgb ymm0, ymm0, ymm2
- vpavgb ymm1, ymm1, ymm3
-
- vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
- vpand ymm3, ymm1, ymm7
- vpsrlw ymm0, ymm0, 8
- vpsrlw ymm1, ymm1, 8
- vpavgw ymm0, ymm0, ymm2
- vpavgw ymm1, ymm1, ymm3
- vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
-
- vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
- vpsrlw ymm0, ymm0, 8
- vpavgw ymm0, ymm0, ymm2
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 2
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + edi]
+ vmovdqu ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 3
+ vpaddw ymm1, ymm1, ymm3
+ vphaddw ymm0, ymm0, ymm1 // mutates
+ vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
+ vpaddw ymm0, ymm0, ymm5 // + 8 for round
+ vpsrlw ymm0, ymm0, 4 // /16 for average of 4 * 4
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
-
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
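The AVX2 hunk applies the same change at twice the width (16 destination pixels per iteration): the 16-pixel sums are built with vpmaddubsw/vpaddw, then + 8 and >> 4 give the rounded /16. The extra wrinkle is that vphaddw and vpackuswb operate within each 128-bit lane, so each is followed by vpermq ymm0, ymm0, 0xd8 to restore sequential pixel order. A standalone sketch of what that immediate does to the four 64-bit quadwords (the helper is illustrative, not from the patch):

#include <stdint.h>

// vpermq dst, src, 0xd8 selects source quadwords in the order 0, 2, 1, 3
// (imm8 = 0xd8 = binary 11 01 10 00), which undoes the per-lane interleave
// left behind by vphaddw / vpackuswb.
static void Permq0xd8(const uint64_t in[4], uint64_t out[4]) {
  out[0] = in[0];  // imm8 bits [1:0] = 0
  out[1] = in[2];  // imm8 bits [3:2] = 2
  out[2] = in[1];  // imm8 bits [5:4] = 1
  out[3] = in[3];  // imm8 bits [7:6] = 3
}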