Index: source/scale_win.cc
diff --git a/source/scale_win.cc b/source/scale_win.cc
index f48a4ee7671f852629083fe5f610680ff639e4b7..6930f729590dd6f4b9c64d5aa9cd8a642230c919 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -95,8 +95,8 @@ static uvec16 kScaleAb2 =

 // Reads 32 pixels, throws half away and writes 16 pixels.
 __declspec(naked)
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
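
This first hunk renames the point-sampling kernel from _SSE2 to _SSSE3 to match the rewritten kernels below. For orientation, here is one plausible scalar sketch of the declared behavior (read 32 pixels, keep 16); keeping the odd pixel of each pair follows libyuv's convention for this kernel, where the vector body extracts the high byte of every 16-bit lane. The helper name is illustrative, not libyuv API:

    #include <stdint.h>

    // Scalar sketch: read 2 source pixels, write 1, discarding one of them.
    static void ScaleRowDown2_Sketch(const uint8_t* src, uint8_t* dst,
                                     int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = src[2 * x + 1];  // keep the odd pixel of each pair
      }
    }
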
@@ -121,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

 // Blends 32x1 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0

   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax, [eax + 32]
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4            // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pavgw      xmm0, xmm5            // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
-
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
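
The Linear rewrite replaces four instructions per vector (movdqa, psrlw, pand, pavgw) with two. The constants are built in registers: pcmpeqb sets all bits, psrlw 15 leaves 0x0001 in each word, and packuswb narrows that to 0x01 in every byte. pmaddubsw then multiplies adjacent unsigned bytes by those 0x01 weights and sums each pair into a 16-bit lane, and pavgw against zero computes (x + 0 + 1) >> 1, the correctly rounded pair average. A scalar model of the arithmetic (helper name illustrative):

    #include <stdint.h>

    // Scalar model of the SSSE3 Linear kernel: rounded average of each
    // horizontal pixel pair.
    static void ScaleRowDown2Linear_Sketch(const uint8_t* src, uint8_t* dst,
                                           int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        int sum = src[2 * x] + src[2 * x + 1];  // pmaddubsw with 0x01 weights
        dst[x] = (uint8_t)((sum + 1) >> 1);     // pavgw with zero rounds up
      }
    }
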
@@ -157,16 +154,19 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

 // Blends 32x2 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0

   wloop:
     movdqu     xmm0, [eax]
@@ -174,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax, [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4            // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2            // vertical add
+    paddw      xmm1, xmm3
+    psrlw      xmm0, 1
+    psrlw      xmm1, 1
+    pavgw      xmm0, xmm5            // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
-
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
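
The Box rewrite changes the arithmetic, not just the instruction mix: the old code averaged the rows with pavgb (rounding once) and then averaged columns (rounding again), while the new code keeps exact 16-bit pair sums from pmaddubsw, adds the rows with paddw (at most 4 * 255 = 1020, well inside 16 bits), and rounds only once at the end via psrlw 1 plus pavgw with zero, yielding (sum + 2) >> 2. A scalar model of what one output pixel becomes (helper name illustrative):

    #include <stddef.h>
    #include <stdint.h>

    // Scalar model of the SSSE3 Box kernel: rounded mean of each 2x2 block.
    static void ScaleRowDown2Box_Sketch(const uint8_t* src, ptrdiff_t src_stride,
                                        uint8_t* dst, int dst_width) {
      const uint8_t* s = src;               // top source row
      const uint8_t* t = src + src_stride;  // bottom source row
      for (int x = 0; x < dst_width; ++x) {
        int sum = s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1];
        dst[x] = (uint8_t)((sum + 2) >> 2);  // single, exact rounding
      }
    }
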
@@ -245,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax, [eax + 64]
-
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
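
The AVX2 Linear hunk is a whitespace and comment cleanup, but the vpermq it retains is worth a note. vpackuswb packs within each 128-bit lane, so packing ymm0 with ymm1 leaves the result qwords in the order A0 B0 A1 B1 (A halves from ymm0, B halves from ymm1). The immediate 0xd8 is 0b11011000, the qword selector (0, 2, 1, 3), which restores source order A0 A1 B0 B1; that is what the "unmutate" comment refers to.
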
@@ -263,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }

+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
 __declspec(naked)
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
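
The new comment states an exact identity, not an approximation: writing sum = 4q + r with r in {0, 1, 2, 3}, both (sum + 2) >> 2 and ((sum >> 1) + 1) >> 1 evaluate to q when r < 2 and to q + 1 when r >= 2. The two-step form is attractive because pavgw against zero supplies both the + 1 and the final shift in a single instruction, so the full (sum + 2) / 4 rounding costs one shift plus one average per vector instead of an add and two shifts.
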
@@ -280,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpxor      ymm5, ymm5, ymm5      // constant 0

   wloop:
-    vmovdqu    ymm0, [eax]           // average rows
+    vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vmovdqu    ymm2, [eax + esi]
+    vmovdqu    ymm3, [eax + esi + 32]
     lea        eax, [eax + 64]
-
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw     ymm0, ymm0, ymm2      // vertical add
+    vpaddw     ymm1, ymm1, ymm3
+    vpsrlw     ymm0, ymm0, 1
+    vpsrlw     ymm1, ymm1, 1
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
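
The same psrlw-then-pavgw tail appears in both Box kernels, so one exhaustive scalar check covers them. A minimal self-contained sketch of that check, using standard C++ only and nothing from libyuv:

    #include <cassert>
    #include <cstdio>

    // Verify that ((sum >> 1) + 1) >> 1 equals the rounded 2x2 mean
    // (sum + 2) / 4 for every sum reachable from four 8-bit pixels.
    int main() {
      for (int sum = 0; sum <= 4 * 255; ++sum) {
        int simd = ((sum >> 1) + 1) >> 1;  // psrlw 1, then pavgw with zero
        assert(simd == (sum + 2) >> 2);
      }
      std::puts("rounding identity holds for all 2x2 sums");
      return 0;
    }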