Index: source/row_win.cc
diff --git a/source/row_win.cc b/source/row_win.cc
index 494043c626f3107588c067accec0fce0ad4d4386..5cb5d1e4f5d9f20f7baa172be1d0fa4faa91af64 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -5566,24 +5566,22 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    shr        eax, 1
     // Dispatch to specialized filters if applicable.
     cmp        eax, 0
-    je         xloop100  // 0 / 128. Blend 100 / 0.
+    je         xloop100  // 0 / 256. Blend 100 / 0.
     sub        edi, esi
-    cmp        eax, 64
-    je         xloop50  // 64 / 128 is 0.50. Blend 50 / 50.
+    cmp        eax, 128
+    je         xloop50  // 128 / 256 is 0.50. Blend 50 / 50.
 
-    vmovd      xmm0, eax  // high fraction 0..127
+    vmovd      xmm0, eax  // high fraction 0..255
     neg        eax
-    add        eax, 128
-    vmovd      xmm5, eax  // low fraction 128..1
+    add        eax, 256
+    vmovd      xmm5, eax  // low fraction 255..1
     vpunpcklbw xmm5, xmm5, xmm0
     vpunpcklwd xmm5, xmm5, xmm5
-    vpxor      ymm0, ymm0, ymm0
-    vpermd     ymm5, ymm0, ymm5
+    vbroadcastss ymm5, xmm5
 
-    mov        eax, 0x00400040  // 64 for rounding.
+    mov        eax, 0x80808080  // 128 in each byte for bias and rounding.
     vmovd      xmm4, eax
     vbroadcastss ymm4, xmm4
 
@@ -5591,13 +5589,15 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
     vmovdqu    ymm0, [esi]
     vmovdqu    ymm2, [esi + edx]
     vpunpckhbw ymm1, ymm0, ymm2  // mutates
-    vpunpcklbw ymm0, ymm0, ymm2  // mutates
-    vpmaddubsw ymm0, ymm0, ymm5
-    vpmaddubsw ymm1, ymm1, ymm5
+    vpunpcklbw ymm0, ymm0, ymm2
+    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
+    vpsubb     ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm5, ymm1
+    vpmaddubsw ymm0, ymm5, ymm0
+    vpaddw     ymm1, ymm1, ymm4  // unbias and round
     vpaddw     ymm0, ymm0, ymm4
-    vpaddw     ymm1, ymm1, ymm4
-    vpsrlw     ymm0, ymm0, 7
-    vpsrlw     ymm1, ymm1, 7
+    vpsrlw     ymm1, ymm1, 8
+    vpsrlw     ymm0, ymm0, 8
     vpackuswb  ymm0, ymm0, ymm1  // unmutates
     vmovdqu    [esi + edi], ymm0
     lea        esi, [esi + 32]
@@ -5629,6 +5629,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
 #endif  // HAS_INTERPOLATEROW_AVX2
 
 // Bilinear filter 16x2 -> 16x1
+// TODO(fbarchard): Consider allowing 256 using memcpy.
 __declspec(naked)
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
@@ -5636,28 +5637,27 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   __asm {
     push       esi
     push       edi
+
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
-    shr        eax, 1
     // Dispatch to specialized filters if applicable.
     cmp        eax, 0
-    je         xloop100  // 0 / 128. Blend 100 / 0.
-    cmp        eax, 64
-    je         xloop50  // 64 / 128 is 0.50. Blend 50 / 50.
+    je         xloop100  // 0 / 256. Blend 100 / 0.
+    cmp        eax, 128
+    je         xloop50  // 128 / 256 is 0.50. Blend 50 / 50.
 
-    movd       xmm0, eax  // high fraction 0..127
+    movd       xmm0, eax  // high fraction 0..255
     neg        eax
-    add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    add        eax, 256
+    movd       xmm5, eax  // low fraction 255..1
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
-
-    mov        eax, 0x00400040  // 64 for rounding.
+    mov        eax, 0x80808080  // 128 in each byte for bias and rounding.
     movd       xmm4, eax
     pshufd     xmm4, xmm4, 0x00
 
@@ -5667,14 +5667,18 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    paddw      xmm0, xmm4
-    paddw      xmm1, xmm4
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    movdqu     [esi + edi], xmm0
+    psubb      xmm0, xmm4  // bias image by -128
+    psubb      xmm1, xmm4
+    movdqa     xmm2, xmm5
+    movdqa     xmm3, xmm5
+    pmaddubsw  xmm2, xmm0
+    pmaddubsw  xmm3, xmm1
+    paddw      xmm2, xmm4  // unbias and round
+    paddw      xmm3, xmm4
+    psrlw      xmm2, 8
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3
+    movdqu     [esi + edi], xmm2
     lea        esi, [esi + 16]
     sub        ecx, 16
     jg         xloop
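
For readers checking the arithmetic: the patch switches both row functions from a 7-bit fraction (the removed `shr eax, 1` truncated source_y_fraction to 0..127) to the full 8-bit fraction 0..255. Because pmaddubsw multiplies unsigned bytes by signed bytes, a 0..255 fraction no longer fits in the signed operand, so the pixels themselves are biased to signed range by subtracting 0x80 from every byte (psubb/vpsubb), multiplied against the unsigned fractions, and then 0x8080 is added to each 16-bit lane (0x8000 undoes the 256 * 128 bias, the remaining 0x80 is the rounding term) before the logical shift right by 8. The scalar sketch below shows the value each output byte ends up with; the function name and signature are illustrative only, not libyuv's actual C fallback.

    #include <stdint.h>
    #include <stddef.h>

    // Illustrative scalar equivalent of the patched SIMD paths (not the
    // library's InterpolateRow_C).  f is source_y_fraction in 0..255.
    static void InterpolateRowScalar(uint8_t* dst, const uint8_t* src0,
                                     ptrdiff_t src_stride, int width, int f) {
      const uint8_t* src1 = src0 + src_stride;
      int f1 = f;         // weight of the second row, 0..255
      int f0 = 256 - f1;  // weight of the first row, 256..1
      for (int x = 0; x < width; ++x) {
        // Matches the SIMD result: bias by -128, multiply-add the two
        // weights, add 0x8080 per 16-bit lane (unbias 0x8000 + round 0x80),
        // then shift right by 8.
        dst[x] = (uint8_t)((src0[x] * f0 + src1[x] * f1 + 128) >> 8);
      }
    }

The f == 0 and f == 128 dispatch cases (straight copy and 50/50 average via pavgb) produce the same values as this formula; the xloop100/xloop50 paths are a speed optimization for those exact fractions.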