| Index: source/scale_win.cc
|
| diff --git a/source/scale_win.cc b/source/scale_win.cc
|
| index 21b1ed923fa65087a0b213337b52dd9d4b049a25..f17097365cc07c0640238592208d677007522bf5 100644
|
| --- a/source/scale_win.cc
|
| +++ b/source/scale_win.cc
|
| @@ -860,6 +860,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
| }
|
| #endif // HAS_SCALEADDROW_AVX2
|
|
|
| +// 0x80 in every byte. Subtracted (psubb) from unsigned pixels so they are the
|
| +// signed operand of pmaddubsw, which avoids pmaddubsw saturation.
|
| +static uvec8 kFsub80 =
|
| + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
| + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
|
| +
|
| +// 0x4000 (0x80 bias * 0x80 weight sum) makes pixels unsigned; 0x40 adds .5.
|
| +static uvec16 kFadd40 =
|
| + { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
|
| +
|
| // Bilinear column filtering. SSSE3 version.
|
| __declspec(naked)
|
| void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| @@ -877,6 +887,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| movd xmm5, eax
|
| pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
|
| psrlw xmm6, 9
|
| + pcmpeqb xmm7, xmm7 // generate 0x0001
|
| + psrlw xmm7, 15
|
| pextrw eax, xmm2, 1 // get x0 integer. preroll
|
| sub ecx, 2
|
| jl xloop29
|
| @@ -899,20 +911,22 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| movd xmm4, ebx
|
| pshufb xmm1, xmm5 // 0011
|
| punpcklwd xmm0, xmm4
|
| + psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
|
| pxor xmm1, xmm6 // 0..7f and 7f..0
|
| - pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
|
| + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
|
| + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
|
| pextrw eax, xmm2, 1 // get x0 integer. next iteration.
|
| pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
| - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
|
| - packuswb xmm0, xmm0 // 8 bits, 2 pixels.
|
| - movd ebx, xmm0
|
| + paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
|
| + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
|
| + packuswb xmm1, xmm1 // 8 bits, 2 pixels.
|
| + movd ebx, xmm1
|
| mov [edi], bx
|
| lea edi, [edi + 2]
|
| sub ecx, 2 // 2 pixels
|
| jge xloop2
|
|
|
| xloop29:
|
| -
|
| add ecx, 2 - 1
|
| jl xloop99
|
|
|
| @@ -921,11 +935,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| movd xmm0, ebx
|
| psrlw xmm2, 9 // 7 bit fractions.
|
| pshufb xmm2, xmm5 // 0011
|
| + psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
|
| pxor xmm2, xmm6 // 0..7f and 7f..0
|
| - pmaddubsw xmm0, xmm2 // 16 bit
|
| - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
|
| - packuswb xmm0, xmm0 // 8 bits
|
| - movd ebx, xmm0
|
| + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
|
| + pmaddubsw xmm2, xmm0 // 16 bit
|
| + paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
|
| + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
|
| + packuswb xmm2, xmm2 // 8 bits
|
| + movd ebx, xmm2
|
| mov [edi], bl
|
|
|
| xloop99:
|
|
|