| Index: source/scale_gcc.cc
|
| diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
|
| index 400f2fde9b5acdfb4a672fdc2ebbd361ad4cdc6a..8d234edaf20e6eed4f668fc01a640c94b3b71e71 100644
|
| --- a/source/scale_gcc.cc
|
| +++ b/source/scale_gcc.cc
|
| @@ -821,6 +821,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
| }
|
| #endif // HAS_SCALEADDROW_AVX2
|
|
|
| +// Bias (0x80 per byte) subtracted from pixels to make them signed, so the

| +// pmaddubsw sum of products stays clear of signed 16 bit saturation.

| +static uvec8 kFsub80 =

| + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

| + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
|
| +
|
| +// Re-bias to unsigned (0x4000 = 128 * 128) and add .5 (0x40) for rounding.

| +static uvec16 kFadd40 =

| + { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
|
| +
|
| // Bilinear column filtering. SSSE3 version.
|
| void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| int dst_width, int x, int dx) {
|
| @@ -831,7 +841,10 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| "movl $0x04040000,%k2 \n"
|
| "movd %k2,%%xmm5 \n"
|
| "pcmpeqb %%xmm6,%%xmm6 \n"
|
| - "psrlw $0x9,%%xmm6 \n"
|
| + "psrlw $0x9,%%xmm6 \n" // 0x007f007f
|
| + "pcmpeqb %%xmm7,%%xmm7 \n"
|
| + "psrlw $15,%%xmm7 \n" // 0x00010001
|
| +
|
| "pextrw $0x1,%%xmm2,%k3 \n"
|
| "subl $0x2,%5 \n"
|
| "jl 29f \n"
|
| @@ -853,13 +866,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| "movd %k2,%%xmm4 \n"
|
| "pshufb %%xmm5,%%xmm1 \n"
|
| "punpcklwd %%xmm4,%%xmm0 \n"
|
| - "pxor %%xmm6,%%xmm1 \n"
|
| - "pmaddubsw %%xmm1,%%xmm0 \n"
|
| + "psubb %8,%%xmm0 \n" // make pixels signed.
|
| + "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
|
| + "paddusb %%xmm7,%%xmm1 \n"
|
| + "pmaddubsw %%xmm0,%%xmm1 \n"
|
| "pextrw $0x1,%%xmm2,%k3 \n"
|
| "pextrw $0x3,%%xmm2,%k4 \n"
|
| - "psrlw $0x7,%%xmm0 \n"
|
| - "packuswb %%xmm0,%%xmm0 \n"
|
| - "movd %%xmm0,%k2 \n"
|
| + "paddw %9,%%xmm1 \n" // make pixels unsigned.
|
| + "psrlw $0x7,%%xmm1 \n"
|
| + "packuswb %%xmm1,%%xmm1 \n"
|
| + "movd %%xmm1,%k2 \n"
|
| "mov %w2," MEMACCESS(0) " \n"
|
| "lea " MEMLEA(0x2,0) ",%0 \n"
|
| "sub $0x2,%5 \n"
|
| @@ -873,11 +889,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| "movd %k2,%%xmm0 \n"
|
| "psrlw $0x9,%%xmm2 \n"
|
| "pshufb %%xmm5,%%xmm2 \n"
|
| + "psubb %8,%%xmm0 \n" // make pixels signed.
|
| "pxor %%xmm6,%%xmm2 \n"
|
| - "pmaddubsw %%xmm2,%%xmm0 \n"
|
| - "psrlw $0x7,%%xmm0 \n"
|
| - "packuswb %%xmm0,%%xmm0 \n"
|
| - "movd %%xmm0,%k2 \n"
|
| + "paddusb %%xmm7,%%xmm2 \n"
|
| + "pmaddubsw %%xmm0,%%xmm2 \n"
|
| + "paddw %9,%%xmm2 \n" // make pixels unsigned.
|
| + "psrlw $0x7,%%xmm2 \n"
|
| + "packuswb %%xmm2,%%xmm2 \n"
|
| + "movd %%xmm2,%k2 \n"
|
| "mov %b2," MEMACCESS(0) " \n"
|
| "99: \n"
|
| : "+r"(dst_ptr), // %0
|
| @@ -887,9 +906,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| "=&r"(x1), // %4
|
| "+rm"(dst_width) // %5
|
| : "rm"(x), // %6
|
| - "rm"(dx) // %7
|
| + "rm"(dx), // %7
|
| +#if defined(__x86_64__)
|
| + "x"(kFsub80), // %8
|
| + "x"(kFadd40) // %9
|
| +#else
|
| + "m"(kFsub80), // %8
|
| + "m"(kFadd40) // %9
|
| +#endif
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
| );
|
| }
|
|
|
|
|