Index: source/scale_gcc.cc |
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc |
index 400f2fde9b5acdfb4a672fdc2ebbd361ad4cdc6a..8d234edaf20e6eed4f668fc01a640c94b3b71e71 100644 |
--- a/source/scale_gcc.cc |
+++ b/source/scale_gcc.cc |
@@ -821,6 +821,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { |
} |
#endif // HAS_SCALEADDROW_AVX2 |
+// Constant for making pixels signed to avoid pmaddubsw |
+// saturation. |
+static uvec8 kFsub80 = |
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
+ |
+// Constant for making pixels unsigned and adding .5 for rounding. |
+static uvec16 kFadd40 = |
+ { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; |
+ |
// Bilinear column filtering. SSSE3 version. |
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
int dst_width, int x, int dx) { |
@@ -831,7 +841,10 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
"movl $0x04040000,%k2 \n" |
"movd %k2,%%xmm5 \n" |
"pcmpeqb %%xmm6,%%xmm6 \n" |
- "psrlw $0x9,%%xmm6 \n" |
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f |
+ "pcmpeqb %%xmm7,%%xmm7 \n" |
+ "psrlw $15,%%xmm7 \n" // 0x00010001 |
+ |
"pextrw $0x1,%%xmm2,%k3 \n" |
"subl $0x2,%5 \n" |
"jl 29f \n" |
@@ -853,13 +866,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
"movd %k2,%%xmm4 \n" |
"pshufb %%xmm5,%%xmm1 \n" |
"punpcklwd %%xmm4,%%xmm0 \n" |
- "pxor %%xmm6,%%xmm1 \n" |
- "pmaddubsw %%xmm1,%%xmm0 \n" |
+ "psubb %8,%%xmm0 \n" // make pixels signed. |
+ "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1 |
+ "paddusb %%xmm7,%%xmm1 \n" |
+ "pmaddubsw %%xmm0,%%xmm1 \n" |
"pextrw $0x1,%%xmm2,%k3 \n" |
"pextrw $0x3,%%xmm2,%k4 \n" |
- "psrlw $0x7,%%xmm0 \n" |
- "packuswb %%xmm0,%%xmm0 \n" |
- "movd %%xmm0,%k2 \n" |
+ "paddw %9,%%xmm1 \n" // make pixels unsigned. |
+ "psrlw $0x7,%%xmm1 \n" |
+ "packuswb %%xmm1,%%xmm1 \n" |
+ "movd %%xmm1,%k2 \n" |
"mov %w2," MEMACCESS(0) " \n" |
"lea " MEMLEA(0x2,0) ",%0 \n" |
"sub $0x2,%5 \n" |
@@ -873,11 +889,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
"movd %k2,%%xmm0 \n" |
"psrlw $0x9,%%xmm2 \n" |
"pshufb %%xmm5,%%xmm2 \n" |
+ "psubb %8,%%xmm0 \n" // make pixels signed. |
"pxor %%xmm6,%%xmm2 \n" |
- "pmaddubsw %%xmm2,%%xmm0 \n" |
- "psrlw $0x7,%%xmm0 \n" |
- "packuswb %%xmm0,%%xmm0 \n" |
- "movd %%xmm0,%k2 \n" |
+ "paddusb %%xmm7,%%xmm2 \n" |
+ "pmaddubsw %%xmm0,%%xmm2 \n" |
+ "paddw %9,%%xmm2 \n" // make pixels unsigned. |
+ "psrlw $0x7,%%xmm2 \n" |
+ "packuswb %%xmm2,%%xmm2 \n" |
+ "movd %%xmm2,%k2 \n" |
"mov %b2," MEMACCESS(0) " \n" |
"99: \n" |
: "+r"(dst_ptr), // %0 |
@@ -887,9 +906,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
"=&r"(x1), // %4 |
"+rm"(dst_width) // %5 |
: "rm"(x), // %6 |
- "rm"(dx) // %7 |
+ "rm"(dx), // %7 |
+#if defined(__x86_64__) |
+ "x"(kFsub80), // %8 |
+ "x"(kFadd40) // %9 |
+#else |
+ "m"(kFsub80), // %8 |
+ "m"(kFadd40) // %9 |
+#endif |
: "memory", "cc", NACL_R14 |
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
); |
} |