Index: source/row_gcc.cc
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index d1b25140f94d513054cdb4349bda879eadf869d0..7dc3fd3e27c8ddad6e1267b68311a4f63ae43b46 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -4791,20 +4791,19 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
- "shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
- "cmp $0x40,%3 \n"
+ "cmp $0x80,%3 \n"
"je 50f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
- "add $0x80,%3 \n"
+ "add $0x100,%3 \n"
"movd %3,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x400040,%%eax \n"
+ "mov $0x80808080,%%eax \n"
"movd %%eax,%%xmm4 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
@@ -4813,17 +4812,21 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"1: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm2)
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
@@ -4857,7 +4860,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSSE3
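
For reference, a scalar sketch of the arithmetic the rewritten loops compute (my reading of the patch; the function and variable names below are illustrative, not libyuv identifiers). pmaddubsw multiplies unsigned bytes by signed bytes, so the full 8-bit fraction can only be used if the pixels are first biased into signed range (the psubb of 0x80...); the paddw of 0x8080 per 16-bit lane afterwards both undoes that bias and supplies the rounding term consumed by the shift right by 8. The asm also branches early for f == 0 and f == 128 (the je 100f / je 50f paths); the formula below covers those cases as well.

#include <stddef.h>
#include <stdint.h>

// Sketch: dst = (src0 * (256 - f) + src1 * f + 128) >> 8, with f in [0, 256).
// Mirrors the SIMD sequence: bias each pixel by -128 (psubb 0x80), multiply
// by the unsigned weight pair (256 - f, f) as pmaddubsw does, then add 0x8080
// per 16-bit lane to undo the bias and round before shifting right by 8.
static void InterpolateRowSketch_C(uint8_t* dst, const uint8_t* src0,
                                   const uint8_t* src1, size_t width, int f) {
  for (size_t x = 0; x < width; ++x) {
    int a = src0[x] - 128;                    // signed pixel, as after psubb
    int b = src1[x] - 128;
    int sum = (256 - f) * a + f * b;          // pmaddubsw: unsigned weights * signed pixels
    dst[x] = (uint8_t)((sum + 0x8080) >> 8);  // paddw 0x8080 ; psrlw $8
  }
}
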
@@ -4881,9 +4884,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
"vmovd %3,%%xmm5 \n"
"vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
"vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
- "mov $0x400040,%%eax \n"
+ "vbroadcastss %%xmm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
"vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
@@ -4894,12 +4896,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
"vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
"lea " MEMLEA(0x20,1) ",%1 \n"