Index: source/scale_gcc.cc
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index f0509a0695a885e718248cec13ddaaca4c43b2e7..eeeb165cb7f1097992ea7c671c5faf32ef9b52a0 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -98,8 +98,8 @@ static uvec16 kScaleAb2 =
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -120,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrlw     $0xf,%%xmm4                     \n"
+    "packuswb  %%xmm4,%%xmm4                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
 
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pavgw     %%xmm5,%%xmm0                   \n"
+    "pavgw     %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "sub       $0x10,%2                        \n"
@@ -147,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   : "+r"(src_ptr),   // %0
     "+r"(dst_ptr),   // %1
     "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
   );
 }
 
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrlw     $0xf,%%xmm4                     \n"
+    "packuswb  %%xmm4,%%xmm4                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
 
     LABELALIGN
   "1:                                          \n"
@@ -164,17 +164,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x1,%%xmm0                     \n"
+    "psrlw     $0x1,%%xmm1                     \n"
+    "pavgw     %%xmm5,%%xmm0                   \n"
+    "pavgw     %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "sub       $0x10,%2                        \n"
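
Note on the arithmetic (not part of the patch): the SSSE3 versions build xmm4 as a vector of byte 1s (pcmpeqb sets all bits, psrlw $0xf leaves each word equal to 1, packuswb narrows the words to bytes), so pmaddubsw against xmm4 sums each adjacent unsigned byte pair into a 16-bit word. pavgw against the zeroed xmm5 then adds the rounding bit, i.e. (x + 1) >> 1. In the Box kernel the pair sums of both rows are added with paddw, halved with psrlw $0x1, and rounded with pavgw, which works out to (a + b + c + d + 2) >> 2 per output pixel. A minimal scalar sketch of that result, with a hypothetical helper name:

#include <stdint.h>
#include <stddef.h>

// Hypothetical scalar reference: the per-pixel math of the SSSE3 Box
// kernel, i.e. the rounded average of a 2x2 block of source pixels,
// (a + b + c + d + 2) >> 2.
static void ScaleRowDown2Box_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;               // top row
  const uint8_t* t = src_ptr + src_stride;  // bottom row
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}

The Linear kernel is the one-row case of the same trick: the pmaddubsw pair sum followed by pavgw against zero yields (a + b + 1) >> 1.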
|