| Index: source/row_gcc.cc
|
| diff --git a/source/row_gcc.cc b/source/row_gcc.cc
|
| index af5ca2b520725d86749f4e914bcf9b40397b92ea..6037ae66e12d88c72162b47b5d91f6be0cbf349a 100644
|
| --- a/source/row_gcc.cc
|
| +++ b/source/row_gcc.cc
|
| @@ -1324,7 +1324,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
| "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
|
| MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
|
| "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
|
| - "punpcklbw %%xmm1,%%xmm0 \n"
|
| + "punpcklbw %%xmm1,%%xmm0 \n" \
|
| + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
|
| + "punpcklbw %%xmm4,%%xmm4 \n" \
|
| + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
|
|
|
| // Read 4 UV from 422, upsample to 8 UV
|
| #define READYUV422 \
|
| @@ -1332,7 +1335,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
| MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
|
| "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
|
| "punpcklbw %%xmm1,%%xmm0 \n" \
|
| - "punpcklwd %%xmm0,%%xmm0 \n"
|
| + "punpcklwd %%xmm0,%%xmm0 \n" \
|
| + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
|
| + "punpcklbw %%xmm4,%%xmm4 \n" \
|
| + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
|
|
|
| // Read 2 UV from 411, upsample to 8 UV
|
| #define READYUV411 \
|
| @@ -1341,13 +1347,55 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
| "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
|
| "punpcklbw %%xmm1,%%xmm0 \n" \
|
| "punpcklwd %%xmm0,%%xmm0 \n" \
|
| - "punpckldq %%xmm0,%%xmm0 \n"
|
| + "punpckldq %%xmm0,%%xmm0 \n" \
|
| + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
|
| + "punpcklbw %%xmm4,%%xmm4 \n" \
|
| + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
|
|
|
| // Read 4 UV from NV12, upsample to 8 UV
|
| #define READNV12 \
|
| "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
|
| "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
|
| - "punpcklwd %%xmm0,%%xmm0 \n"
|
| + "punpcklwd %%xmm0,%%xmm0 \n" \
|
| + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
|
| + "punpcklbw %%xmm4,%%xmm4 \n" \
|
| + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
|
| +
|
| +// YUY2 shuffle: duplicate each of the 8 Y bytes (even offsets) into 16 bytes.

| +static const vec8 kShuffleYUY2Y = {

| + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14

| +};
|
| +
|
| +// YUY2 shuffle: repeat each of the 4 UV pairs (odd offsets) to make 8 UV pairs.

| +static const vec8 kShuffleYUY2UV = {

| + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15

| +};
|
| +
|
| +// Read 16 bytes of YUY2: 8 Y duplicated into xmm4, 4 UV upsampled to 8 UV in xmm0.

| +#define READYUY2 \

| + "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \

| + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \

| + "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \

| + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \

| + "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
|
| +
|
| +// UYVY shuffle: duplicate each of the 8 Y bytes (odd offsets) into 16 bytes.

| +static const vec8 kShuffleUYVYY = {

| + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15

| +};
|
| +
|
| +// UYVY shuffle: repeat each of the 4 UV pairs (even offsets) to make 8 UV pairs.

| +static const vec8 kShuffleUYVYUV = {

| + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14

| +};
|
| +
|
| +// Read 16 bytes of UYVY: 8 Y duplicated into xmm4, 4 UV upsampled to 8 UV in xmm0.

| +#define READUYVY \

| + "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \

| + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \

| + "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \

| + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \

| + "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
|
|
|
| // Convert 8 pixels: 8 UV and 8 Y
|
| #define YUVTORGB(yuvconstants) \
|
| @@ -1363,13 +1411,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
| "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
|
| "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
|
| "psubw %%xmm3,%%xmm2 \n" \
|
| - "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
|
| - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
|
| - "punpcklbw %%xmm3,%%xmm3 \n" \
|
| - "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm3 \n" \
|
| - "paddsw %%xmm3,%%xmm0 \n" \
|
| - "paddsw %%xmm3,%%xmm1 \n" \
|
| - "paddsw %%xmm3,%%xmm2 \n" \
|
| + "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
|
| + "paddsw %%xmm4,%%xmm0 \n" \
|
| + "paddsw %%xmm4,%%xmm1 \n" \
|
| + "paddsw %%xmm4,%%xmm2 \n" \
|
| "psraw $0x6,%%xmm0 \n" \
|
| "psraw $0x6,%%xmm1 \n" \
|
| "psraw $0x6,%%xmm2 \n" \
|
| @@ -1446,7 +1491,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
|
|
| @@ -1473,7 +1518,7 @@ void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
|
|
| @@ -1519,7 +1564,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
| [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
|
| [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
|
| );
|
| }
|
|
|
| @@ -1564,7 +1609,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
|
| [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
|
| [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
|
| );
|
| }
|
|
|
| @@ -1591,7 +1636,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
|
|
| @@ -1618,7 +1663,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
|
|
| @@ -1642,7 +1687,55 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| // Does not use r14.
|
| - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| + );
|
| +}
|
| +
|
| +void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,  // in: packed YUY2 row, 16 bytes consumed per pass

| + uint8* dst_argb,  // out: ARGB row (presumably written by STOREARGB, not shown here - confirm)

| + struct YuvConstants* yuvconstants,  // constant table read by YUVTORGB

| + int width) {  // pixel count; 8 pixels are handled per pass

| + asm volatile (  // Converts one row of packed YUY2 to ARGB, 8 pixels per loop pass.

| + "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones; presumably the alpha source for STOREARGB - confirm

| + LABELALIGN

| + "1: \n"

| + READYUY2  // xmm4 = 8 Y (each duplicated), xmm0 = 8 UV pairs, yuy2_buf += 16

| + YUVTORGB(yuvconstants)

| + STOREARGB

| + "sub $0x8,%[width] \n"  // 8 pixels done

| + "jg 1b \n"  // loops while width > 0, so a width that is not a multiple of 8 still runs a full 8-pixel pass; NOTE(review): confirm callers account for this

| + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]

| + [dst_argb]"+r"(dst_argb), // %[dst_argb]

| + [width]"+rm"(width) // %[width]

| + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

| + [kShuffleYUY2Y]"m"(kShuffleYUY2Y),  // pshufb mask for Y, used by READYUY2

| + [kShuffleYUY2UV]"m"(kShuffleYUY2UV)  // pshufb mask for UV, used by READYUY2

| + // Does not use r14.

| + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

| + );

| +}
|
| +
|
| +void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,  // in: packed UYVY row, 16 bytes consumed per pass

| + uint8* dst_argb,  // out: ARGB row (presumably written by STOREARGB, not shown here - confirm)

| + struct YuvConstants* yuvconstants,  // constant table read by YUVTORGB

| + int width) {  // pixel count; 8 pixels are handled per pass

| + asm volatile (  // Converts one row of packed UYVY to ARGB, 8 pixels per loop pass.

| + "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones; presumably the alpha source for STOREARGB - confirm

| + LABELALIGN

| + "1: \n"

| + READUYVY  // xmm4 = 8 Y (each duplicated), xmm0 = 8 UV pairs, uyvy_buf += 16

| + YUVTORGB(yuvconstants)

| + STOREARGB

| + "sub $0x8,%[width] \n"  // 8 pixels done

| + "jg 1b \n"  // loops while width > 0, so a width that is not a multiple of 8 still runs a full 8-pixel pass; NOTE(review): confirm callers account for this

| + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]

| + [dst_argb]"+r"(dst_argb), // %[dst_argb]

| + [width]"+rm"(width) // %[width]

| + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

| + [kShuffleUYVYY]"m"(kShuffleUYVYY),  // pshufb mask for Y, used by READUYVY

| + [kShuffleUYVYUV]"m"(kShuffleUYVYUV)  // pshufb mask for UV, used by READUYVY

| + // Does not use r14.

| + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

| );

| }
|
|
|
| @@ -1669,7 +1762,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
|
|
| @@ -1696,7 +1789,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
|
|
| @@ -1723,7 +1816,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
|
|
| @@ -1802,7 +1895,7 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
| #endif // HAS_I422TOBGRAROW_AVX2
|
| @@ -1845,7 +1938,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
| #endif // HAS_I422TOARGBROW_AVX2
|
| @@ -1887,7 +1980,7 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
| #endif // HAS_I422TOABGRROW_AVX2
|
| @@ -1929,7 +2022,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
|
| [width]"+rm"(width) // %[width]
|
| : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
| : "memory", "cc", NACL_R14
|
| - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
| + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
| #endif // HAS_I422TORGBAROW_AVX2
|
|
|