| Index: source/row_gcc.cc
|
| diff --git a/source/row_gcc.cc b/source/row_gcc.cc
|
| index de8769d33ad8b811cb2a637aafa530e755aeb47f..3ce0f0a492a99bbbb145b951629adcdcb36315bf 100644
|
| --- a/source/row_gcc.cc
|
| +++ b/source/row_gcc.cc
|
| @@ -164,6 +164,12 @@ static const lvec8 kShuffleUYVYUV = {
|
| 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
|
| 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
| };
|
| +
|
| +// NV21 shuffle: swap each VU pair to UV and duplicate it (8 VU to 16 UV).
|
| +static const lvec8 kShuffleNV21 = {
|
| + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
|
| + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
|
| +};
|
| #endif // HAS_RGB24TOARGBROW_SSSE3
|
|
|
| #ifdef HAS_J400TOARGBROW_SSE2
|
| @@ -1398,6 +1404,15 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
| "punpcklbw %%xmm4,%%xmm4 \n" \
|
| "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
|
|
|
| +// Read 4 VU from NV21 with 8 Y and upsample 4 VU to 8 UV.
|
| +#define READNV21 \
|
| + "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
|
| + "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
|
| + "pshufb %[kShuffleNV21], %%xmm0 \n" \
|
| + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
|
| + "punpcklbw %%xmm4,%%xmm4 \n" \
|
| + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
|
| +
|
| // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
|
| #define READYUY2 \
|
| "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
|
| @@ -1769,6 +1784,31 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
| );
|
| }
|
|
|
| +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
|
| + const uint8* vu_buf,
|
| + uint8* dst_argb,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + asm volatile (
|
| + "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 1 bits.
|
| + LABELALIGN
|
| + "1: \n"
|
| + READNV21
|
| + YUVTORGB(yuvconstants)
|
| + STOREARGB
|
| + "sub $0x8,%[width] \n" // READNV21 consumed 8 Y bytes.
|
| + "jg 1b \n" // Loop while width is still positive.
|
| + : [y_buf]"+r"(y_buf), // %[y_buf]
|
| + [vu_buf]"+r"(vu_buf), // %[vu_buf]
|
| + [dst_argb]"+r"(dst_argb), // %[dst_argb]
|
| + [width]"+rm"(width) // %[width]
|
| + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
|
| + [kShuffleNV21]"m"(kShuffleNV21) // %[kShuffleNV21]
|
| + // Does not use r14.
|
| + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| + );
|
| +}
|
| +
|
| void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
|
| uint8* dst_argb,
|
| struct YuvConstants* yuvconstants,
|
| @@ -1940,6 +1980,17 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
|
| "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
|
| "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
|
|
|
| +// Read 8 VU from NV21 with 16 Y and upsample 8 VU to 16 UV.
|
| +#define READNV21_AVX2 \
|
| + "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
|
| + "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
|
| + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
|
| + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
|
| + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
|
| + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
|
| + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
|
| + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
|
| +
|
| // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
|
| #define READYUY2_AVX2 \
|
| "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
|
| @@ -2251,8 +2302,37 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
|
| : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| );
|
| }
|
| -#endif // HAS_YUY2TOARGBROW_AVX2
|
| +#endif // HAS_NV12TOARGBROW_AVX2
|
|
|
| +#if defined(HAS_NV21TOARGBROW_AVX2)
|
| +// 16 pixels.
|
| +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| +void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* vu_buf,
|
| + uint8* dst_argb,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + asm volatile (
|
| + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all 1 bits.
|
| + LABELALIGN
|
| + "1: \n"
|
| + READNV21_AVX2
|
| + YUVTORGB_AVX2(yuvconstants)
|
| + STOREARGB_AVX2
|
| + "sub $0x10,%[width] \n" // READNV21_AVX2 consumed 16 Y bytes.
|
| + "jg 1b \n" // Loop while width is still positive.
|
| + "vzeroupper \n" // Zero the upper halves of the ymm registers.
|
| + : [y_buf]"+r"(y_buf), // %[y_buf]
|
| + [vu_buf]"+r"(vu_buf), // %[vu_buf]
|
| + [dst_argb]"+r"(dst_argb), // %[dst_argb]
|
| + [width]"+rm"(width) // %[width]
|
| + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
|
| + [kShuffleNV21]"m"(kShuffleNV21) // %[kShuffleNV21]
|
| + // Does not use r14.
|
| + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| + );
|
| +}
|
| +#endif // HAS_NV21TOARGBROW_AVX2
|
|
|
| #if defined(HAS_YUY2TOARGBROW_AVX2)
|
| // 16 pixels.
|
|
|