Index: source/row_gcc.cc |
diff --git a/source/row_gcc.cc b/source/row_gcc.cc |
index 1ff1dc2f6e6fac8a4b1a78a508b7d48f4b1a4115..6fab45076245b0dbd5b94d1f9f7e14e478c09611 100644 |
--- a/source/row_gcc.cc |
+++ b/source/row_gcc.cc |
@@ -1431,7 +1431,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) |
-// Read 8 UV from 411 |
+// Read 8 UV from 444 |
#define READYUV444 \ |
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
@@ -1952,6 +1952,20 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
"vpermq $0xd8,%%ymm5,%%ymm5 \n" \ |
"lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" |
+// Read 4 U and 4 V bytes from 411, upsample to 16 UV pairs; also read 16 Y. |
+#define READYUV411_AVX2 \ |
+ "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" /* 4 U bytes -> xmm0 */ \ |
+ MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) /* presumably 4 V bytes at u_buf + v_buf -> xmm1; confirm against MEMOPREG */ \ |
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" /* u_buf += 4 */ \ |
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" /* interleave bytes: U0 V0 U1 V1 ... */ \ |
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" /* duplicate each 16-bit UV pair */ \ |
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" /* qword order 0,2,1,3: spread across both 128-bit lanes */ \ |
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" /* duplicate dwords: each UV pair now appears 4 times */ \ |
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" /* 16 Y bytes -> xmm4 */ \ |
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" /* 8 Y bytes into the low half of each lane */ \ |
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" /* duplicate each Y byte into a 16-bit word */ \ |
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" /* y_buf += 16 */ |
+ |
// Read 8 UV from NV12, upsample to 16 UV. |
#define READNV12_AVX2 \ |
"vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
@@ -2067,7 +2081,7 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, |
asm volatile ( |
YUVTORGB_SETUP_AVX2(yuvconstants) |
"sub %[u_buf],%[v_buf] \n" |
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
LABELALIGN |
"1: \n" |
READYUV444_AVX2 |
@@ -2088,6 +2102,39 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, |
} |
#endif // HAS_I444TOARGBROW_AVX2 |
+#ifdef HAS_I411TOARGBROW_AVX2 |
+// I411 to ARGB row conversion (AVX2); each loop iteration handles 16 pixels. |
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
+void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ uint8* dst_argb, |
+ const struct YuvConstants* yuvconstants, |
+ int width) { |
+ asm volatile ( |
+ YUVTORGB_SETUP_AVX2(yuvconstants) |
+ "sub %[u_buf],%[v_buf] \n" /* v_buf = v_buf - u_buf (offset of V from U) */ |
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" /* ymm5 = all ones; presumably the alpha source for STOREARGB_AVX2 - confirm */ |
+ LABELALIGN |
+ "1: \n" |
+ READYUV411_AVX2 |
+ YUVTORGB_AVX2(yuvconstants) |
+ STOREARGB_AVX2 |
+ "sub $0x10,%[width] \n" /* 16 pixels consumed */ |
+ "jg 1b \n" /* loop while width > 0; a partial final group still runs a full 16-pixel iteration, so callers presumably pass a multiple of 16 - confirm */ |
+ "vzeroupper \n" /* zero the upper halves of the YMM registers after the loop */ |
+ : [y_buf]"+r"(y_buf), // %[y_buf] |
+ [u_buf]"+r"(u_buf), // %[u_buf] |
+ [v_buf]"+r"(v_buf), // %[v_buf] |
+ [dst_argb]"+r"(dst_argb), // %[dst_argb] |
+ [width]"+rm"(width) // %[width] |
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
+ ); |
+} |
+#endif // HAS_I411TOARGBROW_AVX2 |
+ |
#if defined(HAS_I422TOARGBROW_AVX2) |
// 16 pixels |
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
@@ -2100,7 +2147,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
asm volatile ( |
YUVTORGB_SETUP_AVX2(yuvconstants) |
"sub %[u_buf],%[v_buf] \n" |
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
LABELALIGN |
"1: \n" |
READYUV422_AVX2 |