Index: source/row_gcc.cc |
diff --git a/source/row_gcc.cc b/source/row_gcc.cc |
index 9940cba125655c238f8bdc4b72e9a2070882b73d..348935405761d16bf12ed18278fe277a1559d3a6 100644 |
--- a/source/row_gcc.cc |
+++ b/source/row_gcc.cc |
@@ -1947,6 +1947,19 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
#endif // HAS_I422TOARGBROW_SSSE3 |
+// Read 16 U, 16 V and 16 Y from 444 (no chroma upsampling): ymm0 = U/V bytes interleaved, ymm4 = each Y byte duplicated; advances u_buf and y_buf by 16. Assumes v_buf already holds (v_buf - u_buf) for the MEMOPREG load - confirm against that macro. |
+#define READYUV444_AVX2 \ |
+ "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
+ MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
+ "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ |
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ |
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
+ |
// Read 8 UV from 422, upsample to 16 UV. |
#define READYUV422_AVX2 \ |
"vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
@@ -2079,6 +2092,39 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
"vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ |
"lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" |
+#ifdef HAS_I444TOARGBROW_AVX2 |
+// Converts a row of I444 (one U and one V byte per Y byte) to ARGB, 16 pixels per loop iteration; the loop body always runs at least once and width is stepped down by 16 until it is <= 0, so a partial final group is still processed as 16 pixels. |
+// Each iteration: 16 UV values with 16 Y producing 16 ARGB (64 bytes). v_buf is turned into an offset from u_buf before the loop, and ymm5 is set to all ones (presumably the alpha bytes consumed by STOREARGB_AVX2 - confirm). |
+void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ uint8* dst_argb, |
+ const struct YuvConstants* yuvconstants, |
+ int width) { |
+ asm volatile ( |
+ YUVTORGB_SETUP_AVX2(yuvconstants) |
+ "sub %[u_buf],%[v_buf] \n" |
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
+ LABELALIGN |
+ "1: \n" |
+ READYUV444_AVX2 |
+ YUVTORGB_AVX2(yuvconstants) |
+ STOREARGB_AVX2 |
+ "sub $0x10,%[width] \n" |
+ "jg 1b \n" |
+ "vzeroupper \n" |
+ : [y_buf]"+r"(y_buf), // %[y_buf] (read-write: advanced 16 bytes per iteration by READYUV444_AVX2) |
+ [u_buf]"+r"(u_buf), // %[u_buf] (read-write: advanced 16 bytes per iteration by READYUV444_AVX2) |
+ [v_buf]"+r"(v_buf), // %[v_buf] (read-write: holds v_buf - u_buf after the initial sub) |
+ [dst_argb]"+r"(dst_argb), // %[dst_argb] |
+ [width]"+rm"(width) // %[width] (read-write: pixels remaining, decremented by 16 per iteration) |
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] (input only) |
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
+ ); |
+} |
+#endif // HAS_I444TOARGBROW_AVX2 |
+ |
#if defined(HAS_I422TOARGBROW_AVX2) |
// 16 pixels |
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
@@ -2091,7 +2137,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
asm volatile ( |
YUVTORGB_SETUP_AVX2(yuvconstants) |
"sub %[u_buf],%[v_buf] \n" |
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
LABELALIGN |
"1: \n" |
READYUV422_AVX2 |