Index: source/row_gcc.cc |
diff --git a/source/row_gcc.cc b/source/row_gcc.cc |
index 42666cca7369e1d520b255e3c0d853e07efd7718..a6df00264b07bfed22b6a0ec8a315b0e30bfbde9 100644 |
--- a/source/row_gcc.cc |
+++ b/source/row_gcc.cc |
@@ -1364,6 +1364,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
"punpcklbw %%xmm4,%%xmm4 \n" \ |
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
+// Read 4 U and 4 V from 422, upsample to 8 UV pairs in xmm0. Also reads 8 Y (xmm4, each byte doubled) and 8 Alpha (xmm5). |
+#define READYUVA422 \ |
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" /* xmm0 = 4 U bytes */ \ |
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) /* xmm1 = 4 V bytes; presumably addressed as u_buf + v_buf*1 (MEMOPREG is defined elsewhere) */ \ |
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" /* u_buf += 4 */ \ |
+ "punpcklbw %%xmm1,%%xmm0 \n" /* interleave bytes: U0 V0 U1 V1 ... */ \ |
+ "punpcklwd %%xmm0,%%xmm0 \n" /* duplicate each UV word: 4 pairs -> 8 pairs */ \ |
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" /* xmm4 = 8 Y bytes */ \ |
+ "punpcklbw %%xmm4,%%xmm4 \n" /* each Y byte copied into both halves of a 16-bit word */ \ |
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" /* y_buf += 8 */ \ |
+ "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" /* xmm5 = 8 alpha bytes */ \ |
+ "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" /* a_buf += 8 */ |
+ |
// Read 2 UV from 411, upsample to 8 UV |
#define READYUV411 \ |
"movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
@@ -1426,7 +1439,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
"packuswb %%xmm1,%%xmm1 \n" \ |
"packuswb %%xmm2,%%xmm2 \n" |
-// Store 8 ARGB values. Assumes XMM5 is set. |
+// Store 8 ARGB values. Alpha is taken from XMM5, which must be set before this macro. |
#define STOREARGB \ |
"punpcklbw %%xmm1,%%xmm0 \n" \ |
"punpcklbw %%xmm5,%%xmm2 \n" \ |
@@ -1449,7 +1462,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
"movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ |
"lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" |
-// Store 8 ABGR values. Assumes XMM5 is set. |
+// Store 8 ABGR values. Alpha is taken from XMM5, which must be set before this macro. |
#define STOREABGR \ |
"punpcklbw %%xmm1,%%xmm2 \n" \ |
"punpcklbw %%xmm5,%%xmm0 \n" \ |
@@ -1460,7 +1473,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ |
"lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" |
-// Store 8 RGBA values. Assumes XMM5 is set. |
+// Store 8 RGBA values. |
#define STORERGBA \ |
"pcmpeqb %%xmm5,%%xmm5 \n" \ |
"punpcklbw %%xmm2,%%xmm1 \n" \ |
@@ -1643,6 +1656,62 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, |
); |
} |
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ const uint8* a_buf, |
+ uint8* dst_argb, |
+ struct YuvConstants* yuvconstants, |
+ int width) { /* I422 planes plus an alpha plane -> ARGB, 8 pixels per loop pass */ |
+ asm volatile ( |
+ "sub %[u_buf],%[v_buf] \n" /* v_buf -= u_buf, so v_buf now holds the V-minus-U offset */ |
+ LABELALIGN |
+ "1: \n" /* the body always runs at least once and works in whole groups of 8 pixels */ |
+ READYUVA422 /* UV -> xmm0, Y -> xmm4, alpha -> xmm5; advances y_buf, u_buf, a_buf */ |
+ YUVTORGB(yuvconstants) |
+ STOREARGB |
+ "sub $0x8,%[width] \n" /* 8 pixels handled this pass */ |
+ "jg 1b \n" /* repeat while width > 0 */ |
+ : [y_buf]"+r"(y_buf), // %[y_buf] |
+ [u_buf]"+r"(u_buf), // %[u_buf] |
+ [v_buf]"+r"(v_buf), // %[v_buf] |
+ [a_buf]"+r"(a_buf), // %[a_buf] |
+ [dst_argb]"+r"(dst_argb), // %[dst_argb] |
+ [width]"+rm"(width) // %[width] |
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
+ : "memory", "cc", NACL_R14 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
+ ); |
+} |
+ |
+void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ const uint8* a_buf, |
+ uint8* dst_abgr, |
+ struct YuvConstants* yuvconstants, |
+ int width) { /* I422 planes plus an alpha plane -> ABGR, 8 pixels per loop pass */ |
+ asm volatile ( |
+ "sub %[u_buf],%[v_buf] \n" /* v_buf -= u_buf, so v_buf now holds the V-minus-U offset */ |
+ LABELALIGN |
+ "1: \n" /* the body always runs at least once and works in whole groups of 8 pixels */ |
+ READYUVA422 /* UV -> xmm0, Y -> xmm4, alpha -> xmm5; advances y_buf, u_buf, a_buf */ |
+ YUVTORGB(yuvconstants) |
+ STOREABGR |
+ "sub $0x8,%[width] \n" /* 8 pixels handled this pass */ |
+ "jg 1b \n" /* repeat while width > 0 */ |
+ : [y_buf]"+r"(y_buf), // %[y_buf] |
+ [u_buf]"+r"(u_buf), // %[u_buf] |
+ [v_buf]"+r"(v_buf), // %[v_buf] |
+ [a_buf]"+r"(a_buf), // %[a_buf] |
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
+ [width]"+rm"(width) // %[width] |
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
+ : "memory", "cc", NACL_R14 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
+ ); |
+} |
+ |
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
const uint8* u_buf, |
const uint8* v_buf, |
@@ -1838,6 +1907,22 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
+// Read 8 U and 8 V from 422, upsample to 16 UV pairs in ymm0. Also reads 16 Y (ymm4, each byte doubled) and 16 Alpha (ymm5). |
+#define READYUVA422_AVX2 \ |
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" /* xmm0 = 8 U bytes */ \ |
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) /* xmm1 = 8 V bytes; presumably addressed as u_buf + v_buf*1 (MEMOPREG is defined elsewhere) */ \ |
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" /* u_buf += 8 */ \ |
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" /* interleave bytes: U0 V0 U1 V1 ... */ \ |
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" /* qword order 0,2,1,3: puts 4 UV pairs in the low half of each 128-bit lane */ \ |
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" /* duplicate each UV word: 8 pairs -> 16 pairs */ \ |
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" /* xmm4 = 16 Y bytes */ \ |
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" /* qword order 0,2,1,3: 8 Y bytes in the low half of each lane */ \ |
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" /* each Y byte copied into both halves of a 16-bit word */ \ |
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" /* y_buf += 16 */ \ |
+ "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" /* xmm5 = 16 alpha bytes */ \ |
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" /* qword order 0,2,1,3: 8 alpha bytes in the low half of each lane */ \ |
+ "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" /* a_buf += 16 */ |
+ |
// Read 8 UV from NV12, upsample to 16 UV. |
#define READNV12_AVX2 \ |
"vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
@@ -1887,7 +1972,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
-// Store 16 ARGB values. Assumes XMM5 is set. |
+// Store 16 ARGB values. |
#define STOREARGB_AVX2 \ |
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
@@ -1899,6 +1984,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ |
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
+// Store 16 ABGR values (64 bytes, written as R,G,B,A bytes per pixel). Reads R from ymm2, G from ymm1, B from ymm0 and alpha from ymm5. |
+#define STOREABGR_AVX2 \ |
+ "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" /* RG */ \ |
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" /* qword order 0,2,1,3 */ \ |
+ "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" /* BA */ \ |
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" /* qword order 0,2,1,3 */ \ |
+ "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" /* RGBA first 8 pixels */ \ |
+ "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" /* RGBA next 8 pixels */ \ |
+ "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" /* write first 32 bytes */ \ |
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" /* write next 32 bytes */ \ |
+ "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n" /* dst_abgr += 64 */ |
+ |
#if defined(HAS_I422TOBGRAROW_AVX2) |
// 16 pixels |
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
@@ -1974,13 +2071,79 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
} |
#endif // HAS_I422TOARGBROW_AVX2 |
+#if defined(HAS_I422ALPHATOARGBROW_AVX2) |
+// Converts 16 pixels per loop pass; the row is processed in whole groups of 16. |
+// 8 U and 8 V values are upsampled to 16 UV pairs and mixed with 16 Y; 16 A bytes are read from a_buf into ymm5, producing 16 ARGB. |
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ const uint8* a_buf, |
+ uint8* dst_argb, |
+ struct YuvConstants* yuvconstants, |
+ int width) { |
+ asm volatile ( |
+ "sub %[u_buf],%[v_buf] \n" /* v_buf -= u_buf, so v_buf now holds the V-minus-U offset */ |
+ LABELALIGN |
+ "1: \n" /* the body always runs at least once */ |
+ READYUVA422_AVX2 /* UV -> ymm0, Y -> ymm4, alpha -> ymm5; advances y_buf, u_buf, a_buf */ |
+ YUVTORGB_AVX2(yuvconstants) |
+ STOREARGB_AVX2 |
+ "sub $0x10,%[width] \n" /* 16 pixels handled this pass */ |
+ "jg 1b \n" /* repeat while width > 0 */ |
+ "vzeroupper \n" /* clear the upper halves of the ymm registers before returning */ |
+ : [y_buf]"+r"(y_buf), // %[y_buf] |
+ [u_buf]"+r"(u_buf), // %[u_buf] |
+ [v_buf]"+r"(v_buf), // %[v_buf] |
+ [a_buf]"+r"(a_buf), // %[a_buf] |
+ [dst_argb]"+r"(dst_argb), // %[dst_argb] |
+ [width]"+rm"(width) // %[width] |
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
+ : "memory", "cc", NACL_R14 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
+ ); |
+} |
+#endif // HAS_I422ALPHATOARGBROW_AVX2 |
+ |
+#if defined(HAS_I422ALPHATOABGRROW_AVX2) |
+// Converts 16 pixels per loop pass (64 bytes of output); the row is processed in whole groups of 16. |
+// 8 U and 8 V values are upsampled to 16 UV pairs and mixed with 16 Y; 16 A bytes are read from a_buf into ymm5, producing 16 ABGR. |
+void OMITFP I422AlphaToABGRRow_AVX2(const uint8* y_buf, |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ const uint8* a_buf, |
+ uint8* dst_abgr, |
+ struct YuvConstants* yuvconstants, |
+ int width) { |
+ asm volatile ( |
+ "sub %[u_buf],%[v_buf] \n" /* v_buf -= u_buf, so v_buf now holds the V-minus-U offset */ |
+ LABELALIGN |
+ "1: \n" /* the body always runs at least once */ |
+ READYUVA422_AVX2 /* UV -> ymm0, Y -> ymm4, alpha -> ymm5; advances y_buf, u_buf, a_buf */ |
+ YUVTORGB_AVX2(yuvconstants) |
+ STOREABGR_AVX2 /* weaves R,G,B,A and advances dst_abgr by 64 bytes */ |
+ "sub $0x10,%[width] \n" /* 16 pixels handled this pass */ |
+ "jg 1b \n" /* repeat while width > 0 */ |
+ "vzeroupper \n" /* clear the upper halves of the ymm registers before returning */ |
+ : [y_buf]"+r"(y_buf), // %[y_buf] |
+ [u_buf]"+r"(u_buf), // %[u_buf] |
+ [v_buf]"+r"(v_buf), // %[v_buf] |
+ [a_buf]"+r"(a_buf), // %[a_buf] |
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
+ [width]"+rm"(width) // %[width] |
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
+ : "memory", "cc", NACL_R14 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
+ ); |
+} |
+#endif // HAS_I422ALPHATOABGRROW_AVX2 |
+ |
#if defined(HAS_I422TOABGRROW_AVX2) |
// 16 pixels |
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, |
const uint8* u_buf, |
const uint8* v_buf, |
- uint8* dst_argb, |
+ uint8* dst_abgr, |
struct YuvConstants* yuvconstants, |
int width) { |
asm volatile ( |
@@ -1990,24 +2153,14 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, |
"1: \n" |
READYUV422_AVX2 |
YUVTORGB_AVX2(yuvconstants) |
- |
- // Step 3: Weave into ABGR |
- "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG |
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
- "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA |
- "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
- "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels |
- "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels |
- "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" |
- "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" |
- "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
+ STOREABGR_AVX2 |
"sub $0x10,%[width] \n" |
"jg 1b \n" |
"vzeroupper \n" |
: [y_buf]"+r"(y_buf), // %[y_buf] |
[u_buf]"+r"(u_buf), // %[u_buf] |
[v_buf]"+r"(v_buf), // %[v_buf] |
- [dst_argb]"+r"(dst_argb), // %[dst_argb] |
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
[width]"+rm"(width) // %[width] |
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
: "memory", "cc", NACL_R14 |