Index: source/row_gcc.cc
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index e4084e41ca6aee913dfe3c452eea74ee96bef6b8..e7b78b497aeeb5c0d1c753e5522faabe018c860f 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -566,12 +566,58 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
: "+r"(src), // %0 |
"+r"(dst), // %1 |
"+r"(pix) // %2 |
- : "m"(dither4) // %3 |
+ : "rm"(dither4) // %3 |
: "memory", "cc", |
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
); |
} |
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
+                                const uint32 dither4, int pix) {
+  asm volatile (
+    "vbroadcastss %3,%%xmm6                    \n"
+    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
+    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
+    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
+    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
+    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
+    "vpslld     $0x5,%%ymm4,%%ymm4             \n"
+    "vpslld     $0xb,%%ymm3,%%ymm5             \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    (%0),%%ymm0                    \n"
+    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
+    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
+    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
+    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
+    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
+    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "lea        0x20(%0),%0                    \n"
+    "vmovdqu    %%xmm0,(%1)                    \n"
+    "lea        0x10(%1),%1                    \n"
+    "sub        $0x8,%2                        \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(pix)     // %2
+  : "rm"(dither4) // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+
+
 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
   asm volatile (
     "pcmpeqb   %%xmm4,%%xmm4                   \n"