Index: source/libvpx/third_party/libyuv/source/row_gcc.cc |
diff --git a/source/libvpx/third_party/libyuv/source/row_posix.cc b/source/libvpx/third_party/libyuv/source/row_gcc.cc |
similarity index 97% |
rename from source/libvpx/third_party/libyuv/source/row_posix.cc |
rename to source/libvpx/third_party/libyuv/source/row_gcc.cc |
index 1a6f7dc4dd006910125f33ad9bf1827fde92a704..820de0a1c69526401ff3781c71e8c755e279af2b 100644 |
--- a/source/libvpx/third_party/libyuv/source/row_posix.cc |
+++ b/source/libvpx/third_party/libyuv/source/row_gcc.cc |
@@ -236,8 +236,8 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
} |
#endif // TESTING |
-#ifdef HAS_I400TOARGBROW_SSE2 |
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
+#ifdef HAS_J400TOARGBROW_SSE2 |
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
asm volatile ( |
"pcmpeqb %%xmm5,%%xmm5 \n" |
"pslld $0x18,%%xmm5 \n" |
@@ -262,7 +262,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
:: "memory", "cc", "xmm0", "xmm1", "xmm5" |
); |
} |
-#endif // HAS_I400TOARGBROW_SSE2 |
+#endif // HAS_J400TOARGBROW_SSE2 |
#ifdef HAS_RGB24TOARGBROW_SSSE3 |
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
@@ -953,7 +953,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
#endif // HAS_ARGBTOUVROW_AVX2 |
#ifdef HAS_ARGBTOUVJROW_SSSE3 |
-// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. |
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
uint8* dst_u, uint8* dst_v, int width) { |
asm volatile ( |
@@ -1414,22 +1413,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) |
-// YUV to RGB conversion constants. |
-// Y contribution to R,G,B. Scale and bias. |
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ |
- |
-// U and V contributions to R,G,B. |
-#define UB -128 /* -min(128, round(2.018 * 64)) */ |
-#define UG 25 /* -round(-0.391 * 64) */ |
-#define VG 52 /* -round(-0.813 * 64) */ |
-#define VR -102 /* -round(1.596 * 64) */ |
- |
-// Bias values to subtract 16 from Y and 128 from U and V. |
-#define BB (UB * 128 - YGB) |
-#define BG (UG * 128 + VG * 128 - YGB) |
-#define BR (VR * 128 - YGB) |
- |
struct YuvConstants { |
lvec8 kUVToB; // 0 |
lvec8 kUVToG; // 32 |
@@ -1440,6 +1423,27 @@ struct YuvConstants { |
lvec16 kYToRgb; // 192 |
}; |
+// BT.601 YUV to RGB reference |
+// R = (Y - 16) * 1.164 - V * -1.596 |
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 |
+// B = (Y - 16) * 1.164 - U * -2.018 |
+ |
+// Y contribution to R,G,B. Scale and bias. |
+// TODO(fbarchard): Consider moving constants into a common header. |
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ |
+ |
+// U and V contributions to R,G,B. |
+#define UB -128 /* max(-128, round(-2.018 * 64)) */ |
+#define UG 25 /* round(0.391 * 64) */ |
+#define VG 52 /* round(0.813 * 64) */ |
+#define VR -102 /* round(-1.596 * 64) */ |
+ |
+// Bias values to subtract 16 from Y and 128 from U and V. |
+#define BB (UB * 128 + YGB) |
+#define BG (UG * 128 + VG * 128 + YGB) |
+#define BR (VR * 128 + YGB) |
+ |
// BT601 constants for YUV to RGB. |
static YuvConstants SIMD_ALIGNED(kYuvConstants) = { |
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, |
@@ -1468,6 +1472,67 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
{ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
}; |
+#undef YG |
+#undef YGB |
+#undef UB |
+#undef UG |
+#undef VG |
+#undef VR |
+#undef BB |
+#undef BG |
+#undef BR |
+ |
+// JPEG YUV to RGB reference |
+// * R = Y - V * -1.40200 |
+// * G = Y - U * 0.34414 - V * 0.71414 |
+// * B = Y - U * -1.77200 |
+ |
+// Y contribution to R,G,B. Scale and bias. |
+// TODO(fbarchard): Consider moving constants into a common header. |
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ |
+#define YGBJ 32 /* 64 / 2 */ |
+ |
+// U and V contributions to R,G,B. |
+#define UBJ -113 /* round(-1.77200 * 64) */ |
+#define UGJ 22 /* round(0.34414 * 64) */ |
+#define VGJ 46 /* round(0.71414 * 64) */ |
+#define VRJ -90 /* round(-1.40200 * 64) */ |
+ |
+// Bias values to round Y (64 / 2) and subtract 128 from U and V; JPEG Y has no 16 offset. |
+#define BBJ (UBJ * 128 + YGBJ) |
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) |
+#define BRJ (VRJ * 128 + YGBJ) |
+ |
+// JPEG constants for YUV to RGB, laid out row-by-row in YuvConstants field order. NOTE(review): unlike kYuvConstants this table is not declared static - confirm external linkage is intended. |
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = { |
+ { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, |
+ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, /* kUVToB, byte offset 0: (U, V) weight pairs for B */ |
+ { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, /* kUVToG, byte offset 32: (U, V) weight pairs for G */ |
+ { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, |
+ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, /* byte offset 64: (U, V) weight pairs for R */ |
+ { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, |
+ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, /* byte offset 96: 16-bit bias for B */ |
+ { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, |
+ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, /* byte offset 128: 16-bit bias for G */ |
+ { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, |
+ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, /* byte offset 160: 16-bit bias for R */ |
+ { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, |
+ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } /* kYToRgb, byte offset 192: Y scale factor */ |
+}; |
+ |
+#undef YGJ |
+#undef YGBJ |
+#undef UBJ |
+#undef UGJ |
+#undef VGJ |
+#undef VRJ |
+#undef BBJ |
+#undef BGJ |
+#undef BRJ |
+ |
// Read 8 UV from 444 |
#define READYUV444 \ |
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
@@ -1534,8 +1599,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
"punpcklwd %%xmm2,%%xmm0 \n" \ |
"punpckhwd %%xmm2,%%xmm1 \n" \ |
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ |
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
+ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
+ "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
// Store 8 BGRA values. Assumes XMM5 is zero. |
#define STOREBGRA \ |
@@ -1546,8 +1611,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
"punpcklwd %%xmm1,%%xmm5 \n" \ |
"punpckhwd %%xmm1,%%xmm0 \n" \ |
"movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ |
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ |
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" |
+ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ |
+ "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" |
// Store 8 ABGR values. Assumes XMM5 is zero. |
#define STOREABGR \ |
@@ -1557,8 +1622,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
"punpcklwd %%xmm0,%%xmm2 \n" \ |
"punpckhwd %%xmm0,%%xmm1 \n" \ |
"movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ |
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ |
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" |
+ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ |
+ "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" |
// Store 8 RGBA values. Assumes XMM5 is zero. |
#define STORERGBA \ |
@@ -1569,8 +1634,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
"punpcklwd %%xmm1,%%xmm5 \n" \ |
"punpckhwd %%xmm1,%%xmm0 \n" \ |
"movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ |
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" |
+ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
+ "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
const uint8* u_buf, |
@@ -1713,6 +1778,32 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, |
); |
} |
+void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf, /* J422 row to ARGB: same loop shape as the I422 path but bound to the JPEG table kYuvJConstants */ |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ uint8* dst_argb, |
+ int width) { /* pixel count; the loop runs at least once and consumes 8 per pass */ |
+ asm volatile ( |
+ "sub %[u_buf],%[v_buf] \n" /* v_buf now holds the offset v_buf - u_buf */ |
+ "pcmpeqb %%xmm5,%%xmm5 \n" /* xmm5 = all ones; presumably the alpha bytes used by STOREARGB - confirm */ |
+ LABELALIGN |
+ "1: \n" |
+ READYUV422 |
+ YUVTORGB(kYuvConstants) |
+ STOREARGB |
+ "sub $0x8,%[width] \n" /* 8 pixels handled per pass */ |
+ "jg 1b \n" /* repeat while width is still > 0 */ |
+ : [y_buf]"+r"(y_buf), // %[y_buf] (read-write register operand) |
+ [u_buf]"+r"(u_buf), // %[u_buf] (read-write register operand) |
+ [v_buf]"+r"(v_buf), // %[v_buf] (read-write; turned into an offset from u_buf above) |
+ [dst_argb]"+r"(dst_argb), // %[dst_argb] (read-write register operand) |
+ [width]"+rm"(width) // %[width] (read-write loop counter, register or memory) |
+ : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]: address of the JPEG table kYuvJConstants |
+ : "memory", "cc", NACL_R14 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
+ ); |
+} |
+ |
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
const uint8* u_buf, |
const uint8* v_buf, |
@@ -1881,10 +1972,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
"vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ |
"vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ |
"vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
- "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ |
- "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ |
- "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ |
- "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ |
+ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ |
+ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
+ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ |
+ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
"vpermq $0xd8,%%ymm3,%%ymm3 \n" \ |
@@ -1984,6 +2075,48 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
} |
#endif // HAS_I422TOARGBROW_AVX2 |
+#if defined(HAS_J422TOARGBROW_AVX2) |
+// J422 row to ARGB using the JPEG table kYuvJConstants, 16 pixels per loop pass. |
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
+void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, |
+ const uint8* u_buf, |
+ const uint8* v_buf, |
+ uint8* dst_argb, |
+ int width) { /* pixel count; the loop runs at least once and consumes 16 per pass */ |
+ asm volatile ( |
+ "sub %[u_buf],%[v_buf] \n" /* v_buf now holds the offset v_buf - u_buf */ |
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" /* ymm5 = all ones; woven in as the A channel below */ |
+ LABELALIGN |
+ "1: \n" |
+ READYUV422_AVX2 |
+ YUVTORGB_AVX2(kYuvConstants) |
+ |
+ // Step 3: Weave into ARGB (B in ymm0, G in ymm1, R in ymm2, A in ymm5) |
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG |
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA |
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels |
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels |
+ |
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" |
+ "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" |
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" /* advance dst by the 64 bytes just stored */ |
+ "sub $0x10,%[width] \n" /* 16 pixels handled per pass */ |
+ "jg 1b \n" /* repeat while width is still > 0 */ |
+ "vzeroupper \n" |
+ : [y_buf]"+r"(y_buf), // %[y_buf] (read-write register operand) |
+ [u_buf]"+r"(u_buf), // %[u_buf] (read-write register operand) |
+ [v_buf]"+r"(v_buf), // %[v_buf] (read-write; turned into an offset from u_buf above) |
+ [dst_argb]"+r"(dst_argb), // %[dst_argb] (read-write; advanced 0x40 bytes per pass) |
+ [width]"+rm"(width) // %[width] (read-write loop counter, register or memory) |
+ : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]: address of the JPEG table kYuvJConstants |
+ : "memory", "cc", NACL_R14 |
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
+ ); |
+} |
+#endif // HAS_J422TOARGBROW_AVX2 |
+ |
#if defined(HAS_I422TOABGRROW_AVX2) |
// 16 pixels |
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
@@ -2066,8 +2199,8 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, |
} |
#endif // HAS_I422TORGBAROW_AVX2 |
-#ifdef HAS_YTOARGBROW_SSE2 |
-void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
+#ifdef HAS_I400TOARGBROW_SSE2 |
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
asm volatile ( |
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
"movd %%eax,%%xmm2 \n" |
@@ -2109,12 +2242,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
); |
} |
-#endif // HAS_YTOARGBROW_SSE2 |
+#endif // HAS_I400TOARGBROW_SSE2 |
-#ifdef HAS_YTOARGBROW_AVX2 |
+#ifdef HAS_I400TOARGBROW_AVX2 |
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). |
// note: vpunpcklbw mutates and vpackuswb unmutates. |
-void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { |
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { |
asm volatile ( |
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
"vmovd %%eax,%%xmm2 \n" |
@@ -2156,7 +2289,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { |
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
); |
} |
-#endif // HAS_YTOARGBROW_AVX2 |
+#endif // HAS_I400TOARGBROW_AVX2 |
#ifdef HAS_MIRRORROW_SSSE3 |
// Shuffle table for reversing the bytes. |
@@ -3096,41 +3229,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
"psllw $0x8,%%xmm5 \n" |
"pcmpeqb %%xmm4,%%xmm4 \n" |
"pslld $0x18,%%xmm4 \n" |
- "sub $0x1,%3 \n" |
- "je 91f \n" |
- "jl 99f \n" |
- |
- // 1 pixel loop until destination pointer is aligned. |
- "10: \n" |
- "test $0xf,%2 \n" |
- "je 19f \n" |
- "movd " MEMACCESS(0) ",%%xmm3 \n" |
- "lea " MEMLEA(0x4,0) ",%0 \n" |
- "movdqa %%xmm3,%%xmm0 \n" |
- "pxor %%xmm4,%%xmm3 \n" |
- "movd " MEMACCESS(1) ",%%xmm2 \n" |
- "psrlw $0x8,%%xmm3 \n" |
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n" |
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n" |
- "pand %%xmm6,%%xmm2 \n" |
- "paddw %%xmm7,%%xmm3 \n" |
- "pmullw %%xmm3,%%xmm2 \n" |
- "movd " MEMACCESS(1) ",%%xmm1 \n" |
- "lea " MEMLEA(0x4,1) ",%1 \n" |
- "psrlw $0x8,%%xmm1 \n" |
- "por %%xmm4,%%xmm0 \n" |
- "pmullw %%xmm3,%%xmm1 \n" |
- "psrlw $0x8,%%xmm2 \n" |
- "paddusb %%xmm2,%%xmm0 \n" |
- "pand %%xmm5,%%xmm1 \n" |
- "paddusb %%xmm1,%%xmm0 \n" |
- "movd %%xmm0," MEMACCESS(2) " \n" |
- "lea " MEMLEA(0x4,2) ",%2 \n" |
- "sub $0x1,%3 \n" |
- "jge 10b \n" |
- |
- "19: \n" |
- "add $1-4,%3 \n" |
+ "sub $0x4,%3 \n" |
"jl 49f \n" |
// 4 pixel loop. |
@@ -3231,39 +3330,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
"psllw $0x8,%%xmm5 \n" |
"pcmpeqb %%xmm4,%%xmm4 \n" |
"pslld $0x18,%%xmm4 \n" |
- "sub $0x1,%3 \n" |
- "je 91f \n" |
- "jl 99f \n" |
- |
- // 1 pixel loop until destination pointer is aligned. |
- "10: \n" |
- "test $0xf,%2 \n" |
- "je 19f \n" |
- "movd " MEMACCESS(0) ",%%xmm3 \n" |
- "lea " MEMLEA(0x4,0) ",%0 \n" |
- "movdqa %%xmm3,%%xmm0 \n" |
- "pxor %%xmm4,%%xmm3 \n" |
- "movd " MEMACCESS(1) ",%%xmm2 \n" |
- "pshufb %4,%%xmm3 \n" |
- "pand %%xmm6,%%xmm2 \n" |
- "paddw %%xmm7,%%xmm3 \n" |
- "pmullw %%xmm3,%%xmm2 \n" |
- "movd " MEMACCESS(1) ",%%xmm1 \n" |
- "lea " MEMLEA(0x4,1) ",%1 \n" |
- "psrlw $0x8,%%xmm1 \n" |
- "por %%xmm4,%%xmm0 \n" |
- "pmullw %%xmm3,%%xmm1 \n" |
- "psrlw $0x8,%%xmm2 \n" |
- "paddusb %%xmm2,%%xmm0 \n" |
- "pand %%xmm5,%%xmm1 \n" |
- "paddusb %%xmm1,%%xmm0 \n" |
- "movd %%xmm0," MEMACCESS(2) " \n" |
- "lea " MEMLEA(0x4,2) ",%2 \n" |
- "sub $0x1,%3 \n" |
- "jge 10b \n" |
- |
- "19: \n" |
- "add $1-4,%3 \n" |
+ "sub $0x4,%3 \n" |
"jl 49f \n" |
// 4 pixel loop. |
@@ -4897,37 +4964,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
} |
#endif // HAS_INTERPOLATEROW_SSE2 |
-#ifdef HAS_ARGBTOBAYERGGROW_SSE2 |
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, |
- uint32 selector, int pix) { |
- asm volatile ( |
- "pcmpeqb %%xmm5,%%xmm5 \n" |
- "psrld $0x18,%%xmm5 \n" |
- LABELALIGN |
- "1: \n" |
- "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
- "lea " MEMLEA(0x20,0) ",%0 \n" |
- "psrld $0x8,%%xmm0 \n" |
- "psrld $0x8,%%xmm1 \n" |
- "pand %%xmm5,%%xmm0 \n" |
- "pand %%xmm5,%%xmm1 \n" |
- "packssdw %%xmm1,%%xmm0 \n" |
- "packuswb %%xmm1,%%xmm0 \n" |
- "movq %%xmm0," MEMACCESS(1) " \n" |
- "lea " MEMLEA(0x8,1) ",%1 \n" |
- "sub $0x8,%2 \n" |
- "jg 1b \n" |
- : "+r"(src_argb), // %0 |
- "+r"(dst_bayer), // %1 |
- "+r"(pix) // %2 |
- : |
- : "memory", "cc" |
- , "xmm0", "xmm1", "xmm5" |
- ); |
-} |
-#endif // HAS_ARGBTOBAYERGGROW_SSE2 |
- |
#ifdef HAS_ARGBSHUFFLEROW_SSSE3 |
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |