Chromium Code Reviews

Unified Diff: source/libvpx/third_party/libyuv/source/row_win.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 4 months ago
Index: source/libvpx/third_party/libyuv/source/row_win.cc
diff --git a/source/libvpx/third_party/libyuv/source/row_win.cc b/source/libvpx/third_party/libyuv/source/row_win.cc
index 6e9d04c0e4efcb4e3c9a0cf613313eeb9ffdf061..71be268b47fbb5d5b2fc486cd754253927ac3d67 100644
--- a/source/libvpx/third_party/libyuv/source/row_win.cc
+++ b/source/libvpx/third_party/libyuv/source/row_win.cc
@@ -10,7 +10,8 @@
#include "libyuv/row.h"
-#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
+ defined(_MSC_VER) && !defined(__clang__)
#include <emmintrin.h>
#include <tmmintrin.h> // For _mm_maddubs_epi16
#endif
@@ -21,24 +22,8 @@ extern "C" {
#endif
// This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- (defined(_M_IX86) || defined(_M_X64))
-
-// YUV to RGB conversion constants.
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* -min(128, round(2.018 * 64)) */
-#define UG 25 /* -round(-0.391 * 64) */
-#define VG 52 /* -round(-0.813 * 64) */
-#define VR -102 /* -round(1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 - YGB)
-#define BG (UG * 128 + VG * 128 - YGB)
-#define BR (VR * 128 - YGB)
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_VER) && !defined(__clang__)
struct YuvConstants {
lvec8 kUVToB; // 0
@@ -50,6 +35,27 @@ struct YuvConstants {
lvec16 kYToRgb; // 192
};
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -78,10 +84,70 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
{ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
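For reference, this is how the SIMD kernels apply those fixed-point constants, in scalar form. A minimal sketch modeled on libyuv's C reference path (row_common.cc); a Clamp() helper clamping to [0, 255] is assumed:

    static __inline void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                                        uint8* b, uint8* g, uint8* r) {
      // y * 0x0101 replicates y into 16 bits; * YG >> 16 leaves ~1.164 * 64 * y.
      uint32 y1 = ((uint32)(y * 0x0101) * YG) >> 16;
      // UB and VR are negative, so -(u * UB) and -(v * VR) add positive
      // contributions. BB/BG/BR fold in the -128 chroma bias plus rounding,
      // and >> 6 removes the 64x fixed-point scale.
      *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
      *g = Clamp((int32)(-(u * UG + v * VG) + y1 + BG) >> 6);
      *r = Clamp((int32)(-(v * VR) + y1 + BR) >> 6);
    }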
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+ { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+ { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+ { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+ { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+ { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+ { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+ { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
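Spot-checking the JPEG constants against the full-range reference above: YGJ = round(1.000 * 64 * 256 * 256 / 257) = round(16320.25) = 16320; UBJ = round(-1.772 * 64) = round(-113.41) = -113; VRJ = round(-1.402 * 64) = round(-89.73) = -90. Y is not rebiased in the full-range case, so YGBJ is only the rounding term 64 / 2 = 32. The scalar YuvPixel sketch shown for BT.601 applies unchanged with the J-suffixed constants.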
// 64 bit
#if defined(_M_X64)
-
-__declspec(align(16))
+#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -131,10 +197,9 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
width -= 8;
}
}
-
+#endif
// 32 bit
#else // defined(_M_X64)
-
#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB.
@@ -257,8 +322,8 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
};
// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+__declspec(naked)
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
@@ -284,7 +349,39 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8
+ vpunpcklbw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhwd ymm1, ymm0, ymm0
+ vpunpcklwd ymm0, ymm0, ymm0
+ vpor ymm0, ymm0, ymm5
+ vpor ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_J400TOARGBROW_AVX2
+
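Scalar equivalent of the two J400 rows above, as a sketch (the helper name is illustrative):

    static void J400ToARGBRowSketch(const uint8* src_y, uint8* dst_argb, int pix) {
      for (int i = 0; i < pix; ++i) {
        uint8 gray = src_y[i];   // each gray byte becomes one BGRA pixel
        dst_argb[0] = gray;      // B
        dst_argb[1] = gray;      // G
        dst_argb[2] = gray;      // R
        dst_argb[3] = 0xff;      // A: the vpor with the 0xff000000 mask
        dst_argb += 4;
      }
    }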
+__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_rgb24
@@ -322,7 +419,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
@@ -368,7 +465,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
int pix) {
__asm {
@@ -417,8 +514,155 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
}
}
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked)
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
+ vpsllw ymm4, ymm4, 10
+ vpsrlw ymm4, ymm4, 5
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
+ vpand ymm1, ymm0, ymm3 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpand ymm0, ymm0, ymm4 // G in middle 6 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
+ vpor ymm0, ymm0, ymm7 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked)
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
+ vpsllw ymm1, ymm0, 1 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpand ymm1, ymm1, ymm3
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpsraw ymm2, ymm0, 8 // A
+ vpand ymm0, ymm0, ymm4 // G in middle 5 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
+ vpand ymm2, ymm2, ymm7
+ vpor ymm0, ymm0, ymm2 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked)
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+ vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
+ vpand ymm2, ymm0, ymm5 // mask high nibbles
+ vpand ymm0, ymm0, ymm4 // mask low nibbles
+ vpsrlw ymm3, ymm2, 4
+ vpsllw ymm1, ymm0, 4
+ vpor ymm2, ymm2, ymm3
+ vpor ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm2, ymm2, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm2
+ vpunpcklbw ymm0, ymm0, ymm2
+ vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB4444TOARGBROW_AVX2
+
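All three AVX2 rows above rely on the same multiply trick to widen channel bits; in scalar terms (a sketch, helper names illustrative):

    // 5 -> 8 bits: (v << 3) | (v >> 2). With v in the top 5 bits of a 16-bit
    // lane, pmulhuw by 0x0108 computes exactly this:
    //   ((v << 11) * 0x0108) >> 16 == v * 8 + v / 4
    static __inline uint8 Expand5(uint8 v) { return (v << 3) | (v >> 2); }
    // 6 -> 8 bits, via pmulhuw by 0x2080 on the G channel.
    static __inline uint8 Expand6(uint8 v) { return (v << 2) | (v >> 4); }
    // 4 -> 8 bits needs no multiply; ARGB4444 just ORs the shifted nibble.
    static __inline uint8 Expand4(uint8 v) { return (uint8)((v << 4) | v); }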
// 24 instructions
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
int pix) {
__asm {
@@ -471,7 +715,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
}
// 18 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
int pix) {
__asm {
@@ -509,7 +753,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -547,7 +791,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -585,7 +829,8 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+// 4 pixels
+__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -622,8 +867,97 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
+// 4 pixels
+__declspec(naked)
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix) {
+ __asm {
+
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ movd xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // pix
+ punpcklbw xmm6, xmm6 // make dither 16 bytes
+ movdqa xmm7, xmm6
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ paddusb xmm0, xmm6 // add dither
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked)
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ vbroadcastss xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // pix
+ vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
+ vpermq ymm6, ymm6, 0xd8
+ vpunpcklwd ymm6, ymm6, ymm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpaddusb ymm0, ymm0, ymm6 // add dither
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
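Scalar sketch of the dither rows, modeled on libyuv's C reference (ARGBToRGB565DitherRow_C), with the same assumed Clamp() as the earlier sketch: one dither byte per pixel, cycling every 4 pixels, is added to all three channels with unsigned saturation (paddusb/vpaddusb), then the pixel is packed to 5-6-5:

    static void ARGBToRGB565DitherRowSketch(const uint8* src_argb, uint8* dst_rgb,
                                            const uint32 dither4, int pix) {
      for (int i = 0; i < pix; ++i) {
        int d = (dither4 >> ((i & 3) * 8)) & 0xff;
        int b = Clamp(src_argb[0] + d) >> 3;   // keep top 5 bits
        int g = Clamp(src_argb[1] + d) >> 2;   // keep top 6 bits
        int r = Clamp(src_argb[2] + d) >> 3;   // keep top 5 bits
        *(uint16*)dst_rgb = (uint16)(b | (g << 5) | (r << 11));
        src_argb += 4;
        dst_rgb += 2;
      }
    }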
// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -664,7 +998,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -694,7 +1028,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
#ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -705,21 +1039,19 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
vpsrld ymm4, ymm4, 26
vpslld ymm4, ymm4, 5
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800
- vpslld ymm5, ymm5, 11
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
convertloop:
vmovdqu ymm0, [eax] // fetch 8 pixels of argb
vpsrld ymm2, ymm0, 5 // G
vpsrld ymm1, ymm0, 3 // B
- vpslld ymm0, ymm0, 8 // R
+ vpsrld ymm0, ymm0, 8 // R
vpand ymm2, ymm2, ymm4 // G
vpand ymm1, ymm1, ymm3 // B
- vpsrad ymm0, ymm0, 16 // R
vpand ymm0, ymm0, ymm5 // R
vpor ymm1, ymm1, ymm2 // BG
vpor ymm0, ymm0, ymm1 // BGR
- vpackssdw ymm0, ymm0, ymm0
+ vpackusdw ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8
lea eax, [eax + 32]
vmovdqu [edx], xmm0 // store 8 pixels of RGB565
@@ -733,7 +1065,7 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
#endif // HAS_ARGBTORGB565ROW_AVX2
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -773,7 +1105,7 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
#endif // HAS_ARGBTOARGB1555ROW_AVX2
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -804,7 +1136,7 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
#endif // HAS_ARGBTOARGB4444ROW_AVX2
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -839,7 +1171,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -880,7 +1212,7 @@ static const lvec32 kPermdARGBToY_AVX = {
};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -917,9 +1249,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
}
#endif // HAS_ARGBTOYROW_AVX2
-#ifdef HAS_ARGBTOYROW_AVX2
+#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -958,7 +1290,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
}
#endif // HAS_ARGBTOYJROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -991,7 +1323,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1024,7 +1356,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1057,7 +1389,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1127,7 +1459,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1199,7 +1531,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1264,7 +1596,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
}
#endif // HAS_ARGBTOUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1321,7 +1653,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1379,7 +1711,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1449,7 +1781,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1519,7 +1851,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1590,6 +1922,16 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
#endif // HAS_ARGBTOYROW_SSSE3
+// Read 16 UV from 444
+#define READYUV444_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ }
+
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 __asm { \
__asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
@@ -1600,6 +1942,17 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
}
+// Read 4 UV from 411, upsample to 16 UV.
+#define READYUV411_AVX2 __asm { \
+ __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \
+ __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 4] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
+ }
+
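The three readers differ only in how often one chroma pair is reused; per output pixel i, in scalar terms:

    u = u_buf[i];       // READYUV444: one U (and V) per pixel
    u = u_buf[i >> 1];  // READYUV422: one U per 2 pixels (one vpunpcklwd dup)
    u = u_buf[i >> 2];  // READYUV411: one U per 4 pixels (dup word, then dword)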
// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 __asm { \
__asm vmovdqu xmm0, [esi] /* UV */ \
@@ -1646,15 +1999,15 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vpermq ymm2, ymm2, 0xd8 \
__asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
__asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
- __asm vmovdqu [edx], ymm1 \
- __asm vmovdqu [edx + 32], ymm0 \
+ __asm vmovdqu 0[edx], ymm1 \
+ __asm vmovdqu 32[edx], ymm0 \
__asm lea edx, [edx + 64] \
}
#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -1687,10 +2040,118 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
}
#endif // HAS_I422TOARGBROW_AVX2
+#ifdef HAS_J422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void J422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvJConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_J422TOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I444ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV444_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I411ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV411_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I411TOARGBROW_AVX2
+
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void NV12ToARGBRow_AVX2(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
@@ -1712,6 +2173,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
jg convertloop
pop esi
+ vzeroupper
ret
}
}
@@ -1720,7 +2182,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
@@ -1742,6 +2204,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
jg convertloop
pop esi
+ vzeroupper
ret
}
}
@@ -1751,7 +2214,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToBGRARow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -1797,7 +2260,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToRGBARow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -1843,7 +2306,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToABGRRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -1914,7 +2377,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm lea esi, [esi + 2] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
}
// Read 4 UV from NV12, upsample to 8 UV.
@@ -1963,8 +2426,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm1, xmm0 \
__asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
__asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
- __asm movdqu [edx], xmm0 \
- __asm movdqu [edx + 16], xmm1 \
+ __asm movdqu 0[edx], xmm0 \
+ __asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32] \
}
@@ -1977,8 +2440,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm0, xmm5 \
__asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
__asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
- __asm movdqu [edx], xmm5 \
- __asm movdqu [edx + 16], xmm0 \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32] \
}
@@ -1990,8 +2453,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm1, xmm2 \
__asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
__asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
- __asm movdqu [edx], xmm2 \
- __asm movdqu [edx + 16], xmm1 \
+ __asm movdqu 0[edx], xmm2 \
+ __asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32] \
}
@@ -2004,8 +2467,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm0, xmm5 \
__asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
__asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
- __asm movdqu [edx], xmm5 \
- __asm movdqu [edx + 16], xmm0 \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32] \
}
@@ -2021,8 +2484,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
__asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
__asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
- __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24] \
}
@@ -2038,8 +2501,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
__asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
__asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
- __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24] \
}
@@ -2075,13 +2538,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm por xmm3, xmm2 /* BG */ \
__asm por xmm1, xmm3 /* BGR */ \
__asm packssdw xmm0, xmm1 \
- __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
__asm lea edx, [edx + 16] \
}
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2114,7 +2577,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2148,7 +2611,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2182,7 +2645,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2221,7 +2684,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2253,9 +2716,43 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
}
// 8 pixels.
+// JPEG color space version of I422ToARGB
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void J422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvJConstants)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2290,7 +2787,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
@@ -2318,7 +2815,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
@@ -2344,7 +2841,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2374,7 +2871,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2405,7 +2902,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToRGBARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2437,12 +2934,12 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3
-#ifdef HAS_YTOARGBROW_SSE2
+#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void YToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
+__declspec(naked)
+void I400ToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
movd xmm2, eax
@@ -2482,15 +2979,15 @@ void YToARGBRow_SSE2(const uint8* y_buf,
ret
}
}
-#endif // HAS_YTOARGBROW_SSE2
+#endif // HAS_I400TOARGBROW_SSE2
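A scalar sketch of the renamed I400 path, using the constants from the asm above (18997 = YG, 1160 = -YGB): only the Y term of the BT.601 equation survives, splatted to B, G and R:

    static void I400ToARGBRowSketch(const uint8* y_buf, uint8* rgb_buf, int width) {
      for (int i = 0; i < width; ++i) {
        // (y - 16) * 1.164 in 6-bit fixed point, rounding folded into -1160.
        uint32 y1 = ((uint32)(y_buf[i] * 0x0101) * 18997) >> 16;
        uint8 g = Clamp(((int32)y1 - 1160) >> 6);
        rgb_buf[0] = rgb_buf[1] = rgb_buf[2] = g;
        rgb_buf[3] = 255;
        rgb_buf += 4;
      }
    }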
-#ifdef HAS_YTOARGBROW_AVX2
+#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked) __declspec(align(16))
-void YToARGBRow_AVX2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
vmovd xmm2, eax
@@ -2506,7 +3003,7 @@ void YToARGBRow_AVX2(const uint8* y_buf,
mov ecx, [esp + 12] // width
convertloop:
- // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
vmovdqu xmm0, [eax]
lea eax, [eax + 16]
vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
@@ -2533,7 +3030,7 @@ void YToARGBRow_AVX2(const uint8* y_buf,
ret
}
}
-#endif // HAS_YTOARGBROW_AVX2
+#endif // HAS_I400TOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
@@ -2542,7 +3039,7 @@ static const uvec8 kShuffleMirror = {
};
// TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -2563,7 +3060,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -2586,7 +3083,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -2617,7 +3114,7 @@ static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
__asm {
@@ -2647,7 +3144,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
#endif // HAS_MIRRORROW_UV_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -2674,7 +3171,7 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = {
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -2695,7 +3192,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBMIRRORROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
@@ -2733,7 +3230,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif // HAS_SPLITUVROW_SSE2
#ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
@@ -2771,7 +3268,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
__asm {
@@ -2802,7 +3299,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
__asm {
@@ -2836,7 +3333,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, [esp + 4] // src
@@ -2859,7 +3356,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
#ifdef HAS_COPYROW_AVX
// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, [esp + 4] // src
@@ -2883,7 +3380,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_AVX
// Multiple of 1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, esi
@@ -2900,7 +3397,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -2936,7 +3433,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -2965,7 +3462,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3003,7 +3500,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3035,7 +3532,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_SETROW_X86
// Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm {
movzx eax, byte ptr [esp + 8] // v8
@@ -3052,7 +3549,7 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) {
}
// Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
__asm {
mov edx, edi
@@ -3066,7 +3563,7 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
}
// Write 'count' 32 bit values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
__asm {
mov edx, edi
@@ -3081,7 +3578,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToYRow_AVX2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
@@ -3108,7 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3152,7 +3649,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3191,7 +3688,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void UYVYToYRow_AVX2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
@@ -3216,7 +3713,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3260,7 +3757,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3301,7 +3798,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
#endif // HAS_YUY2TOYROW_AVX2
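Byte-layout recap for these packed-4:2:2 rows (a sketch): YUY2 stores a 2-pixel macro-pixel as Y0 U Y1 V, so Y lives in the even bytes (the 0x00ff vpand); UYVY stores U Y0 V Y1, so Y lives in the odd bytes (the vpsrlw 8):

    dst_y[i] = src_yuy2[i * 2];      // YUY2ToY
    dst_y[i] = src_uyvy[i * 2 + 1];  // UYVYToY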
#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
@@ -3326,7 +3823,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3369,7 +3866,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3405,7 +3902,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void UYVYToYRow_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
@@ -3428,7 +3925,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3471,7 +3968,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
@@ -3510,7 +4007,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -3527,43 +4024,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
psllw xmm5, 8
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
-
- sub ecx, 1
- je convertloop1 // only 1 pixel?
- jl convertloop1b
-
- // 1 pixel loop until destination pointer is aligned.
- alignloop1:
- test edx, 15 // aligned?
- je alignloop1b
- movd xmm3, [eax]
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- psrlw xmm3, 8 // alpha
- pshufhw xmm3, xmm3, 0F5h // 8 alpha words
- pshuflw xmm3, xmm3, 0F5h
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge alignloop1
-
- alignloop1b:
- add ecx, 1 - 4
- jl convertloop4b
+ sub ecx, 4
+ jl convertloop4b // less than 4 pixels?
// 4 pixel loop.
convertloop4:
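With movdqu used for the destination, the 16-byte alignment preamble deleted above is unnecessary; counts that are not a multiple of 4 are still handled by the 1-pixel tail loop in the elided body. Per channel the blend itself is, following libyuv's C reference BLEND macro (a sketch):

    // src over dst, with the output alpha forced to 255:
    dst = src + ((dst * (256 - src_alpha)) >> 8);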
@@ -3644,7 +4106,7 @@ static const uvec8 kShuffleAlpha = {
// pshufb xmm3, kShuffleAlpha // alpha
// Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -3661,41 +4123,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
psllw xmm5, 8
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
-
- sub ecx, 1
- je convertloop1 // only 1 pixel?
- jl convertloop1b
-
- // 1 pixel loop until destination pointer is aligned.
- alignloop1:
- test edx, 15 // aligned?
- je alignloop1b
- movd xmm3, [eax]
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge alignloop1
-
- alignloop1b:
- add ecx, 1 - 4
- jl convertloop4b
+ sub ecx, 4
+ jl convertloop4b // less than 4 pixels?
// 4 pixel loop.
convertloop4:
@@ -3760,7 +4189,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -3809,7 +4238,7 @@ static const uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -3853,7 +4282,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
static const uvec8 kShuffleAlpha_AVX2 = {
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -3890,7 +4319,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
@@ -3944,7 +4373,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = {
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
@@ -3978,7 +4407,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
}
}
#else // USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
@@ -4045,7 +4474,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -4104,7 +4533,7 @@ static const vec8 kARGBToSepiaR = {
};
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] /* dst_argb */
@@ -4161,7 +4590,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only uses half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
__asm {
@@ -4222,7 +4651,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
__asm {
@@ -4267,7 +4696,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
__asm {
@@ -4301,7 +4730,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -4340,7 +4769,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -4388,7 +4817,7 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -4417,7 +4846,7 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -4454,7 +4883,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -4483,7 +4912,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -4515,7 +4944,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
// -1 0 1
// -2 0 2
// -1 0 1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
__asm {
@@ -4571,7 +5000,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1
// 0 0 0
// 1 2 1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
__asm {
@@ -4624,7 +5053,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// R = Sobel
// G = Sobel
// B = Sobel
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
__asm {
@@ -4671,7 +5100,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) {
__asm {
@@ -4704,7 +5133,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X
// G = Sobel
// B = Sobel Y
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
__asm {
@@ -4991,7 +5420,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width) {
@@ -5076,7 +5505,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
@@ -5173,7 +5602,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
#endif // HAS_INTERPOLATEROW_AVX2
// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
@@ -5274,7 +5703,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
@@ -5380,38 +5809,8 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
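Scalar sketch of the bilinear row filter, matching libyuv's InterpolateRow_C: blend two adjacent source rows with an 8-bit fraction (0 copies src_ptr, 128 averages; the asm bodies above special-case the common fractions):

    static void InterpolateRowSketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
      const uint8* src_ptr1 = src_ptr + src_stride;
      for (int i = 0; i < dst_width; ++i) {
        dst_ptr[i] = (uint8)((src_ptr[i] * (256 - source_y_fraction) +
                              src_ptr1[i] * source_y_fraction) >> 8);
      }
    }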
-// Specialized ARGB to Bayer that just isolates G channel.
-__declspec(naked) __declspec(align(16))
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
- // selector
- mov ecx, [esp + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
- psrld xmm5, 24
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm0, 8 // Move green to bottom.
- psrld xmm1, 8
- pand xmm0, xmm5
- pand xmm1, xmm5
- packssdw xmm0, xmm1
- packuswb xmm0, xmm1
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg wloop
- ret
- }
-}
-
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
__asm {
@@ -5437,7 +5836,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
}
#ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
__asm {
@@ -5465,7 +5864,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
__asm {
@@ -5587,7 +5986,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -5624,7 +6023,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -5662,7 +6061,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
}
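Scalar sketch of the packing done by this pair of rows: two Y samples share one U and one V per macro-pixel.

    for (int i = 0; i < width - 1; i += 2) {
      dst_frame[0] = src_y[0];   // YUY2 order: Y0 U Y1 V
      dst_frame[1] = src_u[0];   // (I422ToUYVYRow emits U Y0 V Y1 instead)
      dst_frame[2] = src_y[1];
      dst_frame[3] = src_v[0];
      dst_frame += 4;
      src_y += 2;
      ++src_u;
      ++src_v;
    }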
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
uint8* dst_argb, const float* poly,
int width) {
@@ -5721,7 +6120,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
uint8* dst_argb, const float* poly,
int width) {
@@ -5761,7 +6160,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
int width) {
__asm {
@@ -5795,7 +6194,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
__asm {
push esi
@@ -5826,7 +6225,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
int width,
const uint8* luma, uint32 lumacoeff) {
@@ -5924,7 +6323,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#endif // defined(_M_X64)
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
#ifdef __cplusplus
} // extern "C"