| Index: source/libvpx/third_party/libyuv/source/row_win.cc
|
| diff --git a/source/libvpx/third_party/libyuv/source/row_win.cc b/source/libvpx/third_party/libyuv/source/row_win.cc
|
| index 6e9d04c0e4efcb4e3c9a0cf613313eeb9ffdf061..71be268b47fbb5d5b2fc486cd754253927ac3d67 100644
|
| --- a/source/libvpx/third_party/libyuv/source/row_win.cc
|
| +++ b/source/libvpx/third_party/libyuv/source/row_win.cc
|
| @@ -10,7 +10,8 @@
|
|
|
| #include "libyuv/row.h"
|
|
|
| -#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
|
| +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
|
| + defined(_MSC_VER) && !defined(__clang__)
|
| #include <emmintrin.h>
|
| #include <tmmintrin.h> // For _mm_maddubs_epi16
|
| #endif
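The added !defined(__clang__) guard presumably keeps clang-cl out of these paths: the bulk of this file is MSVC-style inline assembly, which clang's MSVC-compatible driver does not accept, so clang builds fall back to the portable C or GCC-style SIMD rows instead.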
|
| @@ -21,24 +22,8 @@ extern "C" {
|
| #endif
|
|
|
| // This module is for Visual C.
|
| -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
|
| - (defined(_M_IX86) || defined(_M_X64))
|
| -
|
| -// YUV to RGB conversion constants.
|
| -// Y contribution to R,G,B. Scale and bias.
|
| -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
|
| -#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
|
| -
|
| -// U and V contributions to R,G,B.
|
| -#define UB -128 /* -min(128, round(2.018 * 64)) */
|
| -#define UG 25 /* -round(-0.391 * 64) */
|
| -#define VG 52 /* -round(-0.813 * 64) */
|
| -#define VR -102 /* -round(1.596 * 64) */
|
| -
|
| -// Bias values to subtract 16 from Y and 128 from U and V.
|
| -#define BB (UB * 128 - YGB)
|
| -#define BG (UG * 128 + VG * 128 - YGB)
|
| -#define BR (VR * 128 - YGB)
|
| +#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
|
| + defined(_MSC_VER) && !defined(__clang__)
|
|
|
| struct YuvConstants {
|
| lvec8 kUVToB; // 0
|
| @@ -50,6 +35,27 @@ struct YuvConstants {
|
| lvec16 kYToRgb; // 192
|
| };
|
|
|
| +// BT.601 YUV to RGB reference
|
| +// R = (Y - 16) * 1.164 - V * -1.596
|
| +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
|
| +// B = (Y - 16) * 1.164 - U * -2.018
|
| +
|
| +// Y contribution to R,G,B. Scale and bias.
|
| +// TODO(fbarchard): Consider moving constants into a common header.
|
| +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
|
| +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
|
| +
|
| +// U and V contributions to R,G,B.
|
| +#define UB -128 /* max(-128, round(-2.018 * 64)) */
|
| +#define UG 25 /* round(0.391 * 64) */
|
| +#define VG 52 /* round(0.813 * 64) */
|
| +#define VR -102 /* round(-1.596 * 64) */
|
| +
|
| +// Bias values to subtract 16 from Y and 128 from U and V.
|
| +#define BB (UB * 128 + YGB)
|
| +#define BG (UG * 128 + VG * 128 + YGB)
|
| +#define BR (VR * 128 + YGB)
|
| +
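To make the fixed-point scheme concrete, here is a hand-written scalar sketch
of one pixel's conversion using these macros (an illustration for review, not
code from this patch; it assumes <stdint.h> and a hypothetical Clamp helper):

    static inline int Clamp(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    // Coefficients are scaled by 64, so each channel shifts right by 6 at the end.
    static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                               uint8_t* b, uint8_t* g, uint8_t* r) {
      uint32_t y1 = ((uint32_t)(y * 0x0101) * YG) >> 16;  // ~ y * 1.164 * 64
      *b = (uint8_t)Clamp((int32_t)(-(u * UB) + y1 + BB) >> 6);
      *g = (uint8_t)Clamp((int32_t)(-(u * UG + v * VG) + y1 + BG) >> 6);
      *r = (uint8_t)Clamp((int32_t)(-(v * VR) + y1 + BR) >> 6);
    }

Sanity check: y = 235, u = v = 128 (video white) gives 16347 >> 6 = 255 on all
three channels.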
|
| // BT601 constants for YUV to RGB.
|
| static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
|
| { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
|
| @@ -78,10 +84,70 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
|
| { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
|
| };
|
|
|
| +#undef YG
|
| +#undef YGB
|
| +#undef UB
|
| +#undef UG
|
| +#undef VG
|
| +#undef VR
|
| +#undef BB
|
| +#undef BG
|
| +#undef BR
|
| +
|
| +// JPEG YUV to RGB reference
|
| +// R = Y - V * -1.40200
|
| +// G = Y - U * 0.34414 - V * 0.71414
|
| +// B = Y - U * -1.77200
|
| +
|
| +// Y contribution to R,G,B. Scale and bias.
|
| +// TODO(fbarchard): Consider moving constants into a common header.
|
| +#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
|
| +#define YGBJ 32 /* 64 / 2 */
|
| +
|
| +// U and V contributions to R,G,B.
|
| +#define UBJ -113 /* round(-1.77200 * 64) */
|
| +#define UGJ 22 /* round(0.34414 * 64) */
|
| +#define VGJ 46 /* round(0.71414 * 64) */
|
| +#define VRJ -90 /* round(-1.40200 * 64) */
|
| +
|
| +// Bias values to subtract 16 from Y and 128 from U and V.
|
| +#define BBJ (UBJ * 128 + YGBJ)
|
| +#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
|
| +#define BRJ (VRJ * 128 + YGBJ)
|
| +
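Spot-checking the new constants: UGJ = round(0.34414 * 64) = 22 and VGJ = round(0.71414 * 64) = 46, so BGJ = 22 * 128 + 46 * 128 + 32 = 8736, i.e. the recentering of U and V at 128 plus the +32 (= 64 / 2) rounding term. Because JPEG Y is full range, YGBJ carries only that rounding term and no 16 offset.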
|
| +// JPEG constants for YUV to RGB.
|
| +static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
|
| + { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
|
| + UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
|
| + { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
|
| + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
|
| + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
|
| + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
|
| + { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
|
| + 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
|
| + { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
|
| + BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
|
| + { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
|
| + BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
|
| + { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
|
| + BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
|
| + { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
|
| + YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
|
| +};
|
| +
|
| +#undef YGJ
|
| +#undef YGBJ
|
| +#undef UBJ
|
| +#undef UGJ
|
| +#undef VGJ
|
| +#undef VRJ
|
| +#undef BBJ
|
| +#undef BGJ
|
| +#undef BRJ
|
| +
|
| // 64 bit
|
| #if defined(_M_X64)
|
| -
|
| -__declspec(align(16))
|
| +#if defined(HAS_I422TOARGBROW_SSSE3)
|
| void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -131,10 +197,9 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| width -= 8;
|
| }
|
| }
|
| -
|
| +#endif
|
| // 32 bit
|
| #else // defined(_M_X64)
|
| -
|
| #ifdef HAS_ARGBTOYROW_SSSE3
|
|
|
| // Constants for ARGB.
|
| @@ -257,8 +322,8 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
|
| };
|
|
|
| // Duplicates gray value 3 times and fills in alpha opaque.
|
| -__declspec(naked) __declspec(align(16))
|
| -void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
| +__declspec(naked)
|
| +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_y
|
| mov edx, [esp + 8] // dst_argb
|
| @@ -284,7 +349,39 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +#ifdef HAS_J400TOARGBROW_AVX2
|
| +// Duplicates gray value 3 times and fills in alpha opaque.
|
| +__declspec(naked)
|
| +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_y
|
| + mov edx, [esp + 8] // dst_argb
|
| + mov ecx, [esp + 12] // pix
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
|
| + vpslld ymm5, ymm5, 24
|
| +
|
| + convertloop:
|
| + vmovdqu xmm0, [eax]
|
| + lea eax, [eax + 16]
|
| + vpermq ymm0, ymm0, 0xd8
|
| + vpunpcklbw ymm0, ymm0, ymm0
|
| + vpermq ymm0, ymm0, 0xd8
|
| + vpunpckhwd ymm1, ymm0, ymm0
|
| + vpunpcklwd ymm0, ymm0, ymm0
|
| + vpor ymm0, ymm0, ymm5
|
| + vpor ymm1, ymm1, ymm5
|
| + vmovdqu [edx], ymm0
|
| + vmovdqu [edx + 32], ymm1
|
| + lea edx, [edx + 64]
|
| + sub ecx, 16
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_J400TOARGBROW_AVX2
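In scalar terms each source byte y becomes the 32-bit pixel 0xFF000000 | (y * 0x010101); the two unpack passes are just the vectorized form of that 3-way replication, 16 pixels per iteration.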
|
| +
|
| +__declspec(naked)
|
| void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_rgb24
|
| @@ -322,7 +419,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
|
| int pix) {
|
| __asm {
|
| @@ -368,7 +465,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
|
| // v * (256 + 8)
|
| // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
|
| // 20 instructions.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
|
| int pix) {
|
| __asm {
|
| @@ -417,8 +514,155 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
|
| }
|
| }
|
|
|
| +#ifdef HAS_RGB565TOARGBROW_AVX2
|
| +// pmul method to replicate bits.
|
| +// Math to replicate bits:
|
| +// (v << 8) | (v << 3)
|
| +// v * 256 + v * 8
|
| +// v * (256 + 8)
|
| +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
|
| +__declspec(naked)
|
| +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
|
| + int pix) {
|
| + __asm {
|
| + mov eax, 0x01080108 // generate multiplier to repeat 5 bits
|
| + vmovd xmm5, eax
|
| + vbroadcastss ymm5, xmm5
|
| + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
|
| + movd xmm6, eax
|
| + vbroadcastss ymm6, xmm6
|
| + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
|
| + vpsllw ymm3, ymm3, 11
|
| + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
|
| + vpsllw ymm4, ymm4, 10
|
| + vpsrlw ymm4, ymm4, 5
|
| + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
|
| + vpsllw ymm7, ymm7, 8
|
| +
|
| + mov eax, [esp + 4] // src_rgb565
|
| + mov edx, [esp + 8] // dst_argb
|
| + mov ecx, [esp + 12] // pix
|
| + sub edx, eax
|
| + sub edx, eax
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
|
| + vpand ymm1, ymm0, ymm3 // R in upper 5 bits
|
| + vpsllw ymm2, ymm0, 11 // B in upper 5 bits
|
| + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
|
| + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
|
| + vpsllw ymm1, ymm1, 8
|
| + vpor ymm1, ymm1, ymm2 // RB
|
| + vpand ymm0, ymm0, ymm4 // G in middle 6 bits
|
| + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
|
| + vpor ymm0, ymm0, ymm7 // AG
|
| + vpermq ymm0, ymm0, 0xd8 // mutate for unpack
|
| + vpermq ymm1, ymm1, 0xd8
|
| + vpunpckhbw ymm2, ymm1, ymm0
|
| + vpunpcklbw ymm1, ymm1, ymm0
|
| + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
|
| + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
|
| + lea eax, [eax + 32]
|
| + sub ecx, 16
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_RGB565TOARGBROW_AVX2
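The multiply trick both RGB565 routines rely on is easier to see in scalar
form (a hand-written sketch, not part of the patch). With a 5-bit field
sitting in the top bits of a 16-bit lane, one pmulhuw against 0x0108 yields
the standard (v << 3) | (v >> 2) bit replication:

    // lane holds the 5-bit value v in bits 11..15, i.e. lane == v << 11.
    static inline uint8_t Replicate5To8(uint16_t lane) {
      return (uint8_t)(((uint32_t)lane * 0x0108) >> 16);  // == (v << 3) | (v >> 2)
    }

For example v = 31 (lane 0xF800) gives (0xF800 * 0x0108) >> 16 = 255. The
green multiplier 0x2080 plays the same game for the 6-bit field, folding in
the undo of green's in-word position.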
|
| +
|
| +#ifdef HAS_ARGB1555TOARGBROW_AVX2
|
| +__declspec(naked)
|
| +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
|
| + int pix) {
|
| + __asm {
|
| + mov eax, 0x01080108 // generate multiplier to repeat 5 bits
|
| + vmovd xmm5, eax
|
| + vbroadcastss ymm5, xmm5
|
| + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
|
| + movd xmm6, eax
|
| + vbroadcastss ymm6, xmm6
|
| + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
|
| + vpsllw ymm3, ymm3, 11
|
| + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
|
| + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
|
| + vpsllw ymm7, ymm7, 8
|
| +
|
| + mov eax, [esp + 4] // src_argb1555
|
| + mov edx, [esp + 8] // dst_argb
|
| + mov ecx, [esp + 12] // pix
|
| + sub edx, eax
|
| + sub edx, eax
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
|
| + vpsllw ymm1, ymm0, 1 // R in upper 5 bits
|
| + vpsllw ymm2, ymm0, 11 // B in upper 5 bits
|
| + vpand ymm1, ymm1, ymm3
|
| + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
|
| + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
|
| + vpsllw ymm1, ymm1, 8
|
| + vpor ymm1, ymm1, ymm2 // RB
|
| + vpsraw ymm2, ymm0, 8 // A
|
| + vpand ymm0, ymm0, ymm4 // G in middle 5 bits
|
| + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
|
| + vpand ymm2, ymm2, ymm7
|
| + vpor ymm0, ymm0, ymm2 // AG
|
| + vpermq ymm0, ymm0, 0xd8 // mutate for unpack
|
| + vpermq ymm1, ymm1, 0xd8
|
| + vpunpckhbw ymm2, ymm1, ymm0
|
| + vpunpcklbw ymm1, ymm1, ymm0
|
| + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
|
| + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
|
| + lea eax, [eax + 32]
|
| + sub ecx, 16
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_ARGB1555TOARGBROW_AVX2
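One subtlety worth a comment: vpsraw ymm2, ymm0, 8 is an arithmetic shift, so the 1555 alpha bit (bit 15) sign-extends across the high byte of each lane, turning a set alpha into 0xff and a clear one into 0x00 before the 0xff00ff00 mask isolates it.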
|
| +
|
| +#ifdef HAS_ARGB4444TOARGBROW_AVX2
|
| +__declspec(naked)
|
| +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
|
| + int pix) {
|
| + __asm {
|
| + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
|
| + vmovd xmm4, eax
|
| + vbroadcastss ymm4, xmm4
|
| + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
|
| + mov eax, [esp + 4] // src_argb4444
|
| + mov edx, [esp + 8] // dst_argb
|
| + mov ecx, [esp + 12] // pix
|
| + sub edx, eax
|
| + sub edx, eax
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
|
| + vpand ymm2, ymm0, ymm5 // mask high nibbles
|
| + vpand ymm0, ymm0, ymm4 // mask low nibbles
|
| + vpsrlw ymm3, ymm2, 4
|
| + vpsllw ymm1, ymm0, 4
|
| + vpor ymm2, ymm2, ymm3
|
| + vpor ymm0, ymm0, ymm1
|
| + vpermq ymm0, ymm0, 0xd8 // mutate for unpack
|
| + vpermq ymm2, ymm2, 0xd8
|
| + vpunpckhbw ymm1, ymm0, ymm2
|
| + vpunpcklbw ymm0, ymm0, ymm2
|
| + vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
|
| + vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
|
| + lea eax, [eax + 32]
|
| + sub ecx, 16
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_ARGB4444TOARGBROW_AVX2
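Scalar equivalent for review: each 4-bit channel n expands to n * 0x11, that is n | (n << 4), which is exactly what the shift-and-or of the masked nibbles computes, 16 pixels at a time.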
|
| +
|
| // 24 instructions
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
|
| int pix) {
|
| __asm {
|
| @@ -471,7 +715,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
|
| }
|
|
|
| // 18 instructions.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
|
| int pix) {
|
| __asm {
|
| @@ -509,7 +753,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -547,7 +791,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -585,7 +829,8 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +// 4 pixels
|
| +__declspec(naked)
|
| void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -622,8 +867,97 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| }
|
| }
|
|
|
| +// 4 pixels
|
| +__declspec(naked)
|
| +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
|
| + const uint32 dither4, int pix) {
|
| + __asm {
|
| +
|
| + mov eax, [esp + 4] // src_argb
|
| + mov edx, [esp + 8] // dst_rgb
|
| + movd xmm6, [esp + 12] // dither4
|
| + mov ecx, [esp + 16] // pix
|
| + punpcklbw xmm6, xmm6 // make dither 16 bytes
|
| + movdqa xmm7, xmm6
|
| + punpcklwd xmm6, xmm6
|
| + punpckhwd xmm7, xmm7
|
| + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
|
| + psrld xmm3, 27
|
| + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
|
| + psrld xmm4, 26
|
| + pslld xmm4, 5
|
| + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
|
| + pslld xmm5, 11
|
| +
|
| + convertloop:
|
| + movdqu xmm0, [eax] // fetch 4 pixels of argb
|
| + paddusb xmm0, xmm6 // add dither
|
| + movdqa xmm1, xmm0 // B
|
| + movdqa xmm2, xmm0 // G
|
| + pslld xmm0, 8 // R
|
| + psrld xmm1, 3 // B
|
| + psrld xmm2, 5 // G
|
| + psrad xmm0, 16 // R
|
| + pand xmm1, xmm3 // B
|
| + pand xmm2, xmm4 // G
|
| + pand xmm0, xmm5 // R
|
| + por xmm1, xmm2 // BG
|
| + por xmm0, xmm1 // BGR
|
| + packssdw xmm0, xmm0
|
| + lea eax, [eax + 16]
|
| + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
|
| + lea edx, [edx + 8]
|
| + sub ecx, 4
|
| + jg convertloop
|
| + ret
|
| + }
|
| +}
|
| +
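Per pixel, the dither row computes the following (hand-written scalar sketch,
not part of the patch; d is that pixel's byte of dither4, which the unpack
sequence above replicates to all four channels):

    static inline uint16_t ARGBToRGB565Dither(uint8_t b, uint8_t g, uint8_t r,
                                              uint8_t d) {
      int b8 = b + d; if (b8 > 255) b8 = 255;  // saturating add, like paddusb
      int g8 = g + d; if (g8 > 255) g8 = 255;
      int r8 = r + d; if (r8 > 255) r8 = 255;
      return (uint16_t)((b8 >> 3) | ((g8 >> 2) << 5) | ((r8 >> 3) << 11));
    }

Adding the dither before truncation trades banding for noise; the four dither
values arrive packed in dither4, one byte per pixel in each group of four.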
|
| +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
|
| +__declspec(naked)
|
| +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
|
| + const uint32 dither4, int pix) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_argb
|
| + mov edx, [esp + 8] // dst_rgb
|
| + vbroadcastss xmm6, [esp + 12] // dither4
|
| + mov ecx, [esp + 16] // pix
|
| + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
|
| + vpermq ymm6, ymm6, 0xd8
|
| + vpunpcklwd ymm6, ymm6, ymm6
|
| + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
|
| + vpsrld ymm3, ymm3, 27
|
| + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
|
| + vpsrld ymm4, ymm4, 26
|
| + vpslld ymm4, ymm4, 5
|
| + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax] // fetch 8 pixels of argb
|
| + vpaddusb ymm0, ymm0, ymm6 // add dither
|
| + vpsrld ymm2, ymm0, 5 // G
|
| + vpsrld ymm1, ymm0, 3 // B
|
| + vpsrld ymm0, ymm0, 8 // R
|
| + vpand ymm2, ymm2, ymm4 // G
|
| + vpand ymm1, ymm1, ymm3 // B
|
| + vpand ymm0, ymm0, ymm5 // R
|
| + vpor ymm1, ymm1, ymm2 // BG
|
| + vpor ymm0, ymm0, ymm1 // BGR
|
| + vpackusdw ymm0, ymm0, ymm0
|
| + vpermq ymm0, ymm0, 0xd8
|
| + lea eax, [eax + 32]
|
| + vmovdqu [edx], xmm0 // store 8 pixels of RGB565
|
| + lea edx, [edx + 16]
|
| + sub ecx, 8
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_ARGBTORGB565DITHERROW_AVX2
|
| +
|
| // TODO(fbarchard): Improve sign extension/packing.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -664,7 +998,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -694,7 +1028,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| }
|
|
|
| #ifdef HAS_ARGBTORGB565ROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -705,21 +1039,19 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
|
| vpsrld ymm4, ymm4, 26
|
| vpslld ymm4, ymm4, 5
|
| - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800
|
| - vpslld ymm5, ymm5, 11
|
| + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
|
|
|
| convertloop:
|
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb
|
| vpsrld ymm2, ymm0, 5 // G
|
| vpsrld ymm1, ymm0, 3 // B
|
| - vpslld ymm0, ymm0, 8 // R
|
| + vpsrld ymm0, ymm0, 8 // R
|
| vpand ymm2, ymm2, ymm4 // G
|
| vpand ymm1, ymm1, ymm3 // B
|
| - vpsrad ymm0, ymm0, 16 // R
|
| vpand ymm0, ymm0, ymm5 // R
|
| vpor ymm1, ymm1, ymm2 // BG
|
| vpor ymm0, ymm0, ymm1 // BGR
|
| - vpackssdw ymm0, ymm0, ymm0
|
| + vpackusdw ymm0, ymm0, ymm0
|
| vpermq ymm0, ymm0, 0xd8
|
| lea eax, [eax + 32]
|
| vmovdqu [edx], xmm0 // store 8 pixels of RGB565
|
| @@ -733,7 +1065,7 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| #endif // HAS_ARGBTORGB565ROW_AVX2
|
|
|
| #ifdef HAS_ARGBTOARGB1555ROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -773,7 +1105,7 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| #endif // HAS_ARGBTOARGB1555ROW_AVX2
|
|
|
| #ifdef HAS_ARGBTOARGB4444ROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb
|
| @@ -804,7 +1136,7 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| #endif // HAS_ARGBTOARGB4444ROW_AVX2
|
|
|
| // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| @@ -839,7 +1171,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
|
|
| // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
|
| // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| @@ -880,7 +1212,7 @@ static const lvec32 kPermdARGBToY_AVX = {
|
| };
|
|
|
| // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
|
| -__declspec(naked) __declspec(align(32))
|
| +__declspec(naked)
|
| void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| @@ -917,9 +1249,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| }
|
| #endif // HAS_ARGBTOYROW_AVX2
|
|
|
| -#ifdef HAS_ARGBTOYROW_AVX2
|
| +#ifdef HAS_ARGBTOYJROW_AVX2
|
| // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
|
| -__declspec(naked) __declspec(align(32))
|
| +__declspec(naked)
|
| void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| @@ -958,7 +1290,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| }
|
| #endif // HAS_ARGBTOYJROW_AVX2
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| @@ -991,7 +1323,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| @@ -1024,7 +1356,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| @@ -1057,7 +1389,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1127,7 +1459,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1199,7 +1531,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
|
|
| #ifdef HAS_ARGBTOUVROW_AVX2
|
| -__declspec(naked) __declspec(align(32))
|
| +__declspec(naked)
|
| void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1264,7 +1596,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
| }
|
| #endif // HAS_ARGBTOUVROW_AVX2
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1321,7 +1653,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1379,7 +1711,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1449,7 +1781,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1519,7 +1851,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| @@ -1590,6 +1922,16 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
| #endif // HAS_ARGBTOYROW_SSSE3
|
|
|
| +// Read 16 UV from 444
|
| +#define READYUV444_AVX2 __asm { \
|
| + __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \
|
| + __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \
|
| + __asm lea esi, [esi + 16] \
|
| + __asm vpermq ymm0, ymm0, 0xd8 \
|
| + __asm vpermq ymm1, ymm1, 0xd8 \
|
| + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
|
| + }
|
| +
|
| // Read 8 UV from 422, upsample to 16 UV.
|
| #define READYUV422_AVX2 __asm { \
|
| __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
|
| @@ -1600,6 +1942,17 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
|
| }
|
|
|
| +// Read 4 UV from 411, upsample to 16 UV.
|
| +#define READYUV411_AVX2 __asm { \
|
| + __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \
|
| + __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \
|
| + __asm lea esi, [esi + 4] \
|
| + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
|
| + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
|
| + __asm vpermq ymm0, ymm0, 0xd8 \
|
| + __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
|
| + }
|
| +
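Traced byte by byte, the 411 read fans 4 chroma samples out to cover 16 pixels:

    U: u0 u1 u2 u3        V: v0 v1 v2 v3
    vpunpcklbw -> u0 v0 u1 v1 u2 v2 u3 v3
    vpunpcklwd -> u0 v0 u0 v0 u1 v1 u1 v1 ...        (each pair x2)
    vpunpckldq -> u0 v0 u0 v0 u0 v0 u0 v0 ...        (each pair x4)

so each UV pair serves four Y samples, matching 4:1:1 subsampling; the vpermq
in between only repairs AVX2's in-lane unpack ordering.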
|
| // Read 8 UV from NV12, upsample to 16 UV.
|
| #define READNV12_AVX2 __asm { \
|
| __asm vmovdqu xmm0, [esi] /* UV */ \
|
| @@ -1646,15 +1999,15 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| __asm vpermq ymm2, ymm2, 0xd8 \
|
| __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
|
| __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
|
| - __asm vmovdqu [edx], ymm1 \
|
| - __asm vmovdqu [edx + 32], ymm0 \
|
| + __asm vmovdqu 0[edx], ymm1 \
|
| + __asm vmovdqu 32[edx], ymm0 \
|
| __asm lea edx, [edx + 64] \
|
| }
|
|
|
| #ifdef HAS_I422TOARGBROW_AVX2
|
| // 16 pixels
|
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToARGBRow_AVX2(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -1687,10 +2040,118 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
|
| }
|
| #endif // HAS_I422TOARGBROW_AVX2
|
|
|
| +#ifdef HAS_J422TOARGBROW_AVX2
|
| +// 16 pixels
|
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| +__declspec(naked)
|
| +void J422ToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // U
|
| + mov edi, [esp + 8 + 12] // V
|
| + mov edx, [esp + 8 + 16] // argb
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub edi, esi
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READYUV422_AVX2
|
| + YUVTORGB_AVX2(kYuvJConstants)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_J422TOARGBROW_AVX2
|
| +
|
| +#ifdef HAS_I444TOARGBROW_AVX2
|
| +// 16 pixels
|
| +// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
|
| +__declspec(naked)
|
| +void I444ToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // U
|
| + mov edi, [esp + 8 + 12] // V
|
| + mov edx, [esp + 8 + 16] // argb
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub edi, esi
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READYUV444_AVX2
|
| + YUVTORGB_AVX2(kYuvConstants)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_I444TOARGBROW_AVX2
|
| +
|
| +#ifdef HAS_I411TOARGBROW_AVX2
|
| +// 16 pixels
|
| +// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| +__declspec(naked)
|
| +void I411ToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // U
|
| + mov edi, [esp + 8 + 12] // V
|
| + mov edx, [esp + 8 + 16] // argb
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub edi, esi
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READYUV411_AVX2
|
| + YUVTORGB_AVX2(kYuvConstants)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_I411TOARGBROW_AVX2
|
| +
|
| #ifdef HAS_NV12TOARGBROW_AVX2
|
| // 16 pixels.
|
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void NV12ToARGBRow_AVX2(const uint8* y_buf,
|
| const uint8* uv_buf,
|
| uint8* dst_argb,
|
| @@ -1712,6 +2173,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
|
| jg convertloop
|
|
|
| pop esi
|
| + vzeroupper
|
| ret
|
| }
|
| }
|
| @@ -1720,7 +2182,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
|
| #ifdef HAS_NV21TOARGBROW_AVX2
|
| // 16 pixels.
|
| // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void NV21ToARGBRow_AVX2(const uint8* y_buf,
|
| const uint8* uv_buf,
|
| uint8* dst_argb,
|
| @@ -1742,6 +2204,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
|
| jg convertloop
|
|
|
| pop esi
|
| + vzeroupper
|
| ret
|
| }
|
| }
|
| @@ -1751,7 +2214,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
|
| // 16 pixels
|
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
|
| // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToBGRARow_AVX2(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -1797,7 +2260,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
|
| // 16 pixels
|
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
|
| // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToRGBARow_AVX2(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -1843,7 +2306,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
|
| // 16 pixels
|
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
|
| // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -1914,7 +2377,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm lea esi, [esi + 2] \
|
| __asm punpcklbw xmm0, xmm1 /* UV */ \
|
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
| - __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
|
| + __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
|
| }
|
|
|
| // Read 4 UV from NV12, upsample to 8 UV.
|
| @@ -1963,8 +2426,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm movdqa xmm1, xmm0 \
|
| __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
|
| __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
|
| - __asm movdqu [edx], xmm0 \
|
| - __asm movdqu [edx + 16], xmm1 \
|
| + __asm movdqu 0[edx], xmm0 \
|
| + __asm movdqu 16[edx], xmm1 \
|
| __asm lea edx, [edx + 32] \
|
| }
|
|
|
| @@ -1977,8 +2440,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm movdqa xmm0, xmm5 \
|
| __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
|
| __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
|
| - __asm movdqu [edx], xmm5 \
|
| - __asm movdqu [edx + 16], xmm0 \
|
| + __asm movdqu 0[edx], xmm5 \
|
| + __asm movdqu 16[edx], xmm0 \
|
| __asm lea edx, [edx + 32] \
|
| }
|
|
|
| @@ -1990,8 +2453,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm movdqa xmm1, xmm2 \
|
| __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
|
| __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
|
| - __asm movdqu [edx], xmm2 \
|
| - __asm movdqu [edx + 16], xmm1 \
|
| + __asm movdqu 0[edx], xmm2 \
|
| + __asm movdqu 16[edx], xmm1 \
|
| __asm lea edx, [edx + 32] \
|
| }
|
|
|
| @@ -2004,8 +2467,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm movdqa xmm0, xmm5 \
|
| __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
|
| __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
|
| - __asm movdqu [edx], xmm5 \
|
| - __asm movdqu [edx + 16], xmm0 \
|
| + __asm movdqu 0[edx], xmm5 \
|
| + __asm movdqu 16[edx], xmm0 \
|
| __asm lea edx, [edx + 32] \
|
| }
|
|
|
| @@ -2021,8 +2484,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
|
| __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
|
| __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
|
| - __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
|
| - __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
|
| + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
|
| + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
|
| __asm lea edx, [edx + 24] \
|
| }
|
|
|
| @@ -2038,8 +2501,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
|
| __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
|
| __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
|
| - __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
|
| - __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
|
| + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
|
| + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
|
| __asm lea edx, [edx + 24] \
|
| }
|
|
|
| @@ -2075,13 +2538,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm por xmm3, xmm2 /* BG */ \
|
| __asm por xmm1, xmm3 /* BGR */ \
|
| __asm packssdw xmm0, xmm1 \
|
| - __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \
|
| + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
|
| __asm lea edx, [edx + 16] \
|
| }
|
|
|
| // 8 pixels.
|
| // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2114,7 +2577,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
|
|
| // 8 pixels.
|
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2148,7 +2611,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
|
|
| // 8 pixels.
|
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToRAWRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2182,7 +2645,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
|
|
|
| // 8 pixels
|
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2221,7 +2684,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
|
|
| // 8 pixels.
|
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2253,9 +2716,43 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| }
|
|
|
| // 8 pixels.
|
| +// JPEG color space version of I422ToARGB
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| +__declspec(naked)
|
| +void J422ToARGBRow_SSSE3(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // U
|
| + mov edi, [esp + 8 + 12] // V
|
| + mov edx, [esp + 8 + 16] // argb
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub edi, esi
|
| + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| +
|
| + convertloop:
|
| + READYUV422
|
| + YUVTORGB(kYuvJConstants)
|
| + STOREARGB
|
| +
|
| + sub ecx, 8
|
| + jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
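J422ToARGBRow_SSSE3 is deliberately the same loop as I422ToARGBRow_SSSE3 above; the only difference is feeding YUVTORGB the kYuvJConstants table, i.e. full-range Y with no -16 bias plus the JPEG chroma weights.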
|
| +// 8 pixels.
|
| // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| // Similar to I420 but duplicate UV once more.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I411ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2290,7 +2787,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
|
|
|
| // 8 pixels.
|
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* uv_buf,
|
| uint8* dst_argb,
|
| @@ -2318,7 +2815,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
|
|
| // 8 pixels.
|
| // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void NV21ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* uv_buf,
|
| uint8* dst_argb,
|
| @@ -2344,7 +2841,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToBGRARow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2374,7 +2871,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2405,7 +2902,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToRGBARow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| @@ -2437,12 +2934,12 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
|
|
|
| #endif // HAS_I422TOARGBROW_SSSE3
|
|
|
| -#ifdef HAS_YTOARGBROW_SSE2
|
| +#ifdef HAS_I400TOARGBROW_SSE2
|
| // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| -void YToARGBRow_SSE2(const uint8* y_buf,
|
| - uint8* rgb_buf,
|
| - int width) {
|
| +__declspec(naked)
|
| +void I400ToARGBRow_SSE2(const uint8* y_buf,
|
| + uint8* rgb_buf,
|
| + int width) {
|
| __asm {
|
| mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
|
| movd xmm2, eax
|
| @@ -2482,15 +2979,15 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
| ret
|
| }
|
| }
|
| -#endif // HAS_YTOARGBROW_SSE2
|
| +#endif // HAS_I400TOARGBROW_SSE2
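The renamed I400 path (video-range gray to ARGB) reduces to this per-pixel
scalar sketch (hand-written; 298 = round(1.164 * 256) is the usual
low-precision stand-in for the 18997-based fixed point the SIMD code uses):

    static inline void I400ToARGBPixel(uint8_t y, uint8_t argb[4]) {
      int g = ((y - 16) * 298) >> 8;             // ~ (y - 16) * 1.164
      if (g < 0) g = 0; else if (g > 255) g = 255;
      argb[0] = argb[1] = argb[2] = (uint8_t)g;  // B = G = R
      argb[3] = 255;                             // opaque alpha
    }

The rename also makes the naming scheme consistent: I400 applies the BT.601
luma expansion, while J400 (earlier in this patch) copies Y through unscaled.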
|
|
|
| -#ifdef HAS_YTOARGBROW_AVX2
|
| +#ifdef HAS_I400TOARGBROW_AVX2
|
| // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
|
| // note: vpunpcklbw mutates and vpackuswb unmutates.
|
| -__declspec(naked) __declspec(align(16))
|
| -void YToARGBRow_AVX2(const uint8* y_buf,
|
| - uint8* rgb_buf,
|
| - int width) {
|
| +__declspec(naked)
|
| +void I400ToARGBRow_AVX2(const uint8* y_buf,
|
| + uint8* rgb_buf,
|
| + int width) {
|
| __asm {
|
| mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
|
| vmovd xmm2, eax
|
| @@ -2506,7 +3003,7 @@ void YToARGBRow_AVX2(const uint8* y_buf,
|
| mov ecx, [esp + 12] // width
|
|
|
| convertloop:
|
| // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
|
| vmovdqu xmm0, [eax]
|
| lea eax, [eax + 16]
|
| vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
|
| @@ -2533,7 +3030,7 @@ void YToARGBRow_AVX2(const uint8* y_buf,
|
| ret
|
| }
|
| }
|
| -#endif // HAS_YTOARGBROW_AVX2
|
| +#endif // HAS_I400TOARGBROW_AVX2
|
|
|
| #ifdef HAS_MIRRORROW_SSSE3
|
| // Shuffle table for reversing the bytes.
|
| @@ -2542,7 +3039,7 @@ static const uvec8 kShuffleMirror = {
|
| };
|
|
|
| // TODO(fbarchard): Replace lea with -16 offset.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2563,7 +3060,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
| #endif // HAS_MIRRORROW_SSSE3
|
|
|
| #ifdef HAS_MIRRORROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2586,7 +3083,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| #endif // HAS_MIRRORROW_AVX2
|
|
|
| #ifdef HAS_MIRRORROW_SSE2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2617,7 +3114,7 @@ static const uvec8 kShuffleMirrorUV = {
|
| 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
|
| };
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
| int width) {
|
| __asm {
|
| @@ -2647,7 +3144,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
| #endif // HAS_MIRRORROW_UV_SSSE3
|
|
|
| #ifdef HAS_ARGBMIRRORROW_SSE2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2674,7 +3171,7 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = {
|
| 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
|
| };
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2695,7 +3192,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| #endif // HAS_ARGBMIRRORROW_AVX2
|
|
|
| #ifdef HAS_SPLITUVROW_SSE2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| push edi
|
| @@ -2733,7 +3230,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
| #endif // HAS_SPLITUVROW_SSE2
|
|
|
| #ifdef HAS_SPLITUVROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| push edi
|
| @@ -2771,7 +3268,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
| #endif // HAS_SPLITUVROW_AVX2
|
|
|
| #ifdef HAS_MERGEUVROW_SSE2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
| int width) {
|
| __asm {
|
| @@ -2802,7 +3299,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
| #endif // HAS_MERGEUVROW_SSE2
|
|
|
| #ifdef HAS_MERGEUVROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
| int width) {
|
| __asm {
|
| @@ -2836,7 +3333,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
|
|
| #ifdef HAS_COPYROW_SSE2
|
| // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2859,7 +3356,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
|
|
| #ifdef HAS_COPYROW_AVX
|
| // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2883,7 +3380,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
|
| #endif // HAS_COPYROW_AVX
|
|
|
| // Multiple of 1.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
|
| __asm {
|
| mov eax, esi
|
| @@ -2900,7 +3397,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
|
|
|
| #ifdef HAS_ARGBCOPYALPHAROW_SSE2
|
| // width in pixels
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2936,7 +3433,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
|
|
| #ifdef HAS_ARGBCOPYALPHAROW_AVX2
|
| // width in pixels
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -2965,7 +3462,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
|
|
| #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
|
| // width in pixels
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -3003,7 +3500,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
|
|
| #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
|
| // width in pixels
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| @@ -3035,7 +3532,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| #ifdef HAS_SETROW_X86
|
| // Write 'count' bytes using an 8 bit value repeated.
|
| // Count should be multiple of 4.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SetRow_X86(uint8* dst, uint8 v8, int count) {
|
| __asm {
|
| movzx eax, byte ptr [esp + 8] // v8
|
| @@ -3052,7 +3549,7 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) {
|
| }
|
|
|
| // Write 'count' bytes using an 8 bit value repeated.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
|
| __asm {
|
| mov edx, edi
|
| @@ -3066,7 +3563,7 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
|
| }
|
|
|
| // Write 'count' 32 bit values.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
|
| __asm {
|
| mov edx, edi
|
| @@ -3081,7 +3578,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
|
| #endif // HAS_SETROW_X86
|
|
|
| #ifdef HAS_YUY2TOYROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void YUY2ToYRow_AVX2(const uint8* src_yuy2,
|
| uint8* dst_y, int pix) {
|
| __asm {
|
| @@ -3108,7 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3152,7 +3649,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3191,7 +3688,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void UYVYToYRow_AVX2(const uint8* src_uyvy,
|
| uint8* dst_y, int pix) {
|
| __asm {
|
| @@ -3216,7 +3713,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3260,7 +3757,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3301,7 +3798,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
|
| #endif // HAS_YUY2TOYROW_AVX2
|
|
|
| #ifdef HAS_YUY2TOYROW_SSE2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void YUY2ToYRow_SSE2(const uint8* src_yuy2,
|
| uint8* dst_y, int pix) {
|
| __asm {
|
| @@ -3326,7 +3823,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3369,7 +3866,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3405,7 +3902,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void UYVYToYRow_SSE2(const uint8* src_uyvy,
|
| uint8* dst_y, int pix) {
|
| __asm {
|
| @@ -3428,7 +3925,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3471,7 +3968,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| @@ -3510,7 +4007,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
|
|
|
| #ifdef HAS_ARGBBLENDROW_SSE2
|
| // Blend 8 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -3527,43 +4024,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| psllw xmm5, 8
|
| pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
| pslld xmm4, 24
|
| -
|
| - sub ecx, 1
|
| - je convertloop1 // only 1 pixel?
|
| - jl convertloop1b
|
| -
|
| - // 1 pixel loop until destination pointer is aligned.
|
| - alignloop1:
|
| - test edx, 15 // aligned?
|
| - je alignloop1b
|
| - movd xmm3, [eax]
|
| - lea eax, [eax + 4]
|
| - movdqa xmm0, xmm3 // src argb
|
| - pxor xmm3, xmm4 // ~alpha
|
| - movd xmm2, [esi] // _r_b
|
| - psrlw xmm3, 8 // alpha
|
| - pshufhw xmm3, xmm3, 0F5h // 8 alpha words
|
| - pshuflw xmm3, xmm3, 0F5h
|
| - pand xmm2, xmm6 // _r_b
|
| - paddw xmm3, xmm7 // 256 - alpha
|
| - pmullw xmm2, xmm3 // _r_b * alpha
|
| - movd xmm1, [esi] // _a_g
|
| - lea esi, [esi + 4]
|
| - psrlw xmm1, 8 // _a_g
|
| - por xmm0, xmm4 // set alpha to 255
|
| - pmullw xmm1, xmm3 // _a_g * alpha
|
| - psrlw xmm2, 8 // _r_b convert to 8 bits again
|
| - paddusb xmm0, xmm2 // + src argb
|
| - pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| - paddusb xmm0, xmm1 // + src argb
|
| - movd [edx], xmm0
|
| - lea edx, [edx + 4]
|
| - sub ecx, 1
|
| - jge alignloop1
|
| -
|
| - alignloop1b:
|
| - add ecx, 1 - 4
|
| - jl convertloop4b
|
| + sub ecx, 4
|
| + jl convertloop4b // less than 4 pixels?
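Dropping the destination-alignment pre-loop looks safe assuming the 4-pixel
loop uses unaligned movdqu accesses, which is the pattern everywhere else in
this patch. Per channel, that loop computes (hand-written scalar sketch; a is
the source alpha, and the result's alpha is forced to 255 by the por with
0xff000000):

    // out = src + dst * (256 - a) / 256, saturating like paddusb.
    static inline uint8_t BlendChannel(uint8_t src, uint8_t dst, uint8_t a) {
      uint32_t v = src + ((dst * (256u - a)) >> 8);
      return (uint8_t)(v > 255 ? 255 : v);
    }

The same simplification is applied to ARGBBlendRow_SSSE3 further down.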
|
|
|
| // 4 pixel loop.
|
| convertloop4:
|
| @@ -3644,7 +4106,7 @@ static const uvec8 kShuffleAlpha = {
|
| // pshufb xmm3, kShuffleAlpha // alpha
|
| // Blend 8 pixels at a time.
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -3661,41 +4123,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
| psllw xmm5, 8
|
| pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
| pslld xmm4, 24
|
| -
|
| - sub ecx, 1
|
| - je convertloop1 // only 1 pixel?
|
| - jl convertloop1b
|
| -
|
| - // 1 pixel loop until destination pointer is aligned.
|
| - alignloop1:
|
| - test edx, 15 // aligned?
|
| - je alignloop1b
|
| - movd xmm3, [eax]
|
| - lea eax, [eax + 4]
|
| - movdqa xmm0, xmm3 // src argb
|
| - pxor xmm3, xmm4 // ~alpha
|
| - movd xmm2, [esi] // _r_b
|
| - pshufb xmm3, kShuffleAlpha // alpha
|
| - pand xmm2, xmm6 // _r_b
|
| - paddw xmm3, xmm7 // 256 - alpha
|
| - pmullw xmm2, xmm3 // _r_b * alpha
|
| - movd xmm1, [esi] // _a_g
|
| - lea esi, [esi + 4]
|
| - psrlw xmm1, 8 // _a_g
|
| - por xmm0, xmm4 // set alpha to 255
|
| - pmullw xmm1, xmm3 // _a_g * alpha
|
| - psrlw xmm2, 8 // _r_b convert to 8 bits again
|
| - paddusb xmm0, xmm2 // + src argb
|
| - pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| - paddusb xmm0, xmm1 // + src argb
|
| - movd [edx], xmm0
|
| - lea edx, [edx + 4]
|
| - sub ecx, 1
|
| - jge alignloop1
|
| -
|
| - alignloop1b:
|
| - add ecx, 1 - 4
|
| - jl convertloop4b
|
| + sub ecx, 4
|
| + jl convertloop4b // less than 4 pixels?
|
|
|
| // 4 pixel loop.
|
| convertloop4:
|
| @@ -3760,7 +4189,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
|
|
| #ifdef HAS_ARGBATTENUATEROW_SSE2
|
| // Attenuate 4 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb0
|
| @@ -3809,7 +4238,7 @@ static const uvec8 kShuffleAlpha1 = {
|
| 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
|
| 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
|
| };
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb0
|
| @@ -3853,7 +4282,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
| static const uvec8 kShuffleAlpha_AVX2 = {
|
| 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
|
| };
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src_argb0
|
| @@ -3890,7 +4319,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
|
|
| #ifdef HAS_ARGBUNATTENUATEROW_SSE2
|
| // Unattenuate 4 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| int width) {
|
| __asm {
|
| @@ -3944,7 +4373,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = {
|
| // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
|
| // USE_GATHER is not on by default, due to being a slow instruction.
|
| #ifdef USE_GATHER
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| int width) {
|
| __asm {
|
| @@ -3978,7 +4407,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| }
|
| }
|
| #else // USE_GATHER
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| int width) {
|
| __asm {
|
| @@ -4045,7 +4474,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
|
|
| #ifdef HAS_ARGBGRAYROW_SSSE3
|
| // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
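| // Editor's sketch, not part of the patch: grayscale replicates a weighted
| // luma into B, G and R. The 29/150/77 weights below are assumed BT.601-style
| // values out of 256; the actual fixed-point constants are encoded in the
| // file's kARGBToGray vector (not shown in this hunk).
| static void ARGBGrayPixel_C(const unsigned char* src, unsigned char* dst) {
|   unsigned int y = (src[0] * 29 + src[1] * 150 + src[2] * 77) >> 8;
|   dst[0] = dst[1] = dst[2] = (unsigned char)y;  // B = G = R = luma
|   dst[3] = src[3];                              // keep alpha
| }
|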
| @@ -4104,7 +4533,7 @@ static const vec8 kARGBToSepiaR = {
|
| };
|
|
|
| // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
| __asm {
|
| mov eax, [esp + 4] /* dst_argb */
|
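| // Editor's sketch, not part of the patch: the sepia matrix in plain C, using
| // the 7-bit fixed-point weights of libyuv's C reference (which the
| // kARGBToSepia* vectors encode); saturation matches packuswb.
| static void ARGBSepiaPixel_C(unsigned char* p) {  // in place, like the asm
|   unsigned int b = p[0], g = p[1], r = p[2];
|   unsigned int sb = (b * 17 + g * 68 + r * 35) >> 7;
|   unsigned int sg = (b * 22 + g * 88 + r * 45) >> 7;
|   unsigned int sr = (b * 24 + g * 98 + r * 50) >> 7;
|   p[0] = sb > 255 ? 255 : (unsigned char)sb;
|   p[1] = sg > 255 ? 255 : (unsigned char)sg;
|   p[2] = sr > 255 ? 255 : (unsigned char)sr;  // alpha untouched
| }
|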
| @@ -4161,7 +4590,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
| // Same as Sepia except matrix is provided.
|
| // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
|
| // and B into a high and low, then G/A, punpckl/hbw and then punpckl/hwd.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| const int8* matrix_argb, int width) {
|
| __asm {
|
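| // Editor's sketch, not part of the patch: the caller's matrix is applied per
| // output channel with a >> 6 fixed-point scale (assumed here, matching the
| // C reference); saturation matches packuswb.
| static void ARGBColorMatrixPixel_C(const unsigned char* src, unsigned char* dst,
|                                    const signed char* m) {  // 4x4, row-major
|   for (int j = 0; j < 4; ++j) {
|     int v = (src[0] * m[j * 4 + 0] + src[1] * m[j * 4 + 1] +
|              src[2] * m[j * 4 + 2] + src[3] * m[j * 4 + 3]) >> 6;
|     dst[j] = v < 0 ? 0 : (v > 255 ? 255 : (unsigned char)v);
|   }
| }
|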
| @@ -4222,7 +4651,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
|
|
| #ifdef HAS_ARGBQUANTIZEROW_SSE2
|
| // Quantize 4 ARGB pixels (16 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
| int interval_offset, int width) {
|
| __asm {
|
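| // Editor's sketch, not part of the patch: quantization snaps each color
| // channel to a bucket. scale acts as a 16.16-style reciprocal of
| // interval_size, so (c * scale >> 16) picks the bucket; alpha is preserved.
| static void ARGBQuantizePixel_C(unsigned char* p, int scale,
|                                 int interval_size, int interval_offset) {
|   p[0] = (unsigned char)((p[0] * scale >> 16) * interval_size + interval_offset);
|   p[1] = (unsigned char)((p[1] * scale >> 16) * interval_size + interval_offset);
|   p[2] = (unsigned char)((p[2] * scale >> 16) * interval_size + interval_offset);
| }
|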
| @@ -4267,7 +4696,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
|
|
| #ifdef HAS_ARGBSHADEROW_SSE2
|
| // Shade 4 pixels at a time by specified value.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
| uint32 value) {
|
| __asm {
|
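| // Editor's sketch, not part of the patch: shading multiplies each channel by
| // the matching byte of the packed ARGB value. The (v + 1) >> 8 form below is
| // an assumed stand-in for the pmulhuw scaling in the asm.
| static void ARGBShadePixel_C(const unsigned char* src, unsigned char* dst,
|                              unsigned int value) {  // packed ARGB multiplier
|   for (int i = 0; i < 4; ++i) {
|     unsigned int v = (value >> (i * 8)) & 0xff;
|     dst[i] = (unsigned char)((src[i] * (v + 1)) >> 8);  // ~src * v / 255
|   }
| }
|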
| @@ -4301,7 +4730,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
|
|
| #ifdef HAS_ARGBMULTIPLYROW_SSE2
|
| // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
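| // Editor's sketch, not part of the patch: per-channel multiply of two rows.
| // (x * 257 * y) >> 16 mirrors the punpcklbw/pmulhuw pair and closely
| // approximates x * y / 255.
| static void ARGBMultiplyPixel_C(const unsigned char* s0,
|                                 const unsigned char* s1, unsigned char* dst) {
|   for (int i = 0; i < 4; ++i) {
|     dst[i] = (unsigned char)((s0[i] * 257u * s1[i]) >> 16);
|   }
| }
|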
| @@ -4340,7 +4769,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| #ifdef HAS_ARGBADDROW_SSE2
|
| // Add 2 rows of ARGB pixels together, 4 pixels at a time.
|
| // TODO(fbarchard): Port this to posix, neon and other math functions.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -4388,7 +4817,7 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
|
|
| #ifdef HAS_ARGBSUBTRACTROW_SSE2
|
| // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
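| // Editor's sketch, not part of the patch: the add and subtract rows are
| // plain byte-wise saturating arithmetic, i.e. paddusb/psubusb per channel.
| static unsigned char AddSat_C(unsigned char a, unsigned char b) {
|   unsigned int v = a + b;
|   return v > 255 ? 255 : (unsigned char)v;  // clamp high like paddusb
| }
| static unsigned char SubSat_C(unsigned char a, unsigned char b) {
|   return a > b ? (unsigned char)(a - b) : 0;  // clamp low like psubusb
| }
|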
| @@ -4417,7 +4846,7 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
|
|
| #ifdef HAS_ARGBMULTIPLYROW_AVX2
|
| // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -4454,7 +4883,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
|
|
| #ifdef HAS_ARGBADDROW_AVX2
|
| // Add 2 rows of ARGB pixels together, 8 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -4483,7 +4912,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
|
|
| #ifdef HAS_ARGBSUBTRACTROW_AVX2
|
| // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -4515,7 +4944,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
| // -1 0 1
|
| // -2 0 2
|
| // -1 0 1
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| const uint8* src_y2, uint8* dst_sobelx, int width) {
|
| __asm {
|
| @@ -4571,7 +5000,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| // -1 -2 -1
|
| // 0 0 0
|
| // 1 2 1
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| uint8* dst_sobely, int width) {
|
| __asm {
|
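| // Editor's sketch, not part of the patch: Sobel X over three input rows,
| // applying the kernel above and storing the clamped absolute value. Sobel Y
| // is the same computation with the roles of rows and columns swapped.
| static void SobelX_C_sketch(const unsigned char* y0, const unsigned char* y1,
|                             const unsigned char* y2, unsigned char* dst,
|                             int width) {
|   for (int i = 0; i < width; ++i) {
|     int gx = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
|              (y2[i] - y2[i + 2]);
|     int v = gx < 0 ? -gx : gx;                  // |gx|
|     dst[i] = v > 255 ? 255 : (unsigned char)v;  // saturate to a byte
|   }
| }
|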
| @@ -4624,7 +5053,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| // R = Sobel
|
| // G = Sobel
|
| // B = Sobel
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -4671,7 +5100,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
|
|
| #ifdef HAS_SOBELTOPLANEROW_SSE2
|
| // Adds Sobel X and Sobel Y and stores Sobel into a plane.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| uint8* dst_y, int width) {
|
| __asm {
|
| @@ -4704,7 +5133,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| // R = Sobel X
|
| // G = Sobel
|
| // B = Sobel Y
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| uint8* dst_argb, int width) {
|
| __asm {
|
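| // Editor's sketch, not part of the patch: how the two Sobel planes are
| // combined and packed. SobelRow writes a gray ARGB pixel; SobelXYRow keeps
| // the X and Y responses in separate channels, as the comments above state.
| static void SobelPack_C_sketch(unsigned char sx, unsigned char sy,
|                                unsigned char* gray, unsigned char* xy) {
|   unsigned int s = sx + sy;
|   unsigned char m = s > 255 ? 255 : (unsigned char)s;  // paddusb
|   gray[0] = gray[1] = gray[2] = m;  // SobelRow: B = G = R = Sobel
|   gray[3] = 255;
|   xy[0] = sy;  // SobelXYRow: B = Sobel Y
|   xy[1] = m;   //             G = Sobel
|   xy[2] = sx;  //             R = Sobel X
|   xy[3] = 255;
| }
|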
| @@ -4991,7 +5420,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
|
|
| #ifdef HAS_ARGBAFFINEROW_SSE2
|
| // Copy ARGB pixels from a source image, stepping along a slope, to a row of the destination.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| LIBYUV_API
|
| void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
| uint8* dst_argb, const float* uv_dudv, int width) {
|
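| // Editor's sketch, not part of the patch: the affine row walks source
| // coordinates (u, v) by (du, dv) per destination pixel and copies texels;
| // uv_dudv holds those four floats, as in the C reference.
| static void ARGBAffineRow_C_sketch(const unsigned char* src, int src_stride,
|                                    unsigned char* dst, const float* uv_dudv,
|                                    int width) {
|   float u = uv_dudv[0], v = uv_dudv[1];
|   for (int i = 0; i < width; ++i) {
|     const unsigned char* p = src + (int)v * src_stride + (int)u * 4;
|     dst[i * 4 + 0] = p[0]; dst[i * 4 + 1] = p[1];
|     dst[i * 4 + 2] = p[2]; dst[i * 4 + 3] = p[3];
|     u += uv_dudv[2];  // step along the slope
|     v += uv_dudv[3];
|   }
| }
|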
| @@ -5076,7 +5505,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
|
|
| #ifdef HAS_INTERPOLATEROW_AVX2
|
| // Bilinear filter 32x2 -> 32x1
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| ptrdiff_t src_stride, int dst_width,
|
| int source_y_fraction) {
|
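| // Editor's sketch, not part of the patch: the bilinear filters blend two
| // rows with an 8-bit fraction; source_y_fraction == 0 copies and 128
| // averages, cases the SIMD versions special-case.
| static void InterpolateRow_C_sketch(unsigned char* dst, const unsigned char* s0,
|                                     int src_stride, int width, int f) {
|   const unsigned char* s1 = s0 + src_stride;  // row below
|   for (int i = 0; i < width; ++i) {
|     dst[i] = (unsigned char)((s0[i] * (256 - f) + s1[i] * f) >> 8);
|   }
| }
|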
| @@ -5173,7 +5602,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| #endif // HAS_INTERPOLATEROW_AVX2
|
|
|
| // Bilinear filter 16x2 -> 16x1
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| ptrdiff_t src_stride, int dst_width,
|
| int source_y_fraction) {
|
| @@ -5274,7 +5703,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
|
|
| #ifdef HAS_INTERPOLATEROW_SSE2
|
| // Bilinear filter 16x2 -> 16x1
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| ptrdiff_t src_stride, int dst_width,
|
| int source_y_fraction) {
|
| @@ -5380,38 +5809,8 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| }
|
| #endif // HAS_INTERPOLATEROW_SSE2
|
|
|
| -// Specialized ARGB to Bayer that just isolates G channel.
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
| - uint32 selector, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_argb
|
| - mov edx, [esp + 8] // dst_bayer
|
| - // selector
|
| - mov ecx, [esp + 16] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
|
| - psrld xmm5, 24
|
| -
|
| - wloop:
|
| - movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - psrld xmm0, 8 // Move green to bottom.
|
| - psrld xmm1, 8
|
| - pand xmm0, xmm5
|
| - pand xmm1, xmm5
|
| - packssdw xmm0, xmm1
|
| - packuswb xmm0, xmm1
|
| - movq qword ptr [edx], xmm0
|
| - lea edx, [edx + 8]
|
| - sub ecx, 8
|
| - jg wloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| const uint8* shuffler, int pix) {
|
| __asm {
|
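| // Editor's sketch, not part of the patch: the shuffler selects, per output
| // byte, which byte of the same 4-byte pixel to read, exactly like pshufb
| // with a 4-byte-periodic control vector.
| static void ARGBShufflePixel_C(const unsigned char* src, unsigned char* dst,
|                                const unsigned char* shuffler) {
|   dst[0] = src[shuffler[0]];
|   dst[1] = src[shuffler[1]];
|   dst[2] = src[shuffler[2]];
|   dst[3] = src[shuffler[3]];
| }
|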
| @@ -5437,7 +5836,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| }
|
|
|
| #ifdef HAS_ARGBSHUFFLEROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| const uint8* shuffler, int pix) {
|
| __asm {
|
| @@ -5465,7 +5864,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| }
|
| #endif // HAS_ARGBSHUFFLEROW_AVX2
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| const uint8* shuffler, int pix) {
|
| __asm {
|
| @@ -5587,7 +5986,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| // UYVY - Macro-pixel = 2 image pixels
|
| // U0Y0V0Y1
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToYUY2Row_SSE2(const uint8* src_y,
|
| const uint8* src_u,
|
| const uint8* src_v,
|
| @@ -5624,7 +6023,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void I422ToUYVYRow_SSE2(const uint8* src_y,
|
| const uint8* src_u,
|
| const uint8* src_v,
|
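| // Editor's sketch, not part of the patch: both routines interleave one pair
| // of luma samples with shared chroma into a 4-byte macro-pixel, matching the
| // YUY2/UYVY layouts described above.
| static void I422ToYUY2Pair_C(const unsigned char* y, unsigned char u,
|                              unsigned char v, unsigned char* yuy2,
|                              unsigned char* uyvy) {
|   yuy2[0] = y[0]; yuy2[1] = u; yuy2[2] = y[1]; yuy2[3] = v;  // Y0 U0 Y1 V0
|   uyvy[0] = u; uyvy[1] = y[0]; uyvy[2] = v; uyvy[3] = y[1];  // U0 Y0 V0 Y1
| }
|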
| @@ -5662,7 +6061,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
|
| }
|
|
|
| #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
| uint8* dst_argb, const float* poly,
|
| int width) {
|
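| // Editor's sketch, not part of the patch: poly holds 16 floats, four
| // per-channel coefficients for each power of x (the layout of the C
| // reference), and each channel is remapped through a cubic and clamped.
| static void ARGBPolynomialPixel_C(const unsigned char* src, unsigned char* dst,
|                                   const float* poly) {
|   for (int ch = 0; ch < 4; ++ch) {
|     float x = (float)src[ch];
|     float v = poly[ch] + poly[ch + 4] * x + poly[ch + 8] * x * x +
|               poly[ch + 12] * x * x * x;
|     dst[ch] = v < 0.f ? 0 : (v > 255.f ? 255 : (unsigned char)v);
|   }
| }
|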
| @@ -5721,7 +6120,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
| #endif // HAS_ARGBPOLYNOMIALROW_SSE2
|
|
|
| #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
| uint8* dst_argb, const float* poly,
|
| int width) {
|
| @@ -5761,7 +6160,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
|
|
| #ifdef HAS_ARGBCOLORTABLEROW_X86
|
| // Transform ARGB pixels with color table.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
|
| int width) {
|
| __asm {
|
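| // Editor's sketch, not part of the patch: each channel indexes its own
| // column of the 256-entry table; the RGB variant below simply skips alpha.
| static void ARGBColorTablePixel_C(unsigned char* p,
|                                   const unsigned char* table_argb) {
|   p[0] = table_argb[p[0] * 4 + 0];  // B
|   p[1] = table_argb[p[1] * 4 + 1];  // G
|   p[2] = table_argb[p[2] * 4 + 2];  // R
|   p[3] = table_argb[p[3] * 4 + 3];  // A
| }
|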
| @@ -5795,7 +6194,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
|
|
|
| #ifdef HAS_RGBCOLORTABLEROW_X86
|
| // Transform RGB pixels with color table.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
|
| __asm {
|
| push esi
|
| @@ -5826,7 +6225,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
|
|
|
| #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
| // Transform RGB pixels with luma table.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| int width,
|
| const uint8* luma, uint32 lumacoeff) {
|
| @@ -5924,7 +6323,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
|
|
| #endif // defined(_M_X64)
|
| -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
|
| +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
|
|
|
| #ifdef __cplusplus
|
| } // extern "C"
|
|
|