| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index 2af97ae40e705630909843b66b89eb3dc7efe312..752eb78dfca637ca582e4e9ef07d6aeb2d24f3b6 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -243,6 +243,30 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
|
| 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
|
| };
|
|
|
| +// YUY2 shuf 16 Y to 32 Y: doubles each even (Y) byte per 16 byte lane.
|
| +static const lvec8 kShuffleYUY2Y = {
|
| + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
|
| + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
| +};
|
| +
|
| +// YUY2 shuf 8 UV to 16 UV: repeats each odd (U, V) byte pair per lane.
|
| +static const lvec8 kShuffleYUY2UV = {
|
| + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
|
| + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
|
| +};
|
| +
|
| +// UYVY shuf 16 Y to 32 Y: doubles each odd (Y) byte per 16 byte lane.
|
| +static const lvec8 kShuffleUYVYY = {
|
| + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
|
| + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
|
| +};
|
| +
|
| +// UYVY shuf 8 UV to 16 UV: repeats each even (U, V) byte pair per lane.
|
| +static const lvec8 kShuffleUYVYUV = {
|
| + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
|
| + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
| +};
|
| +
|
| // Duplicates gray value 3 times and fills in alpha opaque.
|
| __declspec(naked)
|
| void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
| @@ -1899,6 +1923,24 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| __asm lea eax, [eax + 16] \
|
| }
|
|
|
| +// Read 8 YUY2 (32 bytes) with 16 Y and upsample 8 UV to 16 UV. Advances eax.
|
| +#define READYUY2_AVX2 __asm { \
|
| + __asm vmovdqu ymm4, [eax] /* YUY2 */ \
|
| + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
|
| + __asm vmovdqu ymm0, [eax] /* same 32 bytes, for UV */ \
|
| + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
|
| + __asm lea eax, [eax + 32] \
|
| + }
|
| +
|
| +// Read 8 UYVY (32 bytes) with 16 Y and upsample 8 UV to 16 UV. Advances eax.
|
| +#define READUYVY_AVX2 __asm { \
|
| + __asm vmovdqu ymm4, [eax] /* UYVY */ \
|
| + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
|
| + __asm vmovdqu ymm0, [eax] /* same 32 bytes, for UV */ \
|
| + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
|
| + __asm lea eax, [eax + 32] \
|
| + }
|
| +
|
| // Convert 16 pixels: 16 UV and 16 Y.
|
| #define YUVTORGB_AVX2(YuvConstants) __asm { \
|
| __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
|
| @@ -2168,6 +2210,65 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
|
| }
|
| #endif // HAS_NV12TOARGBROW_AVX2
|
|
|
| +// Converts a row of YUY2 to ARGB, 16 pixels per loop iteration.
|
| +// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
|
| +__declspec(naked)
|
| +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
|
| + uint8* dst_argb,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + __asm {
|
| + push ebp // preserved; ebp carries yuvconstants in the loop
|
| + mov eax, [esp + 4 + 4] // yuy2 (first 4 skips the pushed ebp)
|
| + mov edx, [esp + 4 + 8] // argb
|
| + mov ebp, [esp + 4 + 12] // yuvconstants
|
| + mov ecx, [esp + 4 + 16] // width
|
| + vpcmpeqb ymm5, ymm5, ymm5 // set every byte of ymm5 to 0xff for alpha
|
| +
|
| + convertloop:
|
| + READYUY2_AVX2
|
| + YUVTORGB_AVX2(ebp)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16 // 16 pixels consumed per pass
|
| + jg convertloop // repeat while width remains above zero
|
| +
|
| + pop ebp
|
| + vzeroupper // clear upper ymm halves before returning to non-AVX code
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Converts a row of UYVY to ARGB, 16 pixels per loop iteration.
|
| +// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
|
| +__declspec(naked)
|
| +void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
|
| + uint8* dst_argb,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + __asm {
|
| + push ebp // preserved; ebp carries yuvconstants in the loop
|
| + mov eax, [esp + 4 + 4] // uyvy (first 4 skips the pushed ebp)
|
| + mov edx, [esp + 4 + 8] // argb
|
| + mov ebp, [esp + 4 + 12] // yuvconstants
|
| + mov ecx, [esp + 4 + 16] // width
|
| + vpcmpeqb ymm5, ymm5, ymm5 // set every byte of ymm5 to 0xff for alpha
|
| +
|
| + convertloop:
|
| + READUYVY_AVX2
|
| + YUVTORGB_AVX2(ebp)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16 // 16 pixels consumed per pass
|
| + jg convertloop // repeat while width remains above zero
|
| +
|
| + pop ebp
|
| + vzeroupper // clear upper ymm halves before returning to non-AVX code
|
| + ret
|
| + }
|
| +}
|
| +
|
| +
|
| #ifdef HAS_I422TOBGRAROW_AVX2
|
| // 16 pixels
|
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
|
| @@ -2338,17 +2439,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm lea eax, [eax + 8] \
|
| }
|
|
|
| -// YUY2 shuf 8 Y to 16 Y.
|
| -static const vec8 kShuffleYUY2Y = {
|
| - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
| -};
|
| -
|
| -// YUY2 shuf 4 UV to 8 UV.
|
| -static const vec8 kShuffleYUY2UV = {
|
| - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
|
| -};
|
| -
|
| -// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
|
| +// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
|
| #define READYUY2 __asm { \
|
| __asm movdqu xmm4, [eax] /* YUY2 */ \
|
| __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
|
| @@ -2357,24 +2448,13 @@ static const vec8 kShuffleYUY2UV = {
|
| __asm lea eax, [eax + 16] \
|
| }
|
|
|
| -// UYVY shuf 8 Y to 16 Y.
|
| -static const vec8 kShuffleUYVYY = {
|
| - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
|
| -};
|
| -
|
| -// UYVY shuf 4 UV to 8 UV.
|
| -static const vec8 kShuffleUYVYUV = {
|
| - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
| -};
|
| -
|
| -// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
|
| +// Read 4 UYVY (16 bytes) with 8 Y and upsample 4 UV to 8 UV. Advances eax.
|
| #define READUYVY __asm { \
|
| __asm movdqu xmm4, [eax] /* UYVY */ \
|
| __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
|
| __asm movdqu xmm0, [eax] /* UV */ \
|
| __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
|
| __asm lea eax, [eax + 16] \
|
| - __asm lea eax, [eax + 8] \
|
| }
|
|
|
| // Convert 8 pixels: 8 UV and 8 Y.
|
|
|