| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index fff337a7d4da76581cca08599a32d70cf2f428f2..64c029ce2de6c5c732f5b57b25222bacac344e60 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -1896,6 +1896,23 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| __asm lea eax, [eax + 16] \
|
| }
|
|
|
| +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. Leaves UV pairs in ymm0, doubled Y bytes in ymm4 and alpha in ymm5; advances esi by 8 and eax/ebp by 16.
|
| +#define READYUVA422_AVX2 __asm { \
|
| + __asm vmovq xmm0, qword ptr [esi] /* U: 8 bytes from [esi] */ \
|
| + __asm vmovq xmm1, qword ptr [esi + edi] /* V: 8 bytes; edi holds the V - U offset in the row functions that use this macro */ \
|
| + __asm lea esi, [esi + 8] /* advance U (and, through edi, V) by 8 bytes */ \
|
| + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV: interleave U and V bytes into 8 UV pairs */ \
|
| + __asm vpermq ymm0, ymm0, 0xd8 /* 0xd8 = qword order 0,2,1,3: UV pairs 4..7 move to the low half of the upper lane */ \
|
| + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample): duplicate each 16-bit UV pair, giving 16 UV pairs */ \
|
| + __asm vmovdqu xmm4, [eax] /* Y: 16 bytes from [eax] */ \
|
| + __asm vpermq ymm4, ymm4, 0xd8 /* qword order 0,2,1,3: 8 Y bytes in the low half of each lane */ \
|
| + __asm vpunpcklbw ymm4, ymm4, ymm4 /* duplicate each Y byte into a 16-bit YY word */ \
|
| + __asm lea eax, [eax + 16] /* advance Y by 16 bytes */ \
|
| + __asm vmovdqu xmm5, [ebp] /* A: 16 alpha bytes from [ebp] */ \
|
| + __asm vpermq ymm5, ymm5, 0xd8 /* qword order 0,2,1,3: 8 A bytes in the low half of each lane; presumably consumed by the STORE*_AVX2 macros - confirm */ \
|
| + __asm lea ebp, [ebp + 16] /* advance A by 16 bytes */ \
|
| + }
|
| +
|
| // Read 4 UV from 411, upsample to 16 UV.
|
| #define READYUV411_AVX2 __asm { \
|
| __asm vmovd xmm0, dword ptr [esi] /* U */ \
|
| @@ -2057,6 +2074,92 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
|
| }
|
| #endif // HAS_I422TOARGBROW_AVX2
|
|
|
| +#ifdef HAS_I422ALPHATOARGBROW_AVX2
|
| +// 16 pixels per loop iteration.
|
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
|
| +__declspec(naked)
|
| +void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + const uint8* a_buf,
|
| + uint8* dst_argb,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + __asm {
|
| + push esi // preserve the registers used below; restored in reverse order before ret
|
| + push edi
|
| + push ebx
|
| + push ebp // 4 pushes = 16 bytes, hence the "16 +" in the argument offsets
|
| + mov eax, [esp + 16 + 4] // Y (16 = pushed registers, 4 = return address)
|
| + mov esi, [esp + 16 + 8] // U
|
| + mov edi, [esp + 16 + 12] // V
|
| + mov ebp, [esp + 16 + 16] // A
|
| + mov edx, [esp + 16 + 20] // argb
|
| + mov ebx, [esp + 16 + 24] // yuvconstants
|
| + mov ecx, [esp + 16 + 28] // width
|
| + sub edi, esi // edi = V - U, so READYUVA422_AVX2 can address V as [esi + edi]
|
| +
|
| + convertloop:
|
| + READYUVA422_AVX2 // load 16 Y (ymm4), 16 upsampled UV (ymm0), 16 A (ymm5); advances eax, esi, ebp
|
| + YUVTORGB_AVX2(ebx) // macro defined outside this view; receives the yuvconstants pointer
|
| + STOREARGB_AVX2 // macro defined outside this view; presumably stores 16 ARGB pixels at [edx] and advances edx - confirm
|
| +
|
| + sub ecx, 16 // 16 pixels consumed per iteration
|
| + jg convertloop // repeat while pixels remain; NOTE(review): a full 16-pixel iteration runs even when fewer remain - presumably callers pass multiples of 16, confirm
|
| +
|
| + pop ebp
|
| + pop ebx
|
| + pop edi
|
| + pop esi
|
| + vzeroupper // clear the upper halves of the ymm registers before returning
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_I422ALPHATOARGBROW_AVX2
|
| +
|
| +#ifdef HAS_I422ALPHATOABGRROW_AVX2
|
| +// 16 pixels per loop iteration.
|
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.
|
| +__declspec(naked)
|
| +void I422AlphaToABGRRow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + const uint8* a_buf,
|
| + uint8* dst_abgr,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + __asm {
|
| + push esi // preserve the registers used below; restored in reverse order before ret
|
| + push edi
|
| + push ebx
|
| + push ebp // 4 pushes = 16 bytes, hence the "16 +" in the argument offsets
|
| + mov eax, [esp + 16 + 4] // Y (16 = pushed registers, 4 = return address)
|
| + mov esi, [esp + 16 + 8] // U
|
| + mov edi, [esp + 16 + 12] // V
|
| + mov ebp, [esp + 16 + 16] // A
|
| + mov edx, [esp + 16 + 20] // abgr
|
| + mov ebx, [esp + 16 + 24] // yuvconstants
|
| + mov ecx, [esp + 16 + 28] // width
|
| + sub edi, esi // edi = V - U, so READYUVA422_AVX2 can address V as [esi + edi]
|
| +
|
| + convertloop:
|
| + READYUVA422_AVX2 // load 16 Y (ymm4), 16 upsampled UV (ymm0), 16 A (ymm5); advances eax, esi, ebp
|
| + YUVTORGB_AVX2(ebx) // macro defined outside this view; receives the yuvconstants pointer
|
| + STOREABGR_AVX2 // macro defined outside this view; presumably stores 16 ABGR pixels at [edx] and advances edx - confirm
|
| +
|
| + sub ecx, 16 // 16 pixels consumed per iteration
|
| + jg convertloop // repeat while pixels remain; NOTE(review): a full 16-pixel iteration runs even when fewer remain - presumably callers pass multiples of 16, confirm
|
| +
|
| + pop ebp
|
| + pop ebx
|
| + pop edi
|
| + pop esi
|
| + vzeroupper // clear the upper halves of the ymm registers before returning
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_I422ALPHATOABGRROW_AVX2
|
| +
|
| #ifdef HAS_I444TOARGBROW_AVX2
|
| // 16 pixels
|
| // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
|
| @@ -2848,7 +2951,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| }
|
|
|
| // 8 pixels.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB (32 bytes).
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
|
| __declspec(naked)
|
| void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| @@ -2870,7 +2973,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
|
| mov ebx, [esp + 16 + 24] // yuvconstants
|
| mov ecx, [esp + 16 + 28] // width
|
| sub edi, esi
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
|
|
| convertloop:
|
| READYUVA422
|
| @@ -2889,7 +2991,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
|
| }
|
|
|
| // 8 pixels.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR (32 bytes).
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR.
|
| __declspec(naked)
|
| void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| @@ -2911,7 +3013,6 @@ void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
|
| mov ebx, [esp + 16 + 24] // yuvconstants
|
| mov ecx, [esp + 16 + 28] // width
|
| sub edi, esi
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
|
|
| convertloop:
|
| READYUVA422
|
|
|