| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index 6e35c70c6c94aa924a5268f6d9958acefde0397c..d0e691cdb04ff7e8d5fef8f7667102be757d9f89 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -319,6 +319,12 @@ static const lvec8 kShuffleUYVYUV = {
|
| 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
| };
|
|
|
| +// NV21 shuf 8 VU to 16 UV.
|
| +static const lvec8 kShuffleNV21 = {
|
| + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
|
| + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
|
| +};
|
| +
|
| // Duplicates gray value 3 times and fills in alpha opaque.
|
| __declspec(naked)
|
| void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
| @@ -1992,6 +1998,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| __asm lea eax, [eax + 16] \
|
| }
|
|
|
| +// Read 8 UV from NV21, upsample to 16 UV.
|
| +#define READNV21_AVX2 __asm { \
|
| + __asm vmovdqu xmm0, [esi] /* UV */ \
|
| + __asm lea esi, [esi + 16] \
|
| + __asm vpermq ymm0, ymm0, 0xd8 \
|
| + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
|
| + __asm vmovdqu xmm4, [eax] /* Y */ \
|
| + __asm vpermq ymm4, ymm4, 0xd8 \
|
| + __asm vpunpcklbw ymm4, ymm4, ymm4 \
|
| + __asm lea eax, [eax + 16] \
|
| + }
|
| +
|
| // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
|
| #define READYUY2_AVX2 __asm { \
|
| __asm vmovdqu ymm4, [eax] /* YUY2 */ \
|
| @@ -2365,6 +2383,41 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
|
| }
|
| #endif // HAS_NV12TOARGBROW_AVX2
|
|
|
| +#ifdef HAS_NV21TOARGBROW_AVX2
|
| +// 16 pixels.
|
| +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| +__declspec(naked)
|
| +void NV21ToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* vu_buf,
|
| + uint8* dst_argb,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push ebx
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // VU
|
| + mov edx, [esp + 8 + 12] // argb
|
| + mov ebx, [esp + 8 + 16] // yuvconstants
|
| + mov ecx, [esp + 8 + 20] // width
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READNV21_AVX2
|
| + YUVTORGB_AVX2(ebx)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop ebx
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_NV21TOARGBROW_AVX2
|
| +
|
| // 16 pixels.
|
| // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
|
| __declspec(naked)
|
| @@ -2608,6 +2661,16 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| __asm lea eax, [eax + 8] \
|
| }
|
|
|
| +// Read 4 VU from NV21, upsample to 8 UV.
|
| +#define READNV21 __asm { \
|
| + __asm movq xmm0, qword ptr [esi] /* UV */ \
|
| + __asm lea esi, [esi + 8] \
|
| + __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
|
| + __asm movq xmm4, qword ptr [eax] \
|
| + __asm punpcklbw xmm4, xmm4 \
|
| + __asm lea eax, [eax + 8] \
|
| + }
|
| +
|
| // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
|
| #define READYUY2 __asm { \
|
| __asm movdqu xmm4, [eax] /* YUY2 */ \
|
| @@ -3153,6 +3216,38 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
| }
|
|
|
| // 8 pixels.
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| +__declspec(naked)
|
| +void NV21ToARGBRow_SSSE3(const uint8* y_buf,
|
| + const uint8* vu_buf,
|
| + uint8* dst_argb,
|
| + struct YuvConstants* yuvconstants,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push ebx
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // VU
|
| + mov edx, [esp + 8 + 12] // argb
|
| + mov ebx, [esp + 8 + 16] // yuvconstants
|
| + mov ecx, [esp + 8 + 20] // width
|
| + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| +
|
| + convertloop:
|
| + READNV21
|
| + YUVTORGB(ebx)
|
| + STOREARGB
|
| +
|
| + sub ecx, 8
|
| + jg convertloop
|
| +
|
| + pop ebx
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// 8 pixels.
|
| // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
|
| __declspec(naked)
|
| void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
|
|
|