| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index b843998d89c0da59b6e41e3113dd96519db85d90..a8c16c3c1ef3a4c0ed99814f9c3898636213e504 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -1505,7 +1505,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| pmaddubsw xmm3, xmm6
|
| phaddw xmm0, xmm2
|
| phaddw xmm1, xmm3
|
| - paddw xmm0, xmm5 // +.5 rounding -> unsigned
|
| + paddw xmm0, xmm5 // +.5 rounding -> unsigned
|
| paddw xmm1, xmm5
|
| psraw xmm0, 8
|
| psraw xmm1, 8
|
| @@ -1590,6 +1590,73 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
| }
|
| #endif // HAS_ARGBTOUVROW_AVX2
|
|
|
| +#ifdef HAS_ARGBTOUVJROW_AVX2
|
| +__declspec(naked)  // naked: no compiler prologue/epilogue, the asm below manages the stack and returns itself
|
| +void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
| + uint8* dst_u, uint8* dst_v, int width) {  // each loop pass: 32x2 ARGB pixels in, 16 U bytes and 16 V bytes out
|
| + __asm {
|
| + push esi  // esi and edi are used below and restored before ret
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two registers pushed above)
|
| + mov esi, [esp + 8 + 8] // src_stride_argb
|
| + mov edx, [esp + 8 + 12] // dst_u
|
| + mov edi, [esp + 8 + 16] // dst_v
|
| + mov ecx, [esp + 8 + 20] // width
|
| + vbroadcastf128 ymm5, xmmword ptr kAddUV128  // ymm5 = bias added before the >> 8, copied to both 128-bit lanes
|
| + vbroadcastf128 ymm6, xmmword ptr kARGBToV  // ymm6 = V coefficients. NOTE(review): UVJ routine but table names have no J suffix - confirm intended
|
| + vbroadcastf128 ymm7, xmmword ptr kARGBToU  // ymm7 = U coefficients (same review note applies)
|
| + sub edi, edx // stride from u to v: edi = dst_v - dst_u, so [edx + edi] addresses V while only edx advances
|
| +
|
| + convertloop:
|
| + /* step 1 - subsample 32x2 argb pixels to 16x1 */
|
| + vmovdqu ymm0, [eax]  // 4 x 32 bytes = 128 bytes of the current row
|
| + vmovdqu ymm1, [eax + 32]
|
| + vmovdqu ymm2, [eax + 64]
|
| + vmovdqu ymm3, [eax + 96]
|
| + vpavgb ymm0, ymm0, [eax + esi]  // vertical average with the row src_stride_argb bytes away
|
| + vpavgb ymm1, ymm1, [eax + esi + 32]
|
| + vpavgb ymm2, ymm2, [eax + esi + 64]
|
| + vpavgb ymm3, ymm3, [eax + esi + 96]
|
| + lea eax, [eax + 128]  // advance source pointer past the 128 bytes just read
|
| + vshufps ymm4, ymm0, ymm1, 0x88  // 0x88: dwords 0 and 2 of each source, per 128-bit lane (even pixels)
|
| + vshufps ymm0, ymm0, ymm1, 0xdd  // 0xdd: dwords 1 and 3 of each source, per 128-bit lane (odd pixels)
|
| + vpavgb ymm0, ymm0, ymm4 // horizontal average of even/odd pixels; pixel order was permuted by vshufps
|
| + vshufps ymm4, ymm2, ymm3, 0x88  // same even/odd split for the second half of the pixels
|
| + vshufps ymm2, ymm2, ymm3, 0xdd
|
| + vpavgb ymm2, ymm2, ymm4 // horizontal average of even/odd pixels; pixel order was permuted by vshufps
|
| +
|
| + // step 2 - convert to U and V
|
| + // from here down is very similar to Y code except
|
| + // instead of 32 different pixels, it's 16 pixels of U and 16 of V
|
| + vpmaddubsw ymm1, ymm0, ymm7 // U: written to ymm1/ymm3 first because ymm0/ymm2 are still needed for V
|
| + vpmaddubsw ymm3, ymm2, ymm7
|
| + vpmaddubsw ymm0, ymm0, ymm6 // V: overwrites the averaged pixels in ymm0/ymm2
|
| + vpmaddubsw ymm2, ymm2, ymm6
|
| + vphaddw ymm1, ymm1, ymm3 // adds adjacent word pairs into one sum per pixel; works per 128-bit lane, so order is permuted again
|
| + vphaddw ymm0, ymm0, ymm2
|
| + vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
|
| + vpaddw ymm0, ymm0, ymm5
|
| + vpsraw ymm1, ymm1, 8  // arithmetic shift right by 8 on each 16-bit sum
|
| + vpsraw ymm0, ymm0, 8
|
| + vpacksswb ymm0, ymm1, ymm0 // words -> bytes with signed saturation; each 128-bit lane becomes 8 U bytes then 8 V bytes
|
| + vpermq ymm0, ymm0, 0xd8 // For vpacksswb: qword order 0,2,1,3 puts all U in the low 128 bits and all V in the high
|
| + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
|
| +
|
| + // step 3 - store 16 U and 16 V values
|
| + vextractf128 [edx], ymm0, 0 // U
|
| + vextractf128 [edx + edi], ymm0, 1 // V
|
| + lea edx, [edx + 16]  // advance dst_u; the V address follows through edi
|
| + sub ecx, 32  // 32 source pixels consumed per pass
|
| + jg convertloop  // loop while pixels remain; a full 32-pixel pass always runs, so width is presumably a multiple of 32 - TODO confirm with callers
|
| +
|
| + pop edi
|
| + pop esi
|
| + vzeroupper  // clear upper halves of the ymm registers before returning to non-AVX code
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_ARGBTOUVJROW_AVX2
|
| +
|
| __declspec(naked)
|
| void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
|
|