| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index 13076ce604edf2532ab96571a4396fa91816b084..0a28430081f0df89d4b2778b99e8880c86b25f62 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -4141,26 +4141,29 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
|
| sub edx, esi
|
| sub edi, esi
|
|
|
| - // 16 pixel loop.
|
| + // 32 pixel loop.
|
| convertloop16:
|
| - vmovdqu xmm0, [esi] // alpha
|
| - vpermq ymm0, ymm0, 0xd8
|
| - vpunpcklbw ymm0, ymm0, ymm0
|
| + vmovdqu ymm0, [esi] // alpha
|
| + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
|
| + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
|
| + vpxor ymm3, ymm3, ymm5 // a, 255-a
|
| vpxor ymm0, ymm0, ymm5 // a, 255-a
|
| - vmovdqu xmm1, [eax + esi] // src0
|
| - vmovdqu xmm2, [edx + esi] // src1
|
| - vpermq ymm1, ymm1, 0xd8
|
| - vpermq ymm2, ymm2, 0xd8
|
| + vmovdqu ymm1, [eax + esi] // src0
|
| + vmovdqu ymm2, [edx + esi] // src1
|
| + vpunpckhbw ymm4, ymm1, ymm2
|
| vpunpcklbw ymm1, ymm1, ymm2
|
| + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
|
| vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
|
| + vpmaddubsw ymm3, ymm3, ymm4
|
| vpmaddubsw ymm0, ymm0, ymm1
|
| + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
|
| vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
|
| + vpsrlw ymm3, ymm3, 8
|
| vpsrlw ymm0, ymm0, 8
|
| - vpackuswb ymm0, ymm0, ymm0
|
| - vpermq ymm0, ymm0, 0xd8
|
| - vmovdqu [edi + esi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| + vpackuswb ymm0, ymm0, ymm3
|
| + vmovdqu [edi + esi], ymm0
|
| + lea esi, [esi + 32]
|
| + sub ecx, 32
|
| jg convertloop16
|
|
|
| pop edi
|
|
|