Index: source/row_win.cc
diff --git a/source/row_win.cc b/source/row_win.cc
index e3353cabf04bbd2b9ac19823a9b7ed69af8b500e..13076ce604edf2532ab96571a4396fa91816b084 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -525,7 +525,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
     vmovd      xmm5, eax
     vbroadcastss ymm5, xmm5
     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
-    movd       xmm6, eax
+    vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
@@ -576,7 +576,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
     vmovd      xmm5, eax
     vbroadcastss ymm5, xmm5
     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
-    movd       xmm6, eax
+    vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
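
Note: the two hunks above make the same fix. `movd` is the legacy-SSE encoding; mixing it into an otherwise VEX-encoded AVX2 sequence can trigger AVX/SSE state-transition penalties on Intel cores, so the VEX form `vmovd` keeps the whole function in AVX state. The `vmovd` + `vbroadcastss` pair simply splats a 32-bit constant across a YMM register; a rough intrinsics equivalent, as a sketch only (the helper name is invented, not part of this CL):

    #include <immintrin.h>
    #include <stdint.h>

    // Hypothetical helper mirroring the vmovd + vbroadcastss pair above:
    // splat a 32-bit constant into all 8 dwords of a YMM register.
    static inline __m256i SplatU32(uint32_t v) {
      __m128 x = _mm_castsi128_ps(_mm_cvtsi32_si128((int)v));  // vmovd
      return _mm256_castps_si256(_mm256_broadcastss_ps(x));    // vbroadcastss
    }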
@@ -4106,7 +4106,7 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
     movq       qword ptr [edi + esi], xmm0
     lea        esi, [esi + 8]
     sub        ecx, 8
-    jge        convertloop8
+    jg         convertloop8

     pop        edi
     pop        esi
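
Note: the `jge` here was an off-by-one. After `sub ecx, 8` the count reaches 0 exactly when the last 8 pixels have been blended, but `jge` also branches on 0, so a width that is an exact multiple of 8 ran the loop one extra time and read/wrote 8 bytes past the end of the row; `jg` exits at 0. A scalar sketch of the corrected control flow (assuming width > 0, as the row functions require):

    int remaining = width;
    do {
      // ... blend 8 pixels ...
      remaining -= 8;         // sub ecx, 8
    } while (remaining > 0);  // jg: exit at 0; jge would iterate once more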
@@ -4115,6 +4115,62 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 }
 #endif  // HAS_BLENDPLANEROW_SSSE3

+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 16 pixels at a time.
+// =((G2*C2)+(H2*(D2))+32768+127)/256
+__declspec(naked)
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+                        const uint8* alpha, uint8* dst, int width) {
+  __asm {
+    push       esi
+    push       edi
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff00ff00
+    vpsllw     ymm5, ymm5, 8
+    mov        eax, 0x80808080   // 128 for biasing image to signed.
+    vmovd      xmm6, eax
+    vbroadcastss ymm6, xmm6
+    mov        eax, 0x807f807f   // 32768 + 127 for unbias and round.
+    vmovd      xmm7, eax
+    vbroadcastss ymm7, xmm7
+    mov        eax, [esp + 8 + 4]   // src0
+    mov        edx, [esp + 8 + 8]   // src1
+    mov        esi, [esp + 8 + 12]  // alpha
+    mov        edi, [esp + 8 + 16]  // dst
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        eax, esi
+    sub        edx, esi
+    sub        edi, esi
+
+    // 16 pixel loop.
+  convertloop16:
+    vmovdqu    xmm0, [esi]        // alpha
+    vpermq     ymm0, ymm0, 0xd8
+    vpunpcklbw ymm0, ymm0, ymm0
+    vpxor      ymm0, ymm0, ymm5   // a, 255-a
+    vmovdqu    xmm1, [eax + esi]  // src0
+    vmovdqu    xmm2, [edx + esi]  // src1
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklbw ymm1, ymm1, ymm2
+    vpsubb     ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpmaddubsw ymm0, ymm0, ymm1
+    vpaddw     ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpsrlw     ymm0, ymm0, 8
+    vpackuswb  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    vmovdqu    [edi + esi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         convertloop16
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_BLENDPLANEROW_AVX2
+
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
 static const uvec8 kShuffleAlpha = {
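
Note on the new AVX2 kernel: 16 alpha bytes are widened so each word holds the pair (a, 255 - a), built by `vpunpcklbw` duplicating each byte and `vpxor` with the 0xff00ff00 mask flipping the high copy. The interleaved src0/src1 bytes are biased by -128 (`vpsubb`) so that `vpmaddubsw`, which multiplies unsigned bytes by signed bytes, can compute a*(s0-128) + (255-a)*(s1-128) without overflowing a signed word. Adding 0x807f = 32768 + 127 then cancels the 128*255 = 32640 bias and supplies a +255 rounding term in one step, so after `vpsrlw` by 8 each output byte equals (s0*a + s1*(255-a) + 255) / 256. The `vpermq ..., 0xd8` shuffles reorder 64-bit quadwords to compensate for `vpunpcklbw` and `vpackuswb` operating within 128-bit lanes. A scalar model of one output pixel, as a reference sketch (not part of the CL):

    #include <stdint.h>

    // One blended pixel with the same rounding as the AVX2 path above.
    static uint8_t BlendPixel(uint8_t s0, uint8_t s1, uint8_t a) {
      return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
    }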