| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index aa94487c12fb7d3010010c96bae3b1be7ea427ef..e3353cabf04bbd2b9ac19823a9b7ed69af8b500e 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -4063,6 +4063,58 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
|
| }
|
| #endif // HAS_YUY2TOYROW_SSE2
|
|
|
| +#ifdef HAS_BLENDPLANEROW_SSSE3
|
| +// Blend 8 pixels per loop iteration; width is consumed in steps of 8.
|
| +// dst = (a*(src0-128) + (255-a)*(src1-128) + 32768 + 127) >> 8, in 16 bits.
|
| +__declspec(naked)
|
| +void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
|
| + const uint8* alpha, uint8* dst, int width) {
|
| + __asm {
|
| + push esi // esi/edi are restored before ret.
|
| + push edi
|
| + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
|
| + psllw xmm5, 8 // each word is now 0xff00.
|
| + mov eax, 0x80808080 // 128 for biasing image to signed.
|
| + movd xmm6, eax
|
| + pshufd xmm6, xmm6, 0x00 // broadcast the bias to all lanes.
|
| +
|
| + mov eax, 0x807f807f // 32768 + 127 for unbias and round.
|
| + movd xmm7, eax
|
| + pshufd xmm7, xmm7, 0x00 // broadcast the rounding term to all lanes.
|
| + mov eax, [esp + 8 + 4] // src0 (8 bytes of pushed registers above)
|
| + mov edx, [esp + 8 + 8] // src1
|
| + mov esi, [esp + 8 + 12] // alpha
|
| + mov edi, [esp + 8 + 16] // dst
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub eax, esi // src0 becomes an offset from alpha.
|
| + sub edx, esi // src1 becomes an offset from alpha.
|
| + sub edi, esi // dst becomes an offset from alpha.
|
| +
|
| + // 8 pixel loop.
|
| + convertloop8:
|
| + movq xmm0, qword ptr [esi] // alpha
|
| + punpcklbw xmm0, xmm0 // duplicate each alpha byte: a, a
|
| + pxor xmm0, xmm5 // a, 255-a
|
| + movq xmm1, qword ptr [eax + esi] // src0
|
| + movq xmm2, qword ptr [edx + esi] // src1
|
| + punpcklbw xmm1, xmm2 // interleave: src0, src1
|
| + psubb xmm1, xmm6 // bias src0/1 - 128
|
| + pmaddubsw xmm0, xmm1 // a*(src0-128) + (255-a)*(src1-128)
|
| + paddw xmm0, xmm7 // unbias result - 32768 and round.
|
| + psrlw xmm0, 8 // divide by 256.
|
| + packuswb xmm0, xmm0 // 8 words -> 8 bytes.
|
| + movq qword ptr [edi + esi], xmm0 // store 8 blended pixels to dst.
|
| + lea esi, [esi + 8] // advance alpha; offsets follow it.
|
| + sub ecx, 8
|
| + jg convertloop8 // jg, not jge: stop when ecx reaches 0.
|
| +
|
| + pop edi
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_BLENDPLANEROW_SSSE3
|
| +
|
| #ifdef HAS_ARGBBLENDROW_SSSE3
|
| // Shuffle table for isolating alpha.
|
| static const uvec8 kShuffleAlpha = {
|
|
|