Index: source/row_win.cc
diff --git a/source/row_win.cc b/source/row_win.cc
index e3353cabf04bbd2b9ac19823a9b7ed69af8b500e..13076ce604edf2532ab96571a4396fa91816b084 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -525,7 +525,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
vmovd xmm5, eax
vbroadcastss ymm5, xmm5
mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- movd xmm6, eax
+ vmovd xmm6, eax
vbroadcastss ymm6, xmm6
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
@@ -576,7 +576,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
vmovd xmm5, eax
vbroadcastss ymm5, xmm5
mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- movd xmm6, eax
+ vmovd xmm6, eax
vbroadcastss ymm6, xmm6
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
@@ -4106,7 +4106,7 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
movq qword ptr [edi + esi], xmm0
lea esi, [esi + 8]
sub ecx, 8
- jge convertloop8
+ jg convertloop8

pop edi
pop esi
@@ -4115,6 +4115,81 @@
}
#endif // HAS_BLENDPLANEROW_SSSE3

+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 16 pixels at a time.
+// =((G2*C2)+(H2*(D2))+32768+127)/256
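+// i.e. dst = (src0 * a + src1 * (255 - a) + 255) / 256, with src0/src1
+// biased by 128 so the unsigned x signed vpmaddubsw can be used.
+// Scalar sketch of the same math:
+//   for (int i = 0; i < width; ++i) {
+//     dst[i] = (uint8)((src0[i] * alpha[i] +
+//                       src1[i] * (255 - alpha[i]) + 255) >> 8);
+//   }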
+__declspec(naked)
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) {
+ __asm {
+ push esi
+ push edi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
+ vpsllw ymm5, ymm5, 8
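+// (xor-ing the word-duplicated alpha with this mask below yields a, 255-a pairs)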
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
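+// vpmaddubsw multiplies unsigned bytes by signed bytes, so src0/src1 are
+// shifted into signed range with this 128 bias before the multiply.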
+ mov eax, 0x807f807f // 32768 + 127 for unbias and round.
+ vmovd xmm7, eax
+ vbroadcastss ymm7, xmm7
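+// Adding 0x807f (32768 + 127) to the biased sum equals adding 255 to the
+// unbiased sum (the bias contributes 128 * 255 = 32640), i.e. unbias and round.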
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
+ mov esi, [esp + 8 + 12] // alpha
+ mov edi, [esp + 8 + 16] // dst
+ mov ecx, [esp + 8 + 20] // width
+ sub eax, esi
+ sub edx, esi
+ sub edi, esi
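+// src0, src1 and dst are addressed relative to alpha so that advancing esi
+// steps through all four buffers at once.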
+
+ // 16 pixel loop.
+ convertloop16:
+ vmovdqu xmm0, [esi] // alpha
+ vpermq ymm0, ymm0, 0xd8
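+// vpunpcklbw works per 128-bit lane, so vpermq 0xd8 first moves alpha bytes
+// 8..15 into the upper lane, keeping all 16 alphas in order after the unpack.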
+ vpunpcklbw ymm0, ymm0, ymm0
+ vpxor ymm0, ymm0, ymm5 // a, 255-a
+ vmovdqu xmm1, [eax + esi] // src0
+ vmovdqu xmm2, [edx + esi] // src1
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklbw ymm1, ymm1, ymm2
+ vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
+ vpmaddubsw ymm0, ymm0, ymm1
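+// each 16-bit result is a * (src0 - 128) + (255 - a) * (src1 - 128)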
+ vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
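+// vpackuswb also packs per lane; vpermq 0xd8 gathers the 16 result bytes
+// into the low 128 bits for the store.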
+ vmovdqu [edi + esi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg convertloop16
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {