source/row_win.cc - Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at a time.

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Unified Diff: source/row_win.cc

Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at a time. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: gcc port of avx2 that does 32 pixels (Created 5 years ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: source/row_win.cc

diff --git a/source/row_win.cc b/source/row_win.cc

index 13076ce604edf2532ab96571a4396fa91816b084..0a28430081f0df89d4b2778b99e8880c86b25f62 100644

--- a/source/row_win.cc

+++ b/source/row_win.cc

@@ -4141,26 +4141,29 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,

sub edx, esi

sub edi, esi

- // 16 pixel loop.

+ // 32 pixel loop.

convertloop16:

- vmovdqu xmm0, [esi] // alpha

- vpermq ymm0, ymm0, 0xd8

- vpunpcklbw ymm0, ymm0, ymm0

+ vmovdqu ymm0, [esi] // alpha

+ vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31

+ vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23

+ vpxor ymm3, ymm3, ymm5 // a, 255-a

vpxor ymm0, ymm0, ymm5 // a, 255-a

- vmovdqu xmm1, [eax + esi] // src0

- vmovdqu xmm2, [edx + esi] // src1

- vpermq ymm1, ymm1, 0xd8

- vpermq ymm2, ymm2, 0xd8

+ vmovdqu ymm1, [eax + esi] // src0

+ vmovdqu ymm2, [edx + esi] // src1

+ vpunpckhbw ymm4, ymm1, ymm2

vpunpcklbw ymm1, ymm1, ymm2

+ vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128

vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128

+ vpmaddubsw ymm3, ymm3, ymm4

vpmaddubsw ymm0, ymm0, ymm1

+ vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.

vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.

+ vpsrlw ymm3, ymm3, 8

vpsrlw ymm0, ymm0, 8

- vpackuswb ymm0, ymm0, ymm0

- vpermq ymm0, ymm0, 0xd8

- vmovdqu [edi + esi], xmm0

- lea esi, [esi + 16]

- sub ecx, 16

+ vpackuswb ymm0, ymm0, ymm3

+ vmovdqu [edi + esi], ymm0

+ lea esi, [esi + 32]

+ sub ecx, 32

jg convertloop16

pop edi

« source/cpu_id.cc ('K') | « source/row_gcc.cc ('k') | no next file » | no next file with comments »