source/row_win.cc - Issue 2421993002: Port HalfFloatRow_SSE2 to AVX2 but not using F16C.

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Unified Diff: source/row_win.cc

Issue 2421993002: Port HalfFloatRow_SSE2 to AVX2 but not using F16C. (Closed)

Patch Set: disable f16 Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: source/row_win.cc

diff --git a/source/row_win.cc b/source/row_win.cc

index 9dc8055350bd680d1265a45ddd82e1245161b90b..ecbee30426ad082a2039734550082fe36d786096 100644

--- a/source/row_win.cc

+++ b/source/row_win.cc

@@ -6059,10 +6059,46 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {

__asm {

mov eax, [esp + 4] /* src */

mov edx, [esp + 8] /* dst */

+ movd xmm4, dword ptr [esp + 12] /* scale */

+ mov ecx, [esp + 16] /* width */

+ vmulss xmm4, xmm4, kExpBias

+ vbroadcastss ymm4, xmm4

+ vpxor ymm5, ymm5, ymm5

+ // 16 pixel loop.

+ convertloop:

+ vmovdqu ymm2, [eax] // 16 shorts

+ lea eax, [eax + 32]

+ vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints

+ vpunpcklwd ymm2, ymm2, ymm5

+ vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats

+ vcvtdq2ps ymm2, ymm2

+ vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.

+ vmulps ymm2, ymm2, ymm4

+ vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate

+ vpsrld ymm2, ymm2, 13

+ vpackssdw ymm2, ymm2, ymm3

+ vmovdqu [edx], ymm2

+ lea edx, [edx + 32]

+ sub ecx, 16

+ jg convertloop

+ vzeroupper

+ ret

+ }

+#endif // HAS_HALFFLOATROW_AVX2

+#ifdef HAS_HALFFLOATROW_F16C

+__declspec(naked)

+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {

+ __asm {

+ mov eax, [esp + 4] /* src */

+ mov edx, [esp + 8] /* dst */

vbroadcastss ymm4, [esp + 12] /* scale */

mov ecx, [esp + 16] /* width */

- // 8 pixel loop.

+ // 16 pixel loop.

convertloop:

vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints

vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts

@@ -6082,7 +6118,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {

ret

}

-#endif // HAS_HALFFLOATROW_AVX2

+#endif // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86

// Transform ARGB pixels with color table.

« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »