| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index 9dc8055350bd680d1265a45ddd82e1245161b90b..ecbee30426ad082a2039734550082fe36d786096 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -6059,10 +6059,46 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
| __asm {
|
| mov eax, [esp + 4] /* src */
|
| mov edx, [esp + 8] /* dst */
|
| + movd xmm4, dword ptr [esp + 12] /* scale */
|
| + mov ecx, [esp + 16] /* width */
|
| +
|
| + vmulss xmm4, xmm4, kExpBias
|
| + vbroadcastss ymm4, xmm4
|
| + vpxor ymm5, ymm5, ymm5
|
| +
|
| + // 16 pixel loop.
|
| + convertloop:
|
| + vmovdqu ymm2, [eax] // 16 shorts
|
| + lea eax, [eax + 32]
|
| + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
|
| + vpunpcklwd ymm2, ymm2, ymm5
|
| + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
|
| + vcvtdq2ps ymm2, ymm2
|
| + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
|
| + vmulps ymm2, ymm2, ymm4
|
| + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
|
| + vpsrld ymm2, ymm2, 13
|
| + vpackssdw ymm2, ymm2, ymm3
|
| + vmovdqu [edx], ymm2
|
| + lea edx, [edx + 32]
|
| + sub ecx, 16
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_HALFFLOATROW_AVX2
|
| +
|
| +#ifdef HAS_HALFFLOATROW_F16C
|
| +__declspec(naked)
|
| +void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
|
| + __asm {
|
| + mov eax, [esp + 4] /* src */
|
| + mov edx, [esp + 8] /* dst */
|
| vbroadcastss ymm4, [esp + 12] /* scale */
|
| mov ecx, [esp + 16] /* width */
|
|
|
| - // 8 pixel loop.
|
| + // 16 pixel loop.
|
| convertloop:
|
| vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
|
| vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
|
| @@ -6082,7 +6118,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
| ret
|
| }
|
| }
|
| -#endif // HAS_HALFFLOATROW_AVX2
|
| +#endif // HAS_HALFFLOATROW_F16C
|
|
|
| #ifdef HAS_ARGBCOLORTABLEROW_X86
|
| // Tranform ARGB pixels with color table.
|
|
|