| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index d54f05e29e6b3e4a33918293d2f442d1aacc65b3..baf6c940aa638a63d047ab1343072d615897632f 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -6095,13 +6095,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
| }
|
| #endif // HAS_ARGBPOLYNOMIALROW_AVX2
|
|
|
| -// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor
|
| -// adjust the sample range to 0 to 1 using a float multiply.
|
| -// e.g. 9 bit scale is 1.0f / 512.0f
|
| -// e.g. 10 bit scale is 1.0f / 1024.0f
|
| -#ifdef HAS_SHORTTOHALFFLOAT_AVX2
|
| +#ifdef HAS_HALFFLOATROW_AVX2
|
| __declspec(naked)
|
| -void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
|
| +void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
| __asm {
|
| mov eax, [esp + 4] /* src */
|
| mov edx, [esp + 8] /* dst */
|
| @@ -6111,19 +6107,24 @@ void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
|
| - // 8 pixel loop.
|
| + // 16 pixel loop.
|
| convertloop:
|
| vpmovzxwd ymm0, xmmword ptr [eax] // 8 shorts -> 8 ints
|
| - lea eax, [eax + 16]
|
| + vpmovzxwd ymm1, xmmword ptr [eax + 16] // 8 more shorts
|
| + lea eax, [eax + 32]
|
| vcvtdq2ps ymm0, ymm0 // convert 8 ints to floats
|
| + vcvtdq2ps ymm1, ymm1
|
| vmulps ymm0, ymm0, ymm4 // scale to normalized range 0 to 1
|
| - vcvtps2ph xmm0, ymm0, 0 // float conver to 8 half floats round even
|
| + vmulps ymm1, ymm1, ymm4
|
| + vcvtps2ph xmm0, ymm0, 3 // convert 8 floats to 8 half floats, imm8 = 3: truncate (round toward zero)
|
| + vcvtps2ph xmm1, ymm1, 3
|
| vmovdqu [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - sub ecx, 8
|
| + vmovdqu [edx + 16], xmm1
|
| + lea edx, [edx + 32]
|
| + sub ecx, 16
|
| jg convertloop
|
| vzeroupper
|
| ret
|
| }
|
| }
|
| -#endif // HAS_SHORTTOHALFFLOAT_AVX2
|
| +#endif // HAS_HALFFLOATROW_AVX2
|
|
|
| #ifdef HAS_ARGBCOLORTABLEROW_X86
|
| // Tranform ARGB pixels with color table.
|
|
|