Index: source/row_win.cc
diff --git a/source/row_win.cc b/source/row_win.cc
index baf6c940aa638a63d047ab1343072d615897632f..d2da0e43953c1ee02ddc68716cb2fb8a5baa7281 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6095,6 +6095,42 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked)
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src */
+    mov        edx, [esp + 8]   /* dst */
+    movd       xmm4, dword ptr [esp + 12]  /* scale */
+    mov        ecx, [esp + 16]  /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+
+    // 8 pixel loop.
+ convertloop:
+    movdqu     xmm2, xmmword ptr [eax]  // 8 shorts
+    lea        eax, [eax + 16]
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm5
+    cvtdq2ps   xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd  xmm3, xmm5
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
+    psrld      xmm2, 13
+    psrld      xmm3, 13
+    packssdw   xmm2, xmm3
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
 #ifdef HAS_HALFFLOATROW_AVX2
 __declspec(naked)
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
@@ -6106,17 +6142,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
 
     // 8 pixel loop.
  convertloop:
-    vpmovzxwd  ymm0, xmmword ptr [eax]  // 8 shorts -> 8 ints
-    vpmovzxwd  ymm1, xmmword ptr [eax + 16]  // 8 more shorts
+    vpmovzxwd  ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    vpmovzxwd  ymm3, xmmword ptr [eax + 16]  // 8 more shorts
     lea        eax, [eax + 32]
-    vcvtdq2ps  ymm0, ymm0  // convert 8 ints to floats
-    vcvtdq2ps  ymm1, ymm1
-    vmulps     ymm0, ymm0, ymm4  // scale to normalized range 0 to 1
-    vmulps     ymm1, ymm1, ymm4
-    vcvtps2ph  xmm0, ymm0, 3  // float convert to 8 half floats truncate
-    vcvtps2ph  xmm1, ymm1, 3
-    vmovdqu    [edx], xmm0
-    vmovdqu    [edx + 16], xmm1
+    vcvtdq2ps  ymm2, ymm2  // convert 8 ints to floats
+    vcvtdq2ps  ymm3, ymm3
+    vmulps     ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
+    vmulps     ymm3, ymm3, ymm4
+    vcvtps2ph  xmm2, ymm2, 3  // float convert to 8 half floats truncate
+    vcvtps2ph  xmm3, ymm3, 3
+    vmovdqu    [edx], xmm2
+    vmovdqu    [edx + 16], xmm3
     lea        edx, [edx + 32]
     sub        ecx, 16
     jg         convertloop