Index: source/row_win.cc
diff --git a/source/row_win.cc b/source/row_win.cc
index baf6c940aa638a63d047ab1343072d615897632f..d2da0e43953c1ee02ddc68716cb2fb8a5baa7281 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6095,6 +6095,42 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
+#ifdef HAS_HALFFLOATROW_SSE2
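+// kExpBias is 2^-112.  Multiplying the scaled value by it compensates for the
+// difference between the float exponent bias (127) and the half float bias
+// (15), so that shifting the float bit pattern right by 13 (the difference in
+// mantissa width, 23 vs 10 bits) yields the IEEE half float encoding with the
+// mantissa truncated rather than rounded.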
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked)
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+  __asm {
+    mov        eax, [esp + 4]    /* src */
+    mov        edx, [esp + 8]    /* dst */
+    movd       xmm4, dword ptr [esp + 12]    /* scale */
+    mov        ecx, [esp + 16]   /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+
+    // 8 pixel loop.
+ convertloop:
+    movdqu     xmm2, xmmword ptr [eax]  // 8 shorts
+    lea        eax, [eax + 16]
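+    // zero-extend the 8 shorts to ints: low 4 into xmm2, high 4 into xmm3.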
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm5
+    cvtdq2ps   xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd  xmm3, xmm5
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
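+    // shift the float bit patterns right by 13 and pack the low halves into 8
+    // half floats (mantissa truncated, not rounded).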
+    psrld      xmm2, 13
+    psrld      xmm3, 13
+    packssdw   xmm2, xmm3
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
 #ifdef HAS_HALFFLOATROW_AVX2
 __declspec(naked)
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
@@ -6106,17 +6142,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     // 8 pixel loop.
  convertloop:
-    vpmovzxwd  ymm0, xmmword ptr [eax]  // 8 shorts -> 8 ints
-    vpmovzxwd  ymm1, xmmword ptr [eax + 16]  // 8 more shorts
+    vpmovzxwd  ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    vpmovzxwd  ymm3, xmmword ptr [eax + 16]  // 8 more shorts
     lea        eax, [eax + 32]
-    vcvtdq2ps  ymm0, ymm0  // convert 8 ints to floats
-    vcvtdq2ps  ymm1, ymm1
-    vmulps     ymm0, ymm0, ymm4  // scale to normalized range 0 to 1
-    vmulps     ymm1, ymm1, ymm4
-    vcvtps2ph  xmm0, ymm0, 3  // float convert to 8 half floats truncate
-    vcvtps2ph  xmm1, ymm1, 3
-    vmovdqu    [edx], xmm0
-    vmovdqu    [edx + 16], xmm1
+    vcvtdq2ps  ymm2, ymm2  // convert 8 ints to floats
+    vcvtdq2ps  ymm3, ymm3
+    vmulps     ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
+    vmulps     ymm3, ymm3, ymm4
+    vcvtps2ph  xmm2, ymm2, 3  // float convert to 8 half floats truncate
+    vcvtps2ph  xmm3, ymm3, 3
+    vmovdqu    [edx], xmm2
+    vmovdqu    [edx + 16], xmm3
     lea        edx, [edx + 32]
     sub        ecx, 16
     jg         convertloop