Index: source/row_win.cc |
diff --git a/source/row_win.cc b/source/row_win.cc |
index d54f05e29e6b3e4a33918293d2f442d1aacc65b3..baf6c940aa638a63d047ab1343072d615897632f 100644 |
--- a/source/row_win.cc |
+++ b/source/row_win.cc |
@@ -6095,13 +6095,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
} |
#endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
-// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor |
-// adjust the sample range to 0 to 1 using a float multiply. |
-// e.g. 9 bit scale is 1.0f / 512.0f |
-// e.g. 10 bit scale is 1.0f / 1024.0f |
-#ifdef HAS_SHORTTOHALFFLOAT_AVX2 |
+#ifdef HAS_HALFFLOATROW_AVX2 |
__declspec(naked) |
-void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) { |
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { |
__asm { |
mov eax, [esp + 4] /* src */ |
mov edx, [esp + 8] /* dst */ |
@@ -6111,19 +6107,24 @@ void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) { |
- // 8 pixel loop. |
+ // 16 pixel loop. |
convertloop: |
vpmovzxwd ymm0, xmmword ptr [eax] // 8 shorts -> 8 ints |
- lea eax, [eax + 16] |
+ vpmovzxwd ymm1, xmmword ptr [eax + 16] // 8 more shorts |
+ lea eax, [eax + 32] |
vcvtdq2ps ymm0, ymm0 // convert 8 ints to floats |
+ vcvtdq2ps ymm1, ymm1 |
vmulps ymm0, ymm0, ymm4 // scale to normalized range 0 to 1 |
- vcvtps2ph xmm0, ymm0, 0 // float conver to 8 half floats round even |
+ vmulps ymm1, ymm1, ymm4 |
+ vcvtps2ph xmm0, ymm0, 3 // float convert to 8 half floats truncate |
+ vcvtps2ph xmm1, ymm1, 3 |
vmovdqu [edx], xmm0 |
- lea edx, [edx + 16] |
- sub ecx, 8 |
+ vmovdqu [edx + 16], xmm1 |
+ lea edx, [edx + 32] |
+ sub ecx, 16 |
jg convertloop |
vzeroupper |
ret |
} |
} |
-#endif // HAS_SHORTTOHALFFLOAT_AVX2 |
+#endif // HAS_HALFFLOATROW_AVX2 |
#ifdef HAS_ARGBCOLORTABLEROW_X86 |
-// Tranform ARGB pixels with color table. |
+// Transform ARGB pixels with color table. |