Index: source/row_win.cc
diff --git a/source/row_win.cc b/source/row_win.cc
index 13076ce604edf2532ab96571a4396fa91816b084..68f37f317b1964670c5441540905cb1f8ffcf952 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4065,7 +4065,10 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 #ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 __declspec(naked)
 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                          const uint8* alpha, uint8* dst, int width) {
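
The new comments give the blend math in both its unsigned and signed forms. A2, B2, and C2 are spreadsheet-style references for a src0 byte, a src1 byte, and the alpha byte; the signed form is what pmaddubsw actually evaluates, since that instruction multiplies unsigned bytes (the alpha pair) by signed bytes (the sources biased by -128). The two forms agree: expanding the signed one gives A2*C2 + B2*(255-C2) - 128*255 + 32768 + 127, and -128*255 + 32768 + 127 = 255. A minimal scalar sketch of both forms in plain C (illustrative only, not part of the patch):

#include <stdint.h>

// Unsigned form: =((A2*C2)+(B2*(255-C2))+255)/256
static uint8_t BlendUnsigned(uint8_t s0, uint8_t s1, uint8_t a) {
  return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
}

// Signed form: =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Biasing the sources by -128 maps them into the signed byte range that
// pmaddubsw expects; adding 32768 afterwards removes the bias, and the
// remaining +127 is the rounding term from the patch comment (the
// unsigned form folds both into the +255).
static uint8_t BlendSigned(uint8_t s0, uint8_t s1, uint8_t a) {
  int sum = (s0 - 128) * a + (s1 - 128) * (255 - a);
  return (uint8_t)((sum + 32768 + 127) >> 8);
}

Both functions return identical results for all inputs, which is easy to verify by brute force over the 2^24 byte combinations.
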
@@ -4116,8 +4119,11 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 #endif  // HAS_BLENDPLANEROW_SSSE3
 #ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 16 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 __declspec(naked)
 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
@@ -4141,27 +4147,30 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
     sub        edx, esi
     sub        edi, esi
-    // 16 pixel loop.
-  convertloop16:
-    vmovdqu    xmm0, [esi]        // alpha
-    vpermq     ymm0, ymm0, 0xd8
-    vpunpcklbw ymm0, ymm0, ymm0
+    // 32 pixel loop.
+  convertloop32:
+    vmovdqu    ymm0, [esi]        // alpha
+    vpunpckhbw ymm3, ymm0, ymm0   // 8..15, 24..31
+    vpunpcklbw ymm0, ymm0, ymm0   // 0..7, 16..23
+    vpxor      ymm3, ymm3, ymm5   // a, 255-a
     vpxor      ymm0, ymm0, ymm5   // a, 255-a
-    vmovdqu    xmm1, [eax + esi]  // src0
-    vmovdqu    xmm2, [edx + esi]  // src1
-    vpermq     ymm1, ymm1, 0xd8
-    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    ymm1, [eax + esi]  // src0
+    vmovdqu    ymm2, [edx + esi]  // src1
+    vpunpckhbw ymm4, ymm1, ymm2
     vpunpcklbw ymm1, ymm1, ymm2
+    vpsubb     ymm4, ymm4, ymm6   // bias src0/1 - 128
     vpsubb     ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpmaddubsw ymm3, ymm3, ymm4
     vpmaddubsw ymm0, ymm0, ymm1
+    vpaddw     ymm3, ymm3, ymm7   // unbias result - 32768 and round.
     vpaddw     ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpsrlw     ymm3, ymm3, 8
     vpsrlw     ymm0, ymm0, 8
-    vpackuswb  ymm0, ymm0, ymm0
-    vpermq     ymm0, ymm0, 0xd8
-    vmovdqu    [edi + esi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         convertloop16
+    vpackuswb  ymm0, ymm0, ymm3
+    vmovdqu    [edi + esi], ymm0
+    lea        esi, [esi + 32]
+    sub        ecx, 32
+    jg         convertloop32
     pop        edi
     pop        esi
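
Net effect of the loop change: the old 16-pixel body loaded xmm halves and needed four vpermq shuffles to undo the lane-crossing behaviour of vpunpcklbw/vpackuswb on ymm registers. The new body uses both vpunpcklbw and vpunpckhbw, so a full 32-byte ymm load is consumed per iteration and the unpack/pack pairing is lane-symmetric on its own, eliminating every vpermq. Judging by the comments, ymm5 holds 0xFF00 per word (turning duplicated alpha into (a, 255-a) pairs), ymm6 holds 0x80 per byte (the source bias), and ymm7 holds 0x807F per word (32768 + 127); their setup sits above this excerpt. A hypothetical AVX2 intrinsics rendering of the same 32-pixel iteration, as a sketch that assumes width is a multiple of 32 (the shipped code is MSVC inline asm):

#include <immintrin.h>
#include <stdint.h>

// Sketch only: mirrors the convertloop32 body, one iteration per 32 pixels.
static void BlendPlaneRow_AVX2_Sketch(const uint8_t* src0, const uint8_t* src1,
                                      const uint8_t* alpha, uint8_t* dst,
                                      int width) {
  const __m256i kFF00 = _mm256_set1_epi16((short)0xFF00);    // ymm5: (a, a) -> (a, 255-a)
  const __m256i kBias = _mm256_set1_epi8((char)0x80);        // ymm6: src - 128
  const __m256i kUnbias = _mm256_set1_epi16((short)0x807F);  // ymm7: +32768 and +127
  for (int x = 0; x < width; x += 32) {
    __m256i a = _mm256_loadu_si256((const __m256i*)(alpha + x));
    __m256i s0 = _mm256_loadu_si256((const __m256i*)(src0 + x));
    __m256i s1 = _mm256_loadu_si256((const __m256i*)(src1 + x));
    // Duplicate alpha into byte pairs, then flip the odd byte to 255-a.
    __m256i a_lo = _mm256_xor_si256(_mm256_unpacklo_epi8(a, a), kFF00);
    __m256i a_hi = _mm256_xor_si256(_mm256_unpackhi_epi8(a, a), kFF00);
    // Interleave the sources into (s0, s1) pairs and bias them to signed range.
    __m256i s_lo = _mm256_sub_epi8(_mm256_unpacklo_epi8(s0, s1), kBias);
    __m256i s_hi = _mm256_sub_epi8(_mm256_unpackhi_epi8(s0, s1), kBias);
    // vpmaddubsw: a*(s0-128) + (255-a)*(s1-128) in each 16-bit lane.
    __m256i lo = _mm256_maddubs_epi16(a_lo, s_lo);
    __m256i hi = _mm256_maddubs_epi16(a_hi, s_hi);
    // Unbias and round, then keep the high byte of each word.
    lo = _mm256_srli_epi16(_mm256_add_epi16(lo, kUnbias), 8);
    hi = _mm256_srli_epi16(_mm256_add_epi16(hi, kUnbias), 8);
    // The unpacklo/unpackhi + packus pair stays within 128-bit lanes,
    // so the bytes come back out in order with no permute.
    _mm256_storeu_si256((__m256i*)(dst + x), _mm256_packus_epi16(lo, hi));
  }
}

The asm also subtracts esi from edx and edi up front (visible at the top of the hunk) so that the single incrementing index esi addresses all four rows, which is why the loop tail is just lea/sub/jg.
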