| Index: source/row_win.cc
|
| diff --git a/source/row_win.cc b/source/row_win.cc
|
| index 084fc0444444e057ebcd62ea7f32f83e0ebcc933..494043c626f3107588c067accec0fce0ad4d4386 100644
|
| --- a/source/row_win.cc
|
| +++ b/source/row_win.cc
|
| @@ -5571,12 +5571,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| cmp eax, 0
|
| je xloop100 // 0 / 128. Blend 100 / 0.
|
| sub edi, esi
|
| - cmp eax, 32
|
| - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
|
| cmp eax, 64
|
| je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
|
| - cmp eax, 96
|
| - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
|
|
|
| vmovd xmm0, eax // high fraction 0..127
|
| neg eax
|
| @@ -5587,6 +5583,10 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| vpxor ymm0, ymm0, ymm0
|
| vpermd ymm5, ymm0, ymm5
|
|
|
| + mov eax, 0x00400040 // 64 in each 16-bit half: rounding bias (half of 128) added before the >> 7.
|
| + vmovd xmm4, eax
|
| + vbroadcastss ymm4, xmm4 // replicate the dword so every 16-bit lane of ymm4 holds 64.
|
| +
|
| xloop:
|
| vmovdqu ymm0, [esi]
|
| vmovdqu ymm2, [esi + edx]
|
| @@ -5594,6 +5594,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| vpunpcklbw ymm0, ymm0, ymm2 // mutates
|
| vpmaddubsw ymm0, ymm0, ymm5
|
| vpmaddubsw ymm1, ymm1, ymm5
|
| + vpaddw ymm0, ymm0, ymm4
|
| + vpaddw ymm1, ymm1, ymm4
|
| vpsrlw ymm0, ymm0, 7
|
| vpsrlw ymm1, ymm1, 7
|
| vpackuswb ymm0, ymm0, ymm1 // unmutates
|
| @@ -5603,18 +5605,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| jg xloop
|
| jmp xloop99
|
|
|
| - // Blend 25 / 75.
|
| - xloop25:
|
| - vmovdqu ymm0, [esi]
|
| - vmovdqu ymm1, [esi + edx]
|
| - vpavgb ymm0, ymm0, ymm1
|
| - vpavgb ymm0, ymm0, ymm1
|
| - vmovdqu [esi + edi], ymm0
|
| - lea esi, [esi + 32]
|
| - sub ecx, 32
|
| - jg xloop25
|
| - jmp xloop99
|
| -
|
| // Blend 50 / 50.
|
| xloop50:
|
| vmovdqu ymm0, [esi]
|
| @@ -5625,18 +5615,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| jg xloop50
|
| jmp xloop99
|
|
|
| - // Blend 75 / 25.
|
| - xloop75:
|
| - vmovdqu ymm1, [esi]
|
| - vmovdqu ymm0, [esi + edx]
|
| - vpavgb ymm0, ymm0, ymm1
|
| - vpavgb ymm0, ymm0, ymm1
|
| - vmovdqu [esi + edi], ymm0
|
| - lea esi, [esi + 32]
|
| - sub ecx, 32
|
| - jg xloop75
|
| - jmp xloop99
|
| -
|
| // Blend 100 / 0 - Copy row unchanged.
|
| xloop100:
|
| rep movsb
|
| @@ -5668,12 +5646,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| // Dispatch to specialized filters if applicable.
|
| cmp eax, 0
|
| je xloop100 // 0 / 128. Blend 100 / 0.
|
| - cmp eax, 32
|
| - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
|
| cmp eax, 64
|
| je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
|
| - cmp eax, 96
|
| - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
|
|
|
| movd xmm0, eax // high fraction 0..127
|
| neg eax
|
| @@ -5683,6 +5657,10 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| punpcklwd xmm5, xmm5
|
| pshufd xmm5, xmm5, 0
|
|
|
| + mov eax, 0x00400040 // 64 in each 16-bit half: rounding bias (half of 128) added before the >> 7.
|
| + movd xmm4, eax
|
| + pshufd xmm4, xmm4, 0x00 // replicate dword 0 so every 16-bit lane of xmm4 holds 64.
|
| +
|
| xloop:
|
| movdqu xmm0, [esi]
|
| movdqu xmm2, [esi + edx]
|
| @@ -5691,6 +5669,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| punpckhbw xmm1, xmm2
|
| pmaddubsw xmm0, xmm5
|
| pmaddubsw xmm1, xmm5
|
| + paddw xmm0, xmm4
|
| + paddw xmm1, xmm4
|
| psrlw xmm0, 7
|
| psrlw xmm1, 7
|
| packuswb xmm0, xmm1
|
| @@ -5700,18 +5680,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| jg xloop
|
| jmp xloop99
|
|
|
| - // Blend 25 / 75.
|
| - xloop25:
|
| - movdqu xmm0, [esi]
|
| - movdqu xmm1, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - movdqu [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| - jg xloop25
|
| - jmp xloop99
|
| -
|
| // Blend 50 / 50.
|
| xloop50:
|
| movdqu xmm0, [esi]
|
| @@ -5723,125 +5691,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| jg xloop50
|
| jmp xloop99
|
|
|
| - // Blend 75 / 25.
|
| - xloop75:
|
| - movdqu xmm1, [esi]
|
| - movdqu xmm0, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - movdqu [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| - jg xloop75
|
| - jmp xloop99
|
| -
|
| - // Blend 100 / 0 - Copy row unchanged.
|
| - xloop100:
|
| - movdqu xmm0, [esi]
|
| - movdqu [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| - jg xloop100
|
| -
|
| - xloop99:
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -#ifdef HAS_INTERPOLATEROW_SSE2
|
| -// Bilinear filter 16x2 -> 16x1
|
| -__declspec(naked)
|
| -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| - ptrdiff_t src_stride, int dst_width,
|
| - int source_y_fraction) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov edi, [esp + 8 + 4] // dst_ptr
|
| - mov esi, [esp + 8 + 8] // src_ptr
|
| - mov edx, [esp + 8 + 12] // src_stride
|
| - mov ecx, [esp + 8 + 16] // dst_width
|
| - mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
| - sub edi, esi
|
| - // Dispatch to specialized filters if applicable.
|
| - cmp eax, 0
|
| - je xloop100 // 0 / 256. Blend 100 / 0.
|
| - cmp eax, 64
|
| - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
|
| - cmp eax, 128
|
| - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
|
| - cmp eax, 192
|
| - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
|
| -
|
| - movd xmm5, eax // xmm5 = y fraction
|
| - punpcklbw xmm5, xmm5
|
| - psrlw xmm5, 1
|
| - punpcklwd xmm5, xmm5
|
| - punpckldq xmm5, xmm5
|
| - punpcklqdq xmm5, xmm5
|
| - pxor xmm4, xmm4
|
| -
|
| - xloop:
|
| - movdqu xmm0, [esi] // row0
|
| - movdqu xmm2, [esi + edx] // row1
|
| - movdqu xmm1, xmm0
|
| - movdqu xmm3, xmm2
|
| - punpcklbw xmm2, xmm4
|
| - punpckhbw xmm3, xmm4
|
| - punpcklbw xmm0, xmm4
|
| - punpckhbw xmm1, xmm4
|
| - psubw xmm2, xmm0 // row1 - row0
|
| - psubw xmm3, xmm1
|
| - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
|
| - paddw xmm3, xmm3
|
| - pmulhw xmm2, xmm5 // scale diff
|
| - pmulhw xmm3, xmm5
|
| - paddw xmm0, xmm2 // sum rows
|
| - paddw xmm1, xmm3
|
| - packuswb xmm0, xmm1
|
| - movdqu [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| - jg xloop
|
| - jmp xloop99
|
| -
|
| - // Blend 25 / 75.
|
| - xloop25:
|
| - movdqu xmm0, [esi]
|
| - movdqu xmm1, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - movdqu [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| - jg xloop25
|
| - jmp xloop99
|
| -
|
| - // Blend 50 / 50.
|
| - xloop50:
|
| - movdqu xmm0, [esi]
|
| - movdqu xmm1, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - movdqu [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| - jg xloop50
|
| - jmp xloop99
|
| -
|
| - // Blend 75 / 25.
|
| - xloop75:
|
| - movdqu xmm1, [esi]
|
| - movdqu xmm0, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - movdqu [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - sub ecx, 16
|
| - jg xloop75
|
| - jmp xloop99
|
| -
|
| // Blend 100 / 0 - Copy row unchanged.
|
| xloop100:
|
| movdqu xmm0, [esi]
|
| @@ -5856,7 +5705,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| ret
|
| }
|
| }
|
| -#endif // HAS_INTERPOLATEROW_SSE2
|
|
|
| // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
|
| __declspec(naked)
|
|
|