| Index: source/libvpx/third_party/libyuv/source/scale_win.cc
|
| diff --git a/source/libvpx/third_party/libyuv/source/scale_win.cc b/source/libvpx/third_party/libyuv/source/scale_win.cc
|
| index e0209cdec8c243d1b06dd4159c6f7c553b380798..c3896ebad2fd89869118c088f90bfe4c36dd9046 100644
|
| --- a/source/libvpx/third_party/libyuv/source/scale_win.cc
|
| +++ b/source/libvpx/third_party/libyuv/source/scale_win.cc
|
| @@ -9,6 +9,7 @@
|
| */
|
|
|
| #include "libyuv/row.h"
|
| +#include "libyuv/scale_row.h"
|
|
|
| #ifdef __cplusplus
|
| namespace libyuv {
|
| @@ -16,7 +17,8 @@ extern "C" {
|
| #endif
|
|
|
| // This module is for Visual C x86.
|
| -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
| +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
|
| + defined(_MSC_VER) && !defined(__clang__)
|
|
|
| // Offsets for source bytes 0 to 9
|
| static uvec8 kShuf0 =
|
| @@ -93,8 +95,7 @@ static uvec16 kScaleAb2 =
|
| { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
|
|
|
| // Reads 32 pixels, throws half away and writes 16 pixels.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| @@ -120,8 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| }
|
|
|
| // Blends 32x1 rectangle to 16x1.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| @@ -157,8 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| }
|
|
|
| // Blends 32x2 rectangle to 16x1.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| @@ -199,9 +198,116 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| }
|
| }
|
|
|
| +#ifdef HAS_SCALEROWDOWN2_AVX2
|
| +// Reads 64 pixels, throws half away and writes 32 pixels.
|
| +__declspec(naked)
|
| +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| +
|
| + wloop:
|
| + vmovdqu ymm0, [eax]
|
| + vmovdqu ymm1, [eax + 32]
|
| + lea eax, [eax + 64]
|
| + vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
|
| + vpsrlw ymm1, ymm1, 8
|
| + vpackuswb ymm0, ymm0, ymm1
|
| + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
| + vmovdqu [edx], ymm0
|
| + lea edx, [edx + 32]
|
| + sub ecx, 32
|
| + jg wloop
|
| +
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
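A note on the vpermq ymm0, ymm0, 0xd8 step that follows each vpackuswb in the AVX2 kernels: the 256-bit pack operates independently on the two 128-bit lanes, so the packed halves of the two inputs come out interleaved per lane, and permuting the 64-bit quadwords with immediate 0xd8 (order 0, 2, 1, 3) restores sequential order. A minimal intrinsics sketch of the idiom (illustrative only, not part of the patch):

    #include <immintrin.h>

    /* Illustrative sketch: pack two vectors of 16-bit words down to bytes and
     * undo the per-lane interleave of vpackuswb, as the AVX2 kernels above do. */
    static __m256i PackWordsToBytesSketch(__m256i lo_words, __m256i hi_words) {
      __m256i packed = _mm256_packus_epi16(lo_words, hi_words); /* lane-wise pack */
      return _mm256_permute4x64_epi64(packed, 0xd8);            /* "unmutate" */
    }
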
|
| +
|
| +// Blends 64x1 rectangle to 32x1.
|
| +__declspec(naked)
|
| +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| +
|
| + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
|
| + vpsrlw ymm4, ymm4, 15
|
| + vpackuswb ymm4, ymm4, ymm4
|
| + vpxor ymm5, ymm5, ymm5 // constant 0
|
| +
|
| + wloop:
|
| + vmovdqu ymm0, [eax]
|
| + vmovdqu ymm1, [eax + 32]
|
| + lea eax, [eax + 64]
|
| +
|
| + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
|
| + vpmaddubsw ymm1, ymm1, ymm4
|
| + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
|
| + vpavgw ymm1, ymm1, ymm5
|
| + vpackuswb ymm0, ymm0, ymm1
|
| + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
| +
|
| + vmovdqu [edx], ymm0
|
| + lea edx, [edx + 32]
|
| + sub ecx, 32
|
| + jg wloop
|
| +
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 64x2 rectangle to 32x1.
|
| +__declspec(naked)
|
| +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_ptr
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_ptr
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| +
|
| + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
|
| + vpsrlw ymm4, ymm4, 15
|
| + vpackuswb ymm4, ymm4, ymm4
|
| + vpxor ymm5, ymm5, ymm5 // constant 0
|
| +
|
| + wloop:
|
| + vmovdqu ymm0, [eax] // average rows
|
| + vmovdqu ymm1, [eax + 32]
|
| + vpavgb ymm0, ymm0, [eax + esi]
|
| + vpavgb ymm1, ymm1, [eax + esi + 32]
|
| + lea eax, [eax + 64]
|
| +
|
| + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
|
| + vpmaddubsw ymm1, ymm1, ymm4
|
| + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
|
| + vpavgw ymm1, ymm1, ymm5
|
| + vpackuswb ymm0, ymm0, ymm1
|
| + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
| +
|
| + vmovdqu [edx], ymm0
|
| + lea edx, [edx + 32]
|
| + sub ecx, 32
|
| + jg wloop
|
| +
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_SCALEROWDOWN2_AVX2
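For reference, the horizontal step in the Linear and Box AVX2 kernels above is a rounding average of adjacent pixel pairs: vpmaddubsw against a vector of 1s sums each byte pair into a word, and vpavgw against zero computes (sum + 1) / 2. The Box variant first averages the two source rows with vpavgb. A scalar sketch of the Linear case (function name and loop are illustrative, not part of the patch):

    #include <stdint.h>

    /* Illustrative sketch: what the ScaleRowDown2Linear kernels compute per
     * output pixel. vpmaddubsw-by-1 produces src[2x] + src[2x + 1]; vpavgw
     * with 0 rounds and halves it. */
    static void ScaleRowDown2LinearSketch(const uint8_t* src_ptr,
                                          uint8_t* dst_ptr, int dst_width) {
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
      }
    }
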
|
| +
|
| // Point samples 32 pixels to 8 pixels.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| @@ -232,8 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| }
|
|
|
| // Blends 32x4 rectangle to 8x1.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| @@ -248,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| psrlw xmm7, 8
|
|
|
| wloop:
|
| - movdqu xmm0, [eax]
|
| + movdqu xmm0, [eax] // average rows
|
| movdqu xmm1, [eax + 16]
|
| movdqu xmm2, [eax + esi]
|
| movdqu xmm3, [eax + esi + 16]
|
| - pavgb xmm0, xmm2 // average rows
|
| + pavgb xmm0, xmm2
|
| pavgb xmm1, xmm3
|
| movdqu xmm2, [eax + esi * 2]
|
| movdqu xmm3, [eax + esi * 2 + 16]
|
| @@ -291,13 +396,102 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| }
|
| }
|
|
|
| +#ifdef HAS_SCALEROWDOWN4_AVX2
|
| +// Point samples 64 pixels to 16 pixels.
|
| +__declspec(naked)
|
| +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
|
| + vpsrld ymm5, ymm5, 24
|
| + vpslld ymm5, ymm5, 16
|
| +
|
| + wloop:
|
| + vmovdqu ymm0, [eax]
|
| + vmovdqu ymm1, [eax + 32]
|
| + lea eax, [eax + 64]
|
| + vpand ymm0, ymm0, ymm5
|
| + vpand ymm1, ymm1, ymm5
|
| + vpackuswb ymm0, ymm0, ymm1
|
| + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
| + vpsrlw ymm0, ymm0, 8
|
| + vpackuswb ymm0, ymm0, ymm0
|
| + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
| + vmovdqu [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + sub ecx, 16
|
| + jg wloop
|
| +
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 64x4 rectangle to 16x1.
|
| +__declspec(naked)
|
| +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // src_ptr
|
| + mov esi, [esp + 8 + 8] // src_stride
|
| + mov edx, [esp + 8 + 12] // dst_ptr
|
| + mov ecx, [esp + 8 + 16] // dst_width
|
| + lea edi, [esi + esi * 2] // src_stride * 3
|
| + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
|
| + vpsrlw ymm7, ymm7, 8
|
| +
|
| + wloop:
|
| + vmovdqu ymm0, [eax] // average rows
|
| + vmovdqu ymm1, [eax + 32]
|
| + vpavgb ymm0, ymm0, [eax + esi]
|
| + vpavgb ymm1, ymm1, [eax + esi + 32]
|
| + vmovdqu ymm2, [eax + esi * 2]
|
| + vmovdqu ymm3, [eax + esi * 2 + 32]
|
| + vpavgb ymm2, ymm2, [eax + edi]
|
| + vpavgb ymm3, ymm3, [eax + edi + 32]
|
| + lea eax, [eax + 64]
|
| + vpavgb ymm0, ymm0, ymm2
|
| + vpavgb ymm1, ymm1, ymm3
|
| +
|
| + vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
|
| + vpand ymm3, ymm1, ymm7
|
| + vpsrlw ymm0, ymm0, 8
|
| + vpsrlw ymm1, ymm1, 8
|
| + vpavgw ymm0, ymm0, ymm2
|
| + vpavgw ymm1, ymm1, ymm3
|
| + vpackuswb ymm0, ymm0, ymm1
|
| + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
| +
|
| + vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
|
| + vpsrlw ymm0, ymm0, 8
|
| + vpavgw ymm0, ymm0, ymm2
|
| + vpackuswb ymm0, ymm0, ymm0
|
| + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
| +
|
| + vmovdqu [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + sub ecx, 16
|
| + jg wloop
|
| +
|
| + pop edi
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_SCALEROWDOWN4_AVX2
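The 64x4 box kernel chains the same rounding average across both axes: vpavgb collapses the four rows in pairs, then two mask/shift/vpavgw stages collapse the columns from four to two to one. A scalar sketch of the per-pixel arithmetic (illustrative only, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    static int AvgRoundSketch(int a, int b) { return (a + b + 1) >> 1; }

    /* Illustrative sketch of the ScaleRowDown4Box kernels: nested rounding
     * averages over a 4x4 block, mirroring the vpavgb/vpavgw order above. */
    static void ScaleRowDown4BoxSketch(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8_t* dst_ptr, int dst_width) {
      int x, j;
      for (x = 0; x < dst_width; ++x) {
        int col[4];
        for (j = 0; j < 4; ++j) {
          const uint8_t* p = src_ptr + 4 * x + j;
          /* vertical: avg(avg(row0, row1), avg(row2, row3)) */
          col[j] = AvgRoundSketch(
              AvgRoundSketch(p[0], p[src_stride]),
              AvgRoundSketch(p[2 * src_stride], p[3 * src_stride]));
        }
        /* horizontal, two stages: 4 columns -> 2 -> 1 */
        dst_ptr[x] = (uint8_t)AvgRoundSketch(
            AvgRoundSketch(col[0], col[1]), AvgRoundSketch(col[2], col[3]));
      }
    }
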
|
| +
|
| // Point samples 32 pixels to 24 pixels.
|
| // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
|
| // Then shuffled to do the scaling.
|
|
|
| -// Note that movdqa+palign may be better than movdqu.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| @@ -344,8 +538,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| // xmm7 kRound34
|
|
|
| // Note that movdqa+palign may be better than movdqu.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
| ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| @@ -402,8 +595,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
| }
|
|
|
| // Note that movdqa+palign may be better than movdqu.
|
| -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
| ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| @@ -465,7 +657,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
| // 3/8 point sampler
|
|
|
| // Scale 32 pixels to 12
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| __asm {
|
| @@ -496,7 +688,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| }
|
|
|
| // Scale 16x3 pixels to 6x1 with interpolation
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
| ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| @@ -561,7 +753,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
| }
|
|
|
| // Scale 16x2 pixels to 6x1 with interpolation
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
| ptrdiff_t src_stride,
|
| uint8* dst_ptr, int dst_width) {
|
| @@ -605,76 +797,68 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
| }
|
| }
|
|
|
| -// Reads 16xN bytes and produces 16 shorts at a time.
|
| -// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
|
| -__declspec(naked) __declspec(align(16))
|
| -void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| - uint16* dst_ptr, int src_width,
|
| - int src_height) {
|
| +// Reads 16 bytes and accumulates to 16 shorts at a time.
|
| +__declspec(naked)
|
| +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
| __asm {
|
| - push esi
|
| - push edi
|
| - push ebx
|
| - push ebp
|
| - mov esi, [esp + 16 + 4] // src_ptr
|
| - mov edx, [esp + 16 + 8] // src_stride
|
| - mov edi, [esp + 16 + 12] // dst_ptr
|
| - mov ecx, [esp + 16 + 16] // dst_width
|
| - mov ebx, [esp + 16 + 20] // height
|
| - pxor xmm4, xmm4
|
| - dec ebx
|
| + mov eax, [esp + 4] // src_ptr
|
| + mov edx, [esp + 8] // dst_ptr
|
| + mov ecx, [esp + 12] // src_width
|
| + pxor xmm5, xmm5
|
|
|
| + // sum rows
|
| xloop:
|
| - // first row
|
| - movdqu xmm0, [esi]
|
| - lea eax, [esi + edx]
|
| - movdqa xmm1, xmm0
|
| - punpcklbw xmm0, xmm4
|
| - punpckhbw xmm1, xmm4
|
| - lea esi, [esi + 16]
|
| - mov ebp, ebx
|
| - test ebp, ebp
|
| - je ydone
|
| -
|
| - // sum remaining rows
|
| - yloop:
|
| - movdqu xmm2, [eax] // read 16 pixels
|
| - lea eax, [eax + edx] // advance to next row
|
| - movdqa xmm3, xmm2
|
| - punpcklbw xmm2, xmm4
|
| - punpckhbw xmm3, xmm4
|
| + movdqu xmm3, [eax] // read 16 bytes
|
| + lea eax, [eax + 16]
|
| + movdqu xmm0, [edx] // read 16 words from destination
|
| + movdqu xmm1, [edx + 16]
|
| + movdqa xmm2, xmm3
|
| + punpcklbw xmm2, xmm5
|
| + punpckhbw xmm3, xmm5
|
| paddusw xmm0, xmm2 // sum 16 words
|
| paddusw xmm1, xmm3
|
| - sub ebp, 1
|
| - jg yloop
|
| -
|
| - ydone:
|
| - movdqu [edi], xmm0
|
| - movdqu [edi + 16], xmm1
|
| - lea edi, [edi + 32]
|
| -
|
| + movdqu [edx], xmm0 // write 16 words to destination
|
| + movdqu [edx + 16], xmm1
|
| + lea edx, [edx + 32]
|
| sub ecx, 16
|
| jg xloop
|
| + ret
|
| + }
|
| +}
|
|
|
| - pop ebp
|
| - pop ebx
|
| - pop edi
|
| - pop esi
|
| +#ifdef HAS_SCALEADDROW_AVX2
|
| +// Reads 32 bytes and accumulates to 32 shorts at a time.
|
| +__declspec(naked)
|
| +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + mov edx, [esp + 8] // dst_ptr
|
| + mov ecx, [esp + 12] // src_width
|
| + vpxor ymm5, ymm5, ymm5
|
| +
|
| + // sum rows
|
| + xloop:
|
| + vmovdqu ymm3, [eax] // read 32 bytes
|
| + lea eax, [eax + 32]
|
| + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
|
| + vpunpcklbw ymm2, ymm3, ymm5
|
| + vpunpckhbw ymm3, ymm3, ymm5
|
| + vpaddusw ymm0, ymm2, [edx] // sum 16 words
|
| + vpaddusw ymm1, ymm3, [edx + 32]
|
| + vmovdqu [edx], ymm0 // write 32 words to destination
|
| + vmovdqu [edx + 32], ymm1
|
| + lea edx, [edx + 64]
|
| + sub ecx, 32
|
| + jg xloop
|
| +
|
| + vzeroupper
|
| ret
|
| }
|
| }
|
| +#endif // HAS_SCALEADDROW_AVX2
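The ScaleAddRow replacement changes the contract of the old ScaleAddRows_SSE2: instead of walking src_height rows internally, each call accumulates one row of bytes into a uint16 accumulator, which the caller presumably zeroes before the first row. A scalar sketch of the per-call behaviour (illustrative only; the SIMD versions above use saturating adds via paddusw/vpaddusw):

    #include <stdint.h>

    /* Illustrative sketch: accumulate one source row into the 16-bit row sums. */
    static void ScaleAddRowSketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                                  int src_width) {
      int x;
      for (x = 0; x < src_width; ++x) {
        dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
      }
    }
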
|
|
|
| // Bilinear column filtering. SSSE3 version.
|
| -// TODO(fbarchard): Port to Neon
|
| -// TODO(fbarchard): Switch the following:
|
| -// xor ebx, ebx
|
| -// mov bx, word ptr [esi + eax] // 2 source x0 pixels
|
| -// To
|
| -// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
|
| -// when drmemory bug fixed.
|
| -// https://code.google.com/p/drmemory/issues/detail?id=1396
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| int dst_width, int x, int dx) {
|
| __asm {
|
| @@ -751,8 +935,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| }
|
|
|
| // Reads 16 pixels, duplicates them and writes 32 pixels.
|
| -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| int dst_width, int x, int dx) {
|
| __asm {
|
| @@ -777,8 +960,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| }
|
|
|
| // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
|
| -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
| ptrdiff_t src_stride,
|
| uint8* dst_argb, int dst_width) {
|
| @@ -803,8 +985,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
| }
|
|
|
| // Blends 8x1 rectangle to 4x1.
|
| -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
| ptrdiff_t src_stride,
|
| uint8* dst_argb, int dst_width) {
|
| @@ -832,8 +1013,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
| }
|
|
|
| // Blends 8x2 rectangle to 4x1.
|
| -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
| ptrdiff_t src_stride,
|
| uint8* dst_argb, int dst_width) {
|
| @@ -867,8 +1047,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
| }
|
|
|
| // Reads 4 pixels at a time.
|
| -// Alignment requirement: dst_argb 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
| int src_stepx,
|
| uint8* dst_argb, int dst_width) {
|
| @@ -904,8 +1083,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
| }
|
|
|
| // Blends four 2x2 to 4x1.
|
| -// Alignment requirement: dst_argb 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
| ptrdiff_t src_stride,
|
| int src_stepx,
|
| @@ -953,7 +1131,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
| }
|
|
|
| // Column scaling unfiltered. SSE2 version.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| int dst_width, int x, int dx) {
|
| __asm {
|
| @@ -1044,7 +1222,7 @@ static uvec8 kShuffleFractions = {
|
| 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
|
| };
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
| int dst_width, int x, int dx) {
|
| __asm {
|
| @@ -1115,8 +1293,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
| }
|
|
|
| // Reads 4 pixels, duplicates them and writes 8 pixels.
|
| -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| int dst_width, int x, int dx) {
|
| __asm {
|
| @@ -1141,7 +1318,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| }
|
|
|
| // Divide num by div and return as 16.16 fixed point result.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| int FixedDiv_X86(int num, int div) {
|
| __asm {
|
| mov eax, [esp + 4] // num
|
| @@ -1154,7 +1331,7 @@ int FixedDiv_X86(int num, int div) {
|
| }
|
|
|
| // Divide num by div and return as 16.16 fixed point result.
|
| -__declspec(naked) __declspec(align(16))
|
| +__declspec(naked)
|
| int FixedDiv1_X86(int num, int div) {
|
| __asm {
|
| mov eax, [esp + 4] // num
|
| @@ -1169,8 +1346,7 @@ int FixedDiv1_X86(int num, int div) {
|
| ret
|
| }
|
| }
|
| -
|
| -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
| +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
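Per their comments, FixedDiv_X86 and FixedDiv1_X86 above return 16.16 fixed-point quotients. A minimal C sketch of what such a divide returns, assuming a plain truncating signed division (illustrative only; the diff does not show the full function bodies, and the FixedDiv1 variant is not sketched here):

    #include <stdint.h>

    /* Illustrative sketch: a 16.16 fixed-point quotient of two ints. */
    static int FixedDivSketch(int num, int div) {
      return (int)((((int64_t)num) << 16) / div);
    }
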
|
|
|
| #ifdef __cplusplus
|
| } // extern "C"
|
|
|