| Index: source/libvpx/third_party/libyuv/source/scale_win.cc
|
| ===================================================================
|
| --- source/libvpx/third_party/libyuv/source/scale_win.cc (revision 0)
|
| +++ source/libvpx/third_party/libyuv/source/scale_win.cc (revision 0)
|
| @@ -0,0 +1,1320 @@
|
| +/*
|
| + * Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
| + *
|
| + * Use of this source code is governed by a BSD-style license
|
| + * that can be found in the LICENSE file in the root of the source
|
| + * tree. An additional intellectual property rights grant can be found
|
| + * in the file PATENTS. All contributing project authors may
|
| + * be found in the AUTHORS file in the root of the source tree.
|
| + */
|
| +
|
| +#include "third_party/libyuv/include/libyuv/row.h"
|
| +
|
| +#ifdef __cplusplus
|
| +namespace libyuv {
|
| +extern "C" {
|
| +#endif
|
| +
|
| +// This module is for Visual C x86.
|
| +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
| +
|
| +// Offsets for source bytes 0 to 9 (index 128 makes pshufb write zero).
|
| +static uvec8 kShuf0 =
|
| + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
|
| +
|
| +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
|
| +static uvec8 kShuf1 =
|
| + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
|
| +
|
| +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
|
| +static uvec8 kShuf2 =
|
| + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
|
| +
|
| +// Offsets for source bytes 0 to 10
|
| +static uvec8 kShuf01 =
|
| + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
|
| +
|
| +// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
|
| +static uvec8 kShuf11 =
|
| + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
|
| +
|
| +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
|
| +static uvec8 kShuf21 =
|
| + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
|
| +
|
| +// Coefficients for source bytes 0 to 10
|
| +static uvec8 kMadd01 =
|
| + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
|
| +
|
| +// Coefficients for source bytes 10 to 21
|
| +static uvec8 kMadd11 =
|
| + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
|
| +
|
| +// Coefficients for source bytes 21 to 31
|
| +static uvec8 kMadd21 =
|
| + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
|
| +
|
| +// Rounding term (2) added before the >> 2 in the 3/4 box filters.
|
| +static vec16 kRound34 =
|
| + { 2, 2, 2, 2, 2, 2, 2, 2 };
|
| +
|
| +static uvec8 kShuf38a = // source bytes 0,3,6,8,11,14 -> output bytes 0..5
|
| + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
|
| +
|
| +static uvec8 kShuf38b = // source bytes 0,3,6,8,11,14 -> output bytes 6..11
|
| + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
|
| +
|
| +// Arrange words 0,3,6 into 0,1,2
|
| +static uvec8 kShufAc =
|
| + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
|
| +
|
| +// Arrange words 0,3,6 into 3,4,5
|
| +static uvec8 kShufAc3 =
|
| + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
|
| +
|
| +// Scaling values for boxes of 3x3 and 2x3
|
| +static uvec16 kScaleAc33 =
|
| + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
|
| +
|
| +// Arrange first value for pixels 0,1,2,3,4,5
|
| +static uvec8 kShufAb0 =
|
| + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
|
| +
|
| +// Arrange second value for pixels 0,1,2,3,4,5
|
| +static uvec8 kShufAb1 =
|
| + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
|
| +
|
| +// Arrange third value for pixels 0,1,2,3,4,5
|
| +static uvec8 kShufAb2 =
|
| + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
|
| +
|
| +// Scaling values for boxes of 3x2 and 2x2
|
| +static uvec16 kScaleAb2 =
|
| + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
|
| +
|
| +// Reads 32 pixels, keeps the odd-numbered ones and writes 16 pixels.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| + psrlw xmm0, 8 // isolate odd pixels.
|
| + psrlw xmm1, 8
|
| + packuswb xmm0, xmm1 // 16 words -> 16 bytes
|
| + sub ecx, 16 // 16 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 32x1 rectangle to 16x1 by averaging each horizontal pixel pair.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| + psrlw xmm5, 8
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| +
|
| + movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
| + psrlw xmm0, 8 // odd pixels as words
|
| + movdqa xmm3, xmm1
|
| + psrlw xmm1, 8
|
| + pand xmm2, xmm5 // even pixels as words
|
| + pand xmm3, xmm5
|
| + pavgw xmm0, xmm2 // rounded average of odd and even
|
| + pavgw xmm1, xmm3
|
| + packuswb xmm0, xmm1
|
| +
|
| + sub ecx, 16 // 16 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 32x2 rectangle to 16x1: averages two rows, then pixel pairs.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_ptr
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_ptr
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| + psrlw xmm5, 8
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + movdqa xmm2, [eax + esi] // second row
|
| + movdqa xmm3, [eax + esi + 16]
|
| + lea eax, [eax + 32]
|
| + pavgb xmm0, xmm2 // average rows
|
| + pavgb xmm1, xmm3
|
| +
|
| + movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
| + psrlw xmm0, 8
|
| + movdqa xmm3, xmm1
|
| + psrlw xmm1, 8
|
| + pand xmm2, xmm5
|
| + pand xmm3, xmm5
|
| + pavgw xmm0, xmm2
|
| + pavgw xmm1, xmm3
|
| + packuswb xmm0, xmm1
|
| +
|
| + sub ecx, 16 // 16 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Reads 32 pixels, keeps the odd-numbered ones and writes 16 pixels.
|
| +// No alignment requirement: loads and stores use movdqu.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| + psrlw xmm0, 8 // isolate odd pixels.
|
| + psrlw xmm1, 8
|
| + packuswb xmm0, xmm1
|
| + sub ecx, 16 // 16 output pixels per iteration
|
| + movdqu [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 32x1 rectangle to 16x1 by averaging each horizontal pixel pair.
|
| +// No alignment requirement: loads and stores use movdqu.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| + psrlw xmm5, 8
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| +
|
| + movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
| + psrlw xmm0, 8
|
| + movdqa xmm3, xmm1
|
| + psrlw xmm1, 8
|
| + pand xmm2, xmm5
|
| + pand xmm3, xmm5
|
| + pavgw xmm0, xmm2
|
| + pavgw xmm1, xmm3
|
| + packuswb xmm0, xmm1
|
| +
|
| + sub ecx, 16 // 16 output pixels per iteration
|
| + movdqu [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 32x2 rectangle to 16x1: averages two rows, then pixel pairs.
|
| +// No alignment requirement: loads and stores use movdqu.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_ptr
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_ptr
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| + psrlw xmm5, 8
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + esi] // second row
|
| + movdqu xmm3, [eax + esi + 16]
|
| + lea eax, [eax + 32]
|
| + pavgb xmm0, xmm2 // average rows
|
| + pavgb xmm1, xmm3
|
| +
|
| + movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
| + psrlw xmm0, 8
|
| + movdqa xmm3, xmm1
|
| + psrlw xmm1, 8
|
| + pand xmm2, xmm5
|
| + pand xmm3, xmm5
|
| + pavgw xmm0, xmm2
|
| + pavgw xmm1, xmm3
|
| + packuswb xmm0, xmm1
|
| +
|
| + sub ecx, 16 // 16 output pixels per iteration
|
| + movdqu [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Point samples 32 pixels to 8 pixels (keeps byte 2 of every 4).
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
|
| + psrld xmm5, 24
|
| + pslld xmm5, 16
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| + pand xmm0, xmm5 // keep byte 2 of each dword
|
| + pand xmm1, xmm5
|
| + packuswb xmm0, xmm1 // kept byte is now the high byte of each word
|
| + psrlw xmm0, 8
|
| + packuswb xmm0, xmm0 // 8 words -> 8 bytes
|
| + sub ecx, 8 // 8 output pixels per iteration
|
| + movq qword ptr [edx], xmm0
|
| + lea edx, [edx + 8]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 32x4 rectangle to 8x1 using cascaded rounded averages.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // src_ptr
|
| + mov esi, [esp + 8 + 8] // src_stride
|
| + mov edx, [esp + 8 + 12] // dst_ptr
|
| + mov ecx, [esp + 8 + 16] // dst_width
|
| + lea edi, [esi + esi * 2] // src_stride * 3
|
| + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
|
| + psrlw xmm7, 8
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax] // row 0
|
| + movdqa xmm1, [eax + 16]
|
| + movdqa xmm2, [eax + esi] // row 1
|
| + movdqa xmm3, [eax + esi + 16]
|
| + pavgb xmm0, xmm2 // average rows
|
| + pavgb xmm1, xmm3
|
| + movdqa xmm2, [eax + esi * 2] // row 2
|
| + movdqa xmm3, [eax + esi * 2 + 16]
|
| + movdqa xmm4, [eax + edi] // row 3
|
| + movdqa xmm5, [eax + edi + 16]
|
| + lea eax, [eax + 32]
|
| + pavgb xmm2, xmm4 // average rows 2 and 3
|
| + pavgb xmm3, xmm5
|
| + pavgb xmm0, xmm2 // average the two row averages
|
| + pavgb xmm1, xmm3
|
| +
|
| + movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
| + psrlw xmm0, 8
|
| + movdqa xmm3, xmm1
|
| + psrlw xmm1, 8
|
| + pand xmm2, xmm7
|
| + pand xmm3, xmm7
|
| + pavgw xmm0, xmm2
|
| + pavgw xmm1, xmm3
|
| + packuswb xmm0, xmm1
|
| +
|
| + movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
|
| + psrlw xmm0, 8
|
| + pand xmm2, xmm7
|
| + pavgw xmm0, xmm2
|
| + packuswb xmm0, xmm0
|
| +
|
| + sub ecx, 8 // 8 output pixels per iteration
|
| + movq qword ptr [edx], xmm0
|
| + lea edx, [edx + 8]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop edi
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Point samples 32 pixels to 24 pixels.
|
| +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
|
| +// Then shuffled (kShuf0, kShuf1, kShuf2) to do the scaling.
|
| +
|
| +// Note that movdqa+palign may be better than movdqu.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| + movdqa xmm3, kShuf0
|
| + movdqa xmm4, kShuf1
|
| + movdqa xmm5, kShuf2
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| + movdqa xmm2, xmm1
|
| + palignr xmm1, xmm0, 8 // xmm1 = source bytes 8..23
|
| + pshufb xmm0, xmm3
|
| + pshufb xmm1, xmm4
|
| + pshufb xmm2, xmm5
|
| + movq qword ptr [edx], xmm0
|
| + movq qword ptr [edx + 8], xmm1
|
| + movq qword ptr [edx + 16], xmm2
|
| + lea edx, [edx + 24]
|
| + sub ecx, 24 // 24 output pixels per iteration
|
| + jg wloop
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 32x2 rectangle to 24x1 (both rows weighted equally).
|
| +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
|
| +// Then shuffled to do the scaling.
|
| +
|
| +// Register usage:
|
| +// xmm0 src_row 0
|
| +// xmm1 src_row 1
|
| +// xmm2 shuf 0
|
| +// xmm3 shuf 1
|
| +// xmm4 shuf 2
|
| +// xmm5 madd 0
|
| +// xmm6 madd 1
|
| +// xmm7 kRound34
|
| +
|
| +// Note that movdqa+palign may be better than movdqu.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_ptr
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_ptr
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| + movdqa xmm2, kShuf01
|
| + movdqa xmm3, kShuf11
|
| + movdqa xmm4, kShuf21
|
| + movdqa xmm5, kMadd01
|
| + movdqa xmm6, kMadd11
|
| + movdqa xmm7, kRound34
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax] // pixels 0..7
|
| + movdqa xmm1, [eax + esi]
|
| + pavgb xmm0, xmm1 // average the two rows
|
| + pshufb xmm0, xmm2 // pair up neighbouring source bytes
|
| + pmaddubsw xmm0, xmm5 // weighted sum of each pair
|
| + paddsw xmm0, xmm7 // add rounding term
|
| + psrlw xmm0, 2 // divide by 4
|
| + packuswb xmm0, xmm0
|
| + movq qword ptr [edx], xmm0
|
| + movdqu xmm0, [eax + 8] // pixels 8..15
|
| + movdqu xmm1, [eax + esi + 8]
|
| + pavgb xmm0, xmm1
|
| + pshufb xmm0, xmm3
|
| + pmaddubsw xmm0, xmm6
|
| + paddsw xmm0, xmm7
|
| + psrlw xmm0, 2
|
| + packuswb xmm0, xmm0
|
| + movq qword ptr [edx + 8], xmm0
|
| + movdqa xmm0, [eax + 16] // pixels 16..23
|
| + movdqa xmm1, [eax + esi + 16]
|
| + lea eax, [eax + 32]
|
| + pavgb xmm0, xmm1
|
| + pshufb xmm0, xmm4
|
| + movdqa xmm1, kMadd21 // loaded from memory each iteration
|
| + pmaddubsw xmm0, xmm1
|
| + paddsw xmm0, xmm7
|
| + psrlw xmm0, 2
|
| + packuswb xmm0, xmm0
|
| + sub ecx, 24 // 24 output pixels per iteration
|
| + movq qword ptr [edx + 16], xmm0
|
| + lea edx, [edx + 24]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Note that movdqa+palign may be better than movdqu.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_ptr
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_ptr
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| + movdqa xmm2, kShuf01
|
| + movdqa xmm3, kShuf11
|
| + movdqa xmm4, kShuf21
|
| + movdqa xmm5, kMadd01
|
| + movdqa xmm6, kMadd11
|
| + movdqa xmm7, kRound34
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax] // pixels 0..7
|
| + movdqa xmm1, [eax + esi]
|
| + pavgb xmm1, xmm0 // xmm1 = avg(row0, row1)
|
| + pavgb xmm0, xmm1 // xmm0 = avg(row0, xmm1): row0 weighted about 3:1
|
| + pshufb xmm0, xmm2
|
| + pmaddubsw xmm0, xmm5
|
| + paddsw xmm0, xmm7
|
| + psrlw xmm0, 2
|
| + packuswb xmm0, xmm0
|
| + movq qword ptr [edx], xmm0
|
| + movdqu xmm0, [eax + 8] // pixels 8..15
|
| + movdqu xmm1, [eax + esi + 8]
|
| + pavgb xmm1, xmm0
|
| + pavgb xmm0, xmm1
|
| + pshufb xmm0, xmm3
|
| + pmaddubsw xmm0, xmm6
|
| + paddsw xmm0, xmm7
|
| + psrlw xmm0, 2
|
| + packuswb xmm0, xmm0
|
| + movq qword ptr [edx + 8], xmm0
|
| + movdqa xmm0, [eax + 16] // pixels 16..23
|
| + movdqa xmm1, [eax + esi + 16]
|
| + lea eax, [eax + 32]
|
| + pavgb xmm1, xmm0
|
| + pavgb xmm0, xmm1
|
| + pshufb xmm0, xmm4
|
| + movdqa xmm1, kMadd21
|
| + pmaddubsw xmm0, xmm1
|
| + paddsw xmm0, xmm7
|
| + psrlw xmm0, 2
|
| + packuswb xmm0, xmm0
|
| + sub ecx, 24 // 24 output pixels per iteration
|
| + movq qword ptr [edx + 16], xmm0
|
| + lea edx, [edx+24]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// 3/8 point sampler
|
| +
|
| +// Scale 32 pixels to 12. Source is read with movdqa (16 byte aligned).
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_ptr
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_ptr
|
| + mov ecx, [esp + 16] // dst_width
|
| + movdqa xmm4, kShuf38a
|
| + movdqa xmm5, kShuf38b
|
| +
|
| + align 4
|
| + xloop:
|
| + movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
|
| + movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
|
| + lea eax, [eax + 32]
|
| + pshufb xmm0, xmm4
|
| + pshufb xmm1, xmm5
|
| + paddusb xmm0, xmm1 // merge: the two shuffles fill disjoint bytes
|
| +
|
| + sub ecx, 12 // 12 output pixels per iteration
|
| + movq qword ptr [edx], xmm0 // write 12 pixels
|
| + movhlps xmm1, xmm0
|
| + movd [edx + 8], xmm1
|
| + lea edx, [edx + 12]
|
| + jg xloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Scale 16x3 pixels to 6x1 with interpolation
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_ptr
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_ptr
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| + movdqa xmm2, kShufAc
|
| + movdqa xmm3, kShufAc3
|
| + movdqa xmm4, kScaleAc33
|
| + pxor xmm5, xmm5 // zero, used to widen bytes to words
|
| +
|
| + align 4
|
| + xloop:
|
| + movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
|
| + movdqa xmm6, [eax + esi]
|
| + movhlps xmm1, xmm0
|
| + movhlps xmm7, xmm6
|
| + punpcklbw xmm0, xmm5
|
| + punpcklbw xmm1, xmm5
|
| + punpcklbw xmm6, xmm5
|
| + punpcklbw xmm7, xmm5
|
| + paddusw xmm0, xmm6
|
| + paddusw xmm1, xmm7
|
| + movdqa xmm6, [eax + esi * 2]
|
| + lea eax, [eax + 16]
|
| + movhlps xmm7, xmm6
|
| + punpcklbw xmm6, xmm5
|
| + punpcklbw xmm7, xmm5
|
| + paddusw xmm0, xmm6
|
| + paddusw xmm1, xmm7
|
| +
|
| + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
|
| + psrldq xmm0, 2
|
| + paddusw xmm6, xmm0
|
| + psrldq xmm0, 2
|
| + paddusw xmm6, xmm0
|
| + pshufb xmm6, xmm2
|
| +
|
| + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
|
| + psrldq xmm1, 2
|
| + paddusw xmm7, xmm1
|
| + psrldq xmm1, 2
|
| + paddusw xmm7, xmm1
|
| + pshufb xmm7, xmm3
|
| + paddusw xmm6, xmm7
|
| +
|
| + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
|
| + packuswb xmm6, xmm6
|
| +
|
| + sub ecx, 6 // 6 output pixels per iteration
|
| + movd [edx], xmm6 // write 6 pixels
|
| + psrlq xmm6, 16
|
| + movd [edx + 2], xmm6 // bytes 2..5; overlaps the store above
|
| + lea edx, [edx + 6]
|
| + jg xloop // tests the flags of the sub above
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Scale 16x2 pixels to 6x1 with interpolation
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_ptr, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_ptr
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_ptr
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| + movdqa xmm2, kShufAb0
|
| + movdqa xmm3, kShufAb1
|
| + movdqa xmm4, kShufAb2
|
| + movdqa xmm5, kScaleAb2
|
| +
|
| + align 4
|
| + xloop:
|
| + movdqa xmm0, [eax] // average 2 rows into xmm0
|
| + pavgb xmm0, [eax + esi]
|
| + lea eax, [eax + 16]
|
| +
|
| + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
|
| + pshufb xmm1, xmm2 // first value of each box, as words
|
| + movdqa xmm6, xmm0
|
| + pshufb xmm6, xmm3 // second value of each box
|
| + paddusw xmm1, xmm6
|
| + pshufb xmm0, xmm4 // third value (zero for the 2 wide boxes)
|
| + paddusw xmm1, xmm0
|
| +
|
| + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
|
| + packuswb xmm1, xmm1
|
| +
|
| + sub ecx, 6 // 6 output pixels per iteration
|
| + movd [edx], xmm1 // write 6 pixels
|
| + psrlq xmm1, 16
|
| + movd [edx + 2], xmm1 // bytes 2..5; overlaps the store above
|
| + lea edx, [edx + 6]
|
| + jg xloop // tests the flags of the sub above
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Reads 16xN bytes and produces 16 shorts (saturated column sums) at a time.
|
| +// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
| + uint16* dst_ptr, int src_width,
|
| + int src_height) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + push ebx
|
| + push ebp
|
| + mov esi, [esp + 16 + 4] // src_ptr
|
| + mov edx, [esp + 16 + 8] // src_stride
|
| + mov edi, [esp + 16 + 12] // dst_ptr
|
| + mov ecx, [esp + 16 + 16] // src_width
|
| + mov ebx, [esp + 16 + 20] // src_height
|
| + pxor xmm4, xmm4 // zero, used to widen bytes to words
|
| + dec ebx // rows remaining after the first
|
| +
|
| + align 4
|
| + xloop:
|
| + // first row
|
| + movdqa xmm0, [esi]
|
| + lea eax, [esi + edx] // eax = second row of this column block
|
| + movdqa xmm1, xmm0
|
| + punpcklbw xmm0, xmm4
|
| + punpckhbw xmm1, xmm4
|
| + lea esi, [esi + 16]
|
| + mov ebp, ebx
|
| + test ebp, ebp
|
| + je ydone // single row: nothing more to add
|
| +
|
| + // sum remaining rows
|
| + align 4
|
| + yloop:
|
| + movdqa xmm2, [eax] // read 16 pixels
|
| + lea eax, [eax + edx] // advance to next row
|
| + movdqa xmm3, xmm2
|
| + punpcklbw xmm2, xmm4
|
| + punpckhbw xmm3, xmm4
|
| + paddusw xmm0, xmm2 // sum 16 words
|
| + paddusw xmm1, xmm3
|
| + sub ebp, 1
|
| + jg yloop
|
| +
|
| + align 4
|
| + ydone:
|
| + movdqa [edi], xmm0 // store 16 sums (32 bytes)
|
| + movdqa [edi + 16], xmm1
|
| + lea edi, [edi + 32]
|
| +
|
| + sub ecx, 16 // 16 source columns per iteration
|
| + jg xloop
|
| +
|
| + pop ebp
|
| + pop ebx
|
| + pop edi
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Bilinear column filtering. SSSE3 version.
|
| +// TODO(fbarchard): Port to Neon
|
| +// TODO(fbarchard): Switch the following:
|
| +// xor ebx, ebx
|
| +// mov bx, word ptr [esi + eax] // 2 source x0 pixels
|
| +// To
|
| +// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
|
| +// when drmemory bug fixed. NOTE(review): code below already uses movzx; confirm.
|
| +// https://code.google.com/p/drmemory/issues/detail?id=1396
|
| +
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| + int dst_width, int x, int dx) {
|
| + __asm {
|
| + push ebx
|
| + push esi
|
| + push edi
|
| + mov edi, [esp + 12 + 4] // dst_ptr
|
| + mov esi, [esp + 12 + 8] // src_ptr
|
| + mov ecx, [esp + 12 + 12] // dst_width
|
| + movd xmm2, [esp + 12 + 16] // x
|
| + movd xmm3, [esp + 12 + 20] // dx
|
| + mov eax, 0x04040000 // shuffle to line up fractions with pixel.
|
| + movd xmm5, eax
|
| + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
|
| + psrlw xmm6, 9
|
| + pextrw eax, xmm2, 1 // get x0 integer. preroll
|
| + sub ecx, 2
|
| + jl xloop29 // fewer than 2 pixels: skip the 2 pixel loop
|
| +
|
| + movdqa xmm0, xmm2 // x1 = x0 + dx
|
| + paddd xmm0, xmm3
|
| + punpckldq xmm2, xmm0 // x0 x1
|
| + punpckldq xmm3, xmm3 // dx dx
|
| + paddd xmm3, xmm3 // dx * 2, dx * 2
|
| + pextrw edx, xmm2, 3 // get x1 integer. preroll
|
| +
|
| + // 2 Pixel loop.
|
| + align 4
|
| + xloop2:
|
| + movdqa xmm1, xmm2 // x0, x1 fractions.
|
| + paddd xmm2, xmm3 // x += dx
|
| + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
|
| + movd xmm0, ebx
|
| + psrlw xmm1, 9 // 7 bit fractions.
|
| + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
|
| + movd xmm4, ebx
|
| + pshufb xmm1, xmm5 // 0011
|
| + punpcklwd xmm0, xmm4
|
| + pxor xmm1, xmm6 // 0..7f and 7f..0
|
| + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
|
| + pextrw eax, xmm2, 1 // get x0 integer. next iteration.
|
| + pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
| + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
|
| + packuswb xmm0, xmm0 // 8 bits, 2 pixels.
|
| + movd ebx, xmm0
|
| + mov [edi], bx
|
| + lea edi, [edi + 2]
|
| + sub ecx, 2 // 2 pixels
|
| + jge xloop2
|
| +
|
| + align 4
|
| + xloop29:
|
| +
|
| + add ecx, 2 - 1 // undo the 2 pixel bias; negative means none left
|
| + jl xloop99
|
| +
|
| + // 1 pixel remainder
|
| + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
|
| + movd xmm0, ebx
|
| + psrlw xmm2, 9 // 7 bit fractions.
|
| + pshufb xmm2, xmm5 // 0011
|
| + pxor xmm2, xmm6 // 0..7f and 7f..0
|
| + pmaddubsw xmm0, xmm2 // 16 bit
|
| + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
|
| + packuswb xmm0, xmm0 // 8 bits
|
| + movd ebx, xmm0
|
| + mov [edi], bl
|
| +
|
| + align 4
|
| + xloop99:
|
| +
|
| + pop edi
|
| + pop esi
|
| + pop ebx
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Reads 16 pixels, duplicates them and writes 32 pixels. x and dx are not read.
|
| +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| + int dst_width, int x, int dx) {
|
| + __asm {
|
| + mov edx, [esp + 4] // dst_ptr
|
| + mov eax, [esp + 8] // src_ptr
|
| + mov ecx, [esp + 12] // dst_width
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + lea eax, [eax + 16]
|
| + movdqa xmm1, xmm0
|
| + punpcklbw xmm0, xmm0 // duplicate each of the low 8 bytes
|
| + punpckhbw xmm1, xmm1 // duplicate each of the high 8 bytes
|
| + sub ecx, 32 // 32 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + movdqa [edx + 16], xmm1
|
| + lea edx, [edx + 32]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
|
| +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_argb, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_argb
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_argb
|
| + mov ecx, [esp + 16] // dst_width
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| + shufps xmm0, xmm1, 0xdd // dwords 1,3 of each register: odd pixels
|
| + sub ecx, 4 // 4 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 8x1 rectangle to 4x1 by averaging each horizontal pixel pair.
|
| +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_argb, int dst_width) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_argb
|
| + // src_stride ignored
|
| + mov edx, [esp + 12] // dst_argb
|
| + mov ecx, [esp + 16] // dst_width
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + lea eax, [eax + 32]
|
| + movdqa xmm2, xmm0
|
| + shufps xmm0, xmm1, 0x88 // even pixels
|
| + shufps xmm2, xmm1, 0xdd // odd pixels
|
| + pavgb xmm0, xmm2 // rounded per-byte average of even and odd
|
| + sub ecx, 4 // 4 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends 8x2 rectangle to 4x1: averages two rows, then pixel pairs.
|
| +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
| + ptrdiff_t src_stride,
|
| + uint8* dst_argb, int dst_width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // src_argb
|
| + mov esi, [esp + 4 + 8] // src_stride
|
| + mov edx, [esp + 4 + 12] // dst_argb
|
| + mov ecx, [esp + 4 + 16] // dst_width
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax]
|
| + movdqa xmm1, [eax + 16]
|
| + movdqa xmm2, [eax + esi] // second row
|
| + movdqa xmm3, [eax + esi + 16]
|
| + lea eax, [eax + 32]
|
| + pavgb xmm0, xmm2 // average rows
|
| + pavgb xmm1, xmm3
|
| + movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
|
| + shufps xmm0, xmm1, 0x88 // even pixels
|
| + shufps xmm2, xmm1, 0xdd // odd pixels
|
| + pavgb xmm0, xmm2
|
| + sub ecx, 4 // 4 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Reads 4 pixels at a time, each src_stepx pixels apart.
|
| +// Alignment requirement: dst_argb 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
| + int src_stepx,
|
| + uint8* dst_argb, int dst_width) {
|
| + __asm {
|
| + push ebx
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // src_argb
|
| + // src_stride ignored
|
| + mov ebx, [esp + 8 + 12] // src_stepx
|
| + mov edx, [esp + 8 + 16] // dst_argb
|
| + mov ecx, [esp + 8 + 20] // dst_width
|
| + lea ebx, [ebx * 4] // step in bytes (4 bytes per pixel)
|
| + lea edi, [ebx + ebx * 2] // step * 3
|
| +
|
| + align 4
|
| + wloop:
|
| + movd xmm0, [eax]
|
| + movd xmm1, [eax + ebx]
|
| + punpckldq xmm0, xmm1
|
| + movd xmm2, [eax + ebx * 2]
|
| + movd xmm3, [eax + edi]
|
| + lea eax, [eax + ebx * 4]
|
| + punpckldq xmm2, xmm3
|
| + punpcklqdq xmm0, xmm2 // 4 gathered pixels in one register
|
| + sub ecx, 4 // 4 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop edi
|
| + pop ebx
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Blends four 2x2 to 4x1. The 2x2 boxes are src_stepx pixels apart.
|
| +// Alignment requirement: dst_argb 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
| + ptrdiff_t src_stride,
|
| + int src_stepx,
|
| + uint8* dst_argb, int dst_width) {
|
| + __asm {
|
| + push ebx
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 12 + 4] // src_argb
|
| + mov esi, [esp + 12 + 8] // src_stride
|
| + mov ebx, [esp + 12 + 12] // src_stepx
|
| + mov edx, [esp + 12 + 16] // dst_argb
|
| + mov ecx, [esp + 12 + 20] // dst_width
|
| + lea esi, [eax + esi] // row1 pointer
|
| + lea ebx, [ebx * 4] // step in bytes (4 bytes per pixel)
|
| + lea edi, [ebx + ebx * 2] // step * 3
|
| +
|
| + align 4
|
| + wloop:
|
| + movq xmm0, qword ptr [eax] // row0 4 pairs
|
| + movhps xmm0, qword ptr [eax + ebx]
|
| + movq xmm1, qword ptr [eax + ebx * 2]
|
| + movhps xmm1, qword ptr [eax + edi]
|
| + lea eax, [eax + ebx * 4]
|
| + movq xmm2, qword ptr [esi] // row1 4 pairs
|
| + movhps xmm2, qword ptr [esi + ebx]
|
| + movq xmm3, qword ptr [esi + ebx * 2]
|
| + movhps xmm3, qword ptr [esi + edi]
|
| + lea esi, [esi + ebx * 4]
|
| + pavgb xmm0, xmm2 // average rows
|
| + pavgb xmm1, xmm3
|
| + movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
|
| + shufps xmm0, xmm1, 0x88 // even pixels
|
| + shufps xmm2, xmm1, 0xdd // odd pixels
|
| + pavgb xmm0, xmm2
|
| + sub ecx, 4 // 4 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + lea edx, [edx + 16]
|
| + jg wloop // tests the flags of the sub above
|
| +
|
| + pop edi
|
| + pop esi
|
| + pop ebx
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Column scaling unfiltered. SSE2 version. x and dx are 16.16 fixed point.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| + int dst_width, int x, int dx) {
|
| + __asm {
|
| + push edi
|
| + push esi
|
| + mov edi, [esp + 8 + 4] // dst_argb
|
| + mov esi, [esp + 8 + 8] // src_argb
|
| + mov ecx, [esp + 8 + 12] // dst_width
|
| + movd xmm2, [esp + 8 + 16] // x
|
| + movd xmm3, [esp + 8 + 20] // dx
|
| +
|
| + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
|
| + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
|
| + paddd xmm2, xmm0
|
| + paddd xmm3, xmm3 // 0, 0, 0, dx * 2
|
| + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
|
| + paddd xmm2, xmm0 // x3 x2 x1 x0
|
| + paddd xmm3, xmm3 // 0, 0, 0, dx * 4
|
| + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
|
| +
|
| + pextrw eax, xmm2, 1 // get x0 integer.
|
| + pextrw edx, xmm2, 3 // get x1 integer.
|
| +
|
| + cmp ecx, 0
|
| + jle xloop99 // nothing to do for dst_width <= 0
|
| + sub ecx, 4
|
| + jl xloop49 // fewer than 4 pixels: skip the 4 pixel loop
|
| +
|
| + // 4 Pixel loop.
|
| + align 4
|
| + xloop4:
|
| + movd xmm0, [esi + eax * 4] // 1 source x0 pixels
|
| + movd xmm1, [esi + edx * 4] // 1 source x1 pixels
|
| + pextrw eax, xmm2, 5 // get x2 integer.
|
| + pextrw edx, xmm2, 7 // get x3 integer.
|
| + paddd xmm2, xmm3 // x += dx
|
| + punpckldq xmm0, xmm1 // x0 x1
|
| +
|
| + movd xmm1, [esi + eax * 4] // 1 source x2 pixels
|
| + movd xmm4, [esi + edx * 4] // 1 source x3 pixels
|
| + pextrw eax, xmm2, 1 // get x0 integer. next iteration.
|
| + pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
| + punpckldq xmm1, xmm4 // x2 x3
|
| + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
|
| + sub ecx, 4 // 4 pixels
|
| + movdqu [edi], xmm0
|
| + lea edi, [edi + 16]
|
| + jge xloop4 // tests the flags of the sub above
|
| +
|
| + align 4
|
| + xloop49:
|
| + test ecx, 2 // bit 1 of the remaining count: 2 more pixels?
|
| + je xloop29
|
| +
|
| + // 2 Pixels.
|
| + movd xmm0, [esi + eax * 4] // 1 source x0 pixels
|
| + movd xmm1, [esi + edx * 4] // 1 source x1 pixels
|
| + pextrw eax, xmm2, 5 // get x2 integer.
|
| + punpckldq xmm0, xmm1 // x0 x1
|
| +
|
| + movq qword ptr [edi], xmm0
|
| + lea edi, [edi + 8]
|
| +
|
| + xloop29:
|
| + test ecx, 1 // bit 0 of the remaining count: 1 more pixel?
|
| + je xloop99
|
| +
|
| + // 1 Pixels.
|
| + movd xmm0, [esi + eax * 4] // 1 source pixel (x2 if 2 were just written, else x0)
|
| + movd dword ptr [edi], xmm0
|
| + align 4
|
| + xloop99:
|
| +
|
| + pop esi
|
| + pop edi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
|
| +// TODO(fbarchard): Port to Neon
|
| +
|
| +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw: interleaves source bytes 0..3 with 4..7 (and 8..11 with 12..15) so matching channels of 2 adjacent pixels sit side by side.
|
| +static uvec8 kShuffleColARGB = {
|
| + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
|
| + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
|
| +};
|
| +
|
| +// Shuffle table for duplicating 2 fractions into 8 bytes each: source byte 0 fills the low 8 bytes, source byte 4 fills the high 8 bytes.
|
| +static uvec8 kShuffleFractions = {
|
| + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
|
| +};
|
| +
|
| +__declspec(naked) __declspec(align(16)) // Each output pixel blends the 2 adjacent source pixels starting at index (bits 16..31 of x), weighted by the top 7 bits of x's 16 bit fraction; x advances by dx per output pixel.
|
| +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
| + int dst_width, int x, int dx) {
|
| + __asm {
|
| + push esi // the 2 pushes move the arguments to [esp + 8 + n]
|
| + push edi
|
| + mov edi, [esp + 8 + 4] // dst_argb
|
| + mov esi, [esp + 8 + 8] // src_argb
|
| + mov ecx, [esp + 8 + 12] // dst_width
|
| + movd xmm2, [esp + 8 + 16] // x
|
| + movd xmm3, [esp + 8 + 20] // dx
|
| + movdqa xmm4, kShuffleColARGB
|
| + movdqa xmm5, kShuffleFractions
|
| + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
|
| + psrlw xmm6, 9
|
| + pextrw eax, xmm2, 1 // get x0 integer. preroll
|
| + sub ecx, 2 // ecx is kept at (pixels remaining - 2) from here on
|
| + jl xloop29 // fewer than 2 pixels: remainder code only
|
| +
|
| + movdqa xmm0, xmm2 // x1 = x0 + dx
|
| + paddd xmm0, xmm3
|
| + punpckldq xmm2, xmm0 // x0 x1
|
| + punpckldq xmm3, xmm3 // dx dx
|
| + paddd xmm3, xmm3 // dx * 2, dx * 2
|
| + pextrw edx, xmm2, 3 // get x1 integer. preroll
|
| +
|
| + // 2 Pixel loop.
|
| + align 4
|
| + xloop2:
|
| + movdqa xmm1, xmm2 // x0, x1 fractions.
|
| + paddd xmm2, xmm3 // x += dx * 2 in both lanes
|
| + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
|
| + psrlw xmm1, 9 // 7 bit fractions.
|
| + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
|
| + pshufb xmm1, xmm5 // 0000000011111111
|
| + pshufb xmm0, xmm4 // arrange pixels into pairs
|
| + pxor xmm1, xmm6 // even bytes become 0x7f - f (left pixel weight), odd bytes stay f (right pixel weight)
|
| + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
|
| + pextrw eax, xmm2, 1 // get x0 integer. next iteration.
|
| + pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
| + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
|
| + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
|
| + movq qword ptr [edi], xmm0
|
| + lea edi, [edi + 8]
|
| + sub ecx, 2 // 2 pixels
|
| + jge xloop2 // loop while at least 2 more pixels remain
|
| +
|
| + align 4
|
| + xloop29:
|
| +
|
| + add ecx, 2 - 1 // result is 0 when exactly 1 pixel is left, negative when none
|
| + jl xloop99
|
| +
|
| + // 1 pixel remainder
|
| + psrlw xmm2, 9 // 7 bit fractions.
|
| + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
|
| + pshufb xmm2, xmm5 // 00000000
|
| + pshufb xmm0, xmm4 // arrange pixels into pairs
|
| + pxor xmm2, xmm6 // 0..7f and 7f..0
|
| + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
|
| + psrlw xmm0, 7
|
| + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
|
| + movd [edi], xmm0 // store only the low 4 bytes (1 pixel)
|
| +
|
| + align 4
|
| + xloop99:
|
| +
|
| + pop edi // restore in reverse push order
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Reads 4 pixels, duplicates them and writes 8 pixels. The x and dx arguments are never read.
|
| +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
|
| +__declspec(naked) __declspec(align(16))
|
| +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
| + int dst_width, int x, int dx) {
|
| + __asm {
|
| + mov edx, [esp + 4] // dst_argb
|
| + mov eax, [esp + 8] // src_argb
|
| + mov ecx, [esp + 12] // dst_width
|
| +
|
| + align 4
|
| + wloop:
|
| + movdqa xmm0, [eax] // 4 source pixels (aligned load)
|
| + lea eax, [eax + 16]
|
| + movdqa xmm1, xmm0
|
| + punpckldq xmm0, xmm0 // pixels 0 0 1 1
|
| + punpckhdq xmm1, xmm1 // pixels 2 2 3 3
|
| + sub ecx, 8 // 8 output pixels per iteration
|
| + movdqa [edx], xmm0
|
| + movdqa [edx + 16], xmm1
|
| + lea edx, [edx + 32]
|
| + jg wloop // count is tested after the stores, so one iteration always runs
|
| +
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Divide num by div and return as 16.16 fixed point result, i.e. (num << 16) / div using a 64 bit dividend.
|
| +__declspec(naked) __declspec(align(16))
|
| +int FixedDiv_X86(int num, int div) {
|
| + __asm {
|
| + mov eax, [esp + 4] // num
|
| + cdq // extend num to 64 bits
|
| + shld edx, eax, 16 // 32.16
|
| + shl eax, 16 // edx:eax = num << 16
|
| + idiv dword ptr [esp + 8] // divide by div; quotient is left in eax as the return value. NOTE(review): idiv faults on div == 0 or a quotient that overflows 32 bits - presumably callers avoid both; confirm.
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Computes ((num << 16) - 0x00010001) / (div - 1) with a 64 bit dividend, i.e. approximately (num - 1) / (div - 1) as a 16.16 fixed point result.
|
| +__declspec(naked) __declspec(align(16))
|
| +int FixedDiv1_X86(int num, int div) {
|
| + __asm {
|
| + mov eax, [esp + 4] // num
|
| + mov ecx, [esp + 8] // denom
|
| + cdq // extend num to 64 bits
|
| + shld edx, eax, 16 // 32.16
|
| + shl eax, 16 // edx:eax = num << 16
|
| + sub eax, 0x00010001 // 64 bit subtract: low half here,
|
| + sbb edx, 0 // borrow propagated into the high half
|
| + sub ecx, 1 // denom - 1
|
| + idiv ecx // quotient is left in eax as the return value. NOTE(review): faults when div == 1 (divisor becomes 0) - presumably callers never pass that; confirm.
|
| + ret
|
| + }
|
| +}
|
| +
|
| +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
| +
|
| +#ifdef __cplusplus
|
| +} // extern "C"
|
| +} // namespace libyuv
|
| +#endif
|
|
|