source/libvpx/vp9/common/x86/vp9_subpixel_sse2.asm - Issue 11555023: libvpx: Add VP9 decoder.

Unified Diff: source/libvpx/vp9/common/x86/vp9_subpixel_sse2.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/libvpx/vp9/common/x86/vp9_subpixel_sse2.asm

===================================================================

--- source/libvpx/vp9/common/x86/vp9_subpixel_sse2.asm (revision 0)

+++ source/libvpx/vp9/common/x86/vp9_subpixel_sse2.asm (revision 0)

@@ -0,0 +1,1372 @@

+; Use of this source code is governed by a BSD-style license

+; that can be found in the LICENSE file in the root of the source

+; tree. An additional intellectual property rights grant can be found

+; in the file PATENTS. All contributing project authors may

+; be found in the AUTHORS file in the root of the source tree.

+%include "vpx_ports/x86_abi_support.asm"

+%define BLOCK_HEIGHT_WIDTH 4

+%define VP9_FILTER_WEIGHT 128

+%define VP9_FILTER_SHIFT 7

+;/************************************************************************************

+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

+; input pixel array has output_height rows. This routine assumes that output_height is an

+; even number. This function handles 8 pixels in horizontal direction, calculating ONE

+; rows each iteration to take advantage of the 128 bits operations.

+;*************************************************************************************/

+;void vp9_filter_block1d8_h6_sse2

+;(

+; unsigned char *src_ptr,

+; unsigned short *output_ptr,

+; unsigned int src_pixels_per_line,

+; unsigned int pixel_step,

+; unsigned int output_height,

+; unsigned int output_width,

+; short *vp9_filter

+;)

+global sym(vp9_filter_block1d8_h6_sse2)

+sym(vp9_filter_block1d8_h6_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 7

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rdx, arg(6) ;vp9_filter

+ mov rsi, arg(0) ;src_ptr

+ mov rdi, arg(1) ;output_ptr

+ movsxd rcx, dword ptr arg(4) ;output_height

+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(5) ;output_width

+%endif

+ pxor xmm0, xmm0 ; clear xmm0 for unpack

+.filter_block1d8_h6_rowloop:

+ movq xmm3, MMWORD PTR [rsi - 2]

+ movq xmm1, MMWORD PTR [rsi + 6]

+ prefetcht2 [rsi+rax-2]

+ pslldq xmm1, 8

+ por xmm1, xmm3

+ movdqa xmm4, xmm1

+ movdqa xmm5, xmm1

+ movdqa xmm6, xmm1

+ movdqa xmm7, xmm1

+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1

+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3

+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4

+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5

+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6

+ paddsw xmm4, xmm7

+ paddsw xmm4, xmm5

+ paddsw xmm4, xmm3

+ paddsw xmm4, xmm6

+ paddsw xmm4, xmm1

+ paddsw xmm4, [GLOBAL(rd)]

+ psraw xmm4, 7

+ packuswb xmm4, xmm0

+ punpcklbw xmm4, xmm0

+ movdqa XMMWORD Ptr [rdi], xmm4

+ lea rsi, [rsi + rax]

+%if ABI_IS_32BIT

+ add rdi, DWORD Ptr arg(5) ;[output_width]

+%else

+ add rdi, r8

+%endif

+ dec rcx

+ jnz .filter_block1d8_h6_rowloop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_filter_block1d16_h6_sse2

+;(

+; unsigned char *src_ptr,

+; unsigned short *output_ptr,

+; unsigned int src_pixels_per_line,

+; unsigned int pixel_step,

+; unsigned int output_height,

+; unsigned int output_width,

+; short *vp9_filter

+;)

+;/************************************************************************************

+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

+; input pixel array has output_height rows. This routine assumes that output_height is an

+; even number. This function handles 8 pixels in horizontal direction, calculating ONE

+; rows each iteration to take advantage of the 128 bits operations.

+;*************************************************************************************/

+global sym(vp9_filter_block1d16_h6_sse2)

+sym(vp9_filter_block1d16_h6_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 7

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rdx, arg(6) ;vp9_filter

+ mov rsi, arg(0) ;src_ptr

+ mov rdi, arg(1) ;output_ptr

+ movsxd rcx, dword ptr arg(4) ;output_height

+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(5) ;output_width

+%endif

+ pxor xmm0, xmm0 ; clear xmm0 for unpack

+.filter_block1d16_h6_sse2_rowloop:

+ movq xmm3, MMWORD PTR [rsi - 2]

+ movq xmm1, MMWORD PTR [rsi + 6]

+ movq xmm2, MMWORD PTR [rsi +14]

+ pslldq xmm2, 8

+ por xmm2, xmm1

+ prefetcht2 [rsi+rax-2]

+ pslldq xmm1, 8

+ por xmm1, xmm3

+ movdqa xmm4, xmm1

+ movdqa xmm5, xmm1

+ movdqa xmm6, xmm1

+ movdqa xmm7, xmm1

+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1

+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3

+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4

+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5

+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6

+ paddsw xmm4, xmm7

+ paddsw xmm4, xmm5

+ paddsw xmm4, xmm3

+ paddsw xmm4, xmm6

+ paddsw xmm4, xmm1

+ paddsw xmm4, [GLOBAL(rd)]

+ psraw xmm4, 7

+ packuswb xmm4, xmm0

+ punpcklbw xmm4, xmm0

+ movdqa XMMWORD Ptr [rdi], xmm4

+ movdqa xmm3, xmm2

+ movdqa xmm4, xmm2

+ movdqa xmm5, xmm2

+ movdqa xmm6, xmm2

+ movdqa xmm7, xmm2

+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1

+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3

+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4

+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5

+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6

+ paddsw xmm4, xmm7

+ paddsw xmm4, xmm5

+ paddsw xmm4, xmm3

+ paddsw xmm4, xmm6

+ paddsw xmm4, xmm2

+ paddsw xmm4, [GLOBAL(rd)]

+ psraw xmm4, 7

+ packuswb xmm4, xmm0

+ punpcklbw xmm4, xmm0

+ movdqa XMMWORD Ptr [rdi+16], xmm4

+ lea rsi, [rsi + rax]

+%if ABI_IS_32BIT

+ add rdi, DWORD Ptr arg(5) ;[output_width]

+%else

+ add rdi, r8

+%endif

+ dec rcx

+ jnz .filter_block1d16_h6_sse2_rowloop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_filter_block1d8_v6_sse2

+;(

+; short *src_ptr,

+; unsigned char *output_ptr,

+; int dst_ptich,

+; unsigned int pixels_per_line,

+; unsigned int pixel_step,

+; unsigned int output_height,

+; unsigned int output_width,

+; short * vp9_filter

+;)

+;/************************************************************************************

+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The

+; input pixel array has output_height rows.

+;*************************************************************************************/

+global sym(vp9_filter_block1d8_v6_sse2)

+sym(vp9_filter_block1d8_v6_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 8

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rax, arg(7) ;vp9_filter

+ movsxd rdx, dword ptr arg(3) ;pixels_per_line

+ mov rdi, arg(1) ;output_ptr

+ mov rsi, arg(0) ;src_ptr

+ sub rsi, rdx

+ movsxd rcx, DWORD PTR arg(5) ;[output_height]

+ pxor xmm0, xmm0 ; clear xmm0

+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(2) ; dst_ptich

+%endif

+.vp9_filter_block1d8_v6_sse2_loop:

+ movdqa xmm1, XMMWORD PTR [rsi]

+ pmullw xmm1, [rax]

+ movdqa xmm2, XMMWORD PTR [rsi + rdx]

+ pmullw xmm2, [rax + 16]

+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]

+ pmullw xmm3, [rax + 32]

+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]

+ pmullw xmm5, [rax + 64]

+ add rsi, rdx

+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]

+ pmullw xmm4, [rax + 48]

+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]

+ pmullw xmm6, [rax + 80]

+ paddsw xmm2, xmm5

+ paddsw xmm2, xmm3

+ paddsw xmm2, xmm1

+ paddsw xmm2, xmm4

+ paddsw xmm2, xmm6

+ paddsw xmm2, xmm7

+ psraw xmm2, 7

+ packuswb xmm2, xmm0 ; pack and saturate

+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination

+%if ABI_IS_32BIT

+ add rdi, DWORD PTR arg(2) ;[dst_ptich]

+%else

+ add rdi, r8

+%endif

+ dec rcx ; decrement count

+ jnz .vp9_filter_block1d8_v6_sse2_loop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_filter_block1d16_v6_sse2

+;(

+; unsigned short *src_ptr,

+; unsigned char *output_ptr,

+; int dst_ptich,

+; unsigned int pixels_per_line,

+; unsigned int pixel_step,

+; unsigned int output_height,

+; unsigned int output_width,

+; const short *vp9_filter

+;)

+;/************************************************************************************

+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The

+; input pixel array has output_height rows.

+;*************************************************************************************/

+global sym(vp9_filter_block1d16_v6_sse2)

+sym(vp9_filter_block1d16_v6_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 8

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rax, arg(7) ;vp9_filter

+ movsxd rdx, dword ptr arg(3) ;pixels_per_line

+ mov rdi, arg(1) ;output_ptr

+ mov rsi, arg(0) ;src_ptr

+ sub rsi, rdx

+ movsxd rcx, DWORD PTR arg(5) ;[output_height]

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(2) ; dst_ptich

+%endif

+.vp9_filter_block1d16_v6_sse2_loop:

+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.

+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2

+ movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]

+ pmullw xmm1, [rax + 16]

+ pmullw xmm2, [rax + 16]

+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5

+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]

+ pmullw xmm3, [rax + 64]

+ pmullw xmm4, [rax + 64]

+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3

+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]

+ pmullw xmm5, [rax + 32]

+ pmullw xmm6, [rax + 32]

+ movdqa xmm7, XMMWORD PTR [rsi] ; line 1

+ movdqa xmm0, XMMWORD PTR [rsi + 16]

+ pmullw xmm7, [rax]

+ pmullw xmm0, [rax]

+ paddsw xmm1, xmm3

+ paddsw xmm2, xmm4

+ paddsw xmm1, xmm5

+ paddsw xmm2, xmm6

+ paddsw xmm1, xmm7

+ paddsw xmm2, xmm0

+ add rsi, rdx

+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4

+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]

+ pmullw xmm3, [rax + 48]

+ pmullw xmm4, [rax + 48]

+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6

+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]

+ pmullw xmm5, [rax + 80]

+ pmullw xmm6, [rax + 80]

+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]

+ pxor xmm0, xmm0 ; clear xmm0

+ paddsw xmm1, xmm3

+ paddsw xmm2, xmm4

+ paddsw xmm1, xmm5

+ paddsw xmm2, xmm6

+ paddsw xmm1, xmm7

+ paddsw xmm2, xmm7

+ psraw xmm1, 7

+ psraw xmm2, 7

+ packuswb xmm1, xmm2 ; pack and saturate

+ movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination

+%if ABI_IS_32BIT

+ add rdi, DWORD PTR arg(2) ;[dst_ptich]

+%else

+ add rdi, r8

+%endif

+ dec rcx ; decrement count

+ jnz .vp9_filter_block1d16_v6_sse2_loop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_filter_block1d8_h6_only_sse2

+;(

+; unsigned char *src_ptr,

+; unsigned int src_pixels_per_line,

+; unsigned char *output_ptr,

+; int dst_ptich,

+; unsigned int output_height,

+; const short *vp9_filter

+;)

+; First-pass filter only when yoffset==0

+global sym(vp9_filter_block1d8_h6_only_sse2)

+sym(vp9_filter_block1d8_h6_only_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 6

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rdx, arg(5) ;vp9_filter

+ mov rsi, arg(0) ;src_ptr

+ mov rdi, arg(2) ;output_ptr

+ movsxd rcx, dword ptr arg(4) ;output_height

+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(3) ;dst_ptich

+%endif

+ pxor xmm0, xmm0 ; clear xmm0 for unpack

+.filter_block1d8_h6_only_rowloop:

+ movq xmm3, MMWORD PTR [rsi - 2]

+ movq xmm1, MMWORD PTR [rsi + 6]

+ prefetcht2 [rsi+rax-2]

+ pslldq xmm1, 8

+ por xmm1, xmm3

+ movdqa xmm4, xmm1

+ movdqa xmm5, xmm1

+ movdqa xmm6, xmm1

+ movdqa xmm7, xmm1

+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1

+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3

+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4

+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5

+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6

+ paddsw xmm4, xmm7

+ paddsw xmm4, xmm5

+ paddsw xmm4, xmm3

+ paddsw xmm4, xmm6

+ paddsw xmm4, xmm1

+ paddsw xmm4, [GLOBAL(rd)]

+ psraw xmm4, 7

+ packuswb xmm4, xmm0

+ movq QWORD PTR [rdi], xmm4 ; store the results in the destination

+ lea rsi, [rsi + rax]

+%if ABI_IS_32BIT

+ add rdi, DWORD Ptr arg(3) ;dst_ptich

+%else

+ add rdi, r8

+%endif

+ dec rcx

+ jnz .filter_block1d8_h6_only_rowloop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_filter_block1d16_h6_only_sse2

+;(

+; unsigned char *src_ptr,

+; unsigned int src_pixels_per_line,

+; unsigned char *output_ptr,

+; int dst_ptich,

+; unsigned int output_height,

+; const short *vp9_filter

+;)

+; First-pass filter only when yoffset==0

+global sym(vp9_filter_block1d16_h6_only_sse2)

+sym(vp9_filter_block1d16_h6_only_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 6

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rdx, arg(5) ;vp9_filter

+ mov rsi, arg(0) ;src_ptr

+ mov rdi, arg(2) ;output_ptr

+ movsxd rcx, dword ptr arg(4) ;output_height

+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(3) ;dst_ptich

+%endif

+ pxor xmm0, xmm0 ; clear xmm0 for unpack

+.filter_block1d16_h6_only_sse2_rowloop:

+ movq xmm3, MMWORD PTR [rsi - 2]

+ movq xmm1, MMWORD PTR [rsi + 6]

+ movq xmm2, MMWORD PTR [rsi +14]

+ pslldq xmm2, 8

+ por xmm2, xmm1

+ prefetcht2 [rsi+rax-2]

+ pslldq xmm1, 8

+ por xmm1, xmm3

+ movdqa xmm4, xmm1

+ movdqa xmm5, xmm1

+ movdqa xmm6, xmm1

+ movdqa xmm7, xmm1

+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1

+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3

+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4

+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5

+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6

+ paddsw xmm4, xmm7

+ paddsw xmm4, xmm5

+ paddsw xmm4, xmm3

+ paddsw xmm4, xmm6

+ paddsw xmm4, xmm1

+ paddsw xmm4, [GLOBAL(rd)]

+ psraw xmm4, 7

+ packuswb xmm4, xmm0 ; lower 8 bytes

+ movq QWORD Ptr [rdi], xmm4 ; store the results in the destination

+ movdqa xmm3, xmm2

+ movdqa xmm4, xmm2

+ movdqa xmm5, xmm2

+ movdqa xmm6, xmm2

+ movdqa xmm7, xmm2

+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1

+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3

+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4

+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5

+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6

+ paddsw xmm4, xmm7

+ paddsw xmm4, xmm5

+ paddsw xmm4, xmm3

+ paddsw xmm4, xmm6

+ paddsw xmm4, xmm2

+ paddsw xmm4, [GLOBAL(rd)]

+ psraw xmm4, 7

+ packuswb xmm4, xmm0 ; higher 8 bytes

+ movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination

+ lea rsi, [rsi + rax]

+%if ABI_IS_32BIT

+ add rdi, DWORD Ptr arg(3) ;dst_ptich

+%else

+ add rdi, r8

+%endif

+ dec rcx

+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_filter_block1d8_v6_only_sse2

+;(

+; unsigned char *src_ptr,

+; unsigned int src_pixels_per_line,

+; unsigned char *output_ptr,

+; int dst_ptich,

+; unsigned int output_height,

+; const short *vp9_filter

+;)

+; Second-pass filter only when xoffset==0

+global sym(vp9_filter_block1d8_v6_only_sse2)

+sym(vp9_filter_block1d8_v6_only_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 6

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rsi, arg(0) ;src_ptr

+ mov rdi, arg(2) ;output_ptr

+ movsxd rcx, dword ptr arg(4) ;output_height

+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line

+ mov rax, arg(5) ;vp9_filter

+ pxor xmm0, xmm0 ; clear xmm0

+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(3) ; dst_ptich

+%endif

+.vp9_filter_block1d8_v6_only_sse2_loop:

+ movq xmm1, MMWORD PTR [rsi]

+ movq xmm2, MMWORD PTR [rsi + rdx]

+ movq xmm3, MMWORD PTR [rsi + rdx * 2]

+ movq xmm5, MMWORD PTR [rsi + rdx * 4]

+ add rsi, rdx

+ movq xmm4, MMWORD PTR [rsi + rdx * 2]

+ movq xmm6, MMWORD PTR [rsi + rdx * 4]

+ punpcklbw xmm1, xmm0

+ pmullw xmm1, [rax]

+ punpcklbw xmm2, xmm0

+ pmullw xmm2, [rax + 16]

+ punpcklbw xmm3, xmm0

+ pmullw xmm3, [rax + 32]

+ punpcklbw xmm5, xmm0

+ pmullw xmm5, [rax + 64]

+ punpcklbw xmm4, xmm0

+ pmullw xmm4, [rax + 48]

+ punpcklbw xmm6, xmm0

+ pmullw xmm6, [rax + 80]

+ paddsw xmm2, xmm5

+ paddsw xmm2, xmm3

+ paddsw xmm2, xmm1

+ paddsw xmm2, xmm4

+ paddsw xmm2, xmm6

+ paddsw xmm2, xmm7

+ psraw xmm2, 7

+ packuswb xmm2, xmm0 ; pack and saturate

+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination

+%if ABI_IS_32BIT

+ add rdi, DWORD PTR arg(3) ;[dst_ptich]

+%else

+ add rdi, r8

+%endif

+ dec rcx ; decrement count

+ jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_unpack_block1d16_h6_sse2

+;(

+; unsigned char *src_ptr,

+; unsigned short *output_ptr,

+; unsigned int src_pixels_per_line,

+; unsigned int output_height,

+; unsigned int output_width

+;)

+global sym(vp9_unpack_block1d16_h6_sse2)

+sym(vp9_unpack_block1d16_h6_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 5

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ mov rsi, arg(0) ;src_ptr

+ mov rdi, arg(1) ;output_ptr

+ movsxd rcx, dword ptr arg(3) ;output_height

+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source

+ pxor xmm0, xmm0 ; clear xmm0 for unpack

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source

+%endif

+.unpack_block1d16_h6_sse2_rowloop:

+ movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2

+ movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1

+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

+ punpcklbw xmm1, xmm0

+ movdqa XMMWORD Ptr [rdi], xmm1

+ movdqa XMMWORD Ptr [rdi + 16], xmm3

+ lea rsi, [rsi + rax]

+%if ABI_IS_32BIT

+ add rdi, DWORD Ptr arg(4) ;[output_width]

+%else

+ add rdi, r8

+%endif

+ dec rcx

+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_bilinear_predict16x16_sse2

+;(

+; unsigned char *src_ptr,

+; int src_pixels_per_line,

+; int xoffset,

+; int yoffset,

+; unsigned char *dst_ptr,

+; int dst_pitch

+;)

+extern sym(vp9_bilinear_filters_mmx)

+global sym(vp9_bilinear_predict16x16_sse2)

+sym(vp9_bilinear_predict16x16_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 6

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ ;const short *HFilter = bilinear_filters_mmx[xoffset]

+ ;const short *VFilter = bilinear_filters_mmx[yoffset]

+ lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]

+ movsxd rax, dword ptr arg(2) ;xoffset

+ cmp rax, 0 ;skip first_pass filter if xoffset=0

+ je .b16x16_sp_only

+ shl rax, 5

+ add rax, rcx ;HFilter

+ mov rdi, arg(4) ;dst_ptr

+ mov rsi, arg(0) ;src_ptr

+ movsxd rdx, dword ptr arg(5) ;dst_pitch

+ movdqa xmm1, [rax]

+ movdqa xmm2, [rax+16]

+ movsxd rax, dword ptr arg(3) ;yoffset

+ cmp rax, 0 ;skip second_pass filter if yoffset=0

+ je .b16x16_fp_only

+ shl rax, 5

+ add rax, rcx ;VFilter

+ lea rcx, [rdi+rdx*8]

+ lea rcx, [rcx+rdx*8]

+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line

+ pxor xmm0, xmm0

+%if ABI_IS_32BIT=0

+ movsxd r8, dword ptr arg(5) ;dst_pitch

+%endif

+ ; get the first horizontal line done

+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

+ movdqa xmm4, xmm3 ; make a copy of current line

+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06

+ punpckhbw xmm4, xmm0

+ pmullw xmm3, xmm1

+ pmullw xmm4, xmm1

+ movdqu xmm5, [rsi+1]

+ movdqa xmm6, xmm5

+ punpcklbw xmm5, xmm0

+ punpckhbw xmm6, xmm0

+ pmullw xmm5, xmm2

+ pmullw xmm6, xmm2

+ paddw xmm3, xmm5

+ paddw xmm4, xmm6

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ paddw xmm4, [GLOBAL(rd)]

+ psraw xmm4, VP9_FILTER_SHIFT

+ movdqa xmm7, xmm3

+ packuswb xmm7, xmm4

+ add rsi, rdx ; next line

+.next_row:

+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

+ movdqa xmm4, xmm3 ; make a copy of current line

+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06

+ punpckhbw xmm4, xmm0

+ pmullw xmm3, xmm1

+ pmullw xmm4, xmm1

+ movdqu xmm5, [rsi+1]

+ movdqa xmm6, xmm5

+ punpcklbw xmm5, xmm0

+ punpckhbw xmm6, xmm0

+ pmullw xmm5, xmm2

+ pmullw xmm6, xmm2

+ paddw xmm3, xmm5

+ paddw xmm4, xmm6

+ movdqa xmm5, xmm7

+ movdqa xmm6, xmm7

+ punpcklbw xmm5, xmm0

+ punpckhbw xmm6, xmm0

+ pmullw xmm5, [rax]

+ pmullw xmm6, [rax]

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ paddw xmm4, [GLOBAL(rd)]

+ psraw xmm4, VP9_FILTER_SHIFT

+ movdqa xmm7, xmm3

+ packuswb xmm7, xmm4

+ pmullw xmm3, [rax+16]

+ pmullw xmm4, [rax+16]

+ paddw xmm3, xmm5

+ paddw xmm4, xmm6

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ paddw xmm4, [GLOBAL(rd)]

+ psraw xmm4, VP9_FILTER_SHIFT

+ packuswb xmm3, xmm4

+ movdqa [rdi], xmm3 ; store the results in the destination

+ add rsi, rdx ; next line

+%if ABI_IS_32BIT

+ add rdi, DWORD PTR arg(5) ;dst_pitch

+%else

+ add rdi, r8

+%endif

+ cmp rdi, rcx

+ jne .next_row

+ jmp .done

+.b16x16_sp_only:

+ movsxd rax, dword ptr arg(3) ;yoffset

+ shl rax, 5

+ add rax, rcx ;VFilter

+ mov rdi, arg(4) ;dst_ptr

+ mov rsi, arg(0) ;src_ptr

+ movsxd rdx, dword ptr arg(5) ;dst_pitch

+ movdqa xmm1, [rax]

+ movdqa xmm2, [rax+16]

+ lea rcx, [rdi+rdx*8]

+ lea rcx, [rcx+rdx*8]

+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line

+ pxor xmm0, xmm0

+ ; get the first horizontal line done

+ movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

+ add rsi, rax ; next line

+.next_row_spo:

+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

+ movdqa xmm5, xmm7

+ movdqa xmm6, xmm7

+ movdqa xmm4, xmm3 ; make a copy of current line

+ movdqa xmm7, xmm3

+ punpcklbw xmm5, xmm0

+ punpckhbw xmm6, xmm0

+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06

+ punpckhbw xmm4, xmm0

+ pmullw xmm5, xmm1

+ pmullw xmm6, xmm1

+ pmullw xmm3, xmm2

+ pmullw xmm4, xmm2

+ paddw xmm3, xmm5

+ paddw xmm4, xmm6

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ paddw xmm4, [GLOBAL(rd)]

+ psraw xmm4, VP9_FILTER_SHIFT

+ packuswb xmm3, xmm4

+ movdqa [rdi], xmm3 ; store the results in the destination

+ add rsi, rax ; next line

+ add rdi, rdx ;dst_pitch

+ cmp rdi, rcx

+ jne .next_row_spo

+ jmp .done

+.b16x16_fp_only:

+ lea rcx, [rdi+rdx*8]

+ lea rcx, [rcx+rdx*8]

+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line

+ pxor xmm0, xmm0

+.next_row_fpo:

+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

+ movdqa xmm4, xmm3 ; make a copy of current line

+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06

+ punpckhbw xmm4, xmm0

+ pmullw xmm3, xmm1

+ pmullw xmm4, xmm1

+ movdqu xmm5, [rsi+1]

+ movdqa xmm6, xmm5

+ punpcklbw xmm5, xmm0

+ punpckhbw xmm6, xmm0

+ pmullw xmm5, xmm2

+ pmullw xmm6, xmm2

+ paddw xmm3, xmm5

+ paddw xmm4, xmm6

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ paddw xmm4, [GLOBAL(rd)]

+ psraw xmm4, VP9_FILTER_SHIFT

+ packuswb xmm3, xmm4

+ movdqa [rdi], xmm3 ; store the results in the destination

+ add rsi, rax ; next line

+ add rdi, rdx ; dst_pitch

+ cmp rdi, rcx

+ jne .next_row_fpo

+.done:

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+;void vp9_bilinear_predict8x8_sse2

+;(

+; unsigned char *src_ptr,

+; int src_pixels_per_line,

+; int xoffset,

+; int yoffset,

+; unsigned char *dst_ptr,

+; int dst_pitch

+;)

+extern sym(vp9_bilinear_filters_mmx)

+global sym(vp9_bilinear_predict8x8_sse2)

+sym(vp9_bilinear_predict8x8_sse2):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 6

+ SAVE_XMM 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ ALIGN_STACK 16, rax

+ sub rsp, 144 ; reserve 144 bytes

+ ;const short *HFilter = bilinear_filters_mmx[xoffset]

+ ;const short *VFilter = bilinear_filters_mmx[yoffset]

+ lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]

+ mov rsi, arg(0) ;src_ptr

+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line

+ ;Read 9-line unaligned data in and put them on stack. This gives a big

+ ;performance boost.

+ movdqu xmm0, [rsi]

+ lea rax, [rdx + rdx*2]

+ movdqu xmm1, [rsi+rdx]

+ movdqu xmm2, [rsi+rdx*2]

+ add rsi, rax

+ movdqu xmm3, [rsi]

+ movdqu xmm4, [rsi+rdx]

+ movdqu xmm5, [rsi+rdx*2]

+ add rsi, rax

+ movdqu xmm6, [rsi]

+ movdqu xmm7, [rsi+rdx]

+ movdqa XMMWORD PTR [rsp], xmm0

+ movdqu xmm0, [rsi+rdx*2]

+ movdqa XMMWORD PTR [rsp+16], xmm1

+ movdqa XMMWORD PTR [rsp+32], xmm2

+ movdqa XMMWORD PTR [rsp+48], xmm3

+ movdqa XMMWORD PTR [rsp+64], xmm4

+ movdqa XMMWORD PTR [rsp+80], xmm5

+ movdqa XMMWORD PTR [rsp+96], xmm6

+ movdqa XMMWORD PTR [rsp+112], xmm7

+ movdqa XMMWORD PTR [rsp+128], xmm0

+ movsxd rax, dword ptr arg(2) ;xoffset

+ shl rax, 5

+ add rax, rcx ;HFilter

+ mov rdi, arg(4) ;dst_ptr

+ movsxd rdx, dword ptr arg(5) ;dst_pitch

+ movdqa xmm1, [rax]

+ movdqa xmm2, [rax+16]

+ movsxd rax, dword ptr arg(3) ;yoffset

+ shl rax, 5

+ add rax, rcx ;VFilter

+ lea rcx, [rdi+rdx*8]

+ movdqa xmm5, [rax]

+ movdqa xmm6, [rax+16]

+ pxor xmm0, xmm0

+ ; get the first horizontal line done

+ movdqa xmm3, XMMWORD PTR [rsp]

+ movdqa xmm4, xmm3 ; make a copy of current line

+ psrldq xmm4, 1

+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07

+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08

+ pmullw xmm3, xmm1

+ pmullw xmm4, xmm2

+ paddw xmm3, xmm4

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ movdqa xmm7, xmm3

+ add rsp, 16 ; next line

+.next_row8x8:

+ movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

+ movdqa xmm4, xmm3 ; make a copy of current line

+ psrldq xmm4, 1

+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07

+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08

+ pmullw xmm3, xmm1

+ pmullw xmm4, xmm2

+ paddw xmm3, xmm4

+ pmullw xmm7, xmm5

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ movdqa xmm4, xmm3

+ pmullw xmm3, xmm6

+ paddw xmm3, xmm7

+ movdqa xmm7, xmm4

+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value

+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128

+ packuswb xmm3, xmm0

+ movq [rdi], xmm3 ; store the results in the destination

+ add rsp, 16 ; next line

+ add rdi, rdx

+ cmp rdi, rcx

+ jne .next_row8x8

+ ;add rsp, 144

+ pop rsp

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+SECTION_RODATA

+align 16

+rd:

+ times 8 dw 0x40

« libvpx.gyp ('K') | « source/libvpx/vp9/common/x86/vp9_subpixel_mmx.asm ('k') | source/libvpx/vp9/common/x86/vp9_subpixel_ssse3.asm » ('j') | no next file with comments »