Index: source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm |
=================================================================== |
--- source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm (revision 0) |
+++ source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm (working copy) |
@@ -0,0 +1,313 @@ |
+; |
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+;unsigned int vp9_highbd_calc16x16var_sse2 |
+;( |
+; uint16_t * src_ptr, (16-bit samples; high bit depth) |
+; int source_stride, (in samples, not bytes) |
+; uint16_t * ref_ptr, |
+; int recon_stride, (in samples, not bytes) |
+; unsigned int * SSE, (out: sum of squared differences) |
+; int * Sum (out: sum of differences, signed) |
+;) |
+; Computes Sum and SSE over a 16x16 block of 16-bit pixels, |
+; two rows (2 x 16 words) per loop iteration. |
+global sym(vp9_highbd_calc16x16var_sse2) PRIVATE |
+sym(vp9_highbd_calc16x16var_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ SAVE_XMM 7 |
+ push rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ mov rsi, arg(0) ;[src_ptr] |
+ mov rdi, arg(2) ;[ref_ptr] |
+ |
+ movsxd rax, DWORD PTR arg(1) ;[source_stride] |
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride] |
+ add rax, rax ; source stride in bytes (2 bytes per sample) |
+ add rdx, rdx ; recon stride in bytes (2 bytes per sample) |
+ |
+ ; Prefetch data |
+ prefetcht0 [rsi] |
+ prefetcht0 [rsi+16] |
+ prefetcht0 [rsi+rax] |
+ prefetcht0 [rsi+rax+16] |
+ lea rbx, [rsi+rax*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+16] |
+ prefetcht0 [rbx+rax] |
+ prefetcht0 [rbx+rax+16] |
+ |
+ prefetcht0 [rdi] |
+ prefetcht0 [rdi+16] |
+ prefetcht0 [rdi+rdx] |
+ prefetcht0 [rdi+rdx+16] |
+ lea rbx, [rdi+rdx*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+16] |
+ prefetcht0 [rbx+rdx] |
+ prefetcht0 [rbx+rdx+16] |
+ |
+ pxor xmm0, xmm0 ; clear xmm0 for unpack |
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs |
+ |
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse |
+ mov rcx, 16 ; 16 rows, two per iteration |
+ |
+.var16loop: |
+ movdqu xmm1, XMMWORD PTR [rsi] |
+ movdqu xmm2, XMMWORD PTR [rdi] |
+ |
+ ; prefetch two rows ahead of the pair being processed |
+ lea rbx, [rsi+rax*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+16] |
+ prefetcht0 [rbx+rax] |
+ prefetcht0 [rbx+rax+16] |
+ lea rbx, [rdi+rdx*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+16] |
+ prefetcht0 [rbx+rdx] |
+ prefetcht0 [rbx+rdx+16] |
+ |
+ pxor xmm5, xmm5 ; per-iteration word-diff accumulator |
+ |
+ psubw xmm1, xmm2 ; row 0, words 0-7: src - ref |
+ movdqu xmm3, XMMWORD PTR [rsi+16] |
+ paddw xmm5, xmm1 |
+ pmaddwd xmm1, xmm1 ; squared diffs, pair-summed into dwords |
+ movdqu xmm2, XMMWORD PTR [rdi+16] |
+ paddd xmm6, xmm1 |
+ |
+ psubw xmm3, xmm2 ; row 0, words 8-15 |
+ movdqu xmm1, XMMWORD PTR [rsi+rax] |
+ paddw xmm5, xmm3 |
+ pmaddwd xmm3, xmm3 |
+ movdqu xmm2, XMMWORD PTR [rdi+rdx] |
+ paddd xmm6, xmm3 |
+ |
+ psubw xmm1, xmm2 ; row 1, words 0-7 |
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16] |
+ paddw xmm5, xmm1 |
+ pmaddwd xmm1, xmm1 |
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16] |
+ paddd xmm6, xmm1 |
+ |
+ psubw xmm3, xmm2 ; row 1, words 8-15 |
+ paddw xmm5, xmm3 |
+ pmaddwd xmm3, xmm3 |
+ paddd xmm6, xmm3 |
+ |
+ ; sign-extend the eight signed word sums in xmm5 to dwords, add to xmm7 |
+ movdqa xmm1, xmm5 |
+ movdqa xmm2, xmm5 |
+ pcmpgtw xmm1, xmm0 ; FFFF where sum > 0 |
+ pcmpeqw xmm2, xmm0 ; FFFF where sum == 0 |
+ por xmm1, xmm2 ; FFFF where sum >= 0 |
+ pcmpeqw xmm1, xmm0 ; FFFF where sum < 0 (sign-extension mask) |
+ movdqa xmm2, xmm5 |
+ punpcklwd xmm5, xmm1 ; interleave word with its mask -> dword |
+ punpckhwd xmm2, xmm1 |
+ paddd xmm7, xmm5 |
+ paddd xmm7, xmm2 |
+ |
+ lea rsi, [rsi + 2*rax] ; advance two rows |
+ lea rdi, [rdi + 2*rdx] |
+ sub rcx, 2 |
+ jnz .var16loop |
+ |
+ ; horizontal add: fold four dwords of xmm6 (SSE) / xmm7 (Sum) to one |
+ movdqa xmm4, xmm6 |
+ punpckldq xmm6, xmm0 |
+ |
+ punpckhdq xmm4, xmm0 |
+ movdqa xmm5, xmm7 |
+ |
+ paddd xmm6, xmm4 |
+ punpckldq xmm7, xmm0 |
+ |
+ punpckhdq xmm5, xmm0 |
+ paddd xmm7, xmm5 |
+ |
+ movdqa xmm4, xmm6 |
+ movdqa xmm5, xmm7 |
+ |
+ psrldq xmm4, 8 |
+ psrldq xmm5, 8 |
+ |
+ paddd xmm6, xmm4 |
+ paddd xmm7, xmm5 |
+ |
+ mov rdi, arg(4) ; [SSE] |
+ mov rax, arg(5) ; [Sum] |
+ |
+ movd DWORD PTR [rdi], xmm6 ; final totals are in the low dword |
+ movd DWORD PTR [rax], xmm7 |
+ |
+ |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ pop rbx |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;unsigned int vp9_highbd_calc8x8var_sse2 |
+;( |
+; uint16_t * src_ptr, (16-bit samples; high bit depth) |
+; int source_stride, (in samples, not bytes) |
+; uint16_t * ref_ptr, |
+; int recon_stride, (in samples, not bytes) |
+; unsigned int * SSE, (out: sum of squared differences) |
+; int * Sum (out: sum of differences, signed) |
+;) |
+; Computes Sum and SSE over an 8x8 block of 16-bit pixels. |
+; One row = 8 words = one full xmm load; four rows per iteration. |
+global sym(vp9_highbd_calc8x8var_sse2) PRIVATE |
+sym(vp9_highbd_calc8x8var_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ SAVE_XMM 7 |
+ push rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ mov rsi, arg(0) ;[src_ptr] |
+ mov rdi, arg(2) ;[ref_ptr] |
+ |
+ movsxd rax, DWORD PTR arg(1) ;[source_stride] |
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride] |
+ add rax, rax ; source stride in bytes (2 bytes per sample) |
+ add rdx, rdx ; recon stride in bytes (2 bytes per sample) |
+ |
+ ; Prefetch data |
+ prefetcht0 [rsi] |
+ prefetcht0 [rsi+rax] |
+ lea rbx, [rsi+rax*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+rax] |
+ |
+ prefetcht0 [rdi] |
+ prefetcht0 [rdi+rdx] |
+ lea rbx, [rdi+rdx*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+rdx] |
+ |
+ pxor xmm0, xmm0 ; clear xmm0 for unpack |
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs |
+ |
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse |
+ mov rcx, 8 ; 8 rows, four per iteration |
+ |
+.var8loop: |
+ movdqu xmm1, XMMWORD PTR [rsi] |
+ movdqu xmm2, XMMWORD PTR [rdi] |
+ |
+ ; prefetch rows four-to-seven ahead of the group being processed |
+ lea rbx, [rsi+rax*4] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+rax] |
+ lea rbx, [rbx+rax*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+rax] |
+ lea rbx, [rdi+rdx*4] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+rdx] |
+ lea rbx, [rbx+rdx*2] |
+ prefetcht0 [rbx] |
+ prefetcht0 [rbx+rdx] |
+ |
+ pxor xmm5, xmm5 ; per-iteration word-diff accumulator |
+ |
+ psubw xmm1, xmm2 ; row 0: src - ref |
+ movdqu xmm3, XMMWORD PTR [rsi+rax] |
+ paddw xmm5, xmm1 |
+ pmaddwd xmm1, xmm1 ; squared diffs, pair-summed into dwords |
+ movdqu xmm2, XMMWORD PTR [rdi+rdx] |
+ paddd xmm6, xmm1 |
+ |
+ lea rsi, [rsi + 2*rax] ; advance mid-iteration; rows 2/3 load from [rsi]/[rsi+rax] |
+ lea rdi, [rdi + 2*rdx] |
+ |
+ psubw xmm3, xmm2 ; row 1 |
+ movdqu xmm1, XMMWORD PTR [rsi] |
+ paddw xmm5, xmm3 |
+ pmaddwd xmm3, xmm3 |
+ movdqu xmm2, XMMWORD PTR [rdi] |
+ paddd xmm6, xmm3 |
+ |
+ psubw xmm1, xmm2 ; row 2 |
+ movdqu xmm3, XMMWORD PTR [rsi+rax] |
+ paddw xmm5, xmm1 |
+ pmaddwd xmm1, xmm1 |
+ movdqu xmm2, XMMWORD PTR [rdi+rdx] |
+ paddd xmm6, xmm1 |
+ |
+ psubw xmm3, xmm2 ; row 3 |
+ paddw xmm5, xmm3 |
+ pmaddwd xmm3, xmm3 |
+ paddd xmm6, xmm3 |
+ |
+ ; sign-extend the eight signed word sums in xmm5 to dwords, add to xmm7 |
+ movdqa xmm1, xmm5 |
+ movdqa xmm2, xmm5 |
+ pcmpgtw xmm1, xmm0 ; FFFF where sum > 0 |
+ pcmpeqw xmm2, xmm0 ; FFFF where sum == 0 |
+ por xmm1, xmm2 ; FFFF where sum >= 0 |
+ pcmpeqw xmm1, xmm0 ; FFFF where sum < 0 (sign-extension mask) |
+ movdqa xmm2, xmm5 |
+ punpcklwd xmm5, xmm1 ; interleave word with its mask -> dword |
+ punpckhwd xmm2, xmm1 |
+ paddd xmm7, xmm5 |
+ paddd xmm7, xmm2 |
+ |
+ lea rsi, [rsi + 2*rax] ; advance past rows 2/3 |
+ lea rdi, [rdi + 2*rdx] |
+ sub rcx, 4 |
+ jnz .var8loop |
+ |
+ ; horizontal add: fold four dwords of xmm6 (SSE) / xmm7 (Sum) to one |
+ movdqa xmm4, xmm6 |
+ punpckldq xmm6, xmm0 |
+ |
+ punpckhdq xmm4, xmm0 |
+ movdqa xmm5, xmm7 |
+ |
+ paddd xmm6, xmm4 |
+ punpckldq xmm7, xmm0 |
+ |
+ punpckhdq xmm5, xmm0 |
+ paddd xmm7, xmm5 |
+ |
+ movdqa xmm4, xmm6 |
+ movdqa xmm5, xmm7 |
+ |
+ psrldq xmm4, 8 |
+ psrldq xmm5, 8 |
+ |
+ paddd xmm6, xmm4 |
+ paddd xmm7, xmm5 |
+ |
+ mov rdi, arg(4) ; [SSE] |
+ mov rax, arg(5) ; [Sum] |
+ |
+ movd DWORD PTR [rdi], xmm6 ; final totals are in the low dword |
+ movd DWORD PTR [rax], xmm7 |
+ |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ pop rbx |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |