| Index: libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
|
| diff --git a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
|
| index b7a6b3286c17f4f7b1401fa624168b71a617ca86..c2c30deb27a74973f67eb334b89d63e8a8f4c940 100644
|
| --- a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
|
| +++ b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
|
| @@ -85,7 +85,6 @@ sym(vp8_get16x16var_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| push rbx
|
| push rsi
|
| push rdi
|
| @@ -207,12 +206,125 @@ var16loop:
|
| pop rdi
|
| pop rsi
|
| pop rbx
|
| - RESTORE_XMM
|
| UNSHADOW_ARGS
|
| pop rbp
|
| ret
|
|
|
|
|
| +;unsigned int vp8_get16x16pred_error_sse2
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride
|
| +;)
|
| +global sym(vp8_get16x16pred_error_sse2)
|
| +sym(vp8_get16x16pred_error_sse2):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 4
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + sub rsp, 16 ; scratch: Sum spilled to [rsp], SSE to [rsp+4]
|
| + ; end prolog
|
| +
|
| + mov rsi, arg(0) ;rsi = src_ptr
|
| + mov rdi, arg(2) ;rdi = ref_ptr
|
| +
|
| + movsxd rax, DWORD PTR arg(1) ;rax = sign-extended src_stride
|
| + movsxd rdx, DWORD PTR arg(3) ;rdx = sign-extended ref_stride
|
| +
|
| + pxor xmm0, xmm0 ; xmm0 = constant zero for byte->word unpacks (live all loop)
|
| + pxor xmm7, xmm7 ; xmm7 = Sum accumulator, 8 signed word lanes. NOTE(review): xmm6/xmm7 are callee-saved on Win64 but no SAVE_XMM is emitted -- confirm
|
| +
|
| + pxor xmm6, xmm6 ; xmm6 = SSE accumulator, 4 dword lanes
|
| + mov rcx, 16 ; 16 rows of 16 pixels
|
| +
|
| +var16peloop: ; one iteration per row
|
| + movdqu xmm1, XMMWORD PTR [rsi] ; 16 src pixels (unaligned load)
|
| + movdqu xmm2, XMMWORD PTR [rdi] ; 16 ref pixels (unaligned load)
|
| +
|
| + movdqa xmm3, xmm1 ; copy src for the high half
|
| + movdqa xmm4, xmm2 ; copy ref for the high half
|
| +
|
| + punpcklbw xmm1, xmm0 ; low 8 src bytes -> words
|
| + punpckhbw xmm3, xmm0 ; high 8 src bytes -> words
|
| +
|
| + punpcklbw xmm2, xmm0 ; low 8 ref bytes -> words
|
| + punpckhbw xmm4, xmm0 ; high 8 ref bytes -> words
|
| +
|
| + psubw xmm1, xmm2 ; diff = src - ref, low half (fits signed words)
|
| + psubw xmm3, xmm4 ; diff, high half
|
| +
|
| + paddw xmm7, xmm1 ; Sum += low diffs (max |lane| = 32*255, no word overflow)
|
| + pmaddwd xmm1, xmm1 ; square diffs, pair-sum into dwords
|
| +
|
| + paddw xmm7, xmm3 ; Sum += high diffs
|
| + pmaddwd xmm3, xmm3 ; square high diffs likewise
|
| +
|
| + paddd xmm6, xmm1 ; SSE += low squares
|
| + paddd xmm6, xmm3 ; SSE += high squares
|
| +
|
| + add rsi, rax ; advance src one row
|
| + add rdi, rdx ; advance ref one row
|
| +
|
| + sub rcx, 1 ; row countdown
|
| + jnz var16peloop
|
| +
|
| +
|
| + movdqa xmm1, xmm6 ; xmm1 = 4 dword SSE partials
|
| + pxor xmm6, xmm6 ; reuse xmm6 for the Sum widening
|
| +
|
| + pxor xmm5, xmm5
|
| + punpcklwd xmm6, xmm7 ; low 4 word sums into dword high halves...
|
| + punpckhwd xmm5, xmm7 ; ...and high 4 word sums likewise
|
| + psrad xmm5, 16 ; arithmetic shift completes sign-extension to dwords
|
| +
|
| + psrad xmm6, 16
|
| + paddd xmm6, xmm5 ; xmm6 = 4 dword Sum partials
|
| +
|
| + movdqa xmm2, xmm1 ; fold SSE: interleave dwords with zero (xmm0 still 0)
|
| + punpckldq xmm1, xmm0
|
| +
|
| + punpckhdq xmm2, xmm0
|
| + movdqa xmm7, xmm6 ; fold Sum the same way
|
| +
|
| + paddd xmm1, xmm2 ; SSE: 4 dwords -> 2
|
| + punpckldq xmm6, xmm0
|
| +
|
| + punpckhdq xmm7, xmm0
|
| + paddd xmm6, xmm7 ; Sum: 4 dwords -> 2
|
| +
|
| + movdqa xmm2, xmm1 ; final fold: add upper qword onto lower
|
| + movdqa xmm7, xmm6
|
| +
|
| + psrldq xmm1, 8
|
| + psrldq xmm6, 8
|
| +
|
| + paddd xmm7, xmm6 ; Sum: 2 dwords -> 1 (low dword of xmm7)
|
| + paddd xmm1, xmm2 ; SSE: 2 dwords -> 1 (low dword of xmm1)
|
| +
|
| + movd DWORD PTR [rsp], xmm7 ;Sum
|
| + movd DWORD PTR [rsp+4], xmm1 ;SSE
|
| +
|
| + ; return (SSE-((Sum*Sum)>>8));
|
| + movsxd rdx, dword ptr [rsp] ;rdx = Sum (signed)
|
| + imul rdx, rdx ;Sum*Sum (non-negative)
|
| + sar rdx, 8 ;/256 == /(16*16) pixels
|
| + movsxd rax, dword ptr [rsp + 4] ;rax = SSE
|
| + sub rax, rdx ;return value in rax
|
| +
|
| + ; begin epilog
|
| + add rsp, 16 ; release Sum/SSE scratch
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
|
|
|
|
| ;unsigned int vp8_get8x8var_sse2
|
| @@ -229,7 +341,6 @@ sym(vp8_get8x8var_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -395,7 +506,6 @@ sym(vp8_get8x8var_sse2):
|
| pop rdi
|
| pop rsi
|
| RESTORE_GOT
|
| - RESTORE_XMM
|
| UNSHADOW_ARGS
|
| pop rbp
|
| ret
|
| @@ -418,7 +528,7 @@ sym(vp8_filter_block2d_bil_var_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 9
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -695,7 +805,6 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 7
|
| - SAVE_XMM 7
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -797,7 +906,6 @@ vp8_half_horiz_vert_variance8x_h_1:
|
| pop rdi
|
| pop rsi
|
| RESTORE_GOT
|
| - RESTORE_XMM
|
| UNSHADOW_ARGS
|
| pop rbp
|
| ret
|
| @@ -817,7 +925,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 7
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -933,7 +1041,6 @@ sym(vp8_half_vert_variance8x_h_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 7
|
| - SAVE_XMM 7
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1020,7 +1127,6 @@ vp8_half_vert_variance8x_h_1:
|
| pop rdi
|
| pop rsi
|
| RESTORE_GOT
|
| - RESTORE_XMM
|
| UNSHADOW_ARGS
|
| pop rbp
|
| ret
|
| @@ -1040,7 +1146,7 @@ sym(vp8_half_vert_variance16x_h_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 7
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1148,7 +1254,6 @@ sym(vp8_half_horiz_variance8x_h_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 7
|
| - SAVE_XMM 7
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1233,7 +1338,6 @@ vp8_half_horiz_variance8x_h_1:
|
| pop rdi
|
| pop rsi
|
| RESTORE_GOT
|
| - RESTORE_XMM
|
| UNSHADOW_ARGS
|
| pop rbp
|
| ret
|
| @@ -1253,7 +1357,7 @@ sym(vp8_half_horiz_variance16x_h_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 7
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
|
|