Index: libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
diff --git a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
index b7a6b3286c17f4f7b1401fa624168b71a617ca86..c2c30deb27a74973f67eb334b89d63e8a8f4c940 100644
--- a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,7 +85,6 @@ sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
push rbx
push rsi
push rdi
@@ -207,12 +206,125 @@ var16loop:
pop rdi
pop rsi
pop rbx
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
+;unsigned int vp8_get16x16pred_error_sse2
+;(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride
+;)
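+;
+; Computes the sum of differences (Sum) and the sum of squared differences
+; (SSE) between a 16x16 source block and its predictor, and returns
+; SSE - (Sum*Sum)/256. A rough C sketch of the computation the code below
+; performs (illustrative only, not taken from the library's C sources):
+;
+;   int sum = 0;
+;   unsigned int sse = 0;
+;   for (int i = 0; i < 16; i++) {
+;     for (int j = 0; j < 16; j++) {
+;       int diff = src_ptr[j] - ref_ptr[j];
+;       sum += diff;
+;       sse += diff * diff;
+;     }
+;     src_ptr += src_stride;
+;     ref_ptr += ref_stride;
+;   }
+;   return sse - ((sum * sum) >> 8);  /* 16x16 = 256 pixels */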
+global sym(vp8_get16x16pred_error_sse2)
+sym(vp8_get16x16pred_error_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
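+ ; each loop iteration below processes one 16-pixel row of src and ref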
+var16peloop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
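+ ; src row now zero-extended to 16-bit words in xmm1 (low 8 pixels) and
+ ; xmm3 (high 8 pixels); ref row likewise in xmm2 and xmm4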
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
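+ ; accumulate the per-lane word sums of the differences in xmm7;
+ ; pmaddwd squares each word difference and adds adjacent pairs,
+ ; giving dword partial SSEs that are accumulated in xmm6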
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16peloop
+
+
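+ ; horizontal reduction: xmm6 holds four dword partial SSEs, xmm7 holds
+ ; eight signed word partial sums; the punpcklwd/punpckhwd + psrad pairs
+ ; below sign-extend the word sums to dwords before adding them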
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
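+ ; fold the four dword lanes of SSE (xmm1) and Sum (xmm6) down to a
+ ; single dword each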
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ movd DWORD PTR [rsp], xmm7 ;Sum
+ movd DWORD PTR [rsp+4], xmm1 ;SSE
+
+ ; return (SSE-((Sum*Sum)>>8));
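+ ; Sum can be negative, hence the signed loads; (Sum*Sum)>>8 removes the
+ ; mean (DC) contribution over the 16x16 = 256 pixels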
+ movsxd rdx, dword ptr [rsp]
+ imul rdx, rdx
+ sar rdx, 8
+ movsxd rax, dword ptr [rsp + 4]
+ sub rax, rdx
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
;unsigned int vp8_get8x8var_sse2
@@ -229,7 +341,6 @@ sym(vp8_get8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -395,7 +506,6 @@ sym(vp8_get8x8var_sse2):
pop rdi
pop rsi
RESTORE_GOT
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -418,7 +528,7 @@ sym(vp8_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -695,7 +805,6 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -797,7 +906,6 @@ vp8_half_horiz_vert_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -817,7 +925,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -933,7 +1041,6 @@ sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1020,7 +1127,6 @@ vp8_half_vert_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -1040,7 +1146,7 @@ sym(vp8_half_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -1148,7 +1254,6 @@ sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1233,7 +1338,6 @@ vp8_half_horiz_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -1253,7 +1357,7 @@ sym(vp8_half_horiz_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi