Index: libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm |
diff --git a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm |
index 13b76ea91fd62dfc5d32dfd254d0ab4b3ac488bb..67a9b4d3efa2e624acb41754045e601b6b10da3d 100644 |
--- a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm |
+++ b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm |
@@ -843,6 +843,136 @@ filter_block2d_bil_var_mmx_loop: |
pop rbp |
ret |
+;unsigned int vp8_get16x16pred_error_mmx |
+;( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride |
+;) |
+global sym(vp8_get16x16pred_error_mmx) |
+sym(vp8_get16x16pred_error_mmx): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 4 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ sub rsp, 16 ; scratch: [rsp] = Sum, [rsp+4] = SSE |
+ ; end prolog |
+ |
+ mov rsi, arg(0) ;DWORD PTR [src_ptr] |
+ mov rdi, arg(2) ;DWORD PTR [ref_ptr] |
+ |
+ movsxd rax, DWORD PTR arg(1) ;[src_stride] |
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride] |
+ |
+ pxor mm0, mm0 ; clear mm0 -- zero register for byte->word unpacks |
+ pxor mm7, mm7 ; clear mm7 -- accumulates signed diffs (Sum) in 4 word lanes |
+ |
+ pxor mm6, mm6 ; clear mm6 -- accumulates squared diffs (SSE) in 2 dword lanes |
+ mov rcx, 16 ; 16 rows to process |
+ |
+var16loop: |
+ |
+ movq mm1, [rsi] ; left 8 source pixels of this row |
+ movq mm2, [rdi] ; left 8 reference pixels |
+ |
+ movq mm3, mm1 |
+ movq mm4, mm2 |
+ |
+ punpcklbw mm1, mm0 ; src bytes 0-3 -> words |
+ punpckhbw mm3, mm0 ; src bytes 4-7 -> words |
+ |
+ punpcklbw mm2, mm0 ; ref bytes 0-3 -> words |
+ punpckhbw mm4, mm0 ; ref bytes 4-7 -> words |
+ |
+ psubw mm1, mm2 ; diffs for columns 0-3 |
+ psubw mm3, mm4 ; diffs for columns 4-7 |
+ |
+ paddw mm7, mm1 ; Sum += diffs (max |lane sum| = 64*255, fits signed word) |
+ pmaddwd mm1, mm1 ; square diffs, pairwise-add into dwords |
+ |
+ paddw mm7, mm3 |
+ pmaddwd mm3, mm3 |
+ |
+ paddd mm6, mm1 ; SSE += squared diffs |
+ paddd mm6, mm3 |
+ |
+ |
+ movq mm1, [rsi+8] ; right 8 source pixels of this row |
+ movq mm2, [rdi+8] ; right 8 reference pixels |
+ |
+ movq mm3, mm1 |
+ movq mm4, mm2 |
+ |
+ punpcklbw mm1, mm0 ; src bytes 8-11 -> words |
+ punpckhbw mm3, mm0 ; src bytes 12-15 -> words |
+ |
+ punpcklbw mm2, mm0 ; ref bytes 8-11 -> words |
+ punpckhbw mm4, mm0 ; ref bytes 12-15 -> words |
+ |
+ psubw mm1, mm2 ; diffs for columns 8-11 |
+ psubw mm3, mm4 ; diffs for columns 12-15 |
+ |
+ paddw mm7, mm1 ; Sum += diffs |
+ pmaddwd mm1, mm1 ; square diffs, pairwise-add into dwords |
+ |
+ paddw mm7, mm3 |
+ pmaddwd mm3, mm3 |
+ |
+ paddd mm6, mm1 ; SSE += squared diffs |
+ paddd mm6, mm3 |
+ |
+ add rsi, rax ; advance src pointer one row |
+ add rdi, rdx ; advance ref pointer one row |
+ |
+ sub rcx, 1 |
+ jnz var16loop |
+ |
+ |
+ movq mm1, mm6 ; mm1 = SSE accumulator (2 dwords) |
+ pxor mm6, mm6 |
+ |
+ pxor mm5, mm5 |
+ punpcklwd mm6, mm7 ; low 2 Sum words into dword high halves |
+ |
+ punpckhwd mm5, mm7 ; high 2 Sum words into dword high halves |
+ psrad mm5, 16 ; arithmetic shift sign-extends each word to a dword |
+ |
+ psrad mm6, 16 |
+ paddd mm6, mm5 ; mm6 = 2 dword partial Sums |
+ |
+ movq mm2, mm1 |
+ psrlq mm1, 32 |
+ |
+ paddd mm2, mm1 ; low dword of mm2 = total SSE |
+ movq mm7, mm6 |
+ |
+ psrlq mm6, 32 |
+ paddd mm6, mm7 ; low dword of mm6 = total Sum |
+ |
+ movd DWORD PTR [rsp], mm6 ;Sum |
+ movd DWORD PTR [rsp+4], mm2 ;SSE |
+ |
+ ; return (SSE-((Sum*Sum)>>8)); 16x16 = 256 pixels, so >>8 is the mean correction |
+ movsxd rdx, dword ptr [rsp] ; sign-extend Sum (may be negative) |
+ imul rdx, rdx ; Sum*Sum is non-negative, so sar == shr here |
+ sar rdx, 8 |
+ movsxd rax, dword ptr [rsp + 4] |
+ sub rax, rdx ; return value in rax |
+ |
+ |
+ ; begin epilog |
+ add rsp, 16 |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
SECTION_RODATA |
;short mmx_bi_rd[4] = { 64, 64, 64, 64}; |