| Index: libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm
|
| diff --git a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm
|
| index 13b76ea91fd62dfc5d32dfd254d0ab4b3ac488bb..67a9b4d3efa2e624acb41754045e601b6b10da3d 100644
|
| --- a/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm
|
| +++ b/libvpx/source/libvpx/vp8/encoder/x86/variance_impl_mmx.asm
|
| @@ -843,6 +843,136 @@ filter_block2d_bil_var_mmx_loop:
|
| pop rbp
|
| ret
|
|
|
| +;unsigned int vp8_get16x16pred_error_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride
|
| +;) returns SSE - ((Sum*Sum)>>8), i.e. variance*256 of the 16x16 block
|
| +global sym(vp8_get16x16pred_error_mmx)
|
| +sym(vp8_get16x16pred_error_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 4
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + sub rsp, 16 ; scratch slots for Sum and SSE spill
|
| + ; end prolog
|
| +
|
| + mov rsi, arg(0) ;DWORD PTR [src_ptr]
|
| + mov rdi, arg(2) ;DWORD PTR [ref_ptr]
|
| +
|
| + movsxd rax, DWORD PTR arg(1) ;[src_stride]
|
| + movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
|
| +
|
| + pxor mm0, mm0 ; clear mm0 for unpack (zero-extend bytes to words)
|
| + pxor mm7, mm7 ; clear mm7 for accumulating diffs (Sum, 4 signed words)
|
| +
|
| + pxor mm6, mm6 ; clear mm6 for accumulating sse (2 dwords)
|
| + mov rcx, 16 ; 16 rows
|
| +
|
| +var16loop:
|
| + movq mm1, [rsi] ; src pixels 0..7
|
| + movq mm2, [rdi] ; ref pixels 0..7
|
| +
|
| + movq mm3, mm1
|
| + movq mm4, mm2
|
| +
|
| + punpcklbw mm1, mm0 ; src bytes 0..3 -> words
|
| + punpckhbw mm3, mm0 ; src bytes 4..7 -> words
|
| +
|
| + punpcklbw mm2, mm0 ; ref bytes 0..3 -> words
|
| + punpckhbw mm4, mm0 ; ref bytes 4..7 -> words
|
| +
|
| + psubw mm1, mm2 ; diff = src - ref (signed words)
|
| + psubw mm3, mm4
|
| +
|
| + paddw mm7, mm1 ; accumulate signed diffs into Sum
|
| + pmaddwd mm1, mm1 ; diff^2, pairs summed into dwords
|
| +
|
| + paddw mm7, mm3
|
| + pmaddwd mm3, mm3
|
| +
|
| + paddd mm6, mm1 ; accumulate SSE
|
| + paddd mm6, mm3
|
| +
|
| +
|
| + movq mm1, [rsi+8] ; src pixels 8..15
|
| + movq mm2, [rdi+8] ; ref pixels 8..15
|
| +
|
| + movq mm3, mm1
|
| + movq mm4, mm2
|
| +
|
| + punpcklbw mm1, mm0 ; src bytes 8..11 -> words
|
| + punpckhbw mm3, mm0 ; src bytes 12..15 -> words
|
| +
|
| + punpcklbw mm2, mm0 ; ref bytes 8..11 -> words
|
| + punpckhbw mm4, mm0 ; ref bytes 12..15 -> words
|
| +
|
| + psubw mm1, mm2 ; diff = src - ref
|
| + psubw mm3, mm4
|
| +
|
| + paddw mm7, mm1 ; accumulate signed diffs into Sum
|
| + pmaddwd mm1, mm1 ; diff^2, pairs summed into dwords
|
| +
|
| + paddw mm7, mm3
|
| + pmaddwd mm3, mm3
|
| +
|
| + paddd mm6, mm1 ; accumulate SSE
|
| + paddd mm6, mm3
|
| +
|
| + add rsi, rax ; advance to next src row
|
| + add rdi, rdx ; advance to next ref row
|
| +
|
| + sub rcx, 1
|
| + jnz var16loop
|
| +
|
| +
|
| + movq mm1, mm6 ; mm1 = SSE accumulator (2 dwords)
|
| + pxor mm6, mm6
|
| +
|
| + pxor mm5, mm5
|
| + punpcklwd mm6, mm7 ; low 2 Sum words into dword high halves
|
| + punpckhwd mm5, mm7 ; high 2 Sum words into dword high halves
|
| + psrad mm5, 16 ; arithmetic shift back down: sign-extends words
|
| +
|
| + psrad mm6, 16
|
| + paddd mm6, mm5 ; mm6 = 2 partial signed Sum dwords
|
| +
|
| + movq mm2, mm1
|
| + psrlq mm1, 32
|
| +
|
| + paddd mm2, mm1 ; fold SSE: low dword = total SSE
|
| + movq mm7, mm6
|
| +
|
| + psrlq mm6, 32
|
| + paddd mm6, mm7 ; fold Sum: low dword = total Sum
|
| +
|
| + movd DWORD PTR [rsp], mm6 ;Sum
|
| + movd DWORD PTR [rsp+4], mm2 ;SSE
|
| +
|
| + ; return (SSE-((Sum*Sum)>>8));
|
| + movsxd rdx, dword ptr [rsp] ; rdx = Sum (sign-extended)
|
| + imul rdx, rdx ; Sum*Sum (non-negative)
|
| + sar rdx, 8 ; /256 = mean correction for 16x16
|
| + movsxd rax, dword ptr [rsp + 4] ; rax = SSE
|
| + sub rax, rdx ; return SSE - Sum*Sum/256 in rax
|
| +
|
| +
|
| + ; begin epilog
|
| + add rsp, 16
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
|
|
| SECTION_RODATA
|
| ;short mmx_bi_rd[4] = { 64, 64, 64, 64};
|
|
|