Index: source/libvpx/vp9/encoder/x86/vp9_encodeopt.asm
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_encodeopt.asm (revision 0)
+++ source/libvpx/vp9/encoder/x86/vp9_encodeopt.asm (revision 0)
@@ -0,0 +1,404 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp9_block_error_xmm)
+sym(vp9_block_error_xmm):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+    mov rsi, arg(0) ;coeff_ptr
+    mov rdi, arg(1) ;dcoef_ptr
+
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rdi]
+
+    movdqa xmm2, [rsi+16]
+    movdqa xmm3, [rdi+16]
+
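+    ; diff = coeff - dqcoeff; pmaddwd squares each 16-bit difference
+    ; and sums adjacent pairs into 32-bit lanes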
+    psubw xmm0, xmm1
+    psubw xmm2, xmm3
+
+    pmaddwd xmm0, xmm0
+    pmaddwd xmm2, xmm2
+
+    paddd xmm0, xmm2
+
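+    ; horizontal add: fold the four 32-bit partial sums in xmm0
+    ; down to a single value returned in rax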
+    pxor xmm5, xmm5
+    movdqa xmm1, xmm0
+
+    punpckldq xmm0, xmm5
+    punpckhdq xmm1, xmm5
+
+    paddd xmm0, xmm1
+    movdqa xmm1, xmm0
+
+    psrldq xmm0, 8
+    paddd xmm0, xmm1
+
+    movq rax, xmm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp9_block_error_mmx)
+sym(vp9_block_error_mmx):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+    mov rsi, arg(0) ;coeff_ptr
+    pxor mm7, mm7
+
+    mov rdi, arg(1) ;dcoef_ptr
+    movq mm3, [rsi]
+
+    movq mm4, [rdi]
+    movq mm5, [rsi+8]
+
+    movq mm6, [rdi+8]
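+    ; dc is hard-coded to zero in this variant, so the pcmpeqw mask
+    ; below is all-ones and the pand keeps every difference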
+    pxor mm1, mm1 ; dc == 0 here, replacing the original "movd mm1, dc"
+
+    movq mm2, mm7
+    psubw mm5, mm6
+
+    por mm1, mm2
+    pmaddwd mm5, mm5
+
+    pcmpeqw mm1, mm7
+    psubw mm3, mm4
+
+    pand mm1, mm3
+    pmaddwd mm1, mm1
+
+    paddd mm1, mm5
+    movq mm3, [rsi+16]
+
+    movq mm4, [rdi+16]
+    movq mm5, [rsi+24]
+
+    movq mm6, [rdi+24]
+    psubw mm5, mm6
+
+    pmaddwd mm5, mm5
+    psubw mm3, mm4
+
+    pmaddwd mm3, mm3
+    paddd mm3, mm5
+
+    paddd mm1, mm3
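+
+    ; fold the two 32-bit halves of the accumulator into the low dword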
+    movq mm0, mm1
+
+    psrlq mm1, 32
+    paddd mm0, mm1
+
+    movq rax, mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_mmx_impl)
+sym(vp9_mbblock_error_mmx_impl):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push rsi
+    push rdi
+    ; end prolog
+
+
+    mov rsi, arg(0) ;coeff_ptr
+    pxor mm7, mm7
+
+    mov rdi, arg(1) ;dcoef_ptr
+    pxor mm2, mm2
+
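+    ; build a per-word mask from dc: a non-zero dc makes the first
+    ; 16-bit lane compare unequal to zero, so the pand in the loop
+    ; clears the DC difference and excludes it from the error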
+    movd mm1, dword ptr arg(2) ;dc
+    por mm1, mm2
+
+    pcmpeqw mm1, mm7
+    mov rcx, 16
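+    ; 16 iterations x 32 bytes = 256 coefficients (16 4x4 blocks)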
+
+.mberror_loop_mmx:
+    movq mm3, [rsi]
+    movq mm4, [rdi]
+
+    movq mm5, [rsi+8]
+    movq mm6, [rdi+8]
+
+
+    psubw mm5, mm6
+    pmaddwd mm5, mm5
+
+    psubw mm3, mm4
+    pand mm3, mm1
+
+    pmaddwd mm3, mm3
+    paddd mm2, mm5
+
+    paddd mm2, mm3
+    movq mm3, [rsi+16]
+
+    movq mm4, [rdi+16]
+    movq mm5, [rsi+24]
+
+    movq mm6, [rdi+24]
+    psubw mm5, mm6
+
+    pmaddwd mm5, mm5
+    psubw mm3, mm4
+
+    pmaddwd mm3, mm3
+    paddd mm2, mm5
+
+    paddd mm2, mm3
+    add rsi, 32
+
+    add rdi, 32
+    sub rcx, 1
+
+    jnz .mberror_loop_mmx
+
+    movq mm0, mm2
+    psrlq mm2, 32
+
+    paddd mm0, mm2
+    movq rax, mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_xmm_impl)
+sym(vp9_mbblock_error_xmm_impl):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 6
+    push rsi
+    push rdi
+    ; end prolog
+
+
+    mov rsi, arg(0) ;coeff_ptr
+    pxor xmm6, xmm6
+
+    mov rdi, arg(1) ;dcoef_ptr
+    pxor xmm4, xmm4
+
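+    ; same dc masking as the mmx version: a non-zero dc clears the
+    ; first word lane so the DC coefficient is excluded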
+    movd xmm5, dword ptr arg(2) ;dc
+    por xmm5, xmm4
+
+    pcmpeqw xmm5, xmm6
+    mov rcx, 16
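+    ; 16 iterations x 32 bytes = 256 coefficients (16 4x4 blocks)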
+
+.mberror_loop:
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rdi]
+
+    movdqa xmm2, [rsi+16]
+    movdqa xmm3, [rdi+16]
+
+
+    psubw xmm2, xmm3
+    pmaddwd xmm2, xmm2
+
+    psubw xmm0, xmm1
+    pand xmm0, xmm5
+
+    pmaddwd xmm0, xmm0
+    add rsi, 32
+
+    add rdi, 32
+
+    sub rcx, 1
+    paddd xmm4, xmm2
+
+    paddd xmm4, xmm0
+    jnz .mberror_loop
+
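+    ; horizontal add of the four dword sums, as in vp9_block_error_xmm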
+    movdqa xmm0, xmm4
+    punpckldq xmm0, xmm6
+
+    punpckhdq xmm4, xmm6
+    paddd xmm0, xmm4
+
+    movdqa xmm1, xmm0
+    psrldq xmm0, 8
+
+    paddd xmm0, xmm1
+    movq rax, xmm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_mmx_impl)
+sym(vp9_mbuverror_mmx_impl):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+    mov rsi, arg(0) ;s_ptr
+    mov rdi, arg(1) ;d_ptr
+
+    mov rcx, 16
+    pxor mm7, mm7
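+    ; 16 iterations x 16 bytes = 128 chroma coefficients (two 8x8 planes)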
+
+.mbuverror_loop_mmx:
+
+    movq mm1, [rsi]
+    movq mm2, [rdi]
+
+    psubw mm1, mm2
+    pmaddwd mm1, mm1
+
+
+    movq mm3, [rsi+8]
+    movq mm4, [rdi+8]
+
+    psubw mm3, mm4
+    pmaddwd mm3, mm3
+
+
+    paddd mm7, mm1
+    paddd mm7, mm3
+
+
+    add rsi, 16
+    add rdi, 16
+
+    dec rcx
+    jnz .mbuverror_loop_mmx
+
+    movq mm0, mm7
+    psrlq mm7, 32
+
+    paddd mm0, mm7
+    movq rax, mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+
+;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_xmm_impl)
+sym(vp9_mbuverror_xmm_impl):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+    mov rsi, arg(0) ;s_ptr
+    mov rdi, arg(1) ;d_ptr
+
+    mov rcx, 16
+    pxor xmm3, xmm3
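+    ; 16 iterations x 16 bytes = 128 chroma coefficients (two 8x8 planes)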
+
+.mbuverror_loop:
+
+    movdqa xmm1, [rsi]
+    movdqa xmm2, [rdi]
+
+    psubw xmm1, xmm2
+    pmaddwd xmm1, xmm1
+
+    paddd xmm3, xmm1
+
+    add rsi, 16
+    add rdi, 16
+
+    dec rcx
+    jnz .mbuverror_loop
+
+    pxor xmm0, xmm0
+    movdqa xmm1, xmm3
+
+    movdqa xmm2, xmm1
+    punpckldq xmm1, xmm0
+
+    punpckhdq xmm2, xmm0
+    paddd xmm1, xmm2
+
+    movdqa xmm2, xmm1
+
+    psrldq xmm1, 8
+    paddd xmm1, xmm2
+
+    movq rax, xmm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop rbp
+    ret
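
For reference, a scalar C sketch of what these routines compute (not part
of the patch; the function name and the per-call coefficient counts below
are inferred from the prototypes and loop bounds in the assembly above):

    /* Sum of squared differences between transform coefficients and
     * their dequantized values. */
    static int block_error_c(const short *coeff_ptr, const short *dcoef_ptr,
                             int count) {
        int i, error = 0;

        for (i = 0; i < count; i++) {
            const int diff = coeff_ptr[i] - dcoef_ptr[i];
            error += diff * diff;
        }
        return error;
    }

    /* vp9_block_error_* corresponds to count == 16 (one 4x4 block); the
     * mbblock variants cover 256 coefficients, with the DC term masked
     * out when the dc argument is non-zero; the mbuverror variants cover
     * 128 chroma coefficients. */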