Index: source/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm |
=================================================================== |
--- source/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm (revision 0) |
+++ source/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm (revision 0) |
@@ -0,0 +1,173 @@ |
+; |
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output) |
+; DC-only case: stores (input[0] + 3) >> 3 (arithmetic shift) into all 16 |
+; shorts of output. Only input[0] is read. rsi/rdi are preserved; no emms |
+; is issued here, so the MMX state is left live for the caller. |
+global sym(vp9_short_inv_walsh4x4_1_mmx) |
+sym(vp9_short_inv_walsh4x4_1_mmx): |
+    push        rbp |
+    mov         rbp, rsp |
+    SHADOW_ARGS_TO_STACK 2 |
+    push        rsi |
+    push        rdi |
+    ; end prolog |
+ |
+    mov         rsi, arg(0)         ;input |
+    mov         rdi, arg(1)         ;output |
+ |
+    movsx       eax, word [rsi]     ;input[0]: load only the 16 bits needed |
+    add         eax, 3              ;input[0] + 3 (only the low word is used) |
+ |
+    movd        mm0, eax |
+    punpcklwd   mm0, mm0            ;x x val val |
+    punpckldq   mm0, mm0            ;val val val val |
+    psraw       mm0, 3              ;(input[0] + 3) >> 3 |
+ |
+    movq        [rdi + 0],  mm0     ;output[0..3] |
+    movq        [rdi + 8],  mm0     ;output[4..7] |
+    movq        [rdi + 16], mm0     ;output[8..11] |
+    movq        [rdi + 24], mm0     ;output[12..15] |
+ |
+    ; begin epilog |
+    pop         rdi |
+    pop         rsi |
+    UNSHADOW_ARGS |
+    pop         rbp |
+    ret |
+ |
+;void vp9_short_inv_walsh4x4_mmx(short *input, short *output) |
+; Inverse 4x4 Walsh-Hadamard transform on 16 shorts. Two identical passes |
+; each apply, lane-wise to the four row registers r0..r3: |
+;   a1 = r0 + r3, b1 = r1 + r2, c1 = r1 - r2, d1 = r0 - r3 |
+;   out0 = a1 + b1, out1 = c1 + d1, out2 = a1 - b1, out3 = d1 - c1 |
+; followed by a 4x4 word transpose. Results are then rounded with |
+; (x + 3) >> 3 (arithmetic shift) and stored. 16-bit arithmetic wraps. |
+; rsi/rdi are preserved; mm0-mm7 are used and no emms is issued here. |
+global sym(vp9_short_inv_walsh4x4_mmx) |
+sym(vp9_short_inv_walsh4x4_mmx): |
+    push        rbp |
+    mov         rbp, rsp |
+    SHADOW_ARGS_TO_STACK 2 |
+    push        rsi |
+    push        rdi |
+    ; end prolog |
+ |
+    mov         rsi, arg(0)         ;input |
+    mov         rdi, arg(1)         ;output |
+ |
+    ; Rounding constant: 3 in every word of mm7. Built via movd from eax so |
+    ; no 64-bit GPR move is needed (matters if rax is an alias for eax). |
+    mov         eax, 00030003h |
+    movd        mm7, eax |
+    punpcklwd   mm7, mm7            ;0003000300030003h |
+ |
+    movq        mm0, [rsi + 0]      ;ip[0] |
+    movq        mm1, [rsi + 8]      ;ip[4] |
+    movq        mm2, [rsi + 16]     ;ip[8] |
+    movq        mm3, [rsi + 24]     ;ip[12] |
+ |
+    ; first pass: word lane i combines ip[i], ip[4+i], ip[8+i], ip[12+i] |
+    movq        mm4, mm0 |
+    movq        mm5, mm1 |
+    paddw       mm4, mm3            ;ip[0] + ip[12] aka a1 |
+    paddw       mm5, mm2            ;ip[4] + ip[8] aka b1 |
+ |
+    movq        mm6, mm4            ;temp a1 |
+    paddw       mm4, mm5            ;a1 + b1 |
+    psubw       mm6, mm5            ;a1 - b1 |
+ |
+    psubw       mm0, mm3            ;ip[0] - ip[12] aka d1 |
+    psubw       mm1, mm2            ;ip[4] - ip[8] aka c1 |
+ |
+    movq        mm5, mm0            ;temp d1 |
+    paddw       mm0, mm1            ;d1 + c1 |
+    psubw       mm5, mm1            ;d1 - c1 |
+ |
+    ; transpose; rows below are mm4, mm0, mm6, mm5 (words shown high to low) |
+    ; 03 02 01 00 |
+    ; 13 12 11 10 |
+    ; 23 22 21 20 |
+    ; 33 32 31 30 |
+    movq        mm3, mm4            ; 03 02 01 00 |
+    punpcklwd   mm4, mm0            ; 11 01 10 00 |
+    punpckhwd   mm3, mm0            ; 13 03 12 02 |
+ |
+    movq        mm1, mm6            ; 23 22 21 20 |
+    punpcklwd   mm6, mm5            ; 31 21 30 20 |
+    punpckhwd   mm1, mm5            ; 33 23 32 22 |
+ |
+    movq        mm0, mm4            ; 11 01 10 00 |
+    movq        mm2, mm3            ; 13 03 12 02 |
+ |
+    punpckldq   mm0, mm6            ; 30 20 10 00 aka ip[0] |
+    punpckhdq   mm4, mm6            ; 31 21 11 01 aka ip[4] |
+ |
+    punpckldq   mm2, mm1            ; 32 22 12 02 aka ip[8] |
+    punpckhdq   mm3, mm1            ; 33 23 13 03 aka ip[12] |
+;~~~~~~~~~~~~~~~~~~~~~ |
+    movq        mm1, mm0 |
+    movq        mm5, mm4 |
+    paddw       mm1, mm3            ;ip[0] + ip[12] aka a1 |
+    paddw       mm5, mm2            ;ip[4] + ip[8] aka b1 |
+ |
+    movq        mm6, mm1            ;temp a1 |
+    paddw       mm1, mm5            ;a1 + b1 |
+    psubw       mm6, mm5            ;a1 - b1 |
+ |
+    psubw       mm0, mm3            ;ip[0] - ip[12] aka d1 |
+    psubw       mm4, mm2            ;ip[4] - ip[8] aka c1 |
+ |
+    movq        mm5, mm0            ;temp d1 |
+    paddw       mm0, mm4            ;d1 + c1 |
+    psubw       mm5, mm4            ;d1 - c1 |
+;~~~~~~~~~~~~~~~~~~~~~ |
+    movq        mm3, mm1            ; 03 02 01 00 |
+    punpcklwd   mm1, mm0            ; 11 01 10 00 |
+    punpckhwd   mm3, mm0            ; 13 03 12 02 |
+ |
+    movq        mm4, mm6            ; 23 22 21 20 |
+    punpcklwd   mm6, mm5            ; 31 21 30 20 |
+    punpckhwd   mm4, mm5            ; 33 23 32 22 |
+ |
+    movq        mm0, mm1            ; 11 01 10 00 |
+    movq        mm2, mm3            ; 13 03 12 02 |
+ |
+    punpckldq   mm0, mm6            ; 30 20 10 00 aka ip[0] |
+    punpckhdq   mm1, mm6            ; 31 21 11 01 aka ip[4] |
+ |
+    punpckldq   mm2, mm4            ; 32 22 12 02 aka ip[8] |
+    punpckhdq   mm3, mm4            ; 33 23 13 03 aka ip[12] |
+ |
+    paddw       mm0, mm7            ;+ 3 rounding bias |
+    paddw       mm1, mm7 |
+    paddw       mm2, mm7 |
+    paddw       mm3, mm7 |
+    psraw       mm0, 3              ;(x + 3) >> 3 |
+    psraw       mm1, 3 |
+    psraw       mm2, 3 |
+    psraw       mm3, 3 |
+ |
+    movq        [rdi + 0],  mm0 |
+    movq        [rdi + 8],  mm1 |
+    movq        [rdi + 16], mm2 |
+    movq        [rdi + 24], mm3 |
+ |
+    ; begin epilog |
+    pop         rdi |
+    pop         rsi |
+    UNSHADOW_ARGS |
+    pop         rbp |
+    ret |
+ |