Index: source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm |
=================================================================== |
--- source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm (revision 0) |
+++ source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm (revision 0) |
@@ -0,0 +1,353 @@ |
+; |
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+; PROCESS_16X2X8 first |
+; Handles two 16-byte-wide rows. For each row it computes the SAD of the |
+; 16 source bytes at [rsi] against the eight 16-byte reference windows that |
+; start at [rdi + 0] .. [rdi + 7], as eight 16-bit sums, and accumulates |
+; them into the eight words of xmm1 (paddw, i.e. wrapping 16-bit adds). |
+; %1 != 0: xmm1 is overwritten by the first row's sums (first expansion). |
+; %1 == 0: the first row's sums are added to the existing xmm1. |
+; In: rsi = source row (loaded with movdqa, so it must be 16-byte aligned), |
+; rax = source stride, rdi = reference row, rdx = reference stride. |
+; Out: xmm1 updated; rsi += 2 * rax; rdi += 2 * rdx. |
+; Clobbers: xmm0, xmm2, xmm3, xmm4, xmm5. Reads 24 bytes per reference row. |
+; mpsadbw imm8: bits 1:0 select the 4-byte group of the second operand, bit 2 |
+; selects byte offset 0 or 4 in the first operand; the result is eight word |
+; SADs of that group against eight consecutive 4-byte windows. |
+%macro PROCESS_16X2X8 1 |
+%if %1 |
+ ; first row, sums written straight into the accumulator xmm1 |
+ movdqa xmm0, XMMWORD PTR [rsi] ; xmm0 = src[0..15] |
+ movq xmm1, MMWORD PTR [rdi] ; ref[0..7] |
+ movq xmm3, MMWORD PTR [rdi+8] ; ref[8..15] |
+ movq xmm2, MMWORD PTR [rdi+16] ; ref[16..23] |
+ punpcklqdq xmm1, xmm3 ; xmm1 = ref[0..15] |
+ punpcklqdq xmm3, xmm2 ; xmm3 = ref[8..23] |
+ |
+ movdqa xmm2, xmm1 |
+ mpsadbw xmm1, xmm0, 0x0 ; src[0..3] vs ref[i..i+3], i = 0..7 |
+ mpsadbw xmm2, xmm0, 0x5 ; src[4..7] vs ref[4+i..7+i] |
+ |
+ psrldq xmm0, 8 ; low half of xmm0 = src[8..15] |
+ |
+ movdqa xmm4, xmm3 |
+ mpsadbw xmm3, xmm0, 0x0 ; src[8..11] vs ref[8+i..11+i] |
+ mpsadbw xmm4, xmm0, 0x5 ; src[12..15] vs ref[12+i..15+i] |
+ |
+ paddw xmm1, xmm2 |
+ paddw xmm1, xmm3 |
+ paddw xmm1, xmm4 |
+%else |
+ ; first row, same steps with xmm5 as the per-row total, then added to xmm1 |
+ movdqa xmm0, XMMWORD PTR [rsi] |
+ movq xmm5, MMWORD PTR [rdi] |
+ movq xmm3, MMWORD PTR [rdi+8] |
+ movq xmm2, MMWORD PTR [rdi+16] |
+ punpcklqdq xmm5, xmm3 |
+ punpcklqdq xmm3, xmm2 |
+ |
+ movdqa xmm2, xmm5 |
+ mpsadbw xmm5, xmm0, 0x0 |
+ mpsadbw xmm2, xmm0, 0x5 |
+ |
+ psrldq xmm0, 8 |
+ |
+ movdqa xmm4, xmm3 |
+ mpsadbw xmm3, xmm0, 0x0 |
+ mpsadbw xmm4, xmm0, 0x5 |
+ |
+ paddw xmm5, xmm2 |
+ paddw xmm5, xmm3 |
+ paddw xmm5, xmm4 |
+ |
+ paddw xmm1, xmm5 |
+%endif |
+ ; second row (one stride further on), always accumulated via xmm5 |
+ movdqa xmm0, XMMWORD PTR [rsi + rax] |
+ movq xmm5, MMWORD PTR [rdi+ rdx] |
+ movq xmm3, MMWORD PTR [rdi+ rdx+8] |
+ movq xmm2, MMWORD PTR [rdi+ rdx+16] |
+ punpcklqdq xmm5, xmm3 |
+ punpcklqdq xmm3, xmm2 |
+ |
+ ; both rows are loaded: step the row pointers two rows forward |
+ lea rsi, [rsi+rax*2] |
+ lea rdi, [rdi+rdx*2] |
+ |
+ movdqa xmm2, xmm5 |
+ mpsadbw xmm5, xmm0, 0x0 |
+ mpsadbw xmm2, xmm0, 0x5 |
+ |
+ psrldq xmm0, 8 |
+ movdqa xmm4, xmm3 |
+ mpsadbw xmm3, xmm0, 0x0 |
+ mpsadbw xmm4, xmm0, 0x5 |
+ |
+ paddw xmm5, xmm2 |
+ paddw xmm5, xmm3 |
+ paddw xmm5, xmm4 |
+ |
+ paddw xmm1, xmm5 |
+%endmacro |
+ |
+; PROCESS_8X2X8 first |
+; Handles two 8-byte-wide rows. For each row it computes the SAD of the |
+; 8 source bytes at [rsi] against the eight 8-byte reference windows that |
+; start at [rdi + 0] .. [rdi + 7], as eight 16-bit sums, and accumulates |
+; them into the eight words of xmm1. |
+; %1 != 0: xmm1 is overwritten by the first row's sums (first expansion). |
+; %1 == 0: the first row's sums are added to the existing xmm1. |
+; In: rsi = source row, rax = source stride, |
+; rdi = reference row, rdx = reference stride. |
+; Out: xmm1 updated; rsi += 2 * rax; rdi += 2 * rdx. |
+; Clobbers: xmm0, xmm2, xmm3, xmm5. Reads 16 bytes per reference row. |
+%macro PROCESS_8X2X8 1 |
+%if %1 |
+ ; first row, sums written straight into the accumulator xmm1 |
+ movq xmm0, MMWORD PTR [rsi] ; xmm0 = src[0..7] |
+ movq xmm1, MMWORD PTR [rdi] ; ref[0..7] |
+ movq xmm3, MMWORD PTR [rdi+8] ; ref[8..15] |
+ punpcklqdq xmm1, xmm3 ; xmm1 = ref[0..15] |
+ |
+ movdqa xmm2, xmm1 |
+ mpsadbw xmm1, xmm0, 0x0 ; src[0..3] vs ref[i..i+3], i = 0..7 |
+ mpsadbw xmm2, xmm0, 0x5 ; src[4..7] vs ref[4+i..7+i] |
+ paddw xmm1, xmm2 |
+%else |
+ ; first row, same steps with xmm5 as the per-row total, then added to xmm1 |
+ movq xmm0, MMWORD PTR [rsi] |
+ movq xmm5, MMWORD PTR [rdi] |
+ movq xmm3, MMWORD PTR [rdi+8] |
+ punpcklqdq xmm5, xmm3 |
+ |
+ movdqa xmm2, xmm5 |
+ mpsadbw xmm5, xmm0, 0x0 |
+ mpsadbw xmm2, xmm0, 0x5 |
+ paddw xmm5, xmm2 |
+ |
+ paddw xmm1, xmm5 |
+%endif |
+ ; second row (one stride further on), always accumulated via xmm5 |
+ movq xmm0, MMWORD PTR [rsi + rax] |
+ movq xmm5, MMWORD PTR [rdi+ rdx] |
+ movq xmm3, MMWORD PTR [rdi+ rdx+8] |
+ punpcklqdq xmm5, xmm3 |
+ |
+ ; both rows are loaded: step the row pointers two rows forward |
+ lea rsi, [rsi+rax*2] |
+ lea rdi, [rdi+rdx*2] |
+ |
+ movdqa xmm2, xmm5 |
+ mpsadbw xmm5, xmm0, 0x0 |
+ mpsadbw xmm2, xmm0, 0x5 |
+ paddw xmm5, xmm2 |
+ |
+ paddw xmm1, xmm5 |
+%endmacro |
+ |
+; PROCESS_4X2X8 first |
+; Handles two 4-byte-wide rows. For each row it computes the SAD of the |
+; 4 source bytes at [rsi] against the eight 4-byte reference windows that |
+; start at [rdi + 0] .. [rdi + 7], as eight 16-bit sums, and accumulates |
+; them into the eight words of xmm1. |
+; %1 != 0: xmm1 is overwritten by the first row's sums (first expansion). |
+; %1 == 0: the first row's sums are added to the existing xmm1. |
+; In: rsi = source row, rax = source stride, |
+; rdi = reference row, rdx = reference stride. |
+; Out: xmm1 updated; rsi += 2 * rax; rdi += 2 * rdx. |
+; Clobbers: xmm0, xmm3, xmm5. Loads 16 bytes per reference row, of which |
+; mpsadbw with imm8 = 0 consumes the first 11. |
+%macro PROCESS_4X2X8 1 |
+%if %1 |
+ ; first row, sums written straight into the accumulator xmm1 |
+ movd xmm0, [rsi] ; xmm0 = src[0..3] |
+ movq xmm1, MMWORD PTR [rdi] ; ref[0..7] |
+ movq xmm3, MMWORD PTR [rdi+8] ; ref[8..15] |
+ punpcklqdq xmm1, xmm3 ; xmm1 = ref[0..15] |
+ |
+ mpsadbw xmm1, xmm0, 0x0 ; src[0..3] vs ref[i..i+3], i = 0..7 |
+%else |
+ ; first row, same steps with xmm5 as the per-row total, then added to xmm1 |
+ movd xmm0, [rsi] |
+ movq xmm5, MMWORD PTR [rdi] |
+ movq xmm3, MMWORD PTR [rdi+8] |
+ punpcklqdq xmm5, xmm3 |
+ |
+ mpsadbw xmm5, xmm0, 0x0 |
+ |
+ paddw xmm1, xmm5 |
+%endif |
+ ; second row (one stride further on), always accumulated via xmm5 |
+ movd xmm0, [rsi + rax] |
+ movq xmm5, MMWORD PTR [rdi+ rdx] |
+ movq xmm3, MMWORD PTR [rdi+ rdx+8] |
+ punpcklqdq xmm5, xmm3 |
+ |
+ ; both rows are loaded: step the row pointers two rows forward |
+ lea rsi, [rsi+rax*2] |
+ lea rdi, [rdi+rdx*2] |
+ |
+ mpsadbw xmm5, xmm0, 0x0 |
+ |
+ paddw xmm1, xmm5 |
+%endmacro |
+ |
+ |
+;void vp9_sad16x16x8_sse4( |
+; const unsigned char *src_ptr, |
+; int src_stride, |
+; const unsigned char *ref_ptr, |
+; int ref_stride, |
+; unsigned short *sad_array); |
+; |
+; Writes eight 16-bit SADs: sad_array[i] is the SAD of the 16x16 block at |
+; src_ptr against the 16x16 block at ref_ptr + i, for i = 0..7. |
+; PROCESS_16X2X8 loads source rows with movdqa and the result is stored with |
+; movdqa, so every source row and sad_array must be 16-byte aligned. |
+global sym(vp9_sad16x16x8_sse4) |
+sym(vp9_sad16x16x8_sse4): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 5 |
+ push rsi |
+ push rdi |
+ ; prolog complete: rsi/rdi are saved and restored around the body |
+ |
+ movsxd rax, dword ptr arg(1) ; rax = src_stride |
+ movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
+ mov rsi, arg(0) ; rsi = src_ptr |
+ mov rdi, arg(2) ; rdi = ref_ptr |
+ |
+ ; 16 rows, two per expansion; the first expansion initialises xmm1 |
+ PROCESS_16X2X8 1 |
+%rep 7 |
+ PROCESS_16X2X8 0 |
+%endrep |
+ |
+ mov rdi, arg(4) ; rdi = sad_array |
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the eight word sums |
+ |
+ ; epilog: undo the prolog in reverse order |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;void vp9_sad16x8x8_sse4( |
+; const unsigned char *src_ptr, |
+; int src_stride, |
+; const unsigned char *ref_ptr, |
+; int ref_stride, |
+; unsigned short *sad_array |
+;); |
+; |
+; Writes eight 16-bit SADs: sad_array[i] is the SAD of the 16-wide, 8-row |
+; block at src_ptr against the same-sized block at ref_ptr + i, i = 0..7. |
+; PROCESS_16X2X8 loads source rows with movdqa and the result is stored with |
+; movdqa, so every source row and sad_array must be 16-byte aligned. |
+global sym(vp9_sad16x8x8_sse4) |
+sym(vp9_sad16x8x8_sse4): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 5 |
+ push rsi |
+ push rdi |
+ ; prolog complete: rsi/rdi are saved and restored around the body |
+ |
+ movsxd rax, dword ptr arg(1) ; rax = src_stride |
+ movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
+ mov rsi, arg(0) ; rsi = src_ptr |
+ mov rdi, arg(2) ; rdi = ref_ptr |
+ |
+ ; 8 rows, two per expansion; the first expansion initialises xmm1 |
+ PROCESS_16X2X8 1 |
+%rep 3 |
+ PROCESS_16X2X8 0 |
+%endrep |
+ |
+ mov rdi, arg(4) ; rdi = sad_array |
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the eight word sums |
+ |
+ ; epilog: undo the prolog in reverse order |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;void vp9_sad8x8x8_sse4( |
+; const unsigned char *src_ptr, |
+; int src_stride, |
+; const unsigned char *ref_ptr, |
+; int ref_stride, |
+; unsigned short *sad_array |
+;); |
+; |
+; Writes eight 16-bit SADs: sad_array[i] is the SAD of the 8x8 block at |
+; src_ptr against the 8x8 block at ref_ptr + i, for i = 0..7. |
+; The result is stored with movdqa, so sad_array must be 16-byte aligned. |
+global sym(vp9_sad8x8x8_sse4) |
+sym(vp9_sad8x8x8_sse4): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 5 |
+ push rsi |
+ push rdi |
+ ; prolog complete: rsi/rdi are saved and restored around the body |
+ |
+ movsxd rax, dword ptr arg(1) ; rax = src_stride |
+ movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
+ mov rsi, arg(0) ; rsi = src_ptr |
+ mov rdi, arg(2) ; rdi = ref_ptr |
+ |
+ ; 8 rows, two per expansion; the first expansion initialises xmm1 |
+ PROCESS_8X2X8 1 |
+%rep 3 |
+ PROCESS_8X2X8 0 |
+%endrep |
+ |
+ mov rdi, arg(4) ; rdi = sad_array |
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the eight word sums |
+ |
+ ; epilog: undo the prolog in reverse order |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;void vp9_sad8x16x8_sse4( |
+; const unsigned char *src_ptr, |
+; int src_stride, |
+; const unsigned char *ref_ptr, |
+; int ref_stride, |
+; unsigned short *sad_array |
+;); |
+; |
+; Writes eight 16-bit SADs: sad_array[i] is the SAD of the 8-wide, 16-row |
+; block at src_ptr against the same-sized block at ref_ptr + i, i = 0..7. |
+; The result is stored with movdqa, so sad_array must be 16-byte aligned. |
+global sym(vp9_sad8x16x8_sse4) |
+sym(vp9_sad8x16x8_sse4): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 5 |
+ push rsi |
+ push rdi |
+ ; prolog complete: rsi/rdi are saved and restored around the body |
+ |
+ movsxd rax, dword ptr arg(1) ; rax = src_stride |
+ movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
+ mov rsi, arg(0) ; rsi = src_ptr |
+ mov rdi, arg(2) ; rdi = ref_ptr |
+ |
+ ; 16 rows, two per expansion; the first expansion initialises xmm1 |
+ PROCESS_8X2X8 1 |
+%rep 7 |
+ PROCESS_8X2X8 0 |
+%endrep |
+ |
+ mov rdi, arg(4) ; rdi = sad_array |
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the eight word sums |
+ |
+ ; epilog: undo the prolog in reverse order |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;void vp9_sad4x4x8_sse4( |
+; const unsigned char *src_ptr, |
+; int src_stride, |
+; const unsigned char *ref_ptr, |
+; int ref_stride, |
+; unsigned short *sad_array |
+;); |
+; |
+; Writes eight 16-bit SADs: sad_array[i] is the SAD of the 4x4 block at |
+; src_ptr against the 4x4 block at ref_ptr + i, for i = 0..7. |
+; The result is stored with movdqa, so sad_array must be 16-byte aligned. |
+global sym(vp9_sad4x4x8_sse4) |
+sym(vp9_sad4x4x8_sse4): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 5 |
+ push rsi |
+ push rdi |
+ ; prolog complete: rsi/rdi are saved and restored around the body |
+ |
+ movsxd rax, dword ptr arg(1) ; rax = src_stride |
+ movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
+ mov rsi, arg(0) ; rsi = src_ptr |
+ mov rdi, arg(2) ; rdi = ref_ptr |
+ |
+ ; 4 rows, two per expansion; the first expansion initialises xmm1 |
+ PROCESS_4X2X8 1 |
+ PROCESS_4X2X8 0 |
+ |
+ mov rdi, arg(4) ; rdi = sad_array |
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the eight word sums |
+ |
+ ; epilog: undo the prolog in reverse order |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+ |
+ |