Index: source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm |
=================================================================== |
--- source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm (revision 0) |
+++ source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm (revision 0) |
@@ -0,0 +1,960 @@ |
+; |
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+%macro STACK_FRAME_CREATE_X3 0 |
+%if ABI_IS_32BIT |
+ %define src_ptr rsi |
+ %define src_stride rax |
+ %define ref_ptr rdi |
+ %define ref_stride rdx |
+ %define end_ptr rcx |
+ %define ret_var rbx |
+ %define result_ptr arg(4) |
+ %define max_err arg(4) |
+ %define height dword ptr arg(4) |
+ push rbp |
+ mov rbp, rsp |
+ push rsi |
+ push rdi |
+ push rbx |
+ |
+ mov rsi, arg(0) ; src_ptr |
+ mov rdi, arg(2) ; ref_ptr |
+ |
+ movsxd rax, dword ptr arg(1) ; src_stride |
+ movsxd rdx, dword ptr arg(3) ; ref_stride |
+%else |
+ %ifidn __OUTPUT_FORMAT__,x64 |
+ SAVE_XMM 7, u |
+ %define src_ptr rcx |
+ %define src_stride rdx |
+ %define ref_ptr r8 |
+ %define ref_stride r9 |
+ %define end_ptr r10 |
+ %define ret_var r11 |
+ %define result_ptr [rsp+xmm_stack_space+8+4*8] |
+ %define max_err [rsp+xmm_stack_space+8+4*8] |
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8] |
+ %else |
+ %define src_ptr rdi |
+ %define src_stride rsi |
+ %define ref_ptr rdx |
+ %define ref_stride rcx |
+ %define end_ptr r9 |
+ %define ret_var r10 |
+ %define result_ptr r8 |
+ %define max_err r8 |
+ %define height r8 |
+ %endif |
+%endif |
+ |
+%endmacro |
+ |
+%macro STACK_FRAME_DESTROY_X3 0 |
+ %define src_ptr |
+ %define src_stride |
+ %define ref_ptr |
+ %define ref_stride |
+ %define end_ptr |
+ %define ret_var |
+ %define result_ptr |
+ %define max_err |
+ %define height |
+ |
+%if ABI_IS_32BIT |
+ pop rbx |
+ pop rdi |
+ pop rsi |
+ pop rbp |
+%else |
+ %ifidn __OUTPUT_FORMAT__,x64 |
+ RESTORE_XMM |
+ %endif |
+%endif |
+ ret |
+%endmacro |
+ |
+%macro STACK_FRAME_CREATE_X4 0 |
+%if ABI_IS_32BIT |
+ %define src_ptr rsi |
+ %define src_stride rax |
+ %define r0_ptr rcx |
+ %define r1_ptr rdx |
+ %define r2_ptr rbx |
+ %define r3_ptr rdi |
+ %define ref_stride rbp |
+ %define result_ptr arg(4) |
+ push rbp |
+ mov rbp, rsp |
+ push rsi |
+ push rdi |
+ push rbx |
+ |
+ push rbp |
+ mov rdi, arg(2) ; ref_ptr_base |
+ |
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi |
+ |
+ mov rsi, arg(0) ; src_ptr |
+ |
+ movsxd rbx, dword ptr arg(1) ; src_stride |
+ movsxd rbp, dword ptr arg(3) ; ref_stride |
+ |
+ xchg rbx, rax |
+%else |
+ %ifidn __OUTPUT_FORMAT__,x64 |
+ SAVE_XMM 7, u |
+ %define src_ptr rcx |
+ %define src_stride rdx |
+ %define r0_ptr rsi |
+ %define r1_ptr r10 |
+ %define r2_ptr r11 |
+ %define r3_ptr r8 |
+ %define ref_stride r9 |
+ %define result_ptr [rsp+xmm_stack_space+16+4*8] |
+ push rsi |
+ |
+ LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr |
+ %else |
+ %define src_ptr rdi |
+ %define src_stride rsi |
+ %define r0_ptr r9 |
+ %define r1_ptr r10 |
+ %define r2_ptr r11 |
+ %define r3_ptr rdx |
+ %define ref_stride rcx |
+ %define result_ptr r8 |
+ |
+ LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr |
+ |
+ %endif |
+%endif |
+%endmacro |
+ |
+%macro STACK_FRAME_DESTROY_X4 0 |
+ %define src_ptr |
+ %define src_stride |
+ %define r0_ptr |
+ %define r1_ptr |
+ %define r2_ptr |
+ %define r3_ptr |
+ %define ref_stride |
+ %define result_ptr |
+ |
+%if ABI_IS_32BIT |
+ pop rbx |
+ pop rdi |
+ pop rsi |
+ pop rbp |
+%else |
+ %ifidn __OUTPUT_FORMAT__,x64 |
+ pop rsi |
+ RESTORE_XMM |
+ %endif |
+%endif |
+ ret |
+%endmacro |
+ |
+%macro PROCESS_16X2X3 5 |
+%if %1==0 |
+ movdqa xmm0, XMMWORD PTR [%2] |
+ lddqu xmm5, XMMWORD PTR [%3] |
+ lddqu xmm6, XMMWORD PTR [%3+1] |
+ lddqu xmm7, XMMWORD PTR [%3+2] |
+ |
+ psadbw xmm5, xmm0 |
+ psadbw xmm6, xmm0 |
+ psadbw xmm7, xmm0 |
+%else |
+ movdqa xmm0, XMMWORD PTR [%2] |
+ lddqu xmm1, XMMWORD PTR [%3] |
+ lddqu xmm2, XMMWORD PTR [%3+1] |
+ lddqu xmm3, XMMWORD PTR [%3+2] |
+ |
+ psadbw xmm1, xmm0 |
+ psadbw xmm2, xmm0 |
+ psadbw xmm3, xmm0 |
+ |
+ paddw xmm5, xmm1 |
+ paddw xmm6, xmm2 |
+ paddw xmm7, xmm3 |
+%endif |
+ movdqa xmm0, XMMWORD PTR [%2+%4] |
+ lddqu xmm1, XMMWORD PTR [%3+%5] |
+ lddqu xmm2, XMMWORD PTR [%3+%5+1] |
+ lddqu xmm3, XMMWORD PTR [%3+%5+2] |
+ |
+%if %1==0 || %1==1 |
+ lea %2, [%2+%4*2] |
+ lea %3, [%3+%5*2] |
+%endif |
+ |
+ psadbw xmm1, xmm0 |
+ psadbw xmm2, xmm0 |
+ psadbw xmm3, xmm0 |
+ |
+ paddw xmm5, xmm1 |
+ paddw xmm6, xmm2 |
+ paddw xmm7, xmm3 |
+%endmacro |
+ |
+%macro PROCESS_8X2X3 5 |
+%if %1==0 |
+ movq mm0, QWORD PTR [%2] |
+ movq mm5, QWORD PTR [%3] |
+ movq mm6, QWORD PTR [%3+1] |
+ movq mm7, QWORD PTR [%3+2] |
+ |
+ psadbw mm5, mm0 |
+ psadbw mm6, mm0 |
+ psadbw mm7, mm0 |
+%else |
+ movq mm0, QWORD PTR [%2] |
+ movq mm1, QWORD PTR [%3] |
+ movq mm2, QWORD PTR [%3+1] |
+ movq mm3, QWORD PTR [%3+2] |
+ |
+ psadbw mm1, mm0 |
+ psadbw mm2, mm0 |
+ psadbw mm3, mm0 |
+ |
+ paddw mm5, mm1 |
+ paddw mm6, mm2 |
+ paddw mm7, mm3 |
+%endif |
+ movq mm0, QWORD PTR [%2+%4] |
+ movq mm1, QWORD PTR [%3+%5] |
+ movq mm2, QWORD PTR [%3+%5+1] |
+ movq mm3, QWORD PTR [%3+%5+2] |
+ |
+%if %1==0 || %1==1 |
+ lea %2, [%2+%4*2] |
+ lea %3, [%3+%5*2] |
+%endif |
+ |
+ psadbw mm1, mm0 |
+ psadbw mm2, mm0 |
+ psadbw mm3, mm0 |
+ |
+ paddw mm5, mm1 |
+ paddw mm6, mm2 |
+ paddw mm7, mm3 |
+%endmacro |
+ |
+%macro LOAD_X4_ADDRESSES 5 |
+ mov %2, [%1+REG_SZ_BYTES*0] |
+ mov %3, [%1+REG_SZ_BYTES*1] |
+ |
+ mov %4, [%1+REG_SZ_BYTES*2] |
+ mov %5, [%1+REG_SZ_BYTES*3] |
+%endmacro |
+ |
+%macro PROCESS_16X2X4 8 |
+%if %1==0 |
+ movdqa xmm0, XMMWORD PTR [%2] |
+ lddqu xmm4, XMMWORD PTR [%3] |
+ lddqu xmm5, XMMWORD PTR [%4] |
+ lddqu xmm6, XMMWORD PTR [%5] |
+ lddqu xmm7, XMMWORD PTR [%6] |
+ |
+ psadbw xmm4, xmm0 |
+ psadbw xmm5, xmm0 |
+ psadbw xmm6, xmm0 |
+ psadbw xmm7, xmm0 |
+%else |
+ movdqa xmm0, XMMWORD PTR [%2] |
+ lddqu xmm1, XMMWORD PTR [%3] |
+ lddqu xmm2, XMMWORD PTR [%4] |
+ lddqu xmm3, XMMWORD PTR [%5] |
+ |
+ psadbw xmm1, xmm0 |
+ psadbw xmm2, xmm0 |
+ psadbw xmm3, xmm0 |
+ |
+ paddw xmm4, xmm1 |
+ lddqu xmm1, XMMWORD PTR [%6] |
+ paddw xmm5, xmm2 |
+ paddw xmm6, xmm3 |
+ |
+ psadbw xmm1, xmm0 |
+ paddw xmm7, xmm1 |
+%endif |
+ movdqa xmm0, XMMWORD PTR [%2+%7] |
+ lddqu xmm1, XMMWORD PTR [%3+%8] |
+ lddqu xmm2, XMMWORD PTR [%4+%8] |
+ lddqu xmm3, XMMWORD PTR [%5+%8] |
+ |
+ psadbw xmm1, xmm0 |
+ psadbw xmm2, xmm0 |
+ psadbw xmm3, xmm0 |
+ |
+ paddw xmm4, xmm1 |
+ lddqu xmm1, XMMWORD PTR [%6+%8] |
+ paddw xmm5, xmm2 |
+ paddw xmm6, xmm3 |
+ |
+%if %1==0 || %1==1 |
+ lea %2, [%2+%7*2] |
+ lea %3, [%3+%8*2] |
+ |
+ lea %4, [%4+%8*2] |
+ lea %5, [%5+%8*2] |
+ |
+ lea %6, [%6+%8*2] |
+%endif |
+ psadbw xmm1, xmm0 |
+ paddw xmm7, xmm1 |
+ |
+%endmacro |
+ |
+%macro PROCESS_8X2X4 8 |
+%if %1==0 |
+ movq mm0, QWORD PTR [%2] |
+ movq mm4, QWORD PTR [%3] |
+ movq mm5, QWORD PTR [%4] |
+ movq mm6, QWORD PTR [%5] |
+ movq mm7, QWORD PTR [%6] |
+ |
+ psadbw mm4, mm0 |
+ psadbw mm5, mm0 |
+ psadbw mm6, mm0 |
+ psadbw mm7, mm0 |
+%else |
+ movq mm0, QWORD PTR [%2] |
+ movq mm1, QWORD PTR [%3] |
+ movq mm2, QWORD PTR [%4] |
+ movq mm3, QWORD PTR [%5] |
+ |
+ psadbw mm1, mm0 |
+ psadbw mm2, mm0 |
+ psadbw mm3, mm0 |
+ |
+ paddw mm4, mm1 |
+ movq mm1, QWORD PTR [%6] |
+ paddw mm5, mm2 |
+ paddw mm6, mm3 |
+ |
+ psadbw mm1, mm0 |
+ paddw mm7, mm1 |
+%endif |
+ movq mm0, QWORD PTR [%2+%7] |
+ movq mm1, QWORD PTR [%3+%8] |
+ movq mm2, QWORD PTR [%4+%8] |
+ movq mm3, QWORD PTR [%5+%8] |
+ |
+ psadbw mm1, mm0 |
+ psadbw mm2, mm0 |
+ psadbw mm3, mm0 |
+ |
+ paddw mm4, mm1 |
+ movq mm1, QWORD PTR [%6+%8] |
+ paddw mm5, mm2 |
+ paddw mm6, mm3 |
+ |
+%if %1==0 || %1==1 |
+ lea %2, [%2+%7*2] |
+ lea %3, [%3+%8*2] |
+ |
+ lea %4, [%4+%8*2] |
+ lea %5, [%5+%8*2] |
+ |
+ lea %6, [%6+%8*2] |
+%endif |
+ psadbw mm1, mm0 |
+ paddw mm7, mm1 |
+ |
+%endmacro |
+ |
+;void int vp9_sad16x16x3_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad16x16x3_sse3) |
+sym(vp9_sad16x16x3_sse3): |
+ |
+ STACK_FRAME_CREATE_X3 |
+ |
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
+ |
+ mov rcx, result_ptr |
+ |
+ movq xmm0, xmm5 |
+ psrldq xmm5, 8 |
+ |
+ paddw xmm0, xmm5 |
+ movd [rcx], xmm0 |
+;- |
+ movq xmm0, xmm6 |
+ psrldq xmm6, 8 |
+ |
+ paddw xmm0, xmm6 |
+ movd [rcx+4], xmm0 |
+;- |
+ movq xmm0, xmm7 |
+ psrldq xmm7, 8 |
+ |
+ paddw xmm0, xmm7 |
+ movd [rcx+8], xmm0 |
+ |
+ STACK_FRAME_DESTROY_X3 |
+ |
+;void int vp9_sad16x8x3_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad16x8x3_sse3) |
+sym(vp9_sad16x8x3_sse3): |
+ |
+ STACK_FRAME_CREATE_X3 |
+ |
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
+ |
+ mov rcx, result_ptr |
+ |
+ movq xmm0, xmm5 |
+ psrldq xmm5, 8 |
+ |
+ paddw xmm0, xmm5 |
+ movd [rcx], xmm0 |
+;- |
+ movq xmm0, xmm6 |
+ psrldq xmm6, 8 |
+ |
+ paddw xmm0, xmm6 |
+ movd [rcx+4], xmm0 |
+;- |
+ movq xmm0, xmm7 |
+ psrldq xmm7, 8 |
+ |
+ paddw xmm0, xmm7 |
+ movd [rcx+8], xmm0 |
+ |
+ STACK_FRAME_DESTROY_X3 |
+ |
+;void int vp9_sad8x16x3_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad8x16x3_sse3) |
+sym(vp9_sad8x16x3_sse3): |
+ |
+ STACK_FRAME_CREATE_X3 |
+ |
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
+ |
+ mov rcx, result_ptr |
+ |
+ punpckldq mm5, mm6 |
+ |
+ movq [rcx], mm5 |
+ movd [rcx+8], mm7 |
+ |
+ STACK_FRAME_DESTROY_X3 |
+ |
+;void int vp9_sad8x8x3_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad8x8x3_sse3) |
+sym(vp9_sad8x8x3_sse3): |
+ |
+ STACK_FRAME_CREATE_X3 |
+ |
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
+ |
+ mov rcx, result_ptr |
+ |
+ punpckldq mm5, mm6 |
+ |
+ movq [rcx], mm5 |
+ movd [rcx+8], mm7 |
+ |
+ STACK_FRAME_DESTROY_X3 |
+ |
+;void int vp9_sad4x4x3_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad4x4x3_sse3) |
+sym(vp9_sad4x4x3_sse3): |
+ |
+ STACK_FRAME_CREATE_X3 |
+ |
+ movd mm0, DWORD PTR [src_ptr] |
+ movd mm1, DWORD PTR [ref_ptr] |
+ |
+ movd mm2, DWORD PTR [src_ptr+src_stride] |
+ movd mm3, DWORD PTR [ref_ptr+ref_stride] |
+ |
+ punpcklbw mm0, mm2 |
+ punpcklbw mm1, mm3 |
+ |
+ movd mm4, DWORD PTR [ref_ptr+1] |
+ movd mm5, DWORD PTR [ref_ptr+2] |
+ |
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1] |
+ movd mm3, DWORD PTR [ref_ptr+ref_stride+2] |
+ |
+ psadbw mm1, mm0 |
+ |
+ punpcklbw mm4, mm2 |
+ punpcklbw mm5, mm3 |
+ |
+ psadbw mm4, mm0 |
+ psadbw mm5, mm0 |
+ |
+ lea src_ptr, [src_ptr+src_stride*2] |
+ lea ref_ptr, [ref_ptr+ref_stride*2] |
+ |
+ movd mm0, DWORD PTR [src_ptr] |
+ movd mm2, DWORD PTR [ref_ptr] |
+ |
+ movd mm3, DWORD PTR [src_ptr+src_stride] |
+ movd mm6, DWORD PTR [ref_ptr+ref_stride] |
+ |
+ punpcklbw mm0, mm3 |
+ punpcklbw mm2, mm6 |
+ |
+ movd mm3, DWORD PTR [ref_ptr+1] |
+ movd mm7, DWORD PTR [ref_ptr+2] |
+ |
+ psadbw mm2, mm0 |
+ |
+ paddw mm1, mm2 |
+ |
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1] |
+ movd mm6, DWORD PTR [ref_ptr+ref_stride+2] |
+ |
+ punpcklbw mm3, mm2 |
+ punpcklbw mm7, mm6 |
+ |
+ psadbw mm3, mm0 |
+ psadbw mm7, mm0 |
+ |
+ paddw mm3, mm4 |
+ paddw mm7, mm5 |
+ |
+ mov rcx, result_ptr |
+ |
+ punpckldq mm1, mm3 |
+ |
+ movq [rcx], mm1 |
+ movd [rcx+8], mm7 |
+ |
+ STACK_FRAME_DESTROY_X3 |
+ |
+;unsigned int vp9_sad16x16_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int max_err) |
+;%define lddqu movdqu |
+global sym(vp9_sad16x16_sse3) |
+sym(vp9_sad16x16_sse3): |
+ |
+ STACK_FRAME_CREATE_X3 |
+ |
+ mov end_ptr, 4 |
+ pxor xmm7, xmm7 |
+ |
+.vp9_sad16x16_sse3_loop: |
+ movdqa xmm0, XMMWORD PTR [src_ptr] |
+ movdqu xmm1, XMMWORD PTR [ref_ptr] |
+ movdqa xmm2, XMMWORD PTR [src_ptr+src_stride] |
+ movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride] |
+ |
+ lea src_ptr, [src_ptr+src_stride*2] |
+ lea ref_ptr, [ref_ptr+ref_stride*2] |
+ |
+ movdqa xmm4, XMMWORD PTR [src_ptr] |
+ movdqu xmm5, XMMWORD PTR [ref_ptr] |
+ movdqa xmm6, XMMWORD PTR [src_ptr+src_stride] |
+ |
+ psadbw xmm0, xmm1 |
+ |
+ movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride] |
+ |
+ psadbw xmm2, xmm3 |
+ psadbw xmm4, xmm5 |
+ psadbw xmm6, xmm1 |
+ |
+ lea src_ptr, [src_ptr+src_stride*2] |
+ lea ref_ptr, [ref_ptr+ref_stride*2] |
+ |
+ paddw xmm7, xmm0 |
+ paddw xmm7, xmm2 |
+ paddw xmm7, xmm4 |
+ paddw xmm7, xmm6 |
+ |
+ sub end_ptr, 1 |
+ jne .vp9_sad16x16_sse3_loop |
+ |
+ movq xmm0, xmm7 |
+ psrldq xmm7, 8 |
+ paddw xmm0, xmm7 |
+ movq rax, xmm0 |
+ |
+ STACK_FRAME_DESTROY_X3 |
+ |
+;void vp9_copy32xn_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *dst_ptr, |
+; int dst_stride, |
+; int height); |
+global sym(vp9_copy32xn_sse3) |
+sym(vp9_copy32xn_sse3): |
+ |
+ STACK_FRAME_CREATE_X3 |
+ |
+.block_copy_sse3_loopx4: |
+ lea end_ptr, [src_ptr+src_stride*2] |
+ |
+ movdqu xmm0, XMMWORD PTR [src_ptr] |
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16] |
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] |
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] |
+ movdqu xmm4, XMMWORD PTR [end_ptr] |
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16] |
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] |
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] |
+ |
+ lea src_ptr, [src_ptr+src_stride*4] |
+ |
+ lea end_ptr, [ref_ptr+ref_stride*2] |
+ |
+ movdqa XMMWORD PTR [ref_ptr], xmm0 |
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1 |
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 |
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 |
+ movdqa XMMWORD PTR [end_ptr], xmm4 |
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5 |
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 |
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 |
+ |
+ lea ref_ptr, [ref_ptr+ref_stride*4] |
+ |
+ sub height, 4 |
+ cmp height, 4 |
+ jge .block_copy_sse3_loopx4 |
+ |
+ ;Check to see if there is more rows need to be copied. |
+ cmp height, 0 |
+ je .copy_is_done |
+ |
+.block_copy_sse3_loop: |
+ movdqu xmm0, XMMWORD PTR [src_ptr] |
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16] |
+ lea src_ptr, [src_ptr+src_stride] |
+ |
+ movdqa XMMWORD PTR [ref_ptr], xmm0 |
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1 |
+ lea ref_ptr, [ref_ptr+ref_stride] |
+ |
+ sub height, 1 |
+ jne .block_copy_sse3_loop |
+ |
+.copy_is_done: |
+ STACK_FRAME_DESTROY_X3 |
+ |
+;void vp9_sad16x16x4d_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr_base, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad16x16x4d_sse3) |
+sym(vp9_sad16x16x4d_sse3): |
+ |
+ STACK_FRAME_CREATE_X4 |
+ |
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ |
+%if ABI_IS_32BIT |
+ pop rbp |
+%endif |
+ mov rcx, result_ptr |
+ |
+ movq xmm0, xmm4 |
+ psrldq xmm4, 8 |
+ |
+ paddw xmm0, xmm4 |
+ movd [rcx], xmm0 |
+;- |
+ movq xmm0, xmm5 |
+ psrldq xmm5, 8 |
+ |
+ paddw xmm0, xmm5 |
+ movd [rcx+4], xmm0 |
+;- |
+ movq xmm0, xmm6 |
+ psrldq xmm6, 8 |
+ |
+ paddw xmm0, xmm6 |
+ movd [rcx+8], xmm0 |
+;- |
+ movq xmm0, xmm7 |
+ psrldq xmm7, 8 |
+ |
+ paddw xmm0, xmm7 |
+ movd [rcx+12], xmm0 |
+ |
+ STACK_FRAME_DESTROY_X4 |
+ |
+;void vp9_sad16x8x4d_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr_base, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad16x8x4d_sse3) |
+sym(vp9_sad16x8x4d_sse3): |
+ |
+ STACK_FRAME_CREATE_X4 |
+ |
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ |
+%if ABI_IS_32BIT |
+ pop rbp |
+%endif |
+ mov rcx, result_ptr |
+ |
+ movq xmm0, xmm4 |
+ psrldq xmm4, 8 |
+ |
+ paddw xmm0, xmm4 |
+ movd [rcx], xmm0 |
+;- |
+ movq xmm0, xmm5 |
+ psrldq xmm5, 8 |
+ |
+ paddw xmm0, xmm5 |
+ movd [rcx+4], xmm0 |
+;- |
+ movq xmm0, xmm6 |
+ psrldq xmm6, 8 |
+ |
+ paddw xmm0, xmm6 |
+ movd [rcx+8], xmm0 |
+;- |
+ movq xmm0, xmm7 |
+ psrldq xmm7, 8 |
+ |
+ paddw xmm0, xmm7 |
+ movd [rcx+12], xmm0 |
+ |
+ STACK_FRAME_DESTROY_X4 |
+ |
+;void int vp9_sad8x16x4d_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad8x16x4d_sse3) |
+sym(vp9_sad8x16x4d_sse3): |
+ |
+ STACK_FRAME_CREATE_X4 |
+ |
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ |
+%if ABI_IS_32BIT |
+ pop rbp |
+%endif |
+ mov rcx, result_ptr |
+ |
+ punpckldq mm4, mm5 |
+ punpckldq mm6, mm7 |
+ |
+ movq [rcx], mm4 |
+ movq [rcx+8], mm6 |
+ |
+ STACK_FRAME_DESTROY_X4 |
+ |
+;void int vp9_sad8x8x4d_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad8x8x4d_sse3) |
+sym(vp9_sad8x8x4d_sse3): |
+ |
+ STACK_FRAME_CREATE_X4 |
+ |
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
+ |
+%if ABI_IS_32BIT |
+ pop rbp |
+%endif |
+ mov rcx, result_ptr |
+ |
+ punpckldq mm4, mm5 |
+ punpckldq mm6, mm7 |
+ |
+ movq [rcx], mm4 |
+ movq [rcx+8], mm6 |
+ |
+ STACK_FRAME_DESTROY_X4 |
+ |
+;void int vp9_sad4x4x4d_sse3( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; int *results) |
+global sym(vp9_sad4x4x4d_sse3) |
+sym(vp9_sad4x4x4d_sse3): |
+ |
+ STACK_FRAME_CREATE_X4 |
+ |
+ movd mm0, DWORD PTR [src_ptr] |
+ movd mm1, DWORD PTR [r0_ptr] |
+ |
+ movd mm2, DWORD PTR [src_ptr+src_stride] |
+ movd mm3, DWORD PTR [r0_ptr+ref_stride] |
+ |
+ punpcklbw mm0, mm2 |
+ punpcklbw mm1, mm3 |
+ |
+ movd mm4, DWORD PTR [r1_ptr] |
+ movd mm5, DWORD PTR [r2_ptr] |
+ |
+ movd mm6, DWORD PTR [r3_ptr] |
+ movd mm2, DWORD PTR [r1_ptr+ref_stride] |
+ |
+ movd mm3, DWORD PTR [r2_ptr+ref_stride] |
+ movd mm7, DWORD PTR [r3_ptr+ref_stride] |
+ |
+ psadbw mm1, mm0 |
+ |
+ punpcklbw mm4, mm2 |
+ punpcklbw mm5, mm3 |
+ |
+ punpcklbw mm6, mm7 |
+ psadbw mm4, mm0 |
+ |
+ psadbw mm5, mm0 |
+ psadbw mm6, mm0 |
+ |
+ |
+ |
+ lea src_ptr, [src_ptr+src_stride*2] |
+ lea r0_ptr, [r0_ptr+ref_stride*2] |
+ |
+ lea r1_ptr, [r1_ptr+ref_stride*2] |
+ lea r2_ptr, [r2_ptr+ref_stride*2] |
+ |
+ lea r3_ptr, [r3_ptr+ref_stride*2] |
+ |
+ movd mm0, DWORD PTR [src_ptr] |
+ movd mm2, DWORD PTR [r0_ptr] |
+ |
+ movd mm3, DWORD PTR [src_ptr+src_stride] |
+ movd mm7, DWORD PTR [r0_ptr+ref_stride] |
+ |
+ punpcklbw mm0, mm3 |
+ punpcklbw mm2, mm7 |
+ |
+ movd mm3, DWORD PTR [r1_ptr] |
+ movd mm7, DWORD PTR [r2_ptr] |
+ |
+ psadbw mm2, mm0 |
+%if ABI_IS_32BIT |
+ mov rax, rbp |
+ |
+ pop rbp |
+%define ref_stride rax |
+%endif |
+ mov rsi, result_ptr |
+ |
+ paddw mm1, mm2 |
+ movd [rsi], mm1 |
+ |
+ movd mm2, DWORD PTR [r1_ptr+ref_stride] |
+ movd mm1, DWORD PTR [r2_ptr+ref_stride] |
+ |
+ punpcklbw mm3, mm2 |
+ punpcklbw mm7, mm1 |
+ |
+ psadbw mm3, mm0 |
+ psadbw mm7, mm0 |
+ |
+ movd mm2, DWORD PTR [r3_ptr] |
+ movd mm1, DWORD PTR [r3_ptr+ref_stride] |
+ |
+ paddw mm3, mm4 |
+ paddw mm7, mm5 |
+ |
+ movd [rsi+4], mm3 |
+ punpcklbw mm2, mm1 |
+ |
+ movd [rsi+8], mm7 |
+ psadbw mm2, mm0 |
+ |
+ paddw mm2, mm6 |
+ movd [rsi+12], mm2 |
+ |
+ |
+ STACK_FRAME_DESTROY_X4 |
+ |