| Index: source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm
|
| ===================================================================
|
| --- source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm (revision 0)
|
| +++ source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm (revision 0)
|
| @@ -0,0 +1,960 @@
|
| +;
|
| +; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
| +;
|
| +; Use of this source code is governed by a BSD-style license
|
| +; that can be found in the LICENSE file in the root of the source
|
| +; tree. An additional intellectual property rights grant can be found
|
| +; in the file PATENTS. All contributing project authors may
|
| +; be found in the AUTHORS file in the root of the source tree.
|
| +;
|
| +
|
| +%include "vpx_ports/x86_abi_support.asm"
|
| +
|
| +%macro STACK_FRAME_CREATE_X3 0
|
| +%if ABI_IS_32BIT
|
| + %define src_ptr rsi
|
| + %define src_stride rax
|
| + %define ref_ptr rdi
|
| + %define ref_stride rdx
|
| + %define end_ptr rcx
|
| + %define ret_var rbx
|
| + %define result_ptr arg(4)
|
| + %define max_err arg(4)
|
| + %define height dword ptr arg(4)
|
| + push rbp
|
| + mov rbp, rsp
|
| + push rsi
|
| + push rdi
|
| + push rbx
|
| +
|
| + mov rsi, arg(0) ; src_ptr
|
| + mov rdi, arg(2) ; ref_ptr
|
| +
|
| + movsxd rax, dword ptr arg(1) ; src_stride
|
| + movsxd rdx, dword ptr arg(3) ; ref_stride
|
| +%else
|
| + %ifidn __OUTPUT_FORMAT__,x64
|
| + SAVE_XMM 7, u
|
| + %define src_ptr rcx
|
| + %define src_stride rdx
|
| + %define ref_ptr r8
|
| + %define ref_stride r9
|
| + %define end_ptr r10
|
| + %define ret_var r11
|
| + %define result_ptr [rsp+xmm_stack_space+8+4*8]
|
| + %define max_err [rsp+xmm_stack_space+8+4*8]
|
| + %define height dword ptr [rsp+xmm_stack_space+8+4*8]
|
| + %else
|
| + %define src_ptr rdi
|
| + %define src_stride rsi
|
| + %define ref_ptr rdx
|
| + %define ref_stride rcx
|
| + %define end_ptr r9
|
| + %define ret_var r10
|
| + %define result_ptr r8
|
| + %define max_err r8
|
| + %define height r8
|
| + %endif
|
| +%endif
|
| +
|
| +%endmacro
|
| +
|
| +%macro STACK_FRAME_DESTROY_X3 0
|
| + %define src_ptr
|
| + %define src_stride
|
| + %define ref_ptr
|
| + %define ref_stride
|
| + %define end_ptr
|
| + %define ret_var
|
| + %define result_ptr
|
| + %define max_err
|
| + %define height
|
| +
|
| +%if ABI_IS_32BIT
|
| + pop rbx
|
| + pop rdi
|
| + pop rsi
|
| + pop rbp
|
| +%else
|
| + %ifidn __OUTPUT_FORMAT__,x64
|
| + RESTORE_XMM
|
| + %endif
|
| +%endif
|
| + ret
|
| +%endmacro
|
| +
|
| +%macro STACK_FRAME_CREATE_X4 0
|
| +%if ABI_IS_32BIT
|
| + %define src_ptr rsi
|
| + %define src_stride rax
|
| + %define r0_ptr rcx
|
| + %define r1_ptr rdx
|
| + %define r2_ptr rbx
|
| + %define r3_ptr rdi
|
| + %define ref_stride rbp
|
| + %define result_ptr arg(4)
|
| + push rbp
|
| + mov rbp, rsp
|
| + push rsi
|
| + push rdi
|
| + push rbx
|
| +
|
| + push rbp
|
| + mov rdi, arg(2) ; ref_ptr_base
|
| +
|
| + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
|
| +
|
| + mov rsi, arg(0) ; src_ptr
|
| +
|
| + movsxd rbx, dword ptr arg(1) ; src_stride
|
| + movsxd rbp, dword ptr arg(3) ; ref_stride
|
| +
|
| + xchg rbx, rax
|
| +%else
|
| + %ifidn __OUTPUT_FORMAT__,x64
|
| + SAVE_XMM 7, u
|
| + %define src_ptr rcx
|
| + %define src_stride rdx
|
| + %define r0_ptr rsi
|
| + %define r1_ptr r10
|
| + %define r2_ptr r11
|
| + %define r3_ptr r8
|
| + %define ref_stride r9
|
| + %define result_ptr [rsp+xmm_stack_space+16+4*8]
|
| + push rsi
|
| +
|
| + LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
|
| + %else
|
| + %define src_ptr rdi
|
| + %define src_stride rsi
|
| + %define r0_ptr r9
|
| + %define r1_ptr r10
|
| + %define r2_ptr r11
|
| + %define r3_ptr rdx
|
| + %define ref_stride rcx
|
| + %define result_ptr r8
|
| +
|
| + LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
|
| +
|
| + %endif
|
| +%endif
|
| +%endmacro
|
| +
|
| +%macro STACK_FRAME_DESTROY_X4 0
|
| + %define src_ptr
|
| + %define src_stride
|
| + %define r0_ptr
|
| + %define r1_ptr
|
| + %define r2_ptr
|
| + %define r3_ptr
|
| + %define ref_stride
|
| + %define result_ptr
|
| +
|
| +%if ABI_IS_32BIT
|
| + pop rbx
|
| + pop rdi
|
| + pop rsi
|
| + pop rbp
|
| +%else
|
| + %ifidn __OUTPUT_FORMAT__,x64
|
| + pop rsi
|
| + RESTORE_XMM
|
| + %endif
|
| +%endif
|
| + ret
|
| +%endmacro
|
| +
|
| +%macro PROCESS_16X2X3 5
|
| +%if %1==0
|
| + movdqa xmm0, XMMWORD PTR [%2]
|
| + lddqu xmm5, XMMWORD PTR [%3]
|
| + lddqu xmm6, XMMWORD PTR [%3+1]
|
| + lddqu xmm7, XMMWORD PTR [%3+2]
|
| +
|
| + psadbw xmm5, xmm0
|
| + psadbw xmm6, xmm0
|
| + psadbw xmm7, xmm0
|
| +%else
|
| + movdqa xmm0, XMMWORD PTR [%2]
|
| + lddqu xmm1, XMMWORD PTR [%3]
|
| + lddqu xmm2, XMMWORD PTR [%3+1]
|
| + lddqu xmm3, XMMWORD PTR [%3+2]
|
| +
|
| + psadbw xmm1, xmm0
|
| + psadbw xmm2, xmm0
|
| + psadbw xmm3, xmm0
|
| +
|
| + paddw xmm5, xmm1
|
| + paddw xmm6, xmm2
|
| + paddw xmm7, xmm3
|
| +%endif
|
| + movdqa xmm0, XMMWORD PTR [%2+%4]
|
| + lddqu xmm1, XMMWORD PTR [%3+%5]
|
| + lddqu xmm2, XMMWORD PTR [%3+%5+1]
|
| + lddqu xmm3, XMMWORD PTR [%3+%5+2]
|
| +
|
| +%if %1==0 || %1==1
|
| + lea %2, [%2+%4*2]
|
| + lea %3, [%3+%5*2]
|
| +%endif
|
| +
|
| + psadbw xmm1, xmm0
|
| + psadbw xmm2, xmm0
|
| + psadbw xmm3, xmm0
|
| +
|
| + paddw xmm5, xmm1
|
| + paddw xmm6, xmm2
|
| + paddw xmm7, xmm3
|
| +%endmacro
|
| +
|
| +%macro PROCESS_8X2X3 5
|
| +%if %1==0
|
| + movq mm0, QWORD PTR [%2]
|
| + movq mm5, QWORD PTR [%3]
|
| + movq mm6, QWORD PTR [%3+1]
|
| + movq mm7, QWORD PTR [%3+2]
|
| +
|
| + psadbw mm5, mm0
|
| + psadbw mm6, mm0
|
| + psadbw mm7, mm0
|
| +%else
|
| + movq mm0, QWORD PTR [%2]
|
| + movq mm1, QWORD PTR [%3]
|
| + movq mm2, QWORD PTR [%3+1]
|
| + movq mm3, QWORD PTR [%3+2]
|
| +
|
| + psadbw mm1, mm0
|
| + psadbw mm2, mm0
|
| + psadbw mm3, mm0
|
| +
|
| + paddw mm5, mm1
|
| + paddw mm6, mm2
|
| + paddw mm7, mm3
|
| +%endif
|
| + movq mm0, QWORD PTR [%2+%4]
|
| + movq mm1, QWORD PTR [%3+%5]
|
| + movq mm2, QWORD PTR [%3+%5+1]
|
| + movq mm3, QWORD PTR [%3+%5+2]
|
| +
|
| +%if %1==0 || %1==1
|
| + lea %2, [%2+%4*2]
|
| + lea %3, [%3+%5*2]
|
| +%endif
|
| +
|
| + psadbw mm1, mm0
|
| + psadbw mm2, mm0
|
| + psadbw mm3, mm0
|
| +
|
| + paddw mm5, mm1
|
| + paddw mm6, mm2
|
| + paddw mm7, mm3
|
| +%endmacro
|
| +
|
| +%macro LOAD_X4_ADDRESSES 5
|
| + mov %2, [%1+REG_SZ_BYTES*0]
|
| + mov %3, [%1+REG_SZ_BYTES*1]
|
| +
|
| + mov %4, [%1+REG_SZ_BYTES*2]
|
| + mov %5, [%1+REG_SZ_BYTES*3]
|
| +%endmacro
|
| +
|
| +%macro PROCESS_16X2X4 8
|
| +%if %1==0
|
| + movdqa xmm0, XMMWORD PTR [%2]
|
| + lddqu xmm4, XMMWORD PTR [%3]
|
| + lddqu xmm5, XMMWORD PTR [%4]
|
| + lddqu xmm6, XMMWORD PTR [%5]
|
| + lddqu xmm7, XMMWORD PTR [%6]
|
| +
|
| + psadbw xmm4, xmm0
|
| + psadbw xmm5, xmm0
|
| + psadbw xmm6, xmm0
|
| + psadbw xmm7, xmm0
|
| +%else
|
| + movdqa xmm0, XMMWORD PTR [%2]
|
| + lddqu xmm1, XMMWORD PTR [%3]
|
| + lddqu xmm2, XMMWORD PTR [%4]
|
| + lddqu xmm3, XMMWORD PTR [%5]
|
| +
|
| + psadbw xmm1, xmm0
|
| + psadbw xmm2, xmm0
|
| + psadbw xmm3, xmm0
|
| +
|
| + paddw xmm4, xmm1
|
| + lddqu xmm1, XMMWORD PTR [%6]
|
| + paddw xmm5, xmm2
|
| + paddw xmm6, xmm3
|
| +
|
| + psadbw xmm1, xmm0
|
| + paddw xmm7, xmm1
|
| +%endif
|
| + movdqa xmm0, XMMWORD PTR [%2+%7]
|
| + lddqu xmm1, XMMWORD PTR [%3+%8]
|
| + lddqu xmm2, XMMWORD PTR [%4+%8]
|
| + lddqu xmm3, XMMWORD PTR [%5+%8]
|
| +
|
| + psadbw xmm1, xmm0
|
| + psadbw xmm2, xmm0
|
| + psadbw xmm3, xmm0
|
| +
|
| + paddw xmm4, xmm1
|
| + lddqu xmm1, XMMWORD PTR [%6+%8]
|
| + paddw xmm5, xmm2
|
| + paddw xmm6, xmm3
|
| +
|
| +%if %1==0 || %1==1
|
| + lea %2, [%2+%7*2]
|
| + lea %3, [%3+%8*2]
|
| +
|
| + lea %4, [%4+%8*2]
|
| + lea %5, [%5+%8*2]
|
| +
|
| + lea %6, [%6+%8*2]
|
| +%endif
|
| + psadbw xmm1, xmm0
|
| + paddw xmm7, xmm1
|
| +
|
| +%endmacro
|
| +
|
| +%macro PROCESS_8X2X4 8
|
| +%if %1==0
|
| + movq mm0, QWORD PTR [%2]
|
| + movq mm4, QWORD PTR [%3]
|
| + movq mm5, QWORD PTR [%4]
|
| + movq mm6, QWORD PTR [%5]
|
| + movq mm7, QWORD PTR [%6]
|
| +
|
| + psadbw mm4, mm0
|
| + psadbw mm5, mm0
|
| + psadbw mm6, mm0
|
| + psadbw mm7, mm0
|
| +%else
|
| + movq mm0, QWORD PTR [%2]
|
| + movq mm1, QWORD PTR [%3]
|
| + movq mm2, QWORD PTR [%4]
|
| + movq mm3, QWORD PTR [%5]
|
| +
|
| + psadbw mm1, mm0
|
| + psadbw mm2, mm0
|
| + psadbw mm3, mm0
|
| +
|
| + paddw mm4, mm1
|
| + movq mm1, QWORD PTR [%6]
|
| + paddw mm5, mm2
|
| + paddw mm6, mm3
|
| +
|
| + psadbw mm1, mm0
|
| + paddw mm7, mm1
|
| +%endif
|
| + movq mm0, QWORD PTR [%2+%7]
|
| + movq mm1, QWORD PTR [%3+%8]
|
| + movq mm2, QWORD PTR [%4+%8]
|
| + movq mm3, QWORD PTR [%5+%8]
|
| +
|
| + psadbw mm1, mm0
|
| + psadbw mm2, mm0
|
| + psadbw mm3, mm0
|
| +
|
| + paddw mm4, mm1
|
| + movq mm1, QWORD PTR [%6+%8]
|
| + paddw mm5, mm2
|
| + paddw mm6, mm3
|
| +
|
| +%if %1==0 || %1==1
|
| + lea %2, [%2+%7*2]
|
| + lea %3, [%3+%8*2]
|
| +
|
| + lea %4, [%4+%8*2]
|
| + lea %5, [%5+%8*2]
|
| +
|
| + lea %6, [%6+%8*2]
|
| +%endif
|
| + psadbw mm1, mm0
|
| + paddw mm7, mm1
|
| +
|
| +%endmacro
|
| +
|
| +;void vp9_sad16x16x3_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad16x16x3_sse3)
|
| +sym(vp9_sad16x16x3_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X3
|
| +
|
| + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
|
| +
|
| + mov rcx, result_ptr
|
| +
|
| + movq xmm0, xmm5
|
| + psrldq xmm5, 8
|
| +
|
| + paddw xmm0, xmm5
|
| + movd [rcx], xmm0
|
| +;-
|
| + movq xmm0, xmm6
|
| + psrldq xmm6, 8
|
| +
|
| + paddw xmm0, xmm6
|
| + movd [rcx+4], xmm0
|
| +;-
|
| + movq xmm0, xmm7
|
| + psrldq xmm7, 8
|
| +
|
| + paddw xmm0, xmm7
|
| + movd [rcx+8], xmm0
|
| +
|
| + STACK_FRAME_DESTROY_X3
|
| +
|
| +;void vp9_sad16x8x3_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad16x8x3_sse3)
|
| +sym(vp9_sad16x8x3_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X3
|
| +
|
| + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
|
| +
|
| + mov rcx, result_ptr
|
| +
|
| + movq xmm0, xmm5
|
| + psrldq xmm5, 8
|
| +
|
| + paddw xmm0, xmm5
|
| + movd [rcx], xmm0
|
| +;-
|
| + movq xmm0, xmm6
|
| + psrldq xmm6, 8
|
| +
|
| + paddw xmm0, xmm6
|
| + movd [rcx+4], xmm0
|
| +;-
|
| + movq xmm0, xmm7
|
| + psrldq xmm7, 8
|
| +
|
| + paddw xmm0, xmm7
|
| + movd [rcx+8], xmm0
|
| +
|
| + STACK_FRAME_DESTROY_X3
|
| +
|
| +;void vp9_sad8x16x3_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad8x16x3_sse3)
|
| +sym(vp9_sad8x16x3_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X3
|
| +
|
| + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
|
| +
|
| + mov rcx, result_ptr
|
| +
|
| + punpckldq mm5, mm6
|
| +
|
| + movq [rcx], mm5
|
| + movd [rcx+8], mm7
|
| +
|
| + STACK_FRAME_DESTROY_X3
|
| +
|
| +;void vp9_sad8x8x3_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad8x8x3_sse3)
|
| +sym(vp9_sad8x8x3_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X3
|
| +
|
| + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
|
| +
|
| + mov rcx, result_ptr
|
| +
|
| + punpckldq mm5, mm6
|
| +
|
| + movq [rcx], mm5
|
| + movd [rcx+8], mm7
|
| +
|
| + STACK_FRAME_DESTROY_X3
|
| +
|
| +;void vp9_sad4x4x3_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad4x4x3_sse3)
|
| +sym(vp9_sad4x4x3_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X3
|
| +
|
| + movd mm0, DWORD PTR [src_ptr]
|
| + movd mm1, DWORD PTR [ref_ptr]
|
| +
|
| + movd mm2, DWORD PTR [src_ptr+src_stride]
|
| + movd mm3, DWORD PTR [ref_ptr+ref_stride]
|
| +
|
| + punpcklbw mm0, mm2
|
| + punpcklbw mm1, mm3
|
| +
|
| + movd mm4, DWORD PTR [ref_ptr+1]
|
| + movd mm5, DWORD PTR [ref_ptr+2]
|
| +
|
| + movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
|
| + movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
|
| +
|
| + psadbw mm1, mm0
|
| +
|
| + punpcklbw mm4, mm2
|
| + punpcklbw mm5, mm3
|
| +
|
| + psadbw mm4, mm0
|
| + psadbw mm5, mm0
|
| +
|
| + lea src_ptr, [src_ptr+src_stride*2]
|
| + lea ref_ptr, [ref_ptr+ref_stride*2]
|
| +
|
| + movd mm0, DWORD PTR [src_ptr]
|
| + movd mm2, DWORD PTR [ref_ptr]
|
| +
|
| + movd mm3, DWORD PTR [src_ptr+src_stride]
|
| + movd mm6, DWORD PTR [ref_ptr+ref_stride]
|
| +
|
| + punpcklbw mm0, mm3
|
| + punpcklbw mm2, mm6
|
| +
|
| + movd mm3, DWORD PTR [ref_ptr+1]
|
| + movd mm7, DWORD PTR [ref_ptr+2]
|
| +
|
| + psadbw mm2, mm0
|
| +
|
| + paddw mm1, mm2
|
| +
|
| + movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
|
| + movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
|
| +
|
| + punpcklbw mm3, mm2
|
| + punpcklbw mm7, mm6
|
| +
|
| + psadbw mm3, mm0
|
| + psadbw mm7, mm0
|
| +
|
| + paddw mm3, mm4
|
| + paddw mm7, mm5
|
| +
|
| + mov rcx, result_ptr
|
| +
|
| + punpckldq mm1, mm3
|
| +
|
| + movq [rcx], mm1
|
| + movd [rcx+8], mm7
|
| +
|
| + STACK_FRAME_DESTROY_X3
|
| +
|
| +;unsigned int vp9_sad16x16_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int max_err)
|
| +;%define lddqu movdqu
|
| +global sym(vp9_sad16x16_sse3)
|
| +sym(vp9_sad16x16_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X3
|
| +
|
| + mov end_ptr, 4
|
| + pxor xmm7, xmm7
|
| +
|
| +.vp9_sad16x16_sse3_loop:
|
| + movdqa xmm0, XMMWORD PTR [src_ptr]
|
| + movdqu xmm1, XMMWORD PTR [ref_ptr]
|
| + movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
|
| + movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
|
| +
|
| + lea src_ptr, [src_ptr+src_stride*2]
|
| + lea ref_ptr, [ref_ptr+ref_stride*2]
|
| +
|
| + movdqa xmm4, XMMWORD PTR [src_ptr]
|
| + movdqu xmm5, XMMWORD PTR [ref_ptr]
|
| + movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
|
| +
|
| + psadbw xmm0, xmm1
|
| +
|
| + movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
|
| +
|
| + psadbw xmm2, xmm3
|
| + psadbw xmm4, xmm5
|
| + psadbw xmm6, xmm1
|
| +
|
| + lea src_ptr, [src_ptr+src_stride*2]
|
| + lea ref_ptr, [ref_ptr+ref_stride*2]
|
| +
|
| + paddw xmm7, xmm0
|
| + paddw xmm7, xmm2
|
| + paddw xmm7, xmm4
|
| + paddw xmm7, xmm6
|
| +
|
| + sub end_ptr, 1
|
| + jne .vp9_sad16x16_sse3_loop
|
| +
|
| + movq xmm0, xmm7
|
| + psrldq xmm7, 8
|
| + paddw xmm0, xmm7
|
| + movq rax, xmm0
|
| +
|
| + STACK_FRAME_DESTROY_X3
|
| +
|
| +;void vp9_copy32xn_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *dst_ptr,
|
| +; int dst_stride,
|
| +; int height);
|
| +global sym(vp9_copy32xn_sse3)
|
| +sym(vp9_copy32xn_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X3
|
| +
|
| +.block_copy_sse3_loopx4:
|
| + lea end_ptr, [src_ptr+src_stride*2]
|
| +
|
| + movdqu xmm0, XMMWORD PTR [src_ptr]
|
| + movdqu xmm1, XMMWORD PTR [src_ptr + 16]
|
| + movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
|
| + movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
|
| + movdqu xmm4, XMMWORD PTR [end_ptr]
|
| + movdqu xmm5, XMMWORD PTR [end_ptr + 16]
|
| + movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
|
| + movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
|
| +
|
| + lea src_ptr, [src_ptr+src_stride*4]
|
| +
|
| + lea end_ptr, [ref_ptr+ref_stride*2]
|
| +
|
| + movdqa XMMWORD PTR [ref_ptr], xmm0
|
| + movdqa XMMWORD PTR [ref_ptr + 16], xmm1
|
| + movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
|
| + movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
|
| + movdqa XMMWORD PTR [end_ptr], xmm4
|
| + movdqa XMMWORD PTR [end_ptr + 16], xmm5
|
| + movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
|
| + movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
|
| +
|
| + lea ref_ptr, [ref_ptr+ref_stride*4]
|
| +
|
| + sub height, 4
|
| + cmp height, 4
|
| + jge .block_copy_sse3_loopx4
|
| +
|
| +    ;Check to see if there are more rows that need to be copied.
|
| + cmp height, 0
|
| + je .copy_is_done
|
| +
|
| +.block_copy_sse3_loop:
|
| + movdqu xmm0, XMMWORD PTR [src_ptr]
|
| + movdqu xmm1, XMMWORD PTR [src_ptr + 16]
|
| + lea src_ptr, [src_ptr+src_stride]
|
| +
|
| + movdqa XMMWORD PTR [ref_ptr], xmm0
|
| + movdqa XMMWORD PTR [ref_ptr + 16], xmm1
|
| + lea ref_ptr, [ref_ptr+ref_stride]
|
| +
|
| + sub height, 1
|
| + jne .block_copy_sse3_loop
|
| +
|
| +.copy_is_done:
|
| + STACK_FRAME_DESTROY_X3
|
| +
|
| +;void vp9_sad16x16x4d_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr_base,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad16x16x4d_sse3)
|
| +sym(vp9_sad16x16x4d_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X4
|
| +
|
| + PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| +
|
| +%if ABI_IS_32BIT
|
| + pop rbp
|
| +%endif
|
| + mov rcx, result_ptr
|
| +
|
| + movq xmm0, xmm4
|
| + psrldq xmm4, 8
|
| +
|
| + paddw xmm0, xmm4
|
| + movd [rcx], xmm0
|
| +;-
|
| + movq xmm0, xmm5
|
| + psrldq xmm5, 8
|
| +
|
| + paddw xmm0, xmm5
|
| + movd [rcx+4], xmm0
|
| +;-
|
| + movq xmm0, xmm6
|
| + psrldq xmm6, 8
|
| +
|
| + paddw xmm0, xmm6
|
| + movd [rcx+8], xmm0
|
| +;-
|
| + movq xmm0, xmm7
|
| + psrldq xmm7, 8
|
| +
|
| + paddw xmm0, xmm7
|
| + movd [rcx+12], xmm0
|
| +
|
| + STACK_FRAME_DESTROY_X4
|
| +
|
| +;void vp9_sad16x8x4d_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr_base,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad16x8x4d_sse3)
|
| +sym(vp9_sad16x8x4d_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X4
|
| +
|
| + PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| +
|
| +%if ABI_IS_32BIT
|
| + pop rbp
|
| +%endif
|
| + mov rcx, result_ptr
|
| +
|
| + movq xmm0, xmm4
|
| + psrldq xmm4, 8
|
| +
|
| + paddw xmm0, xmm4
|
| + movd [rcx], xmm0
|
| +;-
|
| + movq xmm0, xmm5
|
| + psrldq xmm5, 8
|
| +
|
| + paddw xmm0, xmm5
|
| + movd [rcx+4], xmm0
|
| +;-
|
| + movq xmm0, xmm6
|
| + psrldq xmm6, 8
|
| +
|
| + paddw xmm0, xmm6
|
| + movd [rcx+8], xmm0
|
| +;-
|
| + movq xmm0, xmm7
|
| + psrldq xmm7, 8
|
| +
|
| + paddw xmm0, xmm7
|
| + movd [rcx+12], xmm0
|
| +
|
| + STACK_FRAME_DESTROY_X4
|
| +
|
| +;void vp9_sad8x16x4d_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad8x16x4d_sse3)
|
| +sym(vp9_sad8x16x4d_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X4
|
| +
|
| + PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| +
|
| +%if ABI_IS_32BIT
|
| + pop rbp
|
| +%endif
|
| + mov rcx, result_ptr
|
| +
|
| + punpckldq mm4, mm5
|
| + punpckldq mm6, mm7
|
| +
|
| + movq [rcx], mm4
|
| + movq [rcx+8], mm6
|
| +
|
| + STACK_FRAME_DESTROY_X4
|
| +
|
| +;void vp9_sad8x8x4d_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad8x8x4d_sse3)
|
| +sym(vp9_sad8x8x4d_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X4
|
| +
|
| + PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| + PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
|
| +
|
| +%if ABI_IS_32BIT
|
| + pop rbp
|
| +%endif
|
| + mov rcx, result_ptr
|
| +
|
| + punpckldq mm4, mm5
|
| + punpckldq mm6, mm7
|
| +
|
| + movq [rcx], mm4
|
| + movq [rcx+8], mm6
|
| +
|
| + STACK_FRAME_DESTROY_X4
|
| +
|
| +;void vp9_sad4x4x4d_sse3(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; int *results)
|
| +global sym(vp9_sad4x4x4d_sse3)
|
| +sym(vp9_sad4x4x4d_sse3):
|
| +
|
| + STACK_FRAME_CREATE_X4
|
| +
|
| + movd mm0, DWORD PTR [src_ptr]
|
| + movd mm1, DWORD PTR [r0_ptr]
|
| +
|
| + movd mm2, DWORD PTR [src_ptr+src_stride]
|
| + movd mm3, DWORD PTR [r0_ptr+ref_stride]
|
| +
|
| + punpcklbw mm0, mm2
|
| + punpcklbw mm1, mm3
|
| +
|
| + movd mm4, DWORD PTR [r1_ptr]
|
| + movd mm5, DWORD PTR [r2_ptr]
|
| +
|
| + movd mm6, DWORD PTR [r3_ptr]
|
| + movd mm2, DWORD PTR [r1_ptr+ref_stride]
|
| +
|
| + movd mm3, DWORD PTR [r2_ptr+ref_stride]
|
| + movd mm7, DWORD PTR [r3_ptr+ref_stride]
|
| +
|
| + psadbw mm1, mm0
|
| +
|
| + punpcklbw mm4, mm2
|
| + punpcklbw mm5, mm3
|
| +
|
| + punpcklbw mm6, mm7
|
| + psadbw mm4, mm0
|
| +
|
| + psadbw mm5, mm0
|
| + psadbw mm6, mm0
|
| +
|
| +
|
| +
|
| + lea src_ptr, [src_ptr+src_stride*2]
|
| + lea r0_ptr, [r0_ptr+ref_stride*2]
|
| +
|
| + lea r1_ptr, [r1_ptr+ref_stride*2]
|
| + lea r2_ptr, [r2_ptr+ref_stride*2]
|
| +
|
| + lea r3_ptr, [r3_ptr+ref_stride*2]
|
| +
|
| + movd mm0, DWORD PTR [src_ptr]
|
| + movd mm2, DWORD PTR [r0_ptr]
|
| +
|
| + movd mm3, DWORD PTR [src_ptr+src_stride]
|
| + movd mm7, DWORD PTR [r0_ptr+ref_stride]
|
| +
|
| + punpcklbw mm0, mm3
|
| + punpcklbw mm2, mm7
|
| +
|
| + movd mm3, DWORD PTR [r1_ptr]
|
| + movd mm7, DWORD PTR [r2_ptr]
|
| +
|
| + psadbw mm2, mm0
|
| +%if ABI_IS_32BIT
|
| + mov rax, rbp
|
| +
|
| + pop rbp
|
| +%define ref_stride rax
|
| +%endif
|
| + mov rsi, result_ptr
|
| +
|
| + paddw mm1, mm2
|
| + movd [rsi], mm1
|
| +
|
| + movd mm2, DWORD PTR [r1_ptr+ref_stride]
|
| + movd mm1, DWORD PTR [r2_ptr+ref_stride]
|
| +
|
| + punpcklbw mm3, mm2
|
| + punpcklbw mm7, mm1
|
| +
|
| + psadbw mm3, mm0
|
| + psadbw mm7, mm0
|
| +
|
| + movd mm2, DWORD PTR [r3_ptr]
|
| + movd mm1, DWORD PTR [r3_ptr+ref_stride]
|
| +
|
| + paddw mm3, mm4
|
| + paddw mm7, mm5
|
| +
|
| + movd [rsi+4], mm3
|
| + punpcklbw mm2, mm1
|
| +
|
| + movd [rsi+8], mm7
|
| + psadbw mm2, mm0
|
| +
|
| + paddw mm2, mm6
|
| + movd [rsi+12], mm2
|
| +
|
| +
|
| + STACK_FRAME_DESTROY_X4
|
| +
|
|
|