| Index: source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
|
| diff --git a/source/libvpx/vp8/common/x86/variance_impl_mmx.asm b/source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
|
| similarity index 52%
|
| copy from source/libvpx/vp8/common/x86/variance_impl_mmx.asm
|
| copy to source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
|
| index 7d5e6810bf0d35c1ca896a81c3cff6503044e237..a8d7d99dbc0d043ffb36bfe02d42a5aaeb48d2d9 100644
|
| --- a/source/libvpx/vp8/common/x86/variance_impl_mmx.asm
|
| +++ b/source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
|
| @@ -11,9 +11,9 @@
|
|
|
| %include "vpx_ports/x86_abi_support.asm"
|
|
|
| -;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
|
| -global sym(vp8_get_mb_ss_mmx) PRIVATE
|
| -sym(vp8_get_mb_ss_mmx):
|
| +;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
|
| +global sym(vpx_get_mb_ss_mmx) PRIVATE
|
| +sym(vpx_get_mb_ss_mmx):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 7
|
| @@ -63,7 +63,7 @@ sym(vp8_get_mb_ss_mmx):
|
| ret
|
|
|
|
|
| -;unsigned int vp8_get8x8var_mmx
|
| +;void vpx_get8x8var_mmx
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int source_stride,
|
| @@ -72,8 +72,8 @@ sym(vp8_get_mb_ss_mmx):
|
| ; unsigned int *SSE,
|
| ; int *Sum
|
| ;)
|
| -global sym(vp8_get8x8var_mmx) PRIVATE
|
| -sym(vp8_get8x8var_mmx):
|
| +global sym(vpx_get8x8var_mmx) PRIVATE
|
| +sym(vpx_get8x8var_mmx):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| @@ -310,8 +310,8 @@ sym(vp8_get8x8var_mmx):
|
|
|
|
|
|
|
| -;unsigned int
|
| -;vp8_get4x4var_mmx
|
| +;void
|
| +;vpx_get4x4var_mmx
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int source_stride,
|
| @@ -320,8 +320,8 @@ sym(vp8_get8x8var_mmx):
|
| ; unsigned int *SSE,
|
| ; int *Sum
|
| ;)
|
| -global sym(vp8_get4x4var_mmx) PRIVATE
|
| -sym(vp8_get4x4var_mmx):
|
| +global sym(vpx_get4x4var_mmx) PRIVATE
|
| +sym(vpx_get4x4var_mmx):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| @@ -422,430 +422,3 @@ sym(vp8_get4x4var_mmx):
|
| UNSHADOW_ARGS
|
| pop rbp
|
| ret
|
| -
|
| -
|
| -
|
| -;unsigned int
|
| -;vp8_get4x4sse_cs_mmx
|
| -;(
|
| -; unsigned char *src_ptr,
|
| -; int source_stride,
|
| -; unsigned char *ref_ptr,
|
| -; int recon_stride
|
| -;)
|
| -global sym(vp8_get4x4sse_cs_mmx) PRIVATE
|
| -sym(vp8_get4x4sse_cs_mmx):
|
| - push rbp
|
| - mov rbp, rsp
|
| - SHADOW_ARGS_TO_STACK 4
|
| - push rsi
|
| - push rdi
|
| - push rbx
|
| - ; end prolog
|
| -
|
| -
|
| - pxor mm6, mm6 ; Blank mmx7
|
| - pxor mm7, mm7 ; Blank mmx7
|
| -
|
| - mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
| - mov rbx, arg(2) ;[ref_ptr]
|
| - movsxd rcx, dword ptr arg(1) ;[source_stride]
|
| - movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
| - ; Row 1
|
| - movd mm0, [rax] ; Copy eight bytes to mm0
|
| - movd mm1, [rbx] ; Copy eight bytes to mm1
|
| - punpcklbw mm0, mm6 ; unpack to higher prrcision
|
| - punpcklbw mm1, mm6
|
| - psubsw mm0, mm1 ; A-B (low order) to MM0
|
| - pmaddwd mm0, mm0 ; square and accumulate
|
| - add rbx,rdx ; Inc pointer into ref data
|
| - add rax,rcx ; Inc pointer into the new data
|
| - movd mm1, [rbx] ; Copy eight bytes to mm1
|
| - paddd mm7, mm0 ; accumulate in mm7
|
| -
|
| - ; Row 2
|
| - movd mm0, [rax] ; Copy eight bytes to mm0
|
| - punpcklbw mm0, mm6 ; unpack to higher prrcision
|
| - punpcklbw mm1, mm6
|
| - psubsw mm0, mm1 ; A-B (low order) to MM0
|
| - pmaddwd mm0, mm0 ; square and accumulate
|
| - add rbx,rdx ; Inc pointer into ref data
|
| - add rax,rcx ; Inc pointer into the new data
|
| - movd mm1, [rbx] ; Copy eight bytes to mm1
|
| - paddd mm7, mm0 ; accumulate in mm7
|
| -
|
| - ; Row 3
|
| - movd mm0, [rax] ; Copy eight bytes to mm0
|
| - punpcklbw mm1, mm6
|
| - punpcklbw mm0, mm6 ; unpack to higher prrcision
|
| - psubsw mm0, mm1 ; A-B (low order) to MM0
|
| -
|
| - pmaddwd mm0, mm0 ; square and accumulate
|
| - add rbx,rdx ; Inc pointer into ref data
|
| - add rax,rcx ; Inc pointer into the new data
|
| - movd mm1, [rbx] ; Copy eight bytes to mm1
|
| - paddd mm7, mm0 ; accumulate in mm7
|
| -
|
| - ; Row 4
|
| - movd mm0, [rax] ; Copy eight bytes to mm0
|
| - punpcklbw mm0, mm6 ; unpack to higher prrcision
|
| - punpcklbw mm1, mm6
|
| - psubsw mm0, mm1 ; A-B (low order) to MM0
|
| - pmaddwd mm0, mm0 ; square and accumulate
|
| - paddd mm7, mm0 ; accumulate in mm7
|
| -
|
| - movq mm0, mm7 ;
|
| - psrlq mm7, 32
|
| -
|
| - paddd mm0, mm7
|
| - movq rax, mm0
|
| -
|
| -
|
| - ; begin epilog
|
| - pop rbx
|
| - pop rdi
|
| - pop rsi
|
| - UNSHADOW_ARGS
|
| - pop rbp
|
| - ret
|
| -
|
| -%define mmx_filter_shift 7
|
| -
|
| -;void vp8_filter_block2d_bil4x4_var_mmx
|
| -;(
|
| -; unsigned char *ref_ptr,
|
| -; int ref_pixels_per_line,
|
| -; unsigned char *src_ptr,
|
| -; int src_pixels_per_line,
|
| -; unsigned short *HFilter,
|
| -; unsigned short *VFilter,
|
| -; int *sum,
|
| -; unsigned int *sumsquared
|
| -;)
|
| -global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
|
| -sym(vp8_filter_block2d_bil4x4_var_mmx):
|
| - push rbp
|
| - mov rbp, rsp
|
| - SHADOW_ARGS_TO_STACK 8
|
| - GET_GOT rbx
|
| - push rsi
|
| - push rdi
|
| - sub rsp, 16
|
| - ; end prolog
|
| -
|
| -
|
| - pxor mm6, mm6 ;
|
| - pxor mm7, mm7 ;
|
| -
|
| - mov rax, arg(4) ;HFilter ;
|
| - mov rdx, arg(5) ;VFilter ;
|
| -
|
| - mov rsi, arg(0) ;ref_ptr ;
|
| - mov rdi, arg(2) ;src_ptr ;
|
| -
|
| - mov rcx, 4 ;
|
| - pxor mm0, mm0 ;
|
| -
|
| - movd mm1, [rsi] ;
|
| - movd mm3, [rsi+1] ;
|
| -
|
| - punpcklbw mm1, mm0 ;
|
| - pmullw mm1, [rax] ;
|
| -
|
| - punpcklbw mm3, mm0 ;
|
| - pmullw mm3, [rax+8] ;
|
| -
|
| - paddw mm1, mm3 ;
|
| - paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
| -
|
| - psraw mm1, mmx_filter_shift ;
|
| - movq mm5, mm1
|
| -
|
| -%if ABI_IS_32BIT
|
| - add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
| -%else
|
| - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
| - add rsi, r8
|
| -%endif
|
| -
|
| -.filter_block2d_bil4x4_var_mmx_loop:
|
| -
|
| - movd mm1, [rsi] ;
|
| - movd mm3, [rsi+1] ;
|
| -
|
| - punpcklbw mm1, mm0 ;
|
| - pmullw mm1, [rax] ;
|
| -
|
| - punpcklbw mm3, mm0 ;
|
| - pmullw mm3, [rax+8] ;
|
| -
|
| - paddw mm1, mm3 ;
|
| - paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
| -
|
| - psraw mm1, mmx_filter_shift ;
|
| - movq mm3, mm5 ;
|
| -
|
| - movq mm5, mm1 ;
|
| - pmullw mm3, [rdx] ;
|
| -
|
| - pmullw mm1, [rdx+8] ;
|
| - paddw mm1, mm3 ;
|
| -
|
| -
|
| - paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
| - psraw mm1, mmx_filter_shift ;
|
| -
|
| - movd mm3, [rdi] ;
|
| - punpcklbw mm3, mm0 ;
|
| -
|
| - psubw mm1, mm3 ;
|
| - paddw mm6, mm1 ;
|
| -
|
| - pmaddwd mm1, mm1 ;
|
| - paddd mm7, mm1 ;
|
| -
|
| -%if ABI_IS_32BIT
|
| - add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
| - add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
| -%else
|
| - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
| - movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
| - add rsi, r8
|
| - add rdi, r9
|
| -%endif
|
| - sub rcx, 1 ;
|
| - jnz .filter_block2d_bil4x4_var_mmx_loop ;
|
| -
|
| -
|
| - pxor mm3, mm3 ;
|
| - pxor mm2, mm2 ;
|
| -
|
| - punpcklwd mm2, mm6 ;
|
| - punpckhwd mm3, mm6 ;
|
| -
|
| - paddd mm2, mm3 ;
|
| - movq mm6, mm2 ;
|
| -
|
| - psrlq mm6, 32 ;
|
| - paddd mm2, mm6 ;
|
| -
|
| - psrad mm2, 16 ;
|
| - movq mm4, mm7 ;
|
| -
|
| - psrlq mm4, 32 ;
|
| - paddd mm4, mm7 ;
|
| -
|
| - mov rdi, arg(6) ;sum
|
| - mov rsi, arg(7) ;sumsquared
|
| -
|
| - movd dword ptr [rdi], mm2 ;
|
| - movd dword ptr [rsi], mm4 ;
|
| -
|
| -
|
| -
|
| - ; begin epilog
|
| - add rsp, 16
|
| - pop rdi
|
| - pop rsi
|
| - RESTORE_GOT
|
| - UNSHADOW_ARGS
|
| - pop rbp
|
| - ret
|
| -
|
| -
|
| -
|
| -
|
| -;void vp8_filter_block2d_bil_var_mmx
|
| -;(
|
| -; unsigned char *ref_ptr,
|
| -; int ref_pixels_per_line,
|
| -; unsigned char *src_ptr,
|
| -; int src_pixels_per_line,
|
| -; unsigned int Height,
|
| -; unsigned short *HFilter,
|
| -; unsigned short *VFilter,
|
| -; int *sum,
|
| -; unsigned int *sumsquared
|
| -;)
|
| -global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
|
| -sym(vp8_filter_block2d_bil_var_mmx):
|
| - push rbp
|
| - mov rbp, rsp
|
| - SHADOW_ARGS_TO_STACK 9
|
| - GET_GOT rbx
|
| - push rsi
|
| - push rdi
|
| - sub rsp, 16
|
| - ; end prolog
|
| -
|
| - pxor mm6, mm6 ;
|
| - pxor mm7, mm7 ;
|
| - mov rax, arg(5) ;HFilter ;
|
| -
|
| - mov rdx, arg(6) ;VFilter ;
|
| - mov rsi, arg(0) ;ref_ptr ;
|
| -
|
| - mov rdi, arg(2) ;src_ptr ;
|
| - movsxd rcx, dword ptr arg(4) ;Height ;
|
| -
|
| - pxor mm0, mm0 ;
|
| - movq mm1, [rsi] ;
|
| -
|
| - movq mm3, [rsi+1] ;
|
| - movq mm2, mm1 ;
|
| -
|
| - movq mm4, mm3 ;
|
| - punpcklbw mm1, mm0 ;
|
| -
|
| - punpckhbw mm2, mm0 ;
|
| - pmullw mm1, [rax] ;
|
| -
|
| - pmullw mm2, [rax] ;
|
| - punpcklbw mm3, mm0 ;
|
| -
|
| - punpckhbw mm4, mm0 ;
|
| - pmullw mm3, [rax+8] ;
|
| -
|
| - pmullw mm4, [rax+8] ;
|
| - paddw mm1, mm3 ;
|
| -
|
| - paddw mm2, mm4 ;
|
| - paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
| -
|
| - psraw mm1, mmx_filter_shift ;
|
| - paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
| -
|
| - psraw mm2, mmx_filter_shift ;
|
| - movq mm5, mm1
|
| -
|
| - packuswb mm5, mm2 ;
|
| -%if ABI_IS_32BIT
|
| - add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
| -%else
|
| - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
| - add rsi, r8
|
| -%endif
|
| -
|
| -.filter_block2d_bil_var_mmx_loop:
|
| -
|
| - movq mm1, [rsi] ;
|
| - movq mm3, [rsi+1] ;
|
| -
|
| - movq mm2, mm1 ;
|
| - movq mm4, mm3 ;
|
| -
|
| - punpcklbw mm1, mm0 ;
|
| - punpckhbw mm2, mm0 ;
|
| -
|
| - pmullw mm1, [rax] ;
|
| - pmullw mm2, [rax] ;
|
| -
|
| - punpcklbw mm3, mm0 ;
|
| - punpckhbw mm4, mm0 ;
|
| -
|
| - pmullw mm3, [rax+8] ;
|
| - pmullw mm4, [rax+8] ;
|
| -
|
| - paddw mm1, mm3 ;
|
| - paddw mm2, mm4 ;
|
| -
|
| - paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
| - psraw mm1, mmx_filter_shift ;
|
| -
|
| - paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
| - psraw mm2, mmx_filter_shift ;
|
| -
|
| - movq mm3, mm5 ;
|
| - movq mm4, mm5 ;
|
| -
|
| - punpcklbw mm3, mm0 ;
|
| - punpckhbw mm4, mm0 ;
|
| -
|
| - movq mm5, mm1 ;
|
| - packuswb mm5, mm2 ;
|
| -
|
| - pmullw mm3, [rdx] ;
|
| - pmullw mm4, [rdx] ;
|
| -
|
| - pmullw mm1, [rdx+8] ;
|
| - pmullw mm2, [rdx+8] ;
|
| -
|
| - paddw mm1, mm3 ;
|
| - paddw mm2, mm4 ;
|
| -
|
| - paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
| - paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
| -
|
| - psraw mm1, mmx_filter_shift ;
|
| - psraw mm2, mmx_filter_shift ;
|
| -
|
| - movq mm3, [rdi] ;
|
| - movq mm4, mm3 ;
|
| -
|
| - punpcklbw mm3, mm0 ;
|
| - punpckhbw mm4, mm0 ;
|
| -
|
| - psubw mm1, mm3 ;
|
| - psubw mm2, mm4 ;
|
| -
|
| - paddw mm6, mm1 ;
|
| - pmaddwd mm1, mm1 ;
|
| -
|
| - paddw mm6, mm2 ;
|
| - pmaddwd mm2, mm2 ;
|
| -
|
| - paddd mm7, mm1 ;
|
| - paddd mm7, mm2 ;
|
| -
|
| -%if ABI_IS_32BIT
|
| - add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
| - add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
| -%else
|
| - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
| - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
|
| - add rsi, r8
|
| - add rdi, r9
|
| -%endif
|
| - sub rcx, 1 ;
|
| - jnz .filter_block2d_bil_var_mmx_loop ;
|
| -
|
| -
|
| - pxor mm3, mm3 ;
|
| - pxor mm2, mm2 ;
|
| -
|
| - punpcklwd mm2, mm6 ;
|
| - punpckhwd mm3, mm6 ;
|
| -
|
| - paddd mm2, mm3 ;
|
| - movq mm6, mm2 ;
|
| -
|
| - psrlq mm6, 32 ;
|
| - paddd mm2, mm6 ;
|
| -
|
| - psrad mm2, 16 ;
|
| - movq mm4, mm7 ;
|
| -
|
| - psrlq mm4, 32 ;
|
| - paddd mm4, mm7 ;
|
| -
|
| - mov rdi, arg(7) ;sum
|
| - mov rsi, arg(8) ;sumsquared
|
| -
|
| - movd dword ptr [rdi], mm2 ;
|
| - movd dword ptr [rsi], mm4 ;
|
| -
|
| - ; begin epilog
|
| - add rsp, 16
|
| - pop rdi
|
| - pop rsi
|
| - RESTORE_GOT
|
| - UNSHADOW_ARGS
|
| - pop rbp
|
| - ret
|
| -
|
| -
|
| -SECTION_RODATA
|
| -;short mmx_bi_rd[4] = { 64, 64, 64, 64};
|
| -align 16
|
| -mmx_bi_rd:
|
| - times 4 dw 64
|
|
|