| Index: source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
|
| ===================================================================
|
| --- source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm (revision 0)
|
| +++ source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm (revision 0)
|
| @@ -0,0 +1,851 @@
|
| +;
|
| +; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
| +;
|
| +; Use of this source code is governed by a BSD-style license
|
| +; that can be found in the LICENSE file in the root of the source
|
| +; tree. An additional intellectual property rights grant can be found
|
| +; in the file PATENTS. All contributing project authors may
|
| +; be found in the AUTHORS file in the root of the source tree.
|
| +;
|
| +
|
| +
|
| +%include "vpx_ports/x86_abi_support.asm"
|
| +
|
| +;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
|
| +;
|
| +; Returns the sum of the squares of the 256 consecutive 16-bit values
|
| +; starting at src_ptr (16 loop iterations, 16 values per iteration).
|
| +; mm4 carries two 32-bit partial sums that are added together at the end.
|
| +global sym(vp9_get_mb_ss_mmx)
|
| +sym(vp9_get_mb_ss_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + ; NOTE(review): only arg(0) is read below; the count of 7 looks larger
|
| + ; than needed - confirm against the macro definition before changing it.
|
| + SHADOW_ARGS_TO_STACK 7
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + sub rsp, 8 ; 8 bytes of scratch for spilling mm4
|
| + ; end prolog
|
| +
|
| + mov rax, arg(0) ;src_ptr
|
| + mov rcx, 16 ; rows left to process
|
| + pxor mm4, mm4 ; clear the sum-of-squares accumulator
|
| +
|
| +.NEXTROW:
|
| + movq mm0, [rax] ; values 0..3 of this row
|
| + movq mm1, [rax+8] ; values 4..7
|
| + movq mm2, [rax+16] ; values 8..11
|
| + movq mm3, [rax+24] ; values 12..15
|
| + pmaddwd mm0, mm0 ; each dword = a*a + b*b of a word pair
|
| + pmaddwd mm1, mm1
|
| + pmaddwd mm2, mm2
|
| + pmaddwd mm3, mm3
|
| +
|
| + paddd mm4, mm0
|
| + paddd mm4, mm1
|
| + paddd mm4, mm2
|
| + paddd mm4, mm3
|
| +
|
| + add rax, 32 ; advance 16 shorts
|
| + dec rcx
|
| + jnz .NEXTROW ; dec does not write CF, so branch on ZF only
|
| + movq QWORD PTR [rsp], mm4
|
| +
|
| + ;return sum[0]+sum[1];
|
| + movsxd rax, dword ptr [rsp]
|
| + movsxd rcx, dword ptr [rsp+4]
|
| + add rax, rcx
|
| +
|
| +
|
| + ; begin epilog
|
| + add rsp, 8
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;unsigned int vp9_get8x8var_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int source_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int recon_stride,
|
| +; unsigned int *SSE,
|
| +; int *Sum
|
| +;)
|
| +;
|
| +; For an 8x8 block of bytes, writes
|
| +; *Sum = sum of (src - ref) over all 64 pixels
|
| +; *SSE = sum of (src - ref)^2 over all 64 pixels
|
| +; and always returns 0.
|
| +;
|
| +; Register roles:
|
| +; rax / rcx = current src row / source_stride
|
| +; rbx / rdx = current ref row / recon_stride
|
| +; mm5 = four 16-bit running sums of differences
|
| +; mm6 = zero (used to widen bytes to words)
|
| +; mm7 = two 32-bit running sums of squared differences
|
| +global sym(vp9_get8x8var_mmx)
|
| +sym(vp9_get8x8var_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + push rsi
|
| + push rdi
|
| + push rbx
|
| + sub rsp, 16
|
| + ; end prolog
|
| +
|
| + mov rax, arg(0) ;src_ptr
|
| + movsxd rcx, dword ptr arg(1) ;source_stride
|
| + mov rbx, arg(2) ;ref_ptr
|
| + movsxd rdx, dword ptr arg(3) ;recon_stride
|
| +
|
| + pxor mm5, mm5 ; difference accumulator
|
| + pxor mm6, mm6 ; constant zero
|
| + pxor mm7, mm7 ; squared-difference accumulator
|
| +
|
| + ; The row body below is expanded eight times by the assembler, once
|
| + ; per row of the block, so the generated code stays fully unrolled.
|
| +%rep 8
|
| + movq mm0, [rax] ; eight src pixels
|
| + movq mm1, [rbx] ; eight ref pixels
|
| + movq mm2, mm0
|
| + movq mm3, mm1
|
| +
|
| + punpcklbw mm0, mm6 ; src pixels 0..3 as words
|
| + punpcklbw mm1, mm6 ; ref pixels 0..3 as words
|
| + punpckhbw mm2, mm6 ; src pixels 4..7 as words
|
| + punpckhbw mm3, mm6 ; ref pixels 4..7 as words
|
| +
|
| + psubsw mm0, mm1 ; src - ref, pixels 0..3
|
| + psubsw mm2, mm3 ; src - ref, pixels 4..7
|
| +
|
| + paddw mm5, mm0 ; add differences to the running sum
|
| + paddw mm5, mm2
|
| +
|
| + pmaddwd mm0, mm0 ; square and pair-wise add
|
| + pmaddwd mm2, mm2
|
| + paddd mm7, mm0 ; add squares to the running SSE
|
| + paddd mm7, mm2
|
| +
|
| + add rax, rcx ; next src row
|
| + add rbx, rdx ; next ref row
|
| +%endrep
|
| +
|
| + ; Spill both accumulators to the stack and fold their lanes.
|
| + movq QWORD PTR [rsp], mm7 ; two 32-bit SSE lanes
|
| + movq QWORD PTR [rsp+8], mm5 ; four 16-bit sum lanes
|
| +
|
| + movsxd rax, DWORD PTR [rsp]
|
| + movsxd rcx, DWORD PTR [rsp+4]
|
| + add rax, rcx ;XXSum
|
| + mov rsi, arg(4) ;SSE
|
| + mov dword ptr [rsi], eax
|
| +
|
| + movsx rdx, WORD PTR [rsp+8]
|
| + movsx rcx, WORD PTR [rsp+10]
|
| + add rdx, rcx
|
| + movsx rcx, WORD PTR [rsp+12]
|
| + add rdx, rcx
|
| + movsx rcx, WORD PTR [rsp+14]
|
| + add rdx, rcx ;XSum
|
| + mov rdi, arg(5) ;Sum
|
| + mov dword ptr [rdi], edx
|
| +
|
| + xor rax, rax ; return 0
|
| +
|
| + ; begin epilog
|
| + add rsp, 16
|
| + pop rbx
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +
|
| +;unsigned int
|
| +;vp9_get4x4var_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int source_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int recon_stride,
|
| +; unsigned int *SSE,
|
| +; int *Sum
|
| +;)
|
| +;
|
| +; For a 4x4 block of bytes, writes
|
| +; *Sum = sum of (src - ref) over all 16 pixels
|
| +; *SSE = sum of (src - ref)^2 over all 16 pixels
|
| +; and always returns 0.
|
| +;
|
| +; Each row is loaded with movd so that exactly four bytes are read;
|
| +; only the low four bytes are consumed by punpcklbw, so an eight-byte
|
| +; load would touch memory beyond the block for no benefit.
|
| +;
|
| +; Register roles:
|
| +; rax / rcx = current src row / source_stride
|
| +; rbx / rdx = current ref row / recon_stride
|
| +; mm5 = four 16-bit running sums of differences
|
| +; mm6 = zero (used to widen bytes to words)
|
| +; mm7 = two 32-bit running sums of squared differences
|
| +global sym(vp9_get4x4var_mmx)
|
| +sym(vp9_get4x4var_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + push rsi
|
| + push rdi
|
| + push rbx
|
| + sub rsp, 16
|
| + ; end prolog
|
| +
|
| + pxor mm5, mm5 ; difference accumulator
|
| + pxor mm6, mm6 ; constant zero
|
| + pxor mm7, mm7 ; squared-difference accumulator
|
| +
|
| + mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
| + mov rbx, arg(2) ;[ref_ptr]
|
| + movsxd rcx, dword ptr arg(1) ;[source_stride]
|
| + movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
| +
|
| + ; Rows 1-3: process the row, then step both pointers to the next row.
|
| +%rep 3
|
| + movd mm0, [rax] ; four src pixels
|
| + movd mm1, [rbx] ; four ref pixels
|
| + punpcklbw mm0, mm6 ; widen to words
|
| + punpcklbw mm1, mm6
|
| + psubsw mm0, mm1 ; src - ref
|
| + paddw mm5, mm0 ; accumulate differences
|
| + pmaddwd mm0, mm0 ; square and pair-wise add
|
| + paddd mm7, mm0 ; accumulate squares
|
| + add rax, rcx ; next src row
|
| + add rbx, rdx ; next ref row
|
| +%endrep
|
| +
|
| + ; Row 4: last row, no pointer advance needed.
|
| + movd mm0, [rax] ; four src pixels
|
| + movd mm1, [rbx] ; four ref pixels
|
| + punpcklbw mm0, mm6
|
| + punpcklbw mm1, mm6
|
| + psubsw mm0, mm1
|
| + paddw mm5, mm0
|
| + pmaddwd mm0, mm0
|
| + paddd mm7, mm0
|
| +
|
| + ; Now accumulate the final results.
|
| + movq QWORD PTR [rsp+8], mm5 ; four 16-bit sum lanes
|
| + movq QWORD PTR [rsp], mm7 ; two 32-bit SSE lanes
|
| + movsx rdx, WORD PTR [rsp+8]
|
| + movsx rcx, WORD PTR [rsp+10]
|
| + movsx rbx, WORD PTR [rsp+12]
|
| + movsx rax, WORD PTR [rsp+14]
|
| + add rdx, rcx
|
| + add rbx, rax
|
| + add rdx, rbx ;XSum
|
| + movsxd rax, DWORD PTR [rsp]
|
| + movsxd rcx, DWORD PTR [rsp+4]
|
| + add rax, rcx ;XXSum
|
| + mov rsi, arg(4) ;SSE
|
| + mov rdi, arg(5) ;Sum
|
| + mov dword ptr [rsi], eax
|
| + mov dword ptr [rdi], edx
|
| + xor rax, rax ; return 0
|
| +
|
| +
|
| + ; begin epilog
|
| + add rsp, 16
|
| + pop rbx
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +
|
| +;unsigned int
|
| +;vp9_get4x4sse_cs_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int source_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int recon_stride
|
| +;)
|
| +;
|
| +; Returns the sum of squared differences between the 4x4 byte block at
|
| +; src_ptr and the 4x4 byte block at ref_ptr. The total is in the low
|
| +; 32 bits of the return register.
|
| +;
|
| +; Register roles:
|
| +; rax / rcx = current src row / source_stride
|
| +; rbx / rdx = current ref row / recon_stride
|
| +; mm6 = zero (used to widen bytes to words)
|
| +; mm7 = two 32-bit running sums of squared differences
|
| +global sym(vp9_get4x4sse_cs_mmx)
|
| +sym(vp9_get4x4sse_cs_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 4
|
| + push rsi
|
| + push rdi
|
| + push rbx
|
| + ; end prolog
|
| +
|
| + mov rax, arg(0) ;src_ptr
|
| + movsxd rcx, dword ptr arg(1) ;source_stride
|
| + mov rbx, arg(2) ;ref_ptr
|
| + movsxd rdx, dword ptr arg(3) ;recon_stride
|
| +
|
| + pxor mm6, mm6 ; constant zero
|
| + pxor mm7, mm7 ; squared-difference accumulator
|
| +
|
| + ; Rows 1-3: process the row, then step both pointers to the next row.
|
| +%rep 3
|
| + movd mm0, [rax] ; four src pixels
|
| + movd mm1, [rbx] ; four ref pixels
|
| + punpcklbw mm0, mm6 ; widen to words
|
| + punpcklbw mm1, mm6
|
| + psubsw mm0, mm1 ; src - ref
|
| + pmaddwd mm0, mm0 ; square and pair-wise add
|
| + paddd mm7, mm0 ; accumulate squares
|
| + add rax, rcx ; next src row
|
| + add rbx, rdx ; next ref row
|
| +%endrep
|
| +
|
| + ; Row 4: last row, pointers are not advanced afterwards.
|
| + movd mm0, [rax] ; four src pixels
|
| + movd mm1, [rbx] ; four ref pixels
|
| + punpcklbw mm0, mm6
|
| + punpcklbw mm1, mm6
|
| + psubsw mm0, mm1
|
| + pmaddwd mm0, mm0
|
| + paddd mm7, mm0
|
| +
|
| + ; Fold the high lane into the low lane and return it.
|
| + movq mm1, mm7
|
| + psrlq mm1, 32 ; mm1 low dword = high lane of mm7
|
| + paddd mm7, mm1 ; mm7 low dword = total
|
| + movq rax, mm7
|
| +
|
| + ; begin epilog
|
| + pop rbx
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +; Arithmetic right shift applied after each bilinear filter pass below;
|
| +; every shift is preceded by adding the rounding constant mmx_bi_rd.
|
| +%define mmx_filter_shift 7
|
| +
|
| +;void vp9_filter_block2d_bil4x4_var_mmx
|
| +;(
|
| +; unsigned char *ref_ptr,
|
| +; int ref_pixels_per_line,
|
| +; unsigned char *src_ptr,
|
| +; int src_pixels_per_line,
|
| +; unsigned short *HFilter,
|
| +; unsigned short *VFilter,
|
| +; int *sum,
|
| +; unsigned int *sumsquared
|
| +;)
|
| +;
|
| +; Applies a two-tap horizontal then two-tap vertical filter to a 4-wide,
|
| +; 4-high area of ref_ptr, subtracts the matching src_ptr pixels and writes
|
| +; *sum = sum of the 16 differences
|
| +; *sumsquared = sum of the 16 squared differences
|
| +;
|
| +; Five ref rows are read (one before the loop plus one per iteration),
|
| +; and each row read covers bytes [0..4] because of the [rsi+1] load.
|
| +; Each filter tap is read as four words: tap 0 at [filter], tap 1 at
|
| +; [filter+8] - presumably each tap value is replicated four times by the
|
| +; caller; verify against the filter tables.
|
| +;
|
| +; Register roles:
|
| +; rax = HFilter, rdx = VFilter, rsi = ref row, rdi = src row
|
| +; rcx = rows remaining
|
| +; mm0 = zero, mm5 = horizontally filtered previous row
|
| +; mm6 = four 16-bit running sums of differences
|
| +; mm7 = two 32-bit running sums of squared differences
|
| +global sym(vp9_filter_block2d_bil4x4_var_mmx)
|
| +sym(vp9_filter_block2d_bil4x4_var_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 8
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + sub rsp, 16
|
| + ; end prolog
|
| +
|
| +
|
| + pxor mm6, mm6 ; clear the difference accumulator
|
| + pxor mm7, mm7 ; clear the squared-difference accumulator
|
| +
|
| + mov rax, arg(4) ;HFilter ;
|
| + mov rdx, arg(5) ;VFilter ;
|
| +
|
| + mov rsi, arg(0) ;ref_ptr ;
|
| + mov rdi, arg(2) ;src_ptr ;
|
| +
|
| + mov rcx, 4 ; four output rows
|
| + pxor mm0, mm0 ; constant zero for byte -> word unpacking
|
| +
|
| + ; Horizontal pass on the first ref row; result is kept in mm5.
|
| + movd mm1, [rsi] ; ref pixels x..x+3
|
| + movd mm3, [rsi+1] ; ref pixels x+1..x+4
|
| +
|
| + punpcklbw mm1, mm0 ; widen to words
|
| + pmullw mm1, [rax] ; * horizontal tap 0
|
| +
|
| + punpcklbw mm3, mm0 ; widen to words
|
| + pmullw mm3, [rax+8] ; * horizontal tap 1
|
| +
|
| + paddw mm1, mm3 ; sum of both taps
|
| + paddw mm1, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| +
|
| + psraw mm1, mmx_filter_shift ; scale back down
|
| + movq mm5, mm1
|
| +
|
| +%if ABI_IS_32BIT
|
| + add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
| +%else
|
| + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
| + add rsi, r8
|
| +%endif
|
| +
|
| +.filter_block2d_bil4x4_var_mmx_loop:
|
| +
|
| + ; Horizontal pass on the current ref row (same steps as above).
|
| + movd mm1, [rsi] ; ref pixels x..x+3
|
| + movd mm3, [rsi+1] ; ref pixels x+1..x+4
|
| +
|
| + punpcklbw mm1, mm0 ; widen to words
|
| + pmullw mm1, [rax] ; * horizontal tap 0
|
| +
|
| + punpcklbw mm3, mm0 ; widen to words
|
| + pmullw mm3, [rax+8] ; * horizontal tap 1
|
| +
|
| + paddw mm1, mm3 ; sum of both taps
|
| + paddw mm1, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| +
|
| + psraw mm1, mmx_filter_shift ; mm1 = filtered current row
|
| + movq mm3, mm5 ; mm3 = filtered previous row
|
| +
|
| + ; Vertical pass: blend previous and current filtered rows.
|
| + movq mm5, mm1 ; current row becomes "previous" for the next iteration
|
| + pmullw mm3, [rdx] ; previous row * vertical tap 0
|
| +
|
| + pmullw mm1, [rdx+8] ; current row * vertical tap 1
|
| + paddw mm1, mm3 ; sum of both taps
|
| +
|
| +
|
| + paddw mm1, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| + psraw mm1, mmx_filter_shift ; mm1 = fully filtered prediction row
|
| +
|
| + movd mm3, [rdi] ; four src pixels
|
| + punpcklbw mm3, mm0 ; widen to words
|
| +
|
| + psubw mm1, mm3 ; prediction - src
|
| + paddw mm6, mm1 ; accumulate differences
|
| +
|
| + pmaddwd mm1, mm1 ; square and pair-wise add
|
| + paddd mm7, mm1 ; accumulate squared differences
|
| +
|
| +%if ABI_IS_32BIT
|
| + add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
| + add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
| +%else
|
| + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
| + movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
| + add rsi, r8
|
| + add rdi, r9
|
| +%endif
|
| + sub rcx, 1 ; one row done
|
| + jnz .filter_block2d_bil4x4_var_mmx_loop ; loop until all four rows are done
|
| +
|
| +
|
| + ; Reduce mm6 (four signed words) to one signed dword: place each word
|
| + ; in the upper half of a dword, add, then shift right arithmetically.
|
| + pxor mm3, mm3 ;
|
| + pxor mm2, mm2 ;
|
| +
|
| + punpcklwd mm2, mm6 ; words 0,1 of mm6 -> high halves of two dwords
|
| + punpckhwd mm3, mm6 ; words 2,3 of mm6 -> high halves of two dwords
|
| +
|
| + paddd mm2, mm3 ;
|
| + movq mm6, mm2 ;
|
| +
|
| + psrlq mm6, 32 ; bring the high dword down
|
| + paddd mm2, mm6 ; low dword = (total of all four words) << 16
|
| +
|
| + psrad mm2, 16 ; sign-extending shift back: low dword = sum
|
| + movq mm4, mm7 ;
|
| +
|
| + psrlq mm4, 32 ; high dword of the squared sums
|
| + paddd mm4, mm7 ; low dword = total squared differences
|
| +
|
| + mov rdi, arg(6) ;sum
|
| + mov rsi, arg(7) ;sumsquared
|
| +
|
| + movd dword ptr [rdi], mm2 ; *sum
|
| + movd dword ptr [rsi], mm4 ; *sumsquared
|
| +
|
| +
|
| +
|
| + ; begin epilog
|
| + add rsp, 16
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +
|
| +
|
| +;void vp9_filter_block2d_bil_var_mmx
|
| +;(
|
| +; unsigned char *ref_ptr,
|
| +; int ref_pixels_per_line,
|
| +; unsigned char *src_ptr,
|
| +; int src_pixels_per_line,
|
| +; unsigned int Height,
|
| +; unsigned short *HFilter,
|
| +; unsigned short *VFilter,
|
| +; int *sum,
|
| +; unsigned int *sumsquared
|
| +;)
|
| +;
|
| +; Applies a two-tap horizontal then two-tap vertical filter to an 8-wide,
|
| +; Height-high area of ref_ptr, subtracts the matching src_ptr pixels and
|
| +; writes
|
| +; *sum = sum of the differences
|
| +; *sumsquared = sum of the squared differences
|
| +;
|
| +; Height + 1 ref rows are read (one before the loop plus one per
|
| +; iteration), and each row read covers bytes [0..8] because of the
|
| +; [rsi+1] load. The loop counter is decremented before it is tested, so
|
| +; Height must be non-zero.
|
| +; Each filter tap is read as four words: tap 0 at [filter], tap 1 at
|
| +; [filter+8] - presumably each tap value is replicated four times by the
|
| +; caller; verify against the filter tables.
|
| +;
|
| +; Register roles:
|
| +; rax = HFilter, rdx = VFilter, rsi = ref row, rdi = src row
|
| +; rcx = rows remaining
|
| +; mm0 = zero
|
| +; mm5 = horizontally filtered previous row, packed to eight bytes
|
| +; mm6 = four 16-bit running sums of differences
|
| +; mm7 = two 32-bit running sums of squared differences
|
| +global sym(vp9_filter_block2d_bil_var_mmx)
|
| +sym(vp9_filter_block2d_bil_var_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 9
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + sub rsp, 16
|
| + ; end prolog
|
| +
|
| + pxor mm6, mm6 ; clear the difference accumulator
|
| + pxor mm7, mm7 ; clear the squared-difference accumulator
|
| + mov rax, arg(5) ;HFilter ;
|
| +
|
| + mov rdx, arg(6) ;VFilter ;
|
| + mov rsi, arg(0) ;ref_ptr ;
|
| +
|
| + mov rdi, arg(2) ;src_ptr ;
|
| + movsxd rcx, dword ptr arg(4) ;Height ;
|
| +
|
| + ; Horizontal pass on the first ref row; low half in mm1/mm3, high
|
| + ; half in mm2/mm4.
|
| + pxor mm0, mm0 ; constant zero for byte -> word unpacking
|
| + movq mm1, [rsi] ; ref pixels x..x+7
|
| +
|
| + movq mm3, [rsi+1] ; ref pixels x+1..x+8
|
| + movq mm2, mm1 ;
|
| +
|
| + movq mm4, mm3 ;
|
| + punpcklbw mm1, mm0 ; pixels x..x+3 as words
|
| +
|
| + punpckhbw mm2, mm0 ; pixels x+4..x+7 as words
|
| + pmullw mm1, [rax] ; * horizontal tap 0
|
| +
|
| + pmullw mm2, [rax] ; * horizontal tap 0
|
| + punpcklbw mm3, mm0 ; pixels x+1..x+4 as words
|
| +
|
| + punpckhbw mm4, mm0 ; pixels x+5..x+8 as words
|
| + pmullw mm3, [rax+8] ; * horizontal tap 1
|
| +
|
| + pmullw mm4, [rax+8] ; * horizontal tap 1
|
| + paddw mm1, mm3 ; sum of both taps, low half
|
| +
|
| + paddw mm2, mm4 ; sum of both taps, high half
|
| + paddw mm1, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| +
|
| + psraw mm1, mmx_filter_shift ; scale back down
|
| + paddw mm2, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| +
|
| + psraw mm2, mmx_filter_shift ; scale back down
|
| + movq mm5, mm1
|
| +
|
| + packuswb mm5, mm2 ; keep the filtered row as eight bytes in mm5
|
| +%if ABI_IS_32BIT
|
| + add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
| +%else
|
| + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
| + add rsi, r8
|
| +%endif
|
| +
|
| +.filter_block2d_bil_var_mmx_loop:
|
| +
|
| + ; Horizontal pass on the current ref row (same steps as above).
|
| + movq mm1, [rsi] ; ref pixels x..x+7
|
| + movq mm3, [rsi+1] ; ref pixels x+1..x+8
|
| +
|
| + movq mm2, mm1 ;
|
| + movq mm4, mm3 ;
|
| +
|
| + punpcklbw mm1, mm0 ; pixels x..x+3 as words
|
| + punpckhbw mm2, mm0 ; pixels x+4..x+7 as words
|
| +
|
| + pmullw mm1, [rax] ; * horizontal tap 0
|
| + pmullw mm2, [rax] ; * horizontal tap 0
|
| +
|
| + punpcklbw mm3, mm0 ; pixels x+1..x+4 as words
|
| + punpckhbw mm4, mm0 ; pixels x+5..x+8 as words
|
| +
|
| + pmullw mm3, [rax+8] ; * horizontal tap 1
|
| + pmullw mm4, [rax+8] ; * horizontal tap 1
|
| +
|
| + paddw mm1, mm3 ; sum of both taps, low half
|
| + paddw mm2, mm4 ; sum of both taps, high half
|
| +
|
| + paddw mm1, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| + psraw mm1, mmx_filter_shift ; mm1 = filtered current row, low half
|
| +
|
| + paddw mm2, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| + psraw mm2, mmx_filter_shift ; mm2 = filtered current row, high half
|
| +
|
| + ; Unpack the previous filtered row from mm5 back to words.
|
| + movq mm3, mm5 ;
|
| + movq mm4, mm5 ;
|
| +
|
| + punpcklbw mm3, mm0 ; previous row, low half
|
| + punpckhbw mm4, mm0 ; previous row, high half
|
| +
|
| + movq mm5, mm1 ; current row becomes "previous" for the next iteration
|
| + packuswb mm5, mm2 ; stored packed as eight bytes
|
| +
|
| + ; Vertical pass: blend previous and current filtered rows.
|
| + pmullw mm3, [rdx] ; previous row * vertical tap 0
|
| + pmullw mm4, [rdx] ; previous row * vertical tap 0
|
| +
|
| + pmullw mm1, [rdx+8] ; current row * vertical tap 1
|
| + pmullw mm2, [rdx+8] ; current row * vertical tap 1
|
| +
|
| + paddw mm1, mm3 ; sum of both taps, low half
|
| + paddw mm2, mm4 ; sum of both taps, high half
|
| +
|
| + paddw mm1, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| + paddw mm2, [GLOBAL(mmx_bi_rd)] ; add rounding constant
|
| +
|
| + psraw mm1, mmx_filter_shift ; mm1 = prediction, low half
|
| + psraw mm2, mmx_filter_shift ; mm2 = prediction, high half
|
| +
|
| + movq mm3, [rdi] ; eight src pixels
|
| + movq mm4, mm3 ;
|
| +
|
| + punpcklbw mm3, mm0 ; src pixels 0..3 as words
|
| + punpckhbw mm4, mm0 ; src pixels 4..7 as words
|
| +
|
| + psubw mm1, mm3 ; prediction - src, low half
|
| + psubw mm2, mm4 ; prediction - src, high half
|
| +
|
| + paddw mm6, mm1 ; accumulate differences
|
| + pmaddwd mm1, mm1 ; square and pair-wise add
|
| +
|
| + paddw mm6, mm2 ; accumulate differences
|
| + pmaddwd mm2, mm2 ; square and pair-wise add
|
| +
|
| + paddd mm7, mm1 ; accumulate squared differences
|
| + paddd mm7, mm2 ; accumulate squared differences
|
| +
|
| +%if ABI_IS_32BIT
|
| + add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
| + add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
| +%else
|
| + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
| + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
|
| + add rsi, r8
|
| + add rdi, r9
|
| +%endif
|
| + sub rcx, 1 ; one row done
|
| + jnz .filter_block2d_bil_var_mmx_loop ; loop until Height rows are done
|
| +
|
| +
|
| + ; Reduce mm6 (four signed words) to one signed dword: place each word
|
| + ; in the upper half of a dword, add, then shift right arithmetically.
|
| + pxor mm3, mm3 ;
|
| + pxor mm2, mm2 ;
|
| +
|
| + punpcklwd mm2, mm6 ; words 0,1 of mm6 -> high halves of two dwords
|
| + punpckhwd mm3, mm6 ; words 2,3 of mm6 -> high halves of two dwords
|
| +
|
| + paddd mm2, mm3 ;
|
| + movq mm6, mm2 ;
|
| +
|
| + psrlq mm6, 32 ; bring the high dword down
|
| + paddd mm2, mm6 ; low dword = (total of all four words) << 16
|
| +
|
| + psrad mm2, 16 ; sign-extending shift back: low dword = sum
|
| + movq mm4, mm7 ;
|
| +
|
| + psrlq mm4, 32 ; high dword of the squared sums
|
| + paddd mm4, mm7 ; low dword = total squared differences
|
| +
|
| + mov rdi, arg(7) ;sum
|
| + mov rsi, arg(8) ;sumsquared
|
| +
|
| + movd dword ptr [rdi], mm2 ; *sum
|
| + movd dword ptr [rsi], mm4 ; *sumsquared
|
| +
|
| + ; begin epilog
|
| + add rsp, 16
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +SECTION_RODATA
|
| +;short mmx_bi_rd[4] = { 64, 64, 64, 64};
|
| +; Rounding term added to each word lane before the arithmetic right
|
| +; shift by mmx_filter_shift (7) in the bilinear filters: 64 == 1 << (7 - 1).
|
| +align 16
|
| +mmx_bi_rd:
|
| + times 4 dw 64
|
|
|