| Index: source/libvpx/vp9/common/x86/vp9_mask_sse3.asm
|
| ===================================================================
|
| --- source/libvpx/vp9/common/x86/vp9_mask_sse3.asm (revision 0)
|
| +++ source/libvpx/vp9/common/x86/vp9_mask_sse3.asm (revision 0)
|
| @@ -0,0 +1,484 @@
|
| +;
|
| +; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
| +;
|
| +; Use of this source code is governed by a BSD-style license
|
| +; that can be found in the LICENSE file in the root of the source
|
| +; tree. An additional intellectual property rights grant can be found
|
| +; in the file PATENTS. All contributing project authors may
|
| +; be found in the AUTHORS file in the root of the source tree.
|
| +;
|
| +
|
| +
|
| +%include "vpx_ports/x86_abi_support.asm"
|
| +
|
| +;void int vp8_makemask_sse3(
|
| +; unsigned char *y,
|
| +; unsigned char *u,
|
| +; unsigned char *v,
|
| +; unsigned char *ym,
|
| +; unsigned char *uvm,
|
| +; int yp,
|
| +; int uvp,
|
| +; int ys,
|
| +; int us,
|
| +; int vs,
|
| +; int yt,
|
| +; int ut,
|
| +; int vt)
|
| +global sym(vp8_makemask_sse3)
|
| +sym(vp8_makemask_sse3):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 14
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| +
|
| + mov rsi, arg(0) ;y
|
| + mov rdi, arg(1) ;u
|
| + mov rcx, arg(2) ;v
|
| + mov rax, arg(3) ;ym
|
| + movsxd rbx, dword arg(4) ;yp
|
| + movsxd rdx, dword arg(5) ;uvp
|
| +
|
| + pxor xmm0,xmm0
|
| +
|
| + ;make 16 copies of the center y value
|
| + movd xmm1, arg(6)
|
| + pshufb xmm1, xmm0
|
| +
|
| + ; make 16 copies of the center u value
|
| + movd xmm2, arg(7)
|
| + pshufb xmm2, xmm0
|
| +
|
| + ; make 16 copies of the center v value
|
| + movd xmm3, arg(8)
|
| + pshufb xmm3, xmm0
|
| + unpcklpd xmm2, xmm3
|
| +
|
| + ;make 16 copies of the y tolerance
|
| + movd xmm3, arg(9)
|
| + pshufb xmm3, xmm0
|
| +
|
| + ;make 16 copies of the u tolerance
|
| + movd xmm4, arg(10)
|
| + pshufb xmm4, xmm0
|
| +
|
| + ;make 16 copies of the v tolerance
|
| + movd xmm5, arg(11)
|
| + pshufb xmm5, xmm0
|
| + unpckhpd xmm4, xmm5
|
| +
|
| + mov r8,8
|
| +
|
| +NextPairOfRows:
|
| +
|
| + ;grab the y source values
|
| + movdqu xmm0, [rsi]
|
| +
|
| + ;compute abs difference between source and y target
|
| + movdqa xmm6, xmm1
|
| + movdqa xmm7, xmm0
|
| + psubusb xmm0, xmm1
|
| + psubusb xmm6, xmm7
|
| + por xmm0, xmm6
|
| +
|
| + ;compute abs difference between
|
| + movdqa xmm6, xmm3
|
| + pcmpgtb xmm6, xmm0
|
| +
|
| + ;grab the y source values
|
| + add rsi, rbx
|
| + movdqu xmm0, [rsi]
|
| +
|
| + ;compute abs difference between source and y target
|
| + movdqa xmm11, xmm1
|
| + movdqa xmm7, xmm0
|
| + psubusb xmm0, xmm1
|
| + psubusb xmm11, xmm7
|
| + por xmm0, xmm11
|
| +
|
| + ;compute abs difference between
|
| + movdqa xmm11, xmm3
|
| + pcmpgtb xmm11, xmm0
|
| +
|
| +
|
| + ;grab the u and v source values
|
| + movdqu xmm7, [rdi]
|
| + movdqu xmm8, [rcx]
|
| + unpcklpd xmm7, xmm8
|
| +
|
| + ;compute abs difference between source and uv targets
|
| + movdqa xmm9, xmm2
|
| + movdqa xmm10, xmm7
|
| + psubusb xmm7, xmm2
|
| + psubusb xmm9, xmm10
|
| + por xmm7, xmm9
|
| +
|
| + ;check whether the number is < tolerance
|
| + movdqa xmm0, xmm4
|
| + pcmpgtb xmm0, xmm7
|
| +
|
| + ;double u and v masks
|
| + movdqa xmm8, xmm0
|
| + punpckhbw xmm0, xmm0
|
| + punpcklbw xmm8, xmm8
|
| +
|
| + ;mask row 0 and output
|
| + pand xmm6, xmm8
|
| + pand xmm6, xmm0
|
| + movdqa [rax],xmm6
|
| +
|
| + ;mask row 1 and output
|
| + pand xmm11, xmm8
|
| + pand xmm11, xmm0
|
| + movdqa [rax+16],xmm11
|
| +
|
| +
|
| + ; to the next row or set of rows
|
| + add rsi, rbx
|
| + add rdi, rdx
|
| + add rcx, rdx
|
| + add rax,32
|
| + dec r8
|
| + jnz NextPairOfRows
|
| +
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +;GROW_HORIZ (register for result, source register or mem local)
|
| +; takes source and shifts left and ors with source
|
| +; then shifts right and ors with source
|
| +%macro GROW_HORIZ 2
|
| + movdqa %1, %2
|
| + movdqa xmm14, %1
|
| + movdqa xmm15, %1
|
| + pslldq xmm14, 1
|
| + psrldq xmm15, 1
|
| + por %1,xmm14
|
| + por %1,xmm15
|
| +%endmacro
|
| +;GROW_VERT (result, center row, above row, below row)
|
| +%macro GROW_VERT 4
|
| + movdqa %1,%2
|
| + por %1,%3
|
| + por %1,%4
|
| +%endmacro
|
| +
|
| +;GROW_NEXTLINE (new line to grow, new source, line to write)
|
| +%macro GROW_NEXTLINE 3
|
| + GROW_HORIZ %1, %2
|
| + GROW_VERT xmm3, xmm0, xmm1, xmm2
|
| + movdqa %3,xmm3
|
| +%endmacro
|
| +
|
| +
|
| +;void int vp8_growmaskmb_sse3(
|
| +; unsigned char *om,
|
| +; unsigned char *nm,
|
| +global sym(vp8_growmaskmb_sse3)
|
| +sym(vp8_growmaskmb_sse3):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 2
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| +
|
| + mov rsi, arg(0) ;src
|
| + mov rdi, arg(1) ;rst
|
| +
|
| + GROW_HORIZ xmm0, [rsi]
|
| + GROW_HORIZ xmm1, [rsi+16]
|
| + GROW_HORIZ xmm2, [rsi+32]
|
| +
|
| + GROW_VERT xmm3, xmm0, xmm1, xmm2
|
| + por xmm0,xmm1
|
| + movdqa [rdi], xmm0
|
| + movdqa [rdi+16],xmm3
|
| +
|
| + GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
|
| + GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
|
| + GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
|
| + GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
|
| + GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
|
| + GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
|
| + GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
|
| + GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
|
| + GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
|
| + GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
|
| + GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
|
| + GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
|
| + GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
|
| +
|
| + por xmm0,xmm2
|
| + movdqa [rdi+240], xmm0
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +
|
| +;unsigned int vp8_sad16x16_masked_wmt(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; unsigned char *mask)
|
| +global sym(vp8_sad16x16_masked_wmt)
|
| +sym(vp8_sad16x16_masked_wmt):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 5
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| + mov rsi, arg(0) ;src_ptr
|
| + mov rdi, arg(2) ;ref_ptr
|
| +
|
| + mov rbx, arg(4) ;mask
|
| + movsxd rax, dword ptr arg(1) ;src_stride
|
| + movsxd rdx, dword ptr arg(3) ;ref_stride
|
| +
|
| + mov rcx, 16
|
| +
|
| + pxor xmm3, xmm3
|
| +
|
| +NextSadRow:
|
| + movdqu xmm0, [rsi]
|
| + movdqu xmm1, [rdi]
|
| + movdqu xmm2, [rbx]
|
| + pand xmm0, xmm2
|
| + pand xmm1, xmm2
|
| +
|
| + psadbw xmm0, xmm1
|
| + paddw xmm3, xmm0
|
| +
|
| + add rsi, rax
|
| + add rdi, rdx
|
| + add rbx, 16
|
| +
|
| + dec rcx
|
| + jnz NextSadRow
|
| +
|
| + movdqa xmm4 , xmm3
|
| + psrldq xmm4, 8
|
| + paddw xmm3, xmm4
|
| + movq rax, xmm3
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;unsigned int vp8_sad16x16_unmasked_wmt(
|
| +; unsigned char *src_ptr,
|
| +; int src_stride,
|
| +; unsigned char *ref_ptr,
|
| +; int ref_stride,
|
| +; unsigned char *mask)
|
| +global sym(vp8_sad16x16_unmasked_wmt)
|
| +sym(vp8_sad16x16_unmasked_wmt):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 5
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| + mov rsi, arg(0) ;src_ptr
|
| + mov rdi, arg(2) ;ref_ptr
|
| +
|
| + mov rbx, arg(4) ;mask
|
| + movsxd rax, dword ptr arg(1) ;src_stride
|
| + movsxd rdx, dword ptr arg(3) ;ref_stride
|
| +
|
| + mov rcx, 16
|
| +
|
| + pxor xmm3, xmm3
|
| +
|
| +next_vp8_sad16x16_unmasked_wmt:
|
| + movdqu xmm0, [rsi]
|
| + movdqu xmm1, [rdi]
|
| + movdqu xmm2, [rbx]
|
| + por xmm0, xmm2
|
| + por xmm1, xmm2
|
| +
|
| + psadbw xmm0, xmm1
|
| + paddw xmm3, xmm0
|
| +
|
| + add rsi, rax
|
| + add rdi, rdx
|
| + add rbx, 16
|
| +
|
| + dec rcx
|
| + jnz next_vp8_sad16x16_unmasked_wmt
|
| +
|
| + movdqa xmm4 , xmm3
|
| + psrldq xmm4, 8
|
| + paddw xmm3, xmm4
|
| + movq rax, xmm3
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;unsigned int vp8_masked_predictor_wmt(
|
| +; unsigned char *masked,
|
| +; unsigned char *unmasked,
|
| +; int src_stride,
|
| +; unsigned char *dst_ptr,
|
| +; int dst_stride,
|
| +; unsigned char *mask)
|
| +global sym(vp8_masked_predictor_wmt)
|
| +sym(vp8_masked_predictor_wmt):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| + mov rsi, arg(0) ;src_ptr
|
| + mov rdi, arg(1) ;ref_ptr
|
| +
|
| + mov rbx, arg(5) ;mask
|
| + movsxd rax, dword ptr arg(2) ;src_stride
|
| + mov r11, arg(3) ; destination
|
| + movsxd rdx, dword ptr arg(4) ;dst_stride
|
| +
|
| + mov rcx, 16
|
| +
|
| + pxor xmm3, xmm3
|
| +
|
| +next_vp8_masked_predictor_wmt:
|
| + movdqu xmm0, [rsi]
|
| + movdqu xmm1, [rdi]
|
| + movdqu xmm2, [rbx]
|
| +
|
| + pand xmm0, xmm2
|
| + pandn xmm2, xmm1
|
| + por xmm0, xmm2
|
| + movdqu [r11], xmm0
|
| +
|
| + add r11, rdx
|
| + add rsi, rax
|
| + add rdi, rdx
|
| + add rbx, 16
|
| +
|
| + dec rcx
|
| + jnz next_vp8_masked_predictor_wmt
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +;unsigned int vp8_masked_predictor_uv_wmt(
|
| +; unsigned char *masked,
|
| +; unsigned char *unmasked,
|
| +; int src_stride,
|
| +; unsigned char *dst_ptr,
|
| +; int dst_stride,
|
| +; unsigned char *mask)
|
| +global sym(vp8_masked_predictor_uv_wmt)
|
| +sym(vp8_masked_predictor_uv_wmt):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| + mov rsi, arg(0) ;src_ptr
|
| + mov rdi, arg(1) ;ref_ptr
|
| +
|
| + mov rbx, arg(5) ;mask
|
| + movsxd rax, dword ptr arg(2) ;src_stride
|
| + mov r11, arg(3) ; destination
|
| + movsxd rdx, dword ptr arg(4) ;dst_stride
|
| +
|
| + mov rcx, 8
|
| +
|
| + pxor xmm3, xmm3
|
| +
|
| +next_vp8_masked_predictor_uv_wmt:
|
| + movq xmm0, [rsi]
|
| + movq xmm1, [rdi]
|
| + movq xmm2, [rbx]
|
| +
|
| + pand xmm0, xmm2
|
| + pandn xmm2, xmm1
|
| + por xmm0, xmm2
|
| + movq [r11], xmm0
|
| +
|
| + add r11, rdx
|
| + add rsi, rax
|
| + add rdi, rax
|
| + add rbx, 8
|
| +
|
| + dec rcx
|
| + jnz next_vp8_masked_predictor_uv_wmt
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;unsigned int vp8_uv_from_y_mask(
|
| +; unsigned char *ymask,
|
| +; unsigned char *uvmask)
|
| +global sym(vp8_uv_from_y_mask)
|
| +sym(vp8_uv_from_y_mask):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| + mov rsi, arg(0) ;src_ptr
|
| + mov rdi, arg(1) ;dst_ptr
|
| +
|
| +
|
| + mov rcx, 8
|
| +
|
| + pxor xmm3, xmm3
|
| +
|
| +next_p8_uv_from_y_mask:
|
| + movdqu xmm0, [rsi]
|
| + pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)]
|
| + movq [rdi],xmm0
|
| + add rdi, 8
|
| + add rsi,32
|
| +
|
| + dec rcx
|
| + jnz next_p8_uv_from_y_mask
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +SECTION_RODATA
|
| +align 16
|
| +shuf1b:
|
| + db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
|
| +
|
|
|