Index: source/libvpx/vp9/common/x86/vp9_mask_sse3.asm |
=================================================================== |
--- source/libvpx/vp9/common/x86/vp9_mask_sse3.asm (revision 0) |
+++ source/libvpx/vp9/common/x86/vp9_mask_sse3.asm (revision 0) |
@@ -0,0 +1,484 @@ |
+; |
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+; Builds a 16x16 byte mask (256 bytes at ym): a byte is 0xFF when the luma |
+; sample and the u and v samples covering it are each closer to their centre |
+; value than the matching tolerance, otherwise it is 0x00. |
+; NOTE(review): the body reads ym from arg(3) and the ints from arg(4)..arg(11), |
+; so it never uses uvm; prototype and body disagree - confirm against the C caller. |
+; NOTE(review): xmm6-xmm11 are used without being saved; Win64 treats them as callee-saved. |
+;void vp8_makemask_sse3( |
+; unsigned char *y, ; luma source, 16 bytes read per row |
+; unsigned char *u, ; u source, low 8 bytes used per row |
+; unsigned char *v, ; v source, low 8 bytes used per row |
+; unsigned char *ym, ; mask output, written with movdqa |
+; unsigned char *uvm, ; not referenced by the body |
+; int yp, ; byte step between luma rows |
+; int uvp, ; byte step between chroma rows |
+; int ys, ; centre y value (low byte is used) |
+; int us, ; centre u value (low byte is used) |
+; int vs, ; centre v value (low byte is used) |
+; int yt, ; y tolerance (low byte is used) |
+; int ut, ; u tolerance (low byte is used) |
+; int vt) ; v tolerance (low byte is used) |
+global sym(vp8_makemask_sse3) |
+sym(vp8_makemask_sse3): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 14 |
+ push rsi |
+ push rdi |
+ push rbx ; rbx is callee-saved and holds the luma stride below |
+ ; end prolog |
+ |
+ mov rsi, arg(0) ;y |
+ mov rdi, arg(1) ;u |
+ mov rcx, arg(2) ;v |
+ mov rax, arg(3) ;ym |
+ movsxd rbx, dword arg(4) ;yp |
+ movsxd rdx, dword arg(5) ;uvp |
+ |
+ ; all-zero shuffle control: pshufb (an SSSE3 instruction) then copies byte 0 everywhere |
+ pxor xmm0,xmm0 |
+ ;make 16 copies of the center y value |
+ movd xmm1, arg(6) |
+ pshufb xmm1, xmm0 |
+ ; make 16 copies of the center u value |
+ movd xmm2, arg(7) |
+ pshufb xmm2, xmm0 |
+ ; make 16 copies of the center v value |
+ movd xmm3, arg(8) |
+ pshufb xmm3, xmm0 |
+ unpcklpd xmm2, xmm3 ; xmm2 = u centre (low 8 bytes) : v centre (high 8 bytes) |
+ |
+ ;make 16 copies of the y tolerance |
+ movd xmm3, arg(9) |
+ pshufb xmm3, xmm0 |
+ ;make 16 copies of the u tolerance |
+ movd xmm4, arg(10) |
+ pshufb xmm4, xmm0 |
+ ;make 16 copies of the v tolerance |
+ movd xmm5, arg(11) |
+ pshufb xmm5, xmm0 |
+ unpckhpd xmm4, xmm5 ; xmm4 = u tolerance (low 8 bytes) : v tolerance (high 8 bytes) |
+ |
+ ; xmm5 is free again: keep it zero for the unsigned compares in the loop |
+ pxor xmm5, xmm5 |
+ mov r8,8 ; 8 iterations, each covering 2 luma rows and 1 chroma row |
+ |
+NextPairOfRows: |
+ ;grab the y source values |
+ movdqu xmm0, [rsi] |
+ ;compute abs difference between source and y target |
+ movdqa xmm6, xmm1 |
+ movdqa xmm7, xmm0 |
+ psubusb xmm0, xmm1 |
+ psubusb xmm6, xmm7 |
+ por xmm0, xmm6 |
+ ;unsigned "diff >= tolerance" test: the saturating tolerance - diff is 0 exactly then. |
+ ;(pcmpgtb is a signed compare and misjudges byte values of 128 and above.) |
+ movdqa xmm6, xmm3 |
+ psubusb xmm6, xmm0 |
+ pcmpeqb xmm6, xmm5 ; xmm6 = 0xFF where row 0 luma is out of tolerance |
+ |
+ ;grab the y source values of the second row |
+ add rsi, rbx |
+ movdqu xmm0, [rsi] |
+ ;compute abs difference between source and y target |
+ movdqa xmm11, xmm1 |
+ movdqa xmm7, xmm0 |
+ psubusb xmm0, xmm1 |
+ psubusb xmm11, xmm7 |
+ por xmm0, xmm11 |
+ ;same out-of-tolerance test for the second luma row |
+ movdqa xmm11, xmm3 |
+ psubusb xmm11, xmm0 |
+ pcmpeqb xmm11, xmm5 ; xmm11 = 0xFF where row 1 luma is out of tolerance |
+ |
+ ;grab the u and v source values |
+ movdqu xmm7, [rdi] |
+ movdqu xmm8, [rcx] |
+ unpcklpd xmm7, xmm8 ; xmm7 = 8 u bytes (low) : 8 v bytes (high) |
+ ;compute abs difference between source and uv targets |
+ movdqa xmm9, xmm2 |
+ movdqa xmm10, xmm7 |
+ psubusb xmm7, xmm2 |
+ psubusb xmm9, xmm10 |
+ por xmm7, xmm9 |
+ ;flag the u and v bytes whose difference is not below the tolerance |
+ movdqa xmm0, xmm4 |
+ psubusb xmm0, xmm7 |
+ pcmpeqb xmm0, xmm5 |
+ ;double the u and v flags to luma width and merge them |
+ movdqa xmm8, xmm0 |
+ punpckhbw xmm0, xmm0 ; v flags, each byte repeated twice |
+ punpcklbw xmm8, xmm8 ; u flags, each byte repeated twice |
+ por xmm8, xmm0 ; 0xFF where u or v is out of tolerance |
+ |
+ ;mask row 0 and output |
+ por xmm6, xmm8 |
+ pcmpeqb xmm6, xmm5 ; invert: 0xFF where y, u and v all passed |
+ movdqa [rax],xmm6 ; aligned store: ym must be 16-byte aligned |
+ |
+ ;mask row 1 and output (the same chroma row covers both luma rows) |
+ por xmm11, xmm8 |
+ pcmpeqb xmm11, xmm5 |
+ movdqa [rax+16],xmm11 |
+ |
+ ; to the next row or set of rows |
+ add rsi, rbx |
+ add rdi, rdx |
+ add rcx, rdx |
+ add rax,32 |
+ dec r8 |
+ jnz NextPairOfRows |
+ |
+ ; begin epilog |
+ pop rbx |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+;GROW_HORIZ (register for result, source register or memory operand) |
+; result = source | (source shifted one byte up) | (source shifted one byte down) |
+; clobbers xmm14 and xmm15; a memory source is read with movdqa (needs 16-byte alignment) |
+%macro GROW_HORIZ 2 |
+ movdqa %1, %2 |
+ movdqa xmm14, %1 |
+ movdqa xmm15, %1 |
+ pslldq xmm14, 1 ; whole-register shift by one byte towards higher byte positions |
+ psrldq xmm15, 1 ; whole-register shift by one byte towards lower byte positions |
+ por %1,xmm14 |
+ por %1,xmm15 |
+%endmacro |
+;GROW_VERT (result, center row, above row, below row): result = OR of the three rows |
+%macro GROW_VERT 4 |
+ movdqa %1,%2 ; start from the center row |
+ por %1,%3 ; merge the row above |
+ por %1,%4 ; merge the row below |
+%endmacro |
+ |
+;GROW_NEXTLINE (register to reload, new source row, destination row to write) |
+%macro GROW_NEXTLINE 3 |
+ GROW_HORIZ %1, %2 ; %1 = horizontally grown copy of the new source row |
+ GROW_VERT xmm3, xmm0, xmm1, xmm2 ; xmm3 = OR of the three rows currently held in xmm0-xmm2 |
+ movdqa %3,xmm3 ; aligned store of the finished row (clobbers xmm3) |
+%endmacro |
+ |
+ |
+;void vp8_growmaskmb_sse3( |
+; unsigned char *om, ; source mask, read as 16 rows of 16 bytes |
+; unsigned char *nm) ; result mask, written as 16 rows of 16 bytes |
+global sym(vp8_growmaskmb_sse3) |
+sym(vp8_growmaskmb_sse3): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 2 |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ ; each result row = OR of the horizontally grown source row and its neighbouring rows |
+ mov rsi, arg(0) ;src |
+ mov rdi, arg(1) ;rst |
+ ; both buffers are accessed with movdqa, so they must be 16-byte aligned |
+ GROW_HORIZ xmm0, [rsi] |
+ GROW_HORIZ xmm1, [rsi+16] |
+ GROW_HORIZ xmm2, [rsi+32] |
+ ; row 0 has no row above it, so it is rows 0|1 only; row 1 is rows 0|1|2 |
+ GROW_VERT xmm3, xmm0, xmm1, xmm2 |
+ por xmm0,xmm1 |
+ movdqa [rdi], xmm0 |
+ movdqa [rdi+16],xmm3 |
+ ; xmm0-xmm2 rotate as a three-row window: each step loads source row n+1 and writes result row n |
+ GROW_NEXTLINE xmm0,[rsi+48],[rdi+32] |
+ GROW_NEXTLINE xmm1,[rsi+64],[rdi+48] |
+ GROW_NEXTLINE xmm2,[rsi+80],[rdi+64] |
+ GROW_NEXTLINE xmm0,[rsi+96],[rdi+80] |
+ GROW_NEXTLINE xmm1,[rsi+112],[rdi+96] |
+ GROW_NEXTLINE xmm2,[rsi+128],[rdi+112] |
+ GROW_NEXTLINE xmm0,[rsi+144],[rdi+128] |
+ GROW_NEXTLINE xmm1,[rsi+160],[rdi+144] |
+ GROW_NEXTLINE xmm2,[rsi+176],[rdi+160] |
+ GROW_NEXTLINE xmm0,[rsi+192],[rdi+176] |
+ GROW_NEXTLINE xmm1,[rsi+208],[rdi+192] |
+ GROW_NEXTLINE xmm2,[rsi+224],[rdi+208] |
+ GROW_NEXTLINE xmm0,[rsi+240],[rdi+224] |
+ ; row 15 has no row below it: xmm0 holds grown row 15 and xmm2 holds grown row 14 |
+ por xmm0,xmm2 |
+ movdqa [rdi+240], xmm0 |
+ ; NOTE(review): xmm14/xmm15, clobbered by GROW_HORIZ, are not restored; Win64 treats them as callee-saved |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+ |
+; Returns the sum of absolute differences over 16 rows of 16 bytes after both |
+; src and ref have been ANDed with the mask, so only bits set in the mask count. |
+;unsigned int vp8_sad16x16_masked_wmt( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; unsigned char *mask) ; read as 16 consecutive bytes per row |
+global sym(vp8_sad16x16_masked_wmt) |
+sym(vp8_sad16x16_masked_wmt): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 5 |
+ push rsi |
+ push rdi |
+ push rbx ; rbx is callee-saved and is used as the mask pointer |
+ ; end prolog |
+ mov rsi, arg(0) ;src_ptr |
+ mov rdi, arg(2) ;ref_ptr |
+ mov rbx, arg(4) ;mask |
+ movsxd rax, dword ptr arg(1) ;src_stride |
+ movsxd rdx, dword ptr arg(3) ;ref_stride |
+ mov rcx, 16 ; row counter |
+ pxor xmm3, xmm3 ; running SAD, one partial sum per 8-byte half |
+ |
+NextSadRow: |
+ movdqu xmm0, [rsi] |
+ movdqu xmm1, [rdi] |
+ movdqu xmm2, [rbx] |
+ pand xmm0, xmm2 |
+ pand xmm1, xmm2 |
+ psadbw xmm0, xmm1 ; two 16-bit sums, in the low word of each 64-bit half |
+ paddw xmm3, xmm0 |
+ |
+ add rsi, rax |
+ add rdi, rdx |
+ add rbx, 16 |
+ |
+ dec rcx |
+ jnz NextSadRow |
+ |
+ movdqa xmm4 , xmm3 |
+ psrldq xmm4, 8 ; bring the high-half sum down to the low half |
+ paddw xmm3, xmm4 |
+ movq rax, xmm3 ; return value |
+ ; begin epilog |
+ pop rbx |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+; Returns the sum of absolute differences over 16 rows of 16 bytes after the mask |
+; has been ORed into both src and ref, so bits set in the mask cancel out. |
+;unsigned int vp8_sad16x16_unmasked_wmt( |
+; unsigned char *src_ptr, |
+; int src_stride, |
+; unsigned char *ref_ptr, |
+; int ref_stride, |
+; unsigned char *mask) ; read as 16 consecutive bytes per row |
+global sym(vp8_sad16x16_unmasked_wmt) |
+sym(vp8_sad16x16_unmasked_wmt): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 5 |
+ push rsi |
+ push rdi |
+ push rbx ; rbx is callee-saved and is used as the mask pointer |
+ ; end prolog |
+ mov rsi, arg(0) ;src_ptr |
+ mov rdi, arg(2) ;ref_ptr |
+ mov rbx, arg(4) ;mask |
+ movsxd rax, dword ptr arg(1) ;src_stride |
+ movsxd rdx, dword ptr arg(3) ;ref_stride |
+ mov rcx, 16 ; row counter |
+ pxor xmm3, xmm3 ; running SAD, one partial sum per 8-byte half |
+ |
+next_vp8_sad16x16_unmasked_wmt: |
+ movdqu xmm0, [rsi] |
+ movdqu xmm1, [rdi] |
+ movdqu xmm2, [rbx] |
+ por xmm0, xmm2 ; a 0xFF mask byte makes both operands 0xFF there |
+ por xmm1, xmm2 |
+ psadbw xmm0, xmm1 ; two 16-bit sums, in the low word of each 64-bit half |
+ paddw xmm3, xmm0 |
+ |
+ add rsi, rax |
+ add rdi, rdx |
+ add rbx, 16 |
+ |
+ dec rcx |
+ jnz next_vp8_sad16x16_unmasked_wmt |
+ |
+ movdqa xmm4 , xmm3 |
+ psrldq xmm4, 8 ; bring the high-half sum down to the low half |
+ paddw xmm3, xmm4 |
+ movq rax, xmm3 ; return value |
+ ; begin epilog |
+ pop rbx |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+; Blends two sources over 16 rows of 16 bytes: each output bit comes from |
+; `masked` where the mask bit is 1 and from `unmasked` where it is 0. |
+; NOTE(review): declared unsigned int, but no return value is set up in rax. |
+;unsigned int vp8_masked_predictor_wmt( |
+; unsigned char *masked, |
+; unsigned char *unmasked, |
+; int src_stride, ; row step of both masked and unmasked |
+; unsigned char *dst_ptr, |
+; int dst_stride, |
+; unsigned char *mask) ; read as 16 consecutive bytes per row |
+global sym(vp8_masked_predictor_wmt) |
+sym(vp8_masked_predictor_wmt): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ push rsi |
+ push rdi |
+ push rbx ; rbx is callee-saved and is used as the mask pointer |
+ ; end prolog |
+ mov rsi, arg(0) ;masked source |
+ mov rdi, arg(1) ;unmasked source |
+ mov rbx, arg(5) ;mask |
+ movsxd rax, dword ptr arg(2) ;src_stride |
+ mov r11, arg(3) ; destination |
+ movsxd rdx, dword ptr arg(4) ;dst_stride |
+ mov rcx, 16 ; row counter |
+ |
+next_vp8_masked_predictor_wmt: |
+ movdqu xmm0, [rsi] |
+ movdqu xmm1, [rdi] |
+ movdqu xmm2, [rbx] |
+ pand xmm0, xmm2 ; masked & mask |
+ pandn xmm2, xmm1 ; unmasked & ~mask |
+ por xmm0, xmm2 |
+ movdqu [r11], xmm0 |
+ |
+ add r11, rdx |
+ add rsi, rax |
+ add rdi, rax ; the unmasked source steps by src_stride too, as in the uv variant |
+ add rbx, 16 |
+ |
+ dec rcx |
+ jnz next_vp8_masked_predictor_wmt |
+ |
+ ; begin epilog |
+ pop rbx |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+; Blends two sources over 8 rows of 8 bytes: each output bit comes from |
+; `masked` where the mask bit is 1 and from `unmasked` where it is 0. |
+; NOTE(review): declared unsigned int, but no return value is set up in rax. |
+;unsigned int vp8_masked_predictor_uv_wmt( |
+; unsigned char *masked, |
+; unsigned char *unmasked, |
+; int src_stride, ; row step of both masked and unmasked |
+; unsigned char *dst_ptr, |
+; int dst_stride, |
+; unsigned char *mask) ; read as 8 consecutive bytes per row |
+global sym(vp8_masked_predictor_uv_wmt) |
+sym(vp8_masked_predictor_uv_wmt): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ push rsi |
+ push rdi |
+ push rbx ; rbx is callee-saved and is used as the mask pointer |
+ ; end prolog |
+ mov rsi, arg(0) ;masked source |
+ mov rdi, arg(1) ;unmasked source |
+ mov rbx, arg(5) ;mask |
+ movsxd rax, dword ptr arg(2) ;src_stride |
+ mov r11, arg(3) ; destination |
+ movsxd rdx, dword ptr arg(4) ;dst_stride |
+ mov rcx, 8 ; row counter |
+ |
+next_vp8_masked_predictor_uv_wmt: |
+ movq xmm0, [rsi] |
+ movq xmm1, [rdi] |
+ movq xmm2, [rbx] |
+ pand xmm0, xmm2 ; masked & mask |
+ pandn xmm2, xmm1 ; unmasked & ~mask |
+ por xmm0, xmm2 |
+ movq [r11], xmm0 |
+ |
+ add r11, rdx |
+ add rsi, rax |
+ add rdi, rax |
+ add rbx, 8 |
+ |
+ dec rcx |
+ jnz next_vp8_masked_predictor_uv_wmt |
+ |
+ ; begin epilog |
+ pop rbx |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+; Writes 8 rows of 8 bytes to uvmask: row r holds the even-indexed bytes of the |
+; 16 bytes found at ymask + 32*r (every other 16-byte row of the source). |
+; NOTE(review): declared unsigned int, but rax is never written here. |
+;unsigned int vp8_uv_from_y_mask( |
+; unsigned char *ymask, |
+; unsigned char *uvmask) |
+global sym(vp8_uv_from_y_mask) |
+sym(vp8_uv_from_y_mask): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 2 |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ mov rsi, arg(0) ;src_ptr |
+ mov rdi, arg(1) ;dst_ptr |
+ |
+ mov rcx, 8 ; 8 output rows |
+ |
+next_p8_uv_from_y_mask: |
+ movdqu xmm0, [rsi] |
+ pshufb xmm0, [rel shuf1b] ; RIP-relative load of the table; packs bytes 0,2,..,14 into the low half |
+ movq [rdi],xmm0 |
+ add rdi, 8 |
+ add rsi,32 ; step over two 16-byte source rows |
+ |
+ dec rcx |
+ jnz next_p8_uv_from_y_mask |
+ |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+SECTION_RODATA |
+align 16 |
+shuf1b: ; pshufb control: low 8 lanes pick source bytes 0,2,...,14; high 8 lanes all pick byte 0 |
+ db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 |
+ |