| Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
|
| ===================================================================
|
| --- source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm (revision 240950)
|
| +++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm (working copy)
|
| @@ -11,17 +11,6 @@
|
|
|
| %include "vpx_ports/x86_abi_support.asm"
|
|
|
| -;/************************************************************************************
|
| -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
|
| -; input pixel array has output_height rows. This routine assumes that output_height is an
|
| -; even number. This function handles 8 pixels in horizontal direction, calculating ONE
|
| -; rows each iteration to take advantage of the 128 bits operations.
|
| -;
|
| -; This is an implementation of some of the SSE optimizations first seen in ffvp8
|
| -;
|
| -;*************************************************************************************/
|
| -
|
| -
|
| %macro VERTx4 1
|
| mov rdx, arg(5) ;filter ptr
|
| mov rsi, arg(0) ;src_ptr
|
| @@ -81,11 +70,14 @@
|
| pmaddubsw xmm4, k4k5
|
| pmaddubsw xmm6, k6k7
|
|
|
| + movdqa xmm1, xmm2 ;keep a copy of xmm2; pmaxsw below overwrites it
|
| paddsw xmm0, xmm6
|
| + pmaxsw xmm2, xmm4 ;xmm2 = per-word signed max(xmm2, xmm4)
|
| + pminsw xmm4, xmm1 ;xmm4 = per-word signed min(xmm4, original xmm2)
|
| + paddsw xmm0, xmm4 ;min term goes in before the max term (NOTE(review): presumably to make the saturating sum less likely to clip part-way - confirm)
|
| paddsw xmm0, xmm2
|
| - paddsw xmm0, xmm4
|
| +
|
| paddsw xmm0, krd
|
| -
|
| psraw xmm0, 7
|
| packuswb xmm0, xmm0
|
|
|
| @@ -166,10 +158,13 @@
|
| pmaddubsw xmm6, k6k7
|
|
|
| paddsw xmm0, xmm6
|
| + movdqa xmm1, xmm2 ;keep a copy of xmm2; pmaxsw below overwrites it
|
| + pmaxsw xmm2, xmm4 ;xmm2 = per-word signed max(xmm2, xmm4)
|
| + pminsw xmm4, xmm1 ;xmm4 = per-word signed min(xmm4, original xmm2)
|
| + paddsw xmm0, xmm4 ;min term goes in before the max term (NOTE(review): presumably to make the saturating sum less likely to clip part-way - confirm)
|
| paddsw xmm0, xmm2
|
| - paddsw xmm0, xmm4
|
| +
|
| paddsw xmm0, krd
|
| -
|
| psraw xmm0, 7
|
| packuswb xmm0, xmm0
|
|
|
| @@ -251,10 +246,13 @@
|
| pmaddubsw xmm6, k6k7
|
|
|
| paddsw xmm0, xmm6
|
| + movdqa xmm1, xmm2 ;keep a copy of xmm2; pmaxsw below overwrites it
|
| + pmaxsw xmm2, xmm4 ;xmm2 = per-word signed max(xmm2, xmm4)
|
| + pminsw xmm4, xmm1 ;xmm4 = per-word signed min(xmm4, original xmm2)
|
| + paddsw xmm0, xmm4 ;min term goes in before the max term (NOTE(review): presumably to make the saturating sum less likely to clip part-way - confirm)
|
| paddsw xmm0, xmm2
|
| - paddsw xmm0, xmm4
|
| +
|
| paddsw xmm0, krd
|
| -
|
| psraw xmm0, 7
|
| packuswb xmm0, xmm0
|
| %if %1
|
| @@ -538,14 +536,22 @@
|
| movdqa %2, %1
|
| pshufb %1, [GLOBAL(shuf_t0t1)]
|
| pshufb %2, [GLOBAL(shuf_t2t3)]
|
| - pmaddubsw %1, xmm6
|
| - pmaddubsw %2, xmm7
|
| + pmaddubsw %1, k0k1k4k5 ;multiply-add byte pairs against the k0k1k4k5 operand
|
| + pmaddubsw %2, k2k3k6k7 ;multiply-add byte pairs against the k2k3k6k7 operand
|
|
|
| - paddsw %1, %2
|
| - movdqa %2, %1
|
| + movdqa xmm4, %1 ;xmm4 = unshifted copy of the first-arg products
|
| + movdqa xmm5, %2 ;xmm5 = unshifted copy of the second-arg products
|
| + psrldq %1, 8 ;move the upper 8 bytes of the first arg down to the low half
|
| psrldq %2, 8
|
| - paddsw %1, %2
|
| - paddsw %1, xmm5
|
| + movdqa xmm6, xmm5 ;second copy, because pmaxsw below overwrites xmm5
|
| +
|
| + paddsw xmm4, %2 ;low half: first-arg low products + second-arg high products
|
| + pmaxsw xmm5, %1 ;xmm5 = per-word signed max of the two remaining terms
|
| + pminsw %1, xmm6 ;first arg = per-word signed min of the same two terms
|
| + paddsw %1, xmm4 ;min term + the xmm4 partial sum (saturating)
|
| + paddsw %1, xmm5 ;max term is added last
|
| +
|
| + paddsw %1, krd ;add the value stored in krd
|
| psraw %1, 7
|
| packuswb %1, %1
|
| %endm
|
| @@ -565,6 +571,10 @@
|
| pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
|
| pshufd xmm5, xmm5, 0 ;rounding
|
|
|
| + movdqa k0k1k4k5, xmm6 ;store the xmm6 contents under the k0k1k4k5 name
|
| + movdqa k2k3k6k7, xmm7 ;store the k2_k3_k6_k7 pattern built above
|
| + movdqa krd, xmm5 ;store the rounding value broadcast above
|
| +
|
| movsxd rax, dword ptr arg(1) ;src_pixels_per_line
|
| movsxd rdx, dword ptr arg(3) ;output_pitch
|
| movsxd rcx, dword ptr arg(4) ;output_height
|
| @@ -631,9 +641,13 @@
|
| pmaddubsw %3, k4k5
|
| pmaddubsw %4, k6k7
|
|
|
| - paddsw %1, %2
|
| paddsw %1, %4
|
| + movdqa %4, %2 ;fourth arg was already added in above; reuse it as a copy of the second
|
| + pmaxsw %2, %3 ;second arg = per-word signed max(second, third)
|
| + pminsw %3, %4 ;third arg = per-word signed min(third, original second)
|
| paddsw %1, %3
|
| + paddsw %1, %2 ;max term is added after the min term
|
| +
|
| paddsw %1, krd
|
| psraw %1, 7
|
| packuswb %1, %1
|
| @@ -779,12 +793,19 @@
|
| pmaddubsw xmm6, k4k5
|
| pmaddubsw xmm7, k6k7
|
|
|
| - paddsw xmm0, xmm1
|
| paddsw xmm0, xmm3
|
| + movdqa xmm3, xmm1 ;xmm3 was already added in above; reuse it as a copy of xmm1
|
| + pmaxsw xmm1, xmm2 ;xmm1 = per-word signed max(xmm1, xmm2)
|
| + pminsw xmm2, xmm3 ;xmm2 = per-word signed min(xmm2, original xmm1)
|
| paddsw xmm0, xmm2
|
| - paddsw xmm4, xmm5
|
| + paddsw xmm0, xmm1 ;max term is added after the min term
|
| +
|
| paddsw xmm4, xmm7
|
| + movdqa xmm7, xmm5 ;same sequence for the xmm4 accumulator: xmm7 becomes a copy of xmm5
|
| + pmaxsw xmm5, xmm6 ;xmm5 = per-word signed max(xmm5, xmm6)
|
| + pminsw xmm6, xmm7 ;xmm6 = per-word signed min(xmm6, original xmm5)
|
| paddsw xmm4, xmm6
|
| + paddsw xmm4, xmm5 ;max term is added after the min term
|
|
|
| paddsw xmm0, krd
|
| paddsw xmm4, krd
|
| @@ -826,8 +847,16 @@
|
| push rdi
|
| ; end prolog
|
|
|
| + ALIGN_STACK 16, rax
|
| + sub rsp, 16 * 3 ;reserve the three 16-byte slots named by the defines below
|
| + %define k0k1k4k5 [rsp + 16 * 0]
|
| + %define k2k3k6k7 [rsp + 16 * 1]
|
| + %define krd [rsp + 16 * 2]
|
| +
|
| HORIZx4 0
|
|
|
| + add rsp, 16 * 3 ;release the three slots reserved above
|
| + pop rsp ;NOTE(review): presumably reloads the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
|
| ; begin epilog
|
| pop rdi
|
| pop rsi
|
| @@ -932,8 +961,16 @@
|
| push rdi
|
| ; end prolog
|
|
|
| + ALIGN_STACK 16, rax
|
| + sub rsp, 16 * 3 ;reserve the three 16-byte slots named by the defines below
|
| + %define k0k1k4k5 [rsp + 16 * 0]
|
| + %define k2k3k6k7 [rsp + 16 * 1]
|
| + %define krd [rsp + 16 * 2]
|
| +
|
| HORIZx4 1
|
|
|
| + add rsp, 16 * 3 ;release the three slots reserved above
|
| + pop rsp ;NOTE(review): presumably reloads the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
|
| ; begin epilog
|
| pop rdi
|
| pop rsi
|
|
|