Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
===================================================================
--- source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm (revision 240950)
+++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm (working copy)
@@ -11,17 +11,6 @@
 %include "vpx_ports/x86_abi_support.asm"
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-
-
%macro VERTx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
@@ -81,11 +70,14 @@
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
+ movdqa xmm1, xmm2
paddsw xmm0, xmm6
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
+ paddsw xmm0, xmm4
paddsw xmm0, xmm2
- paddsw xmm0, xmm4
+
paddsw xmm0, krd
-
psraw xmm0, 7
packuswb xmm0, xmm0
@@ -166,10 +158,13 @@
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
+ movdqa xmm1, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
+ paddsw xmm0, xmm4
paddsw xmm0, xmm2
- paddsw xmm0, xmm4
+
paddsw xmm0, krd
-
psraw xmm0, 7
packuswb xmm0, xmm0
@@ -251,10 +246,13 @@
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
+ movdqa xmm1, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
+ paddsw xmm0, xmm4
paddsw xmm0, xmm2
- paddsw xmm0, xmm4
+
paddsw xmm0, krd
-
psraw xmm0, 7
packuswb xmm0, xmm0
%if %1
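
The arithmetic change repeated in the hunks above is an ordering fix. The four 16-bit partial sums produced by pmaddubsw are still combined with saturating adds, but the two inner-tap sums (the k2k3 and k4k5 terms) are now split into a min and a max with pminsw/pmaxsw, and the smaller one is added before the larger. Because paddsw clips at every step, addition order changes the result once an intermediate sum leaves int16 range; adding the outer-tap terms first and the largest term last appears intended to keep intermediate clipping from distorting the sum that the final clamp sees. A per-lane sketch in C of why the order matters (the helper name and the sample values are illustrative, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Scalar equivalent of one paddsw lane: 16-bit add with signed saturation. */
static int16_t sat_add16(int16_t a, int16_t b) {
    int32_t s = (int32_t)a + b;
    if (s >  32767) return  32767;
    if (s < -32768) return -32768;
    return (int16_t)s;
}

int main(void) {
    /* Hypothetical per-lane partial sums for the tap pairs k0k1, k2k3, k4k5, k6k7. */
    int16_t s01 = 2000, s23 = 32000, s45 = -3000, s67 = 1500;

    /* Old order: ((s01 + s67) + s23) + s45 saturates at the third add,
       so the negative s45 is then subtracted from the clipped value. */
    int16_t old_sum = sat_add16(sat_add16(sat_add16(s01, s67), s23), s45);

    /* New order: add min(s23, s45) before max(s23, s45), mirroring the
       pminsw/pmaxsw pair, so positive and negative terms balance first. */
    int16_t lo = s23 < s45 ? s23 : s45;
    int16_t hi = s23 < s45 ? s45 : s23;
    int16_t new_sum = sat_add16(sat_add16(sat_add16(s01, s67), lo), hi);

    printf("old order: %d  new order: %d\n", old_sum, new_sum);  /* 29767 vs. 32500 */
    return 0;
}

With these values the full-precision sum is 32500; the old ordering clips an intermediate value and returns 29767, while the new ordering reaches 32500.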
@@ -538,14 +536,22 @@
movdqa %2, %1
pshufb %1, [GLOBAL(shuf_t0t1)]
pshufb %2, [GLOBAL(shuf_t2t3)]
- pmaddubsw %1, xmm6
- pmaddubsw %2, xmm7
+ pmaddubsw %1, k0k1k4k5
+ pmaddubsw %2, k2k3k6k7
- paddsw %1, %2
- movdqa %2, %1
+ movdqa xmm4, %1
+ movdqa xmm5, %2
+ psrldq %1, 8
psrldq %2, 8
- paddsw %1, %2
- paddsw %1, xmm5
+ movdqa xmm6, xmm5
+
+ paddsw xmm4, %2
+ pmaxsw xmm5, %1
+ pminsw %1, xmm6
+ paddsw %1, xmm4
+ paddsw %1, xmm5
+
+ paddsw %1, krd
psraw %1, 7
packuswb %1, %1
%endm
@@ -565,6 +571,10 @@
pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
pshufd xmm5, xmm5, 0 ;rounding
+ movdqa k0k1k4k5, xmm6
+ movdqa k2k3k6k7, xmm7
+ movdqa krd, xmm5
+
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
@@ -631,9 +641,13 @@
pmaddubsw %3, k4k5
pmaddubsw %4, k6k7
- paddsw %1, %2
paddsw %1, %4
+ movdqa %4, %2
+ pmaxsw %2, %3
+ pminsw %3, %4
paddsw %1, %3
+ paddsw %1, %2
+
paddsw %1, krd
psraw %1, 7
packuswb %1, %1
@@ -779,12 +793,19 @@
pmaddubsw xmm6, k4k5
pmaddubsw xmm7, k6k7
- paddsw xmm0, xmm1
paddsw xmm0, xmm3
+ movdqa xmm3, xmm1
+ pmaxsw xmm1, xmm2
+ pminsw xmm2, xmm3
paddsw xmm0, xmm2
- paddsw xmm4, xmm5
+ paddsw xmm0, xmm1
+
paddsw xmm4, xmm7
+ movdqa xmm7, xmm5
+ pmaxsw xmm5, xmm6
+ pminsw xmm6, xmm7
paddsw xmm4, xmm6
+ paddsw xmm4, xmm5
paddsw xmm0, krd
paddsw xmm4, krd
@@ -826,8 +847,16 @@
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
+
HORIZx4 0
+ add rsp, 16 * 3
+ pop rsp
; begin epilog
pop rdi
pop rsi
@@ -932,8 +961,16 @@
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
+
HORIZx4 1
+ add rsp, 16 * 3
+ pop rsp
; begin epilog
pop rdi
pop rsi
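
The two HORIZx4 callers also gain a matching prolog/epilog: the stack is aligned to 16 bytes (the original rsp is restored by the later pop rsp), sub rsp, 16 * 3 reserves three 16-byte slots, and k0k1k4k5, k2k3k6k7 and krd become %defines for those slots. The macro then reads the packed taps and the rounding vector as aligned memory operands instead of keeping them pinned in xmm5-xmm7, which frees those registers for the reordered min/max arithmetic. A rough SSE2-intrinsics sketch of the same register-pressure idea (the function name, signature, and placeholder arithmetic are illustrative only, not the routine's real interface):

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <string.h>

/* Park the three constant vectors in 16-byte-aligned stack slots (the asm's
   [rsp + 16*0..2]) so every xmm register is free for pixel data in the loop. */
static void horiz4_sketch(const uint8_t *src, uint8_t *dst, int width,
                          __m128i k0k1k4k5, __m128i k2k3k6k7, __m128i krd) {
    _Alignas(16) __m128i consts[3];        /* like  sub rsp, 16 * 3   */
    consts[0] = k0k1k4k5;                  /* movdqa k0k1k4k5, xmm6   */
    consts[1] = k2k3k6k7;                  /* movdqa k2k3k6k7, xmm7   */
    consts[2] = krd;                       /* movdqa krd, xmm5        */

    for (int x = 0; x < width; x += 4) {
        __m128i px = _mm_loadl_epi64((const __m128i *)(src + x));
        /* ... the real pshufb/pmaddubsw/paddsw steps using consts[0] and
           consts[1] are elided; only the rounding and pack remain ... */
        __m128i sum = _mm_adds_epi16(_mm_unpacklo_epi8(px, _mm_setzero_si128()),
                                     _mm_load_si128(&consts[2]));  /* + krd */
        sum = _mm_srai_epi16(sum, 7);                              /* psraw 7 */
        sum = _mm_packus_epi16(sum, sum);                          /* packuswb */
        memcpy(dst + x, &sum, 4);
    }
}

A compiler given this shape will typically keep the constants in registers when it can and spill them only under pressure; the hand-written asm makes that spill explicit so the hot loop's register allocation stays predictable.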