Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
===================================================================
--- source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm	(revision 240950)
+++ source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm	(working copy)
@@ -11,17 +11,6 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-
-
 %macro VERTx4 1
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
@@ -81,11 +70,14 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    movdqa      xmm1, xmm2
     paddsw      xmm0, xmm6
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
+    paddsw      xmm0, xmm4
     paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm4
+
     paddsw      xmm0, krd
-
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
 
@@ -166,10 +158,13 @@
     pmaddubsw   xmm6, k6k7
 
     paddsw      xmm0, xmm6
+    movdqa      xmm1, xmm2
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
+    paddsw      xmm0, xmm4
     paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm4
+
     paddsw      xmm0, krd
-
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
 
@@ -251,10 +246,13 @@
     pmaddubsw   xmm6, k6k7
 
     paddsw      xmm0, xmm6
+    movdqa      xmm1, xmm2
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
+    paddsw      xmm0, xmm4
     paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm4
+
     paddsw      xmm0, krd
-
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
 %if %1
@@ -538,14 +536,22 @@
     movdqa      %2,   %1
     pshufb      %1,   [GLOBAL(shuf_t0t1)]
     pshufb      %2,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   %1,   xmm6
-    pmaddubsw   %2,   xmm7
+    pmaddubsw   %1,   k0k1k4k5
+    pmaddubsw   %2,   k2k3k6k7
 
-    paddsw      %1,   %2
-    movdqa      %2,   %1
+    movdqa      xmm4, %1
+    movdqa      xmm5, %2
+    psrldq      %1,   8
     psrldq      %2,   8
-    paddsw      %1,   %2
-    paddsw      %1,   xmm5
+    movdqa      xmm6, xmm5
+
+    paddsw      xmm4, %2
+    pmaxsw      xmm5, %1
+    pminsw      %1, xmm6
+    paddsw      %1, xmm4
+    paddsw      %1, xmm5
+
+    paddsw      %1,   krd
     psraw       %1,   7
     packuswb    %1,   %1
 %endm
@@ -565,6 +571,10 @@
     pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
     pshufd      xmm5, xmm5, 0               ;rounding
 
+    movdqa      k0k1k4k5, xmm6
+    movdqa      k2k3k6k7, xmm7
+    movdqa      krd, xmm5
+
     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
     movsxd      rdx, dword ptr arg(3)       ;output_pitch
     movsxd      rcx, dword ptr arg(4)       ;output_height
@@ -631,9 +641,13 @@
     pmaddubsw   %3,   k4k5
     pmaddubsw   %4,   k6k7
 
-    paddsw      %1,   %2
     paddsw      %1,   %4
+    movdqa      %4,   %2
+    pmaxsw      %2,   %3
+    pminsw      %3,   %4
     paddsw      %1,   %3
+    paddsw      %1,   %2
+
     paddsw      %1,   krd
     psraw       %1,   7
     packuswb    %1,   %1
@@ -779,12 +793,19 @@
     pmaddubsw   xmm6,   k4k5
     pmaddubsw   xmm7,   k6k7
 
-    paddsw      xmm0,   xmm1
     paddsw      xmm0,   xmm3
+    movdqa      xmm3,   xmm1
+    pmaxsw      xmm1,   xmm2
+    pminsw      xmm2,   xmm3
     paddsw      xmm0,   xmm2
-    paddsw      xmm4,   xmm5
+    paddsw      xmm0,   xmm1
+
     paddsw      xmm4,   xmm7
+    movdqa      xmm7,   xmm5
+    pmaxsw      xmm5,   xmm6
+    pminsw      xmm6,   xmm7
     paddsw      xmm4,   xmm6
+    paddsw      xmm4,   xmm5
 
     paddsw      xmm0,   krd
     paddsw      xmm4,   krd
@@ -826,8 +847,16 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 3
+    %define k0k1k4k5 [rsp + 16 * 0]
+    %define k2k3k6k7 [rsp + 16 * 1]
+    %define krd      [rsp + 16 * 2]
+
     HORIZx4 0
 
+    add rsp, 16 * 3
+    pop rsp
     ; begin epilog
     pop rdi
     pop rsi
@@ -932,8 +961,16 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 3
+    %define k0k1k4k5 [rsp + 16 * 0]
+    %define k2k3k6k7 [rsp + 16 * 1]
+    %define krd      [rsp + 16 * 2]
+
     HORIZx4 1
 
+    add rsp, 16 * 3
+    pop rsp
     ; begin epilog
     pop rdi
     pop rsi
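
Note (reviewer commentary, not part of the patch): paddsw is a saturating 16-bit add, so the order in which the four pmaddubsw partial sums are combined determines where intermediate clamping can happen. Each rewritten sum above adds the k0k1 and k6k7 products first, then uses pminsw/pmaxsw so the smaller of the k2k3 and k4k5 products is added before the larger one; the min/max temporaries need extra xmm registers, which appears to be why the packed coefficients and rounding constant are spilled to the new aligned stack slots (k0k1k4k5, k2k3k6k7, krd). The C sketch below only illustrates why the summation order matters under saturation; the helper names (sat_add16, sum_fixed_order, sum_min_then_max) and the sample values are made up for the demonstration and are not taken from the patch.

#include <stdint.h>
#include <stdio.h>

/* Saturating 16-bit add, one lane of paddsw. */
static int16_t sat_add16(int16_t a, int16_t b) {
  int32_t s = (int32_t)a + (int32_t)b;
  if (s > INT16_MAX) return INT16_MAX;
  if (s < INT16_MIN) return INT16_MIN;
  return (int16_t)s;
}

/* Old order: p01 + p67, then p23, then p45 (middle terms in a fixed order). */
static int16_t sum_fixed_order(int16_t p01, int16_t p23, int16_t p45, int16_t p67) {
  int16_t acc = sat_add16(p01, p67);
  acc = sat_add16(acc, p23);   /* can clamp here even when the full sum fits */
  return sat_add16(acc, p45);
}

/* New order: p01 + p67, then min(p23, p45), then max(p23, p45). */
static int16_t sum_min_then_max(int16_t p01, int16_t p23, int16_t p45, int16_t p67) {
  int16_t lo = p23 < p45 ? p23 : p45;   /* pminsw */
  int16_t hi = p23 < p45 ? p45 : p23;   /* pmaxsw */
  int16_t acc = sat_add16(p01, p67);
  acc = sat_add16(acc, lo);
  return sat_add16(acc, hi);
}

int main(void) {
  /* Synthetic partial sums chosen so the fixed order clamps early
   * while the true total (30000) still fits in int16. */
  int16_t p01 = 5000, p23 = 31000, p45 = -4000, p67 = -2000;
  printf("fixed order:  %d\n", sum_fixed_order(p01, p23, p45, p67));   /* 28767 */
  printf("min-then-max: %d\n", sum_min_then_max(p01, p23, p45, p67));  /* 30000 */
  return 0;
}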