| Index: source/libvpx/vp8/encoder/x86/sad_sse3.asm | 
| =================================================================== | 
| --- source/libvpx/vp8/encoder/x86/sad_sse3.asm	(revision 96967) | 
| +++ source/libvpx/vp8/encoder/x86/sad_sse3.asm	(working copy) | 
| @@ -20,6 +20,7 @@ | 
| %define     ret_var       rbx | 
| %define     result_ptr    arg(4) | 
| %define     max_err       arg(4) | 
| +  %define     height        dword ptr arg(4) | 
| push        rbp | 
| mov         rbp,        rsp | 
| push        rsi | 
| @@ -33,14 +34,16 @@ | 
| movsxd      rdx,        dword ptr arg(3)    ; ref_stride | 
| %else | 
| %ifidn __OUTPUT_FORMAT__,x64 | 
| +    SAVE_XMM 7, u | 
| %define     src_ptr     rcx | 
| %define     src_stride  rdx | 
| %define     ref_ptr     r8 | 
| %define     ref_stride  r9 | 
| %define     end_ptr     r10 | 
| %define     ret_var     r11 | 
| -    %define     result_ptr  [rsp+8+4*8] | 
| -    %define     max_err     [rsp+8+4*8] | 
| +    %define     result_ptr  [rsp+xmm_stack_space+8+4*8] | 
| +    %define     max_err     [rsp+xmm_stack_space+8+4*8] | 
| +    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8] | 
| %else | 
| %define     src_ptr     rdi | 
| %define     src_stride  rsi | 
| @@ -50,6 +53,7 @@ | 
| %define     ret_var     r10 | 
| %define     result_ptr  r8 | 
| %define     max_err     r8 | 
| +    %define     height      r8 | 
| %endif | 
| %endif | 
|  | 
| @@ -64,6 +68,7 @@ | 
| %define     ret_var | 
| %define     result_ptr | 
| %define     max_err | 
| +  %define     height | 
|  | 
| %if ABI_IS_32BIT | 
| pop         rbx | 
| @@ -72,6 +77,7 @@ | 
| pop         rbp | 
| %else | 
| %ifidn __OUTPUT_FORMAT__,x64 | 
| +    RESTORE_XMM | 
| %endif | 
| %endif | 
| ret | 
| @@ -106,6 +112,7 @@ | 
| xchg        rbx,        rax | 
| %else | 
| %ifidn __OUTPUT_FORMAT__,x64 | 
| +    SAVE_XMM 7, u | 
| %define     src_ptr     rcx | 
| %define     src_stride  rdx | 
| %define     r0_ptr      rsi | 
| @@ -113,7 +120,7 @@ | 
| %define     r2_ptr      r11 | 
| %define     r3_ptr      r8 | 
| %define     ref_stride  r9 | 
| -    %define     result_ptr  [rsp+16+4*8] | 
| +    %define     result_ptr  [rsp+xmm_stack_space+16+4*8] | 
| push        rsi | 
|  | 
| LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr | 
| @@ -151,6 +158,7 @@ | 
| %else | 
| %ifidn __OUTPUT_FORMAT__,x64 | 
| pop         rsi | 
| +    RESTORE_XMM | 
| %endif | 
| %endif | 
| ret | 
| @@ -628,6 +636,67 @@ | 
|  | 
| STACK_FRAME_DESTROY_X3 | 
|  | 
| +;void vp8_copy32xn_sse3( | 
| +;    unsigned char *src_ptr, | 
| +;    int  src_stride, | 
| +;    unsigned char *dst_ptr, | 
| +;    int  dst_stride, | 
| +;    int height); | 
| +; Copies height rows of 32 bytes each from src_ptr to dst_ptr. Source rows are read with movdqu; | 
| +; destination rows are written with movdqa, so every destination row must be 16-byte aligned. | 
| +; NOTE(review): where height is the full r8, the upper 32 bits of the int are assumed clear - confirm. | 
| +global sym(vp8_copy32xn_sse3) | 
| +sym(vp8_copy32xn_sse3): | 
| +    STACK_FRAME_CREATE_X3                       ; ref_ptr/ref_stride name dst_ptr/dst_stride here | 
| + | 
| +        sub             height,     4           ; bias by 4: negative means no full group of 4 rows | 
| +        jl              .copy_tail_rows         ; height < 4: never enter the 4-row loop | 
| + | 
| +.copy_4_rows: | 
| +        lea             end_ptr,    [src_ptr+src_stride*2]      ; end_ptr = source row 2 | 
| +        movdqu          xmm0,       XMMWORD PTR [src_ptr] | 
| +        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16] | 
| +        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride] | 
| +        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16] | 
| +        movdqu          xmm4,       XMMWORD PTR [end_ptr] | 
| +        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16] | 
| +        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride] | 
| +        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16] | 
| +        lea             src_ptr,    [src_ptr+src_stride*4]      ; advance source by 4 rows | 
| +        lea             end_ptr,    [ref_ptr+ref_stride*2]      ; end_ptr = destination row 2 | 
| + | 
| +        movdqa          XMMWORD PTR [ref_ptr], xmm0 | 
| +        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1 | 
| +        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2 | 
| +        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 | 
| +        movdqa          XMMWORD PTR [end_ptr], xmm4 | 
| +        movdqa          XMMWORD PTR [end_ptr + 16], xmm5 | 
| +        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6 | 
| +        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 | 
| +        lea             ref_ptr,    [ref_ptr+ref_stride*4]      ; advance destination by 4 rows | 
| + | 
| +        sub             height,     4           ; still non-negative: another 4 rows remain | 
| +        jge             .copy_4_rows | 
| + | 
| +.copy_tail_rows: | 
| +        add             height,     4           ; undo the bias: height = rows still to copy (0..3) | 
| +        jle             .copy_done              ; nothing left to copy | 
| + | 
| +.copy_1_row: | 
| +        movdqu          xmm0,       XMMWORD PTR [src_ptr] | 
| +        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16] | 
| +        lea             src_ptr,    [src_ptr+src_stride] | 
| + | 
| +        movdqa          XMMWORD PTR [ref_ptr], xmm0 | 
| +        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1 | 
| +        lea             ref_ptr,    [ref_ptr+ref_stride] | 
| + | 
| +        sub             height,     1 | 
| +        jne             .copy_1_row | 
| + | 
| +.copy_done: | 
| +    STACK_FRAME_DESTROY_X3 | 
| + | 
| ;void vp8_sad16x16x4d_sse3( | 
| ;    unsigned char *src_ptr, | 
| ;    int  src_stride, | 
| @@ -888,3 +957,4 @@ | 
|  | 
|  | 
| STACK_FRAME_DESTROY_X4 | 
| + | 
|  |