Index: source/libvpx/vp8/encoder/x86/sad_sse2.asm
===================================================================
--- source/libvpx/vp8/encoder/x86/sad_sse2.asm	(revision 96967)
+++ source/libvpx/vp8/encoder/x86/sad_sse2.asm	(working copy)
@@ -21,6 +21,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 6
     push        rsi
     push        rdi
     ; end prolog
@@ -34,7 +35,7 @@
         lea             rcx,        [rsi+rax*8]
 
         lea             rcx,        [rcx+rax*8]
-        pxor            xmm7,       xmm7
+        pxor            xmm6,       xmm6
 
 x16x16sad_wmt_loop:
 
@@ -52,32 +53,33 @@
         punpcklbw       xmm1,       xmm3
 
         psadbw          xmm0,       xmm1
-        movq            xmm6,       QWORD PTR [rsi+rax+8]
+        movq            xmm2,       QWORD PTR [rsi+rax+8]
 
         movq            xmm3,       QWORD PTR [rdi+rdx+8]
         lea             rsi,        [rsi+rax*2]
 
         lea             rdi,        [rdi+rdx*2]
-        punpcklbw       xmm4,       xmm6
+        punpcklbw       xmm4,       xmm2
 
         punpcklbw       xmm5,       xmm3
         psadbw          xmm4,       xmm5
 
-        paddw           xmm7,       xmm0
-        paddw           xmm7,       xmm4
+        paddw           xmm6,       xmm0
+        paddw           xmm6,       xmm4
 
         cmp             rsi,        rcx
         jne             x16x16sad_wmt_loop
 
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
 
-        paddw           xmm0,       xmm7
+        paddw           xmm0,       xmm6
         movq            rax,        xmm0
 
     ; begin epilog
     pop rdi
     pop rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -112,7 +114,7 @@
 x8x16sad_wmt_loop:
 
         movq            rax,        mm7
-        cmp             rax,        arg(4)
+        cmp             eax,        arg(4)
         jg              x8x16sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
@@ -173,7 +175,7 @@
 x8x8sad_wmt_loop:
 
         movq            rax,        mm7
-        cmp             rax,        arg(4)
+        cmp             eax,        arg(4)
         jg              x8x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
@@ -282,7 +284,7 @@
 x16x8sad_wmt_loop:
 
         movq            rax,        mm7
-        cmp             rax,        arg(4)
+        cmp             eax,        arg(4)
         jg              x16x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
@@ -326,3 +328,83 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+;void vp8_copy32xn_sse2(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp8_copy32xn_sse2)
+sym(vp8_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;dst_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;dst_stride
+        movsxd          rcx,        dword ptr arg(4) ;height
+
+block_copy_sse2_loopx4:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,        [rsi+rax*2]
+
+        movdqu          xmm4,       XMMWORD PTR [rsi]
+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,    [rsi+rax*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        movdqa          XMMWORD PTR [rdi + rdx], xmm2
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
+
+        lea             rdi,    [rdi+rdx*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm4
+        movdqa          XMMWORD PTR [rdi + 16], xmm5
+        movdqa          XMMWORD PTR [rdi + rdx], xmm6
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
+
+        lea             rdi,    [rdi+rdx*2]
+
+        sub             rcx,     4
+        cmp             rcx,     4
+        jge             block_copy_sse2_loopx4
+
+        cmp             rcx, 0
+        je              copy_is_done
+
+block_copy_sse2_loop:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        lea             rsi,    [rsi+rax]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        lea             rdi,    [rdi+rdx]
+
+        sub             rcx,     1
+        jne             block_copy_sse2_loop
+
+copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
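For reference, a minimal scalar sketch in C of the copy that the new vp8_copy32xn_sse2 routine performs, following the prototype in the patch comment above; the helper name and body below are illustrative only, not part of the patch:

/*
 * Scalar reference for the copy done by vp8_copy32xn_sse2: copy a
 * 32-byte-wide block, `height` rows tall, from src to dst.  The SSE2
 * routine above does the same work four rows per iteration, using
 * unaligned 16-byte loads (movdqu) and aligned stores (movdqa).
 */
#include <string.h>

static void copy32xn_ref(const unsigned char *src_ptr, int src_stride,
                         unsigned char *dst_ptr, int dst_stride, int height)
{
    int h;

    for (h = 0; h < height; ++h)
    {
        memcpy(dst_ptr, src_ptr, 32);   /* one 32-byte row */
        src_ptr += src_stride;
        dst_ptr += dst_stride;
    }
}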