| Index: source/libvpx/vp8/common/x86/recon_sse2.asm | 
| =================================================================== | 
| --- source/libvpx/vp8/common/x86/recon_sse2.asm	(revision 96967) | 
| +++ source/libvpx/vp8/common/x86/recon_sse2.asm	(working copy) | 
| @@ -67,7 +67,7 @@ | 
| push        rbp | 
| mov         rbp, rsp | 
| SHADOW_ARGS_TO_STACK 4 | 
| -    SAVE_XMM | 
| +    SAVE_XMM 7 | 
| push        rsi | 
| push        rdi | 
| ; end prolog | 
| @@ -229,3 +229,460 @@ | 
| UNSHADOW_ARGS | 
| pop         rbp | 
| ret | 
| + | 
| + | 
| +;void vp8_intra_pred_uv_dc_mmx2( | 
| +;    unsigned char *dst, | 
| +;    int dst_stride | 
| +;    unsigned char *src, | 
| +;    int src_stride, | 
| +;    ) | 
| +global sym(vp8_intra_pred_uv_dc_mmx2) | 
| +sym(vp8_intra_pred_uv_dc_mmx2): | 
| +    push        rbp | 
| +    mov         rbp, rsp | 
| +    SHADOW_ARGS_TO_STACK 4 | 
| +    push        rsi | 
| +    push        rdi | 
| +    ; end prolog | 
| + | 
| +    ; from top | 
| +    mov         rsi,        arg(2) ;src; | 
| +    movsxd      rax,        dword ptr arg(3) ;src_stride; | 
| +    sub         rsi,        rax | 
| +    pxor        mm0,        mm0 | 
| +    movq        mm1,        [rsi] | 
| +    psadbw      mm1,        mm0 | 
| + | 
| +    ; from left | 
| +    dec         rsi | 
| +    lea         rdi,        [rax*3] | 
| +    movzx       ecx,        byte [rsi+rax] | 
| +    movzx       edx,        byte [rsi+rax*2] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rdi] | 
| +    add         ecx,        edx | 
| +    lea         rsi,        [rsi+rax*4] | 
| +    movzx       edx,        byte [rsi] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rax] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rax*2] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rdi] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rax*4] | 
| +    add         ecx,        edx | 
| + | 
| +    ; add up | 
| +    pextrw      edx,        mm1, 0x0 | 
| +    lea         edx,        [edx+ecx+8] | 
| +    sar         edx,        4 | 
| +    movd        mm1,        edx | 
| +    pshufw      mm1,        mm1, 0x0 | 
| +    packuswb    mm1,        mm1 | 
| + | 
| +    ; write out | 
| +    mov         rdi,        arg(0) ;dst; | 
| +    movsxd      rcx,        dword ptr arg(1) ;dst_stride | 
| +    lea         rax,        [rcx*3] | 
| + | 
| +    movq [rdi      ],       mm1 | 
| +    movq [rdi+rcx  ],       mm1 | 
| +    movq [rdi+rcx*2],       mm1 | 
| +    movq [rdi+rax  ],       mm1 | 
| +    lea         rdi,        [rdi+rcx*4] | 
| +    movq [rdi      ],       mm1 | 
| +    movq [rdi+rcx  ],       mm1 | 
| +    movq [rdi+rcx*2],       mm1 | 
| +    movq [rdi+rax  ],       mm1 | 
| + | 
| +    ; begin epilog | 
| +    pop         rdi | 
| +    pop         rsi | 
| +    UNSHADOW_ARGS | 
| +    pop         rbp | 
| +    ret | 
| + | 
| +;void vp8_intra_pred_uv_dctop_mmx2( | 
| +;    unsigned char *dst, | 
| +;    int dst_stride | 
| +;    unsigned char *src, | 
| +;    int src_stride, | 
| +;    ) | 
| +global sym(vp8_intra_pred_uv_dctop_mmx2) | 
| +sym(vp8_intra_pred_uv_dctop_mmx2): | 
| +    push        rbp | 
| +    mov         rbp, rsp | 
| +    SHADOW_ARGS_TO_STACK 4 | 
| +    GET_GOT     rbx | 
| +    push        rsi | 
| +    push        rdi | 
| +    ; end prolog | 
| + | 
| +    ; from top | 
| +    mov         rsi,        arg(2) ;src; | 
| +    movsxd      rax,        dword ptr arg(3) ;src_stride; | 
| +    sub         rsi,        rax | 
| +    pxor        mm0,        mm0 | 
| +    movq        mm1,        [rsi] | 
| +    psadbw      mm1,        mm0 | 
| + | 
| +    ; add up | 
| +    paddw       mm1,        [GLOBAL(dc_4)] | 
| +    psraw       mm1,        3 | 
| +    pshufw      mm1,        mm1, 0x0 | 
| +    packuswb    mm1,        mm1 | 
| + | 
| +    ; write out | 
| +    mov         rdi,        arg(0) ;dst; | 
| +    movsxd      rcx,        dword ptr arg(1) ;dst_stride | 
| +    lea         rax,        [rcx*3] | 
| + | 
| +    movq [rdi      ],       mm1 | 
| +    movq [rdi+rcx  ],       mm1 | 
| +    movq [rdi+rcx*2],       mm1 | 
| +    movq [rdi+rax  ],       mm1 | 
| +    lea         rdi,        [rdi+rcx*4] | 
| +    movq [rdi      ],       mm1 | 
| +    movq [rdi+rcx  ],       mm1 | 
| +    movq [rdi+rcx*2],       mm1 | 
| +    movq [rdi+rax  ],       mm1 | 
| + | 
| +    ; begin epilog | 
| +    pop         rdi | 
| +    pop         rsi | 
| +    RESTORE_GOT | 
| +    UNSHADOW_ARGS | 
| +    pop         rbp | 
| +    ret | 
| + | 
| +;void vp8_intra_pred_uv_dcleft_mmx2( | 
| +;    unsigned char *dst, | 
| +;    int dst_stride | 
| +;    unsigned char *src, | 
| +;    int src_stride, | 
| +;    ) | 
| +global sym(vp8_intra_pred_uv_dcleft_mmx2) | 
| +sym(vp8_intra_pred_uv_dcleft_mmx2): | 
| +    push        rbp | 
| +    mov         rbp, rsp | 
| +    SHADOW_ARGS_TO_STACK 4 | 
| +    push        rsi | 
| +    push        rdi | 
| +    ; end prolog | 
| + | 
| +    ; from left | 
| +    mov         rsi,        arg(2) ;src; | 
| +    movsxd      rax,        dword ptr arg(3) ;src_stride; | 
| +    dec         rsi | 
| +    lea         rdi,        [rax*3] | 
| +    movzx       ecx,        byte [rsi] | 
| +    movzx       edx,        byte [rsi+rax] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rax*2] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rdi] | 
| +    add         ecx,        edx | 
| +    lea         rsi,        [rsi+rax*4] | 
| +    movzx       edx,        byte [rsi] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rax] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rax*2] | 
| +    add         ecx,        edx | 
| +    movzx       edx,        byte [rsi+rdi] | 
| +    lea         edx,        [ecx+edx+4] | 
| + | 
| +    ; add up | 
| +    shr         edx,        3 | 
| +    movd        mm1,        edx | 
| +    pshufw      mm1,        mm1, 0x0 | 
| +    packuswb    mm1,        mm1 | 
| + | 
| +    ; write out | 
| +    mov         rdi,        arg(0) ;dst; | 
| +    movsxd      rcx,        dword ptr arg(1) ;dst_stride | 
| +    lea         rax,        [rcx*3] | 
| + | 
| +    movq [rdi      ],       mm1 | 
| +    movq [rdi+rcx  ],       mm1 | 
| +    movq [rdi+rcx*2],       mm1 | 
| +    movq [rdi+rax  ],       mm1 | 
| +    lea         rdi,        [rdi+rcx*4] | 
| +    movq [rdi      ],       mm1 | 
| +    movq [rdi+rcx  ],       mm1 | 
| +    movq [rdi+rcx*2],       mm1 | 
| +    movq [rdi+rax  ],       mm1 | 
| + | 
| +    ; begin epilog | 
| +    pop         rdi | 
| +    pop         rsi | 
| +    UNSHADOW_ARGS | 
| +    pop         rbp | 
| +    ret | 
| + | 
| +;void vp8_intra_pred_uv_dc128_mmx( | 
| +;    unsigned char *dst, | 
| +;    int dst_stride | 
| +;    unsigned char *src, | 
| +;    int src_stride, | 
| +;    ) | 
| +global sym(vp8_intra_pred_uv_dc128_mmx) | 
| +sym(vp8_intra_pred_uv_dc128_mmx): | 
| +    push        rbp | 
| +    mov         rbp, rsp | 
| +    SHADOW_ARGS_TO_STACK 4 | 
| +    GET_GOT     rbx | 
| +    ; end prolog | 
| + | 
| +    ; write out | 
| +    movq        mm1,        [GLOBAL(dc_128)] | 
| +    mov         rax,        arg(0) ;dst; | 
| +    movsxd      rdx,        dword ptr arg(1) ;dst_stride | 
| +    lea         rcx,        [rdx*3] | 
| + | 
| +    movq [rax      ],       mm1 | 
| +    movq [rax+rdx  ],       mm1 | 
| +    movq [rax+rdx*2],       mm1 | 
| +    movq [rax+rcx  ],       mm1 | 
| +    lea         rax,        [rax+rdx*4] | 
| +    movq [rax      ],       mm1 | 
| +    movq [rax+rdx  ],       mm1 | 
| +    movq [rax+rdx*2],       mm1 | 
| +    movq [rax+rcx  ],       mm1 | 
| + | 
| +    ; begin epilog | 
| +    RESTORE_GOT | 
| +    UNSHADOW_ARGS | 
| +    pop         rbp | 
| +    ret | 
| + | 
| +;void vp8_intra_pred_uv_tm_sse2( | 
| +;    unsigned char *dst, | 
| +;    int dst_stride | 
| +;    unsigned char *src, | 
| +;    int src_stride, | 
| +;    ) | 
| +%macro vp8_intra_pred_uv_tm 1 | 
| +global sym(vp8_intra_pred_uv_tm_%1) | 
| +sym(vp8_intra_pred_uv_tm_%1): | 
| +    push        rbp | 
| +    mov         rbp, rsp | 
| +    SHADOW_ARGS_TO_STACK 4 | 
| +    GET_GOT     rbx | 
| +    push        rsi | 
| +    push        rdi | 
| +    ; end prolog | 
| + | 
| +    ; read top row | 
| +    mov         edx,        4 | 
| +    mov         rsi,        arg(2) ;src; | 
| +    movsxd      rax,        dword ptr arg(3) ;src_stride; | 
| +    sub         rsi,        rax | 
| +    pxor        xmm0,       xmm0 | 
| +%ifidn %1, ssse3 | 
| +    movdqa      xmm2,       [GLOBAL(dc_1024)] | 
| +%endif | 
| +    movq        xmm1,       [rsi] | 
| +    punpcklbw   xmm1,       xmm0 | 
| + | 
| +    ; set up left ptrs ans subtract topleft | 
| +    movd        xmm3,       [rsi-1] | 
| +    lea         rsi,        [rsi+rax-1] | 
| +%ifidn %1, sse2 | 
| +    punpcklbw   xmm3,       xmm0 | 
| +    pshuflw     xmm3,       xmm3, 0x0 | 
| +    punpcklqdq  xmm3,       xmm3 | 
| +%else | 
| +    pshufb      xmm3,       xmm2 | 
| +%endif | 
| +    psubw       xmm1,       xmm3 | 
| + | 
| +    ; set up dest ptrs | 
| +    mov         rdi,        arg(0) ;dst; | 
| +    movsxd      rcx,        dword ptr arg(1) ;dst_stride | 
| + | 
| +vp8_intra_pred_uv_tm_%1_loop: | 
| +    movd        xmm3,       [rsi] | 
| +    movd        xmm5,       [rsi+rax] | 
| +%ifidn %1, sse2 | 
| +    punpcklbw   xmm3,       xmm0 | 
| +    punpcklbw   xmm5,       xmm0 | 
| +    pshuflw     xmm3,       xmm3, 0x0 | 
| +    pshuflw     xmm5,       xmm5, 0x0 | 
| +    punpcklqdq  xmm3,       xmm3 | 
| +    punpcklqdq  xmm5,       xmm5 | 
| +%else | 
| +    pshufb      xmm3,       xmm2 | 
| +    pshufb      xmm5,       xmm2 | 
| +%endif | 
| +    paddw       xmm3,       xmm1 | 
| +    paddw       xmm5,       xmm1 | 
| +    packuswb    xmm3,       xmm5 | 
| +    movq  [rdi    ],        xmm3 | 
| +    movhps[rdi+rcx],        xmm3 | 
| +    lea         rsi,        [rsi+rax*2] | 
| +    lea         rdi,        [rdi+rcx*2] | 
| +    dec         edx | 
| +    jnz vp8_intra_pred_uv_tm_%1_loop | 
| + | 
| +    ; begin epilog | 
| +    pop         rdi | 
| +    pop         rsi | 
| +    RESTORE_GOT | 
| +    UNSHADOW_ARGS | 
| +    pop         rbp | 
| +    ret | 
| +%endmacro | 
| + | 
| +vp8_intra_pred_uv_tm sse2 | 
| +vp8_intra_pred_uv_tm ssse3 | 
| + | 
| +;void vp8_intra_pred_uv_ve_mmx( | 
| +;    unsigned char *dst, | 
| +;    int dst_stride | 
| +;    unsigned char *src, | 
| +;    int src_stride, | 
| +;    ) | 
| +global sym(vp8_intra_pred_uv_ve_mmx) | 
| +sym(vp8_intra_pred_uv_ve_mmx): | 
| +    push        rbp | 
| +    mov         rbp, rsp | 
| +    SHADOW_ARGS_TO_STACK 4 | 
| +    ; end prolog | 
| + | 
| +    ; read from top | 
| +    mov         rax,        arg(2) ;src; | 
| +    movsxd      rdx,        dword ptr arg(3) ;src_stride; | 
| +    sub         rax,        rdx | 
| +    movq        mm1,        [rax] | 
| + | 
| +    ; write out | 
| +    mov         rax,        arg(0) ;dst; | 
| +    movsxd      rdx,        dword ptr arg(1) ;dst_stride | 
| +    lea         rcx,        [rdx*3] | 
| + | 
| +    movq [rax      ],       mm1 | 
| +    movq [rax+rdx  ],       mm1 | 
| +    movq [rax+rdx*2],       mm1 | 
| +    movq [rax+rcx  ],       mm1 | 
| +    lea         rax,        [rax+rdx*4] | 
| +    movq [rax      ],       mm1 | 
| +    movq [rax+rdx  ],       mm1 | 
| +    movq [rax+rdx*2],       mm1 | 
| +    movq [rax+rcx  ],       mm1 | 
| + | 
| +    ; begin epilog | 
| +    UNSHADOW_ARGS | 
| +    pop         rbp | 
| +    ret | 
| + | 
| +;void vp8_intra_pred_uv_ho_mmx2( | 
| +;    unsigned char *dst, | 
| +;    int dst_stride | 
| +;    unsigned char *src, | 
| +;    int src_stride, | 
| +;    ) | 
| +%macro vp8_intra_pred_uv_ho 1 | 
| +global sym(vp8_intra_pred_uv_ho_%1) | 
| +sym(vp8_intra_pred_uv_ho_%1): | 
| +    push        rbp | 
| +    mov         rbp, rsp | 
| +    SHADOW_ARGS_TO_STACK 4 | 
| +    push        rsi | 
| +    push        rdi | 
| +%ifidn %1, ssse3 | 
| +%ifndef GET_GOT_SAVE_ARG | 
| +    push        rbx | 
| +%endif | 
| +    GET_GOT     rbx | 
| +%endif | 
| +    ; end prolog | 
| + | 
| +    ; read from left and write out | 
| +%ifidn %1, mmx2 | 
| +    mov         edx,        4 | 
| +%endif | 
| +    mov         rsi,        arg(2) ;src; | 
| +    movsxd      rax,        dword ptr arg(3) ;src_stride; | 
| +    mov         rdi,        arg(0) ;dst; | 
| +    movsxd      rcx,        dword ptr arg(1) ;dst_stride | 
| +%ifidn %1, ssse3 | 
| +    lea         rdx,        [rcx*3] | 
| +    movdqa      xmm2,       [GLOBAL(dc_00001111)] | 
| +    lea         rbx,        [rax*3] | 
| +%endif | 
| +    dec         rsi | 
| +%ifidn %1, mmx2 | 
| +vp8_intra_pred_uv_ho_%1_loop: | 
| +    movd        mm0,        [rsi] | 
| +    movd        mm1,        [rsi+rax] | 
| +    punpcklbw   mm0,        mm0 | 
| +    punpcklbw   mm1,        mm1 | 
| +    pshufw      mm0,        mm0, 0x0 | 
| +    pshufw      mm1,        mm1, 0x0 | 
| +    movq  [rdi    ],        mm0 | 
| +    movq  [rdi+rcx],        mm1 | 
| +    lea         rsi,        [rsi+rax*2] | 
| +    lea         rdi,        [rdi+rcx*2] | 
| +    dec         edx | 
| +    jnz vp8_intra_pred_uv_ho_%1_loop | 
| +%else | 
| +    movd        xmm0,       [rsi] | 
| +    movd        xmm3,       [rsi+rax] | 
| +    movd        xmm1,       [rsi+rax*2] | 
| +    movd        xmm4,       [rsi+rbx] | 
| +    punpcklbw   xmm0,       xmm3 | 
| +    punpcklbw   xmm1,       xmm4 | 
| +    pshufb      xmm0,       xmm2 | 
| +    pshufb      xmm1,       xmm2 | 
| +    movq   [rdi    ],       xmm0 | 
| +    movhps [rdi+rcx],       xmm0 | 
| +    movq [rdi+rcx*2],       xmm1 | 
| +    movhps [rdi+rdx],       xmm1 | 
| +    lea         rsi,        [rsi+rax*4] | 
| +    lea         rdi,        [rdi+rcx*4] | 
| +    movd        xmm0,       [rsi] | 
| +    movd        xmm3,       [rsi+rax] | 
| +    movd        xmm1,       [rsi+rax*2] | 
| +    movd        xmm4,       [rsi+rbx] | 
| +    punpcklbw   xmm0,       xmm3 | 
| +    punpcklbw   xmm1,       xmm4 | 
| +    pshufb      xmm0,       xmm2 | 
| +    pshufb      xmm1,       xmm2 | 
| +    movq   [rdi    ],       xmm0 | 
| +    movhps [rdi+rcx],       xmm0 | 
| +    movq [rdi+rcx*2],       xmm1 | 
| +    movhps [rdi+rdx],       xmm1 | 
| +%endif | 
| + | 
| +    ; begin epilog | 
| +%ifidn %1, ssse3 | 
| +    RESTORE_GOT | 
| +%ifndef GET_GOT_SAVE_ARG | 
| +    pop         rbx | 
| +%endif | 
| +%endif | 
| +    pop         rdi | 
| +    pop         rsi | 
| +    UNSHADOW_ARGS | 
| +    pop         rbp | 
| +    ret | 
| +%endmacro | 
| + | 
| +vp8_intra_pred_uv_ho mmx2 | 
| +vp8_intra_pred_uv_ho ssse3 | 
| + | 
| +SECTION_RODATA | 
| +dc_128: | 
| +    times 8 db 128 | 
| +dc_4: | 
| +    times 4 dw 4 | 
| +align 16 | 
| +dc_1024: | 
| +    times 8 dw 0x400 | 
| +align 16 | 
| +dc_00001111: | 
| +    times 8 db 0 | 
| +    times 8 db 1 | 
|  |