| Index: source/libvpx/vp8/encoder/x86/quantize_sse2.asm | 
| =================================================================== | 
| --- source/libvpx/vp8/encoder/x86/quantize_sse2.asm	(revision 96967) | 
| +++ source/libvpx/vp8/encoder/x86/quantize_sse2.asm	(working copy) | 
| @@ -20,37 +20,38 @@ | 
| sym(vp8_regular_quantize_b_sse2): | 
| push        rbp | 
| mov         rbp, rsp | 
| -    SAVE_XMM | 
| +    SAVE_XMM 7 | 
| GET_GOT     rbx | 
| -    push        rsi | 
|  | 
| %if ABI_IS_32BIT | 
| push        rdi | 
| +    push        rsi | 
| %else | 
| %ifidn __OUTPUT_FORMAT__,x64 | 
| push        rdi | 
| +    push        rsi | 
| %endif | 
| %endif | 
|  | 
| ALIGN_STACK 16, rax | 
| -    %define BLOCKD_d          0  ;  8 | 
| -    %define zrun_zbin_boost   8  ;  8 | 
| -    %define abs_minus_zbin    16 ; 32 | 
| -    %define temp_qcoeff       48 ; 32 | 
| -    %define qcoeff            80 ; 32 | 
| -    %define stack_size        112 | 
| +    %define zrun_zbin_boost   0  ;  8 | 
| +    %define abs_minus_zbin    8  ; 32 | 
| +    %define temp_qcoeff       40 ; 32 | 
| +    %define qcoeff            72 ; 32 | 
| +    %define stack_size        104 | 
| sub         rsp, stack_size | 
| ; end prolog | 
|  | 
| %if ABI_IS_32BIT | 
| -    mov         rdi, arg(0) | 
| +    mov         rdi, arg(0)                 ; BLOCK *b | 
| +    mov         rsi, arg(1)                 ; BLOCKD *d | 
| %else | 
| %ifidn __OUTPUT_FORMAT__,x64 | 
| mov         rdi, rcx                    ; BLOCK *b | 
| -    mov         [rsp + BLOCKD_d], rdx | 
| +    mov         rsi, rdx                    ; BLOCKD *d | 
| %else | 
| ;mov         rdi, rdi                    ; BLOCK *b | 
| -    mov         [rsp + BLOCKD_d], rsi | 
| +    ;mov         rsi, rsi                    ; BLOCKD *d | 
| %endif | 
| %endif | 
|  | 
| @@ -125,60 +126,53 @@ | 
| movdqa      [rsp + qcoeff], xmm6 | 
| movdqa      [rsp + qcoeff + 16], xmm6 | 
|  | 
| -    mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr | 
| +    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr | 
| mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr | 
| -    mov         [rsp + zrun_zbin_boost], rsi | 
| +    mov         [rsp + zrun_zbin_boost], rdx | 
|  | 
| %macro ZIGZAG_LOOP 1 | 
| -    movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc | 
| - | 
| ; x | 
| -    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2] | 
| +    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] | 
|  | 
| ; if (x >= zbin) | 
| -    sub         cx, WORD PTR[rsi]           ; x - zbin | 
| -    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++ | 
| +    sub         cx, WORD PTR[rdx]           ; x - zbin | 
| +    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ | 
| jl          rq_zigzag_loop_%1           ; x < zbin | 
|  | 
| -    movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2] | 
| +    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] | 
|  | 
| -    ; downshift by quant_shift[rdx] | 
| -    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc] | 
| +    ; downshift by quant_shift[rc] | 
| +    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc] | 
| sar         edi, cl                     ; also sets Z bit | 
| je          rq_zigzag_loop_%1           ; !y | 
| -    mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] | 
| -    mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost | 
| +    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] | 
| +    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost | 
| rq_zigzag_loop_%1: | 
| %endmacro | 
| -ZIGZAG_LOOP 0 | 
| -ZIGZAG_LOOP 1 | 
| -ZIGZAG_LOOP 2 | 
| -ZIGZAG_LOOP 3 | 
| -ZIGZAG_LOOP 4 | 
| -ZIGZAG_LOOP 5 | 
| -ZIGZAG_LOOP 6 | 
| -ZIGZAG_LOOP 7 | 
| -ZIGZAG_LOOP 8 | 
| -ZIGZAG_LOOP 9 | 
| +; in vp8_default_zig_zag1d order: see vp8/common/entropy.c | 
| +ZIGZAG_LOOP  0 | 
| +ZIGZAG_LOOP  1 | 
| +ZIGZAG_LOOP  4 | 
| +ZIGZAG_LOOP  8 | 
| +ZIGZAG_LOOP  5 | 
| +ZIGZAG_LOOP  2 | 
| +ZIGZAG_LOOP  3 | 
| +ZIGZAG_LOOP  6 | 
| +ZIGZAG_LOOP  9 | 
| +ZIGZAG_LOOP 12 | 
| +ZIGZAG_LOOP 13 | 
| ZIGZAG_LOOP 10 | 
| +ZIGZAG_LOOP  7 | 
| ZIGZAG_LOOP 11 | 
| -ZIGZAG_LOOP 12 | 
| -ZIGZAG_LOOP 13 | 
| ZIGZAG_LOOP 14 | 
| ZIGZAG_LOOP 15 | 
|  | 
| movdqa      xmm2, [rsp + qcoeff] | 
| movdqa      xmm3, [rsp + qcoeff + 16] | 
|  | 
| -%if ABI_IS_32BIT | 
| -    mov         rdi, arg(1) | 
| -%else | 
| -    mov         rdi, [rsp + BLOCKD_d] | 
| -%endif | 
| +    mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr | 
| +    mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr | 
|  | 
| -    mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr | 
| -    mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr | 
| - | 
| ; y ^ sz | 
| pxor        xmm2, xmm0 | 
| pxor        xmm3, xmm4 | 
| @@ -190,15 +184,15 @@ | 
| movdqa      xmm0, [rcx] | 
| movdqa      xmm1, [rcx + 16] | 
|  | 
| -    mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr | 
| +    mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr | 
|  | 
| pmullw      xmm0, xmm2 | 
| pmullw      xmm1, xmm3 | 
|  | 
| movdqa      [rcx], xmm2        ; store qcoeff | 
| movdqa      [rcx + 16], xmm3 | 
| -    movdqa      [rsi], xmm0        ; store dqcoeff | 
| -    movdqa      [rsi + 16], xmm1 | 
| +    movdqa      [rdi], xmm0        ; store dqcoeff | 
| +    movdqa      [rdi + 16], xmm1 | 
|  | 
| ; select the last value (in zig_zag order) for EOB | 
| pcmpeqw     xmm2, xmm6 | 
| @@ -220,90 +214,116 @@ | 
| pmaxsw      xmm2, xmm3 | 
| movd        eax, xmm2 | 
| and         eax, 0xff | 
| -    mov         [rdi + vp8_blockd_eob], eax | 
| +    mov         [rsi + vp8_blockd_eob], eax | 
|  | 
| ; begin epilog | 
| add         rsp, stack_size | 
| pop         rsp | 
| %if ABI_IS_32BIT | 
| +    pop         rsi | 
| pop         rdi | 
| %else | 
| %ifidn __OUTPUT_FORMAT__,x64 | 
| +    pop         rsi | 
| pop         rdi | 
| %endif | 
| %endif | 
| -    pop         rsi | 
| RESTORE_GOT | 
| RESTORE_XMM | 
| pop         rbp | 
| ret | 
|  | 
| -; int vp8_fast_quantize_b_impl_sse2 | arg | 
| -;  (short *coeff_ptr,               |  0 | 
| -;   short *qcoeff_ptr,              |  1 | 
| -;   short *dequant_ptr,             |  2 | 
| -;   short *inv_scan_order,          |  3 | 
| -;   short *round_ptr,               |  4 | 
| -;   short *quant_ptr,               |  5 | 
| -;   short *dqcoeff_ptr)             |  6 | 
| +; void vp8_fast_quantize_b_sse2 | arg | 
| +;  (BLOCK  *b,                  |  0 | 
| +;   BLOCKD *d)                  |  1 | 
|  | 
| -global sym(vp8_fast_quantize_b_impl_sse2) | 
| -sym(vp8_fast_quantize_b_impl_sse2): | 
| +global sym(vp8_fast_quantize_b_sse2) | 
| +sym(vp8_fast_quantize_b_sse2): | 
| push        rbp | 
| mov         rbp, rsp | 
| -    SHADOW_ARGS_TO_STACK 7 | 
| +    GET_GOT     rbx | 
| + | 
| +%if ABI_IS_32BIT | 
| +    push        rdi | 
| push        rsi | 
| +%else | 
| +  %ifidn __OUTPUT_FORMAT__,x64 | 
| push        rdi | 
| +    push        rsi | 
| +  %else | 
| +    ; these registers are used for passing arguments | 
| +  %endif | 
| +%endif | 
| + | 
| ; end prolog | 
|  | 
| -    mov         rdx, arg(0)                 ;coeff_ptr | 
| -    mov         rcx, arg(2)                 ;dequant_ptr | 
| -    mov         rdi, arg(4)                 ;round_ptr | 
| -    mov         rsi, arg(5)                 ;quant_ptr | 
| +%if ABI_IS_32BIT | 
| +    mov         rdi, arg(0)                 ; BLOCK *b | 
| +    mov         rsi, arg(1)                 ; BLOCKD *d | 
| +%else | 
| +  %ifidn __OUTPUT_FORMAT__,x64 | 
| +    mov         rdi, rcx                    ; BLOCK *b | 
| +    mov         rsi, rdx                    ; BLOCKD *d | 
| +  %else | 
| +    ;mov         rdi, rdi                    ; BLOCK *b | 
| +    ;mov         rsi, rsi                    ; BLOCKD *d | 
| +  %endif | 
| +%endif | 
|  | 
| -    movdqa      xmm0, XMMWORD PTR[rdx] | 
| -    movdqa      xmm4, XMMWORD PTR[rdx + 16] | 
| +    mov         rax, [rdi + vp8_block_coeff] | 
| +    mov         rcx, [rdi + vp8_block_round] | 
| +    mov         rdx, [rdi + vp8_block_quant_fast] | 
|  | 
| -    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo | 
| -    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi | 
| +    ; z = coeff | 
| +    movdqa      xmm0, [rax] | 
| +    movdqa      xmm4, [rax + 16] | 
|  | 
| +    ; dup z so we can save sz | 
| movdqa      xmm1, xmm0 | 
| movdqa      xmm5, xmm4 | 
|  | 
| -    psraw       xmm0, 15                    ;sign of z (aka sz) | 
| -    psraw       xmm4, 15                    ;sign of z (aka sz) | 
| +    ; sz = z >> 15 | 
| +    psraw       xmm0, 15 | 
| +    psraw       xmm4, 15 | 
|  | 
| +    ; x = abs(z) = (z ^ sz) - sz | 
| pxor        xmm1, xmm0 | 
| pxor        xmm5, xmm4 | 
| -    psubw       xmm1, xmm0                  ;x = abs(z) | 
| -    psubw       xmm5, xmm4                  ;x = abs(z) | 
| +    psubw       xmm1, xmm0 | 
| +    psubw       xmm5, xmm4 | 
|  | 
| -    paddw       xmm1, xmm2 | 
| -    paddw       xmm5, xmm3 | 
| +    ; x += round | 
| +    paddw       xmm1, [rcx] | 
| +    paddw       xmm5, [rcx + 16] | 
|  | 
| -    pmulhw      xmm1, XMMWORD PTR[rsi] | 
| -    pmulhw      xmm5, XMMWORD PTR[rsi + 16] | 
| +    mov         rax, [rsi + vp8_blockd_qcoeff] | 
| +    mov         rcx, [rsi + vp8_blockd_dequant] | 
| +    mov         rdi, [rsi + vp8_blockd_dqcoeff] | 
|  | 
| -    mov         rdi, arg(1)                 ;qcoeff_ptr | 
| -    mov         rsi, arg(6)                 ;dqcoeff_ptr | 
| +    ; y = x * quant >> 16 | 
| +    pmulhw      xmm1, [rdx] | 
| +    pmulhw      xmm5, [rdx + 16] | 
|  | 
| -    movdqa      xmm2, XMMWORD PTR[rcx] | 
| -    movdqa      xmm3, XMMWORD PTR[rcx + 16] | 
| - | 
| +    ; x = (y ^ sz) - sz | 
| pxor        xmm1, xmm0 | 
| pxor        xmm5, xmm4 | 
| psubw       xmm1, xmm0 | 
| psubw       xmm5, xmm4 | 
|  | 
| -    movdqa      XMMWORD PTR[rdi], xmm1 | 
| -    movdqa      XMMWORD PTR[rdi + 16], xmm5 | 
| +    ; qcoeff = x | 
| +    movdqa      [rax], xmm1 | 
| +    movdqa      [rax + 16], xmm5 | 
|  | 
| -    pmullw      xmm2, xmm1 | 
| -    pmullw      xmm3, xmm5 | 
| +    ; x * dequant | 
| +    movdqa      xmm2, xmm1 | 
| +    movdqa      xmm3, xmm5 | 
| +    pmullw      xmm2, [rcx] | 
| +    pmullw      xmm3, [rcx + 16] | 
|  | 
| -    mov         rdi, arg(3)                 ;inv_scan_order | 
| +    ; dqcoeff = x * dequant | 
| +    movdqa      [rdi], xmm2 | 
| +    movdqa      [rdi + 16], xmm3 | 
|  | 
| -    ; Start with 16 | 
| pxor        xmm4, xmm4                  ;clear all bits | 
| pcmpeqw     xmm1, xmm4 | 
| pcmpeqw     xmm5, xmm4 | 
| @@ -312,8 +332,8 @@ | 
| pxor        xmm1, xmm4 | 
| pxor        xmm5, xmm4 | 
|  | 
| -    pand        xmm1, XMMWORD PTR[rdi] | 
| -    pand        xmm5, XMMWORD PTR[rdi+16] | 
| +    pand        xmm1, [GLOBAL(inv_zig_zag)] | 
| +    pand        xmm5, [GLOBAL(inv_zig_zag + 16)] | 
|  | 
| pmaxsw      xmm1, xmm5 | 
|  | 
| @@ -332,26 +352,27 @@ | 
|  | 
| pmaxsw      xmm1, xmm5 | 
|  | 
| -    movd        rax, xmm1 | 
| -    and         rax, 0xff | 
| +    movd        eax, xmm1 | 
| +    and         eax, 0xff | 
| +    mov         [rsi + vp8_blockd_eob], eax | 
|  | 
| -    movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff | 
| -    movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff | 
| - | 
| ; begin epilog | 
| +%if ABI_IS_32BIT | 
| +    pop         rsi | 
| pop         rdi | 
| +%else | 
| +  %ifidn __OUTPUT_FORMAT__,x64 | 
| pop         rsi | 
| -    UNSHADOW_ARGS | 
| +    pop         rdi | 
| +  %endif | 
| +%endif | 
| + | 
| +    RESTORE_GOT | 
| pop         rbp | 
| ret | 
|  | 
| SECTION_RODATA | 
| align 16 | 
| -zig_zag: | 
| -  dw 0x0000, 0x0001, 0x0004, 0x0008 | 
| -  dw 0x0005, 0x0002, 0x0003, 0x0006 | 
| -  dw 0x0009, 0x000c, 0x000d, 0x000a | 
| -  dw 0x0007, 0x000b, 0x000e, 0x000f | 
| inv_zig_zag: | 
| dw 0x0001, 0x0002, 0x0006, 0x0007 | 
| dw 0x0003, 0x0005, 0x0008, 0x000d | 
|  |