| OLD | NEW | 
|---|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license and patent | 4 ;  Use of this source code is governed by a BSD-style license and patent | 
| 5 ;  grant that can be found in the LICENSE file in the root of the source | 5 ;  grant that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. All contributing project authors may be found in the AUTHORS | 6 ;  tree. All contributing project authors may be found in the AUTHORS | 
| 7 ;  file in the root of the source tree. | 7 ;  file in the root of the source tree. | 
| 8 ; | 8 ; | 
| 9 | 9 | 
| 10 | 10 | 
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" | 
|  | 12 %include "asm_enc_offsets.asm" | 
| 12 | 13 | 
| 13 | 14 | 
| 14 ;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr | 15 ; void vp8_fast_quantize_b_ssse3 \| arg | 
| 15 ;               short *qcoeff_ptr,short *dequant_ptr, | 16 ;  (BLOCK  *b,                   \|  0 | 
| 16 ;               short *round_ptr, | 17 ;   BLOCKD *d)                   \|  1 | 
| 17 ;               short *quant_ptr, short *dqcoeff_ptr); |  | 
| 18 ; | 18 ; | 
| 19 global sym(vp8_fast_quantize_b_impl_ssse3) | 19 | 
| 20 sym(vp8_fast_quantize_b_impl_ssse3): | 20 global sym(vp8_fast_quantize_b_ssse3) | 
|  | 21 sym(vp8_fast_quantize_b_ssse3): | 
| 21     push        rbp | 22     push        rbp | 
| 22     mov         rbp, rsp | 23     mov         rbp, rsp | 
| 23     SHADOW_ARGS_TO_STACK 6 |  | 
| 24     GET_GOT     rbx | 24     GET_GOT     rbx | 
|  | 25 | 
|  | 26 %if ABI_IS_32BIT | 
|  | 27     push        rdi | 
| 25     push        rsi | 28     push        rsi | 
|  | 29 %else | 
|  | 30   %ifidn __OUTPUT_FORMAT__,x64 | 
| 26     push        rdi | 31     push        rdi | 
|  | 32     push        rsi | 
|  | 33   %endif | 
|  | 34 %endif | 
| 27     ; end prolog | 35     ; end prolog | 
| 28 | 36 | 
| 29     mov         rdx, arg(0)                 ;coeff_ptr | 37 %if ABI_IS_32BIT | 
| 30     mov         rdi, arg(3)                 ;round_ptr | 38     mov         rdi, arg(0)                 ; BLOCK *b | 
| 31     mov         rsi, arg(4)                 ;quant_ptr | 39     mov         rsi, arg(1)                 ; BLOCKD *d | 
|  | 40 %else | 
|  | 41   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 42     mov         rdi, rcx                    ; BLOCK *b | 
|  | 43     mov         rsi, rdx                    ; BLOCKD *d | 
|  | 44   %else | 
|  | 45     ;mov         rdi, rdi                    ; BLOCK *b | 
|  | 46     ;mov         rsi, rsi                    ; BLOCKD *d | 
|  | 47   %endif | 
|  | 48 %endif | 
| 32 | 49 | 
| 33     movdqa      xmm0, [rdx] | 50     mov         rax, [rdi + vp8_block_coeff] | 
| 34     movdqa      xmm4, [rdx + 16] | 51     mov         rcx, [rdi + vp8_block_round] | 
|  | 52     mov         rdx, [rdi + vp8_block_quant_fast] | 
| 35 | 53 | 
| 36     movdqa      xmm2, [rdi]                 ;round lo | 54     ; coeff | 
| 37     movdqa      xmm3, [rdi + 16]            ;round hi | 55     movdqa      xmm0, [rax] | 
|  | 56     movdqa      xmm4, [rax + 16] | 
|  | 57 | 
|  | 58     ; round | 
|  | 59     movdqa      xmm2, [rcx] | 
|  | 60     movdqa      xmm3, [rcx + 16] | 
| 38 | 61 | 
| 39     movdqa      xmm1, xmm0 | 62     movdqa      xmm1, xmm0 | 
| 40     movdqa      xmm5, xmm4 | 63     movdqa      xmm5, xmm4 | 
| 41 | 64 | 
| 42     psraw       xmm0, 15                    ;sign of z (aka sz) | 65     ; sz = z >> 15 | 
| 43     psraw       xmm4, 15                    ;sign of z (aka sz) | 66     psraw       xmm0, 15 | 
|  | 67     psraw       xmm4, 15 | 
| 44 | 68 | 
| 45     pabsw       xmm1, xmm1 | 69     pabsw       xmm1, xmm1 | 
| 46     pabsw       xmm5, xmm5 | 70     pabsw       xmm5, xmm5 | 
| 47 | 71 | 
| 48     paddw       xmm1, xmm2 | 72     paddw       xmm1, xmm2 | 
| 49     paddw       xmm5, xmm3 | 73     paddw       xmm5, xmm3 | 
| 50 | 74 | 
| 51     pmulhw      xmm1, [rsi] | 75     ; quant_fast | 
| 52     pmulhw      xmm5, [rsi + 16] | 76     pmulhw      xmm1, [rdx] | 
|  | 77     pmulhw      xmm5, [rdx + 16] | 
| 53 | 78 | 
| 54     mov         rdi, arg(1)                 ;qcoeff_ptr | 79     mov         rax, [rsi + vp8_blockd_qcoeff] | 
| 55     mov         rcx, arg(2)                 ;dequant_ptr | 80     mov         rdi, [rsi + vp8_blockd_dequant] | 
| 56     mov         rsi, arg(5)                 ;dqcoeff_ptr | 81     mov         rcx, [rsi + vp8_blockd_dqcoeff] | 
| 57 | 82 | 
| 58     pxor        xmm1, xmm0 | 83     pxor        xmm1, xmm0 | 
| 59     pxor        xmm5, xmm4 | 84     pxor        xmm5, xmm4 | 
| 60     psubw       xmm1, xmm0 | 85     psubw       xmm1, xmm0 | 
| 61     psubw       xmm5, xmm4 | 86     psubw       xmm5, xmm4 | 
| 62 | 87 | 
| 63     movdqa      [rdi], xmm1 | 88     movdqa      [rax], xmm1 | 
| 64     movdqa      [rdi + 16], xmm5 | 89     movdqa      [rax + 16], xmm5 | 
| 65 | 90 | 
| 66     movdqa      xmm2, [rcx] | 91     movdqa      xmm2, [rdi] | 
| 67     movdqa      xmm3, [rcx + 16] | 92     movdqa      xmm3, [rdi + 16] | 
| 68 | 93 | 
| 69     pxor        xmm4, xmm4 | 94     pxor        xmm4, xmm4 | 
| 70     pmullw      xmm2, xmm1 | 95     pmullw      xmm2, xmm1 | 
| 71     pmullw      xmm3, xmm5 | 96     pmullw      xmm3, xmm5 | 
| 72 | 97 | 
| 73     pcmpeqw     xmm1, xmm4                  ;non zero mask | 98     pcmpeqw     xmm1, xmm4                  ;non zero mask | 
| 74     pcmpeqw     xmm5, xmm4                  ;non zero mask | 99     pcmpeqw     xmm5, xmm4                  ;non zero mask | 
| 75     packsswb    xmm1, xmm5 | 100     packsswb    xmm1, xmm5 | 
| 76     pshufb      xmm1, [ GLOBAL(zz_shuf)] | 101     pshufb      xmm1, [GLOBAL(zz_shuf)] | 
| 77 | 102 | 
| 78     pmovmskb    edx, xmm1 | 103     pmovmskb    edx, xmm1 | 
| 79 | 104 | 
| 80 ;    xor         ecx, ecx |  | 
| 81 ;    mov         eax, -1 |  | 
| 82 ;find_eob_loop: |  | 
| 83 ;    shr         edx, 1 |  | 
| 84 ;    jc          fq_skip |  | 
| 85 ;    mov         eax, ecx |  | 
| 86 ;fq_skip: |  | 
| 87 ;    inc         ecx |  | 
| 88 ;    cmp         ecx, 16 |  | 
| 89 ;    jne         find_eob_loop |  | 
| 90     xor         rdi, rdi | 105     xor         rdi, rdi | 
| 91     mov         eax, -1 | 106     mov         eax, -1 | 
| 92     xor         dx, ax                      ;flip the bits for bsr | 107     xor         dx, ax                      ;flip the bits for bsr | 
| 93     bsr         eax, edx | 108     bsr         eax, edx | 
| 94 | 109 | 
| 95     movdqa      [rsi], xmm2                 ;store dqcoeff | 110     movdqa      [rcx], xmm2                 ;store dqcoeff | 
| 96     movdqa      [rsi + 16], xmm3            ;store dqcoeff | 111     movdqa      [rcx + 16], xmm3            ;store dqcoeff | 
| 97 | 112 | 
| 98     sub         edi, edx                    ;check for all zeros in bit mask | 113     sub         edi, edx                    ;check for all zeros in bit mask | 
| 99     sar         edi, 31                     ;0 or -1 | 114     sar         edi, 31                     ;0 or -1 | 
| 100     add         eax, 1 | 115     add         eax, 1 | 
| 101     and         eax, edi                    ;if the bit mask was all zero, | 116     and         eax, edi                    ;if the bit mask was all zero, | 
| 102                                             ;then eob = 0 | 117                                             ;then eob = 0 | 
|  | 118     mov         [rsi + vp8_blockd_eob], eax | 
|  | 119 | 
| 103     ; begin epilog | 120     ; begin epilog | 
|  | 121 %if ABI_IS_32BIT | 
|  | 122     pop         rsi | 
| 104     pop         rdi | 123     pop         rdi | 
|  | 124 %else | 
|  | 125   %ifidn __OUTPUT_FORMAT__,x64 | 
| 105     pop         rsi | 126     pop         rsi | 
|  | 127     pop         rdi | 
|  | 128   %endif | 
|  | 129 %endif | 
|  | 130 | 
| 106     RESTORE_GOT | 131     RESTORE_GOT | 
| 107     UNSHADOW_ARGS |  | 
| 108     pop         rbp | 132     pop         rbp | 
| 109     ret | 133     ret | 
| 110 | 134 | 
| 111 SECTION_RODATA | 135 SECTION_RODATA | 
| 112 align 16 | 136 align 16 | 
| 113 zz_shuf: | 137 zz_shuf: | 
| 114     db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 | 138     db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 | 
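For reference, here is a minimal scalar C sketch of what the rewritten routine computes, assuming the usual meaning of the `vp8_block_*` / `vp8_blockd_*` offsets loaded above (coeff, round, and quant_fast pointers on BLOCK; qcoeff, dequant, dqcoeff pointers and an integer eob on BLOCKD). The struct layouts and the helper name are illustrative assumptions rather than the actual libvpx headers, and integer widths are idealized (the SIMD code works in wrapping 16-bit lanes):

```c
#include <stdlib.h>

/* Hypothetical layouts: fields mirror the offsets used in the assembly. */
typedef struct {
    short *coeff;        /* vp8_block_coeff      */
    short *round;        /* vp8_block_round      */
    short *quant_fast;   /* vp8_block_quant_fast */
} BLOCK;

typedef struct {
    short *qcoeff;       /* vp8_blockd_qcoeff    */
    short *dequant;      /* vp8_blockd_dequant   */
    short *dqcoeff;      /* vp8_blockd_dqcoeff   */
    int    eob;          /* vp8_blockd_eob       */
} BLOCKD;

/* Scan order implied by zz_shuf: entry i is the raster index of
 * zig-zag position i. */
static const int zigzag[16] = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
{
    int eob = 0;

    for (int i = 0; i < 16; i++) {
        const int rc = zigzag[i];
        const int z  = b->coeff[rc];
        const int sz = (z < 0) ? -1 : 0;  /* psraw xmm, 15: 0 or -1 */

        /* abs (pabsw), add rounding (paddw), then keep the high 16
         * bits of the signed product (pmulhw). */
        int x = ((abs(z) + b->round[rc]) * b->quant_fast[rc]) >> 16;

        x = (x ^ sz) - sz;                /* pxor/psubw: restore sign */

        d->qcoeff[rc]  = (short)x;
        d->dqcoeff[rc] = (short)(x * d->dequant[rc]);  /* pmullw */

        if (x)
            eob = i + 1;  /* last nonzero position in zig-zag order */
    }

    d->eob = eob;
}
```

The assembly arrives at the same eob without a loop: pcmpeqw builds a per-coefficient zero mask, packsswb and pshufb reorder it into zig-zag byte order, pmovmskb compresses it to a 16-bit mask, and after inverting the mask a single bsr yields the last nonzero position, with the sub/sar/and sequence forcing eob to 0 when every coefficient quantizes to zero.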