| OLD | NEW | 
|---|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license and patent | 4 ;  Use of this source code is governed by a BSD-style license and patent | 
| 5 ;  grant that can be found in the LICENSE file in the root of the source | 5 ;  grant that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. All contributing project authors may be found in the AUTHORS | 6 ;  tree. All contributing project authors may be found in the AUTHORS | 
| 7 ;  file in the root of the source tree. | 7 ;  file in the root of the source tree. | 
| 8 ; | 8 ; | 
| 9 | 9 | 
| 10 | 10 | 
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" | 
| 12 %include "asm_enc_offsets.asm" | 12 %include "asm_enc_offsets.asm" | 
| 13 | 13 | 
| 14 | 14 | 
| 15 ; void vp8_regular_quantize_b_sse2 | arg | 15 ; void vp8_regular_quantize_b_sse2 | arg | 
| 16 ;  (BLOCK  *b,                     |  0 | 16 ;  (BLOCK  *b,                     |  0 | 
| 17 ;   BLOCKD *d)                     |  1 | 17 ;   BLOCKD *d)                     |  1 | 
| 18 | 18 | 
| 19 global sym(vp8_regular_quantize_b_sse2) | 19 global sym(vp8_regular_quantize_b_sse2) | 
| 20 sym(vp8_regular_quantize_b_sse2): | 20 sym(vp8_regular_quantize_b_sse2): | 
| 21     push        rbp | 21     push        rbp | 
| 22     mov         rbp, rsp | 22     mov         rbp, rsp | 
| 23     SAVE_XMM | 23     SAVE_XMM 7 | 
| 24     GET_GOT     rbx | 24     GET_GOT     rbx | 
| 25     push        rsi |  | 
| 26 | 25 | 
| 27 %if ABI_IS_32BIT | 26 %if ABI_IS_32BIT | 
| 28     push        rdi | 27     push        rdi | 
|  | 28     push        rsi | 
| 29 %else | 29 %else | 
| 30   %ifidn __OUTPUT_FORMAT__,x64 | 30   %ifidn __OUTPUT_FORMAT__,x64 | 
| 31     push        rdi | 31     push        rdi | 
|  | 32     push        rsi | 
| 32   %endif | 33   %endif | 
| 33 %endif | 34 %endif | 
| 34 | 35 | 
| 35     ALIGN_STACK 16, rax | 36     ALIGN_STACK 16, rax | 
| 36     %define BLOCKD_d          0  ;  8 | 37     %define zrun_zbin_boost   0  ;  8 | 
| 37     %define zrun_zbin_boost   8  ;  8 | 38     %define abs_minus_zbin    8  ; 32 | 
| 38     %define abs_minus_zbin    16 ; 32 | 39     %define temp_qcoeff       40 ; 32 | 
| 39     %define temp_qcoeff       48 ; 32 | 40     %define qcoeff            72 ; 32 | 
| 40     %define qcoeff            80 ; 32 | 41     %define stack_size        104 | 
| 41     %define stack_size        112 |  | 
| 42     sub         rsp, stack_size | 42     sub         rsp, stack_size | 
| 43     ; end prolog | 43     ; end prolog | 
| 44 | 44 | 
| 45 %if ABI_IS_32BIT | 45 %if ABI_IS_32BIT | 
| 46     mov         rdi, arg(0) | 46     mov         rdi, arg(0)                 ; BLOCK *b | 
|  | 47     mov         rsi, arg(1)                 ; BLOCKD *d | 
| 47 %else | 48 %else | 
| 48   %ifidn __OUTPUT_FORMAT__,x64 | 49   %ifidn __OUTPUT_FORMAT__,x64 | 
| 49     mov         rdi, rcx                    ; BLOCK *b | 50     mov         rdi, rcx                    ; BLOCK *b | 
| 50     mov         [rsp + BLOCKD_d], rdx | 51     mov         rsi, rdx                    ; BLOCKD *d | 
| 51   %else | 52   %else | 
| 52     ;mov         rdi, rdi                    ; BLOCK *b | 53     ;mov         rdi, rdi                    ; BLOCK *b | 
| 53     mov         [rsp + BLOCKD_d], rsi | 54     ;mov         rsi, rsi                    ; BLOCKD *d | 
| 54   %endif | 55   %endif | 
| 55 %endif | 56 %endif | 
| 56 | 57 | 
| 57     mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr | 58     mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr | 
| 58     mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr | 59     mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr | 
| 59     movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value | 60     movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value | 
| 60 | 61 | 
| 61     ; z | 62     ; z | 
| 62     movdqa      xmm0, [rdx] | 63     movdqa      xmm0, [rdx] | 
| 63     movdqa      xmm4, [rdx + 16] | 64     movdqa      xmm4, [rdx + 16] | 
| (...skipping 54 matching lines...) | |
| 118     paddw       xmm5, xmm7 | 119     paddw       xmm5, xmm7 | 
| 119 | 120 | 
| 120     movdqa      [rsp + temp_qcoeff], xmm1 | 121     movdqa      [rsp + temp_qcoeff], xmm1 | 
| 121     movdqa      [rsp + temp_qcoeff + 16], xmm5 | 122     movdqa      [rsp + temp_qcoeff + 16], xmm5 | 
| 122 | 123 | 
| 123     pxor        xmm6, xmm6 | 124     pxor        xmm6, xmm6 | 
| 124     ; zero qcoeff | 125     ; zero qcoeff | 
| 125     movdqa      [rsp + qcoeff], xmm6 | 126     movdqa      [rsp + qcoeff], xmm6 | 
| 126     movdqa      [rsp + qcoeff + 16], xmm6 | 127     movdqa      [rsp + qcoeff + 16], xmm6 | 
| 127 | 128 | 
| 128     mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr | 129     mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr | 
| 129     mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr | 130     mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr | 
| 130     mov         [rsp + zrun_zbin_boost], rsi | 131     mov         [rsp + zrun_zbin_boost], rdx | 
| 131 | 132 | 
| 132 %macro ZIGZAG_LOOP 1 | 133 %macro ZIGZAG_LOOP 1 | 
| 133     movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc |  | 
| 134 |  | 
| 135     ; x | 134     ; x | 
| 136     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2] | 135     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] | 
| 137 | 136 | 
| 138     ; if (x >= zbin) | 137     ; if (x >= zbin) | 
| 139     sub         cx, WORD PTR[rsi]           ; x - zbin | 138     sub         cx, WORD PTR[rdx]           ; x - zbin | 
| 140     lea         rsi, [rsi + 2]              ; zbin_boost_ptr++ | 139     lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ | 
| 141     jl          rq_zigzag_loop_%1           ; x < zbin | 140     jl          rq_zigzag_loop_%1           ; x < zbin | 
| 142 | 141 | 
| 143     movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2] | 142     movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] | 
| 144 | 143 | 
| 145     ; downshift by quant_shift[rdx] | 144     ; downshift by quant_shift[rc] | 
| 146     movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc] | 145     movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc] | 
| 147     sar         edi, cl                     ; also sets Z bit | 146     sar         edi, cl                     ; also sets Z bit | 
| 148     je          rq_zigzag_loop_%1           ; !y | 147     je          rq_zigzag_loop_%1           ; !y | 
| 149     mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] | 148     mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] | 
| 150     mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost | 149     mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost | 
| 151 rq_zigzag_loop_%1: | 150 rq_zigzag_loop_%1: | 
| 152 %endmacro | 151 %endmacro | 
| 153 ZIGZAG_LOOP 0 | 152 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c | 
| 154 ZIGZAG_LOOP 1 | 153 ZIGZAG_LOOP  0 | 
| 155 ZIGZAG_LOOP 2 | 154 ZIGZAG_LOOP  1 | 
| 156 ZIGZAG_LOOP 3 | 155 ZIGZAG_LOOP  4 | 
| 157 ZIGZAG_LOOP 4 | 156 ZIGZAG_LOOP  8 | 
| 158 ZIGZAG_LOOP 5 | 157 ZIGZAG_LOOP  5 | 
| 159 ZIGZAG_LOOP 6 | 158 ZIGZAG_LOOP  2 | 
| 160 ZIGZAG_LOOP 7 | 159 ZIGZAG_LOOP  3 | 
| 161 ZIGZAG_LOOP 8 | 160 ZIGZAG_LOOP  6 | 
| 162 ZIGZAG_LOOP 9 | 161 ZIGZAG_LOOP  9 | 
| 163 ZIGZAG_LOOP 10 |  | 
| 164 ZIGZAG_LOOP 11 |  | 
| 165 ZIGZAG_LOOP 12 | 162 ZIGZAG_LOOP 12 | 
| 166 ZIGZAG_LOOP 13 | 163 ZIGZAG_LOOP 13 | 
|  | 164 ZIGZAG_LOOP 10 | 
|  | 165 ZIGZAG_LOOP  7 | 
|  | 166 ZIGZAG_LOOP 11 | 
| 167 ZIGZAG_LOOP 14 | 167 ZIGZAG_LOOP 14 | 
| 168 ZIGZAG_LOOP 15 | 168 ZIGZAG_LOOP 15 | 
| 169 | 169 | 
| 170     movdqa      xmm2, [rsp + qcoeff] | 170     movdqa      xmm2, [rsp + qcoeff] | 
| 171     movdqa      xmm3, [rsp + qcoeff + 16] | 171     movdqa      xmm3, [rsp + qcoeff + 16] | 
| 172 | 172 | 
| 173 %if ABI_IS_32BIT | 173     mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr | 
| 174     mov         rdi, arg(1) | 174     mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr | 
| 175 %else |  | 
| 176     mov         rdi, [rsp + BLOCKD_d] |  | 
| 177 %endif |  | 
| 178 |  | 
| 179     mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr |  | 
| 180     mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr |  | 
| 181 | 175 | 
| 182     ; y ^ sz | 176     ; y ^ sz | 
| 183     pxor        xmm2, xmm0 | 177     pxor        xmm2, xmm0 | 
| 184     pxor        xmm3, xmm4 | 178     pxor        xmm3, xmm4 | 
| 185     ; x = (y ^ sz) - sz | 179     ; x = (y ^ sz) - sz | 
| 186     psubw       xmm2, xmm0 | 180     psubw       xmm2, xmm0 | 
| 187     psubw       xmm3, xmm4 | 181     psubw       xmm3, xmm4 | 
| 188 | 182 | 
| 189     ; dequant | 183     ; dequant | 
| 190     movdqa      xmm0, [rcx] | 184     movdqa      xmm0, [rcx] | 
| 191     movdqa      xmm1, [rcx + 16] | 185     movdqa      xmm1, [rcx + 16] | 
| 192 | 186 | 
| 193     mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr | 187     mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr | 
| 194 | 188 | 
| 195     pmullw      xmm0, xmm2 | 189     pmullw      xmm0, xmm2 | 
| 196     pmullw      xmm1, xmm3 | 190     pmullw      xmm1, xmm3 | 
| 197 | 191 | 
| 198     movdqa      [rcx], xmm2        ; store qcoeff | 192     movdqa      [rcx], xmm2        ; store qcoeff | 
| 199     movdqa      [rcx + 16], xmm3 | 193     movdqa      [rcx + 16], xmm3 | 
| 200     movdqa      [rsi], xmm0        ; store dqcoeff | 194     movdqa      [rdi], xmm0        ; store dqcoeff | 
| 201     movdqa      [rsi + 16], xmm1 | 195     movdqa      [rdi + 16], xmm1 | 
| 202 | 196 | 
| 203     ; select the last value (in zig_zag order) for EOB | 197     ; select the last value (in zig_zag order) for EOB | 
| 204     pcmpeqw     xmm2, xmm6 | 198     pcmpeqw     xmm2, xmm6 | 
| 205     pcmpeqw     xmm3, xmm6 | 199     pcmpeqw     xmm3, xmm6 | 
| 206     ; ! | 200     ; ! | 
| 207     pcmpeqw     xmm6, xmm6 | 201     pcmpeqw     xmm6, xmm6 | 
| 208     pxor        xmm2, xmm6 | 202     pxor        xmm2, xmm6 | 
| 209     pxor        xmm3, xmm6 | 203     pxor        xmm3, xmm6 | 
| 210     ; mask inv_zig_zag | 204     ; mask inv_zig_zag | 
| 211     pand        xmm2, [GLOBAL(inv_zig_zag)] | 205     pand        xmm2, [GLOBAL(inv_zig_zag)] | 
| 212     pand        xmm3, [GLOBAL(inv_zig_zag + 16)] | 206     pand        xmm3, [GLOBAL(inv_zig_zag + 16)] | 
| 213     ; select the max value | 207     ; select the max value | 
| 214     pmaxsw      xmm2, xmm3 | 208     pmaxsw      xmm2, xmm3 | 
| 215     pshufd      xmm3, xmm2, 00001110b | 209     pshufd      xmm3, xmm2, 00001110b | 
| 216     pmaxsw      xmm2, xmm3 | 210     pmaxsw      xmm2, xmm3 | 
| 217     pshuflw     xmm3, xmm2, 00001110b | 211     pshuflw     xmm3, xmm2, 00001110b | 
| 218     pmaxsw      xmm2, xmm3 | 212     pmaxsw      xmm2, xmm3 | 
| 219     pshuflw     xmm3, xmm2, 00000001b | 213     pshuflw     xmm3, xmm2, 00000001b | 
| 220     pmaxsw      xmm2, xmm3 | 214     pmaxsw      xmm2, xmm3 | 
| 221     movd        eax, xmm2 | 215     movd        eax, xmm2 | 
| 222     and         eax, 0xff | 216     and         eax, 0xff | 
| 223     mov         [rdi + vp8_blockd_eob], eax | 217     mov         [rsi + vp8_blockd_eob], eax | 
| 224 | 218 | 
| 225     ; begin epilog | 219     ; begin epilog | 
| 226     add         rsp, stack_size | 220     add         rsp, stack_size | 
| 227     pop         rsp | 221     pop         rsp | 
| 228 %if ABI_IS_32BIT | 222 %if ABI_IS_32BIT | 
|  | 223     pop         rsi | 
| 229     pop         rdi | 224     pop         rdi | 
| 230 %else | 225 %else | 
| 231   %ifidn __OUTPUT_FORMAT__,x64 | 226   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 227     pop         rsi | 
| 232     pop         rdi | 228     pop         rdi | 
| 233   %endif | 229   %endif | 
| 234 %endif | 230 %endif | 
| 235     pop         rsi |  | 
| 236     RESTORE_GOT | 231     RESTORE_GOT | 
| 237     RESTORE_XMM | 232     RESTORE_XMM | 
| 238     pop         rbp | 233     pop         rbp | 
| 239     ret | 234     ret | 
| 240 | 235 | 
| 241 ; int vp8_fast_quantize_b_impl_sse2 | arg | 236 ; void vp8_fast_quantize_b_sse2 | arg | 
| 242 ;  (short *coeff_ptr,               |  0 | 237 ;  (BLOCK  *b,                  |  0 | 
| 243 ;   short *qcoeff_ptr,              |  1 | 238 ;   BLOCKD *d)                  |  1 | 
| 244 ;   short *dequant_ptr,             |  2 |  | 
| 245 ;   short *inv_scan_order,          |  3 |  | 
| 246 ;   short *round_ptr,               |  4 |  | 
| 247 ;   short *quant_ptr,               |  5 |  | 
| 248 ;   short *dqcoeff_ptr)             |  6 |  | 
| 249 | 239 | 
| 250 global sym(vp8_fast_quantize_b_impl_sse2) | 240 global sym(vp8_fast_quantize_b_sse2) | 
| 251 sym(vp8_fast_quantize_b_impl_sse2): | 241 sym(vp8_fast_quantize_b_sse2): | 
| 252     push        rbp | 242     push        rbp | 
| 253     mov         rbp, rsp | 243     mov         rbp, rsp | 
| 254     SHADOW_ARGS_TO_STACK 7 | 244     GET_GOT     rbx | 
|  | 245 | 
|  | 246 %if ABI_IS_32BIT | 
|  | 247     push        rdi | 
| 255     push        rsi | 248     push        rsi | 
|  | 249 %else | 
|  | 250   %ifidn __OUTPUT_FORMAT__,x64 | 
| 256     push        rdi | 251     push        rdi | 
|  | 252     push        rsi | 
|  | 253   %else | 
|  | 254     ; these registers are used for passing arguments | 
|  | 255   %endif | 
|  | 256 %endif | 
|  | 257 | 
| 257     ; end prolog | 258     ; end prolog | 
| 258 | 259 | 
| 259     mov         rdx, arg(0)                 ;coeff_ptr | 260 %if ABI_IS_32BIT | 
| 260     mov         rcx, arg(2)                 ;dequant_ptr | 261     mov         rdi, arg(0)                 ; BLOCK *b | 
| 261     mov         rdi, arg(4)                 ;round_ptr | 262     mov         rsi, arg(1)                 ; BLOCKD *d | 
| 262     mov         rsi, arg(5)                 ;quant_ptr | 263 %else | 
|  | 264   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 265     mov         rdi, rcx                    ; BLOCK *b | 
|  | 266     mov         rsi, rdx                    ; BLOCKD *d | 
|  | 267   %else | 
|  | 268     ;mov         rdi, rdi                    ; BLOCK *b | 
|  | 269     ;mov         rsi, rsi                    ; BLOCKD *d | 
|  | 270   %endif | 
|  | 271 %endif | 
| 263 | 272 | 
| 264     movdqa      xmm0, XMMWORD PTR[rdx] | 273     mov         rax, [rdi + vp8_block_coeff] | 
| 265     movdqa      xmm4, XMMWORD PTR[rdx + 16] | 274     mov         rcx, [rdi + vp8_block_round] | 
|  | 275     mov         rdx, [rdi + vp8_block_quant_fast] | 
| 266 | 276 | 
| 267     movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo | 277     ; z = coeff | 
| 268     movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi | 278     movdqa      xmm0, [rax] | 
|  | 279     movdqa      xmm4, [rax + 16] | 
| 269 | 280 | 
|  | 281     ; dup z so we can save sz | 
| 270     movdqa      xmm1, xmm0 | 282     movdqa      xmm1, xmm0 | 
| 271     movdqa      xmm5, xmm4 | 283     movdqa      xmm5, xmm4 | 
| 272 | 284 | 
| 273     psraw       xmm0, 15                    ;sign of z (aka sz) | 285     ; sz = z >> 15 | 
| 274     psraw       xmm4, 15                    ;sign of z (aka sz) | 286     psraw       xmm0, 15 | 
|  | 287     psraw       xmm4, 15 | 
| 275 | 288 | 
| 276     pxor        xmm1, xmm0 | 289     ; x = abs(z) = (z ^ sz) - sz | 
| 277     pxor        xmm5, xmm4 |  | 
| 278     psubw       xmm1, xmm0                  ;x = abs(z) |  | 
| 279     psubw       xmm5, xmm4                  ;x = abs(z) |  | 
| 280 |  | 
| 281     paddw       xmm1, xmm2 |  | 
| 282     paddw       xmm5, xmm3 |  | 
| 283 |  | 
| 284     pmulhw      xmm1, XMMWORD PTR[rsi] |  | 
| 285     pmulhw      xmm5, XMMWORD PTR[rsi + 16] |  | 
| 286 |  | 
| 287     mov         rdi, arg(1)                 ;qcoeff_ptr |  | 
| 288     mov         rsi, arg(6)                 ;dqcoeff_ptr |  | 
| 289 |  | 
| 290     movdqa      xmm2, XMMWORD PTR[rcx] |  | 
| 291     movdqa      xmm3, XMMWORD PTR[rcx + 16] |  | 
| 292 |  | 
| 293     pxor        xmm1, xmm0 | 290     pxor        xmm1, xmm0 | 
| 294     pxor        xmm5, xmm4 | 291     pxor        xmm5, xmm4 | 
| 295     psubw       xmm1, xmm0 | 292     psubw       xmm1, xmm0 | 
| 296     psubw       xmm5, xmm4 | 293     psubw       xmm5, xmm4 | 
| 297 | 294 | 
| 298     movdqa      XMMWORD PTR[rdi], xmm1 | 295     ; x += round | 
| 299     movdqa      XMMWORD PTR[rdi + 16], xmm5 | 296     paddw       xmm1, [rcx] | 
|  | 297     paddw       xmm5, [rcx + 16] | 
| 300 | 298 | 
| 301     pmullw      xmm2, xmm1 | 299     mov         rax, [rsi + vp8_blockd_qcoeff] | 
| 302     pmullw      xmm3, xmm5 | 300     mov         rcx, [rsi + vp8_blockd_dequant] | 
|  | 301     mov         rdi, [rsi + vp8_blockd_dqcoeff] | 
| 303 | 302 | 
| 304     mov         rdi, arg(3)                 ;inv_scan_order | 303     ; y = x * quant >> 16 | 
|  | 304     pmulhw      xmm1, [rdx] | 
|  | 305     pmulhw      xmm5, [rdx + 16] | 
| 305 | 306 | 
| 306     ; Start with 16 | 307     ; x = (y ^ sz) - sz | 
|  | 308     pxor        xmm1, xmm0 | 
|  | 309     pxor        xmm5, xmm4 | 
|  | 310     psubw       xmm1, xmm0 | 
|  | 311     psubw       xmm5, xmm4 | 
|  | 312 | 
|  | 313     ; qcoeff = x | 
|  | 314     movdqa      [rax], xmm1 | 
|  | 315     movdqa      [rax + 16], xmm5 | 
|  | 316 | 
|  | 317     ; x * dequant | 
|  | 318     movdqa      xmm2, xmm1 | 
|  | 319     movdqa      xmm3, xmm5 | 
|  | 320     pmullw      xmm2, [rcx] | 
|  | 321     pmullw      xmm3, [rcx + 16] | 
|  | 322 | 
|  | 323     ; dqcoeff = x * dequant | 
|  | 324     movdqa      [rdi], xmm2 | 
|  | 325     movdqa      [rdi + 16], xmm3 | 
|  | 326 | 
| 307     pxor        xmm4, xmm4                  ;clear all bits | 327     pxor        xmm4, xmm4                  ;clear all bits | 
| 308     pcmpeqw     xmm1, xmm4 | 328     pcmpeqw     xmm1, xmm4 | 
| 309     pcmpeqw     xmm5, xmm4 | 329     pcmpeqw     xmm5, xmm4 | 
| 310 | 330 | 
| 311     pcmpeqw     xmm4, xmm4                  ;set all bits | 331     pcmpeqw     xmm4, xmm4                  ;set all bits | 
| 312     pxor        xmm1, xmm4 | 332     pxor        xmm1, xmm4 | 
| 313     pxor        xmm5, xmm4 | 333     pxor        xmm5, xmm4 | 
| 314 | 334 | 
| 315     pand        xmm1, XMMWORD PTR[rdi] | 335     pand        xmm1, [GLOBAL(inv_zig_zag)] | 
| 316     pand        xmm5, XMMWORD PTR[rdi+16] | 336     pand        xmm5, [GLOBAL(inv_zig_zag + 16)] | 
| 317 | 337 | 
| 318     pmaxsw      xmm1, xmm5 | 338     pmaxsw      xmm1, xmm5 | 
| 319 | 339 | 
| 320     ; now down to 8 | 340     ; now down to 8 | 
| 321     pshufd      xmm5, xmm1, 00001110b | 341     pshufd      xmm5, xmm1, 00001110b | 
| 322 | 342 | 
| 323     pmaxsw      xmm1, xmm5 | 343     pmaxsw      xmm1, xmm5 | 
| 324 | 344 | 
| 325     ; only 4 left | 345     ; only 4 left | 
| 326     pshuflw     xmm5, xmm1, 00001110b | 346     pshuflw     xmm5, xmm1, 00001110b | 
| 327 | 347 | 
| 328     pmaxsw      xmm1, xmm5 | 348     pmaxsw      xmm1, xmm5 | 
| 329 | 349 | 
| 330     ; okay, just 2! | 350     ; okay, just 2! | 
| 331     pshuflw     xmm5, xmm1, 00000001b | 351     pshuflw     xmm5, xmm1, 00000001b | 
| 332 | 352 | 
| 333     pmaxsw      xmm1, xmm5 | 353     pmaxsw      xmm1, xmm5 | 
| 334 | 354 | 
| 335     movd        rax, xmm1 | 355     movd        eax, xmm1 | 
| 336     and         rax, 0xff | 356     and         eax, 0xff | 
| 337 | 357     mov         [rsi + vp8_blockd_eob], eax | 
| 338     movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff |  | 
| 339     movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff |  | 
| 340 | 358 | 
| 341     ; begin epilog | 359     ; begin epilog | 
|  | 360 %if ABI_IS_32BIT | 
|  | 361     pop         rsi | 
| 342     pop         rdi | 362     pop         rdi | 
|  | 363 %else | 
|  | 364   %ifidn __OUTPUT_FORMAT__,x64 | 
| 343     pop         rsi | 365     pop         rsi | 
| 344     UNSHADOW_ARGS | 366     pop         rdi | 
|  | 367   %endif | 
|  | 368 %endif | 
|  | 369 | 
|  | 370     RESTORE_GOT | 
| 345     pop         rbp | 371     pop         rbp | 
| 346     ret | 372     ret | 
| 347 | 373 | 
| 348 SECTION_RODATA | 374 SECTION_RODATA | 
| 349 align 16 | 375 align 16 | 
| 350 zig_zag: |  | 
| 351   dw 0x0000, 0x0001, 0x0004, 0x0008 |  | 
| 352   dw 0x0005, 0x0002, 0x0003, 0x0006 |  | 
| 353   dw 0x0009, 0x000c, 0x000d, 0x000a |  | 
| 354   dw 0x0007, 0x000b, 0x000e, 0x000f |  | 
| 355 inv_zig_zag: | 376 inv_zig_zag: | 
| 356   dw 0x0001, 0x0002, 0x0006, 0x0007 | 377   dw 0x0001, 0x0002, 0x0006, 0x0007 | 
| 357   dw 0x0003, 0x0005, 0x0008, 0x000d | 378   dw 0x0003, 0x0005, 0x0008, 0x000d | 
| 358   dw 0x0004, 0x0009, 0x000c, 0x000e | 379   dw 0x0004, 0x0009, 0x000c, 0x000e | 
| 359   dw 0x000a, 0x000b, 0x000f, 0x0010 | 380   dw 0x000a, 0x000b, 0x000f, 0x0010 | 
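
For reference, a rough scalar sketch of what `vp8_regular_quantize_b_sse2` computes, written against the accessors visible in the diff (`vp8_block_coeff`, `vp8_block_zbin`, `vp8_block_zrun_zbin_boost`, ...). Plain arrays stand in for the `BLOCK`/`BLOCKD` fields, the `zig_zag` order matches the hard-coded `ZIGZAG_LOOP` arguments in the NEW column, and the `y = ...` quantization step follows the scalar C reference since those instructions fall in the skipped lines; treat it as an illustration, not the project's exact source.

```c
#include <string.h>

/* Raster position of each scan position: the zig_zag table the OLD code
 * read from .rodata and the NEW code hard-codes as ZIGZAG_LOOP arguments. */
static const int zig_zag[16] = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

void regular_quantize_sketch(const short coeff[16], const short zbin[16],
                             short zbin_extra, const short zrun_zbin_boost[16],
                             const short round[16], const short quant[16],
                             const unsigned char quant_shift[16],
                             const short dequant[16],
                             short qcoeff[16], short dqcoeff[16], int *eob)
{
    const short *boost = zrun_zbin_boost;
    int i, last = -1;

    memset(qcoeff, 0, 16 * sizeof(*qcoeff));    /* "zero qcoeff" above */
    memset(dqcoeff, 0, 16 * sizeof(*dqcoeff));

    for (i = 0; i < 16; i++) {
        int rc = zig_zag[i];
        int z  = coeff[rc];
        int sz = z >> 31;               /* sign mask (psraw xmm, 15)        */
        int x  = (z ^ sz) - sz;         /* x = abs(z)                       */

        /* zbin grows with the current zero run: the "sub cx, [rdx]" /
         * "jl" pair in ZIGZAG_LOOP; boost advances every iteration.       */
        if (x >= zbin[rc] + *boost++ + zbin_extra) {
            int y;
            x += round[rc];
            y  = (((x * quant[rc]) >> 16) + x) >> quant_shift[rc];
            qcoeff[rc]  = (short)((y ^ sz) - sz);      /* restore the sign */
            dqcoeff[rc] = (short)(qcoeff[rc] * dequant[rc]);
            if (y) {
                last  = i;                  /* eob is a scan-order position */
                boost = zrun_zbin_boost;    /* nonzero coeff resets the run */
            }
        }
    }
    *eob = last + 1;
}
```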
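The fast path maps onto a sketch the same way: the NEW column's comments (`x = abs(z) = (z ^ sz) - sz`, `y = x * quant >> 16`, `x = (y ^ sz) - sz`) name each step, and the eob comes from the `pcmpeqw`/`pand`/`pmaxsw` tail masking against the `inv_zig_zag` table kept in `.rodata`. Again a hedged sketch with array parameters in place of the `BLOCK`/`BLOCKD` fields:

```c
/* Scan position + 1 for each raster position: the inv_zig_zag table from
 * .rodata at the end of the file. */
static const short inv_zig_zag[16] = {
    1, 2, 6, 7, 3, 5, 8, 13, 4, 9, 12, 14, 10, 11, 15, 16
};

void fast_quantize_sketch(const short coeff[16], const short round[16],
                          const short quant_fast[16], const short dequant[16],
                          short qcoeff[16], short dqcoeff[16], int *eob)
{
    int i, max = 0;

    for (i = 0; i < 16; i++) {
        int z  = coeff[i];
        int sz = z >> 31;                         /* sz = z >> 15 on words  */
        int x  = (z ^ sz) - sz;                   /* x = abs(z)             */
        int y  = ((x + round[i]) * quant_fast[i]) >> 16;   /* pmulhw        */

        x = (y ^ sz) - sz;                        /* x = (y ^ sz) - sz      */
        qcoeff[i]  = (short)x;
        dqcoeff[i] = (short)(x * dequant[i]);

        /* nonzero lanes keep inv_zig_zag[i]; the pmaxsw reduction takes
         * the max, i.e. 1 + the last nonzero position in scan order.      */
        if (x != 0 && inv_zig_zag[i] > max)
            max = inv_zig_zag[i];
    }
    *eob = max;
}
```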