| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license and patent | 4 ; Use of this source code is governed by a BSD-style license and patent |
| 5 ; grant that can be found in the LICENSE file in the root of the source | 5 ; grant that can be found in the LICENSE file in the root of the source |
| 6 ; tree. All contributing project authors may be found in the AUTHORS | 6 ; tree. All contributing project authors may be found in the AUTHORS |
| 7 ; file in the root of the source tree. | 7 ; file in the root of the source tree. |
| 8 ; | 8 ; |
| 9 | 9 |
| 10 | 10 |
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" |
| | 12 %include "asm_enc_offsets.asm" |
| 12 | 13 |
| 13 | 14 |
| 14 ;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr | 15 ; void vp8_fast_quantize_b_ssse3 | arg |
| 15 ; short *qcoeff_ptr,short *dequant_ptr, | 16 ; (BLOCK *b, | 0 |
| 16 ; short *round_ptr, | 17 ; BLOCKD *d) | 1 |
| 17 ; short *quant_ptr, short *dqcoeff_ptr); | |
| 18 ; | 18 ; |
| 19 global sym(vp8_fast_quantize_b_impl_ssse3) | 19 |
| 20 sym(vp8_fast_quantize_b_impl_ssse3): | 20 global sym(vp8_fast_quantize_b_ssse3) |
| | 21 sym(vp8_fast_quantize_b_ssse3): |
| 21 push rbp | 22 push rbp |
| 22 mov rbp, rsp | 23 mov rbp, rsp |
| 23 SHADOW_ARGS_TO_STACK 6 | |
| 24 GET_GOT rbx | 24 GET_GOT rbx |
| | 25 |
| | 26 %if ABI_IS_32BIT |
| | 27 push rdi |
| 25 push rsi | 28 push rsi |
| | 29 %else |
| | 30 %ifidn __OUTPUT_FORMAT__,x64 |
| 26 push rdi | 31 push rdi |
| | 32 push rsi |
| | 33 %endif |
| | 34 %endif |
| 27 ; end prolog | 35 ; end prolog |
| 28 | 36 |
| 29 mov rdx, arg(0) ;coeff_ptr | 37 %if ABI_IS_32BIT |
| 30 mov rdi, arg(3) ;round_ptr | 38 mov rdi, arg(0) ; BLOCK *b |
| 31 mov rsi, arg(4) ;quant_ptr | 39 mov rsi, arg(1) ; BLOCKD *d |
| | 40 %else |
| | 41 %ifidn __OUTPUT_FORMAT__,x64 |
| | 42 mov rdi, rcx ; BLOCK *b |
| | 43 mov rsi, rdx ; BLOCKD *d |
| | 44 %else |
| | 45 ;mov rdi, rdi ; BLOCK *b |
| | 46 ;mov rsi, rsi ; BLOCKD *d |
| | 47 %endif |
| | 48 %endif |
| 32 | 49 |
| 33 movdqa xmm0, [rdx] | 50 mov rax, [rdi + vp8_block_coeff] |
| 34 movdqa xmm4, [rdx + 16] | 51 mov rcx, [rdi + vp8_block_round] |
| | 52 mov rdx, [rdi + vp8_block_quant_fast] |
| 35 | 53 |
| 36 movdqa xmm2, [rdi] ;round lo | 54 ; coeff |
| 37 movdqa xmm3, [rdi + 16] ;round hi | 55 movdqa xmm0, [rax] |
| | 56 movdqa xmm4, [rax + 16] |
| | 57 |
| | 58 ; round |
| | 59 movdqa xmm2, [rcx] |
| | 60 movdqa xmm3, [rcx + 16] |
| 38 | 61 |
| 39 movdqa xmm1, xmm0 | 62 movdqa xmm1, xmm0 |
| 40 movdqa xmm5, xmm4 | 63 movdqa xmm5, xmm4 |
| 41 | 64 |
| 42 psraw xmm0, 15 ;sign of z (aka sz) | 65 ; sz = z >> 15 |
| 43 psraw xmm4, 15 ;sign of z (aka sz) | 66 psraw xmm0, 15 |
| | 67 psraw xmm4, 15 |
| 44 | 68 |
| 45 pabsw xmm1, xmm1 | 69 pabsw xmm1, xmm1 |
| 46 pabsw xmm5, xmm5 | 70 pabsw xmm5, xmm5 |
| 47 | 71 |
| 48 paddw xmm1, xmm2 | 72 paddw xmm1, xmm2 |
| 49 paddw xmm5, xmm3 | 73 paddw xmm5, xmm3 |
| 50 | 74 |
| 51 pmulhw xmm1, [rsi] | 75 ; quant_fast |
| 52 pmulhw xmm5, [rsi + 16] | 76 pmulhw xmm1, [rdx] |
| | 77 pmulhw xmm5, [rdx + 16] |
| 53 | 78 |
| 54 mov rdi, arg(1) ;qcoeff_ptr | 79 mov rax, [rsi + vp8_blockd_qcoeff] |
| 55 mov rcx, arg(2) ;dequant_ptr | 80 mov rdi, [rsi + vp8_blockd_dequant] |
| 56 mov rsi, arg(5) ;dqcoeff_ptr | 81 mov rcx, [rsi + vp8_blockd_dqcoeff] |
| 57 | 82 |
| 58 pxor xmm1, xmm0 | 83 pxor xmm1, xmm0 |
| 59 pxor xmm5, xmm4 | 84 pxor xmm5, xmm4 |
| 60 psubw xmm1, xmm0 | 85 psubw xmm1, xmm0 |
| 61 psubw xmm5, xmm4 | 86 psubw xmm5, xmm4 |
| 62 | 87 |
| 63 movdqa [rdi], xmm1 | 88 movdqa [rax], xmm1 |
| 64 movdqa [rdi + 16], xmm5 | 89 movdqa [rax + 16], xmm5 |
| 65 | 90 |
| 66 movdqa xmm2, [rcx] | 91 movdqa xmm2, [rdi] |
| 67 movdqa xmm3, [rcx + 16] | 92 movdqa xmm3, [rdi + 16] |
| 68 | 93 |
| 69 pxor xmm4, xmm4 | 94 pxor xmm4, xmm4 |
| 70 pmullw xmm2, xmm1 | 95 pmullw xmm2, xmm1 |
| 71 pmullw xmm3, xmm5 | 96 pmullw xmm3, xmm5 |
| 72 | 97 |
| 73 pcmpeqw xmm1, xmm4 ;zero coeff mask (inverted below) | 98 pcmpeqw xmm1, xmm4 ;zero coeff mask (inverted below) |
| 74 pcmpeqw xmm5, xmm4 ;zero coeff mask (inverted below) | 99 pcmpeqw xmm5, xmm4 ;zero coeff mask (inverted below) |
| 75 packsswb xmm1, xmm5 | 100 packsswb xmm1, xmm5 |
| 76 pshufb xmm1, [ GLOBAL(zz_shuf)] | 101 pshufb xmm1, [GLOBAL(zz_shuf)] |
| 77 | 102 |
| 78 pmovmskb edx, xmm1 | 103 pmovmskb edx, xmm1 |
| 79 | 104 |
| 80 ; xor ecx, ecx | |
| 81 ; mov eax, -1 | |
| 82 ;find_eob_loop: | |
| 83 ; shr edx, 1 | |
| 84 ; jc fq_skip | |
| 85 ; mov eax, ecx | |
| 86 ;fq_skip: | |
| 87 ; inc ecx | |
| 88 ; cmp ecx, 16 | |
| 89 ; jne find_eob_loop | |
| 90 xor rdi, rdi | 105 xor rdi, rdi |
| 91 mov eax, -1 | 106 mov eax, -1 |
| 92 xor dx, ax ;flip the bits for bsr | 107 xor dx, ax ;flip the bits for bsr |
| 93 bsr eax, edx | 108 bsr eax, edx |
| 94 | 109 |
| 95 movdqa [rsi], xmm2 ;store dqcoeff | 110 movdqa [rcx], xmm2 ;store dqcoeff |
| 96 movdqa [rsi + 16], xmm3 ;store dqcoeff | 111 movdqa [rcx + 16], xmm3 ;store dqcoeff |
| 97 | 112 |
| 98 sub edi, edx ;check for all zeros in bit mask | 113 sub edi, edx ;check for all zeros in bit mask |
| 99 sar edi, 31 ;0 or -1 | 114 sar edi, 31 ;0 or -1 |
| 100 add eax, 1 | 115 add eax, 1 |
| 101 and eax, edi ;if the bit mask was all zero, | 116 and eax, edi ;if the bit mask was all zero, |
| 102 ;then eob = 0 | 117 ;then eob = 0 |
| | 118 mov [rsi + vp8_blockd_eob], eax |
| | 119 |
| 103 ; begin epilog | 120 ; begin epilog |
| | 121 %if ABI_IS_32BIT |
| | 122 pop rsi |
| 104 pop rdi | 123 pop rdi |
| | 124 %else |
| | 125 %ifidn __OUTPUT_FORMAT__,x64 |
| 105 pop rsi | 126 pop rsi |
| | 127 pop rdi |
| | 128 %endif |
| | 129 %endif |
| | 130 |
| 106 RESTORE_GOT | 131 RESTORE_GOT |
| 107 UNSHADOW_ARGS | |
| 108 pop rbp | 132 pop rbp |
| 109 ret | 133 ret |
| 110 | 134 |
| 111 SECTION_RODATA | 135 SECTION_RODATA |
| 112 align 16 | 136 align 16 |
| 113 zz_shuf: | 137 zz_shuf: |
| 114 db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 | 138 db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 |
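
The change replaces the six raw pointer arguments of vp8_fast_quantize_b_impl_ssse3 with the BLOCK/BLOCKD pair, reading each buffer through the offsets pulled in from asm_enc_offsets.asm and storing eob into the BLOCKD instead of returning it. Below is a minimal C sketch of the computation the routine performs, using assumed stand-in structs trimmed to just the fields those offsets imply (the real definitions live in vp8/encoder/block.h and vp8/common/blockd.h):

    /* Assumed stand-in structs, trimmed to the fields the offsets above
     * imply; not the real libvpx definitions. */
    typedef struct { short *coeff, *round, *quant_fast; } BLOCK;
    typedef struct { short *qcoeff, *dequant, *dqcoeff; int eob; } BLOCKD;

    /* Same byte pattern as zz_shuf above: scan position -> raster position. */
    static const int zigzag[16] = {
        0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
    };

    void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
    {
        int i, eob = -1;

        for (i = 0; i < 16; i++) {
            int rc = zigzag[i];
            int z  = b->coeff[rc];
            int sz = z >> 31;          /* 0 or -1: the sign of z */
            int x  = (z ^ sz) - sz;    /* abs(z); the asm uses pabsw */

            /* pmulhw keeps only the high 16 bits of each product */
            int y  = ((x + b->round[rc]) * b->quant_fast[rc]) >> 16;

            x = (y ^ sz) - sz;         /* restore the sign (pxor/psubw above) */
            d->qcoeff[rc]  = (short)x;
            d->dqcoeff[rc] = (short)(x * d->dequant[rc]);

            if (y)
                eob = i;               /* last nonzero coefficient, scan order */
        }

        d->eob = eob + 1;              /* 0 when everything quantized to zero */
    }

The SIMD path derives eob without that scan loop: pcmpeqw against zero flags the zero coefficients, pshufb with zz_shuf permutes the flags into zig-zag order, pmovmskb packs them into a 16-bit mask, and bsr on the inverted mask yields the index of the last nonzero coefficient. Because bsr leaves its destination undefined when the source is zero, the sub/sar/and sequence masks eax to force eob = 0 in the all-zero case without a branch.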