;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; QUANTIZE_4_COEFFS byte_offset
;
; Quantizes the four 16-bit coefficients located byte_offset bytes into each
; table and stores the matching qcoeff and dqcoeff words:
;
;   x       = abs(coeff)
;   x       = 0 where zbin > x (signed 16-bit compare)
;   qcoeff  = sign(coeff) * (((x + round) * quant) >> 16)
;   dqcoeff = low 16 bits of (qcoeff * dequant)
;
; Expects:  rsi = coeff_ptr, rdx = quant_ptr, rcx = round_ptr,
;           rdi = qcoeff_ptr
; Clobbers: rax, mm4, mm5, mm7
;-----------------------------------------------------------------------------
%macro QUANTIZE_4_COEFFS 1
    movq        mm4,        [rsi+%1]        ; mm4 = coeff

    mov         rax,        arg(1)          ;zbin_ptr
    movq        mm5,        [rax+%1]        ; mm5 = zbin

    movq        mm7,        mm4
    psraw       mm4,        15              ; mm4 = sign mask (0 or -1 per word)

    pxor        mm7,        mm4
    psubw       mm7,        mm4             ; mm7 = abs(coeff)

    pcmpgtw     mm5,        mm7             ; mm5 = -1 where zbin > abs(coeff)
    pandn       mm5,        mm7             ; mm5 = abs(coeff), or 0 below zbin

    paddw       mm5,        [rcx+%1]        ; + round
    pmulhuw     mm5,        [rdx+%1]        ; unsigned high word of (x * quant)

    pxor        mm5,        mm4
    psubw       mm5,        mm4             ; put the original sign back

    movq        [rdi+%1],   mm5             ; store qcoeff

    mov         rax,        arg(3)          ;dequant_ptr
    pmullw      mm5,        [rax+%1]        ; low word of (qcoeff * dequant)

    mov         rax,        arg(7)          ;dqcoeff_ptr
    movq        [rax+%1],   mm5             ; store dqcoeff
%endmacro

;-----------------------------------------------------------------------------
; ACCUMULATE_SCAN_BITS byte_offset
;
; For the four qcoeff words at byte_offset, adds the corresponding scan_mask
; word into the running dword sums in mm5 for every qcoeff that is nonzero.
;
; Expects:  rsi = qcoeff_ptr, rdi = scan_mask, mm5 = running sums,
;           mm6 = all ones, mm7 = zero
; Clobbers: mm0
;-----------------------------------------------------------------------------
%macro ACCUMULATE_SCAN_BITS 1
    movq        mm0,        [rsi+%1]        ; mm0 = qcoeff
    pcmpeqw     mm0,        mm7             ; -1 where qcoeff == 0
    pxor        mm0,        mm6             ; -1 where qcoeff != 0
    psrlw       mm0,        15              ;  1 where qcoeff != 0, else 0
    pmaddwd     mm0,        [rdi+%1]        ; pairwise sums of the selected masks
    paddd       mm5,        mm0
%endmacro

;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
;                                 short *qcoeff_ptr,short *dequant_ptr,
;                                 short *scan_mask, short *round_ptr,
;                                 short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes 16 coefficients (four groups of four 16-bit words), writing
; qcoeff_ptr[0..15] and dqcoeff_ptr[0..15], then derives the return value from
; the scan_mask entries of the nonzero quantized coefficients.
;
; Returns (rax): 0 when every quantized coefficient is zero, otherwise
;                1 + the index of the highest bit set in the low 16 bits of
;                the sum of scan_mask[i] over all i with qcoeff[i] != 0.
;                Presumably scan_mask[i] is a single bit marking the scan
;                position of coefficient i, which makes the result the
;                end-of-block position - confirm against the caller.
;
; Clobbers: rax, rcx, rdx, mm0, mm4-mm7, flags. rsi and rdi are saved and
;           restored.
; NOTE(review): no emms is executed before returning, so the MMX state is
;               left active; presumably the caller clears it - confirm.
global sym(vp9_fast_quantize_b_impl_mmx)
sym(vp9_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; end prolog

    ; Table pointers that stay live across all four groups. zbin_ptr,
    ; dequant_ptr and dqcoeff_ptr are reloaded into rax inside the macro.
    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rdx,        arg(6)          ;quant_ptr
    mov         rcx,        arg(5)          ;round_ptr
    mov         rdi,        arg(2)          ;qcoeff_ptr

    ; Each step covers 8 bytes, i.e. four 16-bit coefficients.
    QUANTIZE_4_COEFFS 0
    QUANTIZE_4_COEFFS 8
    QUANTIZE_4_COEFFS 16
    QUANTIZE_4_COEFFS 24

    ; Combine the scan_mask words of all nonzero quantized coefficients.
    ; The qcoeff values are read back from memory, as stored above.
    mov         rdi,        arg(4)          ;scan_mask
    mov         rsi,        arg(2)          ;qcoeff_ptr

    pxor        mm5,        mm5             ; running sums = 0
    pxor        mm7,        mm7             ; zero, for the == 0 compare
    pcmpeqw     mm6,        mm6             ; all ones, to invert the compare

    ACCUMULATE_SCAN_BITS 0
    ACCUMULATE_SCAN_BITS 8
    ACCUMULATE_SCAN_BITS 16
    ACCUMULATE_SCAN_BITS 24

    ; Fold the two dword lanes into the low dword of mm0.
    movq        mm0,        mm5
    psrlq       mm5,        32
    paddd       mm0,        mm5

    ; eob adjustment begins here
    movq        rcx,        mm0
    and         rcx,        0xffff          ; keep only the 16-bit combined mask

    xor         rdx,        rdx
    sub         rdx,        rcx             ; rdx=-rcx

    ; With rcx == 0 the bsr result is not defined; the final mask with rdx
    ; (which is 0 in that case) forces the return value to 0 regardless.
    bsr         rax,        rcx             ; index of the highest set bit
    inc         rax

    sar         rdx,        31              ; all ones if rcx != 0, else 0
    and         rax,        rdx
    ; Substitute the sse assembly for the old mmx mixed assembly/C. The
    ; following is kept as reference
    ;    movq        rcx,        mm0
    ;    bsr         rax,        rcx
    ;
    ;    mov         eob,        rax
    ;    mov         eee,        rcx
    ;
    ;if(eee==0)
    ;{
    ;    eob=-1;
    ;}
    ;else if(eee<0)
    ;{
    ;    eob=15;
    ;}
    ;d->eob = eob+1;

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret