| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| (...skipping 216 matching lines...) |
| 227 | 227 |
| 228 ; actual quantize loop - setup pointers, rounders, etc. | 228 ; actual quantize loop - setup pointers, rounders, etc. |
| 229 movifnidn coeffq, coeffmp | 229 movifnidn coeffq, coeffmp |
| 230 movifnidn ncoeffq, ncoeffmp | 230 movifnidn ncoeffq, ncoeffmp |
| 231 mov r2, dequantmp | 231 mov r2, dequantmp |
| 232 movifnidn zbinq, zbinmp | 232 movifnidn zbinq, zbinmp |
| 233 movifnidn roundq, roundmp | 233 movifnidn roundq, roundmp |
| 234 movifnidn quantq, quantmp | 234 movifnidn quantq, quantmp |
| 235 mova m1, [roundq] ; m1 = round | 235 mova m1, [roundq] ; m1 = round |
| 236 mova m2, [quantq] ; m2 = quant | 236 mova m2, [quantq] ; m2 = quant |
| 237 %ifidn %1, b_32x32 | 237 %ifidn %1, fp_32x32 |
| 238 ; TODO(jingning) to be continued with 32x32 quantization process | |
| 239 pcmpeqw m5, m5 | 238 pcmpeqw m5, m5 |
| 240 psrlw m5, 15 | 239 psrlw m5, 15 |
| 241 paddw m0, m5 | |
| 242 paddw m1, m5 | 240 paddw m1, m5 |
| 243 psrlw m0, 1 ; m0 = (m0 + 1) / 2 | |
| 244 psrlw m1, 1 ; m1 = (m1 + 1) / 2 | 241 psrlw m1, 1 ; m1 = (m1 + 1) / 2 |
| 245 %endif | 242 %endif |
| 246 mova m3, [r2q] ; m3 = dequant | 243 mova m3, [r2q] ; m3 = dequant |
| 247 mov r3, qcoeffmp | 244 mov r3, qcoeffmp |
| 248 mov r4, dqcoeffmp | 245 mov r4, dqcoeffmp |
| 249 mov r5, iscanmp | 246 mov r5, iscanmp |
| 250 %ifidn %1, b_32x32 | 247 %ifidn %1, fp_32x32 |
| 251 psllw m4, 1 | 248 psllw m2, 1 |
| 252 %endif | 249 %endif |
| 253 pxor m5, m5 ; m5 = dedicated zero | 250 pxor m5, m5 ; m5 = dedicated zero |
| 254 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob | 251 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob |
| 255 lea coeffq, [ coeffq+ncoeffq*2] | 252 lea coeffq, [ coeffq+ncoeffq*2] |
| 256 lea iscanq, [ iscanq+ncoeffq*2] | 253 lea iscanq, [ iscanq+ncoeffq*2] |
| 257 lea qcoeffq, [ qcoeffq+ncoeffq*2] | 254 lea qcoeffq, [ qcoeffq+ncoeffq*2] |
| 258 lea dqcoeffq, [dqcoeffq+ncoeffq*2] | 255 lea dqcoeffq, [dqcoeffq+ncoeffq*2] |
| 259 neg ncoeffq | 256 neg ncoeffq |
| 260 | 257 |
| 261 ; get DC and first 15 AC coeffs | 258 ; get DC and first 15 AC coeffs |
| 262 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 259 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
| 263 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] | 260 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] |
| 264 pabsw m6, m9 ; m6 = abs(m9) | 261 pabsw m6, m9 ; m6 = abs(m9) |
| 265 pabsw m11, m10 ; m11 = abs(m10) | 262 pabsw m11, m10 ; m11 = abs(m10) |
| 266 pcmpeqw m7, m7 | 263 pcmpeqw m7, m7 |
| 267 | 264 |
| 268 paddsw m6, m1 ; m6 += round | 265 paddsw m6, m1 ; m6 += round |
| 269 punpckhqdq m1, m1 | 266 punpckhqdq m1, m1 |
| 270 paddsw m11, m1 ; m11 += round | 267 paddsw m11, m1 ; m11 += round |
| 271 pmulhw m8, m6, m2 ; m8 = m6*q>>16 | 268 pmulhw m8, m6, m2 ; m8 = m6*q>>16 |
| 272 punpckhqdq m2, m2 | 269 punpckhqdq m2, m2 |
| 273 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 270 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
| 274 psignw m8, m9 ; m8 = reinsert sign | 271 psignw m8, m9 ; m8 = reinsert sign |
| 275 psignw m13, m10 ; m13 = reinsert sign | 272 psignw m13, m10 ; m13 = reinsert sign |
| 276 mova [qcoeffq+ncoeffq*2+ 0], m8 | 273 mova [qcoeffq+ncoeffq*2+ 0], m8 |
| 277 mova [qcoeffq+ncoeffq*2+16], m13 | 274 mova [qcoeffq+ncoeffq*2+16], m13 |
| 278 %ifidn %1, b_32x32 | 275 %ifidn %1, fp_32x32 |
| 279 pabsw m8, m8 | 276 pabsw m8, m8 |
| 280 pabsw m13, m13 | 277 pabsw m13, m13 |
| 281 %endif | 278 %endif |
| 282 pmullw m8, m3 ; dqc[i] = qc[i] * q | 279 pmullw m8, m3 ; dqc[i] = qc[i] * q |
| 283 punpckhqdq m3, m3 | 280 punpckhqdq m3, m3 |
| 284 pmullw m13, m3 ; dqc[i] = qc[i] * q | 281 pmullw m13, m3 ; dqc[i] = qc[i] * q |
| 285 %ifidn %1, b_32x32 | 282 %ifidn %1, fp_32x32 |
| 286 psrlw m8, 1 | 283 psrlw m8, 1 |
| 287 psrlw m13, 1 | 284 psrlw m13, 1 |
| 288 psignw m8, m9 | 285 psignw m8, m9 |
| 289 psignw m13, m10 | 286 psignw m13, m10 |
| 287 psrlw m0, m3, 2 |
| 290 %endif | 288 %endif |
| 291 mova [dqcoeffq+ncoeffq*2+ 0], m8 | 289 mova [dqcoeffq+ncoeffq*2+ 0], m8 |
| 292 mova [dqcoeffq+ncoeffq*2+16], m13 | 290 mova [dqcoeffq+ncoeffq*2+16], m13 |
| 293 pcmpeqw m8, m5 ; m8 = c[i] == 0 | 291 pcmpeqw m8, m5 ; m8 = c[i] == 0 |
| 294 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 292 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
| 295 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 293 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] |
| 296 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 294 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] |
| 297 psubw m6, m7 ; m6 = scan[i] + 1 | 295 psubw m6, m7 ; m6 = scan[i] + 1 |
| 298 psubw m11, m7 ; m11 = scan[i] + 1 | 296 psubw m11, m7 ; m11 = scan[i] + 1 |
| 299 pandn m8, m6 ; m8 = max(eob) | 297 pandn m8, m6 ; m8 = max(eob) |
| 300 pandn m13, m11 ; m13 = max(eob) | 298 pandn m13, m11 ; m13 = max(eob) |
| 301 pmaxsw m8, m13 | 299 pmaxsw m8, m13 |
| 302 add ncoeffq, mmsize | 300 add ncoeffq, mmsize |
| 303 jz .accumulate_eob | 301 jz .accumulate_eob |
| 304 | 302 |
| 305 .ac_only_loop: | 303 .ac_only_loop: |
| 306 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 304 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
| 307 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] | 305 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] |
| 308 pabsw m6, m9 ; m6 = abs(m9) | 306 pabsw m6, m9 ; m6 = abs(m9) |
| 309 pabsw m11, m10 ; m11 = abs(m10) | 307 pabsw m11, m10 ; m11 = abs(m10) |
| 310 pcmpeqw m7, m7 | 308 %ifidn %1, fp_32x32 |
| 311 %ifidn %1, b_32x32 | 309 pcmpgtw m7, m6, m0 |
| 310 pcmpgtw m12, m11, m0 |
| 312 pmovmskb r6, m7 | 311 pmovmskb r6, m7 |
| 313 pmovmskb r2, m7 | 312 pmovmskb r2, m12 |
| 313 |
| 314 or r6, r2 | 314 or r6, r2 |
| 315 jz .skip_iter | 315 jz .skip_iter |
| 316 %endif | 316 %endif |
| 317 pcmpeqw m7, m7 |
| 318 |
| 317 paddsw m6, m1 ; m6 += round | 319 paddsw m6, m1 ; m6 += round |
| 318 paddsw m11, m1 ; m11 += round | 320 paddsw m11, m1 ; m11 += round |
| 319 pmulhw m14, m6, m2 ; m14 = m6*q>>16 | 321 pmulhw m14, m6, m2 ; m14 = m6*q>>16 |
| 320 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 322 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
| 321 psignw m14, m9 ; m14 = reinsert sign | 323 psignw m14, m9 ; m14 = reinsert sign |
| 322 psignw m13, m10 ; m13 = reinsert sign | 324 psignw m13, m10 ; m13 = reinsert sign |
| 323 mova [qcoeffq+ncoeffq*2+ 0], m14 | 325 mova [qcoeffq+ncoeffq*2+ 0], m14 |
| 324 mova [qcoeffq+ncoeffq*2+16], m13 | 326 mova [qcoeffq+ncoeffq*2+16], m13 |
| 325 %ifidn %1, b_32x32 | 327 %ifidn %1, fp_32x32 |
| 326 pabsw m14, m14 | 328 pabsw m14, m14 |
| 327 pabsw m13, m13 | 329 pabsw m13, m13 |
| 328 %endif | 330 %endif |
| 329 pmullw m14, m3 ; dqc[i] = qc[i] * q | 331 pmullw m14, m3 ; dqc[i] = qc[i] * q |
| 330 pmullw m13, m3 ; dqc[i] = qc[i] * q | 332 pmullw m13, m3 ; dqc[i] = qc[i] * q |
| 331 %ifidn %1, b_32x32 | 333 %ifidn %1, fp_32x32 |
| 332 psrlw m14, 1 | 334 psrlw m14, 1 |
| 333 psrlw m13, 1 | 335 psrlw m13, 1 |
| 334 psignw m14, m9 | 336 psignw m14, m9 |
| 335 psignw m13, m10 | 337 psignw m13, m10 |
| 336 %endif | 338 %endif |
| 337 mova [dqcoeffq+ncoeffq*2+ 0], m14 | 339 mova [dqcoeffq+ncoeffq*2+ 0], m14 |
| 338 mova [dqcoeffq+ncoeffq*2+16], m13 | 340 mova [dqcoeffq+ncoeffq*2+16], m13 |
| 339 pcmpeqw m14, m5 ; m14 = c[i] == 0 | 341 pcmpeqw m14, m5 ; m14 = c[i] == 0 |
| 340 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 342 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
| 341 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 343 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] |
| 342 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 344 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] |
| 343 psubw m6, m7 ; m6 = scan[i] + 1 | 345 psubw m6, m7 ; m6 = scan[i] + 1 |
| 344 psubw m11, m7 ; m11 = scan[i] + 1 | 346 psubw m11, m7 ; m11 = scan[i] + 1 |
| 345 pandn m14, m6 ; m14 = max(eob) | 347 pandn m14, m6 ; m14 = max(eob) |
| 346 pandn m13, m11 ; m13 = max(eob) | 348 pandn m13, m11 ; m13 = max(eob) |
| 347 pmaxsw m8, m14 | 349 pmaxsw m8, m14 |
| 348 pmaxsw m8, m13 | 350 pmaxsw m8, m13 |
| 349 add ncoeffq, mmsize | 351 add ncoeffq, mmsize |
| 350 jl .ac_only_loop | 352 jl .ac_only_loop |
| 351 | 353 |
| 352 %ifidn %1, b_32x32 | 354 %ifidn %1, fp_32x32 |
| 353 jmp .accumulate_eob | 355 jmp .accumulate_eob |
| 354 .skip_iter: | 356 .skip_iter: |
| 355 mova [qcoeffq+ncoeffq*2+ 0], m5 | 357 mova [qcoeffq+ncoeffq*2+ 0], m5 |
| 356 mova [qcoeffq+ncoeffq*2+16], m5 | 358 mova [qcoeffq+ncoeffq*2+16], m5 |
| 357 mova [dqcoeffq+ncoeffq*2+ 0], m5 | 359 mova [dqcoeffq+ncoeffq*2+ 0], m5 |
| 358 mova [dqcoeffq+ncoeffq*2+16], m5 | 360 mova [dqcoeffq+ncoeffq*2+16], m5 |
| 359 add ncoeffq, mmsize | 361 add ncoeffq, mmsize |
| 360 jl .ac_only_loop | 362 jl .ac_only_loop |
| 361 %endif | 363 %endif |
| 362 | 364 |
| (...skipping 27 matching lines...) |
| 390 mova [qcoeffq+ncoeffq*2+ 0], m7 | 392 mova [qcoeffq+ncoeffq*2+ 0], m7 |
| 391 mova [qcoeffq+ncoeffq*2+16], m7 | 393 mova [qcoeffq+ncoeffq*2+16], m7 |
| 392 add ncoeffq, mmsize | 394 add ncoeffq, mmsize |
| 393 jl .blank_loop | 395 jl .blank_loop |
| 394 mov word [eobq], 0 | 396 mov word [eobq], 0 |
| 395 RET | 397 RET |
| 396 %endmacro | 398 %endmacro |
| 397 | 399 |
; Macro instantiations that follow the %endmacro of QUANTIZE_FP.
; INIT_XMM is defined outside this excerpt (presumably the x86inc register-set /
; CPU-suffix selector -- confirm against the include file).
; QUANTIZE_FP is expanded with %1 = fp in both the OLD and NEW columns, and with
; %1 = fp_32x32 in the NEW column only; that second expansion is the one for which
; the `%ifidn %1, fp_32x32` sections of the macro body are assembled.
; The meaning of the second argument (7) is not visible in this excerpt.
| 398 INIT_XMM ssse3 | 400 INIT_XMM ssse3 |
| 399 QUANTIZE_FP fp, 7 | 401 QUANTIZE_FP fp, 7 |
| 402 QUANTIZE_FP fp_32x32, 7 |
| OLD | NEW |