| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 %include "third_party/x86inc/x86inc.asm" | 11 %include "third_party/x86inc/x86inc.asm" |
| 12 | 12 |
| 13 SECTION_RODATA | 13 SECTION_RODATA |
| 14 pw_1: times 8 dw 1 | 14 pw_1: times 8 dw 1 |
| 15 | 15 |
| 16 SECTION .text | 16 SECTION .text |
| 17 | 17 |
| 18 ; TODO(yunqingwang): fix quantize_b code for skip=1 case. |
| 18 %macro QUANTIZE_FN 2 | 19 %macro QUANTIZE_FN 2 |
| 19 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ | 20 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ |
| 20 shift, qcoeff, dqcoeff, dequant, \ | 21 shift, qcoeff, dqcoeff, dequant, \ |
| 21 eob, scan, iscan | 22 eob, scan, iscan |
| 22 cmp dword skipm, 0 | 23 cmp dword skipm, 0 |
| 23 jne .blank | 24 jne .blank |
| 24 | 25 |
| 25 ; actual quantize loop - setup pointers, rounders, etc. | 26 ; actual quantize loop - setup pointers, rounders, etc. |
| 26 movifnidn coeffq, coeffmp | 27 movifnidn coeffq, coeffmp |
| 27 movifnidn ncoeffq, ncoeffmp | 28 movifnidn ncoeffq, ncoeffmp |
| (...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 237 psrlw m1, 1 ; m1 = (m1 + 1) / 2 | 238 psrlw m1, 1 ; m1 = (m1 + 1) / 2 |
| 238 %endif | 239 %endif |
| 239 mova m3, [r2q] ; m3 = dequant | 240 mova m3, [r2q] ; m3 = dequant |
| 240 mov r3, qcoeffmp | 241 mov r3, qcoeffmp |
| 241 mov r4, dqcoeffmp | 242 mov r4, dqcoeffmp |
| 242 mov r5, iscanmp | 243 mov r5, iscanmp |
| 243 %ifidn %1, fp_32x32 | 244 %ifidn %1, fp_32x32 |
| 244 psllw m2, 1 | 245 psllw m2, 1 |
| 245 %endif | 246 %endif |
| 246 pxor m5, m5 ; m5 = dedicated zero | 247 pxor m5, m5 ; m5 = dedicated zero |
| 247 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob | 248 |
| 248 lea coeffq, [ coeffq+ncoeffq*2] | 249 lea coeffq, [ coeffq+ncoeffq*2] |
| 249 lea iscanq, [ iscanq+ncoeffq*2] | 250 lea r5q, [ r5q+ncoeffq*2] |
| 250 lea qcoeffq, [ qcoeffq+ncoeffq*2] | 251 lea r3q, [ r3q+ncoeffq*2] |
| 251 lea dqcoeffq, [dqcoeffq+ncoeffq*2] | 252 lea r4q, [r4q+ncoeffq*2] |
| 252 neg ncoeffq | 253 neg ncoeffq |
| 253 | 254 |
| 254 ; get DC and first 15 AC coeffs | 255 ; get DC and first 15 AC coeffs |
| 255 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 256 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
| 256 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] | 257 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] |
| 257 pabsw m6, m9 ; m6 = abs(m9) | 258 pabsw m6, m9 ; m6 = abs(m9) |
| 258 pabsw m11, m10 ; m11 = abs(m10) | 259 pabsw m11, m10 ; m11 = abs(m10) |
| 259 pcmpeqw m7, m7 | 260 pcmpeqw m7, m7 |
| 260 | 261 |
| 261 paddsw m6, m1 ; m6 += round | 262 paddsw m6, m1 ; m6 += round |
| 262 punpckhqdq m1, m1 | 263 punpckhqdq m1, m1 |
| 263 paddsw m11, m1 ; m11 += round | 264 paddsw m11, m1 ; m11 += round |
| 264 pmulhw m8, m6, m2 ; m8 = m6*q>>16 | 265 pmulhw m8, m6, m2 ; m8 = m6*q>>16 |
| 265 punpckhqdq m2, m2 | 266 punpckhqdq m2, m2 |
| 266 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 267 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
| 267 psignw m8, m9 ; m8 = reinsert sign | 268 psignw m8, m9 ; m8 = reinsert sign |
| 268 psignw m13, m10 ; m13 = reinsert sign | 269 psignw m13, m10 ; m13 = reinsert sign |
| 269 mova [qcoeffq+ncoeffq*2+ 0], m8 | 270 mova [r3q+ncoeffq*2+ 0], m8 |
| 270 mova [qcoeffq+ncoeffq*2+16], m13 | 271 mova [r3q+ncoeffq*2+16], m13 |
| 271 %ifidn %1, fp_32x32 | 272 %ifidn %1, fp_32x32 |
| 272 pabsw m8, m8 | 273 pabsw m8, m8 |
| 273 pabsw m13, m13 | 274 pabsw m13, m13 |
| 274 %endif | 275 %endif |
| 275 pmullw m8, m3 ; dqc[i] = qc[i] * q | 276 pmullw m8, m3 ; r4[i] = r3[i] * q |
| 276 punpckhqdq m3, m3 | 277 punpckhqdq m3, m3 |
| 277 pmullw m13, m3 ; dqc[i] = qc[i] * q | 278 pmullw m13, m3 ; r4[i] = r3[i] * q |
| 278 %ifidn %1, fp_32x32 | 279 %ifidn %1, fp_32x32 |
| 279 psrlw m8, 1 | 280 psrlw m8, 1 |
| 280 psrlw m13, 1 | 281 psrlw m13, 1 |
| 281 psignw m8, m9 | 282 psignw m8, m9 |
| 282 psignw m13, m10 | 283 psignw m13, m10 |
| 283 psrlw m0, m3, 2 | 284 psrlw m0, m3, 2 |
| 284 %endif | 285 %endif |
| 285 mova [dqcoeffq+ncoeffq*2+ 0], m8 | 286 mova [r4q+ncoeffq*2+ 0], m8 |
| 286 mova [dqcoeffq+ncoeffq*2+16], m13 | 287 mova [r4q+ncoeffq*2+16], m13 |
| 287 pcmpeqw m8, m5 ; m8 = c[i] == 0 | 288 pcmpeqw m8, m5 ; m8 = c[i] == 0 |
| 288 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 289 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
| 289 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 290 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] |
| 290 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 291 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] |
| 291 psubw m6, m7 ; m6 = scan[i] + 1 | 292 psubw m6, m7 ; m6 = scan[i] + 1 |
| 292 psubw m11, m7 ; m11 = scan[i] + 1 | 293 psubw m11, m7 ; m11 = scan[i] + 1 |
| 293 pandn m8, m6 ; m8 = max(eob) | 294 pandn m8, m6 ; m8 = max(eob) |
| 294 pandn m13, m11 ; m13 = max(eob) | 295 pandn m13, m11 ; m13 = max(eob) |
| 295 pmaxsw m8, m13 | 296 pmaxsw m8, m13 |
| 296 add ncoeffq, mmsize | 297 add ncoeffq, mmsize |
| 297 jz .accumulate_eob | 298 jz .accumulate_eob |
| 298 | 299 |
| 299 .ac_only_loop: | 300 .ac_only_loop: |
| 300 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 301 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
| (...skipping 10 matching lines...) Expand all Loading... |
| 311 jz .skip_iter | 312 jz .skip_iter |
| 312 %endif | 313 %endif |
| 313 pcmpeqw m7, m7 | 314 pcmpeqw m7, m7 |
| 314 | 315 |
| 315 paddsw m6, m1 ; m6 += round | 316 paddsw m6, m1 ; m6 += round |
| 316 paddsw m11, m1 ; m11 += round | 317 paddsw m11, m1 ; m11 += round |
| 317 pmulhw m14, m6, m2 ; m14 = m6*q>>16 | 318 pmulhw m14, m6, m2 ; m14 = m6*q>>16 |
| 318 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 319 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
| 319 psignw m14, m9 ; m14 = reinsert sign | 320 psignw m14, m9 ; m14 = reinsert sign |
| 320 psignw m13, m10 ; m13 = reinsert sign | 321 psignw m13, m10 ; m13 = reinsert sign |
| 321 mova [qcoeffq+ncoeffq*2+ 0], m14 | 322 mova [r3q+ncoeffq*2+ 0], m14 |
| 322 mova [qcoeffq+ncoeffq*2+16], m13 | 323 mova [r3q+ncoeffq*2+16], m13 |
| 323 %ifidn %1, fp_32x32 | 324 %ifidn %1, fp_32x32 |
| 324 pabsw m14, m14 | 325 pabsw m14, m14 |
| 325 pabsw m13, m13 | 326 pabsw m13, m13 |
| 326 %endif | 327 %endif |
| 327 pmullw m14, m3 ; dqc[i] = qc[i] * q | 328 pmullw m14, m3 ; r4[i] = r3[i] * q |
| 328 pmullw m13, m3 ; dqc[i] = qc[i] * q | 329 pmullw m13, m3 ; r4[i] = r3[i] * q |
| 329 %ifidn %1, fp_32x32 | 330 %ifidn %1, fp_32x32 |
| 330 psrlw m14, 1 | 331 psrlw m14, 1 |
| 331 psrlw m13, 1 | 332 psrlw m13, 1 |
| 332 psignw m14, m9 | 333 psignw m14, m9 |
| 333 psignw m13, m10 | 334 psignw m13, m10 |
| 334 %endif | 335 %endif |
| 335 mova [dqcoeffq+ncoeffq*2+ 0], m14 | 336 mova [r4q+ncoeffq*2+ 0], m14 |
| 336 mova [dqcoeffq+ncoeffq*2+16], m13 | 337 mova [r4q+ncoeffq*2+16], m13 |
| 337 pcmpeqw m14, m5 ; m14 = c[i] == 0 | 338 pcmpeqw m14, m5 ; m14 = c[i] == 0 |
| 338 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 339 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
| 339 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 340 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] |
| 340 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 341 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] |
| 341 psubw m6, m7 ; m6 = scan[i] + 1 | 342 psubw m6, m7 ; m6 = scan[i] + 1 |
| 342 psubw m11, m7 ; m11 = scan[i] + 1 | 343 psubw m11, m7 ; m11 = scan[i] + 1 |
| 343 pandn m14, m6 ; m14 = max(eob) | 344 pandn m14, m6 ; m14 = max(eob) |
| 344 pandn m13, m11 ; m13 = max(eob) | 345 pandn m13, m11 ; m13 = max(eob) |
| 345 pmaxsw m8, m14 | 346 pmaxsw m8, m14 |
| 346 pmaxsw m8, m13 | 347 pmaxsw m8, m13 |
| 347 add ncoeffq, mmsize | 348 add ncoeffq, mmsize |
| 348 jl .ac_only_loop | 349 jl .ac_only_loop |
| 349 | 350 |
| 350 %ifidn %1, fp_32x32 | 351 %ifidn %1, fp_32x32 |
| 351 jmp .accumulate_eob | 352 jmp .accumulate_eob |
| 352 .skip_iter: | 353 .skip_iter: |
| 353 mova [qcoeffq+ncoeffq*2+ 0], m5 | 354 mova [r3q+ncoeffq*2+ 0], m5 |
| 354 mova [qcoeffq+ncoeffq*2+16], m5 | 355 mova [r3q+ncoeffq*2+16], m5 |
| 355 mova [dqcoeffq+ncoeffq*2+ 0], m5 | 356 mova [r4q+ncoeffq*2+ 0], m5 |
| 356 mova [dqcoeffq+ncoeffq*2+16], m5 | 357 mova [r4q+ncoeffq*2+16], m5 |
| 357 add ncoeffq, mmsize | 358 add ncoeffq, mmsize |
| 358 jl .ac_only_loop | 359 jl .ac_only_loop |
| 359 %endif | 360 %endif |
| 360 | 361 |
| 361 .accumulate_eob: | 362 .accumulate_eob: |
| 362 ; horizontally accumulate/max eobs and write into [eob] memory pointer | 363 ; horizontally accumulate/max eobs and write into [eob] memory pointer |
| 363 mov r2, eobmp | 364 mov r2, eobmp |
| 364 pshufd m7, m8, 0xe | 365 pshufd m7, m8, 0xe |
| 365 pmaxsw m8, m7 | 366 pmaxsw m8, m7 |
| 366 pshuflw m7, m8, 0xe | 367 pshuflw m7, m8, 0xe |
| 367 pmaxsw m8, m7 | 368 pmaxsw m8, m7 |
| 368 pshuflw m7, m8, 0x1 | 369 pshuflw m7, m8, 0x1 |
| 369 pmaxsw m8, m7 | 370 pmaxsw m8, m7 |
| 370 pextrw r6, m8, 0 | 371 pextrw r6, m8, 0 |
| 371 mov [r2], r6 | 372 mov [r2], r6 |
| 372 RET | 373 RET |
| 373 | 374 |
| 374 ; skip-block, i.e. just write all zeroes | 375 ; skip-block, i.e. just write all zeroes |
| 375 .blank: | 376 .blank: |
| 376 mov r0, dqcoeffmp | 377 mov r0, dqcoeffmp |
| 377 movifnidn ncoeffq, ncoeffmp | 378 movifnidn ncoeffq, ncoeffmp |
| 378 mov r2, qcoeffmp | 379 mov r2, qcoeffmp |
| 379 mov r3, eobmp | 380 mov r3, eobmp |
| 380 DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob | 381 |
| 381 lea dqcoeffq, [dqcoeffq+ncoeffq*2] | 382 lea r0q, [r0q+ncoeffq*2] |
| 382 lea qcoeffq, [ qcoeffq+ncoeffq*2] | 383 lea r2q, [r2q+ncoeffq*2] |
| 383 neg ncoeffq | 384 neg ncoeffq |
| 384 pxor m7, m7 | 385 pxor m7, m7 |
| 385 .blank_loop: | 386 .blank_loop: |
| 386 mova [dqcoeffq+ncoeffq*2+ 0], m7 | 387 mova [r0q+ncoeffq*2+ 0], m7 |
| 387 mova [dqcoeffq+ncoeffq*2+16], m7 | 388 mova [r0q+ncoeffq*2+16], m7 |
| 388 mova [qcoeffq+ncoeffq*2+ 0], m7 | 389 mova [r2q+ncoeffq*2+ 0], m7 |
| 389 mova [qcoeffq+ncoeffq*2+16], m7 | 390 mova [r2q+ncoeffq*2+16], m7 |
| 390 add ncoeffq, mmsize | 391 add ncoeffq, mmsize |
| 391 jl .blank_loop | 392 jl .blank_loop |
| 392 mov word [eobq], 0 | 393 mov word [r3q], 0 |
| 393 RET | 394 RET |
| 394 %endmacro | 395 %endmacro |
| 395 | 396 |
| 396 INIT_XMM ssse3 | 397 INIT_XMM ssse3 |
| 397 QUANTIZE_FP fp, 7 | 398 QUANTIZE_FP fp, 7 |
| 398 QUANTIZE_FP fp_32x32, 7 | 399 QUANTIZE_FP fp_32x32, 7 |
| OLD | NEW |