OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 %include "third_party/x86inc/x86inc.asm" | 11 %include "third_party/x86inc/x86inc.asm" |
12 | 12 |
13 SECTION_RODATA | 13 SECTION_RODATA |
14 pw_1: times 8 dw 1 | 14 pw_1: times 8 dw 1 |
15 | 15 |
16 SECTION .text | 16 SECTION .text |
17 | 17 |
| 18 ; TODO(yunqingwang): fix quantize_b code for skip=1 case. |
18 %macro QUANTIZE_FN 2 | 19 %macro QUANTIZE_FN 2 |
19 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ | 20 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ |
20 shift, qcoeff, dqcoeff, dequant, \ | 21 shift, qcoeff, dqcoeff, dequant, \ |
21 eob, scan, iscan | 22 eob, scan, iscan |
22 cmp dword skipm, 0 | 23 cmp dword skipm, 0 |
23 jne .blank | 24 jne .blank |
24 | 25 |
25 ; actual quantize loop - setup pointers, rounders, etc. | 26 ; actual quantize loop - setup pointers, rounders, etc. |
26 movifnidn coeffq, coeffmp | 27 movifnidn coeffq, coeffmp |
27 movifnidn ncoeffq, ncoeffmp | 28 movifnidn ncoeffq, ncoeffmp |
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
237 psrlw m1, 1 ; m1 = (m1 + 1) / 2 | 238 psrlw m1, 1 ; m1 = (m1 + 1) / 2 |
238 %endif | 239 %endif |
239 mova m3, [r2q] ; m3 = dequant | 240 mova m3, [r2q] ; m3 = dequant |
240 mov r3, qcoeffmp | 241 mov r3, qcoeffmp |
241 mov r4, dqcoeffmp | 242 mov r4, dqcoeffmp |
242 mov r5, iscanmp | 243 mov r5, iscanmp |
243 %ifidn %1, fp_32x32 | 244 %ifidn %1, fp_32x32 |
244 psllw m2, 1 | 245 psllw m2, 1 |
245 %endif | 246 %endif |
246 pxor m5, m5 ; m5 = dedicated zero | 247 pxor m5, m5 ; m5 = dedicated zero |
247 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob | 248 |
248 lea coeffq, [ coeffq+ncoeffq*2] | 249 lea coeffq, [ coeffq+ncoeffq*2] |
249 lea iscanq, [ iscanq+ncoeffq*2] | 250 lea r5q, [ r5q+ncoeffq*2] |
250 lea qcoeffq, [ qcoeffq+ncoeffq*2] | 251 lea r3q, [ r3q+ncoeffq*2] |
251 lea dqcoeffq, [dqcoeffq+ncoeffq*2] | 252 lea r4q, [r4q+ncoeffq*2] |
252 neg ncoeffq | 253 neg ncoeffq |
253 | 254 |
254 ; get DC and first 15 AC coeffs | 255 ; get DC and first 15 AC coeffs |
255 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 256 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
256 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] | 257 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] |
257 pabsw m6, m9 ; m6 = abs(m9) | 258 pabsw m6, m9 ; m6 = abs(m9) |
258 pabsw m11, m10 ; m11 = abs(m10) | 259 pabsw m11, m10 ; m11 = abs(m10) |
259 pcmpeqw m7, m7 | 260 pcmpeqw m7, m7 |
260 | 261 |
261 paddsw m6, m1 ; m6 += round | 262 paddsw m6, m1 ; m6 += round |
262 punpckhqdq m1, m1 | 263 punpckhqdq m1, m1 |
263 paddsw m11, m1 ; m11 += round | 264 paddsw m11, m1 ; m11 += round |
264 pmulhw m8, m6, m2 ; m8 = m6*q>>16 | 265 pmulhw m8, m6, m2 ; m8 = m6*q>>16 |
265 punpckhqdq m2, m2 | 266 punpckhqdq m2, m2 |
266 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 267 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
267 psignw m8, m9 ; m8 = reinsert sign | 268 psignw m8, m9 ; m8 = reinsert sign |
268 psignw m13, m10 ; m13 = reinsert sign | 269 psignw m13, m10 ; m13 = reinsert sign |
269 mova [qcoeffq+ncoeffq*2+ 0], m8 | 270 mova [r3q+ncoeffq*2+ 0], m8 |
270 mova [qcoeffq+ncoeffq*2+16], m13 | 271 mova [r3q+ncoeffq*2+16], m13 |
271 %ifidn %1, fp_32x32 | 272 %ifidn %1, fp_32x32 |
272 pabsw m8, m8 | 273 pabsw m8, m8 |
273 pabsw m13, m13 | 274 pabsw m13, m13 |
274 %endif | 275 %endif |
275 pmullw m8, m3 ; dqc[i] = qc[i] * q | 276 pmullw m8, m3 ; r4[i] = r3[i] * q |
276 punpckhqdq m3, m3 | 277 punpckhqdq m3, m3 |
277 pmullw m13, m3 ; dqc[i] = qc[i] * q | 278 pmullw m13, m3 ; r4[i] = r3[i] * q |
278 %ifidn %1, fp_32x32 | 279 %ifidn %1, fp_32x32 |
279 psrlw m8, 1 | 280 psrlw m8, 1 |
280 psrlw m13, 1 | 281 psrlw m13, 1 |
281 psignw m8, m9 | 282 psignw m8, m9 |
282 psignw m13, m10 | 283 psignw m13, m10 |
283 psrlw m0, m3, 2 | 284 psrlw m0, m3, 2 |
284 %endif | 285 %endif |
285 mova [dqcoeffq+ncoeffq*2+ 0], m8 | 286 mova [r4q+ncoeffq*2+ 0], m8 |
286 mova [dqcoeffq+ncoeffq*2+16], m13 | 287 mova [r4q+ncoeffq*2+16], m13 |
287 pcmpeqw m8, m5 ; m8 = c[i] == 0 | 288 pcmpeqw m8, m5 ; m8 = c[i] == 0 |
288 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 289 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
289 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 290 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] |
290 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 291 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] |
291 psubw m6, m7 ; m6 = scan[i] + 1 | 292 psubw m6, m7 ; m6 = scan[i] + 1 |
292 psubw m11, m7 ; m11 = scan[i] + 1 | 293 psubw m11, m7 ; m11 = scan[i] + 1 |
293 pandn m8, m6 ; m8 = max(eob) | 294 pandn m8, m6 ; m8 = max(eob) |
294 pandn m13, m11 ; m13 = max(eob) | 295 pandn m13, m11 ; m13 = max(eob) |
295 pmaxsw m8, m13 | 296 pmaxsw m8, m13 |
296 add ncoeffq, mmsize | 297 add ncoeffq, mmsize |
297 jz .accumulate_eob | 298 jz .accumulate_eob |
298 | 299 |
299 .ac_only_loop: | 300 .ac_only_loop: |
300 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 301 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
(...skipping 10 matching lines...) Expand all Loading... |
311 jz .skip_iter | 312 jz .skip_iter |
312 %endif | 313 %endif |
313 pcmpeqw m7, m7 | 314 pcmpeqw m7, m7 |
314 | 315 |
315 paddsw m6, m1 ; m6 += round | 316 paddsw m6, m1 ; m6 += round |
316 paddsw m11, m1 ; m11 += round | 317 paddsw m11, m1 ; m11 += round |
317 pmulhw m14, m6, m2 ; m14 = m6*q>>16 | 318 pmulhw m14, m6, m2 ; m14 = m6*q>>16 |
318 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 319 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
319 psignw m14, m9 ; m14 = reinsert sign | 320 psignw m14, m9 ; m14 = reinsert sign |
320 psignw m13, m10 ; m13 = reinsert sign | 321 psignw m13, m10 ; m13 = reinsert sign |
321 mova [qcoeffq+ncoeffq*2+ 0], m14 | 322 mova [r3q+ncoeffq*2+ 0], m14 |
322 mova [qcoeffq+ncoeffq*2+16], m13 | 323 mova [r3q+ncoeffq*2+16], m13 |
323 %ifidn %1, fp_32x32 | 324 %ifidn %1, fp_32x32 |
324 pabsw m14, m14 | 325 pabsw m14, m14 |
325 pabsw m13, m13 | 326 pabsw m13, m13 |
326 %endif | 327 %endif |
327 pmullw m14, m3 ; dqc[i] = qc[i] * q | 328 pmullw m14, m3 ; r4[i] = r3[i] * q |
328 pmullw m13, m3 ; dqc[i] = qc[i] * q | 329 pmullw m13, m3 ; r4[i] = r3[i] * q |
329 %ifidn %1, fp_32x32 | 330 %ifidn %1, fp_32x32 |
330 psrlw m14, 1 | 331 psrlw m14, 1 |
331 psrlw m13, 1 | 332 psrlw m13, 1 |
332 psignw m14, m9 | 333 psignw m14, m9 |
333 psignw m13, m10 | 334 psignw m13, m10 |
334 %endif | 335 %endif |
335 mova [dqcoeffq+ncoeffq*2+ 0], m14 | 336 mova [r4q+ncoeffq*2+ 0], m14 |
336 mova [dqcoeffq+ncoeffq*2+16], m13 | 337 mova [r4q+ncoeffq*2+16], m13 |
337 pcmpeqw m14, m5 ; m14 = c[i] == 0 | 338 pcmpeqw m14, m5 ; m14 = c[i] == 0 |
338 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 339 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
339 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 340 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] |
340 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 341 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] |
341 psubw m6, m7 ; m6 = scan[i] + 1 | 342 psubw m6, m7 ; m6 = scan[i] + 1 |
342 psubw m11, m7 ; m11 = scan[i] + 1 | 343 psubw m11, m7 ; m11 = scan[i] + 1 |
343 pandn m14, m6 ; m14 = max(eob) | 344 pandn m14, m6 ; m14 = max(eob) |
344 pandn m13, m11 ; m13 = max(eob) | 345 pandn m13, m11 ; m13 = max(eob) |
345 pmaxsw m8, m14 | 346 pmaxsw m8, m14 |
346 pmaxsw m8, m13 | 347 pmaxsw m8, m13 |
347 add ncoeffq, mmsize | 348 add ncoeffq, mmsize |
348 jl .ac_only_loop | 349 jl .ac_only_loop |
349 | 350 |
350 %ifidn %1, fp_32x32 | 351 %ifidn %1, fp_32x32 |
351 jmp .accumulate_eob | 352 jmp .accumulate_eob |
352 .skip_iter: | 353 .skip_iter: |
353 mova [qcoeffq+ncoeffq*2+ 0], m5 | 354 mova [r3q+ncoeffq*2+ 0], m5 |
354 mova [qcoeffq+ncoeffq*2+16], m5 | 355 mova [r3q+ncoeffq*2+16], m5 |
355 mova [dqcoeffq+ncoeffq*2+ 0], m5 | 356 mova [r4q+ncoeffq*2+ 0], m5 |
356 mova [dqcoeffq+ncoeffq*2+16], m5 | 357 mova [r4q+ncoeffq*2+16], m5 |
357 add ncoeffq, mmsize | 358 add ncoeffq, mmsize |
358 jl .ac_only_loop | 359 jl .ac_only_loop |
359 %endif | 360 %endif |
360 | 361 |
361 .accumulate_eob: | 362 .accumulate_eob: |
362 ; horizontally accumulate/max eobs and write into [eob] memory pointer | 363 ; horizontally accumulate/max eobs and write into [eob] memory pointer |
363 mov r2, eobmp | 364 mov r2, eobmp |
364 pshufd m7, m8, 0xe | 365 pshufd m7, m8, 0xe |
365 pmaxsw m8, m7 | 366 pmaxsw m8, m7 |
366 pshuflw m7, m8, 0xe | 367 pshuflw m7, m8, 0xe |
367 pmaxsw m8, m7 | 368 pmaxsw m8, m7 |
368 pshuflw m7, m8, 0x1 | 369 pshuflw m7, m8, 0x1 |
369 pmaxsw m8, m7 | 370 pmaxsw m8, m7 |
370 pextrw r6, m8, 0 | 371 pextrw r6, m8, 0 |
371 mov [r2], r6 | 372 mov [r2], r6 |
372 RET | 373 RET |
373 | 374 |
374 ; skip-block, i.e. just write all zeroes | 375 ; skip-block, i.e. just write all zeroes |
375 .blank: | 376 .blank: |
376 mov r0, dqcoeffmp | 377 mov r0, dqcoeffmp |
377 movifnidn ncoeffq, ncoeffmp | 378 movifnidn ncoeffq, ncoeffmp |
378 mov r2, qcoeffmp | 379 mov r2, qcoeffmp |
379 mov r3, eobmp | 380 mov r3, eobmp |
380 DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob | 381 |
381 lea dqcoeffq, [dqcoeffq+ncoeffq*2] | 382 lea r0q, [r0q+ncoeffq*2] |
382 lea qcoeffq, [ qcoeffq+ncoeffq*2] | 383 lea r2q, [r2q+ncoeffq*2] |
383 neg ncoeffq | 384 neg ncoeffq |
384 pxor m7, m7 | 385 pxor m7, m7 |
385 .blank_loop: | 386 .blank_loop: |
386 mova [dqcoeffq+ncoeffq*2+ 0], m7 | 387 mova [r0q+ncoeffq*2+ 0], m7 |
387 mova [dqcoeffq+ncoeffq*2+16], m7 | 388 mova [r0q+ncoeffq*2+16], m7 |
388 mova [qcoeffq+ncoeffq*2+ 0], m7 | 389 mova [r2q+ncoeffq*2+ 0], m7 |
389 mova [qcoeffq+ncoeffq*2+16], m7 | 390 mova [r2q+ncoeffq*2+16], m7 |
390 add ncoeffq, mmsize | 391 add ncoeffq, mmsize |
391 jl .blank_loop | 392 jl .blank_loop |
392 mov word [eobq], 0 | 393 mov word [r3q], 0 |
393 RET | 394 RET |
394 %endmacro | 395 %endmacro |
395 | 396 |
396 INIT_XMM ssse3 | 397 INIT_XMM ssse3 |
397 QUANTIZE_FP fp, 7 | 398 QUANTIZE_FP fp, 7 |
398 QUANTIZE_FP fp_32x32, 7 | 399 QUANTIZE_FP fp_32x32, 7 |
OLD | NEW |