OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 264 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
275 %endif | 275 %endif |
276 pmullw m8, m3 ; r4[i] = r3[i] * q | 276 pmullw m8, m3 ; r4[i] = r3[i] * q |
277 punpckhqdq m3, m3 | 277 punpckhqdq m3, m3 |
278 pmullw m13, m3 ; r4[i] = r3[i] * q | 278 pmullw m13, m3 ; r4[i] = r3[i] * q |
279 %ifidn %1, fp_32x32 | 279 %ifidn %1, fp_32x32 |
280 psrlw m8, 1 | 280 psrlw m8, 1 |
281 psrlw m13, 1 | 281 psrlw m13, 1 |
282 psignw m8, m9 | 282 psignw m8, m9 |
283 psignw m13, m10 | 283 psignw m13, m10 |
284 psrlw m0, m3, 2 | 284 psrlw m0, m3, 2 |
| 285 %else |
| 286 psrlw m0, m3, 1 |
285 %endif | 287 %endif |
286 mova [r4q+ncoeffq*2+ 0], m8 | 288 mova [r4q+ncoeffq*2+ 0], m8 |
287 mova [r4q+ncoeffq*2+16], m13 | 289 mova [r4q+ncoeffq*2+16], m13 |
288 pcmpeqw m8, m5 ; m8 = c[i] == 0 | 290 pcmpeqw m8, m5 ; m8 = c[i] == 0 |
289 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 291 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
290 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] | 292 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] |
291 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] | 293 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] |
292 psubw m6, m7 ; m6 = scan[i] + 1 | 294 psubw m6, m7 ; m6 = scan[i] + 1 |
293 psubw m11, m7 ; m11 = scan[i] + 1 | 295 psubw m11, m7 ; m11 = scan[i] + 1 |
294 pandn m8, m6 ; m8 = max(eob) | 296 pandn m8, m6 ; m8 = max(eob) |
295 pandn m13, m11 ; m13 = max(eob) | 297 pandn m13, m11 ; m13 = max(eob) |
296 pmaxsw m8, m13 | 298 pmaxsw m8, m13 |
297 add ncoeffq, mmsize | 299 add ncoeffq, mmsize |
298 jz .accumulate_eob | 300 jz .accumulate_eob |
299 | 301 |
300 .ac_only_loop: | 302 .ac_only_loop: |
301 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 303 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
302 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] | 304 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] |
303 pabsw m6, m9 ; m6 = abs(m9) | 305 pabsw m6, m9 ; m6 = abs(m9) |
304 pabsw m11, m10 ; m11 = abs(m10) | 306 pabsw m11, m10 ; m11 = abs(m10) |
305 %ifidn %1, fp_32x32 | 307 |
306 pcmpgtw m7, m6, m0 | 308 pcmpgtw m7, m6, m0 |
307 pcmpgtw m12, m11, m0 | 309 pcmpgtw m12, m11, m0 |
308 pmovmskb r6d, m7 | 310 pmovmskb r6d, m7 |
309 pmovmskb r2d, m12 | 311 pmovmskb r2d, m12 |
310 | 312 |
311 or r6, r2 | 313 or r6, r2 |
312 jz .skip_iter | 314 jz .skip_iter |
313 %endif | 315 |
314 pcmpeqw m7, m7 | 316 pcmpeqw m7, m7 |
315 | 317 |
316 paddsw m6, m1 ; m6 += round | 318 paddsw m6, m1 ; m6 += round |
317 paddsw m11, m1 ; m11 += round | 319 paddsw m11, m1 ; m11 += round |
318 pmulhw m14, m6, m2 ; m14 = m6*q>>16 | 320 pmulhw m14, m6, m2 ; m14 = m6*q>>16 |
319 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 321 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
320 psignw m14, m9 ; m14 = reinsert sign | 322 psignw m14, m9 ; m14 = reinsert sign |
321 psignw m13, m10 ; m13 = reinsert sign | 323 psignw m13, m10 ; m13 = reinsert sign |
322 mova [r3q+ncoeffq*2+ 0], m14 | 324 mova [r3q+ncoeffq*2+ 0], m14 |
323 mova [r3q+ncoeffq*2+16], m13 | 325 mova [r3q+ncoeffq*2+16], m13 |
(...skipping 17 matching lines...) Expand all Loading... |
341 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] | 343 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] |
342 psubw m6, m7 ; m6 = scan[i] + 1 | 344 psubw m6, m7 ; m6 = scan[i] + 1 |
343 psubw m11, m7 ; m11 = scan[i] + 1 | 345 psubw m11, m7 ; m11 = scan[i] + 1 |
344 pandn m14, m6 ; m14 = max(eob) | 346 pandn m14, m6 ; m14 = max(eob) |
345 pandn m13, m11 ; m13 = max(eob) | 347 pandn m13, m11 ; m13 = max(eob) |
346 pmaxsw m8, m14 | 348 pmaxsw m8, m14 |
347 pmaxsw m8, m13 | 349 pmaxsw m8, m13 |
348 add ncoeffq, mmsize | 350 add ncoeffq, mmsize |
349 jl .ac_only_loop | 351 jl .ac_only_loop |
350 | 352 |
351 %ifidn %1, fp_32x32 | |
352 jmp .accumulate_eob | 353 jmp .accumulate_eob |
353 .skip_iter: | 354 .skip_iter: |
354 mova [r3q+ncoeffq*2+ 0], m5 | 355 mova [r3q+ncoeffq*2+ 0], m5 |
355 mova [r3q+ncoeffq*2+16], m5 | 356 mova [r3q+ncoeffq*2+16], m5 |
356 mova [r4q+ncoeffq*2+ 0], m5 | 357 mova [r4q+ncoeffq*2+ 0], m5 |
357 mova [r4q+ncoeffq*2+16], m5 | 358 mova [r4q+ncoeffq*2+16], m5 |
358 add ncoeffq, mmsize | 359 add ncoeffq, mmsize |
359 jl .ac_only_loop | 360 jl .ac_only_loop |
360 %endif | |
361 | 361 |
362 .accumulate_eob: | 362 .accumulate_eob: |
363 ; horizontally accumulate/max eobs and write into [eob] memory pointer | 363 ; horizontally accumulate/max eobs and write into [eob] memory pointer |
364 mov r2, eobmp | 364 mov r2, eobmp |
365 pshufd m7, m8, 0xe | 365 pshufd m7, m8, 0xe |
366 pmaxsw m8, m7 | 366 pmaxsw m8, m7 |
367 pshuflw m7, m8, 0xe | 367 pshuflw m7, m8, 0xe |
368 pmaxsw m8, m7 | 368 pmaxsw m8, m7 |
369 pshuflw m7, m8, 0x1 | 369 pshuflw m7, m8, 0x1 |
370 pmaxsw m8, m7 | 370 pmaxsw m8, m7 |
(...skipping 19 matching lines...) Expand all Loading... |
390 mova [r2q+ncoeffq*2+16], m7 | 390 mova [r2q+ncoeffq*2+16], m7 |
391 add ncoeffq, mmsize | 391 add ncoeffq, mmsize |
392 jl .blank_loop | 392 jl .blank_loop |
393 mov word [r3q], 0 | 393 mov word [r3q], 0 |
394 RET | 394 RET |
395 %endmacro | 395 %endmacro |
396 | 396 |
; Instantiate the QUANTIZE_FP macro body twice, after selecting the ssse3
; XMM configuration with INIT_XMM.
;
; The first macro argument (%1) is the variant name that the macro body tests
; with `%ifidn %1, fp_32x32`. As shown in the NEW column of this diff:
;   fp       - takes the %else path, so the skip threshold is m0 = m3 >> 1.
;   fp_32x32 - additionally shifts the dequantized words (m8/m13) right by 1,
;              re-applies the coefficient signs, and uses m0 = m3 >> 2.
; In both variants an iteration whose absolute coefficients are all <= m0
; jumps to .skip_iter, which stores m5 to both output buffers (r3q and r4q).
;
; NOTE(review): the second argument (7) is consumed by the part of the macro
; that is not visible here - presumably the argument count; confirm against
; the %macro header.
397 INIT_XMM ssse3 | 397 INIT_XMM ssse3 |
398 QUANTIZE_FP fp, 7 | 398 QUANTIZE_FP fp, 7 |
399 QUANTIZE_FP fp_32x32, 7 | 399 QUANTIZE_FP fp_32x32, 7 |
OLD | NEW |