| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| (...skipping 14 matching lines...) |
| 25 sym(idct_dequant_0_2x_sse2): | 25 sym(idct_dequant_0_2x_sse2): |
| 26 push rbp | 26 push rbp |
| 27 mov rbp, rsp | 27 mov rbp, rsp |
| 28 SHADOW_ARGS_TO_STACK 6 | 28 SHADOW_ARGS_TO_STACK 6 |
| 29 GET_GOT rbx | 29 GET_GOT rbx |
| 30 ; end prolog | 30 ; end prolog |
| 31 | 31 |
| 32 mov rdx, arg(1) ; dequant | 32 mov rdx, arg(1) ; dequant |
| 33 mov rax, arg(0) ; qcoeff | 33 mov rax, arg(0) ; qcoeff |
| 34 | 34 |
| 35 ; Zero out xmm7, for use unpacking | |
| 36 pxor xmm7, xmm7 | |
| 37 | |
| 38 movd xmm4, [rax] | 35 movd xmm4, [rax] |
| 39 movd xmm5, [rdx] | 36 movd xmm5, [rdx] |
| 40 | 37 |
| 41 pinsrw xmm4, [rax+32], 4 | 38 pinsrw xmm4, [rax+32], 4 |
| 42 pinsrw xmm5, [rdx], 4 | 39 pinsrw xmm5, [rdx], 4 |
| 43 | 40 |
| 44 pmullw xmm4, xmm5 | 41 pmullw xmm4, xmm5 |
| 45 | 42 |
| 43 ; Zero out xmm5, for use unpacking |
| 44 pxor xmm5, xmm5 |
| 45 |
| 46 ; clear coeffs | 46 ; clear coeffs |
| 47 movd [rax], xmm7 | 47 movd [rax], xmm5 |
| 48 movd [rax+32], xmm7 | 48 movd [rax+32], xmm5 |
| 49 ;pshufb | 49 ;pshufb |
| 50 pshuflw xmm4, xmm4, 00000000b | 50 pshuflw xmm4, xmm4, 00000000b |
| 51 pshufhw xmm4, xmm4, 00000000b | 51 pshufhw xmm4, xmm4, 00000000b |
| 52 | 52 |
| 53 mov rax, arg(2) ; pre | 53 mov rax, arg(2) ; pre |
| 54 paddw xmm4, [GLOBAL(fours)] | 54 paddw xmm4, [GLOBAL(fours)] |
| 55 | 55 |
| 56 movsxd rcx, dword ptr arg(5) ; blk_stride | 56 movsxd rcx, dword ptr arg(5) ; blk_stride |
| 57 psraw xmm4, 3 | 57 psraw xmm4, 3 |
| 58 | 58 |
| 59 movq xmm0, [rax] | 59 movq xmm0, [rax] |
| 60 movq xmm1, [rax+rcx] | 60 movq xmm1, [rax+rcx] |
| 61 movq xmm2, [rax+2*rcx] | 61 movq xmm2, [rax+2*rcx] |
| 62 lea rcx, [3*rcx] | 62 lea rcx, [3*rcx] |
| 63 movq xmm3, [rax+rcx] | 63 movq xmm3, [rax+rcx] |
| 64 | 64 |
| 65 punpcklbw xmm0, xmm7 | 65 punpcklbw xmm0, xmm5 |
| 66 punpcklbw xmm1, xmm7 | 66 punpcklbw xmm1, xmm5 |
| 67 punpcklbw xmm2, xmm7 | 67 punpcklbw xmm2, xmm5 |
| 68 punpcklbw xmm3, xmm7 | 68 punpcklbw xmm3, xmm5 |
| 69 | 69 |
| 70 mov rax, arg(3) ; dst | 70 mov rax, arg(3) ; dst |
| 71 movsxd rdx, dword ptr arg(4) ; dst_stride | 71 movsxd rdx, dword ptr arg(4) ; dst_stride |
| 72 | 72 |
| 73 ; Add to predict buffer | 73 ; Add to predict buffer |
| 74 paddw xmm0, xmm4 | 74 paddw xmm0, xmm4 |
| 75 paddw xmm1, xmm4 | 75 paddw xmm1, xmm4 |
| 76 paddw xmm2, xmm4 | 76 paddw xmm2, xmm4 |
| 77 paddw xmm3, xmm4 | 77 paddw xmm3, xmm4 |
| 78 | 78 |
| 79 ; pack up before storing | 79 ; pack up before storing |
| 80 packuswb xmm0, xmm7 | 80 packuswb xmm0, xmm5 |
| 81 packuswb xmm1, xmm7 | 81 packuswb xmm1, xmm5 |
| 82 packuswb xmm2, xmm7 | 82 packuswb xmm2, xmm5 |
| 83 packuswb xmm3, xmm7 | 83 packuswb xmm3, xmm5 |
| 84 | 84 |
| 85 ; store blocks back out | 85 ; store blocks back out |
| 86 movq [rax], xmm0 | 86 movq [rax], xmm0 |
| 87 movq [rax + rdx], xmm1 | 87 movq [rax + rdx], xmm1 |
| 88 | 88 |
| 89 lea rax, [rax + 2*rdx] | 89 lea rax, [rax + 2*rdx] |
| 90 | 90 |
| 91 movq [rax], xmm2 | 91 movq [rax], xmm2 |
| 92 movq [rax + rdx], xmm3 | 92 movq [rax + rdx], xmm3 |
| 93 | 93 |
| 94 ; begin epilog | 94 ; begin epilog |
| 95 RESTORE_GOT | 95 RESTORE_GOT |
| 96 UNSHADOW_ARGS | 96 UNSHADOW_ARGS |
| 97 pop rbp | 97 pop rbp |
| 98 ret | 98 ret |
| 99 | 99 |
| 100 global sym(idct_dequant_full_2x_sse2) | 100 global sym(idct_dequant_full_2x_sse2) |
| 101 sym(idct_dequant_full_2x_sse2): | 101 sym(idct_dequant_full_2x_sse2): |
| 102 push rbp | 102 push rbp |
| 103 mov rbp, rsp | 103 mov rbp, rsp |
| 104 SHADOW_ARGS_TO_STACK 7 | 104 SHADOW_ARGS_TO_STACK 7 |
| 105 SAVE_XMM 7 |
| 105 GET_GOT rbx | 106 GET_GOT rbx |
| 106 push rsi | 107 push rsi |
| 107 push rdi | 108 push rdi |
| 108 ; end prolog | 109 ; end prolog |
| 109 | 110 |
| 110 ; special case when 2 blocks have 0 or 1 coeffs | 111 ; special case when 2 blocks have 0 or 1 coeffs |
| 111 ; dc is set as first coeff, so no need to load qcoeff | 112 ; dc is set as first coeff, so no need to load qcoeff |
| 112 mov rax, arg(0) ; qcoeff | 113 mov rax, arg(0) ; qcoeff |
| 113 mov rsi, arg(2) ; pre | 114 mov rsi, arg(2) ; pre |
| 114 mov rdi, arg(3) ; dst | 115 mov rdi, arg(3) ; dst |
| (...skipping 225 matching lines...) |
| 340 | 341 |
| 341 lea rdi, [rdi + 2*rdx] | 342 lea rdi, [rdi + 2*rdx] |
| 342 | 343 |
| 343 movq [rdi], xmm2 | 344 movq [rdi], xmm2 |
| 344 movq [rdi + rdx], xmm3 | 345 movq [rdi + rdx], xmm3 |
| 345 | 346 |
| 346 ; begin epilog | 347 ; begin epilog |
| 347 pop rdi | 348 pop rdi |
| 348 pop rsi | 349 pop rsi |
| 349 RESTORE_GOT | 350 RESTORE_GOT |
| 351 RESTORE_XMM |
| 350 UNSHADOW_ARGS | 352 UNSHADOW_ARGS |
| 351 pop rbp | 353 pop rbp |
| 352 ret | 354 ret |
| 353 | 355 |
| 354 ;void idct_dequant_dc_0_2x_sse2 | 356 ;void idct_dequant_dc_0_2x_sse2 |
| 355 ; ( | 357 ; ( |
| 356 ; short *qcoeff - 0 | 358 ; short *qcoeff - 0 |
| 357 ; short *dequant - 1 | 359 ; short *dequant - 1 |
| 358 ; unsigned char *pre - 2 | 360 ; unsigned char *pre - 2 |
| 359 ; unsigned char *dst - 3 | 361 ; unsigned char *dst - 3 |
| (...skipping 10 matching lines...) |
| 370 push rdi | 372 push rdi |
| 371 ; end prolog | 373 ; end prolog |
| 372 | 374 |
| 373 ; special case when 2 blocks have 0 or 1 coeffs | 375 ; special case when 2 blocks have 0 or 1 coeffs |
| 374 ; dc is set as first coeff, so no need to load qcoeff | 376 ; dc is set as first coeff, so no need to load qcoeff |
| 375 mov rax, arg(0) ; qcoeff | 377 mov rax, arg(0) ; qcoeff |
| 376 mov rsi, arg(2) ; pre | 378 mov rsi, arg(2) ; pre |
| 377 mov rdi, arg(3) ; dst | 379 mov rdi, arg(3) ; dst |
| 378 mov rdx, arg(5) ; dc | 380 mov rdx, arg(5) ; dc |
| 379 | 381 |
| 380 ; Zero out xmm7, for use unpacking | 382 ; Zero out xmm5, for use unpacking |
| 381 pxor xmm7, xmm7 | 383 pxor xmm5, xmm5 |
| 382 | 384 |
| 383 ; load both dc words at once: 2 x 16 bits == one doubleword | 385 ; load both dc words at once: 2 x 16 bits == one doubleword |
| 384 movd xmm4, [rdx] | 386 movd xmm4, [rdx] |
| 385 | 387 |
| 386 ; Load up predict blocks | 388 ; Load up predict blocks |
| 387 movq xmm0, [rsi] | 389 movq xmm0, [rsi] |
| 388 movq xmm1, [rsi+16] | 390 movq xmm1, [rsi+16] |
| 389 movq xmm2, [rsi+32] | 391 movq xmm2, [rsi+32] |
| 390 movq xmm3, [rsi+48] | 392 movq xmm3, [rsi+48] |
| 391 | 393 |
| 392 ; Duplicate and expand dc across | 394 ; Duplicate and expand dc across |
| 393 punpcklwd xmm4, xmm4 | 395 punpcklwd xmm4, xmm4 |
| 394 punpckldq xmm4, xmm4 | 396 punpckldq xmm4, xmm4 |
| 395 | 397 |
| 396 ; Rounding to dequant and downshift | 398 ; Rounding to dequant and downshift |
| 397 paddw xmm4, [GLOBAL(fours)] | 399 paddw xmm4, [GLOBAL(fours)] |
| 398 psraw xmm4, 3 | 400 psraw xmm4, 3 |
| 399 | 401 |
| 400 ; Predict buffer needs to be expanded from bytes to words | 402 ; Predict buffer needs to be expanded from bytes to words |
| 401 punpcklbw xmm0, xmm7 | 403 punpcklbw xmm0, xmm5 |
| 402 punpcklbw xmm1, xmm7 | 404 punpcklbw xmm1, xmm5 |
| 403 punpcklbw xmm2, xmm7 | 405 punpcklbw xmm2, xmm5 |
| 404 punpcklbw xmm3, xmm7 | 406 punpcklbw xmm3, xmm5 |
| 405 | 407 |
| 406 ; Add to predict buffer | 408 ; Add to predict buffer |
| 407 paddw xmm0, xmm4 | 409 paddw xmm0, xmm4 |
| 408 paddw xmm1, xmm4 | 410 paddw xmm1, xmm4 |
| 409 paddw xmm2, xmm4 | 411 paddw xmm2, xmm4 |
| 410 paddw xmm3, xmm4 | 412 paddw xmm3, xmm4 |
| 411 | 413 |
| 412 ; pack up before storing | 414 ; pack up before storing |
| 413 packuswb xmm0, xmm7 | 415 packuswb xmm0, xmm5 |
| 414 packuswb xmm1, xmm7 | 416 packuswb xmm1, xmm5 |
| 415 packuswb xmm2, xmm7 | 417 packuswb xmm2, xmm5 |
| 416 packuswb xmm3, xmm7 | 418 packuswb xmm3, xmm5 |
| 417 | 419 |
| 418 ; Load destination stride before writing out, | 420 ; Load destination stride before writing out, |
| 419 ; doesn't need to persist | 421 ; doesn't need to persist |
| 420 movsxd rdx, dword ptr arg(4) ; dst_stride | 422 movsxd rdx, dword ptr arg(4) ; dst_stride |
| 421 | 423 |
| 422 ; store blocks back out | 424 ; store blocks back out |
| 423 movq [rdi], xmm0 | 425 movq [rdi], xmm0 |
| 424 movq [rdi + rdx], xmm1 | 426 movq [rdi + rdx], xmm1 |
| 425 | 427 |
| 426 lea rdi, [rdi + 2*rdx] | 428 lea rdi, [rdi + 2*rdx] |
| 427 | 429 |
| 428 movq [rdi], xmm2 | 430 movq [rdi], xmm2 |
| 429 movq [rdi + rdx], xmm3 | 431 movq [rdi + rdx], xmm3 |
| 430 | 432 |
| 431 ; begin epilog | 433 ; begin epilog |
| 432 pop rdi | 434 pop rdi |
| 433 pop rsi | 435 pop rsi |
| 434 RESTORE_GOT | 436 RESTORE_GOT |
| 435 UNSHADOW_ARGS | 437 UNSHADOW_ARGS |
| 436 pop rbp | 438 pop rbp |
| 437 ret | 439 ret |
| 438 | 440 |
| 439 global sym(idct_dequant_dc_full_2x_sse2) | 441 global sym(idct_dequant_dc_full_2x_sse2) |
| 440 sym(idct_dequant_dc_full_2x_sse2): | 442 sym(idct_dequant_dc_full_2x_sse2): |
| 441 push rbp | 443 push rbp |
| 442 mov rbp, rsp | 444 mov rbp, rsp |
| 443 SHADOW_ARGS_TO_STACK 7 | 445 SHADOW_ARGS_TO_STACK 7 |
| 446 SAVE_XMM 7 |
| 444 GET_GOT rbx | 447 GET_GOT rbx |
| 445 push rsi | 448 push rsi |
| 446 push rdi | 449 push rdi |
| 447 ; end prolog | 450 ; end prolog |
| 448 | 451 |
| 449 ; special case when 2 blocks have 0 or 1 coeffs | 452 ; special case when 2 blocks have 0 or 1 coeffs |
| 450 ; dc is set as first coeff, so no need to load qcoeff | 453 ; dc is set as first coeff, so no need to load qcoeff |
| 451 mov rax, arg(0) ; qcoeff | 454 mov rax, arg(0) ; qcoeff |
| 452 mov rsi, arg(2) ; pre | 455 mov rsi, arg(2) ; pre |
| 453 mov rdi, arg(3) ; dst | 456 mov rdi, arg(3) ; dst |
| (...skipping 231 matching lines...) |
| 685 lea rdi, [rdi + 2*rdx] | 688 lea rdi, [rdi + 2*rdx] |
| 686 | 689 |
| 687 movq [rdi], xmm2 | 690 movq [rdi], xmm2 |
| 688 movq [rdi + rdx], xmm3 | 691 movq [rdi + rdx], xmm3 |
| 689 | 692 |
| 690 | 693 |
| 691 ; begin epilog | 694 ; begin epilog |
| 692 pop rdi | 695 pop rdi |
| 693 pop rsi | 696 pop rsi |
| 694 RESTORE_GOT | 697 RESTORE_GOT |
| 698 RESTORE_XMM |
| 695 UNSHADOW_ARGS | 699 UNSHADOW_ARGS |
| 696 pop rbp | 700 pop rbp |
| 697 ret | 701 ret |
| 698 | 702 |
| 699 SECTION_RODATA | 703 SECTION_RODATA |
| 700 align 16 | 704 align 16 |
| 701 fours: | 705 fours: |
| 702 times 8 dw 0x0004 | 706 times 8 dw 0x0004 |
| 703 align 16 | 707 align 16 |
| 704 x_s1sqr2: | 708 x_s1sqr2: |
| 705 times 8 dw 0x8A8C | 709 times 8 dw 0x8A8C |
| 706 align 16 | 710 align 16 |
| 707 x_c1sqr2less1: | 711 x_c1sqr2less1: |
| 708 times 8 dw 0x4E7B | 712 times 8 dw 0x4E7B |
| OLD | NEW |