| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| (...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 393 ; begin epilog | 393 ; begin epilog |
| 394 add rsp, 16 | 394 add rsp, 16 |
| 395 pop rdi | 395 pop rdi |
| 396 pop rsi | 396 pop rsi |
| 397 RESTORE_GOT | 397 RESTORE_GOT |
| 398 RESTORE_XMM | 398 RESTORE_XMM |
| 399 UNSHADOW_ARGS | 399 UNSHADOW_ARGS |
| 400 pop rbp | 400 pop rbp |
| 401 ret | 401 ret |
| 402 | 402 |
| 403 ;void vp9_filter_block2d_bil_var_sse2 | |
| 404 ;( | |
| 405 ; unsigned char *ref_ptr, | |
| 406 ; int ref_pixels_per_line, | |
| 407 ; unsigned char *src_ptr, | |
| 408 ; int src_pixels_per_line, | |
| 409 ; unsigned int Height, | |
| 410 ; int xoffset, | |
| 411 ; int yoffset, | |
| 412 ; int *sum, | |
| 413 ; unsigned int *sumsquared | |
| 414 ; Purpose: bilinear-filters 8-pixel-wide rows of ref_ptr (a pass is skipped when its offset is 0), subtracts the matching src_ptr row, and over Height rows accumulates the sum of differences (-> *sum) and the sum of squared differences (-> *sumsquared). | |
| 415 ;) | |
| 416 global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE | |
| 417 sym(vp9_filter_block2d_bil_var_sse2): | |
| 418 push rbp | |
| 419 mov rbp, rsp | |
| 420 SHADOW_ARGS_TO_STACK 9 | |
| 421 SAVE_XMM 7 | |
| 422 GET_GOT rbx | |
| 423 push rsi | |
| 424 push rdi | |
| 425 push rbx | |
| 426 ; end prolog | |
| 427 | |
| 428 pxor xmm6, xmm6 ; xmm6 = 0: word-lane accumulator for the sum of differences | |
| 429 pxor xmm7, xmm7 ; xmm7 = 0: dword-lane accumulator for the sum of squared differences | |
| 430 | |
| 431 lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding | |
| 432 movdqa xmm4, XMMWORD PTR [rsi] | |
| 433 | |
| 434 lea rcx, [GLOBAL(bilinear_filters_sse2)] | |
| 435 movsxd rax, dword ptr arg(5) ; xoffset | |
| 436 | |
| 437 cmp rax, 0 ; skip first_pass f
ilter if xoffset=0 | |
| 438 je filter_block2d_bil_var_sse2_sp_only | |
| 439 | |
| 440 shl rax, 5 ; point to filter c
oeff with xoffset | |
| 441 lea rax, [rax + rcx] ; HFilter = bilinear_filters_sse2 + xoffset*32 (one table row is 32 bytes) | |
| 442 | |
| 443 movsxd rdx, dword ptr arg(6) ; yoffset | |
| 444 | |
| 445 cmp rdx, 0 ; skip second_pass
filter if yoffset=0 | |
| 446 je filter_block2d_bil_var_sse2_fp_only | |
| 447 | |
| 448 shl rdx, 5 | |
| 449 lea rdx, [rdx + rcx] ; VFilter = bilinear_filters_sse2 + yoffset*32 | |
| 450 | |
| 451 mov rsi, arg(0) ;ref_ptr | |
| 452 mov rdi, arg(2) ;src_ptr | |
| 453 movsxd rcx, dword ptr arg(4) ;Height | |
| 454 | |
| 455 pxor xmm0, xmm0 ; xmm0 = 0, second operand for widening bytes to words | |
| 456 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of the first row | |
| 457 movq xmm3, QWORD PTR [rsi+1] ; same row starting one pixel to the right | |
| 458 | |
| 459 punpcklbw xmm1, xmm0 ; bytes -> words | |
| 460 pmullw xmm1, [rax] ; * first horizontal tap | |
| 461 punpcklbw xmm3, xmm0 | |
| 462 pmullw xmm3, [rax+16] ; * second horizontal tap | |
| 463 | |
| 464 paddw xmm1, xmm3 ; add the two tap products | |
| 465 paddw xmm1, xmm4 ; + rounding constant | |
| 466 psraw xmm1, xmm_filter_shift ; shift down by xmm_filter_shift (defined outside this view) | |
| 467 movdqa xmm5, xmm1 | |
| 468 | |
| 469 movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line | |
| 470 lea rsi, [rsi + rbx] | |
| 471 %if ABI_IS_32BIT=0 | |
| 472 movsxd r9, dword ptr arg(3) ;src_pixels_per_line | |
| 473 %endif | |
| 474 | |
| 475 filter_block2d_bil_var_sse2_loop: | |
| 476 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of the next row | |
| 477 movq xmm3, QWORD PTR [rsi+1] ; same row starting one pixel to the right | |
| 478 | |
| 479 punpcklbw xmm1, xmm0 ; bytes -> words | |
| 480 pmullw xmm1, [rax] ; * first horizontal tap | |
| 481 punpcklbw xmm3, xmm0 ; bytes -> words | |
| 482 pmullw xmm3, [rax+16] ; * second horizontal tap | |
| 483 | |
| 484 paddw xmm1, xmm3 ; add the two tap products | |
| 485 paddw xmm1, xmm4 ; + rounding constant | |
| 486 psraw xmm1, xmm_filter_shift ; xmm1 = horizontally filtered current row | |
| 487 | |
| 488 movdqa xmm3, xmm5 ; xmm3 = horizontally filtered previous row | |
| 489 movdqa xmm5, xmm1 ; keep the current row for the next iteration | |
| 490 | |
| 491 pmullw xmm3, [rdx] ; previous row * first vertical tap | |
| 492 pmullw xmm1, [rdx+16] ; current row * second vertical tap | |
| 493 paddw xmm1, xmm3 ; add the two tap products | |
| 494 paddw xmm1, xmm4 ; + rounding constant | |
| 495 psraw xmm1, xmm_filter_shift ; xmm1 = result of both filter passes | |
| 496 | |
| 497 movq xmm3, QWORD PTR [rdi] ; 8 src pixels | |
| 498 punpcklbw xmm3, xmm0 ; bytes -> words | |
| 499 | |
| 500 psubw xmm1, xmm3 ; diff = filtered ref - src | |
| 501 paddw xmm6, xmm1 ; accumulate sum of differences | |
| 502 | |
| 503 pmaddwd xmm1, xmm1 ; square each diff and add adjacent pairs into dwords | |
| 504 paddd xmm7, xmm1 ; accumulate sum of squared differences | |
| 505 | |
| 506 lea rsi, [rsi + rbx] ;ref_pixels_per_lin
e | |
| 507 %if ABI_IS_32BIT | |
| 508 add rdi, dword ptr arg(3) ;src_pixels_per_lin
e | |
| 509 %else | |
| 510 lea rdi, [rdi + r9] | |
| 511 %endif | |
| 512 | |
| 513 sub rcx, 1 ; one row done | |
| 514 jnz filter_block2d_bil_var_sse2_loop ; loop until the Height counter reaches 0 | |
| 515 | |
| 516 jmp filter_block2d_bil_variance | |
| 517 | |
| 518 filter_block2d_bil_var_sse2_sp_only: | |
| 519 movsxd rdx, dword ptr arg(6) ; yoffset | |
| 520 | |
| 521 cmp rdx, 0 ; skip all if both
xoffset=0 and yoffset=0 | |
| 522 je filter_block2d_bil_var_sse2_full_pixel | |
| 523 | |
| 524 shl rdx, 5 | |
| 525 lea rdx, [rdx + rcx] ; VFilter = bilinear_filters_sse2 + yoffset*32 (rcx still holds the table base here) | |
| 526 | |
| 527 mov rsi, arg(0) ;ref_ptr | |
| 528 mov rdi, arg(2) ;src_ptr | |
| 529 movsxd rcx, dword ptr arg(4) ;Height | |
| 530 movsxd rax, dword ptr arg(1) ;ref_pixels_per_lin
e | |
| 531 | |
| 532 pxor xmm0, xmm0 ; xmm0 = 0, second operand for widening bytes to words | |
| 533 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of the first row | |
| 534 punpcklbw xmm1, xmm0 ; bytes -> words | |
| 535 | |
| 536 movsxd rbx, dword ptr arg(3) ;src_pixels_per_lin
e | |
| 537 lea rsi, [rsi + rax] | |
| 538 | |
| 539 filter_block2d_bil_sp_only_loop: | |
| 540 movq xmm3, QWORD PTR [rsi] ; 8 ref pixels of the next row | |
| 541 punpcklbw xmm3, xmm0 ; bytes -> words | |
| 542 movdqa xmm5, xmm3 | |
| 543 | |
| 544 pmullw xmm1, [rdx] ; previous row * first vertical tap | |
| 545 pmullw xmm3, [rdx+16] ; current row * second vertical tap | |
| 546 paddw xmm1, xmm3 ; add the two tap products | |
| 547 paddw xmm1, xmm4 ; + rounding constant | |
| 548 psraw xmm1, xmm_filter_shift ; xmm1 = vertically filtered row | |
| 549 | |
| 550 movq xmm3, QWORD PTR [rdi] ; 8 src pixels | |
| 551 punpcklbw xmm3, xmm0 ; bytes -> words | |
| 552 | |
| 553 psubw xmm1, xmm3 ; diff = filtered ref - src | |
| 554 paddw xmm6, xmm1 ; accumulate sum of differences | |
| 555 | |
| 556 pmaddwd xmm1, xmm1 ; square each diff and add adjacent pairs into dwords | |
| 557 paddd xmm7, xmm1 ; accumulate sum of squared differences | |
| 558 | |
| 559 movdqa xmm1, xmm5 ; the unfiltered current row becomes the previous row | |
| 560 lea rsi, [rsi + rax] ;ref_pixels_per_lin
e | |
| 561 lea rdi, [rdi + rbx] ;src_pixels_per_lin
e | |
| 562 | |
| 563 sub rcx, 1 ; one row done | |
| 564 jnz filter_block2d_bil_sp_only_loop ; loop until the Height counter reaches 0 | |
| 565 | |
| 566 jmp filter_block2d_bil_variance | |
| 567 | |
| 568 filter_block2d_bil_var_sse2_full_pixel: | |
| 569 mov rsi, arg(0) ;ref_ptr | |
| 570 mov rdi, arg(2) ;src_ptr | |
| 571 movsxd rcx, dword ptr arg(4) ;Height | |
| 572 movsxd rax, dword ptr arg(1) ;ref_pixels_per_lin
e | |
| 573 movsxd rbx, dword ptr arg(3) ;src_pixels_per_lin
e | |
| 574 pxor xmm0, xmm0 ; xmm0 = 0, second operand for widening bytes to words | |
| 575 | |
| 576 filter_block2d_bil_full_pixel_loop: | |
| 577 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels, used unfiltered | |
| 578 punpcklbw xmm1, xmm0 ; bytes -> words | |
| 579 | |
| 580 movq xmm2, QWORD PTR [rdi] ; 8 src pixels | |
| 581 punpcklbw xmm2, xmm0 ; bytes -> words | |
| 582 | |
| 583 psubw xmm1, xmm2 ; diff = ref - src | |
| 584 paddw xmm6, xmm1 ; accumulate sum of differences | |
| 585 | |
| 586 pmaddwd xmm1, xmm1 ; square each diff and add adjacent pairs into dwords | |
| 587 paddd xmm7, xmm1 ; accumulate sum of squared differences | |
| 588 | |
| 589 lea rsi, [rsi + rax] ;ref_pixels_per_lin
e | |
| 590 lea rdi, [rdi + rbx] ;src_pixels_per_lin
e | |
| 591 | |
| 592 sub rcx, 1 ; one row done | |
| 593 jnz filter_block2d_bil_full_pixel_loop ; loop until the Height counter reaches 0 | |
| 594 | |
| 595 jmp filter_block2d_bil_variance | |
| 596 | |
| 597 filter_block2d_bil_var_sse2_fp_only: | |
| 598 mov rsi, arg(0) ;ref_ptr | |
| 599 mov rdi, arg(2) ;src_ptr | |
| 600 movsxd rcx, dword ptr arg(4) ;Height | |
| 601 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_lin
e | |
| 602 | |
| 603 pxor xmm0, xmm0 ; xmm0 = 0, second operand for widening bytes to words | |
| 604 movsxd rbx, dword ptr arg(3) ;src_pixels_per_lin
e | |
| 605 | |
| 606 filter_block2d_bil_fp_only_loop: | |
| 607 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of this row | |
| 608 movq xmm3, QWORD PTR [rsi+1] ; same row starting one pixel to the right | |
| 609 | |
| 610 punpcklbw xmm1, xmm0 ; bytes -> words | |
| 611 pmullw xmm1, [rax] ; * first horizontal tap (rax = HFilter set before the jump here) | |
| 612 punpcklbw xmm3, xmm0 ; bytes -> words | |
| 613 pmullw xmm3, [rax+16] ; * second horizontal tap | |
| 614 | |
| 615 paddw xmm1, xmm3 ; add the two tap products | |
| 616 paddw xmm1, xmm4 ; + rounding constant | |
| 617 psraw xmm1, xmm_filter_shift ; xmm1 = horizontally filtered row | |
| 618 | |
| 619 movq xmm3, QWORD PTR [rdi] ; 8 src pixels | |
| 620 punpcklbw xmm3, xmm0 ; bytes -> words | |
| 621 | |
| 622 psubw xmm1, xmm3 ; diff = filtered ref - src | |
| 623 paddw xmm6, xmm1 ; accumulate sum of differences | |
| 624 | |
| 625 pmaddwd xmm1, xmm1 ; square each diff and add adjacent pairs into dwords | |
| 626 paddd xmm7, xmm1 ; accumulate sum of squared differences | |
| 627 lea rsi, [rsi + rdx] | |
| 628 lea rdi, [rdi + rbx] ;src_pixels_per_lin
e | |
| 629 | |
| 630 sub rcx, 1 ; one row done | |
| 631 jnz filter_block2d_bil_fp_only_loop ; loop until the Height counter reaches 0 | |
| 632 | |
| 633 jmp filter_block2d_bil_variance | |
| 634 | |
| 635 filter_block2d_bil_variance: | |
| 636 movdq2q mm6, xmm6 ; mm6 = low four word sums | |
| 637 movdq2q mm7, xmm7 ; mm7 = low two dword square sums | |
| 638 | |
| 639 psrldq xmm6, 8 | |
| 640 psrldq xmm7, 8 | |
| 641 | |
| 642 movdq2q mm2, xmm6 | |
| 643 movdq2q mm3, xmm7 | |
| 644 | |
| 645 paddw mm6, mm2 | |
| 646 paddd mm7, mm3 | |
| 647 | |
| 648 pxor mm3, mm3 ; clear mm3 before interleaving | |
| 649 pxor mm2, mm2 ; clear mm2 before interleaving | |
| 650 | |
| 651 punpcklwd mm2, mm6 ; low two word sums placed in the upper halves of two dwords | |
| 652 punpckhwd mm3, mm6 ; high two word sums placed the same way | |
| 653 | |
| 654 paddd mm2, mm3 ; two partial sums, each still in the upper 16 bits of its dword | |
| 655 movq mm6, mm2 ; copy so the upper dword can be folded in | |
| 656 | |
| 657 psrlq mm6, 32 ; bring the upper dword down | |
| 658 paddd mm2, mm6 ; low dword = combined sum in its upper 16 bits | |
| 659 | |
| 660 psrad mm2, 16 ; arithmetic shift leaves the sign-extended sum in the low dword | |
| 661 movq mm4, mm7 ; copy the two dword square sums | |
| 662 | |
| 663 psrlq mm4, 32 ; bring the upper dword down | |
| 664 paddd mm4, mm7 ; low dword = total sum of squared differences | |
| 665 | |
| 666 mov rsi, arg(7) ; sum | |
| 667 mov rdi, arg(8) ; sumsquared | |
| 668 | |
| 669 movd [rsi], mm2 ; xsum | |
| 670 movd [rdi], mm4 ; xxsum | |
| 671 | |
| 672 ; begin epilog | |
| 673 pop rbx | |
| 674 pop rdi | |
| 675 pop rsi | |
| 676 RESTORE_GOT | |
| 677 RESTORE_XMM | |
| 678 UNSHADOW_ARGS | |
| 679 pop rbp | |
| 680 ret | |
| 681 | |
| 682 | |
| 683 ;void vp9_half_horiz_vert_variance8x_h_sse2 | 403 ;void vp9_half_horiz_vert_variance8x_h_sse2 |
| 684 ;( | 404 ;( |
| 685 ; unsigned char *ref_ptr, | 405 ; unsigned char *ref_ptr, |
| 686 ; int ref_pixels_per_line, | 406 ; int ref_pixels_per_line, |
| 687 ; unsigned char *src_ptr, | 407 ; unsigned char *src_ptr, |
| 688 ; int src_pixels_per_line, | 408 ; int src_pixels_per_line, |
| 689 ; unsigned int Height, | 409 ; unsigned int Height, |
| 690 ; int *sum, | 410 ; int *sum, |
| 691 ; unsigned int *sumsquared | 411 ; unsigned int *sumsquared |
| 692 ;) | 412 ;) |
| (...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 795 | 515 |
| 796 ; begin epilog | 516 ; begin epilog |
| 797 pop rdi | 517 pop rdi |
| 798 pop rsi | 518 pop rsi |
| 799 RESTORE_GOT | 519 RESTORE_GOT |
| 800 RESTORE_XMM | 520 RESTORE_XMM |
| 801 UNSHADOW_ARGS | 521 UNSHADOW_ARGS |
| 802 pop rbp | 522 pop rbp |
| 803 ret | 523 ret |
| 804 | 524 |
| 805 ;void vp9_half_horiz_vert_variance16x_h_sse2 | |
| 806 ;( | |
| 807 ; unsigned char *ref_ptr, | |
| 808 ; int ref_pixels_per_line, | |
| 809 ; unsigned char *src_ptr, | |
| 810 ; int src_pixels_per_line, | |
| 811 ; unsigned int Height, | |
| 812 ; int *sum, | |
| 813 ; unsigned int *sumsquared | |
| 814 ;) Purpose: for Height rows of 16 pixels, averages each ref row with itself shifted one pixel (pavgb), averages that with the same result from the next row, subtracts the src row, and writes the sum of differences to *sum and the sum of squared differences to *sumsquared. | |
| 815 global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE | |
| 816 sym(vp9_half_horiz_vert_variance16x_h_sse2): | |
| 817 push rbp | |
| 818 mov rbp, rsp | |
| 819 SHADOW_ARGS_TO_STACK 7 | |
| 820 SAVE_XMM 7 | |
| 821 GET_GOT rbx | |
| 822 push rsi | |
| 823 push rdi | |
| 824 ; end prolog | |
| 825 | |
| 826 pxor xmm6, xmm6 ; error accumulator (word lanes) | |
| 827 pxor xmm7, xmm7 ; sse accumulator (dword lanes) | |
| 828 mov rsi, arg(0) ;ref_ptr ; | |
| 829 | |
| 830 mov rdi, arg(2) ;src_ptr ; | |
| 831 movsxd rcx, dword ptr arg(4) ;Height ; | |
| 832 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line | |
| 833 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line | |
| 834 | |
| 835 pxor xmm0, xmm0 ; xmm0 = 0, second operand for widening bytes to words | |
| 836 | |
| 837 movdqu xmm5, XMMWORD PTR [rsi] | |
| 838 movdqu xmm3, XMMWORD PTR [rsi+1] | |
| 839 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,x
mm3) horizontal line 1 | |
| 840 | |
| 841 lea rsi, [rsi + rax] | |
| 842 | |
| 843 .half_horiz_vert_variance16x_h_1: | |
| 844 movdqu xmm1, XMMWORD PTR [rsi] ; 16 ref pixels of the next row | |
| 845 movdqu xmm2, XMMWORD PTR [rsi+1] ; same row starting one pixel to the right | |
| 846 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,x
mm3) horizontal line i+1 | |
| 847 | |
| 848 pavgb xmm5, xmm1 ; xmm5 = vertical av
erage of the above | |
| 849 | |
| 850 movdqa xmm4, xmm5 | |
| 851 punpcklbw xmm5, xmm0 ; xmm5 = words of a
bove | |
| 852 punpckhbw xmm4, xmm0 | |
| 853 | |
| 854 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..
d7 | |
| 855 punpcklbw xmm3, xmm0 ; xmm3 = words of a
bove | |
| 856 psubw xmm5, xmm3 ; xmm5 -= xmm3 | |
| 857 | |
| 858 movq xmm3, QWORD PTR [rdi+8] | |
| 859 punpcklbw xmm3, xmm0 | |
| 860 psubw xmm4, xmm3 | |
| 861 | |
| 862 paddw xmm6, xmm5 ; xmm6 += accumulat
ed column differences | |
| 863 paddw xmm6, xmm4 | |
| 864 pmaddwd xmm5, xmm5 ; xmm5 = squared diffs, adjacent pairs added into dwords | |
| 865 pmaddwd xmm4, xmm4 | |
| 866 paddd xmm7, xmm5 ; xmm7 += accumulat
ed square column differences | |
| 867 paddd xmm7, xmm4 | |
| 868 | |
| 869 movdqa xmm5, xmm1 ; save xmm1 for use
on the next row | |
| 870 | |
| 871 lea rsi, [rsi + rax] | |
| 872 lea rdi, [rdi + rdx] | |
| 873 | |
| 874 sub rcx, 1 ; one row done | |
| 875 jnz .half_horiz_vert_variance16x_h_1 ; loop until the Height counter reaches 0 | |
| 876 | |
| 877 pxor xmm1, xmm1 | |
| 878 pxor xmm5, xmm5 | |
| 879 | |
| 880 punpcklwd xmm0, xmm6 | |
| 881 punpckhwd xmm1, xmm6 | |
| 882 psrad xmm0, 16 | |
| 883 psrad xmm1, 16 | |
| 884 paddd xmm0, xmm1 | |
| 885 movdqa xmm1, xmm0 | |
| 886 | |
| 887 movdqa xmm6, xmm7 | |
| 888 punpckldq xmm6, xmm5 | |
| 889 punpckhdq xmm7, xmm5 | |
| 890 paddd xmm6, xmm7 | |
| 891 | |
| 892 punpckldq xmm0, xmm5 | |
| 893 punpckhdq xmm1, xmm5 | |
| 894 paddd xmm0, xmm1 | |
| 895 | |
| 896 movdqa xmm7, xmm6 | |
| 897 movdqa xmm1, xmm0 | |
| 898 | |
| 899 psrldq xmm7, 8 | |
| 900 psrldq xmm1, 8 | |
| 901 | |
| 902 paddd xmm6, xmm7 | |
| 903 paddd xmm0, xmm1 | |
| 904 | |
| 905 mov rsi, arg(5) ;[Sum] | |
| 906 mov rdi, arg(6) ;[SSE] | |
| 907 | |
| 908 movd [rsi], xmm0 | |
| 909 movd [rdi], xmm6 | |
| 910 | |
| 911 ; begin epilog | |
| 912 pop rdi | |
| 913 pop rsi | |
| 914 RESTORE_GOT | |
| 915 RESTORE_XMM | |
| 916 UNSHADOW_ARGS | |
| 917 pop rbp | |
| 918 ret | |
| 919 | |
| 920 | |
| 921 ;void vp9_half_vert_variance8x_h_sse2 | 525 ;void vp9_half_vert_variance8x_h_sse2 |
| 922 ;( | 526 ;( |
| 923 ; unsigned char *ref_ptr, | 527 ; unsigned char *ref_ptr, |
| 924 ; int ref_pixels_per_line, | 528 ; int ref_pixels_per_line, |
| 925 ; unsigned char *src_ptr, | 529 ; unsigned char *src_ptr, |
| 926 ; int src_pixels_per_line, | 530 ; int src_pixels_per_line, |
| 927 ; unsigned int Height, | 531 ; unsigned int Height, |
| 928 ; int *sum, | 532 ; int *sum, |
| 929 ; unsigned int *sumsquared | 533 ; unsigned int *sumsquared |
| 930 ;) | 534 ;) |
| (...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1018 | 622 |
| 1019 ; begin epilog | 623 ; begin epilog |
| 1020 pop rdi | 624 pop rdi |
| 1021 pop rsi | 625 pop rsi |
| 1022 RESTORE_GOT | 626 RESTORE_GOT |
| 1023 RESTORE_XMM | 627 RESTORE_XMM |
| 1024 UNSHADOW_ARGS | 628 UNSHADOW_ARGS |
| 1025 pop rbp | 629 pop rbp |
| 1026 ret | 630 ret |
| 1027 | 631 |
| 1028 ;void vp9_half_vert_variance16x_h_sse2 | |
| 1029 ;( | |
| 1030 ; unsigned char *ref_ptr, | |
| 1031 ; int ref_pixels_per_line, | |
| 1032 ; unsigned char *src_ptr, | |
| 1033 ; int src_pixels_per_line, | |
| 1034 ; unsigned int Height, | |
| 1035 ; int *sum, | |
| 1036 ; unsigned int *sumsquared | |
| 1037 ;) Purpose: for Height rows of 16 pixels, averages each ref row with the row below it (pavgb), subtracts the src row, and writes the sum of differences to *sum and the sum of squared differences to *sumsquared. | |
| 1038 global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE | |
| 1039 sym(vp9_half_vert_variance16x_h_sse2): | |
| 1040 push rbp | |
| 1041 mov rbp, rsp | |
| 1042 SHADOW_ARGS_TO_STACK 7 | |
| 1043 SAVE_XMM 7 | |
| 1044 GET_GOT rbx | |
| 1045 push rsi | |
| 1046 push rdi | |
| 1047 ; end prolog | |
| 1048 | |
| 1049 pxor xmm6, xmm6 ; error accumulator (word lanes) | |
| 1050 pxor xmm7, xmm7 ; sse accumulator (dword lanes) | |
| 1051 mov rsi, arg(0) ;ref_ptr | |
| 1052 | |
| 1053 mov rdi, arg(2) ;src_ptr | |
| 1054 movsxd rcx, dword ptr arg(4) ;Height | |
| 1055 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line | |
| 1056 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line | |
| 1057 | |
| 1058 movdqu xmm5, XMMWORD PTR [rsi] | |
| 1059 lea rsi, [rsi + rax ] | |
| 1060 pxor xmm0, xmm0 | |
| 1061 | |
| 1062 .half_vert_variance16x_h_1: | |
| 1063 movdqu xmm3, XMMWORD PTR [rsi] | |
| 1064 | |
| 1065 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,x
mm3) | |
| 1066 movdqa xmm4, xmm5 | |
| 1067 punpcklbw xmm5, xmm0 | |
| 1068 punpckhbw xmm4, xmm0 | |
| 1069 | |
| 1070 movq xmm2, QWORD PTR [rdi] | |
| 1071 punpcklbw xmm2, xmm0 | |
| 1072 psubw xmm5, xmm2 | |
| 1073 movq xmm2, QWORD PTR [rdi+8] | |
| 1074 punpcklbw xmm2, xmm0 | |
| 1075 psubw xmm4, xmm2 | |
| 1076 | |
| 1077 paddw xmm6, xmm5 ; xmm6 += accumulat
ed column differences | |
| 1078 paddw xmm6, xmm4 | |
| 1079 pmaddwd xmm5, xmm5 ; xmm5 = squared diffs, adjacent pairs added into dwords | |
| 1080 pmaddwd xmm4, xmm4 | |
| 1081 paddd xmm7, xmm5 ; xmm7 += accumulat
ed square column differences | |
| 1082 paddd xmm7, xmm4 | |
| 1083 | |
| 1084 movdqa xmm5, xmm3 | |
| 1085 | |
| 1086 lea rsi, [rsi + rax] | |
| 1087 lea rdi, [rdi + rdx] | |
| 1088 | |
| 1089 sub rcx, 1 | |
| 1090 jnz .half_vert_variance16x_h_1 | |
| 1091 | |
| 1092 pxor xmm1, xmm1 | |
| 1093 pxor xmm5, xmm5 | |
| 1094 | |
| 1095 punpcklwd xmm0, xmm6 | |
| 1096 punpckhwd xmm1, xmm6 | |
| 1097 psrad xmm0, 16 | |
| 1098 psrad xmm1, 16 | |
| 1099 paddd xmm0, xmm1 | |
| 1100 movdqa xmm1, xmm0 | |
| 1101 | |
| 1102 movdqa xmm6, xmm7 | |
| 1103 punpckldq xmm6, xmm5 | |
| 1104 punpckhdq xmm7, xmm5 | |
| 1105 paddd xmm6, xmm7 | |
| 1106 | |
| 1107 punpckldq xmm0, xmm5 | |
| 1108 punpckhdq xmm1, xmm5 | |
| 1109 paddd xmm0, xmm1 | |
| 1110 | |
| 1111 movdqa xmm7, xmm6 | |
| 1112 movdqa xmm1, xmm0 | |
| 1113 | |
| 1114 psrldq xmm7, 8 | |
| 1115 psrldq xmm1, 8 | |
| 1116 | |
| 1117 paddd xmm6, xmm7 | |
| 1118 paddd xmm0, xmm1 | |
| 1119 | |
| 1120 mov rsi, arg(5) ;[Sum] | |
| 1121 mov rdi, arg(6) ;[SSE] | |
| 1122 | |
| 1123 movd [rsi], xmm0 | |
| 1124 movd [rdi], xmm6 | |
| 1125 | |
| 1126 ; begin epilog | |
| 1127 pop rdi | |
| 1128 pop rsi | |
| 1129 RESTORE_GOT | |
| 1130 RESTORE_XMM | |
| 1131 UNSHADOW_ARGS | |
| 1132 pop rbp | |
| 1133 ret | |
| 1134 | |
| 1135 | 632 |
| 1136 ;void vp9_half_horiz_variance8x_h_sse2 | 633 ;void vp9_half_horiz_variance8x_h_sse2 |
| 1137 ;( | 634 ;( |
| 1138 ; unsigned char *ref_ptr, | 635 ; unsigned char *ref_ptr, |
| 1139 ; int ref_pixels_per_line, | 636 ; int ref_pixels_per_line, |
| 1140 ; unsigned char *src_ptr, | 637 ; unsigned char *src_ptr, |
| 1141 ; int src_pixels_per_line, | 638 ; int src_pixels_per_line, |
| 1142 ; unsigned int Height, | 639 ; unsigned int Height, |
| 1143 ; int *sum, | 640 ; int *sum, |
| 1144 ; unsigned int *sumsquared | 641 ; unsigned int *sumsquared |
| (...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1231 | 728 |
| 1232 ; begin epilog | 729 ; begin epilog |
| 1233 pop rdi | 730 pop rdi |
| 1234 pop rsi | 731 pop rsi |
| 1235 RESTORE_GOT | 732 RESTORE_GOT |
| 1236 RESTORE_XMM | 733 RESTORE_XMM |
| 1237 UNSHADOW_ARGS | 734 UNSHADOW_ARGS |
| 1238 pop rbp | 735 pop rbp |
| 1239 ret | 736 ret |
| 1240 | 737 |
| 1241 ;void vp9_half_horiz_variance16x_h_sse2 | |
| 1242 ;( | |
| 1243 ; unsigned char *ref_ptr, | |
| 1244 ; int ref_pixels_per_line, | |
| 1245 ; unsigned char *src_ptr, | |
| 1246 ; int src_pixels_per_line, | |
| 1247 ; unsigned int Height, | |
| 1248 ; int *sum, | |
| 1249 ; unsigned int *sumsquared | |
| 1250 ;) Purpose: for Height rows of 16 pixels, averages each ref row with itself shifted one pixel (pavgb), subtracts the src row, and writes the sum of differences to *sum and the sum of squared differences to *sumsquared. | |
| 1251 global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE | |
| 1252 sym(vp9_half_horiz_variance16x_h_sse2): | |
| 1253 push rbp | |
| 1254 mov rbp, rsp | |
| 1255 SHADOW_ARGS_TO_STACK 7 | |
| 1256 SAVE_XMM 7 | |
| 1257 GET_GOT rbx | |
| 1258 push rsi | |
| 1259 push rdi | |
| 1260 ; end prolog | |
| 1261 | |
| 1262 pxor xmm6, xmm6 ; error accumulator (word lanes) | |
| 1263 pxor xmm7, xmm7 ; sse accumulator (dword lanes) | |
| 1264 mov rsi, arg(0) ;ref_ptr ; | |
| 1265 | |
| 1266 mov rdi, arg(2) ;src_ptr ; | |
| 1267 movsxd rcx, dword ptr arg(4) ;Height ; | |
| 1268 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line | |
| 1269 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line | |
| 1270 | |
| 1271 pxor xmm0, xmm0 ; xmm0 = 0, second operand for widening bytes to words | |
| 1272 | |
| 1273 .half_horiz_variance16x_h_1: | |
| 1274 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2
..s15 | |
| 1275 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3
..s16 | |
| 1276 | |
| 1277 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,x
mm3) | |
| 1278 movdqa xmm1, xmm5 | |
| 1279 punpcklbw xmm5, xmm0 ; xmm5 = words of a
bove | |
| 1280 punpckhbw xmm1, xmm0 | |
| 1281 | |
| 1282 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..
d7 | |
| 1283 punpcklbw xmm3, xmm0 ; xmm3 = words of a
bove | |
| 1284 movq xmm2, QWORD PTR [rdi+8] | |
| 1285 punpcklbw xmm2, xmm0 | |
| 1286 | |
| 1287 psubw xmm5, xmm3 ; xmm5 -= xmm3 | |
| 1288 psubw xmm1, xmm2 | |
| 1289 paddw xmm6, xmm5 ; xmm6 += accumulat
ed column differences | |
| 1290 paddw xmm6, xmm1 | |
| 1291 pmaddwd xmm5, xmm5 ; xmm5 = squared diffs, adjacent pairs added into dwords | |
| 1292 pmaddwd xmm1, xmm1 | |
| 1293 paddd xmm7, xmm5 ; xmm7 += accumulat
ed square column differences | |
| 1294 paddd xmm7, xmm1 | |
| 1295 | |
| 1296 lea rsi, [rsi + rax] | |
| 1297 lea rdi, [rdi + rdx] | |
| 1298 | |
| 1299 sub rcx, 1 ; one row done | |
| 1300 jnz .half_horiz_variance16x_h_1 ; loop until the Height counter reaches 0 | |
| 1301 | |
| 1302 pxor xmm1, xmm1 | |
| 1303 pxor xmm5, xmm5 | |
| 1304 | |
| 1305 punpcklwd xmm0, xmm6 | |
| 1306 punpckhwd xmm1, xmm6 | |
| 1307 psrad xmm0, 16 | |
| 1308 psrad xmm1, 16 | |
| 1309 paddd xmm0, xmm1 | |
| 1310 movdqa xmm1, xmm0 | |
| 1311 | |
| 1312 movdqa xmm6, xmm7 | |
| 1313 punpckldq xmm6, xmm5 | |
| 1314 punpckhdq xmm7, xmm5 | |
| 1315 paddd xmm6, xmm7 | |
| 1316 | |
| 1317 punpckldq xmm0, xmm5 | |
| 1318 punpckhdq xmm1, xmm5 | |
| 1319 paddd xmm0, xmm1 | |
| 1320 | |
| 1321 movdqa xmm7, xmm6 | |
| 1322 movdqa xmm1, xmm0 | |
| 1323 | |
| 1324 psrldq xmm7, 8 | |
| 1325 psrldq xmm1, 8 | |
| 1326 | |
| 1327 paddd xmm6, xmm7 | |
| 1328 paddd xmm0, xmm1 | |
| 1329 | |
| 1330 mov rsi, arg(5) ;[Sum] | |
| 1331 mov rdi, arg(6) ;[SSE] | |
| 1332 | |
| 1333 movd [rsi], xmm0 | |
| 1334 movd [rdi], xmm6 | |
| 1335 | |
| 1336 ; begin epilog | |
| 1337 pop rdi | |
| 1338 pop rsi | |
| 1339 RESTORE_GOT | |
| 1340 RESTORE_XMM | |
| 1341 UNSHADOW_ARGS | |
| 1342 pop rbp | |
| 1343 ret | |
| 1344 | 738 |
| 1345 SECTION_RODATA | 739 SECTION_RODATA |
| 1346 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; rounding term: half of the 128 that each tap pair below sums to | 740 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; rounding term: half of the 128 that each tap pair below sums to |
| 1347 align 16 | 741 align 16 |
| 1348 xmm_bi_rd: | 742 xmm_bi_rd: |
| 1349 times 8 dw 64 | 743 times 8 dw 64 |
| 1350 align 16 | 744 align 16 |
| 1351 bilinear_filters_sse2: | 745 bilinear_filters_sse2: |
| 1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 | 746 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 |
| 1353 dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 | 747 dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 |
| 1354 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 | 748 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 |
| 1355 dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 | 749 dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 |
| 1356 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 | 750 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 |
| 1357 dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 | 751 dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 |
| 1358 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 | 752 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 |
| 1359 dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 | 753 dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 |
| 1360 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 | 754 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 |
| 1361 dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 | 755 dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 |
| 1362 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 | 756 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 |
| 1363 dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 | 757 dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 |
| 1364 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 | 758 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 |
| 1365 dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 | 759 dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 |
| 1366 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 | 760 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 |
| 1367 dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 | 761 dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 |
| OLD | NEW |