Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(68)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
(...skipping 516 matching lines...) Expand 10 before | Expand all | Expand 10 after
527 pop rbx 527 pop rbx
528 ; begin epilog 528 ; begin epilog
529 pop rdi 529 pop rdi
530 pop rsi 530 pop rsi
531 RESTORE_XMM 531 RESTORE_XMM
532 UNSHADOW_ARGS 532 UNSHADOW_ARGS
533 pop rbp 533 pop rbp
534 ret 534 ret
535 535
536 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 536 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
537 %macro HORIZx4_ROW 2
538 movdqa %2, %1
539 pshufb %1, [GLOBAL(shuf_t0t1)]
540 pshufb %2, [GLOBAL(shuf_t2t3)]
541 pmaddubsw %1, xmm6
542 pmaddubsw %2, xmm7
543
544 paddsw %1, %2
545 movdqa %2, %1
546 psrldq %2, 8
547 paddsw %1, %2
548 paddsw %1, xmm5
549 psraw %1, 7
550 packuswb %1, %1
551 %endm
537 552
538 %macro HORIZx4 1 553 %macro HORIZx4 1
539 mov rdx, arg(5) ;filter ptr 554 mov rdx, arg(5) ;filter ptr
540 mov rsi, arg(0) ;src_ptr 555 mov rsi, arg(0) ;src_ptr
541 mov rdi, arg(2) ;output_ptr 556 mov rdi, arg(2) ;output_ptr
542 mov rcx, 0x0400040 557 mov rcx, 0x0400040
543 558
544 movdqa xmm4, [rdx] ;load filters 559 movdqa xmm4, [rdx] ;load filters
545 movq xmm5, rcx 560 movq xmm5, rcx
546 packsswb xmm4, xmm4 561 packsswb xmm4, xmm4
547 pshuflw xmm0, xmm4, 0b ;k0_k1 562 pshuflw xmm6, xmm4, 0b ;k0_k1
548 pshuflw xmm1, xmm4, 01010101b ;k2_k3 563 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
549 pshuflw xmm2, xmm4, 10101010b ;k4_k5 564 pshuflw xmm7, xmm4, 01010101b ;k2_k3
550 pshuflw xmm3, xmm4, 11111111b ;k6_k7 565 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
551 566 pshufd xmm5, xmm5, 0 ;rounding
552 punpcklqdq xmm0, xmm0
553 punpcklqdq xmm1, xmm1
554 punpcklqdq xmm2, xmm2
555 punpcklqdq xmm3, xmm3
556
557 movdqa k0k1, xmm0
558 movdqa k2k3, xmm1
559 pshufd xmm5, xmm5, 0
560 movdqa k4k5, xmm2
561 movdqa k6k7, xmm3
562 movdqa krd, xmm5
563 567
564 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 568 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
565 movsxd rdx, dword ptr arg(3) ;output_pitch 569 movsxd rdx, dword ptr arg(3) ;output_pitch
566 movsxd rcx, dword ptr arg(4) ;output_height 570 movsxd rcx, dword ptr arg(4) ;output_height
571 shr rcx, 1
572 .loop:
573 ;Do two rows at once
574 movq xmm0, [rsi - 3] ;load src
575 movq xmm1, [rsi + 5]
576 movq xmm2, [rsi + rax - 3]
577 movq xmm3, [rsi + rax + 5]
578 punpcklqdq xmm0, xmm1
579 punpcklqdq xmm2, xmm3
567 580
568 .loop: 581 HORIZx4_ROW xmm0, xmm1
569 movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 582 HORIZx4_ROW xmm2, xmm3
583 %if %1
584 movd xmm1, [rdi]
585 pavgb xmm0, xmm1
586 movd xmm3, [rdi + rdx]
587 pavgb xmm2, xmm3
588 %endif
589 movd [rdi], xmm0
590 movd [rdi +rdx], xmm2
570 591
571 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 592 lea rsi, [rsi + rax]
572 punpcklqdq xmm0, xmm3 593 prefetcht0 [rsi + 4 * rax - 3]
594 lea rsi, [rsi + rax]
595 lea rdi, [rdi + 2 * rdx]
596 prefetcht0 [rsi + 2 * rax - 3]
573 597
574 movdqa xmm1, xmm0 598 dec rcx
575 pshufb xmm0, [GLOBAL(shuf_t0t1)] 599 jnz .loop
576 pmaddubsw xmm0, k0k1
577 600
578 movdqa xmm2, xmm1 601 ; Do last row if output_height is odd
579 pshufb xmm1, [GLOBAL(shuf_t2t3)] 602 movsxd rcx, dword ptr arg(4) ;output_height
580 pmaddubsw xmm1, k2k3 603 and rcx, 1
604 je .done
581 605
582 movdqa xmm4, xmm2 606 movq xmm0, [rsi - 3] ; load src
583 pshufb xmm2, [GLOBAL(shuf_t4t5)] 607 movq xmm1, [rsi + 5]
584 pmaddubsw xmm2, k4k5 608 punpcklqdq xmm0, xmm1
585 609
586 pshufb xmm4, [GLOBAL(shuf_t6t7)] 610 HORIZx4_ROW xmm0, xmm1
587 pmaddubsw xmm4, k6k7
588
589 paddsw xmm0, xmm1
590 paddsw xmm0, xmm4
591 paddsw xmm0, xmm2
592 paddsw xmm0, krd
593 psraw xmm0, 7
594 packuswb xmm0, xmm0
595 %if %1 611 %if %1
596 movd xmm1, [rdi] 612 movd xmm1, [rdi]
597 pavgb xmm0, xmm1 613 pavgb xmm0, xmm1
598 %endif 614 %endif
599 lea rsi, [rsi + rax]
600 movd [rdi], xmm0 615 movd [rdi], xmm0
616 .done
617 %endm
601 618
602 lea rdi, [rdi + rdx] 619 %macro HORIZx8_ROW 4
603 dec rcx 620 movdqa %2, %1
604 jnz .loop 621 movdqa %3, %1
622 movdqa %4, %1
623
624 pshufb %1, [GLOBAL(shuf_t0t1)]
625 pshufb %2, [GLOBAL(shuf_t2t3)]
626 pshufb %3, [GLOBAL(shuf_t4t5)]
627 pshufb %4, [GLOBAL(shuf_t6t7)]
628
629 pmaddubsw %1, k0k1
630 pmaddubsw %2, k2k3
631 pmaddubsw %3, k4k5
632 pmaddubsw %4, k6k7
633
634 paddsw %1, %2
635 paddsw %1, %4
636 paddsw %1, %3
637 paddsw %1, krd
638 psraw %1, 7
639 packuswb %1, %1
605 %endm 640 %endm
606 641
607 %macro HORIZx8 1 642 %macro HORIZx8 1
608 mov rdx, arg(5) ;filter ptr 643 mov rdx, arg(5) ;filter ptr
609 mov rsi, arg(0) ;src_ptr 644 mov rsi, arg(0) ;src_ptr
610 mov rdi, arg(2) ;output_ptr 645 mov rdi, arg(2) ;output_ptr
611 mov rcx, 0x0400040 646 mov rcx, 0x0400040
612 647
613 movdqa xmm4, [rdx] ;load filters 648 movdqa xmm4, [rdx] ;load filters
614 movd xmm5, rcx 649 movd xmm5, rcx
(...skipping 11 matching lines...) Expand all
626 movdqa k0k1, xmm0 661 movdqa k0k1, xmm0
627 movdqa k2k3, xmm1 662 movdqa k2k3, xmm1
628 pshufd xmm5, xmm5, 0 663 pshufd xmm5, xmm5, 0
629 movdqa k4k5, xmm2 664 movdqa k4k5, xmm2
630 movdqa k6k7, xmm3 665 movdqa k6k7, xmm3
631 movdqa krd, xmm5 666 movdqa krd, xmm5
632 667
633 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 668 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
634 movsxd rdx, dword ptr arg(3) ;output_pitch 669 movsxd rdx, dword ptr arg(3) ;output_pitch
635 movsxd rcx, dword ptr arg(4) ;output_height 670 movsxd rcx, dword ptr arg(4) ;output_height
671 shr rcx, 1
636 672
637 .loop: 673 .loop:
638 movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 674 movq xmm0, [rsi - 3] ;load src
675 movq xmm3, [rsi + 5]
676 movq xmm4, [rsi + rax - 3]
677 movq xmm7, [rsi + rax + 5]
678 punpcklqdq xmm0, xmm3
679 punpcklqdq xmm4, xmm7
639 680
640 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 681 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
682 HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
683 %if %1
684 movq xmm1, [rdi]
685 movq xmm2, [rdi + rdx]
686 pavgb xmm0, xmm1
687 pavgb xmm4, xmm2
688 %endif
689 movq [rdi], xmm0
690 movq [rdi + rdx], xmm4
691
692 lea rsi, [rsi + rax]
693 prefetcht0 [rsi + 4 * rax - 3]
694 lea rsi, [rsi + rax]
695 lea rdi, [rdi + 2 * rdx]
696 prefetcht0 [rsi + 2 * rax - 3]
697 dec rcx
698 jnz .loop
699
700 ;Do last row if output_height is odd
701 movsxd rcx, dword ptr arg(4) ;output_height
702 and rcx, 1
703 je .done
704
705 movq xmm0, [rsi - 3]
706 movq xmm3, [rsi + 5]
641 punpcklqdq xmm0, xmm3 707 punpcklqdq xmm0, xmm3
642 708
643 movdqa xmm1, xmm0 709 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
644 pshufb xmm0, [GLOBAL(shuf_t0t1)]
645 pmaddubsw xmm0, k0k1
646
647 movdqa xmm2, xmm1
648 pshufb xmm1, [GLOBAL(shuf_t2t3)]
649 pmaddubsw xmm1, k2k3
650
651 movdqa xmm4, xmm2
652 pshufb xmm2, [GLOBAL(shuf_t4t5)]
653 pmaddubsw xmm2, k4k5
654
655 pshufb xmm4, [GLOBAL(shuf_t6t7)]
656 pmaddubsw xmm4, k6k7
657
658 paddsw xmm0, xmm1
659 paddsw xmm0, xmm4
660 paddsw xmm0, xmm2
661 paddsw xmm0, krd
662 psraw xmm0, 7
663 packuswb xmm0, xmm0
664 %if %1 710 %if %1
665 movq xmm1, [rdi] 711 movq xmm1, [rdi]
666 pavgb xmm0, xmm1 712 pavgb xmm0, xmm1
667 %endif 713 %endif
668
669 lea rsi, [rsi + rax]
670 movq [rdi], xmm0 714 movq [rdi], xmm0
671 715 .done
672 lea rdi, [rdi + rdx]
673 dec rcx
674 jnz .loop
675 %endm 716 %endm
676 717
677 %macro HORIZx16 1 718 %macro HORIZx16 1
678 mov rdx, arg(5) ;filter ptr 719 mov rdx, arg(5) ;filter ptr
679 mov rsi, arg(0) ;src_ptr 720 mov rsi, arg(0) ;src_ptr
680 mov rdi, arg(2) ;output_ptr 721 mov rdi, arg(2) ;output_ptr
681 mov rcx, 0x0400040 722 mov rcx, 0x0400040
682 723
683 movdqa xmm4, [rdx] ;load filters 724 movdqa xmm4, [rdx] ;load filters
684 movq xmm5, rcx 725 movq xmm5, rcx
(...skipping 13 matching lines...) Expand all
698 pshufd xmm5, xmm5, 0 739 pshufd xmm5, xmm5, 0
699 movdqa k4k5, xmm2 740 movdqa k4k5, xmm2
700 movdqa k6k7, xmm3 741 movdqa k6k7, xmm3
701 movdqa krd, xmm5 742 movdqa krd, xmm5
702 743
703 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 744 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
704 movsxd rdx, dword ptr arg(3) ;output_pitch 745 movsxd rdx, dword ptr arg(3) ;output_pitch
705 movsxd rcx, dword ptr arg(4) ;output_height 746 movsxd rcx, dword ptr arg(4) ;output_height
706 747
707 .loop: 748 .loop:
708 movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 749 prefetcht0 [rsi + 2 * rax -3]
709 750
710 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 751 movq xmm0, [rsi - 3] ;load src data
711 punpcklqdq xmm0, xmm3 752 movq xmm4, [rsi + 5]
753 movq xmm7, [rsi + 13]
754 punpcklqdq xmm0, xmm4
755 punpcklqdq xmm4, xmm7
712 756
713 movdqa xmm1, xmm0 757 movdqa xmm1, xmm0
758 movdqa xmm2, xmm0
759 movdqa xmm3, xmm0
760 movdqa xmm5, xmm4
761 movdqa xmm6, xmm4
762 movdqa xmm7, xmm4
763
714 pshufb xmm0, [GLOBAL(shuf_t0t1)] 764 pshufb xmm0, [GLOBAL(shuf_t0t1)]
765 pshufb xmm1, [GLOBAL(shuf_t2t3)]
766 pshufb xmm2, [GLOBAL(shuf_t4t5)]
767 pshufb xmm3, [GLOBAL(shuf_t6t7)]
768 pshufb xmm4, [GLOBAL(shuf_t0t1)]
769 pshufb xmm5, [GLOBAL(shuf_t2t3)]
770 pshufb xmm6, [GLOBAL(shuf_t4t5)]
771 pshufb xmm7, [GLOBAL(shuf_t6t7)]
772
715 pmaddubsw xmm0, k0k1 773 pmaddubsw xmm0, k0k1
716
717 movdqa xmm2, xmm1
718 pshufb xmm1, [GLOBAL(shuf_t2t3)]
719 pmaddubsw xmm1, k2k3 774 pmaddubsw xmm1, k2k3
720
721 movdqa xmm4, xmm2
722 pshufb xmm2, [GLOBAL(shuf_t4t5)]
723 pmaddubsw xmm2, k4k5 775 pmaddubsw xmm2, k4k5
724 776 pmaddubsw xmm3, k6k7
725 pshufb xmm4, [GLOBAL(shuf_t6t7)] 777 pmaddubsw xmm4, k0k1
726 pmaddubsw xmm4, k6k7 778 pmaddubsw xmm5, k2k3
779 pmaddubsw xmm6, k4k5
780 pmaddubsw xmm7, k6k7
727 781
728 paddsw xmm0, xmm1 782 paddsw xmm0, xmm1
729 paddsw xmm0, xmm4 783 paddsw xmm0, xmm3
730 paddsw xmm0, xmm2 784 paddsw xmm0, xmm2
785 paddsw xmm4, xmm5
786 paddsw xmm4, xmm7
787 paddsw xmm4, xmm6
788
731 paddsw xmm0, krd 789 paddsw xmm0, krd
790 paddsw xmm4, krd
732 psraw xmm0, 7 791 psraw xmm0, 7
792 psraw xmm4, 7
733 packuswb xmm0, xmm0 793 packuswb xmm0, xmm0
734 794 packuswb xmm4, xmm4
735 795 punpcklqdq xmm0, xmm4
736 movq xmm3, [rsi + 5]
737 movq xmm7, [rsi + 13]
738 punpcklqdq xmm3, xmm7
739
740 movdqa xmm1, xmm3
741 pshufb xmm3, [GLOBAL(shuf_t0t1)]
742 pmaddubsw xmm3, k0k1
743
744 movdqa xmm2, xmm1
745 pshufb xmm1, [GLOBAL(shuf_t2t3)]
746 pmaddubsw xmm1, k2k3
747
748 movdqa xmm4, xmm2
749 pshufb xmm2, [GLOBAL(shuf_t4t5)]
750 pmaddubsw xmm2, k4k5
751
752 pshufb xmm4, [GLOBAL(shuf_t6t7)]
753 pmaddubsw xmm4, k6k7
754
755 paddsw xmm3, xmm1
756 paddsw xmm3, xmm4
757 paddsw xmm3, xmm2
758 paddsw xmm3, krd
759 psraw xmm3, 7
760 packuswb xmm3, xmm3
761 punpcklqdq xmm0, xmm3
762 %if %1 796 %if %1
763 movdqa xmm1, [rdi] 797 movdqa xmm1, [rdi]
764 pavgb xmm0, xmm1 798 pavgb xmm0, xmm1
765 %endif 799 %endif
766 800
767 lea rsi, [rsi + rax] 801 lea rsi, [rsi + rax]
768 movdqa [rdi], xmm0 802 movdqa [rdi], xmm0
769 803
770 lea rdi, [rdi + rdx] 804 lea rdi, [rdi + rdx]
771 dec rcx 805 dec rcx
(...skipping 13 matching lines...) Expand all
785 sym(vp9_filter_block1d4_h8_ssse3): 819 sym(vp9_filter_block1d4_h8_ssse3):
786 push rbp 820 push rbp
787 mov rbp, rsp 821 mov rbp, rsp
788 SHADOW_ARGS_TO_STACK 6 822 SHADOW_ARGS_TO_STACK 6
789 SAVE_XMM 7 823 SAVE_XMM 7
790 GET_GOT rbx 824 GET_GOT rbx
791 push rsi 825 push rsi
792 push rdi 826 push rdi
793 ; end prolog 827 ; end prolog
794 828
795 ALIGN_STACK 16, rax
796 sub rsp, 16*5
797 %define k0k1 [rsp + 16*0]
798 %define k2k3 [rsp + 16*1]
799 %define k4k5 [rsp + 16*2]
800 %define k6k7 [rsp + 16*3]
801 %define krd [rsp + 16*4]
802
803 HORIZx4 0 829 HORIZx4 0
804 830
805 add rsp, 16*5
806 pop rsp
807
808 ; begin epilog 831 ; begin epilog
809 pop rdi 832 pop rdi
810 pop rsi 833 pop rsi
811 RESTORE_GOT 834 RESTORE_GOT
812 RESTORE_XMM 835 RESTORE_XMM
813 UNSHADOW_ARGS 836 UNSHADOW_ARGS
814 pop rbp 837 pop rbp
815 ret 838 ret
816 839
817 ;void vp9_filter_block1d8_h8_ssse3 840 ;void vp9_filter_block1d8_h8_ssse3
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
902 sym(vp9_filter_block1d4_h8_avg_ssse3): 925 sym(vp9_filter_block1d4_h8_avg_ssse3):
903 push rbp 926 push rbp
904 mov rbp, rsp 927 mov rbp, rsp
905 SHADOW_ARGS_TO_STACK 6 928 SHADOW_ARGS_TO_STACK 6
906 SAVE_XMM 7 929 SAVE_XMM 7
907 GET_GOT rbx 930 GET_GOT rbx
908 push rsi 931 push rsi
909 push rdi 932 push rdi
910 ; end prolog 933 ; end prolog
911 934
912 ALIGN_STACK 16, rax
913 sub rsp, 16*5
914 %define k0k1 [rsp + 16*0]
915 %define k2k3 [rsp + 16*1]
916 %define k4k5 [rsp + 16*2]
917 %define k6k7 [rsp + 16*3]
918 %define krd [rsp + 16*4]
919
920 HORIZx4 1 935 HORIZx4 1
921 936
922 add rsp, 16*5
923 pop rsp
924
925 ; begin epilog 937 ; begin epilog
926 pop rdi 938 pop rdi
927 pop rsi 939 pop rsi
928 RESTORE_GOT 940 RESTORE_GOT
929 RESTORE_XMM 941 RESTORE_XMM
930 UNSHADOW_ARGS 942 UNSHADOW_ARGS
931 pop rbp 943 pop rbp
932 ret 944 ret
933 945
934 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE 946 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1002 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 1014 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
1003 align 16 1015 align 16
1004 shuf_t2t3: 1016 shuf_t2t3:
1005 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 1017 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
1006 align 16 1018 align 16
1007 shuf_t4t5: 1019 shuf_t4t5:
1008 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 1020 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
1009 align 16 1021 align 16
1010 shuf_t6t7: 1022 shuf_t6t7:
1011 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 1023 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm ('k') | source/libvpx/vp9/encoder/vp9_bitstream.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698