OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 | 11 |
12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
13 | 13 |
14 %macro VERTx4 1 | 14 %macro VERTx4 1 |
15 mov rdx, arg(5) ;filter ptr | 15 mov rdx, arg(5) ;filter ptr |
16 mov rsi, arg(0) ;src_ptr | 16 mov rsi, arg(0) ;src_ptr |
17 mov rdi, arg(2) ;output_ptr | 17 mov rdi, arg(2) ;output_ptr |
18 mov rcx, 0x0400040 | 18 mov rcx, 0x0400040 |
19 | 19 |
20 movdqa xmm4, [rdx] ;load filters | 20 movdqa xmm4, [rdx] ;load filters |
21 movd xmm5, rcx | 21 movq xmm5, rcx |
22 packsswb xmm4, xmm4 | 22 packsswb xmm4, xmm4 |
23 pshuflw xmm0, xmm4, 0b ;k0_k1 | 23 pshuflw xmm0, xmm4, 0b ;k0_k1 |
24 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 24 pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
25 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 25 pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
26 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 26 pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
27 | 27 |
28 punpcklqdq xmm0, xmm0 | 28 punpcklqdq xmm0, xmm0 |
29 punpcklqdq xmm1, xmm1 | 29 punpcklqdq xmm1, xmm1 |
30 punpcklqdq xmm2, xmm2 | 30 punpcklqdq xmm2, xmm2 |
31 punpcklqdq xmm3, xmm3 | 31 punpcklqdq xmm3, xmm3 |
(...skipping 622 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
654 packuswb %1, %1 | 654 packuswb %1, %1 |
655 %endm | 655 %endm |
656 | 656 |
657 %macro HORIZx8 1 | 657 %macro HORIZx8 1 |
658 mov rdx, arg(5) ;filter ptr | 658 mov rdx, arg(5) ;filter ptr |
659 mov rsi, arg(0) ;src_ptr | 659 mov rsi, arg(0) ;src_ptr |
660 mov rdi, arg(2) ;output_ptr | 660 mov rdi, arg(2) ;output_ptr |
661 mov rcx, 0x0400040 | 661 mov rcx, 0x0400040 |
662 | 662 |
663 movdqa xmm4, [rdx] ;load filters | 663 movdqa xmm4, [rdx] ;load filters |
664 movd xmm5, rcx | 664 movq xmm5, rcx |
665 packsswb xmm4, xmm4 | 665 packsswb xmm4, xmm4 |
666 pshuflw xmm0, xmm4, 0b ;k0_k1 | 666 pshuflw xmm0, xmm4, 0b ;k0_k1 |
667 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 667 pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
668 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 668 pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
669 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 669 pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
670 | 670 |
671 punpcklqdq xmm0, xmm0 | 671 punpcklqdq xmm0, xmm0 |
672 punpcklqdq xmm1, xmm1 | 672 punpcklqdq xmm1, xmm1 |
673 punpcklqdq xmm2, xmm2 | 673 punpcklqdq xmm2, xmm2 |
674 punpcklqdq xmm3, xmm3 | 674 punpcklqdq xmm3, xmm3 |
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
758 | 758 |
759 movsxd rax, dword ptr arg(1) ;src_pixels_per_line | 759 movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
760 movsxd rdx, dword ptr arg(3) ;output_pitch | 760 movsxd rdx, dword ptr arg(3) ;output_pitch |
761 movsxd rcx, dword ptr arg(4) ;output_height | 761 movsxd rcx, dword ptr arg(4) ;output_height |
762 | 762 |
763 .loop: | 763 .loop: |
764 prefetcht0 [rsi + 2 * rax -3] | 764 prefetcht0 [rsi + 2 * rax -3] |
765 | 765 |
766 movq xmm0, [rsi - 3] ;load src data | 766 movq xmm0, [rsi - 3] ;load src data |
767 movq xmm4, [rsi + 5] | 767 movq xmm4, [rsi + 5] |
768 movq xmm7, [rsi + 13] | 768 movq xmm6, [rsi + 13] |
769 punpcklqdq xmm0, xmm4 | 769 punpcklqdq xmm0, xmm4 |
770 punpcklqdq xmm4, xmm7 | 770 punpcklqdq xmm4, xmm6 |
771 | 771 |
| 772 movdqa xmm7, xmm0 |
| 773 |
| 774 punpcklbw xmm7, xmm7 |
| 775 punpckhbw xmm0, xmm0 |
772 movdqa xmm1, xmm0 | 776 movdqa xmm1, xmm0 |
773 movdqa xmm2, xmm0 | 777 movdqa xmm2, xmm0 |
774 movdqa xmm3, xmm0 | 778 movdqa xmm3, xmm0 |
| 779 |
| 780 palignr xmm0, xmm7, 1 |
| 781 palignr xmm1, xmm7, 5 |
| 782 pmaddubsw xmm0, k0k1 |
| 783 palignr xmm2, xmm7, 9 |
| 784 pmaddubsw xmm1, k2k3 |
| 785 palignr xmm3, xmm7, 13 |
| 786 |
| 787 pmaddubsw xmm2, k4k5 |
| 788 pmaddubsw xmm3, k6k7 |
| 789 paddsw xmm0, xmm3 |
| 790 |
| 791 movdqa xmm3, xmm4 |
| 792 punpcklbw xmm3, xmm3 |
| 793 punpckhbw xmm4, xmm4 |
| 794 |
775 movdqa xmm5, xmm4 | 795 movdqa xmm5, xmm4 |
776 movdqa xmm6, xmm4 | 796 movdqa xmm6, xmm4 |
777 movdqa xmm7, xmm4 | 797 movdqa xmm7, xmm4 |
778 | 798 |
779 pshufb xmm0, [GLOBAL(shuf_t0t1)] | 799 palignr xmm4, xmm3, 1 |
780 pshufb xmm1, [GLOBAL(shuf_t2t3)] | 800 palignr xmm5, xmm3, 5 |
781 pshufb xmm2, [GLOBAL(shuf_t4t5)] | 801 palignr xmm6, xmm3, 9 |
782 pshufb xmm3, [GLOBAL(shuf_t6t7)] | 802 palignr xmm7, xmm3, 13 |
783 pshufb xmm4, [GLOBAL(shuf_t0t1)] | |
784 pshufb xmm5, [GLOBAL(shuf_t2t3)] | |
785 pshufb xmm6, [GLOBAL(shuf_t4t5)] | |
786 pshufb xmm7, [GLOBAL(shuf_t6t7)] | |
787 | 803 |
788 pmaddubsw xmm0, k0k1 | 804 movdqa xmm3, xmm1 |
789 pmaddubsw xmm1, k2k3 | |
790 pmaddubsw xmm2, k4k5 | |
791 pmaddubsw xmm3, k6k7 | |
792 pmaddubsw xmm4, k0k1 | 805 pmaddubsw xmm4, k0k1 |
| 806 pmaxsw xmm1, xmm2 |
793 pmaddubsw xmm5, k2k3 | 807 pmaddubsw xmm5, k2k3 |
| 808 pminsw xmm2, xmm3 |
794 pmaddubsw xmm6, k4k5 | 809 pmaddubsw xmm6, k4k5 |
| 810 paddsw xmm0, xmm2 |
795 pmaddubsw xmm7, k6k7 | 811 pmaddubsw xmm7, k6k7 |
796 | |
797 paddsw xmm0, xmm3 | |
798 movdqa xmm3, xmm1 | |
799 pmaxsw xmm1, xmm2 | |
800 pminsw xmm2, xmm3 | |
801 paddsw xmm0, xmm2 | |
802 paddsw xmm0, xmm1 | 812 paddsw xmm0, xmm1 |
803 | 813 |
804 paddsw xmm4, xmm7 | 814 paddsw xmm4, xmm7 |
805 movdqa xmm7, xmm5 | 815 movdqa xmm7, xmm5 |
806 pmaxsw xmm5, xmm6 | 816 pmaxsw xmm5, xmm6 |
807 pminsw xmm6, xmm7 | 817 pminsw xmm6, xmm7 |
808 paddsw xmm4, xmm6 | 818 paddsw xmm4, xmm6 |
809 paddsw xmm4, xmm5 | 819 paddsw xmm4, xmm5 |
810 | 820 |
811 paddsw xmm0, krd | 821 paddsw xmm0, krd |
(...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1052 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 1062 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
1053 align 16 | 1063 align 16 |
1054 shuf_t2t3: | 1064 shuf_t2t3: |
1055 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | 1065 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
1056 align 16 | 1066 align 16 |
1057 shuf_t4t5: | 1067 shuf_t4t5: |
1058 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 1068 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
1059 align 16 | 1069 align 16 |
1060 shuf_t6t7: | 1070 shuf_t6t7: |
1061 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 1071 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
OLD | NEW |