| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 | 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 | 13 |
| 14 %macro VERTx4 1 | 14 %macro VERTx4 1 |
| 15 mov rdx, arg(5) ;filter ptr | 15 mov rdx, arg(5) ;filter ptr |
| 16 mov rsi, arg(0) ;src_ptr | 16 mov rsi, arg(0) ;src_ptr |
| 17 mov rdi, arg(2) ;output_ptr | 17 mov rdi, arg(2) ;output_ptr |
| 18 mov rcx, 0x0400040 | 18 mov rcx, 0x0400040 |
| 19 | 19 |
| 20 movdqa xmm4, [rdx] ;load filters | 20 movdqa xmm4, [rdx] ;load filters |
| 21 movd xmm5, rcx | 21 movq xmm5, rcx |
| 22 packsswb xmm4, xmm4 | 22 packsswb xmm4, xmm4 |
| 23 pshuflw xmm0, xmm4, 0b ;k0_k1 | 23 pshuflw xmm0, xmm4, 0b ;k0_k1 |
| 24 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 24 pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
| 25 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 25 pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
| 26 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 26 pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
| 27 | 27 |
| 28 punpcklqdq xmm0, xmm0 | 28 punpcklqdq xmm0, xmm0 |
| 29 punpcklqdq xmm1, xmm1 | 29 punpcklqdq xmm1, xmm1 |
| 30 punpcklqdq xmm2, xmm2 | 30 punpcklqdq xmm2, xmm2 |
| 31 punpcklqdq xmm3, xmm3 | 31 punpcklqdq xmm3, xmm3 |
| (...skipping 622 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 654 packuswb %1, %1 | 654 packuswb %1, %1 |
| 655 %endm | 655 %endm |
| 656 | 656 |
| 657 %macro HORIZx8 1 | 657 %macro HORIZx8 1 |
| 658 mov rdx, arg(5) ;filter ptr | 658 mov rdx, arg(5) ;filter ptr |
| 659 mov rsi, arg(0) ;src_ptr | 659 mov rsi, arg(0) ;src_ptr |
| 660 mov rdi, arg(2) ;output_ptr | 660 mov rdi, arg(2) ;output_ptr |
| 661 mov rcx, 0x0400040 | 661 mov rcx, 0x0400040 |
| 662 | 662 |
| 663 movdqa xmm4, [rdx] ;load filters | 663 movdqa xmm4, [rdx] ;load filters |
| 664 movd xmm5, rcx | 664 movq xmm5, rcx |
| 665 packsswb xmm4, xmm4 | 665 packsswb xmm4, xmm4 |
| 666 pshuflw xmm0, xmm4, 0b ;k0_k1 | 666 pshuflw xmm0, xmm4, 0b ;k0_k1 |
| 667 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 667 pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
| 668 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 668 pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
| 669 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 669 pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
| 670 | 670 |
| 671 punpcklqdq xmm0, xmm0 | 671 punpcklqdq xmm0, xmm0 |
| 672 punpcklqdq xmm1, xmm1 | 672 punpcklqdq xmm1, xmm1 |
| 673 punpcklqdq xmm2, xmm2 | 673 punpcklqdq xmm2, xmm2 |
| 674 punpcklqdq xmm3, xmm3 | 674 punpcklqdq xmm3, xmm3 |
| (...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 758 | 758 |
| 759 movsxd rax, dword ptr arg(1) ;src_pixels_per_line | 759 movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
| 760 movsxd rdx, dword ptr arg(3) ;output_pitch | 760 movsxd rdx, dword ptr arg(3) ;output_pitch |
| 761 movsxd rcx, dword ptr arg(4) ;output_height | 761 movsxd rcx, dword ptr arg(4) ;output_height |
| 762 | 762 |
| 763 .loop: | 763 .loop: |
| 764 prefetcht0 [rsi + 2 * rax -3] | 764 prefetcht0 [rsi + 2 * rax -3] |
| 765 | 765 |
| 766 movq xmm0, [rsi - 3] ;load src data | 766 movq xmm0, [rsi - 3] ;load src data |
| 767 movq xmm4, [rsi + 5] | 767 movq xmm4, [rsi + 5] |
| 768 movq xmm7, [rsi + 13] | 768 movq xmm6, [rsi + 13] |
| 769 punpcklqdq xmm0, xmm4 | 769 punpcklqdq xmm0, xmm4 |
| 770 punpcklqdq xmm4, xmm7 | 770 punpcklqdq xmm4, xmm6 |
| 771 | 771 |
| 772 movdqa xmm7, xmm0 |
| 773 |
| 774 punpcklbw xmm7, xmm7 |
| 775 punpckhbw xmm0, xmm0 |
| 772 movdqa xmm1, xmm0 | 776 movdqa xmm1, xmm0 |
| 773 movdqa xmm2, xmm0 | 777 movdqa xmm2, xmm0 |
| 774 movdqa xmm3, xmm0 | 778 movdqa xmm3, xmm0 |
| 779 |
| 780 palignr xmm0, xmm7, 1 |
| 781 palignr xmm1, xmm7, 5 |
| 782 pmaddubsw xmm0, k0k1 |
| 783 palignr xmm2, xmm7, 9 |
| 784 pmaddubsw xmm1, k2k3 |
| 785 palignr xmm3, xmm7, 13 |
| 786 |
| 787 pmaddubsw xmm2, k4k5 |
| 788 pmaddubsw xmm3, k6k7 |
| 789 paddsw xmm0, xmm3 |
| 790 |
| 791 movdqa xmm3, xmm4 |
| 792 punpcklbw xmm3, xmm3 |
| 793 punpckhbw xmm4, xmm4 |
| 794 |
| 775 movdqa xmm5, xmm4 | 795 movdqa xmm5, xmm4 |
| 776 movdqa xmm6, xmm4 | 796 movdqa xmm6, xmm4 |
| 777 movdqa xmm7, xmm4 | 797 movdqa xmm7, xmm4 |
| 778 | 798 |
| 779 pshufb xmm0, [GLOBAL(shuf_t0t1)] | 799 palignr xmm4, xmm3, 1 |
| 780 pshufb xmm1, [GLOBAL(shuf_t2t3)] | 800 palignr xmm5, xmm3, 5 |
| 781 pshufb xmm2, [GLOBAL(shuf_t4t5)] | 801 palignr xmm6, xmm3, 9 |
| 782 pshufb xmm3, [GLOBAL(shuf_t6t7)] | 802 palignr xmm7, xmm3, 13 |
| 783 pshufb xmm4, [GLOBAL(shuf_t0t1)] | |
| 784 pshufb xmm5, [GLOBAL(shuf_t2t3)] | |
| 785 pshufb xmm6, [GLOBAL(shuf_t4t5)] | |
| 786 pshufb xmm7, [GLOBAL(shuf_t6t7)] | |
| 787 | 803 |
| 788 pmaddubsw xmm0, k0k1 | 804 movdqa xmm3, xmm1 |
| 789 pmaddubsw xmm1, k2k3 | |
| 790 pmaddubsw xmm2, k4k5 | |
| 791 pmaddubsw xmm3, k6k7 | |
| 792 pmaddubsw xmm4, k0k1 | 805 pmaddubsw xmm4, k0k1 |
| 806 pmaxsw xmm1, xmm2 |
| 793 pmaddubsw xmm5, k2k3 | 807 pmaddubsw xmm5, k2k3 |
| 808 pminsw xmm2, xmm3 |
| 794 pmaddubsw xmm6, k4k5 | 809 pmaddubsw xmm6, k4k5 |
| 810 paddsw xmm0, xmm2 |
| 795 pmaddubsw xmm7, k6k7 | 811 pmaddubsw xmm7, k6k7 |
| 796 | |
| 797 paddsw xmm0, xmm3 | |
| 798 movdqa xmm3, xmm1 | |
| 799 pmaxsw xmm1, xmm2 | |
| 800 pminsw xmm2, xmm3 | |
| 801 paddsw xmm0, xmm2 | |
| 802 paddsw xmm0, xmm1 | 812 paddsw xmm0, xmm1 |
| 803 | 813 |
| 804 paddsw xmm4, xmm7 | 814 paddsw xmm4, xmm7 |
| 805 movdqa xmm7, xmm5 | 815 movdqa xmm7, xmm5 |
| 806 pmaxsw xmm5, xmm6 | 816 pmaxsw xmm5, xmm6 |
| 807 pminsw xmm6, xmm7 | 817 pminsw xmm6, xmm7 |
| 808 paddsw xmm4, xmm6 | 818 paddsw xmm4, xmm6 |
| 809 paddsw xmm4, xmm5 | 819 paddsw xmm4, xmm5 |
| 810 | 820 |
| 811 paddsw xmm0, krd | 821 paddsw xmm0, krd |
| (...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1052 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 1062 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
| 1053 align 16 | 1063 align 16 |
| 1054 shuf_t2t3: | 1064 shuf_t2t3: |
| 1055 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | 1065 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
| 1056 align 16 | 1066 align 16 |
| 1057 shuf_t4t5: | 1067 shuf_t4t5: |
| 1058 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 1068 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
| 1059 align 16 | 1069 align 16 |
| 1060 shuf_t6t7: | 1070 shuf_t6t7: |
| 1061 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 1071 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
| OLD | NEW |