OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 | 11 |
12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
13 | 13 |
14 ;/************************************************************************************ | |
15 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The | |
16 ; input pixel array has output_height rows. This routine assumes that output_height is an | |
17 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE | |
18 ; rows each iteration to take advantage of the 128 bits operations. | |
19 ; | |
20 ; This is an implementation of some of the SSE optimizations first seen in ffvp8 | |
21 ; | |
22 ;*************************************************************************************/ | |
23 | |
24 | |
25 %macro VERTx4 1 | 14 %macro VERTx4 1 |
26 mov rdx, arg(5) ;filter ptr | 15 mov rdx, arg(5) ;filter ptr |
27 mov rsi, arg(0) ;src_ptr | 16 mov rsi, arg(0) ;src_ptr |
28 mov rdi, arg(2) ;output_ptr | 17 mov rdi, arg(2) ;output_ptr |
29 mov rcx, 0x0400040 | 18 mov rcx, 0x0400040 |
30 | 19 |
31 movdqa xmm4, [rdx] ;load filters | 20 movdqa xmm4, [rdx] ;load filters |
32 movd xmm5, rcx | 21 movd xmm5, rcx |
33 packsswb xmm4, xmm4 | 22 packsswb xmm4, xmm4 |
34 pshuflw xmm0, xmm4, 0b ;k0_k1 | 23 pshuflw xmm0, xmm4, 0b ;k0_k1 |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
74 | 63 |
75 movd xmm6, [rsi + rbx] ;G | 64 movd xmm6, [rsi + rbx] ;G |
76 movd xmm7, [rax + rbx] ;H | 65 movd xmm7, [rax + rbx] ;H |
77 | 66 |
78 pmaddubsw xmm0, k0k1 | 67 pmaddubsw xmm0, k0k1 |
79 pmaddubsw xmm2, k2k3 | 68 pmaddubsw xmm2, k2k3 |
80 punpcklbw xmm6, xmm7 ;G H | 69 punpcklbw xmm6, xmm7 ;G H |
81 pmaddubsw xmm4, k4k5 | 70 pmaddubsw xmm4, k4k5 |
82 pmaddubsw xmm6, k6k7 | 71 pmaddubsw xmm6, k6k7 |
83 | 72 |
| 73 movdqa xmm1, xmm2 |
84 paddsw xmm0, xmm6 | 74 paddsw xmm0, xmm6 |
| 75 pmaxsw xmm2, xmm4 |
| 76 pminsw xmm4, xmm1 |
| 77 paddsw xmm0, xmm4 |
85 paddsw xmm0, xmm2 | 78 paddsw xmm0, xmm2 |
86 paddsw xmm0, xmm4 | 79 |
87 paddsw xmm0, krd | 80 paddsw xmm0, krd |
88 | |
89 psraw xmm0, 7 | 81 psraw xmm0, 7 |
90 packuswb xmm0, xmm0 | 82 packuswb xmm0, xmm0 |
91 | 83 |
92 add rsi, rdx | 84 add rsi, rdx |
93 add rax, rdx | 85 add rax, rdx |
94 %if %1 | 86 %if %1 |
95 movd xmm1, [rdi] | 87 movd xmm1, [rdi] |
96 pavgb xmm0, xmm1 | 88 pavgb xmm0, xmm1 |
97 %endif | 89 %endif |
98 movd [rdi], xmm0 | 90 movd [rdi], xmm0 |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
159 movq xmm6, [rsi + rbx] ;G | 151 movq xmm6, [rsi + rbx] ;G |
160 movq xmm7, [rax + rbx] ;H | 152 movq xmm7, [rax + rbx] ;H |
161 | 153 |
162 pmaddubsw xmm0, k0k1 | 154 pmaddubsw xmm0, k0k1 |
163 pmaddubsw xmm2, k2k3 | 155 pmaddubsw xmm2, k2k3 |
164 punpcklbw xmm6, xmm7 ;G H | 156 punpcklbw xmm6, xmm7 ;G H |
165 pmaddubsw xmm4, k4k5 | 157 pmaddubsw xmm4, k4k5 |
166 pmaddubsw xmm6, k6k7 | 158 pmaddubsw xmm6, k6k7 |
167 | 159 |
168 paddsw xmm0, xmm6 | 160 paddsw xmm0, xmm6 |
| 161 movdqa xmm1, xmm2 |
| 162 pmaxsw xmm2, xmm4 |
| 163 pminsw xmm4, xmm1 |
| 164 paddsw xmm0, xmm4 |
169 paddsw xmm0, xmm2 | 165 paddsw xmm0, xmm2 |
170 paddsw xmm0, xmm4 | 166 |
171 paddsw xmm0, krd | 167 paddsw xmm0, krd |
172 | |
173 psraw xmm0, 7 | 168 psraw xmm0, 7 |
174 packuswb xmm0, xmm0 | 169 packuswb xmm0, xmm0 |
175 | 170 |
176 add rsi, rdx | 171 add rsi, rdx |
177 add rax, rdx | 172 add rax, rdx |
178 %if %1 | 173 %if %1 |
179 movq xmm1, [rdi] | 174 movq xmm1, [rdi] |
180 pavgb xmm0, xmm1 | 175 pavgb xmm0, xmm1 |
181 %endif | 176 %endif |
182 movq [rdi], xmm0 | 177 movq [rdi], xmm0 |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
244 movq xmm6, [rsi + rbx] ;G | 239 movq xmm6, [rsi + rbx] ;G |
245 movq xmm7, [rax + rbx] ;H | 240 movq xmm7, [rax + rbx] ;H |
246 | 241 |
247 pmaddubsw xmm0, k0k1 | 242 pmaddubsw xmm0, k0k1 |
248 pmaddubsw xmm2, k2k3 | 243 pmaddubsw xmm2, k2k3 |
249 punpcklbw xmm6, xmm7 ;G H | 244 punpcklbw xmm6, xmm7 ;G H |
250 pmaddubsw xmm4, k4k5 | 245 pmaddubsw xmm4, k4k5 |
251 pmaddubsw xmm6, k6k7 | 246 pmaddubsw xmm6, k6k7 |
252 | 247 |
253 paddsw xmm0, xmm6 | 248 paddsw xmm0, xmm6 |
| 249 movdqa xmm1, xmm2 |
| 250 pmaxsw xmm2, xmm4 |
| 251 pminsw xmm4, xmm1 |
| 252 paddsw xmm0, xmm4 |
254 paddsw xmm0, xmm2 | 253 paddsw xmm0, xmm2 |
255 paddsw xmm0, xmm4 | 254 |
256 paddsw xmm0, krd | 255 paddsw xmm0, krd |
257 | |
258 psraw xmm0, 7 | 256 psraw xmm0, 7 |
259 packuswb xmm0, xmm0 | 257 packuswb xmm0, xmm0 |
260 %if %1 | 258 %if %1 |
261 movq xmm1, [rdi] | 259 movq xmm1, [rdi] |
262 pavgb xmm0, xmm1 | 260 pavgb xmm0, xmm1 |
263 %endif | 261 %endif |
264 movq [rdi], xmm0 | 262 movq [rdi], xmm0 |
265 | 263 |
266 movq xmm0, [rsi + 8] ;A | 264 movq xmm0, [rsi + 8] ;A |
267 movq xmm1, [rsi + rdx + 8] ;B | 265 movq xmm1, [rsi + rdx + 8] ;B |
(...skipping 263 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
531 RESTORE_XMM | 529 RESTORE_XMM |
532 UNSHADOW_ARGS | 530 UNSHADOW_ARGS |
533 pop rbp | 531 pop rbp |
534 ret | 532 ret |
535 | 533 |
536 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 534 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
537 %macro HORIZx4_ROW 2 | 535 %macro HORIZx4_ROW 2 |
; HORIZx4_ROW %1, %2
;   %1 : xmm register holding the source bytes on entry; on exit it holds the
;        filtered result after psraw 7 and packuswb (unsigned-saturated bytes).
;   %2 : xmm scratch register, overwritten.
; OLD column: taps come from xmm6/xmm7 and the rounding constant from xmm5.
; NEW column: taps and rounding come from the k0k1k4k5 / k2k3k6k7 / krd
;   operands, and xmm4, xmm5, xmm6 are overwritten as temporaries.
; The operand names and the 8-byte psrldq indicate that each pmaddubsw result
; carries two tap-pair partial sums, one per 64-bit half.
538 movdqa %2, %1 | 536 movdqa %2, %1 |
539 pshufb %1, [GLOBAL(shuf_t0t1)] | 537 pshufb %1, [GLOBAL(shuf_t0t1)] |
540 pshufb %2, [GLOBAL(shuf_t2t3)] | 538 pshufb %2, [GLOBAL(shuf_t2t3)] |
541 pmaddubsw %1, xmm6 | 539 pmaddubsw %1, k0k1k4k5 |
542 pmaddubsw %2, xmm7 | 540 pmaddubsw %2, k2k3k6k7 |
543 | 541 |
; OLD: add the two product registers, then fold the high half onto the low
;      half and add rounding (xmm5).
; NEW: keep full copies in xmm4 (from %1) and xmm5 (from %2), shift %1 and %2
;      right by 8 bytes so their low halves hold the former high halves, and
;      keep a second copy of the %2 products in xmm6.
544 paddsw %1, %2 | 542 movdqa xmm4, %1 |
545 movdqa %2, %1 | 543 movdqa xmm5, %2 |
| 544 psrldq %1, 8 |
546 psrldq %2, 8 | 545 psrldq %2, 8 |
547 paddsw %1, %2 | 546 movdqa xmm6, xmm5 |
548 paddsw %1, xmm5 | 547 |
; NEW: xmm4 = xmm4 + %2; xmm5 = max(xmm5, %1); %1 = min(%1, xmm6).
;      The sum is then accumulated as (min + xmm4) + max with saturating adds.
; NOTE(review): presumably the min-before-max order is there to limit
;      intermediate signed saturation -- confirm against the change description.
| 548 paddsw xmm4, %2 |
| 549 pmaxsw xmm5, %1 |
| 550 pminsw %1, xmm6 |
| 551 paddsw %1, xmm4 |
| 552 paddsw %1, xmm5 |
| 553 |
; NEW: rounding is added from krd instead of xmm5, since xmm5 is now a temporary.
| 554 paddsw %1, krd |
549 psraw %1, 7 | 555 psraw %1, 7 |
550 packuswb %1, %1 | 556 packuswb %1, %1 |
551 %endm | 557 %endm |
552 | 558 |
553 %macro HORIZx4 1 | 559 %macro HORIZx4 1 |
554 mov rdx, arg(5) ;filter ptr | 560 mov rdx, arg(5) ;filter ptr |
555 mov rsi, arg(0) ;src_ptr | 561 mov rsi, arg(0) ;src_ptr |
556 mov rdi, arg(2) ;output_ptr | 562 mov rdi, arg(2) ;output_ptr |
557 mov rcx, 0x0400040 | 563 mov rcx, 0x0400040 |
558 | 564 |
559 movdqa xmm4, [rdx] ;load filters | 565 movdqa xmm4, [rdx] ;load filters |
560 movq xmm5, rcx | 566 movq xmm5, rcx |
561 packsswb xmm4, xmm4 | 567 packsswb xmm4, xmm4 |
562 pshuflw xmm6, xmm4, 0b ;k0_k1 | 568 pshuflw xmm6, xmm4, 0b ;k0_k1 |
563 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 | 569 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 |
564 pshuflw xmm7, xmm4, 01010101b ;k2_k3 | 570 pshuflw xmm7, xmm4, 01010101b ;k2_k3 |
565 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 | 571 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 |
566 pshufd xmm5, xmm5, 0 ;rounding | 572 pshufd xmm5, xmm5, 0 ;rounding |
567 | 573 |
| 574 movdqa k0k1k4k5, xmm6 |
| 575 movdqa k2k3k6k7, xmm7 |
| 576 movdqa krd, xmm5 |
| 577 |
568 movsxd rax, dword ptr arg(1) ;src_pixels_per_line | 578 movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
569 movsxd rdx, dword ptr arg(3) ;output_pitch | 579 movsxd rdx, dword ptr arg(3) ;output_pitch |
570 movsxd rcx, dword ptr arg(4) ;output_height | 580 movsxd rcx, dword ptr arg(4) ;output_height |
571 shr rcx, 1 | 581 shr rcx, 1 |
572 .loop: | 582 .loop: |
573 ;Do two rows once | 583 ;Do two rows once |
574 movq xmm0, [rsi - 3] ;load src | 584 movq xmm0, [rsi - 3] ;load src |
575 movq xmm1, [rsi + 5] | 585 movq xmm1, [rsi + 5] |
576 movq xmm2, [rsi + rax - 3] | 586 movq xmm2, [rsi + rax - 3] |
577 movq xmm3, [rsi + rax + 5] | 587 movq xmm3, [rsi + rax + 5] |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
624 pshufb %1, [GLOBAL(shuf_t0t1)] | 634 pshufb %1, [GLOBAL(shuf_t0t1)] |
625 pshufb %2, [GLOBAL(shuf_t2t3)] | 635 pshufb %2, [GLOBAL(shuf_t2t3)] |
626 pshufb %3, [GLOBAL(shuf_t4t5)] | 636 pshufb %3, [GLOBAL(shuf_t4t5)] |
627 pshufb %4, [GLOBAL(shuf_t6t7)] | 637 pshufb %4, [GLOBAL(shuf_t6t7)] |
628 | 638 |
629 pmaddubsw %1, k0k1 | 639 pmaddubsw %1, k0k1 |
630 pmaddubsw %2, k2k3 | 640 pmaddubsw %2, k2k3 |
631 pmaddubsw %3, k4k5 | 641 pmaddubsw %3, k4k5 |
632 pmaddubsw %4, k6k7 | 642 pmaddubsw %4, k6k7 |
633 | 643 |
| 644 paddsw %1, %4 |
| 645 movdqa %4, %2 |
| 646 pmaxsw %2, %3 |
| 647 pminsw %3, %4 |
| 648 paddsw %1, %3 |
634 paddsw %1, %2 | 649 paddsw %1, %2 |
635 paddsw %1, %4 | 650 |
636 paddsw %1, %3 | |
637 paddsw %1, krd | 651 paddsw %1, krd |
638 psraw %1, 7 | 652 psraw %1, 7 |
639 packuswb %1, %1 | 653 packuswb %1, %1 |
640 %endm | 654 %endm |
641 | 655 |
642 %macro HORIZx8 1 | 656 %macro HORIZx8 1 |
643 mov rdx, arg(5) ;filter ptr | 657 mov rdx, arg(5) ;filter ptr |
644 mov rsi, arg(0) ;src_ptr | 658 mov rsi, arg(0) ;src_ptr |
645 mov rdi, arg(2) ;output_ptr | 659 mov rdi, arg(2) ;output_ptr |
646 mov rcx, 0x0400040 | 660 mov rcx, 0x0400040 |
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
772 | 786 |
773 pmaddubsw xmm0, k0k1 | 787 pmaddubsw xmm0, k0k1 |
774 pmaddubsw xmm1, k2k3 | 788 pmaddubsw xmm1, k2k3 |
775 pmaddubsw xmm2, k4k5 | 789 pmaddubsw xmm2, k4k5 |
776 pmaddubsw xmm3, k6k7 | 790 pmaddubsw xmm3, k6k7 |
777 pmaddubsw xmm4, k0k1 | 791 pmaddubsw xmm4, k0k1 |
778 pmaddubsw xmm5, k2k3 | 792 pmaddubsw xmm5, k2k3 |
779 pmaddubsw xmm6, k4k5 | 793 pmaddubsw xmm6, k4k5 |
780 pmaddubsw xmm7, k6k7 | 794 pmaddubsw xmm7, k6k7 |
781 | 795 |
| 796 paddsw xmm0, xmm3 |
| 797 movdqa xmm3, xmm1 |
| 798 pmaxsw xmm1, xmm2 |
| 799 pminsw xmm2, xmm3 |
| 800 paddsw xmm0, xmm2 |
782 paddsw xmm0, xmm1 | 801 paddsw xmm0, xmm1 |
783 paddsw xmm0, xmm3 | 802 |
784 paddsw xmm0, xmm2 | 803 paddsw xmm4, xmm7 |
| 804 movdqa xmm7, xmm5 |
| 805 pmaxsw xmm5, xmm6 |
| 806 pminsw xmm6, xmm7 |
| 807 paddsw xmm4, xmm6 |
785 paddsw xmm4, xmm5 | 808 paddsw xmm4, xmm5 |
786 paddsw xmm4, xmm7 | |
787 paddsw xmm4, xmm6 | |
788 | 809 |
789 paddsw xmm0, krd | 810 paddsw xmm0, krd |
790 paddsw xmm4, krd | 811 paddsw xmm4, krd |
791 psraw xmm0, 7 | 812 psraw xmm0, 7 |
792 psraw xmm4, 7 | 813 psraw xmm4, 7 |
793 packuswb xmm0, xmm0 | 814 packuswb xmm0, xmm0 |
794 packuswb xmm4, xmm4 | 815 packuswb xmm4, xmm4 |
795 punpcklqdq xmm0, xmm4 | 816 punpcklqdq xmm0, xmm4 |
796 %if %1 | 817 %if %1 |
797 movdqa xmm1, [rdi] | 818 movdqa xmm1, [rdi] |
(...skipping 21 matching lines...) Expand all Loading... |
819 sym(vp9_filter_block1d4_h8_ssse3): | 840 sym(vp9_filter_block1d4_h8_ssse3): |
; Entry point that expands HORIZx4 with argument 0 between a standard prolog
; and epilog. Six arguments are shadowed to the stack and read via arg(n)
; inside HORIZx4.
; NOTE(review): the 0 argument presumably selects the non-averaging store, by
; analogy with the `%if %1` / pavgb pattern in the other macros -- confirm in
; the HORIZx4 body.
820 push rbp | 841 push rbp |
821 mov rbp, rsp | 842 mov rbp, rsp |
822 SHADOW_ARGS_TO_STACK 6 | 843 SHADOW_ARGS_TO_STACK 6 |
823 SAVE_XMM 7 | 844 SAVE_XMM 7 |
824 GET_GOT rbx | 845 GET_GOT rbx |
825 push rsi | 846 push rsi |
826 push rdi | 847 push rdi |
827 ; end prolog | 848 ; end prolog |
828 | 849 |
; NEW column only: reserve three 16-byte stack slots and name them
; k0k1k4k5, k2k3k6k7 and krd. HORIZx4 fills them with movdqa, which requires
; 16-byte-aligned addresses, hence the ALIGN_STACK 16 before the sub.
| 850 ALIGN_STACK 16, rax |
| 851 sub rsp, 16 * 3 |
| 852 %define k0k1k4k5 [rsp + 16 * 0] |
| 853 %define k2k3k6k7 [rsp + 16 * 1] |
| 854 %define krd [rsp + 16 * 2] |
| 855 |
829 HORIZx4 0 | 856 HORIZx4 0 |
830 | 857 |
; NEW column only: release the three slots, then reload rsp from the stack.
; NOTE(review): `pop rsp` presumably restores the pre-alignment rsp saved by
; ALIGN_STACK (defined in the included ABI support file, not shown) -- confirm.
| 858 add rsp, 16 * 3 |
| 859 pop rsp |
831 ; begin epilog | 860 ; begin epilog |
832 pop rdi | 861 pop rdi |
833 pop rsi | 862 pop rsi |
834 RESTORE_GOT | 863 RESTORE_GOT |
835 RESTORE_XMM | 864 RESTORE_XMM |
836 UNSHADOW_ARGS | 865 UNSHADOW_ARGS |
837 pop rbp | 866 pop rbp |
838 ret | 867 ret |
839 | 868 |
840 ;void vp9_filter_block1d8_h8_ssse3 | 869 ;void vp9_filter_block1d8_h8_ssse3 |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
925 sym(vp9_filter_block1d4_h8_avg_ssse3): | 954 sym(vp9_filter_block1d4_h8_avg_ssse3): |
; Entry point that expands HORIZx4 with argument 1 between a standard prolog
; and epilog. Six arguments are shadowed to the stack and read via arg(n)
; inside HORIZx4.
; NOTE(review): the 1 argument presumably enables averaging with the existing
; destination bytes (the `%if %1` / pavgb pattern seen in the other macros)
; -- confirm in the HORIZx4 body.
926 push rbp | 955 push rbp |
927 mov rbp, rsp | 956 mov rbp, rsp |
928 SHADOW_ARGS_TO_STACK 6 | 957 SHADOW_ARGS_TO_STACK 6 |
929 SAVE_XMM 7 | 958 SAVE_XMM 7 |
930 GET_GOT rbx | 959 GET_GOT rbx |
931 push rsi | 960 push rsi |
932 push rdi | 961 push rdi |
933 ; end prolog | 962 ; end prolog |
934 | 963 |
; NEW column only: reserve three 16-byte stack slots and name them
; k0k1k4k5, k2k3k6k7 and krd. HORIZx4 fills them with movdqa, which requires
; 16-byte-aligned addresses, hence the ALIGN_STACK 16 before the sub.
| 964 ALIGN_STACK 16, rax |
| 965 sub rsp, 16 * 3 |
| 966 %define k0k1k4k5 [rsp + 16 * 0] |
| 967 %define k2k3k6k7 [rsp + 16 * 1] |
| 968 %define krd [rsp + 16 * 2] |
| 969 |
935 HORIZx4 1 | 970 HORIZx4 1 |
936 | 971 |
; NEW column only: release the three slots, then reload rsp from the stack.
; NOTE(review): `pop rsp` presumably restores the pre-alignment rsp saved by
; ALIGN_STACK (defined in the included ABI support file, not shown) -- confirm.
| 972 add rsp, 16 * 3 |
| 973 pop rsp |
937 ; begin epilog | 974 ; begin epilog |
938 pop rdi | 975 pop rdi |
939 pop rsi | 976 pop rsi |
940 RESTORE_GOT | 977 RESTORE_GOT |
941 RESTORE_XMM | 978 RESTORE_XMM |
942 UNSHADOW_ARGS | 979 UNSHADOW_ARGS |
943 pop rbp | 980 pop rbp |
944 ret | 981 ret |
945 | 982 |
946 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE | 983 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1014 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 1051 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
1015 align 16 | 1052 align 16 |
1016 shuf_t2t3: | 1053 shuf_t2t3: |
1017 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | 1054 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
1018 align 16 | 1055 align 16 |
1019 shuf_t4t5: | 1056 shuf_t4t5: |
1020 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 1057 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
1021 align 16 | 1058 align 16 |
1022 shuf_t6t7: | 1059 shuf_t6t7: |
1023 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 1060 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
OLD | NEW |