Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(207)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

Issue 111463005: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 11
12 %include "vpx_ports/x86_abi_support.asm" 12 %include "vpx_ports/x86_abi_support.asm"
13 13
14 ;/************************************************************************************
15 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
16 ; input pixel array has output_height rows. This routine assumes that output_height is an
17 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
18 ; rows each iteration to take advantage of the 128 bits operations.
19 ;
20 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
21 ;
22 ;*************************************************************************************/
23
24
25 %macro VERTx4 1 14 %macro VERTx4 1
26 mov rdx, arg(5) ;filter ptr 15 mov rdx, arg(5) ;filter ptr
27 mov rsi, arg(0) ;src_ptr 16 mov rsi, arg(0) ;src_ptr
28 mov rdi, arg(2) ;output_ptr 17 mov rdi, arg(2) ;output_ptr
29 mov rcx, 0x0400040 18 mov rcx, 0x0400040
30 19
31 movdqa xmm4, [rdx] ;load filters 20 movdqa xmm4, [rdx] ;load filters
32 movd xmm5, rcx 21 movd xmm5, rcx
33 packsswb xmm4, xmm4 22 packsswb xmm4, xmm4
34 pshuflw xmm0, xmm4, 0b ;k0_k1 23 pshuflw xmm0, xmm4, 0b ;k0_k1
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
74 63
75 movd xmm6, [rsi + rbx] ;G 64 movd xmm6, [rsi + rbx] ;G
76 movd xmm7, [rax + rbx] ;H 65 movd xmm7, [rax + rbx] ;H
77 66
78 pmaddubsw xmm0, k0k1 67 pmaddubsw xmm0, k0k1
79 pmaddubsw xmm2, k2k3 68 pmaddubsw xmm2, k2k3
80 punpcklbw xmm6, xmm7 ;G H 69 punpcklbw xmm6, xmm7 ;G H
81 pmaddubsw xmm4, k4k5 70 pmaddubsw xmm4, k4k5
82 pmaddubsw xmm6, k6k7 71 pmaddubsw xmm6, k6k7
83 72
73 movdqa xmm1, xmm2
84 paddsw xmm0, xmm6 74 paddsw xmm0, xmm6
75 pmaxsw xmm2, xmm4
76 pminsw xmm4, xmm1
77 paddsw xmm0, xmm4
85 paddsw xmm0, xmm2 78 paddsw xmm0, xmm2
86 paddsw xmm0, xmm4 79
87 paddsw xmm0, krd 80 paddsw xmm0, krd
88
89 psraw xmm0, 7 81 psraw xmm0, 7
90 packuswb xmm0, xmm0 82 packuswb xmm0, xmm0
91 83
92 add rsi, rdx 84 add rsi, rdx
93 add rax, rdx 85 add rax, rdx
94 %if %1 86 %if %1
95 movd xmm1, [rdi] 87 movd xmm1, [rdi]
96 pavgb xmm0, xmm1 88 pavgb xmm0, xmm1
97 %endif 89 %endif
98 movd [rdi], xmm0 90 movd [rdi], xmm0
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
159 movq xmm6, [rsi + rbx] ;G 151 movq xmm6, [rsi + rbx] ;G
160 movq xmm7, [rax + rbx] ;H 152 movq xmm7, [rax + rbx] ;H
161 153
162 pmaddubsw xmm0, k0k1 154 pmaddubsw xmm0, k0k1
163 pmaddubsw xmm2, k2k3 155 pmaddubsw xmm2, k2k3
164 punpcklbw xmm6, xmm7 ;G H 156 punpcklbw xmm6, xmm7 ;G H
165 pmaddubsw xmm4, k4k5 157 pmaddubsw xmm4, k4k5
166 pmaddubsw xmm6, k6k7 158 pmaddubsw xmm6, k6k7
167 159
168 paddsw xmm0, xmm6 160 paddsw xmm0, xmm6
161 movdqa xmm1, xmm2
162 pmaxsw xmm2, xmm4
163 pminsw xmm4, xmm1
164 paddsw xmm0, xmm4
169 paddsw xmm0, xmm2 165 paddsw xmm0, xmm2
170 paddsw xmm0, xmm4 166
171 paddsw xmm0, krd 167 paddsw xmm0, krd
172
173 psraw xmm0, 7 168 psraw xmm0, 7
174 packuswb xmm0, xmm0 169 packuswb xmm0, xmm0
175 170
176 add rsi, rdx 171 add rsi, rdx
177 add rax, rdx 172 add rax, rdx
178 %if %1 173 %if %1
179 movq xmm1, [rdi] 174 movq xmm1, [rdi]
180 pavgb xmm0, xmm1 175 pavgb xmm0, xmm1
181 %endif 176 %endif
182 movq [rdi], xmm0 177 movq [rdi], xmm0
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
244 movq xmm6, [rsi + rbx] ;G 239 movq xmm6, [rsi + rbx] ;G
245 movq xmm7, [rax + rbx] ;H 240 movq xmm7, [rax + rbx] ;H
246 241
247 pmaddubsw xmm0, k0k1 242 pmaddubsw xmm0, k0k1
248 pmaddubsw xmm2, k2k3 243 pmaddubsw xmm2, k2k3
249 punpcklbw xmm6, xmm7 ;G H 244 punpcklbw xmm6, xmm7 ;G H
250 pmaddubsw xmm4, k4k5 245 pmaddubsw xmm4, k4k5
251 pmaddubsw xmm6, k6k7 246 pmaddubsw xmm6, k6k7
252 247
253 paddsw xmm0, xmm6 248 paddsw xmm0, xmm6
249 movdqa xmm1, xmm2
250 pmaxsw xmm2, xmm4
251 pminsw xmm4, xmm1
252 paddsw xmm0, xmm4
254 paddsw xmm0, xmm2 253 paddsw xmm0, xmm2
255 paddsw xmm0, xmm4 254
256 paddsw xmm0, krd 255 paddsw xmm0, krd
257
258 psraw xmm0, 7 256 psraw xmm0, 7
259 packuswb xmm0, xmm0 257 packuswb xmm0, xmm0
260 %if %1 258 %if %1
261 movq xmm1, [rdi] 259 movq xmm1, [rdi]
262 pavgb xmm0, xmm1 260 pavgb xmm0, xmm1
263 %endif 261 %endif
264 movq [rdi], xmm0 262 movq [rdi], xmm0
265 263
266 movq xmm0, [rsi + 8] ;A 264 movq xmm0, [rsi + 8] ;A
267 movq xmm1, [rsi + rdx + 8] ;B 265 movq xmm1, [rsi + rdx + 8] ;B
(...skipping 263 matching lines...) Expand 10 before | Expand all | Expand 10 after
531 RESTORE_XMM 529 RESTORE_XMM
532 UNSHADOW_ARGS 530 UNSHADOW_ARGS
533 pop rbp 531 pop rbp
534 ret 532 ret
535 533
536 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 534 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
537 %macro HORIZx4_ROW 2 535 %macro HORIZx4_ROW 2
538 movdqa %2, %1 536 movdqa %2, %1
539 pshufb %1, [GLOBAL(shuf_t0t1)] 537 pshufb %1, [GLOBAL(shuf_t0t1)]
540 pshufb %2, [GLOBAL(shuf_t2t3)] 538 pshufb %2, [GLOBAL(shuf_t2t3)]
541 pmaddubsw %1, xmm6 539 pmaddubsw %1, k0k1k4k5
542 pmaddubsw %2, xmm7 540 pmaddubsw %2, k2k3k6k7
543 541
544 paddsw %1, %2 542 movdqa xmm4, %1
545 movdqa %2, %1 543 movdqa xmm5, %2
544 psrldq %1, 8
546 psrldq %2, 8 545 psrldq %2, 8
547 paddsw %1, %2 546 movdqa xmm6, xmm5
548 paddsw %1, xmm5 547
548 paddsw xmm4, %2
549 pmaxsw xmm5, %1
550 pminsw %1, xmm6
551 paddsw %1, xmm4
552 paddsw %1, xmm5
553
554 paddsw %1, krd
549 psraw %1, 7 555 psraw %1, 7
550 packuswb %1, %1 556 packuswb %1, %1
551 %endm 557 %endm
552 558
553 %macro HORIZx4 1 559 %macro HORIZx4 1
554 mov rdx, arg(5) ;filter ptr 560 mov rdx, arg(5) ;filter ptr
555 mov rsi, arg(0) ;src_ptr 561 mov rsi, arg(0) ;src_ptr
556 mov rdi, arg(2) ;output_ptr 562 mov rdi, arg(2) ;output_ptr
557 mov rcx, 0x0400040 563 mov rcx, 0x0400040
558 564
559 movdqa xmm4, [rdx] ;load filters 565 movdqa xmm4, [rdx] ;load filters
560 movq xmm5, rcx 566 movq xmm5, rcx
561 packsswb xmm4, xmm4 567 packsswb xmm4, xmm4
562 pshuflw xmm6, xmm4, 0b ;k0_k1 568 pshuflw xmm6, xmm4, 0b ;k0_k1
563 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 569 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
564 pshuflw xmm7, xmm4, 01010101b ;k2_k3 570 pshuflw xmm7, xmm4, 01010101b ;k2_k3
565 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 571 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
566 pshufd xmm5, xmm5, 0 ;rounding 572 pshufd xmm5, xmm5, 0 ;rounding
567 573
574 movdqa k0k1k4k5, xmm6
575 movdqa k2k3k6k7, xmm7
576 movdqa krd, xmm5
577
568 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 578 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
569 movsxd rdx, dword ptr arg(3) ;output_pitch 579 movsxd rdx, dword ptr arg(3) ;output_pitch
570 movsxd rcx, dword ptr arg(4) ;output_height 580 movsxd rcx, dword ptr arg(4) ;output_height
571 shr rcx, 1 581 shr rcx, 1
572 .loop: 582 .loop:
573 ;Do two rows once 583 ;Do two rows once
574 movq xmm0, [rsi - 3] ;load src 584 movq xmm0, [rsi - 3] ;load src
575 movq xmm1, [rsi + 5] 585 movq xmm1, [rsi + 5]
576 movq xmm2, [rsi + rax - 3] 586 movq xmm2, [rsi + rax - 3]
577 movq xmm3, [rsi + rax + 5] 587 movq xmm3, [rsi + rax + 5]
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
624 pshufb %1, [GLOBAL(shuf_t0t1)] 634 pshufb %1, [GLOBAL(shuf_t0t1)]
625 pshufb %2, [GLOBAL(shuf_t2t3)] 635 pshufb %2, [GLOBAL(shuf_t2t3)]
626 pshufb %3, [GLOBAL(shuf_t4t5)] 636 pshufb %3, [GLOBAL(shuf_t4t5)]
627 pshufb %4, [GLOBAL(shuf_t6t7)] 637 pshufb %4, [GLOBAL(shuf_t6t7)]
628 638
629 pmaddubsw %1, k0k1 639 pmaddubsw %1, k0k1
630 pmaddubsw %2, k2k3 640 pmaddubsw %2, k2k3
631 pmaddubsw %3, k4k5 641 pmaddubsw %3, k4k5
632 pmaddubsw %4, k6k7 642 pmaddubsw %4, k6k7
633 643
644 paddsw %1, %4
645 movdqa %4, %2
646 pmaxsw %2, %3
647 pminsw %3, %4
648 paddsw %1, %3
634 paddsw %1, %2 649 paddsw %1, %2
635 paddsw %1, %4 650
636 paddsw %1, %3
637 paddsw %1, krd 651 paddsw %1, krd
638 psraw %1, 7 652 psraw %1, 7
639 packuswb %1, %1 653 packuswb %1, %1
640 %endm 654 %endm
641 655
642 %macro HORIZx8 1 656 %macro HORIZx8 1
643 mov rdx, arg(5) ;filter ptr 657 mov rdx, arg(5) ;filter ptr
644 mov rsi, arg(0) ;src_ptr 658 mov rsi, arg(0) ;src_ptr
645 mov rdi, arg(2) ;output_ptr 659 mov rdi, arg(2) ;output_ptr
646 mov rcx, 0x0400040 660 mov rcx, 0x0400040
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after
772 786
773 pmaddubsw xmm0, k0k1 787 pmaddubsw xmm0, k0k1
774 pmaddubsw xmm1, k2k3 788 pmaddubsw xmm1, k2k3
775 pmaddubsw xmm2, k4k5 789 pmaddubsw xmm2, k4k5
776 pmaddubsw xmm3, k6k7 790 pmaddubsw xmm3, k6k7
777 pmaddubsw xmm4, k0k1 791 pmaddubsw xmm4, k0k1
778 pmaddubsw xmm5, k2k3 792 pmaddubsw xmm5, k2k3
779 pmaddubsw xmm6, k4k5 793 pmaddubsw xmm6, k4k5
780 pmaddubsw xmm7, k6k7 794 pmaddubsw xmm7, k6k7
781 795
796 paddsw xmm0, xmm3
797 movdqa xmm3, xmm1
798 pmaxsw xmm1, xmm2
799 pminsw xmm2, xmm3
800 paddsw xmm0, xmm2
782 paddsw xmm0, xmm1 801 paddsw xmm0, xmm1
783 paddsw xmm0, xmm3 802
784 paddsw xmm0, xmm2 803 paddsw xmm4, xmm7
804 movdqa xmm7, xmm5
805 pmaxsw xmm5, xmm6
806 pminsw xmm6, xmm7
807 paddsw xmm4, xmm6
785 paddsw xmm4, xmm5 808 paddsw xmm4, xmm5
786 paddsw xmm4, xmm7
787 paddsw xmm4, xmm6
788 809
789 paddsw xmm0, krd 810 paddsw xmm0, krd
790 paddsw xmm4, krd 811 paddsw xmm4, krd
791 psraw xmm0, 7 812 psraw xmm0, 7
792 psraw xmm4, 7 813 psraw xmm4, 7
793 packuswb xmm0, xmm0 814 packuswb xmm0, xmm0
794 packuswb xmm4, xmm4 815 packuswb xmm4, xmm4
795 punpcklqdq xmm0, xmm4 816 punpcklqdq xmm0, xmm4
796 %if %1 817 %if %1
797 movdqa xmm1, [rdi] 818 movdqa xmm1, [rdi]
(...skipping 21 matching lines...) Expand all
819 sym(vp9_filter_block1d4_h8_ssse3): 840 sym(vp9_filter_block1d4_h8_ssse3):
820 push rbp 841 push rbp
821 mov rbp, rsp 842 mov rbp, rsp
822 SHADOW_ARGS_TO_STACK 6 843 SHADOW_ARGS_TO_STACK 6
823 SAVE_XMM 7 844 SAVE_XMM 7
824 GET_GOT rbx 845 GET_GOT rbx
825 push rsi 846 push rsi
826 push rdi 847 push rdi
827 ; end prolog 848 ; end prolog
828 849
850 ALIGN_STACK 16, rax
851 sub rsp, 16 * 3
852 %define k0k1k4k5 [rsp + 16 * 0]
853 %define k2k3k6k7 [rsp + 16 * 1]
854 %define krd [rsp + 16 * 2]
855
829 HORIZx4 0 856 HORIZx4 0
830 857
858 add rsp, 16 * 3
859 pop rsp
831 ; begin epilog 860 ; begin epilog
832 pop rdi 861 pop rdi
833 pop rsi 862 pop rsi
834 RESTORE_GOT 863 RESTORE_GOT
835 RESTORE_XMM 864 RESTORE_XMM
836 UNSHADOW_ARGS 865 UNSHADOW_ARGS
837 pop rbp 866 pop rbp
838 ret 867 ret
839 868
840 ;void vp9_filter_block1d8_h8_ssse3 869 ;void vp9_filter_block1d8_h8_ssse3
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
925 sym(vp9_filter_block1d4_h8_avg_ssse3): 954 sym(vp9_filter_block1d4_h8_avg_ssse3):
926 push rbp 955 push rbp
927 mov rbp, rsp 956 mov rbp, rsp
928 SHADOW_ARGS_TO_STACK 6 957 SHADOW_ARGS_TO_STACK 6
929 SAVE_XMM 7 958 SAVE_XMM 7
930 GET_GOT rbx 959 GET_GOT rbx
931 push rsi 960 push rsi
932 push rdi 961 push rdi
933 ; end prolog 962 ; end prolog
934 963
964 ALIGN_STACK 16, rax
965 sub rsp, 16 * 3
966 %define k0k1k4k5 [rsp + 16 * 0]
967 %define k2k3k6k7 [rsp + 16 * 1]
968 %define krd [rsp + 16 * 2]
969
935 HORIZx4 1 970 HORIZx4 1
936 971
972 add rsp, 16 * 3
973 pop rsp
937 ; begin epilog 974 ; begin epilog
938 pop rdi 975 pop rdi
939 pop rsi 976 pop rsi
940 RESTORE_GOT 977 RESTORE_GOT
941 RESTORE_XMM 978 RESTORE_XMM
942 UNSHADOW_ARGS 979 UNSHADOW_ARGS
943 pop rbp 980 pop rbp
944 ret 981 ret
945 982
946 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE 983 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1014 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 1051 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
1015 align 16 1052 align 16
1016 shuf_t2t3: 1053 shuf_t2t3:
1017 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 1054 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
1018 align 16 1055 align 16
1019 shuf_t4t5: 1056 shuf_t4t5:
1020 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 1057 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
1021 align 16 1058 align 16
1022 shuf_t6t7: 1059 shuf_t6t7:
1023 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 1060 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c ('k') | source/libvpx/vp9/decoder/vp9_dboolhuff.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698