Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm

Issue 111463005: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c ('k') | source/libvpx/vp9/vp9_common.mk » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
111 mov r1, ssem ; r1 = unsigned int *sse 111 mov r1, ssem ; r1 = unsigned int *sse
112 punpcklwd m6, m5 ; sign-extend m6 word->dword 112 punpcklwd m6, m5 ; sign-extend m6 word->dword
113 movd [r1], m7 ; store sse 113 movd [r1], m7 ; store sse
114 pshufw m4, m6, 0xe 114 pshufw m4, m6, 0xe
115 paddd m6, m4 115 paddd m6, m4
116 movd rax, m6 ; store sum as return value 116 movd rax, m6 ; store sum as return value
117 %endif 117 %endif
118 RET 118 RET
119 %endmacro 119 %endmacro
120 120
121 %macro INC_SRC_BY_SRC_STRIDE 0
122 %if ARCH_X86=1 && CONFIG_PIC=1
123 add srcq, src_stridemp
124 %else
125 add srcq, src_strideq
126 %endif
127 %endmacro
128
121 %macro SUBPEL_VARIANCE 1-2 0 ; W 129 %macro SUBPEL_VARIANCE 1-2 0 ; W
122 %if cpuflag(ssse3) 130 %if cpuflag(ssse3)
123 %define bilin_filter_m bilin_filter_m_ssse3 131 %define bilin_filter_m bilin_filter_m_ssse3
124 %define filter_idx_shift 4 132 %define filter_idx_shift 4
125 %else 133 %else
126 %define bilin_filter_m bilin_filter_m_sse2 134 %define bilin_filter_m bilin_filter_m_sse2
127 %define filter_idx_shift 5 135 %define filter_idx_shift 5
128 %endif 136 %endif
129 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses 137 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
130 ; 11, not 13, if the registers are ordered correctly. May make a minor speed 138 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
131 ; difference on Win64 139 ; difference on Win64
132 %ifdef PIC 140
133 %if %2 == 1 ; avg 141 %ifdef PIC ; 64bit PIC
134 cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ 142 %if %2 == 1 ; avg
135 x_offset, y_offset, \ 143 cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
136 dst, dst_stride, \ 144 x_offset, y_offset, \
137 sec, sec_stride, height, sse 145 dst, dst_stride, \
138 %define sec_str sec_strideq 146 sec, sec_stride, height, sse
147 %define sec_str sec_strideq
148 %else
149 cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
150 y_offset, dst, dst_stride, height, sse
151 %endif
152 %define h heightd
153 %define bilin_filter sseq
139 %else 154 %else
140 cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ 155 %if ARCH_X86=1 && CONFIG_PIC=1
141 dst, dst_stride, height, sse 156 %if %2 == 1 ; avg
157 cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
158 x_offset, y_offset, \
159 dst, dst_stride, \
160 sec, sec_stride, \
161 height, sse, g_bilin_filter, g_pw_8
162 %define h dword heightm
163 %define sec_str sec_stridemp
164
 165 ; Store the bilin_filter and pw_8 locations on the stack
166 GET_GOT eax
167 add esp, 4 ; restore esp
168
169 lea ecx, [GLOBAL(bilin_filter_m)]
170 mov g_bilin_filterm, ecx
171
172 lea ecx, [GLOBAL(pw_8)]
173 mov g_pw_8m, ecx
174
175 LOAD_IF_USED 0, 1 ; load eax, ecx back
176 %else
177 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
178 y_offset, dst, dst_stride, height, sse, \
179 g_bilin_filter, g_pw_8
180 %define h heightd
181
 182 ; Store the bilin_filter and pw_8 locations on the stack
183 GET_GOT eax
184 add esp, 4 ; restore esp
185
186 lea ecx, [GLOBAL(bilin_filter_m)]
187 mov g_bilin_filterm, ecx
188
189 lea ecx, [GLOBAL(pw_8)]
190 mov g_pw_8m, ecx
191
192 LOAD_IF_USED 0, 1 ; load eax, ecx back
193 %endif
194 %else
195 %if %2 == 1 ; avg
196 cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
197 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
198 x_offset, y_offset, \
199 dst, dst_stride, \
200 sec, sec_stride, \
201 height, sse
202 %if ARCH_X86_64
203 %define h heightd
204 %define sec_str sec_strideq
205 %else
206 %define h dword heightm
207 %define sec_str sec_stridemp
208 %endif
209 %else
210 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
211 y_offset, dst, dst_stride, height, sse
212 %define h heightd
213 %endif
214
215 %define bilin_filter bilin_filter_m
216 %endif
142 %endif 217 %endif
143 %define h heightd 218
144 %define bilin_filter sseq
145 %else
146 %if %2 == 1 ; avg
147 cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
148 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
149 x_offset, y_offset, \
150 dst, dst_stride, \
151 sec, sec_stride, \
152 height, sse
153 %if ARCH_X86_64
154 %define h heightd
155 %define sec_str sec_strideq
156 %else
157 %define h dword heightm
158 %define sec_str sec_stridemp
159 %endif
160 %else
161 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
162 dst, dst_stride, height, sse
163 %define h heightd
164 %endif
165 %define bilin_filter bilin_filter_m
166 %endif
167 ASSERT %1 <= 16 ; m6 overflows if w > 16 219 ASSERT %1 <= 16 ; m6 overflows if w > 16
168 pxor m6, m6 ; sum 220 pxor m6, m6 ; sum
169 pxor m7, m7 ; sse 221 pxor m7, m7 ; sse
170 ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we 222 ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
171 ; could perhaps use it for something more productive then 223 ; could perhaps use it for something more productive then
172 pxor m5, m5 ; dedicated zero register 224 pxor m5, m5 ; dedicated zero register
173 %if %1 < 16 225 %if %1 < 16
174 sar h, 1 226 sar h, 1
175 %if %2 == 1 ; avg 227 %if %2 == 1 ; avg
176 shl sec_str, 1 228 shl sec_str, 1
(...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after
322 %if ARCH_X86_64 && mmsize == 16 374 %if ARCH_X86_64 && mmsize == 16
323 mova m8, [bilin_filter+y_offsetq] 375 mova m8, [bilin_filter+y_offsetq]
324 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 376 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
325 mova m9, [bilin_filter+y_offsetq+16] 377 mova m9, [bilin_filter+y_offsetq+16]
326 %endif 378 %endif
327 mova m10, [pw_8] 379 mova m10, [pw_8]
328 %define filter_y_a m8 380 %define filter_y_a m8
329 %define filter_y_b m9 381 %define filter_y_b m9
330 %define filter_rnd m10 382 %define filter_rnd m10
331 %else ; x86-32 or mmx 383 %else ; x86-32 or mmx
384 %if ARCH_X86=1 && CONFIG_PIC=1
385 ; x_offset == 0, reuse x_offset reg
386 %define tempq x_offsetq
387 add y_offsetq, g_bilin_filterm
388 %define filter_y_a [y_offsetq]
389 %define filter_y_b [y_offsetq+16]
390 mov tempq, g_pw_8m
391 %define filter_rnd [tempq]
392 %else
332 add y_offsetq, bilin_filter 393 add y_offsetq, bilin_filter
333 %define filter_y_a [y_offsetq] 394 %define filter_y_a [y_offsetq]
334 %define filter_y_b [y_offsetq+16] 395 %define filter_y_b [y_offsetq+16]
335 %define filter_rnd [pw_8] 396 %define filter_rnd [pw_8]
336 %endif 397 %endif
398 %endif
399
337 .x_zero_y_other_loop: 400 .x_zero_y_other_loop:
338 %if %1 == 16 401 %if %1 == 16
339 movu m0, [srcq] 402 movu m0, [srcq]
340 movu m4, [srcq+src_strideq] 403 movu m4, [srcq+src_strideq]
341 mova m1, [dstq] 404 mova m1, [dstq]
342 %if cpuflag(ssse3) 405 %if cpuflag(ssse3)
343 punpckhbw m2, m0, m4 406 punpckhbw m2, m0, m4
344 punpcklbw m0, m4 407 punpcklbw m0, m4
345 pmaddubsw m2, filter_y_a 408 pmaddubsw m2, filter_y_a
346 pmaddubsw m0, filter_y_a 409 pmaddubsw m0, filter_y_a
(...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after
608 shl y_offsetd, filter_idx_shift 671 shl y_offsetd, filter_idx_shift
609 %if ARCH_X86_64 && mmsize == 16 672 %if ARCH_X86_64 && mmsize == 16
610 mova m8, [bilin_filter+y_offsetq] 673 mova m8, [bilin_filter+y_offsetq]
611 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 674 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
612 mova m9, [bilin_filter+y_offsetq+16] 675 mova m9, [bilin_filter+y_offsetq+16]
613 %endif 676 %endif
614 mova m10, [pw_8] 677 mova m10, [pw_8]
615 %define filter_y_a m8 678 %define filter_y_a m8
616 %define filter_y_b m9 679 %define filter_y_b m9
617 %define filter_rnd m10 680 %define filter_rnd m10
 681 %else ; x86-32
682 %if ARCH_X86=1 && CONFIG_PIC=1
683 ; x_offset == 0.5. We can reuse x_offset reg
684 %define tempq x_offsetq
685 add y_offsetq, g_bilin_filterm
686 %define filter_y_a [y_offsetq]
687 %define filter_y_b [y_offsetq+16]
688 mov tempq, g_pw_8m
689 %define filter_rnd [tempq]
618 %else 690 %else
619 add y_offsetq, bilin_filter 691 add y_offsetq, bilin_filter
620 %define filter_y_a [y_offsetq] 692 %define filter_y_a [y_offsetq]
621 %define filter_y_b [y_offsetq+16] 693 %define filter_y_b [y_offsetq+16]
622 %define filter_rnd [pw_8] 694 %define filter_rnd [pw_8]
623 %endif 695 %endif
696 %endif
697
624 %if %1 == 16 698 %if %1 == 16
625 movu m0, [srcq] 699 movu m0, [srcq]
626 movu m3, [srcq+1] 700 movu m3, [srcq+1]
627 add srcq, src_strideq 701 add srcq, src_strideq
628 pavgb m0, m3 702 pavgb m0, m3
629 .x_half_y_other_loop: 703 .x_half_y_other_loop:
630 movu m4, [srcq] 704 movu m4, [srcq]
631 movu m2, [srcq+1] 705 movu m2, [srcq+1]
632 mova m1, [dstq] 706 mova m1, [dstq]
633 pavgb m4, m2 707 pavgb m4, m2
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after
745 shl x_offsetd, filter_idx_shift 819 shl x_offsetd, filter_idx_shift
746 %if ARCH_X86_64 && mmsize == 16 820 %if ARCH_X86_64 && mmsize == 16
747 mova m8, [bilin_filter+x_offsetq] 821 mova m8, [bilin_filter+x_offsetq]
748 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 822 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
749 mova m9, [bilin_filter+x_offsetq+16] 823 mova m9, [bilin_filter+x_offsetq+16]
750 %endif 824 %endif
751 mova m10, [pw_8] 825 mova m10, [pw_8]
752 %define filter_x_a m8 826 %define filter_x_a m8
753 %define filter_x_b m9 827 %define filter_x_b m9
754 %define filter_rnd m10 828 %define filter_rnd m10
829 %else ; x86-32
830 %if ARCH_X86=1 && CONFIG_PIC=1
 831 ; y_offset == 0. We can reuse y_offset reg.
832 %define tempq y_offsetq
833 add x_offsetq, g_bilin_filterm
834 %define filter_x_a [x_offsetq]
835 %define filter_x_b [x_offsetq+16]
836 mov tempq, g_pw_8m
837 %define filter_rnd [tempq]
755 %else 838 %else
756 add x_offsetq, bilin_filter 839 add x_offsetq, bilin_filter
757 %define filter_x_a [x_offsetq] 840 %define filter_x_a [x_offsetq]
758 %define filter_x_b [x_offsetq+16] 841 %define filter_x_b [x_offsetq+16]
759 %define filter_rnd [pw_8] 842 %define filter_rnd [pw_8]
760 %endif 843 %endif
844 %endif
845
761 .x_other_y_zero_loop: 846 .x_other_y_zero_loop:
762 %if %1 == 16 847 %if %1 == 16
763 movu m0, [srcq] 848 movu m0, [srcq]
764 movu m4, [srcq+1] 849 movu m4, [srcq+1]
765 mova m1, [dstq] 850 mova m1, [dstq]
766 %if cpuflag(ssse3) 851 %if cpuflag(ssse3)
767 punpckhbw m2, m0, m4 852 punpckhbw m2, m0, m4
768 punpcklbw m0, m4 853 punpcklbw m0, m4
769 pmaddubsw m2, filter_x_a 854 pmaddubsw m2, filter_x_a
770 pmaddubsw m0, filter_x_a 855 pmaddubsw m0, filter_x_a
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
866 shl x_offsetd, filter_idx_shift 951 shl x_offsetd, filter_idx_shift
867 %if ARCH_X86_64 && mmsize == 16 952 %if ARCH_X86_64 && mmsize == 16
868 mova m8, [bilin_filter+x_offsetq] 953 mova m8, [bilin_filter+x_offsetq]
869 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 954 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
870 mova m9, [bilin_filter+x_offsetq+16] 955 mova m9, [bilin_filter+x_offsetq+16]
871 %endif 956 %endif
872 mova m10, [pw_8] 957 mova m10, [pw_8]
873 %define filter_x_a m8 958 %define filter_x_a m8
874 %define filter_x_b m9 959 %define filter_x_b m9
875 %define filter_rnd m10 960 %define filter_rnd m10
961 %else ; x86-32
962 %if ARCH_X86=1 && CONFIG_PIC=1
963 ; y_offset == 0.5. We can reuse y_offset reg.
964 %define tempq y_offsetq
965 add x_offsetq, g_bilin_filterm
966 %define filter_x_a [x_offsetq]
967 %define filter_x_b [x_offsetq+16]
968 mov tempq, g_pw_8m
969 %define filter_rnd [tempq]
876 %else 970 %else
877 add x_offsetq, bilin_filter 971 add x_offsetq, bilin_filter
878 %define filter_x_a [x_offsetq] 972 %define filter_x_a [x_offsetq]
879 %define filter_x_b [x_offsetq+16] 973 %define filter_x_b [x_offsetq+16]
880 %define filter_rnd [pw_8] 974 %define filter_rnd [pw_8]
881 %endif 975 %endif
976 %endif
977
882 %if %1 == 16 978 %if %1 == 16
883 movu m0, [srcq] 979 movu m0, [srcq]
884 movu m1, [srcq+1] 980 movu m1, [srcq+1]
885 %if cpuflag(ssse3) 981 %if cpuflag(ssse3)
886 punpckhbw m2, m0, m1 982 punpckhbw m2, m0, m1
887 punpcklbw m0, m1 983 punpcklbw m0, m1
888 pmaddubsw m2, filter_x_a 984 pmaddubsw m2, filter_x_a
889 pmaddubsw m0, filter_x_a 985 pmaddubsw m0, filter_x_a
890 paddw m2, filter_rnd 986 paddw m2, filter_rnd
891 paddw m0, filter_rnd 987 paddw m0, filter_rnd
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after
1050 mova m10, [bilin_filter+y_offsetq] 1146 mova m10, [bilin_filter+y_offsetq]
1051 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 1147 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
1052 mova m11, [bilin_filter+y_offsetq+16] 1148 mova m11, [bilin_filter+y_offsetq+16]
1053 %endif 1149 %endif
1054 mova m12, [pw_8] 1150 mova m12, [pw_8]
1055 %define filter_x_a m8 1151 %define filter_x_a m8
1056 %define filter_x_b m9 1152 %define filter_x_b m9
1057 %define filter_y_a m10 1153 %define filter_y_a m10
1058 %define filter_y_b m11 1154 %define filter_y_b m11
1059 %define filter_rnd m12 1155 %define filter_rnd m12
1156 %else ; x86-32
1157 %if ARCH_X86=1 && CONFIG_PIC=1
 1158 ; In this case, there is no unused register, so use the src_stride register.
 1159 ; Later, src_stride has to be reloaded from the stack when it is needed.
1160 %define tempq src_strideq
1161 mov tempq, g_bilin_filterm
1162 add x_offsetq, tempq
1163 add y_offsetq, tempq
1164 %define filter_x_a [x_offsetq]
1165 %define filter_x_b [x_offsetq+16]
1166 %define filter_y_a [y_offsetq]
1167 %define filter_y_b [y_offsetq+16]
1168
1169 mov tempq, g_pw_8m
1170 %define filter_rnd [tempq]
1060 %else 1171 %else
1061 add x_offsetq, bilin_filter 1172 add x_offsetq, bilin_filter
1062 add y_offsetq, bilin_filter 1173 add y_offsetq, bilin_filter
1063 %define filter_x_a [x_offsetq] 1174 %define filter_x_a [x_offsetq]
1064 %define filter_x_b [x_offsetq+16] 1175 %define filter_x_b [x_offsetq+16]
1065 %define filter_y_a [y_offsetq] 1176 %define filter_y_a [y_offsetq]
1066 %define filter_y_b [y_offsetq+16] 1177 %define filter_y_b [y_offsetq+16]
1067 %define filter_rnd [pw_8] 1178 %define filter_rnd [pw_8]
1068 %endif 1179 %endif
1180 %endif
1181
1069 ; x_offset == bilin interpolation && y_offset == bilin interpolation 1182 ; x_offset == bilin interpolation && y_offset == bilin interpolation
1070 %if %1 == 16 1183 %if %1 == 16
1071 movu m0, [srcq] 1184 movu m0, [srcq]
1072 movu m1, [srcq+1] 1185 movu m1, [srcq+1]
1073 %if cpuflag(ssse3) 1186 %if cpuflag(ssse3)
1074 punpckhbw m2, m0, m1 1187 punpckhbw m2, m0, m1
1075 punpcklbw m0, m1 1188 punpcklbw m0, m1
1076 pmaddubsw m2, filter_x_a 1189 pmaddubsw m2, filter_x_a
1077 pmaddubsw m0, filter_x_a 1190 pmaddubsw m0, filter_x_a
1078 paddw m2, filter_rnd 1191 paddw m2, filter_rnd
1079 paddw m0, filter_rnd 1192 paddw m0, filter_rnd
1080 %else 1193 %else
1081 punpckhbw m2, m0, m5 1194 punpckhbw m2, m0, m5
1082 punpckhbw m3, m1, m5 1195 punpckhbw m3, m1, m5
1083 punpcklbw m0, m5 1196 punpcklbw m0, m5
1084 punpcklbw m1, m5 1197 punpcklbw m1, m5
1085 pmullw m0, filter_x_a 1198 pmullw m0, filter_x_a
1086 pmullw m1, filter_x_b 1199 pmullw m1, filter_x_b
1087 paddw m0, filter_rnd 1200 paddw m0, filter_rnd
1088 pmullw m2, filter_x_a 1201 pmullw m2, filter_x_a
1089 pmullw m3, filter_x_b 1202 pmullw m3, filter_x_b
1090 paddw m2, filter_rnd 1203 paddw m2, filter_rnd
1091 paddw m0, m1 1204 paddw m0, m1
1092 paddw m2, m3 1205 paddw m2, m3
1093 %endif 1206 %endif
1094 psraw m0, 4 1207 psraw m0, 4
1095 psraw m2, 4 1208 psraw m2, 4
1096 add srcq, src_strideq 1209
1210 INC_SRC_BY_SRC_STRIDE
1211
1097 packuswb m0, m2 1212 packuswb m0, m2
1098 .x_other_y_other_loop: 1213 .x_other_y_other_loop:
1099 %if cpuflag(ssse3) 1214 %if cpuflag(ssse3)
1100 movu m4, [srcq] 1215 movu m4, [srcq]
1101 movu m3, [srcq+1] 1216 movu m3, [srcq+1]
1102 mova m1, [dstq] 1217 mova m1, [dstq]
1103 punpckhbw m2, m4, m3 1218 punpckhbw m2, m4, m3
1104 punpcklbw m4, m3 1219 punpcklbw m4, m3
1105 pmaddubsw m2, filter_x_a 1220 pmaddubsw m2, filter_x_a
1106 pmaddubsw m4, filter_x_a 1221 pmaddubsw m4, filter_x_a
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
1156 %if %2 == 1 ; avg 1271 %if %2 == 1 ; avg
1157 ; FIXME(rbultje) pipeline 1272 ; FIXME(rbultje) pipeline
1158 packuswb m0, m2 1273 packuswb m0, m2
1159 pavgb m0, [secq] 1274 pavgb m0, [secq]
1160 punpckhbw m2, m0, m5 1275 punpckhbw m2, m0, m5
1161 punpcklbw m0, m5 1276 punpcklbw m0, m5
1162 %endif 1277 %endif
1163 SUM_SSE m0, m1, m2, m3, m6, m7 1278 SUM_SSE m0, m1, m2, m3, m6, m7
1164 mova m0, m4 1279 mova m0, m4
1165 1280
1166 add srcq, src_strideq 1281 INC_SRC_BY_SRC_STRIDE
1167 add dstq, dst_strideq 1282 add dstq, dst_strideq
1168 %else ; %1 < 16 1283 %else ; %1 < 16
1169 movh m0, [srcq] 1284 movh m0, [srcq]
1170 movh m1, [srcq+1] 1285 movh m1, [srcq+1]
1171 %if cpuflag(ssse3) 1286 %if cpuflag(ssse3)
1172 punpcklbw m0, m1 1287 punpcklbw m0, m1
1173 pmaddubsw m0, filter_x_a 1288 pmaddubsw m0, filter_x_a
1174 paddw m0, filter_rnd 1289 paddw m0, filter_rnd
1175 %else 1290 %else
1176 punpcklbw m0, m5 1291 punpcklbw m0, m5
1177 punpcklbw m1, m5 1292 punpcklbw m1, m5
1178 pmullw m0, filter_x_a 1293 pmullw m0, filter_x_a
1179 pmullw m1, filter_x_b 1294 pmullw m1, filter_x_b
1180 paddw m0, filter_rnd 1295 paddw m0, filter_rnd
1181 paddw m0, m1 1296 paddw m0, m1
1182 %endif 1297 %endif
1183 psraw m0, 4 1298 psraw m0, 4
1184 %if cpuflag(ssse3) 1299 %if cpuflag(ssse3)
1185 packuswb m0, m0 1300 packuswb m0, m0
1186 %endif 1301 %endif
1187 add srcq, src_strideq 1302
1303 INC_SRC_BY_SRC_STRIDE
1304
1188 .x_other_y_other_loop: 1305 .x_other_y_other_loop:
1189 movh m2, [srcq] 1306 movh m2, [srcq]
1190 movh m1, [srcq+1] 1307 movh m1, [srcq+1]
1191 movh m4, [srcq+src_strideq] 1308
1192 movh m3, [srcq+src_strideq+1] 1309 INC_SRC_BY_SRC_STRIDE
1310 movh m4, [srcq]
1311 movh m3, [srcq+1]
1312
1193 %if cpuflag(ssse3) 1313 %if cpuflag(ssse3)
1194 punpcklbw m2, m1 1314 punpcklbw m2, m1
1195 punpcklbw m4, m3 1315 punpcklbw m4, m3
1196 pmaddubsw m2, filter_x_a 1316 pmaddubsw m2, filter_x_a
1197 pmaddubsw m4, filter_x_a 1317 pmaddubsw m4, filter_x_a
1198 movh m3, [dstq+dst_strideq] 1318 movh m3, [dstq+dst_strideq]
1199 movh m1, [dstq] 1319 movh m1, [dstq]
1200 paddw m2, filter_rnd 1320 paddw m2, filter_rnd
1201 paddw m4, filter_rnd 1321 paddw m4, filter_rnd
1202 psraw m2, 4 1322 psraw m2, 4
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
1246 %if %2 == 1 ; avg 1366 %if %2 == 1 ; avg
1247 ; FIXME(rbultje) pipeline 1367 ; FIXME(rbultje) pipeline
1248 packuswb m0, m2 1368 packuswb m0, m2
1249 pavgb m0, [secq] 1369 pavgb m0, [secq]
1250 punpckhbw m2, m0, m5 1370 punpckhbw m2, m0, m5
1251 punpcklbw m0, m5 1371 punpcklbw m0, m5
1252 %endif 1372 %endif
1253 SUM_SSE m0, m1, m2, m3, m6, m7 1373 SUM_SSE m0, m1, m2, m3, m6, m7
1254 mova m0, m4 1374 mova m0, m4
1255 1375
1256 lea srcq, [srcq+src_strideq*2] 1376 INC_SRC_BY_SRC_STRIDE
1257 lea dstq, [dstq+dst_strideq*2] 1377 lea dstq, [dstq+dst_strideq*2]
1258 %endif 1378 %endif
1259 %if %2 == 1 ; avg 1379 %if %2 == 1 ; avg
1260 add secq, sec_str 1380 add secq, sec_str
1261 %endif 1381 %endif
1262 dec h 1382 dec h
1263 jg .x_other_y_other_loop 1383 jg .x_other_y_other_loop
1264 %undef filter_x_a 1384 %undef filter_x_a
1265 %undef filter_x_b 1385 %undef filter_x_b
1266 %undef filter_y_a 1386 %undef filter_y_a
(...skipping 24 matching lines...) Expand all
1291 SUBPEL_VARIANCE 4, 1 1411 SUBPEL_VARIANCE 4, 1
1292 INIT_XMM sse2 1412 INIT_XMM sse2
1293 SUBPEL_VARIANCE 8, 1 1413 SUBPEL_VARIANCE 8, 1
1294 SUBPEL_VARIANCE 16, 1 1414 SUBPEL_VARIANCE 16, 1
1295 1415
1296 INIT_MMX ssse3 1416 INIT_MMX ssse3
1297 SUBPEL_VARIANCE 4, 1 1417 SUBPEL_VARIANCE 4, 1
1298 INIT_XMM ssse3 1418 INIT_XMM ssse3
1299 SUBPEL_VARIANCE 8, 1 1419 SUBPEL_VARIANCE 8, 1
1300 SUBPEL_VARIANCE 16, 1 1420 SUBPEL_VARIANCE 16, 1
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c ('k') | source/libvpx/vp9/vp9_common.mk » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698