OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
111 mov r1, ssem ; r1 = unsigned int *sse | 111 mov r1, ssem ; r1 = unsigned int *sse |
112 punpcklwd m6, m5 ; sign-extend m6 word->dword | 112 punpcklwd m6, m5 ; sign-extend m6 word->dword |
113 movd [r1], m7 ; store sse | 113 movd [r1], m7 ; store sse |
114 pshufw m4, m6, 0xe | 114 pshufw m4, m6, 0xe |
115 paddd m6, m4 | 115 paddd m6, m4 |
116 movd rax, m6 ; store sum as return value | 116 movd rax, m6 ; store sum as return value |
117 %endif | 117 %endif |
118 RET | 118 RET |
119 %endmacro | 119 %endmacro |
120 | 120 |
; INC_SRC_BY_SRC_STRIDE (0 arguments): advance srcq by one source stride.
; On 32-bit x86 PIC builds (ARCH_X86=1 && CONFIG_PIC=1) the stride is added
; via src_stridemp instead of src_strideq; the bilinear x/y path in
; SUBPEL_VARIANCE aliases src_strideq as its temp register (tempq) in that
; configuration, so the register no longer holds the stride there.
; NOTE(review): src_stridemp is presumably the x86inc pointer-sized
; stack/memory form of the src_stride argument -- confirm against x86inc.asm.
| 121 %macro INC_SRC_BY_SRC_STRIDE 0 |
| 122 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 123 add srcq, src_stridemp |
| 124 %else |
; All other configurations: the stride is taken from src_strideq.
| 125 add srcq, src_strideq |
| 126 %endif |
| 127 %endmacro |
| 128 |
121 %macro SUBPEL_VARIANCE 1-2 0 ; W | 129 %macro SUBPEL_VARIANCE 1-2 0 ; W |
122 %if cpuflag(ssse3) | 130 %if cpuflag(ssse3) |
123 %define bilin_filter_m bilin_filter_m_ssse3 | 131 %define bilin_filter_m bilin_filter_m_ssse3 |
124 %define filter_idx_shift 4 | 132 %define filter_idx_shift 4 |
125 %else | 133 %else |
126 %define bilin_filter_m bilin_filter_m_sse2 | 134 %define bilin_filter_m bilin_filter_m_sse2 |
127 %define filter_idx_shift 5 | 135 %define filter_idx_shift 5 |
128 %endif | 136 %endif |
129 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses | 137 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses |
130 ; 11, not 13, if the registers are ordered correctly. May make a minor speed | 138 ; 11, not 13, if the registers are ordered correctly. May make a minor speed |
131 ; difference on Win64 | 139 ; difference on Win64 |
132 %ifdef PIC | 140 |
133 %if %2 == 1 ; avg | 141 %ifdef PIC ; 64bit PIC |
134 cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ | 142 %if %2 == 1 ; avg |
135 x_offset, y_offset, \ | 143 cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ |
136 dst, dst_stride, \ | 144 x_offset, y_offset, \ |
137 sec, sec_stride, height, sse | 145 dst, dst_stride, \ |
138 %define sec_str sec_strideq | 146 sec, sec_stride, height, sse |
| 147 %define sec_str sec_strideq |
| 148 %else |
| 149 cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ |
| 150 y_offset, dst, dst_stride, height, sse |
| 151 %endif |
| 152 %define h heightd |
| 153 %define bilin_filter sseq |
139 %else | 154 %else |
140 cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ | 155 %if ARCH_X86=1 && CONFIG_PIC=1 |
141 dst, dst_stride, height, sse | 156 %if %2 == 1 ; avg |
| 157 cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ |
| 158 x_offset, y_offset, \ |
| 159 dst, dst_stride, \ |
| 160 sec, sec_stride, \ |
| 161 height, sse, g_bilin_filter, g_pw_8 |
| 162 %define h dword heightm |
| 163 %define sec_str sec_stridemp |
| 164 |
| 165 ;Store bilin_filter and pw_8 location in stack |
| 166 GET_GOT eax |
| 167 add esp, 4 ; restore esp |
| 168 |
| 169 lea ecx, [GLOBAL(bilin_filter_m)] |
| 170 mov g_bilin_filterm, ecx |
| 171 |
| 172 lea ecx, [GLOBAL(pw_8)] |
| 173 mov g_pw_8m, ecx |
| 174 |
| 175 LOAD_IF_USED 0, 1 ; load eax, ecx back |
| 176 %else |
| 177 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ |
| 178 y_offset, dst, dst_stride, height, sse, \ |
| 179 g_bilin_filter, g_pw_8 |
| 180 %define h heightd |
| 181 |
| 182 ;Store bilin_filter and pw_8 location in stack |
| 183 GET_GOT eax |
| 184 add esp, 4 ; restore esp |
| 185 |
| 186 lea ecx, [GLOBAL(bilin_filter_m)] |
| 187 mov g_bilin_filterm, ecx |
| 188 |
| 189 lea ecx, [GLOBAL(pw_8)] |
| 190 mov g_pw_8m, ecx |
| 191 |
| 192 LOAD_IF_USED 0, 1 ; load eax, ecx back |
| 193 %endif |
| 194 %else |
| 195 %if %2 == 1 ; avg |
| 196 cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ |
| 197 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ |
| 198 x_offset, y_offset, \ |
| 199 dst, dst_stride, \ |
| 200 sec, sec_stride, \ |
| 201 height, sse |
| 202 %if ARCH_X86_64 |
| 203 %define h heightd |
| 204 %define sec_str sec_strideq |
| 205 %else |
| 206 %define h dword heightm |
| 207 %define sec_str sec_stridemp |
| 208 %endif |
| 209 %else |
| 210 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ |
| 211 y_offset, dst, dst_stride, height, sse |
| 212 %define h heightd |
| 213 %endif |
| 214 |
| 215 %define bilin_filter bilin_filter_m |
| 216 %endif |
142 %endif | 217 %endif |
143 %define h heightd | 218 |
144 %define bilin_filter sseq | |
145 %else | |
146 %if %2 == 1 ; avg | |
147 cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ | |
148 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ | |
149 x_offset, y_offset, \ | |
150 dst, dst_stride, \ | |
151 sec, sec_stride, \ | |
152 height, sse | |
153 %if ARCH_X86_64 | |
154 %define h heightd | |
155 %define sec_str sec_strideq | |
156 %else | |
157 %define h dword heightm | |
158 %define sec_str sec_stridemp | |
159 %endif | |
160 %else | |
161 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ | |
162 dst, dst_stride, height, sse | |
163 %define h heightd | |
164 %endif | |
165 %define bilin_filter bilin_filter_m | |
166 %endif | |
167 ASSERT %1 <= 16 ; m6 overflows if w > 16 | 219 ASSERT %1 <= 16 ; m6 overflows if w > 16 |
168 pxor m6, m6 ; sum | 220 pxor m6, m6 ; sum |
169 pxor m7, m7 ; sse | 221 pxor m7, m7 ; sse |
170 ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we | 222 ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we |
171 ; could perhaps use it for something more productive then | 223 ; could perhaps use it for something more productive then |
172 pxor m5, m5 ; dedicated zero register | 224 pxor m5, m5 ; dedicated zero register |
173 %if %1 < 16 | 225 %if %1 < 16 |
174 sar h, 1 | 226 sar h, 1 |
175 %if %2 == 1 ; avg | 227 %if %2 == 1 ; avg |
176 shl sec_str, 1 | 228 shl sec_str, 1 |
(...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
322 %if ARCH_X86_64 && mmsize == 16 | 374 %if ARCH_X86_64 && mmsize == 16 |
323 mova m8, [bilin_filter+y_offsetq] | 375 mova m8, [bilin_filter+y_offsetq] |
324 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 376 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
325 mova m9, [bilin_filter+y_offsetq+16] | 377 mova m9, [bilin_filter+y_offsetq+16] |
326 %endif | 378 %endif |
327 mova m10, [pw_8] | 379 mova m10, [pw_8] |
328 %define filter_y_a m8 | 380 %define filter_y_a m8 |
329 %define filter_y_b m9 | 381 %define filter_y_b m9 |
330 %define filter_rnd m10 | 382 %define filter_rnd m10 |
331 %else ; x86-32 or mmx | 383 %else ; x86-32 or mmx |
| 384 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 385 ; x_offset == 0, reuse x_offset reg |
| 386 %define tempq x_offsetq |
| 387 add y_offsetq, g_bilin_filterm |
| 388 %define filter_y_a [y_offsetq] |
| 389 %define filter_y_b [y_offsetq+16] |
| 390 mov tempq, g_pw_8m |
| 391 %define filter_rnd [tempq] |
| 392 %else |
332 add y_offsetq, bilin_filter | 393 add y_offsetq, bilin_filter |
333 %define filter_y_a [y_offsetq] | 394 %define filter_y_a [y_offsetq] |
334 %define filter_y_b [y_offsetq+16] | 395 %define filter_y_b [y_offsetq+16] |
335 %define filter_rnd [pw_8] | 396 %define filter_rnd [pw_8] |
336 %endif | 397 %endif |
| 398 %endif |
| 399 |
337 .x_zero_y_other_loop: | 400 .x_zero_y_other_loop: |
338 %if %1 == 16 | 401 %if %1 == 16 |
339 movu m0, [srcq] | 402 movu m0, [srcq] |
340 movu m4, [srcq+src_strideq] | 403 movu m4, [srcq+src_strideq] |
341 mova m1, [dstq] | 404 mova m1, [dstq] |
342 %if cpuflag(ssse3) | 405 %if cpuflag(ssse3) |
343 punpckhbw m2, m0, m4 | 406 punpckhbw m2, m0, m4 |
344 punpcklbw m0, m4 | 407 punpcklbw m0, m4 |
345 pmaddubsw m2, filter_y_a | 408 pmaddubsw m2, filter_y_a |
346 pmaddubsw m0, filter_y_a | 409 pmaddubsw m0, filter_y_a |
(...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
608 shl y_offsetd, filter_idx_shift | 671 shl y_offsetd, filter_idx_shift |
609 %if ARCH_X86_64 && mmsize == 16 | 672 %if ARCH_X86_64 && mmsize == 16 |
610 mova m8, [bilin_filter+y_offsetq] | 673 mova m8, [bilin_filter+y_offsetq] |
611 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 674 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
612 mova m9, [bilin_filter+y_offsetq+16] | 675 mova m9, [bilin_filter+y_offsetq+16] |
613 %endif | 676 %endif |
614 mova m10, [pw_8] | 677 mova m10, [pw_8] |
615 %define filter_y_a m8 | 678 %define filter_y_a m8 |
616 %define filter_y_b m9 | 679 %define filter_y_b m9 |
617 %define filter_rnd m10 | 680 %define filter_rnd m10 |
| 681 %else ;x86_32 |
| 682 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 683 ; x_offset == 0.5. We can reuse x_offset reg |
| 684 %define tempq x_offsetq |
| 685 add y_offsetq, g_bilin_filterm |
| 686 %define filter_y_a [y_offsetq] |
| 687 %define filter_y_b [y_offsetq+16] |
| 688 mov tempq, g_pw_8m |
| 689 %define filter_rnd [tempq] |
618 %else | 690 %else |
619 add y_offsetq, bilin_filter | 691 add y_offsetq, bilin_filter |
620 %define filter_y_a [y_offsetq] | 692 %define filter_y_a [y_offsetq] |
621 %define filter_y_b [y_offsetq+16] | 693 %define filter_y_b [y_offsetq+16] |
622 %define filter_rnd [pw_8] | 694 %define filter_rnd [pw_8] |
623 %endif | 695 %endif |
| 696 %endif |
| 697 |
624 %if %1 == 16 | 698 %if %1 == 16 |
625 movu m0, [srcq] | 699 movu m0, [srcq] |
626 movu m3, [srcq+1] | 700 movu m3, [srcq+1] |
627 add srcq, src_strideq | 701 add srcq, src_strideq |
628 pavgb m0, m3 | 702 pavgb m0, m3 |
629 .x_half_y_other_loop: | 703 .x_half_y_other_loop: |
630 movu m4, [srcq] | 704 movu m4, [srcq] |
631 movu m2, [srcq+1] | 705 movu m2, [srcq+1] |
632 mova m1, [dstq] | 706 mova m1, [dstq] |
633 pavgb m4, m2 | 707 pavgb m4, m2 |
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
745 shl x_offsetd, filter_idx_shift | 819 shl x_offsetd, filter_idx_shift |
746 %if ARCH_X86_64 && mmsize == 16 | 820 %if ARCH_X86_64 && mmsize == 16 |
747 mova m8, [bilin_filter+x_offsetq] | 821 mova m8, [bilin_filter+x_offsetq] |
748 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 822 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
749 mova m9, [bilin_filter+x_offsetq+16] | 823 mova m9, [bilin_filter+x_offsetq+16] |
750 %endif | 824 %endif |
751 mova m10, [pw_8] | 825 mova m10, [pw_8] |
752 %define filter_x_a m8 | 826 %define filter_x_a m8 |
753 %define filter_x_b m9 | 827 %define filter_x_b m9 |
754 %define filter_rnd m10 | 828 %define filter_rnd m10 |
| 829 %else ; x86-32 |
| 830 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 831 ;y_offset == 0. We can reuse y_offset reg. |
| 832 %define tempq y_offsetq |
| 833 add x_offsetq, g_bilin_filterm |
| 834 %define filter_x_a [x_offsetq] |
| 835 %define filter_x_b [x_offsetq+16] |
| 836 mov tempq, g_pw_8m |
| 837 %define filter_rnd [tempq] |
755 %else | 838 %else |
756 add x_offsetq, bilin_filter | 839 add x_offsetq, bilin_filter |
757 %define filter_x_a [x_offsetq] | 840 %define filter_x_a [x_offsetq] |
758 %define filter_x_b [x_offsetq+16] | 841 %define filter_x_b [x_offsetq+16] |
759 %define filter_rnd [pw_8] | 842 %define filter_rnd [pw_8] |
760 %endif | 843 %endif |
| 844 %endif |
| 845 |
761 .x_other_y_zero_loop: | 846 .x_other_y_zero_loop: |
762 %if %1 == 16 | 847 %if %1 == 16 |
763 movu m0, [srcq] | 848 movu m0, [srcq] |
764 movu m4, [srcq+1] | 849 movu m4, [srcq+1] |
765 mova m1, [dstq] | 850 mova m1, [dstq] |
766 %if cpuflag(ssse3) | 851 %if cpuflag(ssse3) |
767 punpckhbw m2, m0, m4 | 852 punpckhbw m2, m0, m4 |
768 punpcklbw m0, m4 | 853 punpcklbw m0, m4 |
769 pmaddubsw m2, filter_x_a | 854 pmaddubsw m2, filter_x_a |
770 pmaddubsw m0, filter_x_a | 855 pmaddubsw m0, filter_x_a |
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
866 shl x_offsetd, filter_idx_shift | 951 shl x_offsetd, filter_idx_shift |
867 %if ARCH_X86_64 && mmsize == 16 | 952 %if ARCH_X86_64 && mmsize == 16 |
868 mova m8, [bilin_filter+x_offsetq] | 953 mova m8, [bilin_filter+x_offsetq] |
869 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 954 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
870 mova m9, [bilin_filter+x_offsetq+16] | 955 mova m9, [bilin_filter+x_offsetq+16] |
871 %endif | 956 %endif |
872 mova m10, [pw_8] | 957 mova m10, [pw_8] |
873 %define filter_x_a m8 | 958 %define filter_x_a m8 |
874 %define filter_x_b m9 | 959 %define filter_x_b m9 |
875 %define filter_rnd m10 | 960 %define filter_rnd m10 |
| 961 %else ; x86-32 |
| 962 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 963 ; y_offset == 0.5. We can reuse y_offset reg. |
| 964 %define tempq y_offsetq |
| 965 add x_offsetq, g_bilin_filterm |
| 966 %define filter_x_a [x_offsetq] |
| 967 %define filter_x_b [x_offsetq+16] |
| 968 mov tempq, g_pw_8m |
| 969 %define filter_rnd [tempq] |
876 %else | 970 %else |
877 add x_offsetq, bilin_filter | 971 add x_offsetq, bilin_filter |
878 %define filter_x_a [x_offsetq] | 972 %define filter_x_a [x_offsetq] |
879 %define filter_x_b [x_offsetq+16] | 973 %define filter_x_b [x_offsetq+16] |
880 %define filter_rnd [pw_8] | 974 %define filter_rnd [pw_8] |
881 %endif | 975 %endif |
| 976 %endif |
| 977 |
882 %if %1 == 16 | 978 %if %1 == 16 |
883 movu m0, [srcq] | 979 movu m0, [srcq] |
884 movu m1, [srcq+1] | 980 movu m1, [srcq+1] |
885 %if cpuflag(ssse3) | 981 %if cpuflag(ssse3) |
886 punpckhbw m2, m0, m1 | 982 punpckhbw m2, m0, m1 |
887 punpcklbw m0, m1 | 983 punpcklbw m0, m1 |
888 pmaddubsw m2, filter_x_a | 984 pmaddubsw m2, filter_x_a |
889 pmaddubsw m0, filter_x_a | 985 pmaddubsw m0, filter_x_a |
890 paddw m2, filter_rnd | 986 paddw m2, filter_rnd |
891 paddw m0, filter_rnd | 987 paddw m0, filter_rnd |
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1050 mova m10, [bilin_filter+y_offsetq] | 1146 mova m10, [bilin_filter+y_offsetq] |
1051 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 1147 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
1052 mova m11, [bilin_filter+y_offsetq+16] | 1148 mova m11, [bilin_filter+y_offsetq+16] |
1053 %endif | 1149 %endif |
1054 mova m12, [pw_8] | 1150 mova m12, [pw_8] |
1055 %define filter_x_a m8 | 1151 %define filter_x_a m8 |
1056 %define filter_x_b m9 | 1152 %define filter_x_b m9 |
1057 %define filter_y_a m10 | 1153 %define filter_y_a m10 |
1058 %define filter_y_b m11 | 1154 %define filter_y_b m11 |
1059 %define filter_rnd m12 | 1155 %define filter_rnd m12 |
| 1156 %else ; x86-32 |
| 1157 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 1158 ; In this case there is NO unused register, so the src_stride register is |
| 1159 ; used as a temporary. Later, src_stride has to be read from the stack when it is needed. |
| 1160 %define tempq src_strideq |
| 1161 mov tempq, g_bilin_filterm |
| 1162 add x_offsetq, tempq |
| 1163 add y_offsetq, tempq |
| 1164 %define filter_x_a [x_offsetq] |
| 1165 %define filter_x_b [x_offsetq+16] |
| 1166 %define filter_y_a [y_offsetq] |
| 1167 %define filter_y_b [y_offsetq+16] |
| 1168 |
| 1169 mov tempq, g_pw_8m |
| 1170 %define filter_rnd [tempq] |
1060 %else | 1171 %else |
1061 add x_offsetq, bilin_filter | 1172 add x_offsetq, bilin_filter |
1062 add y_offsetq, bilin_filter | 1173 add y_offsetq, bilin_filter |
1063 %define filter_x_a [x_offsetq] | 1174 %define filter_x_a [x_offsetq] |
1064 %define filter_x_b [x_offsetq+16] | 1175 %define filter_x_b [x_offsetq+16] |
1065 %define filter_y_a [y_offsetq] | 1176 %define filter_y_a [y_offsetq] |
1066 %define filter_y_b [y_offsetq+16] | 1177 %define filter_y_b [y_offsetq+16] |
1067 %define filter_rnd [pw_8] | 1178 %define filter_rnd [pw_8] |
1068 %endif | 1179 %endif |
| 1180 %endif |
| 1181 |
1069 ; x_offset == bilin interpolation && y_offset == bilin interpolation | 1182 ; x_offset == bilin interpolation && y_offset == bilin interpolation |
1070 %if %1 == 16 | 1183 %if %1 == 16 |
1071 movu m0, [srcq] | 1184 movu m0, [srcq] |
1072 movu m1, [srcq+1] | 1185 movu m1, [srcq+1] |
1073 %if cpuflag(ssse3) | 1186 %if cpuflag(ssse3) |
1074 punpckhbw m2, m0, m1 | 1187 punpckhbw m2, m0, m1 |
1075 punpcklbw m0, m1 | 1188 punpcklbw m0, m1 |
1076 pmaddubsw m2, filter_x_a | 1189 pmaddubsw m2, filter_x_a |
1077 pmaddubsw m0, filter_x_a | 1190 pmaddubsw m0, filter_x_a |
1078 paddw m2, filter_rnd | 1191 paddw m2, filter_rnd |
1079 paddw m0, filter_rnd | 1192 paddw m0, filter_rnd |
1080 %else | 1193 %else |
1081 punpckhbw m2, m0, m5 | 1194 punpckhbw m2, m0, m5 |
1082 punpckhbw m3, m1, m5 | 1195 punpckhbw m3, m1, m5 |
1083 punpcklbw m0, m5 | 1196 punpcklbw m0, m5 |
1084 punpcklbw m1, m5 | 1197 punpcklbw m1, m5 |
1085 pmullw m0, filter_x_a | 1198 pmullw m0, filter_x_a |
1086 pmullw m1, filter_x_b | 1199 pmullw m1, filter_x_b |
1087 paddw m0, filter_rnd | 1200 paddw m0, filter_rnd |
1088 pmullw m2, filter_x_a | 1201 pmullw m2, filter_x_a |
1089 pmullw m3, filter_x_b | 1202 pmullw m3, filter_x_b |
1090 paddw m2, filter_rnd | 1203 paddw m2, filter_rnd |
1091 paddw m0, m1 | 1204 paddw m0, m1 |
1092 paddw m2, m3 | 1205 paddw m2, m3 |
1093 %endif | 1206 %endif |
1094 psraw m0, 4 | 1207 psraw m0, 4 |
1095 psraw m2, 4 | 1208 psraw m2, 4 |
1096 add srcq, src_strideq | 1209 |
| 1210 INC_SRC_BY_SRC_STRIDE |
| 1211 |
1097 packuswb m0, m2 | 1212 packuswb m0, m2 |
1098 .x_other_y_other_loop: | 1213 .x_other_y_other_loop: |
1099 %if cpuflag(ssse3) | 1214 %if cpuflag(ssse3) |
1100 movu m4, [srcq] | 1215 movu m4, [srcq] |
1101 movu m3, [srcq+1] | 1216 movu m3, [srcq+1] |
1102 mova m1, [dstq] | 1217 mova m1, [dstq] |
1103 punpckhbw m2, m4, m3 | 1218 punpckhbw m2, m4, m3 |
1104 punpcklbw m4, m3 | 1219 punpcklbw m4, m3 |
1105 pmaddubsw m2, filter_x_a | 1220 pmaddubsw m2, filter_x_a |
1106 pmaddubsw m4, filter_x_a | 1221 pmaddubsw m4, filter_x_a |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1156 %if %2 == 1 ; avg | 1271 %if %2 == 1 ; avg |
1157 ; FIXME(rbultje) pipeline | 1272 ; FIXME(rbultje) pipeline |
1158 packuswb m0, m2 | 1273 packuswb m0, m2 |
1159 pavgb m0, [secq] | 1274 pavgb m0, [secq] |
1160 punpckhbw m2, m0, m5 | 1275 punpckhbw m2, m0, m5 |
1161 punpcklbw m0, m5 | 1276 punpcklbw m0, m5 |
1162 %endif | 1277 %endif |
1163 SUM_SSE m0, m1, m2, m3, m6, m7 | 1278 SUM_SSE m0, m1, m2, m3, m6, m7 |
1164 mova m0, m4 | 1279 mova m0, m4 |
1165 | 1280 |
1166 add srcq, src_strideq | 1281 INC_SRC_BY_SRC_STRIDE |
1167 add dstq, dst_strideq | 1282 add dstq, dst_strideq |
1168 %else ; %1 < 16 | 1283 %else ; %1 < 16 |
1169 movh m0, [srcq] | 1284 movh m0, [srcq] |
1170 movh m1, [srcq+1] | 1285 movh m1, [srcq+1] |
1171 %if cpuflag(ssse3) | 1286 %if cpuflag(ssse3) |
1172 punpcklbw m0, m1 | 1287 punpcklbw m0, m1 |
1173 pmaddubsw m0, filter_x_a | 1288 pmaddubsw m0, filter_x_a |
1174 paddw m0, filter_rnd | 1289 paddw m0, filter_rnd |
1175 %else | 1290 %else |
1176 punpcklbw m0, m5 | 1291 punpcklbw m0, m5 |
1177 punpcklbw m1, m5 | 1292 punpcklbw m1, m5 |
1178 pmullw m0, filter_x_a | 1293 pmullw m0, filter_x_a |
1179 pmullw m1, filter_x_b | 1294 pmullw m1, filter_x_b |
1180 paddw m0, filter_rnd | 1295 paddw m0, filter_rnd |
1181 paddw m0, m1 | 1296 paddw m0, m1 |
1182 %endif | 1297 %endif |
1183 psraw m0, 4 | 1298 psraw m0, 4 |
1184 %if cpuflag(ssse3) | 1299 %if cpuflag(ssse3) |
1185 packuswb m0, m0 | 1300 packuswb m0, m0 |
1186 %endif | 1301 %endif |
1187 add srcq, src_strideq | 1302 |
| 1303 INC_SRC_BY_SRC_STRIDE |
| 1304 |
1188 .x_other_y_other_loop: | 1305 .x_other_y_other_loop: |
1189 movh m2, [srcq] | 1306 movh m2, [srcq] |
1190 movh m1, [srcq+1] | 1307 movh m1, [srcq+1] |
1191 movh m4, [srcq+src_strideq] | 1308 |
1192 movh m3, [srcq+src_strideq+1] | 1309 INC_SRC_BY_SRC_STRIDE |
| 1310 movh m4, [srcq] |
| 1311 movh m3, [srcq+1] |
| 1312 |
1193 %if cpuflag(ssse3) | 1313 %if cpuflag(ssse3) |
1194 punpcklbw m2, m1 | 1314 punpcklbw m2, m1 |
1195 punpcklbw m4, m3 | 1315 punpcklbw m4, m3 |
1196 pmaddubsw m2, filter_x_a | 1316 pmaddubsw m2, filter_x_a |
1197 pmaddubsw m4, filter_x_a | 1317 pmaddubsw m4, filter_x_a |
1198 movh m3, [dstq+dst_strideq] | 1318 movh m3, [dstq+dst_strideq] |
1199 movh m1, [dstq] | 1319 movh m1, [dstq] |
1200 paddw m2, filter_rnd | 1320 paddw m2, filter_rnd |
1201 paddw m4, filter_rnd | 1321 paddw m4, filter_rnd |
1202 psraw m2, 4 | 1322 psraw m2, 4 |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1246 %if %2 == 1 ; avg | 1366 %if %2 == 1 ; avg |
1247 ; FIXME(rbultje) pipeline | 1367 ; FIXME(rbultje) pipeline |
1248 packuswb m0, m2 | 1368 packuswb m0, m2 |
1249 pavgb m0, [secq] | 1369 pavgb m0, [secq] |
1250 punpckhbw m2, m0, m5 | 1370 punpckhbw m2, m0, m5 |
1251 punpcklbw m0, m5 | 1371 punpcklbw m0, m5 |
1252 %endif | 1372 %endif |
1253 SUM_SSE m0, m1, m2, m3, m6, m7 | 1373 SUM_SSE m0, m1, m2, m3, m6, m7 |
1254 mova m0, m4 | 1374 mova m0, m4 |
1255 | 1375 |
1256 lea srcq, [srcq+src_strideq*2] | 1376 INC_SRC_BY_SRC_STRIDE |
1257 lea dstq, [dstq+dst_strideq*2] | 1377 lea dstq, [dstq+dst_strideq*2] |
1258 %endif | 1378 %endif |
1259 %if %2 == 1 ; avg | 1379 %if %2 == 1 ; avg |
1260 add secq, sec_str | 1380 add secq, sec_str |
1261 %endif | 1381 %endif |
1262 dec h | 1382 dec h |
1263 jg .x_other_y_other_loop | 1383 jg .x_other_y_other_loop |
1264 %undef filter_x_a | 1384 %undef filter_x_a |
1265 %undef filter_x_b | 1385 %undef filter_x_b |
1266 %undef filter_y_a | 1386 %undef filter_y_a |
(...skipping 24 matching lines...) Expand all Loading... |
; Averaging instantiations: a second argument of 1 makes SUBPEL_VARIANCE emit
; the sub_pixel_avg_variance<W>xh entry point (the "%if %2 == 1 ; avg" branch).
; NOTE(review): the INIT_* line governing the first row below is outside this
; view -- confirm which instruction set it selects.
1291 SUBPEL_VARIANCE 4, 1 | 1411 SUBPEL_VARIANCE 4, 1 |
1292 INIT_XMM sse2 | 1412 INIT_XMM sse2 |
1293 SUBPEL_VARIANCE 8, 1 | 1413 SUBPEL_VARIANCE 8, 1 |
1294 SUBPEL_VARIANCE 16, 1 | 1414 SUBPEL_VARIANCE 16, 1 |
1295 | 1415 |
; Same three widths with the ssse3 cpuflag: inside the macro this selects
; bilin_filter_m_ssse3 and filter_idx_shift 4. Width 4 is instantiated under
; INIT_MMX, widths 8 and 16 under INIT_XMM.
1296 INIT_MMX ssse3 | 1416 INIT_MMX ssse3 |
1297 SUBPEL_VARIANCE 4, 1 | 1417 SUBPEL_VARIANCE 4, 1 |
1298 INIT_XMM ssse3 | 1418 INIT_XMM ssse3 |
1299 SUBPEL_VARIANCE 8, 1 | 1419 SUBPEL_VARIANCE 8, 1 |
1300 SUBPEL_VARIANCE 16, 1 | 1420 SUBPEL_VARIANCE 16, 1 |
OLD | NEW |