| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| (...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 111 mov r1, ssem ; r1 = unsigned int *sse | 111 mov r1, ssem ; r1 = unsigned int *sse |
| 112 punpcklwd m6, m5 ; sign-extend m6 word->dword | 112 punpcklwd m6, m5 ; sign-extend m6 word->dword |
| 113 movd [r1], m7 ; store sse | 113 movd [r1], m7 ; store sse |
| 114 pshufw m4, m6, 0xe | 114 pshufw m4, m6, 0xe |
| 115 paddd m6, m4 | 115 paddd m6, m4 |
| 116 movd rax, m6 ; store sum as return value | 116 movd rax, m6 ; store sum as return value |
| 117 %endif | 117 %endif |
| 118 RET | 118 RET |
| 119 %endmacro | 119 %endmacro |
| 120 | 120 |
| 121 %macro INC_SRC_BY_SRC_STRIDE 0 |
| 122 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 123 add srcq, src_stridemp |
| 124 %else |
| 125 add srcq, src_strideq |
| 126 %endif |
| 127 %endmacro |
| 128 |
| 121 %macro SUBPEL_VARIANCE 1-2 0 ; W | 129 %macro SUBPEL_VARIANCE 1-2 0 ; W |
| 122 %if cpuflag(ssse3) | 130 %if cpuflag(ssse3) |
| 123 %define bilin_filter_m bilin_filter_m_ssse3 | 131 %define bilin_filter_m bilin_filter_m_ssse3 |
| 124 %define filter_idx_shift 4 | 132 %define filter_idx_shift 4 |
| 125 %else | 133 %else |
| 126 %define bilin_filter_m bilin_filter_m_sse2 | 134 %define bilin_filter_m bilin_filter_m_sse2 |
| 127 %define filter_idx_shift 5 | 135 %define filter_idx_shift 5 |
| 128 %endif | 136 %endif |
| 129 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses | 137 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses |
| 130 ; 11, not 13, if the registers are ordered correctly. May make a minor speed | 138 ; 11, not 13, if the registers are ordered correctly. May make a minor speed |
| 131 ; difference on Win64 | 139 ; difference on Win64 |
| 132 %ifdef PIC | 140 |
| 133 %if %2 == 1 ; avg | 141 %ifdef PIC ; 64bit PIC |
| 134 cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ | 142 %if %2 == 1 ; avg |
| 135 x_offset, y_offset, \ | 143 cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ |
| 136 dst, dst_stride, \ | 144 x_offset, y_offset, \ |
| 137 sec, sec_stride, height, sse | 145 dst, dst_stride, \ |
| 138 %define sec_str sec_strideq | 146 sec, sec_stride, height, sse |
| 147 %define sec_str sec_strideq |
| 148 %else |
| 149 cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ |
| 150 y_offset, dst, dst_stride, height, sse |
| 151 %endif |
| 152 %define h heightd |
| 153 %define bilin_filter sseq |
| 139 %else | 154 %else |
| 140 cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ | 155 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 141 dst, dst_stride, height, sse | 156 %if %2 == 1 ; avg |
| 157 cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ |
| 158 x_offset, y_offset, \ |
| 159 dst, dst_stride, \ |
| 160 sec, sec_stride, \ |
| 161 height, sse, g_bilin_filter, g_pw_8 |
| 162 %define h dword heightm |
| 163 %define sec_str sec_stridemp |
| 164 |
| 165 ; Store the bilin_filter and pw_8 locations on the stack |
| 166 GET_GOT eax |
| 167 add esp, 4 ; restore esp |
| 168 |
| 169 lea ecx, [GLOBAL(bilin_filter_m)] |
| 170 mov g_bilin_filterm, ecx |
| 171 |
| 172 lea ecx, [GLOBAL(pw_8)] |
| 173 mov g_pw_8m, ecx |
| 174 |
| 175 LOAD_IF_USED 0, 1 ; load eax, ecx back |
| 176 %else |
| 177 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ |
| 178 y_offset, dst, dst_stride, height, sse, \ |
| 179 g_bilin_filter, g_pw_8 |
| 180 %define h heightd |
| 181 |
| 182 ; Store the bilin_filter and pw_8 locations on the stack |
| 183 GET_GOT eax |
| 184 add esp, 4 ; restore esp |
| 185 |
| 186 lea ecx, [GLOBAL(bilin_filter_m)] |
| 187 mov g_bilin_filterm, ecx |
| 188 |
| 189 lea ecx, [GLOBAL(pw_8)] |
| 190 mov g_pw_8m, ecx |
| 191 |
| 192 LOAD_IF_USED 0, 1 ; load eax, ecx back |
| 193 %endif |
| 194 %else |
| 195 %if %2 == 1 ; avg |
| 196 cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ |
| 197 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ |
| 198 x_offset, y_offset, \ |
| 199 dst, dst_stride, \ |
| 200 sec, sec_stride, \ |
| 201 height, sse |
| 202 %if ARCH_X86_64 |
| 203 %define h heightd |
| 204 %define sec_str sec_strideq |
| 205 %else |
| 206 %define h dword heightm |
| 207 %define sec_str sec_stridemp |
| 208 %endif |
| 209 %else |
| 210 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ |
| 211 y_offset, dst, dst_stride, height, sse |
| 212 %define h heightd |
| 213 %endif |
| 214 |
| 215 %define bilin_filter bilin_filter_m |
| 216 %endif |
| 142 %endif | 217 %endif |
| 143 %define h heightd | 218 |
| 144 %define bilin_filter sseq | |
| 145 %else | |
| 146 %if %2 == 1 ; avg | |
| 147 cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ | |
| 148 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ | |
| 149 x_offset, y_offset, \ | |
| 150 dst, dst_stride, \ | |
| 151 sec, sec_stride, \ | |
| 152 height, sse | |
| 153 %if ARCH_X86_64 | |
| 154 %define h heightd | |
| 155 %define sec_str sec_strideq | |
| 156 %else | |
| 157 %define h dword heightm | |
| 158 %define sec_str sec_stridemp | |
| 159 %endif | |
| 160 %else | |
| 161 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ | |
| 162 dst, dst_stride, height, sse | |
| 163 %define h heightd | |
| 164 %endif | |
| 165 %define bilin_filter bilin_filter_m | |
| 166 %endif | |
| 167 ASSERT %1 <= 16 ; m6 overflows if w > 16 | 219 ASSERT %1 <= 16 ; m6 overflows if w > 16 |
| 168 pxor m6, m6 ; sum | 220 pxor m6, m6 ; sum |
| 169 pxor m7, m7 ; sse | 221 pxor m7, m7 ; sse |
| 170 ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we | 222 ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we |
| 171 ; could perhaps use it for something more productive then | 223 ; could perhaps use it for something more productive then |
| 172 pxor m5, m5 ; dedicated zero register | 224 pxor m5, m5 ; dedicated zero register |
| 173 %if %1 < 16 | 225 %if %1 < 16 |
| 174 sar h, 1 | 226 sar h, 1 |
| 175 %if %2 == 1 ; avg | 227 %if %2 == 1 ; avg |
| 176 shl sec_str, 1 | 228 shl sec_str, 1 |
| (...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 322 %if ARCH_X86_64 && mmsize == 16 | 374 %if ARCH_X86_64 && mmsize == 16 |
| 323 mova m8, [bilin_filter+y_offsetq] | 375 mova m8, [bilin_filter+y_offsetq] |
| 324 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 376 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
| 325 mova m9, [bilin_filter+y_offsetq+16] | 377 mova m9, [bilin_filter+y_offsetq+16] |
| 326 %endif | 378 %endif |
| 327 mova m10, [pw_8] | 379 mova m10, [pw_8] |
| 328 %define filter_y_a m8 | 380 %define filter_y_a m8 |
| 329 %define filter_y_b m9 | 381 %define filter_y_b m9 |
| 330 %define filter_rnd m10 | 382 %define filter_rnd m10 |
| 331 %else ; x86-32 or mmx | 383 %else ; x86-32 or mmx |
| 384 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 385 ; x_offset == 0, reuse x_offset reg |
| 386 %define tempq x_offsetq |
| 387 add y_offsetq, g_bilin_filterm |
| 388 %define filter_y_a [y_offsetq] |
| 389 %define filter_y_b [y_offsetq+16] |
| 390 mov tempq, g_pw_8m |
| 391 %define filter_rnd [tempq] |
| 392 %else |
| 332 add y_offsetq, bilin_filter | 393 add y_offsetq, bilin_filter |
| 333 %define filter_y_a [y_offsetq] | 394 %define filter_y_a [y_offsetq] |
| 334 %define filter_y_b [y_offsetq+16] | 395 %define filter_y_b [y_offsetq+16] |
| 335 %define filter_rnd [pw_8] | 396 %define filter_rnd [pw_8] |
| 336 %endif | 397 %endif |
| 398 %endif |
| 399 |
| 337 .x_zero_y_other_loop: | 400 .x_zero_y_other_loop: |
| 338 %if %1 == 16 | 401 %if %1 == 16 |
| 339 movu m0, [srcq] | 402 movu m0, [srcq] |
| 340 movu m4, [srcq+src_strideq] | 403 movu m4, [srcq+src_strideq] |
| 341 mova m1, [dstq] | 404 mova m1, [dstq] |
| 342 %if cpuflag(ssse3) | 405 %if cpuflag(ssse3) |
| 343 punpckhbw m2, m0, m4 | 406 punpckhbw m2, m0, m4 |
| 344 punpcklbw m0, m4 | 407 punpcklbw m0, m4 |
| 345 pmaddubsw m2, filter_y_a | 408 pmaddubsw m2, filter_y_a |
| 346 pmaddubsw m0, filter_y_a | 409 pmaddubsw m0, filter_y_a |
| (...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 608 shl y_offsetd, filter_idx_shift | 671 shl y_offsetd, filter_idx_shift |
| 609 %if ARCH_X86_64 && mmsize == 16 | 672 %if ARCH_X86_64 && mmsize == 16 |
| 610 mova m8, [bilin_filter+y_offsetq] | 673 mova m8, [bilin_filter+y_offsetq] |
| 611 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 674 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
| 612 mova m9, [bilin_filter+y_offsetq+16] | 675 mova m9, [bilin_filter+y_offsetq+16] |
| 613 %endif | 676 %endif |
| 614 mova m10, [pw_8] | 677 mova m10, [pw_8] |
| 615 %define filter_y_a m8 | 678 %define filter_y_a m8 |
| 616 %define filter_y_b m9 | 679 %define filter_y_b m9 |
| 617 %define filter_rnd m10 | 680 %define filter_rnd m10 |
| 681 %else ; x86-32 |
| 682 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 683 ; x_offset == 0.5. We can reuse x_offset reg |
| 684 %define tempq x_offsetq |
| 685 add y_offsetq, g_bilin_filterm |
| 686 %define filter_y_a [y_offsetq] |
| 687 %define filter_y_b [y_offsetq+16] |
| 688 mov tempq, g_pw_8m |
| 689 %define filter_rnd [tempq] |
| 618 %else | 690 %else |
| 619 add y_offsetq, bilin_filter | 691 add y_offsetq, bilin_filter |
| 620 %define filter_y_a [y_offsetq] | 692 %define filter_y_a [y_offsetq] |
| 621 %define filter_y_b [y_offsetq+16] | 693 %define filter_y_b [y_offsetq+16] |
| 622 %define filter_rnd [pw_8] | 694 %define filter_rnd [pw_8] |
| 623 %endif | 695 %endif |
| 696 %endif |
| 697 |
| 624 %if %1 == 16 | 698 %if %1 == 16 |
| 625 movu m0, [srcq] | 699 movu m0, [srcq] |
| 626 movu m3, [srcq+1] | 700 movu m3, [srcq+1] |
| 627 add srcq, src_strideq | 701 add srcq, src_strideq |
| 628 pavgb m0, m3 | 702 pavgb m0, m3 |
| 629 .x_half_y_other_loop: | 703 .x_half_y_other_loop: |
| 630 movu m4, [srcq] | 704 movu m4, [srcq] |
| 631 movu m2, [srcq+1] | 705 movu m2, [srcq+1] |
| 632 mova m1, [dstq] | 706 mova m1, [dstq] |
| 633 pavgb m4, m2 | 707 pavgb m4, m2 |
| (...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 745 shl x_offsetd, filter_idx_shift | 819 shl x_offsetd, filter_idx_shift |
| 746 %if ARCH_X86_64 && mmsize == 16 | 820 %if ARCH_X86_64 && mmsize == 16 |
| 747 mova m8, [bilin_filter+x_offsetq] | 821 mova m8, [bilin_filter+x_offsetq] |
| 748 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 822 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
| 749 mova m9, [bilin_filter+x_offsetq+16] | 823 mova m9, [bilin_filter+x_offsetq+16] |
| 750 %endif | 824 %endif |
| 751 mova m10, [pw_8] | 825 mova m10, [pw_8] |
| 752 %define filter_x_a m8 | 826 %define filter_x_a m8 |
| 753 %define filter_x_b m9 | 827 %define filter_x_b m9 |
| 754 %define filter_rnd m10 | 828 %define filter_rnd m10 |
| 829 %else ; x86-32 |
| 830 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 831 ; y_offset == 0. We can reuse y_offset reg. |
| 832 %define tempq y_offsetq |
| 833 add x_offsetq, g_bilin_filterm |
| 834 %define filter_x_a [x_offsetq] |
| 835 %define filter_x_b [x_offsetq+16] |
| 836 mov tempq, g_pw_8m |
| 837 %define filter_rnd [tempq] |
| 755 %else | 838 %else |
| 756 add x_offsetq, bilin_filter | 839 add x_offsetq, bilin_filter |
| 757 %define filter_x_a [x_offsetq] | 840 %define filter_x_a [x_offsetq] |
| 758 %define filter_x_b [x_offsetq+16] | 841 %define filter_x_b [x_offsetq+16] |
| 759 %define filter_rnd [pw_8] | 842 %define filter_rnd [pw_8] |
| 760 %endif | 843 %endif |
| 844 %endif |
| 845 |
| 761 .x_other_y_zero_loop: | 846 .x_other_y_zero_loop: |
| 762 %if %1 == 16 | 847 %if %1 == 16 |
| 763 movu m0, [srcq] | 848 movu m0, [srcq] |
| 764 movu m4, [srcq+1] | 849 movu m4, [srcq+1] |
| 765 mova m1, [dstq] | 850 mova m1, [dstq] |
| 766 %if cpuflag(ssse3) | 851 %if cpuflag(ssse3) |
| 767 punpckhbw m2, m0, m4 | 852 punpckhbw m2, m0, m4 |
| 768 punpcklbw m0, m4 | 853 punpcklbw m0, m4 |
| 769 pmaddubsw m2, filter_x_a | 854 pmaddubsw m2, filter_x_a |
| 770 pmaddubsw m0, filter_x_a | 855 pmaddubsw m0, filter_x_a |
| (...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 866 shl x_offsetd, filter_idx_shift | 951 shl x_offsetd, filter_idx_shift |
| 867 %if ARCH_X86_64 && mmsize == 16 | 952 %if ARCH_X86_64 && mmsize == 16 |
| 868 mova m8, [bilin_filter+x_offsetq] | 953 mova m8, [bilin_filter+x_offsetq] |
| 869 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 954 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
| 870 mova m9, [bilin_filter+x_offsetq+16] | 955 mova m9, [bilin_filter+x_offsetq+16] |
| 871 %endif | 956 %endif |
| 872 mova m10, [pw_8] | 957 mova m10, [pw_8] |
| 873 %define filter_x_a m8 | 958 %define filter_x_a m8 |
| 874 %define filter_x_b m9 | 959 %define filter_x_b m9 |
| 875 %define filter_rnd m10 | 960 %define filter_rnd m10 |
| 961 %else ; x86-32 |
| 962 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 963 ; y_offset == 0.5. We can reuse y_offset reg. |
| 964 %define tempq y_offsetq |
| 965 add x_offsetq, g_bilin_filterm |
| 966 %define filter_x_a [x_offsetq] |
| 967 %define filter_x_b [x_offsetq+16] |
| 968 mov tempq, g_pw_8m |
| 969 %define filter_rnd [tempq] |
| 876 %else | 970 %else |
| 877 add x_offsetq, bilin_filter | 971 add x_offsetq, bilin_filter |
| 878 %define filter_x_a [x_offsetq] | 972 %define filter_x_a [x_offsetq] |
| 879 %define filter_x_b [x_offsetq+16] | 973 %define filter_x_b [x_offsetq+16] |
| 880 %define filter_rnd [pw_8] | 974 %define filter_rnd [pw_8] |
| 881 %endif | 975 %endif |
| 976 %endif |
| 977 |
| 882 %if %1 == 16 | 978 %if %1 == 16 |
| 883 movu m0, [srcq] | 979 movu m0, [srcq] |
| 884 movu m1, [srcq+1] | 980 movu m1, [srcq+1] |
| 885 %if cpuflag(ssse3) | 981 %if cpuflag(ssse3) |
| 886 punpckhbw m2, m0, m1 | 982 punpckhbw m2, m0, m1 |
| 887 punpcklbw m0, m1 | 983 punpcklbw m0, m1 |
| 888 pmaddubsw m2, filter_x_a | 984 pmaddubsw m2, filter_x_a |
| 889 pmaddubsw m0, filter_x_a | 985 pmaddubsw m0, filter_x_a |
| 890 paddw m2, filter_rnd | 986 paddw m2, filter_rnd |
| 891 paddw m0, filter_rnd | 987 paddw m0, filter_rnd |
| (...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1050 mova m10, [bilin_filter+y_offsetq] | 1146 mova m10, [bilin_filter+y_offsetq] |
| 1051 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 | 1147 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 |
| 1052 mova m11, [bilin_filter+y_offsetq+16] | 1148 mova m11, [bilin_filter+y_offsetq+16] |
| 1053 %endif | 1149 %endif |
| 1054 mova m12, [pw_8] | 1150 mova m12, [pw_8] |
| 1055 %define filter_x_a m8 | 1151 %define filter_x_a m8 |
| 1056 %define filter_x_b m9 | 1152 %define filter_x_b m9 |
| 1057 %define filter_y_a m10 | 1153 %define filter_y_a m10 |
| 1058 %define filter_y_b m11 | 1154 %define filter_y_b m11 |
| 1059 %define filter_rnd m12 | 1155 %define filter_rnd m12 |
| 1156 %else ; x86-32 |
| 1157 %if ARCH_X86=1 && CONFIG_PIC=1 |
| 1158 ; In this case, there is NO unused register. Use the src_stride register. Later, |
| 1159 ; src_stride has to be loaded from stack when it is needed. |
| 1160 %define tempq src_strideq |
| 1161 mov tempq, g_bilin_filterm |
| 1162 add x_offsetq, tempq |
| 1163 add y_offsetq, tempq |
| 1164 %define filter_x_a [x_offsetq] |
| 1165 %define filter_x_b [x_offsetq+16] |
| 1166 %define filter_y_a [y_offsetq] |
| 1167 %define filter_y_b [y_offsetq+16] |
| 1168 |
| 1169 mov tempq, g_pw_8m |
| 1170 %define filter_rnd [tempq] |
| 1060 %else | 1171 %else |
| 1061 add x_offsetq, bilin_filter | 1172 add x_offsetq, bilin_filter |
| 1062 add y_offsetq, bilin_filter | 1173 add y_offsetq, bilin_filter |
| 1063 %define filter_x_a [x_offsetq] | 1174 %define filter_x_a [x_offsetq] |
| 1064 %define filter_x_b [x_offsetq+16] | 1175 %define filter_x_b [x_offsetq+16] |
| 1065 %define filter_y_a [y_offsetq] | 1176 %define filter_y_a [y_offsetq] |
| 1066 %define filter_y_b [y_offsetq+16] | 1177 %define filter_y_b [y_offsetq+16] |
| 1067 %define filter_rnd [pw_8] | 1178 %define filter_rnd [pw_8] |
| 1068 %endif | 1179 %endif |
| 1180 %endif |
| 1181 |
| 1069 ; x_offset == bilin interpolation && y_offset == bilin interpolation | 1182 ; x_offset == bilin interpolation && y_offset == bilin interpolation |
| 1070 %if %1 == 16 | 1183 %if %1 == 16 |
| 1071 movu m0, [srcq] | 1184 movu m0, [srcq] |
| 1072 movu m1, [srcq+1] | 1185 movu m1, [srcq+1] |
| 1073 %if cpuflag(ssse3) | 1186 %if cpuflag(ssse3) |
| 1074 punpckhbw m2, m0, m1 | 1187 punpckhbw m2, m0, m1 |
| 1075 punpcklbw m0, m1 | 1188 punpcklbw m0, m1 |
| 1076 pmaddubsw m2, filter_x_a | 1189 pmaddubsw m2, filter_x_a |
| 1077 pmaddubsw m0, filter_x_a | 1190 pmaddubsw m0, filter_x_a |
| 1078 paddw m2, filter_rnd | 1191 paddw m2, filter_rnd |
| 1079 paddw m0, filter_rnd | 1192 paddw m0, filter_rnd |
| 1080 %else | 1193 %else |
| 1081 punpckhbw m2, m0, m5 | 1194 punpckhbw m2, m0, m5 |
| 1082 punpckhbw m3, m1, m5 | 1195 punpckhbw m3, m1, m5 |
| 1083 punpcklbw m0, m5 | 1196 punpcklbw m0, m5 |
| 1084 punpcklbw m1, m5 | 1197 punpcklbw m1, m5 |
| 1085 pmullw m0, filter_x_a | 1198 pmullw m0, filter_x_a |
| 1086 pmullw m1, filter_x_b | 1199 pmullw m1, filter_x_b |
| 1087 paddw m0, filter_rnd | 1200 paddw m0, filter_rnd |
| 1088 pmullw m2, filter_x_a | 1201 pmullw m2, filter_x_a |
| 1089 pmullw m3, filter_x_b | 1202 pmullw m3, filter_x_b |
| 1090 paddw m2, filter_rnd | 1203 paddw m2, filter_rnd |
| 1091 paddw m0, m1 | 1204 paddw m0, m1 |
| 1092 paddw m2, m3 | 1205 paddw m2, m3 |
| 1093 %endif | 1206 %endif |
| 1094 psraw m0, 4 | 1207 psraw m0, 4 |
| 1095 psraw m2, 4 | 1208 psraw m2, 4 |
| 1096 add srcq, src_strideq | 1209 |
| 1210 INC_SRC_BY_SRC_STRIDE |
| 1211 |
| 1097 packuswb m0, m2 | 1212 packuswb m0, m2 |
| 1098 .x_other_y_other_loop: | 1213 .x_other_y_other_loop: |
| 1099 %if cpuflag(ssse3) | 1214 %if cpuflag(ssse3) |
| 1100 movu m4, [srcq] | 1215 movu m4, [srcq] |
| 1101 movu m3, [srcq+1] | 1216 movu m3, [srcq+1] |
| 1102 mova m1, [dstq] | 1217 mova m1, [dstq] |
| 1103 punpckhbw m2, m4, m3 | 1218 punpckhbw m2, m4, m3 |
| 1104 punpcklbw m4, m3 | 1219 punpcklbw m4, m3 |
| 1105 pmaddubsw m2, filter_x_a | 1220 pmaddubsw m2, filter_x_a |
| 1106 pmaddubsw m4, filter_x_a | 1221 pmaddubsw m4, filter_x_a |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1156 %if %2 == 1 ; avg | 1271 %if %2 == 1 ; avg |
| 1157 ; FIXME(rbultje) pipeline | 1272 ; FIXME(rbultje) pipeline |
| 1158 packuswb m0, m2 | 1273 packuswb m0, m2 |
| 1159 pavgb m0, [secq] | 1274 pavgb m0, [secq] |
| 1160 punpckhbw m2, m0, m5 | 1275 punpckhbw m2, m0, m5 |
| 1161 punpcklbw m0, m5 | 1276 punpcklbw m0, m5 |
| 1162 %endif | 1277 %endif |
| 1163 SUM_SSE m0, m1, m2, m3, m6, m7 | 1278 SUM_SSE m0, m1, m2, m3, m6, m7 |
| 1164 mova m0, m4 | 1279 mova m0, m4 |
| 1165 | 1280 |
| 1166 add srcq, src_strideq | 1281 INC_SRC_BY_SRC_STRIDE |
| 1167 add dstq, dst_strideq | 1282 add dstq, dst_strideq |
| 1168 %else ; %1 < 16 | 1283 %else ; %1 < 16 |
| 1169 movh m0, [srcq] | 1284 movh m0, [srcq] |
| 1170 movh m1, [srcq+1] | 1285 movh m1, [srcq+1] |
| 1171 %if cpuflag(ssse3) | 1286 %if cpuflag(ssse3) |
| 1172 punpcklbw m0, m1 | 1287 punpcklbw m0, m1 |
| 1173 pmaddubsw m0, filter_x_a | 1288 pmaddubsw m0, filter_x_a |
| 1174 paddw m0, filter_rnd | 1289 paddw m0, filter_rnd |
| 1175 %else | 1290 %else |
| 1176 punpcklbw m0, m5 | 1291 punpcklbw m0, m5 |
| 1177 punpcklbw m1, m5 | 1292 punpcklbw m1, m5 |
| 1178 pmullw m0, filter_x_a | 1293 pmullw m0, filter_x_a |
| 1179 pmullw m1, filter_x_b | 1294 pmullw m1, filter_x_b |
| 1180 paddw m0, filter_rnd | 1295 paddw m0, filter_rnd |
| 1181 paddw m0, m1 | 1296 paddw m0, m1 |
| 1182 %endif | 1297 %endif |
| 1183 psraw m0, 4 | 1298 psraw m0, 4 |
| 1184 %if cpuflag(ssse3) | 1299 %if cpuflag(ssse3) |
| 1185 packuswb m0, m0 | 1300 packuswb m0, m0 |
| 1186 %endif | 1301 %endif |
| 1187 add srcq, src_strideq | 1302 |
| 1303 INC_SRC_BY_SRC_STRIDE |
| 1304 |
| 1188 .x_other_y_other_loop: | 1305 .x_other_y_other_loop: |
| 1189 movh m2, [srcq] | 1306 movh m2, [srcq] |
| 1190 movh m1, [srcq+1] | 1307 movh m1, [srcq+1] |
| 1191 movh m4, [srcq+src_strideq] | 1308 |
| 1192 movh m3, [srcq+src_strideq+1] | 1309 INC_SRC_BY_SRC_STRIDE |
| 1310 movh m4, [srcq] |
| 1311 movh m3, [srcq+1] |
| 1312 |
| 1193 %if cpuflag(ssse3) | 1313 %if cpuflag(ssse3) |
| 1194 punpcklbw m2, m1 | 1314 punpcklbw m2, m1 |
| 1195 punpcklbw m4, m3 | 1315 punpcklbw m4, m3 |
| 1196 pmaddubsw m2, filter_x_a | 1316 pmaddubsw m2, filter_x_a |
| 1197 pmaddubsw m4, filter_x_a | 1317 pmaddubsw m4, filter_x_a |
| 1198 movh m3, [dstq+dst_strideq] | 1318 movh m3, [dstq+dst_strideq] |
| 1199 movh m1, [dstq] | 1319 movh m1, [dstq] |
| 1200 paddw m2, filter_rnd | 1320 paddw m2, filter_rnd |
| 1201 paddw m4, filter_rnd | 1321 paddw m4, filter_rnd |
| 1202 psraw m2, 4 | 1322 psraw m2, 4 |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1246 %if %2 == 1 ; avg | 1366 %if %2 == 1 ; avg |
| 1247 ; FIXME(rbultje) pipeline | 1367 ; FIXME(rbultje) pipeline |
| 1248 packuswb m0, m2 | 1368 packuswb m0, m2 |
| 1249 pavgb m0, [secq] | 1369 pavgb m0, [secq] |
| 1250 punpckhbw m2, m0, m5 | 1370 punpckhbw m2, m0, m5 |
| 1251 punpcklbw m0, m5 | 1371 punpcklbw m0, m5 |
| 1252 %endif | 1372 %endif |
| 1253 SUM_SSE m0, m1, m2, m3, m6, m7 | 1373 SUM_SSE m0, m1, m2, m3, m6, m7 |
| 1254 mova m0, m4 | 1374 mova m0, m4 |
| 1255 | 1375 |
| 1256 lea srcq, [srcq+src_strideq*2] | 1376 INC_SRC_BY_SRC_STRIDE |
| 1257 lea dstq, [dstq+dst_strideq*2] | 1377 lea dstq, [dstq+dst_strideq*2] |
| 1258 %endif | 1378 %endif |
| 1259 %if %2 == 1 ; avg | 1379 %if %2 == 1 ; avg |
| 1260 add secq, sec_str | 1380 add secq, sec_str |
| 1261 %endif | 1381 %endif |
| 1262 dec h | 1382 dec h |
| 1263 jg .x_other_y_other_loop | 1383 jg .x_other_y_other_loop |
| 1264 %undef filter_x_a | 1384 %undef filter_x_a |
| 1265 %undef filter_x_b | 1385 %undef filter_x_b |
| 1266 %undef filter_y_a | 1386 %undef filter_y_a |
| (...skipping 24 matching lines...) Expand all Loading... |
| 1291 SUBPEL_VARIANCE 4, 1 | 1411 SUBPEL_VARIANCE 4, 1 |
| 1292 INIT_XMM sse2 | 1412 INIT_XMM sse2 |
| 1293 SUBPEL_VARIANCE 8, 1 | 1413 SUBPEL_VARIANCE 8, 1 |
| 1294 SUBPEL_VARIANCE 16, 1 | 1414 SUBPEL_VARIANCE 16, 1 |
| 1295 | 1415 |
| 1296 INIT_MMX ssse3 | 1416 INIT_MMX ssse3 |
| 1297 SUBPEL_VARIANCE 4, 1 | 1417 SUBPEL_VARIANCE 4, 1 |
| 1298 INIT_XMM ssse3 | 1418 INIT_XMM ssse3 |
| 1299 SUBPEL_VARIANCE 8, 1 | 1419 SUBPEL_VARIANCE 8, 1 |
| 1300 SUBPEL_VARIANCE 16, 1 | 1420 SUBPEL_VARIANCE 16, 1 |
| OLD | NEW |