| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 842 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 853 lea edx, [edx + 64] | 853 lea edx, [edx + 64] |
| 854 sub ecx, 32 | 854 sub ecx, 32 |
| 855 jg xloop | 855 jg xloop |
| 856 | 856 |
| 857 vzeroupper | 857 vzeroupper |
| 858 ret | 858 ret |
| 859 } | 859 } |
| 860 } | 860 } |
| 861 #endif // HAS_SCALEADDROW_AVX2 | 861 #endif // HAS_SCALEADDROW_AVX2 |
| 862 | 862 |
| 863 // Constant 0x80 in every byte. Subtracted from the source pixels (psubb) to make them signed and avoid pmaddubsw |
| 864 // saturation. |
| 865 static uvec8 kFsub80 = |
| 866 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| 867 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
| 868 |
| 869 // Constant for making pixels unsigned and adding .5 for rounding: 0x4000 undoes the 0x80 pixel bias (0x80 * weight sum 0x80), 0x40 is .5 in 7 bit fixed point. |
| 870 static uvec16 kFadd40 = |
| 871 { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; |
| 872 |
| 863 // Bilinear column filtering. SSSE3 version. | 873 // Bilinear column filtering. SSSE3 version. x and dx are 16.16 fixed point. |
| 864 __declspec(naked) | 874 __declspec(naked) |
| 865 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 875 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
| 866 int dst_width, int x, int dx) { | 876 int dst_width, int x, int dx) { |
| 867 __asm { | 877 __asm { |
| 868 push ebx | 878 push ebx |
| 869 push esi | 879 push esi |
| 870 push edi | 880 push edi |
| 871 mov edi, [esp + 12 + 4] // dst_ptr | 881 mov edi, [esp + 12 + 4] // dst_ptr |
| 872 mov esi, [esp + 12 + 8] // src_ptr | 882 mov esi, [esp + 12 + 8] // src_ptr |
| 873 mov ecx, [esp + 12 + 12] // dst_width | 883 mov ecx, [esp + 12 + 12] // dst_width |
| 874 movd xmm2, [esp + 12 + 16] // x | 884 movd xmm2, [esp + 12 + 16] // x |
| 875 movd xmm3, [esp + 12 + 20] // dx | 885 movd xmm3, [esp + 12 + 20] // dx |
| 876 mov eax, 0x04040000 // shuffle to line up fractions with pixel. | 886 mov eax, 0x04040000 // shuffle to line up fractions with pixel. |
| 877 movd xmm5, eax | 887 movd xmm5, eax |
| 878 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. | 888 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. |
| 879 psrlw xmm6, 9 | 889 psrlw xmm6, 9 |
| 890 pcmpeqb xmm7, xmm7 // generate 0x0001 to add 1 to the inverted fraction. |
| 891 psrlw xmm7, 15 |
| 880 pextrw eax, xmm2, 1 // get x0 integer. preroll | 892 pextrw eax, xmm2, 1 // get x0 integer (high word of x). preroll |
| 881 sub ecx, 2 | 893 sub ecx, 2 |
| 882 jl xloop29 | 894 jl xloop29 |
| 883 | 895 |
| 884 movdqa xmm0, xmm2 // x1 = x0 + dx | 896 movdqa xmm0, xmm2 // x1 = x0 + dx |
| 885 paddd xmm0, xmm3 | 897 paddd xmm0, xmm3 |
| 886 punpckldq xmm2, xmm0 // x0 x1 | 898 punpckldq xmm2, xmm0 // x0 x1 |
| 887 punpckldq xmm3, xmm3 // dx dx | 899 punpckldq xmm3, xmm3 // dx dx |
| 888 paddd xmm3, xmm3 // dx * 2, dx * 2 | 900 paddd xmm3, xmm3 // dx * 2, dx * 2 |
| 889 pextrw edx, xmm2, 3 // get x1 integer. preroll | 901 pextrw edx, xmm2, 3 // get x1 integer. preroll |
| 890 | 902 |
| 891 // 2 Pixel loop. | 903 // 2 Pixel loop. |
| 892 xloop2: | 904 xloop2: |
| 893 movdqa xmm1, xmm2 // x0, x1 fractions. | 905 movdqa xmm1, xmm2 // x0, x1 fractions. |
| 894 paddd xmm2, xmm3 // x += dx | 906 paddd xmm2, xmm3 // x += dx |
| 895 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels | 907 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels |
| 896 movd xmm0, ebx | 908 movd xmm0, ebx |
| 897 psrlw xmm1, 9 // 7 bit fractions. | 909 psrlw xmm1, 9 // 7 bit fractions. |
| 898 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels | 910 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels |
| 899 movd xmm4, ebx | 911 movd xmm4, ebx |
| 900 pshufb xmm1, xmm5 // 0011 | 912 pshufb xmm1, xmm5 // 0011 |
| 901 punpcklwd xmm0, xmm4 | 913 punpcklwd xmm0, xmm4 |
| 914 psubb xmm0, xmmword ptr kFsub80 // make pixels signed (subtract 0x80). |
| 902 pxor xmm1, xmm6 // 0..7f and 7f..0 | 915 pxor xmm1, xmm6 // 0..7f and 7f..0 |
| 903 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. | 916 paddusb xmm1, xmm7 // +1 so 0..7f and 80..1. weights sum to 0x80. |
| 917 pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. unsigned weights * signed pixels. |
| 904 pextrw eax, xmm2, 1 // get x0 integer. next iteration. | 918 pextrw eax, xmm2, 1 // get x0 integer. next iteration. |
| 905 pextrw edx, xmm2, 3 // get x1 integer. next iteration. | 919 pextrw edx, xmm2, 3 // get x1 integer. next iteration. |
| 906 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. | 920 paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. |
| 907 packuswb xmm0, xmm0 // 8 bits, 2 pixels. | 921 psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. |
| 908 movd ebx, xmm0 | 922 packuswb xmm1, xmm1 // 8 bits, 2 pixels. |
| 923 movd ebx, xmm1 |
| 909 mov [edi], bx | 924 mov [edi], bx |
| 910 lea edi, [edi + 2] | 925 lea edi, [edi + 2] |
| 911 sub ecx, 2 // 2 pixels | 926 sub ecx, 2 // 2 pixels |
| 912 jge xloop2 | 927 jge xloop2 |
| 913 | 928 |
| 914 xloop29: | 929 xloop29: |
| 915 | |
| 916 add ecx, 2 - 1 | 930 add ecx, 2 - 1 |
| 917 jl xloop99 | 931 jl xloop99 |
| 918 | 932 |
| 919 // 1 pixel remainder | 933 // 1 pixel remainder |
| 920 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels | 934 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels |
| 921 movd xmm0, ebx | 935 movd xmm0, ebx |
| 922 psrlw xmm2, 9 // 7 bit fractions. | 936 psrlw xmm2, 9 // 7 bit fractions. |
| 923 pshufb xmm2, xmm5 // 0011 | 937 pshufb xmm2, xmm5 // 0011 |
| 938 psubb xmm0, xmmword ptr kFsub80 // make pixels signed (subtract 0x80). |
| 924 pxor xmm2, xmm6 // 0..7f and 7f..0 | 939 pxor xmm2, xmm6 // 0..7f and 7f..0 |
| 925 pmaddubsw xmm0, xmm2 // 16 bit | 940 paddusb xmm2, xmm7 // +1 so 0..7f and 80..1. weights sum to 0x80. |
| 926 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. | 941 pmaddubsw xmm2, xmm0 // 16 bit. unsigned weights * signed pixels. |
| 927 packuswb xmm0, xmm0 // 8 bits | 942 paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. |
| 928 movd ebx, xmm0 | 943 psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. |
| 944 packuswb xmm2, xmm2 // 8 bits |
| 945 movd ebx, xmm2 |
| 929 mov [edi], bl | 946 mov [edi], bl |
| 930 | 947 |
| 931 xloop99: | 948 xloop99: |
| 932 | 949 |
| 933 pop edi | 950 pop edi |
| 934 pop esi | 951 pop esi |
| 935 pop ebx | 952 pop ebx |
| 936 ret | 953 ret |
| 937 } | 954 } |
| 938 } | 955 } |
| (...skipping 409 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1348 idiv ecx | 1365 idiv ecx |
| 1349 ret | 1366 ret |
| 1350 } | 1367 } |
| 1351 } | 1368 } |
| 1352 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) | 1369 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
| 1353 | 1370 |
| 1354 #ifdef __cplusplus | 1371 #ifdef __cplusplus |
| 1355 } // extern "C" | 1372 } // extern "C" |
| 1356 } // namespace libyuv | 1373 } // namespace libyuv |
| 1357 #endif | 1374 #endif |
| OLD | NEW |