OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 842 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
853 lea edx, [edx + 64] | 853 lea edx, [edx + 64] |
854 sub ecx, 32 | 854 sub ecx, 32 |
855 jg xloop | 855 jg xloop |
856 | 856 |
857 vzeroupper | 857 vzeroupper |
858 ret | 858 ret |
859 } | 859 } |
860 } | 860 } |
861 #endif // HAS_SCALEADDROW_AVX2 | 861 #endif // HAS_SCALEADDROW_AVX2 |
862 | 862 |
| 863 // Constant for making pixels signed to avoid pmaddubsw |
| 864 // saturation. |
// Sixteen bytes of 0x80. In the NEW column of ScaleFilterCols_SSSE3 this is
// subtracted from the source pixel bytes with psubb, so each pixel p becomes
// the signed byte (p - 128) before it is fed to pmaddubsw as the signed
// operand.
// NOTE(review): nothing visible in this diff writes to this table; consider
// declaring it `static const` if the rest of the file allows it.
| 865 static uvec8 kFsub80 = |
| 866 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| 867 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
| 868 |
| 869 // Constant for making pixels unsigned and adding .5 for rounding. |
// Eight words of 0x4040 = 0x4000 + 0x0040, added with paddw after pmaddubsw
// in the NEW column of ScaleFilterCols_SSSE3:
//   0x4000 = 0x80 * 0x80. The two blend weights sum to 0x80, so biasing both
//            pixels by -0x80 lowers the multiply-add result by exactly this
//            amount; adding it back restores the unsigned value.
//   0x0040 = half of the 0x80 scale that the following `psrlw 7` divides
//            out, i.e. round to nearest instead of truncating.
// NOTE(review): read-only in the visible code; same `static const` remark as
// for kFsub80.
| 870 static uvec16 kFadd40 = |
| 871 { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; |
| 872 |
863 // Bilinear column filtering. SSSE3 version. | 873 // Bilinear column filtering. SSSE3 version. |
//
// How to read this listing: it is a side-by-side diff. The text before the
// first '|' on a line is the OLD revision, the text after it is the NEW
// revision; a line that starts with '|' exists only in NEW.
//
// Purpose: writes dst_width bytes to dst_ptr. Each output byte is a weighted
// blend of two adjacent source bytes, src_ptr[xi] and src_ptr[xi + 1], which
// are fetched together with a single 16-bit load.
//
// Arguments (read from the stack after the three pushes below):
//   dst_ptr   -> edi   destination row.
//   src_ptr   -> esi   source row.
//   dst_width -> ecx   number of output pixels still to produce.
//   x         -> xmm2  source position; word 1 (the upper 16 bits) is used as
//                      the integer index xi, and the top 7 bits of word 0 as
//                      the blend fraction f (0..0x7f).
//   dx        -> xmm3  step added to x for every output pixel.
//
// Blend arithmetic:
//   OLD: weights are (0x7f - f) and f, pixels are the unsigned pmaddubsw
//        operand, and the sum is shifted right by 7 (truncating).
//   NEW: weights are (0x80 - f) and f, so they sum to exactly 0x80. A weight
//        of 0x80 does not fit a positive signed byte, so the operands are
//        swapped: the weights become the unsigned pmaddubsw operand and the
//        pixels are biased to signed with kFsub80. kFadd40 then removes the
//        bias and adds the rounding term before the shift right by 7.
//
// Structure: a main loop emitting 2 pixels per iteration (xloop2), then an
// optional single-pixel remainder (xloop29), then the epilogue (xloop99).
//
// Registers: the function is __declspec(naked), so the asm block itself saves
// and restores ebx, esi and edi. eax, ecx, edx and xmm0-xmm6 are overwritten;
// the NEW column additionally overwrites xmm7.
864 __declspec(naked) | 874 __declspec(naked) |
865 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 875 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
866 int dst_width, int x, int dx) { | 876 int dst_width, int x, int dx) { |
867 __asm { | 877 __asm { |
868 push ebx | 878 push ebx |
869 push esi | 879 push esi |
870 push edi | 880 push edi |
// Three registers were pushed, hence the "+ 12" in every argument offset; the
// remaining "+ 4" skips the return address.
871 mov edi, [esp + 12 + 4] // dst_ptr | 881 mov edi, [esp + 12 + 4] // dst_ptr |
872 mov esi, [esp + 12 + 8] // src_ptr | 882 mov esi, [esp + 12 + 8] // src_ptr |
873 mov ecx, [esp + 12 + 12] // dst_width | 883 mov ecx, [esp + 12 + 12] // dst_width |
874 movd xmm2, [esp + 12 + 16] // x | 884 movd xmm2, [esp + 12 + 16] // x |
875 movd xmm3, [esp + 12 + 20] // dx | 885 movd xmm3, [esp + 12 + 20] // dx |
// pshufb control bytes 0,0,4,4: duplicates the low byte of dword 0 (fraction
// of x0) into bytes 0-1 and the low byte of dword 1 (fraction of x1) into
// bytes 2-3, giving one fraction copy per source pixel of each pair.
876 mov eax, 0x04040000 // shuffle to line up fractions with pixel. | 886 mov eax, 0x04040000 // shuffle to line up fractions with pixel. |
877 movd xmm5, eax | 887 movd xmm5, eax |
// xmm6 = 0x007f in every word: XORing with it inverts only the low byte of
// each fraction pair (f -> 0x7f - f) and leaves the high byte as f.
878 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. | 888 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. |
879 psrlw xmm6, 9 | 889 psrlw xmm6, 9 |
// NEW only: xmm7 = 0x0001 in every word, used to bump the inverted fraction
// from (0x7f - f) to (0x80 - f).
| 890 pcmpeqb xmm7, xmm7 // generate 0x0001 |
| 891 psrlw xmm7, 15 |
// eax is loaded before the width test so that the single-pixel path at
// xloop29 also finds a valid x0 index when dst_width < 2.
880 pextrw eax, xmm2, 1 // get x0 integer. preroll | 892 pextrw eax, xmm2, 1 // get x0 integer. preroll |
// ecx is kept 2 below the remaining pixel count while in the 2-pixel loop.
881 sub ecx, 2 | 893 sub ecx, 2 |
882 jl xloop29 | 894 jl xloop29 |
883 | 895 |
// Set up two positions side by side: xmm2 = {x0, x1 = x0 + dx} and
// xmm3 = {2 * dx, 2 * dx}, so one paddd advances both positions by a pair.
884 movdqa xmm0, xmm2 // x1 = x0 + dx | 896 movdqa xmm0, xmm2 // x1 = x0 + dx |
885 paddd xmm0, xmm3 | 897 paddd xmm0, xmm3 |
886 punpckldq xmm2, xmm0 // x0 x1 | 898 punpckldq xmm2, xmm0 // x0 x1 |
887 punpckldq xmm3, xmm3 // dx dx | 899 punpckldq xmm3, xmm3 // dx dx |
888 paddd xmm3, xmm3 // dx * 2, dx * 2 | 900 paddd xmm3, xmm3 // dx * 2, dx * 2 |
889 pextrw edx, xmm2, 3 // get x1 integer. preroll | 901 pextrw edx, xmm2, 3 // get x1 integer. preroll |
890 | 902 |
891 // 2 Pixel loop. | 903 // 2 Pixel loop. |
// On entry to each iteration eax / edx hold the integer source indices for
// the two pixels and xmm2 holds their full positions.
892 xloop2: | 904 xloop2: |
893 movdqa xmm1, xmm2 // x0, x1 fractions. | 905 movdqa xmm1, xmm2 // x0, x1 fractions. |
894 paddd xmm2, xmm3 // x += dx | 906 paddd xmm2, xmm3 // x += dx |
895 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels | 907 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels |
896 movd xmm0, ebx | 908 movd xmm0, ebx |
897 psrlw xmm1, 9 // 7 bit fractions. | 909 psrlw xmm1, 9 // 7 bit fractions. |
898 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels | 910 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels |
899 movd xmm4, ebx | 911 movd xmm4, ebx |
900 pshufb xmm1, xmm5 // 0011 | 912 pshufb xmm1, xmm5 // 0011 |
901 punpcklwd xmm0, xmm4 | 913 punpcklwd xmm0, xmm4 |
| 914 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. |
902 pxor xmm1, xmm6 // 0..7f and 7f..0 | 915 pxor xmm1, xmm6 // 0..7f and 7f..0 |
// From here the columns diverge. OLD multiplies with the pixels in xmm0 as
// destination (result in xmm0). NEW first adds 1 to the inverted weight and
// multiplies with the weights in xmm1 as destination (result in xmm1).
903 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. | 916 paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 |
| 917 pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. |
904 pextrw eax, xmm2, 1 // get x0 integer. next iteration. | 918 pextrw eax, xmm2, 1 // get x0 integer. next iteration. |
905 pextrw edx, xmm2, 3 // get x1 integer. next iteration. | 919 pextrw edx, xmm2, 3 // get x1 integer. next iteration. |
906 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. | 920 paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. |
907 packuswb xmm0, xmm0 // 8 bits, 2 pixels. | 921 psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. |
908 movd ebx, xmm0 | 922 packuswb xmm1, xmm1 // 8 bits, 2 pixels. |
| 923 movd ebx, xmm1 |
// Only the low 16 bits (two result bytes) are stored.
909 mov [edi], bx | 924 mov [edi], bx |
910 lea edi, [edi + 2] | 925 lea edi, [edi + 2] |
911 sub ecx, 2 // 2 pixels | 926 sub ecx, 2 // 2 pixels |
912 jge xloop2 | 927 jge xloop2 |
913 | 928 |
914 xloop29: | 929 xloop29: |
915 | |
// ecx is (remaining - 2) here; adding 1 makes it (remaining - 1), which is
// negative exactly when no pixel is left.
916 add ecx, 2 - 1 | 930 add ecx, 2 - 1 |
917 jl xloop99 | 931 jl xloop99 |
918 | 932 |
919 // 1 pixel remainder | 933 // 1 pixel remainder |
// Same blend as the loop body, but only for the x0 lane: eax already holds
// the x0 index and xmm2 its position; a single byte is stored.
920 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels | 934 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels |
921 movd xmm0, ebx | 935 movd xmm0, ebx |
922 psrlw xmm2, 9 // 7 bit fractions. | 936 psrlw xmm2, 9 // 7 bit fractions. |
923 pshufb xmm2, xmm5 // 0011 | 937 pshufb xmm2, xmm5 // 0011 |
| 938 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. |
924 pxor xmm2, xmm6 // 0..7f and 7f..0 | 939 pxor xmm2, xmm6 // 0..7f and 7f..0 |
925 pmaddubsw xmm0, xmm2 // 16 bit | 940 paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 |
926 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. | 941 pmaddubsw xmm2, xmm0 // 16 bit |
927 packuswb xmm0, xmm0 // 8 bits | 942 paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. |
928 movd ebx, xmm0 | 943 psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. |
| 944 packuswb xmm2, xmm2 // 8 bits |
| 945 movd ebx, xmm2 |
929 mov [edi], bl | 946 mov [edi], bl |
930 | 947 |
931 xloop99: | 948 xloop99: |
932 | 949 |
// Restore the registers saved on entry, in reverse push order.
933 pop edi | 950 pop edi |
934 pop esi | 951 pop esi |
935 pop ebx | 952 pop ebx |
936 ret | 953 ret |
937 } | 954 } |
938 } | 955 } |
(...skipping 409 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1348 idiv ecx | 1365 idiv ecx |
1349 ret | 1366 ret |
1350 } | 1367 } |
1351 } | 1368 } |
1352 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) | 1369 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
1353 | 1370 |
1354 #ifdef __cplusplus | 1371 #ifdef __cplusplus |
1355 } // extern "C" | 1372 } // extern "C" |
1356 } // namespace libyuv | 1373 } // namespace libyuv |
1357 #endif | 1374 #endif |
OLD | NEW |