Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(360)

Side by Side Diff: source/scale_win.cc

Issue 2084533006: YUV scale filter columns improved filtering accuracy (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: bump version Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/scale_neon.cc ('k') | unit_test/scale_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 842 matching lines...) Expand 10 before | Expand all | Expand 10 after
853 lea edx, [edx + 64] 853 lea edx, [edx + 64]
854 sub ecx, 32 854 sub ecx, 32
855 jg xloop 855 jg xloop
856 856
857 vzeroupper 857 vzeroupper
858 ret 858 ret
859 } 859 }
860 } 860 }
861 #endif // HAS_SCALEADDROW_AVX2 861 #endif // HAS_SCALEADDROW_AVX2
862 862
863 // Constant for making pixels signed to avoid pmaddubsw
864 // saturation.
865 static uvec8 kFsub80 =
866 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
867 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
868
869 // Constant for making pixels unsigned and adding .5 for rounding.
870 static uvec16 kFadd40 =
871 { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
872
863 // Bilinear column filtering. SSSE3 version. 873 // Bilinear column filtering. SSSE3 version.
864 __declspec(naked) 874 __declspec(naked)
865 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 875 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
866 int dst_width, int x, int dx) { 876 int dst_width, int x, int dx) {
867 __asm { 877 __asm {
868 push ebx 878 push ebx
869 push esi 879 push esi
870 push edi 880 push edi
871 mov edi, [esp + 12 + 4] // dst_ptr 881 mov edi, [esp + 12 + 4] // dst_ptr
872 mov esi, [esp + 12 + 8] // src_ptr 882 mov esi, [esp + 12 + 8] // src_ptr
873 mov ecx, [esp + 12 + 12] // dst_width 883 mov ecx, [esp + 12 + 12] // dst_width
874 movd xmm2, [esp + 12 + 16] // x 884 movd xmm2, [esp + 12 + 16] // x
875 movd xmm3, [esp + 12 + 20] // dx 885 movd xmm3, [esp + 12 + 20] // dx
876 mov eax, 0x04040000 // shuffle to line up fractions with pixel. 886 mov eax, 0x04040000 // shuffle to line up fractions with pixel.
877 movd xmm5, eax 887 movd xmm5, eax
878 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. 888 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
879 psrlw xmm6, 9 889 psrlw xmm6, 9
890 pcmpeqb xmm7, xmm7 // generate 0x0001
891 psrlw xmm7, 15
880 pextrw eax, xmm2, 1 // get x0 integer. preroll 892 pextrw eax, xmm2, 1 // get x0 integer. preroll
881 sub ecx, 2 893 sub ecx, 2
882 jl xloop29 894 jl xloop29
883 895
884 movdqa xmm0, xmm2 // x1 = x0 + dx 896 movdqa xmm0, xmm2 // x1 = x0 + dx
885 paddd xmm0, xmm3 897 paddd xmm0, xmm3
886 punpckldq xmm2, xmm0 // x0 x1 898 punpckldq xmm2, xmm0 // x0 x1
887 punpckldq xmm3, xmm3 // dx dx 899 punpckldq xmm3, xmm3 // dx dx
888 paddd xmm3, xmm3 // dx * 2, dx * 2 900 paddd xmm3, xmm3 // dx * 2, dx * 2
889 pextrw edx, xmm2, 3 // get x1 integer. preroll 901 pextrw edx, xmm2, 3 // get x1 integer. preroll
890 902
891 // 2 Pixel loop. 903 // 2 Pixel loop.
892 xloop2: 904 xloop2:
893 movdqa xmm1, xmm2 // x0, x1 fractions. 905 movdqa xmm1, xmm2 // x0, x1 fractions.
894 paddd xmm2, xmm3 // x += dx 906 paddd xmm2, xmm3 // x += dx
895 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 907 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
896 movd xmm0, ebx 908 movd xmm0, ebx
897 psrlw xmm1, 9 // 7 bit fractions. 909 psrlw xmm1, 9 // 7 bit fractions.
898 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels 910 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
899 movd xmm4, ebx 911 movd xmm4, ebx
900 pshufb xmm1, xmm5 // 0011 912 pshufb xmm1, xmm5 // 0011
901 punpcklwd xmm0, xmm4 913 punpcklwd xmm0, xmm4
914 psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
902 pxor xmm1, xmm6 // 0..7f and 7f..0 915 pxor xmm1, xmm6 // 0..7f and 7f..0
903 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. 916 paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
917 pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
904 pextrw eax, xmm2, 1 // get x0 integer. next iteration. 918 pextrw eax, xmm2, 1 // get x0 integer. next iteration.
905 pextrw edx, xmm2, 3 // get x1 integer. next iteration. 919 pextrw edx, xmm2, 3 // get x1 integer. next iteration.
906 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 920 paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
907 packuswb xmm0, xmm0 // 8 bits, 2 pixels. 921 psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
908 movd ebx, xmm0 922 packuswb xmm1, xmm1 // 8 bits, 2 pixels.
923 movd ebx, xmm1
909 mov [edi], bx 924 mov [edi], bx
910 lea edi, [edi + 2] 925 lea edi, [edi + 2]
911 sub ecx, 2 // 2 pixels 926 sub ecx, 2 // 2 pixels
912 jge xloop2 927 jge xloop2
913 928
914 xloop29: 929 xloop29:
915
916 add ecx, 2 - 1 930 add ecx, 2 - 1
917 jl xloop99 931 jl xloop99
918 932
919 // 1 pixel remainder 933 // 1 pixel remainder
920 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 934 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
921 movd xmm0, ebx 935 movd xmm0, ebx
922 psrlw xmm2, 9 // 7 bit fractions. 936 psrlw xmm2, 9 // 7 bit fractions.
923 pshufb xmm2, xmm5 // 0011 937 pshufb xmm2, xmm5 // 0011
938 psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
924 pxor xmm2, xmm6 // 0..7f and 7f..0 939 pxor xmm2, xmm6 // 0..7f and 7f..0
925 pmaddubsw xmm0, xmm2 // 16 bit 940 paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
926 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 941 pmaddubsw xmm2, xmm0 // 16 bit
927 packuswb xmm0, xmm0 // 8 bits 942 paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
928 movd ebx, xmm0 943 psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
944 packuswb xmm2, xmm2 // 8 bits
945 movd ebx, xmm2
929 mov [edi], bl 946 mov [edi], bl
930 947
931 xloop99: 948 xloop99:
932 949
933 pop edi 950 pop edi
934 pop esi 951 pop esi
935 pop ebx 952 pop ebx
936 ret 953 ret
937 } 954 }
938 } 955 }
(...skipping 409 matching lines...) Expand 10 before | Expand all | Expand 10 after
1348 idiv ecx 1365 idiv ecx
1349 ret 1366 ret
1350 } 1367 }
1351 } 1368 }
1352 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 1369 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1353 1370
1354 #ifdef __cplusplus 1371 #ifdef __cplusplus
1355 } // extern "C" 1372 } // extern "C"
1356 } // namespace libyuv 1373 } // namespace libyuv
1357 #endif 1374 #endif
OLD | NEW
« no previous file with comments | « source/scale_neon.cc ('k') | unit_test/scale_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698