source/scale_win.cc - Issue 2084533006: YUV scale filter columns improved filtering accuracy

Side by Side Diff: source/scale_win.cc

Issue 2084533006: YUV scale filter columns improved filtering accuracy (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: bump version Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 842 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
853 lea edx, [edx + 64]	853 lea edx, [edx + 64]

854 sub ecx, 32	854 sub ecx, 32

855 jg xloop	855 jg xloop

856	856

857 vzeroupper	857 vzeroupper

858 ret	858 ret

859 }	859 }

860 }	860 }

861 #endif // HAS_SCALEADDROW_AVX2	861 #endif // HAS_SCALEADDROW_AVX2

862	862

	863 // Constant for making pixels signed to avoid pmaddubsw

	864 // saturation.

	865 static uvec8 kFsub80 =

	866 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	867 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

	868

	869 // Constant for making pixels unsigned and adding .5 for rounding.

	870 static uvec16 kFadd40 =

	871 { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

	872

863 // Bilinear column filtering. SSSE3 version.	873 // Bilinear column filtering. SSSE3 version.

864 __declspec(naked)	874 __declspec(naked)

865 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,	875 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

866 int dst_width, int x, int dx) {	876 int dst_width, int x, int dx) {

867 __asm {	877 __asm {

868 push ebx	878 push ebx

869 push esi	879 push esi

870 push edi	880 push edi

871 mov edi, [esp + 12 + 4] // dst_ptr	881 mov edi, [esp + 12 + 4] // dst_ptr

872 mov esi, [esp + 12 + 8] // src_ptr	882 mov esi, [esp + 12 + 8] // src_ptr

873 mov ecx, [esp + 12 + 12] // dst_width	883 mov ecx, [esp + 12 + 12] // dst_width

874 movd xmm2, [esp + 12 + 16] // x	884 movd xmm2, [esp + 12 + 16] // x

875 movd xmm3, [esp + 12 + 20] // dx	885 movd xmm3, [esp + 12 + 20] // dx

876 mov eax, 0x04040000 // shuffle to line up fractions with pixel.	886 mov eax, 0x04040000 // shuffle to line up fractions with pixel.

877 movd xmm5, eax	887 movd xmm5, eax

878 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.	888 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.

879 psrlw xmm6, 9	889 psrlw xmm6, 9

	890 pcmpeqb xmm7, xmm7 // generate 0x0001

	891 psrlw xmm7, 15

880 pextrw eax, xmm2, 1 // get x0 integer. preroll	892 pextrw eax, xmm2, 1 // get x0 integer. preroll

881 sub ecx, 2	893 sub ecx, 2

882 jl xloop29	894 jl xloop29

883	895

884 movdqa xmm0, xmm2 // x1 = x0 + dx	896 movdqa xmm0, xmm2 // x1 = x0 + dx

885 paddd xmm0, xmm3	897 paddd xmm0, xmm3

886 punpckldq xmm2, xmm0 // x0 x1	898 punpckldq xmm2, xmm0 // x0 x1

887 punpckldq xmm3, xmm3 // dx dx	899 punpckldq xmm3, xmm3 // dx dx

888 paddd xmm3, xmm3 // dx * 2, dx * 2	900 paddd xmm3, xmm3 // dx * 2, dx * 2

889 pextrw edx, xmm2, 3 // get x1 integer. preroll	901 pextrw edx, xmm2, 3 // get x1 integer. preroll

890	902

891 // 2 Pixel loop.	903 // 2 Pixel loop.

892 xloop2:	904 xloop2:

893 movdqa xmm1, xmm2 // x0, x1 fractions.	905 movdqa xmm1, xmm2 // x0, x1 fractions.

894 paddd xmm2, xmm3 // x += dx	906 paddd xmm2, xmm3 // x += dx

895 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels	907 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels

896 movd xmm0, ebx	908 movd xmm0, ebx

897 psrlw xmm1, 9 // 7 bit fractions.	909 psrlw xmm1, 9 // 7 bit fractions.

898 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels	910 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels

899 movd xmm4, ebx	911 movd xmm4, ebx

900 pshufb xmm1, xmm5 // 0011	912 pshufb xmm1, xmm5 // 0011

901 punpcklwd xmm0, xmm4	913 punpcklwd xmm0, xmm4

	914 psubb xmm0, xmmword ptr kFsub80 // make pixels signed.

902 pxor xmm1, xmm6 // 0..7f and 7f..0	915 pxor xmm1, xmm6 // 0..7f and 7f..0

903 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.	916 paddusb xmm1, xmm7 // +1 so 0..7f and 80..1

	917 pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.

904 pextrw eax, xmm2, 1 // get x0 integer. next iteration.	918 pextrw eax, xmm2, 1 // get x0 integer. next iteration.

905 pextrw edx, xmm2, 3 // get x1 integer. next iteration.	919 pextrw edx, xmm2, 3 // get x1 integer. next iteration.

906 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.	920 paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.

907 packuswb xmm0, xmm0 // 8 bits, 2 pixels.	921 psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.

908 movd ebx, xmm0	922 packuswb xmm1, xmm1 // 8 bits, 2 pixels.

	923 movd ebx, xmm1

909 mov [edi], bx	924 mov [edi], bx

910 lea edi, [edi + 2]	925 lea edi, [edi + 2]

911 sub ecx, 2 // 2 pixels	926 sub ecx, 2 // 2 pixels

912 jge xloop2	927 jge xloop2

913	928

914 xloop29:	929 xloop29:

915

916 add ecx, 2 - 1	930 add ecx, 2 - 1

917 jl xloop99	931 jl xloop99

918	932

919 // 1 pixel remainder	933 // 1 pixel remainder

920 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels	934 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels

921 movd xmm0, ebx	935 movd xmm0, ebx

922 psrlw xmm2, 9 // 7 bit fractions.	936 psrlw xmm2, 9 // 7 bit fractions.

923 pshufb xmm2, xmm5 // 0011	937 pshufb xmm2, xmm5 // 0011

	938 psubb xmm0, xmmword ptr kFsub80 // make pixels signed.

924 pxor xmm2, xmm6 // 0..7f and 7f..0	939 pxor xmm2, xmm6 // 0..7f and 7f..0

925 pmaddubsw xmm0, xmm2 // 16 bit	940 paddusb xmm2, xmm7 // +1 so 0..7f and 80..1

926 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.	941 pmaddubsw xmm2, xmm0 // 16 bit

927 packuswb xmm0, xmm0 // 8 bits	942 paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.

928 movd ebx, xmm0	943 psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.

	944 packuswb xmm2, xmm2 // 8 bits

	945 movd ebx, xmm2

929 mov [edi], bl	946 mov [edi], bl

930	947

931 xloop99:	948 xloop99:

932	949

933 pop edi	950 pop edi

934 pop esi	951 pop esi

935 pop ebx	952 pop ebx

936 ret	953 ret

937 }	954 }

938 }	955 }

(...skipping 409 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1348 idiv ecx	1365 idiv ecx

1349 ret	1366 ret

1350 }	1367 }

1351 }	1368 }

1352 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)	1369 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

1353	1370

1354 #ifdef __cplusplus	1371 #ifdef __cplusplus

1355 } // extern "C"	1372 } // extern "C"

1356 } // namespace libyuv	1373 } // namespace libyuv

1357 #endif	1374 #endif

OLD	NEW

« no previous file with comments | « source/scale_neon.cc ('k') | unit_test/scale_test.cc » ('j') | no next file with comments »