Index: source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm (revision 240950)
+++ source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm (working copy)
@@ -118,6 +118,17 @@
 RET
 %endmacro
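+; Advance srcq by one src_stride. On x86-32 PIC builds the stride register
+; may have been repurposed as a scratch register (see the bilinear-bilinear
+; case below), so the stride is re-read from its stack slot instead.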
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
 
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +140,88 @@
 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
-%define sec_str sec_strideq
+
+%ifdef PIC ; 64-bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
 %else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
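+ ; GET_GOT saves the old value of its argument register on the stack; eax
+ ; and ecx are reloaded below via LOAD_IF_USED, so the saved copy can be
+ ; discarded by adjusting esp directly.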
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
 %endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
-%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
-%endif
+
 ASSERT %1 <= 16 ; m6 overflows if w > 16
 pxor m6, m6 ; sum
 pxor m7, m7 ; sse
@@ -329,11 +387,22 @@
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0; reuse the x_offset register
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
 add y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_zero_y_other_loop:
 %if %1 == 16
 movu m0, [srcq]
@@ -615,12 +684,23 @@
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5; reuse the x_offset register
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
 movu m0, [srcq]
 movu m3, [srcq+1]
@@ -752,12 +832,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0; reuse the y_offset register
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_other_y_zero_loop:
 %if %1 == 16
 movu m0, [srcq]
@@ -873,12 +964,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5; reuse the y_offset register
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
 movu m0, [srcq]
 movu m1, [srcq+1]
@@ -1057,6 +1159,21 @@
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register: reuse the src_stride register.
+; src_stride must then be reloaded from the stack whenever it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add x_offsetq, bilin_filter
 add y_offsetq, bilin_filter
@@ -1066,6 +1183,8 @@
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 ; x_offset == bilin interpolation && y_offset == bilin interpolation
 %if %1 == 16
 movu m0, [srcq]
@@ -1093,7 +1212,9 @@
 %endif
 psraw m0, 4
 psraw m2, 4
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
 packuswb m0, m2
 .x_other_y_other_loop:
 %if cpuflag(ssse3)
@@ -1163,7 +1284,7 @@
 SUM_SSE m0, m1, m2, m3, m6, m7
 mova m0, m4
 
- add srcq, src_strideq
+ INC_SRC_BY_SRC_STRIDE
 add dstq, dst_strideq
 %else ; %1 < 16
 movh m0, [srcq]
@@ -1184,12 +1305,19 @@
 %if cpuflag(ssse3)
 packuswb m0, m0
 %endif
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
 .x_other_y_other_loop:
 movh m2, [srcq]
 movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
+
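+ ; the stride register may have been repurposed on x86-32 PIC builds, so
+ ; advance srcq instead of addressing [srcq+src_strideq]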
+ INC_SRC_BY_SRC_STRIDE
+ movh m4, [srcq]
+ movh m3, [srcq+1]
+
 %if cpuflag(ssse3)
 punpcklbw m2, m1
 punpcklbw m4, m3
@@ -1253,7 +1381,7 @@
 SUM_SSE m0, m1, m2, m3, m6, m7
 mova m0, m4
 
- lea srcq, [srcq+src_strideq*2]
+ INC_SRC_BY_SRC_STRIDE
 lea dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg