Chromium Code Reviews

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm

Issue 111463005: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years ago
Index: source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm (revision 240950)
+++ source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm (working copy)
@@ -118,6 +118,14 @@
RET
%endmacro
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
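A note on the INC_SRC_BY_SRC_STRIDE macro added above: in x86inc, the q-suffixed name (src_strideq) refers to the argument while it is held in a register, and the mp-suffixed name (src_stridemp) refers to its stack slot. On 32-bit x86 with PIC enabled, one general-purpose register is reserved for the GOT pointer, so src_stride stays spilled and the add must use a memory operand. A minimal C sketch of the distinction, with hypothetical names:

#include <stdint.h>

/* Hypothetical sketch: when the stride cannot live in a register (the
 * 32-bit PIC case), it is re-read from its stack slot on every row
 * advance, which is what "add srcq, src_stridemp" amounts to. */
static const uint8_t *inc_src_by_stride(const uint8_t *src,
                                        const intptr_t *stride_slot) {
  return src + *stride_slot;
}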
@@ -129,41 +137,85 @@
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
-%define sec_str sec_strideq
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
%else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
%endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
-%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
-%endif
+
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
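For orientation before the per-offset cases: m6 and m7, zeroed just above, accumulate the sum of pixel differences and the sum of squared differences, and the 32-bit PIC prologue caches the addresses of bilin_filter_m and pw_8 in the g_bilin_filterm/g_pw_8m stack slots so the loops never redo GOT-relative address computation. A hedged C sketch of the accumulation the kernels vectorize (hypothetical helper name):

#include <stdint.h>

/* What m6 (sum) and m7 (sse) hold when the kernel finishes; variance
 * is typically derived by the caller as sse - sum*sum/(w*h). */
static void sum_sse_sketch(const uint8_t *src, int src_stride,
                           const uint8_t *dst, int dst_stride,
                           int w, int h, int *sum, uint32_t *sse) {
  *sum = 0;
  *sse = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int d = src[i * src_stride + j] - dst[i * dst_stride + j];
      *sum += d;
      *sse += (uint32_t)(d * d);
    }
  }
}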
@@ -329,11 +381,22 @@
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
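The hunk above handles x_offset == 0 with fractional y: on 32-bit PIC the dead x_offset register is reused to address pw_8, and y_offsetq is rebased onto the cached table pointer rather than the absolute bilin_filter_m. The arithmetic itself is a two-tap filter whose taps sum to 16, with pw_8 as the rounding term and the later psraw ..., 4 as the final shift; a hedged C rendering of one output pixel:

#include <stdint.h>

/* One vertical bilinear tap, as done with pmullw/paddw/psraw: a and b
 * are vertically adjacent source pixels, and f0 + f1 == 16 are the
 * filter_y_a/filter_y_b weights selected by y_offset. */
static uint8_t bilin_tap_sketch(uint8_t a, uint8_t b, int f0, int f1) {
  return (uint8_t)((a * f0 + b * f1 + 8) >> 4);  /* + pw_8, then >> 4 */
}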
@@ -615,12 +678,23 @@
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
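The x_offset == 0.5 hunk above repeats the same register-reuse pattern for its fractional-y tail. The half-pel position itself needs no filter table: it uses pavgb, i.e. averaging with round-to-nearest, sketched in C below (hypothetical helper name):

#include <stdint.h>

/* pavgb semantics: (a + b + 1) >> 1, rounding up on ties. */
static uint8_t pavgb_sketch(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}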
@@ -752,12 +826,23 @@
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
.x_other_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
@@ -873,12 +958,23 @@
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
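The y_offset == 0.5 hunk above mirrors the x_offset == 0.5 case with the roles swapped: each row is filtered horizontally first, and the two filtered rows are then averaged with pavgb. A hedged sketch of one output pixel under that ordering (hypothetical name; the asm performs these steps across packed lanes):

#include <stdint.h>

/* Horizontal two-tap filter on each of two rows (fx0 + fx1 == 16),
 * then a half-pel vertical average of the filtered results. */
static uint8_t bilinh_then_halfv_sketch(const uint8_t *row0,
                                        const uint8_t *row1,
                                        int fx0, int fx1) {
  const int h0 = (row0[0] * fx0 + row0[1] * fx1 + 8) >> 4;
  const int h1 = (row1[0] * fx0 + row1[1] * fx1 + 8) >> 4;
  return (uint8_t)((h0 + h1 + 1) >> 1);  /* pavgb of filtered rows */
}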
@@ -1057,6 +1153,21 @@
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register, so the src_stride register is
+; reused. src_stride then has to be reloaded from the stack whenever needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
@@ -1066,6 +1177,8 @@
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
movu m0, [srcq]
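In the fully fractional case above there is no dead argument register left, so src_strideq itself becomes the scratch pointer; from this point on every source-row advance must go through INC_SRC_BY_SRC_STRIDE, which is what the remaining hunks below install. For reference, a hedged C sketch of the two-pass interpolation this path computes for one pixel (hypothetical name):

#include <stdint.h>

/* First pass filters horizontally within each of two adjacent rows;
 * the second pass filters vertically between the rounded first-pass
 * results (fx0 + fx1 == 16 and fy0 + fy1 == 16). */
static uint8_t bilin_2d_sketch(const uint8_t *row0, const uint8_t *row1,
                               int fx0, int fx1, int fy0, int fy1) {
  const int h0 = (row0[0] * fx0 + row0[1] * fx1 + 8) >> 4;
  const int h1 = (row1[0] * fx0 + row1[1] * fx1 + 8) >> 4;
  return (uint8_t)((h0 * fy0 + h1 * fy1 + 8) >> 4);
}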
@@ -1093,7 +1206,9 @@
%endif
psraw m0, 4
psraw m2, 4
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
@@ -1163,7 +1278,7 @@
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
- add srcq, src_strideq
+ INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
@@ -1184,12 +1299,17 @@
%if cpuflag(ssse3)
packuswb m0, m0
%endif
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
.x_other_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movh m4, [srcq]
+ movh m3, [srcq+1]
+
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
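The load split above is forced by addressing modes: [srcq+src_strideq] requires the stride in a register, and under 32-bit PIC it may exist only in memory, so the pointer is advanced between the two row loads instead. In C terms (hypothetical names):

#include <stdint.h>

/* Before: read rows at src and src+stride with one base+index address.
 * After: advance src between the reads so only the base register is
 * ever needed. */
static void split_row_loads_sketch(const uint8_t **src, intptr_t stride,
                                   uint8_t px[4]) {
  px[0] = (*src)[0];
  px[1] = (*src)[1];
  *src += stride;     /* INC_SRC_BY_SRC_STRIDE */
  px[2] = (*src)[0];  /* was [srcq+src_strideq] */
  px[3] = (*src)[1];  /* was [srcq+src_strideq+1] */
}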
@@ -1253,7 +1373,7 @@
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
- lea srcq, [srcq+src_strideq*2]
+ INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
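The final hunk follows from the previous one: the narrow (%1 < 16) path consumes two source rows per iteration, and since one stride was already added between the row loads, the loop tail now adds a single stride in place of the old lea srcq, [srcq+src_strideq*2]. Putting the pieces together, a hedged end-to-end C reference for what sub_pixel_variance%1xh computes (a reconstruction from this file, not the libvpx C code; fx and fy are the two-tap weights in [0,16], with 0 meaning no filtering and 8 the half-pel midpoint):

#include <stdint.h>

/* Bilinearly interpolate src at a fractional offset, then accumulate
 * sum and sse of the difference against dst. The asm's x/y special
 * cases (offset zero, half-pel) are shortcuts of this general path. */
static void subpel_variance_sketch(const uint8_t *src, int src_stride,
                                   const uint8_t *dst, int dst_stride,
                                   int fx, int fy, int w, int h,
                                   int *sum, uint32_t *sse) {
  *sum = 0;
  *sse = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const uint8_t *s = src + i * src_stride + j;
      const int a = (s[0] * (16 - fx) + s[1] * fx + 8) >> 4;
      const int b =
          (s[src_stride] * (16 - fx) + s[src_stride + 1] * fx + 8) >> 4;
      const int p = (a * (16 - fy) + b * fy + 8) >> 4;
      const int d = p - dst[i * dst_stride + j];
      *sum += d;
      *sse += (uint32_t)(d * d);
    }
  }
}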