Index: source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm (revision 240950)
+++ source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm (working copy)
@@ -118,6 +118,17 @@
 RET
 %endmacro
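+; Advance srcq by one src_stride. On x86-32 PIC builds the stride register
+; may have been repurposed as a scratch register (see the bilinear-bilinear
+; case below), so the stride is re-read from its stack slot instead.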
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
 
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +140,88 @@
 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
-%define sec_str sec_strideq
+
+%ifdef PIC ; 64-bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
 %else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
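+ ; GET_GOT saves the old value of its argument register on the stack; eax
+ ; and ecx are reloaded below via LOAD_IF_USED, so the saved copy can be
+ ; discarded by adjusting esp directly.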
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
 %endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
-%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
-%endif
+
 ASSERT %1 <= 16 ; m6 overflows if w > 16
 pxor m6, m6 ; sum
 pxor m7, m7 ; sse
@@ -329,11 +387,22 @@
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0; reuse the x_offset register
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
 add y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_zero_y_other_loop:
 %if %1 == 16
 movu m0, [srcq]
@@ -615,12 +684,23 @@
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5; reuse the x_offset register
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
 movu m0, [srcq]
 movu m3, [srcq+1]
@@ -752,12 +832,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0; reuse the y_offset register
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_other_y_zero_loop:
 %if %1 == 16
 movu m0, [srcq]
@@ -873,12 +964,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5; reuse the y_offset register
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
 movu m0, [srcq]
 movu m1, [srcq+1]
@@ -1057,6 +1159,21 @@
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register: reuse the src_stride register.
+; src_stride must then be reloaded from the stack whenever it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
 add x_offsetq, bilin_filter
 add y_offsetq, bilin_filter
@@ -1066,6 +1183,8 @@
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 ; x_offset == bilin interpolation && y_offset == bilin interpolation
 %if %1 == 16
 movu m0, [srcq]
@@ -1093,7 +1212,9 @@
 %endif
 psraw m0, 4
 psraw m2, 4
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
 packuswb m0, m2
 .x_other_y_other_loop:
 %if cpuflag(ssse3)
@@ -1163,7 +1284,7 @@
 SUM_SSE m0, m1, m2, m3, m6, m7
 mova m0, m4
 
- add srcq, src_strideq
+ INC_SRC_BY_SRC_STRIDE
 add dstq, dst_strideq
 %else ; %1 < 16
 movh m0, [srcq]
@@ -1184,12 +1305,19 @@
 %if cpuflag(ssse3)
 packuswb m0, m0
 %endif
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
 .x_other_y_other_loop:
 movh m2, [srcq]
 movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
+
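+ ; the stride register may have been repurposed on x86-32 PIC builds, so
+ ; advance srcq instead of addressing [srcq+src_strideq]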
+ INC_SRC_BY_SRC_STRIDE
+ movh m4, [srcq]
+ movh m3, [srcq+1]
+
 %if cpuflag(ssse3)
 punpcklbw m2, m1
 punpcklbw m4, m3
@@ -1253,7 +1381,7 @@
 SUM_SSE m0, m1, m2, m3, m6, m7
 mova m0, m4
 
- lea srcq, [srcq+src_strideq*2]
+ INC_SRC_BY_SRC_STRIDE
 lea dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg