Index: source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm |
=================================================================== |
--- source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm (revision 0) |
+++ source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm (working copy) |
@@ -0,0 +1,1043 @@ |
+; |
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+%include "third_party/x86inc/x86inc.asm" |
+ |
+SECTION_RODATA |
+pw_8: times 8 dw 8 |
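+; bilinear filter taps stored as pairs (16 - n, n) for n = 0..15; each entry
+; is 8 + 8 words (32 bytes), so a subpel offset selects its taps at byte
+; offset n << 5 (see filter_idx_shift below)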
+bilin_filter_m_sse2: times 8 dw 16 |
+ times 8 dw 0 |
+ times 8 dw 15 |
+ times 8 dw 1 |
+ times 8 dw 14 |
+ times 8 dw 2 |
+ times 8 dw 13 |
+ times 8 dw 3 |
+ times 8 dw 12 |
+ times 8 dw 4 |
+ times 8 dw 11 |
+ times 8 dw 5 |
+ times 8 dw 10 |
+ times 8 dw 6 |
+ times 8 dw 9 |
+ times 8 dw 7 |
+ times 16 dw 8 |
+ times 8 dw 7 |
+ times 8 dw 9 |
+ times 8 dw 6 |
+ times 8 dw 10 |
+ times 8 dw 5 |
+ times 8 dw 11 |
+ times 8 dw 4 |
+ times 8 dw 12 |
+ times 8 dw 3 |
+ times 8 dw 13 |
+ times 8 dw 2 |
+ times 8 dw 14 |
+ times 8 dw 1 |
+ times 8 dw 15 |
+ |
+SECTION .text |
+ |
+; int vp9_highbd_sub_pixel_varianceNxh(const uint16_t *src,
+;                                      ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *dst,
+;                                      ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of squared
+; errors (SSE) in the given pointer.
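+;
+; A caller can then derive the variance as, e.g. (a sketch; W and H are the
+; block dimensions and se is the returned value):
+;   variance = *sse - (uint32_t)(((int64_t)se * se) / (W * H));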
+ |
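+; Computes src - dst for two rows of 8 words each: the squared differences
+; are accumulated as dwords in %6 (sse); the signed differences are summed,
+; sign-extended to dwords, and accumulated in %5 (sum).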
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse |
+ psubw %3, %4 |
+ psubw %1, %2 |
+ mova %4, %3 ; make copies to manipulate to calc sum |
+ mova %2, %1 ; use originals for calc sse |
+ pmaddwd %3, %3 |
+ paddw %4, %2 |
+ pmaddwd %1, %1 |
+ movhlps %2, %4 |
+ paddd %6, %3 |
+ paddw %4, %2 |
+ pxor %2, %2 |
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum) |
+ punpcklwd %4, %2 ; sign-extend word to dword |
+ paddd %6, %1 |
+ paddd %5, %4 |
+ |
+%endmacro |
+ |
+%macro STORE_AND_RET 0 |
+%if mmsize == 16 |
+  ; m6 (sum) and m7 (sse) already hold dword accumulators: SUM_SSE
+  ; sign-extends each call's word sums to dwords before adding them, so all
+  ; that remains is folding the four dwords of each register down to a
+  ; single value.
+ movhlps m3, m7 |
+ movhlps m4, m6 |
+ paddd m7, m3 |
+ paddd m6, m4 |
+ pshufd m3, m7, 0x1 |
+ pshufd m4, m6, 0x1 |
+ paddd m7, m3 |
+ paddd m6, m4 |
+ mov r1, ssem ; r1 = unsigned int *sse |
+ movd [r1], m7 ; store sse |
+ movd rax, m6 ; store sum as return value |
+%endif |
+ RET |
+%endmacro |
+ |
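+; Advance srcq by one (or two) rows. Strides are given in samples and the
+; samples are 16 bits wide, hence the *2/*4 byte scaling. On x86-32 PIC
+; builds the stride is read from its memory slot (src_stridemp) because the
+; src_stride register is repurposed as a temporary in the bilinear path below.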
+%macro INC_SRC_BY_SRC_STRIDE 0 |
+%if ARCH_X86=1 && CONFIG_PIC=1 |
+ lea srcq, [srcq + src_stridemp*2] |
+%else |
+ lea srcq, [srcq + src_strideq*2] |
+%endif |
+%endmacro |
+ |
+%macro INC_SRC_BY_SRC_2STRIDE 0 |
+%if ARCH_X86=1 && CONFIG_PIC=1 |
+ lea srcq, [srcq + src_stridemp*4] |
+%else |
+ lea srcq, [srcq + src_strideq*4] |
+%endif |
+%endmacro |
+ |
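+; %1 = block width (8 or 16); %2 = 1 generates the _avg_ variant, which also
+; averages the filtered source against a second prediction (secq) before the
+; SE/SSE accumulation.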
+%macro SUBPEL_VARIANCE 1-2 0 ; W |
+%define bilin_filter_m bilin_filter_m_sse2 |
+%define filter_idx_shift 5 |
+
+%ifdef PIC ; 64bit PIC |
+ %if %2 == 1 ; avg |
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ |
+ x_offset, y_offset, \ |
+ dst, dst_stride, \ |
+ sec, sec_stride, height, sse |
+ %define sec_str sec_strideq |
+ %else |
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ |
+ y_offset, dst, dst_stride, height, sse |
+ %endif |
+ %define h heightd |
+ %define bilin_filter sseq |
+%else |
+ %if ARCH_X86=1 && CONFIG_PIC=1 |
+ %if %2 == 1 ; avg |
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ |
+ x_offset, y_offset, \ |
+ dst, dst_stride, \ |
+ sec, sec_stride, \ |
+ height, sse, g_bilin_filter, g_pw_8 |
+ %define h dword heightm |
+ %define sec_str sec_stridemp |
+ |
+      ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax |
+ add esp, 4 ; restore esp |
+ |
+ lea ecx, [GLOBAL(bilin_filter_m)] |
+ mov g_bilin_filterm, ecx |
+ |
+ lea ecx, [GLOBAL(pw_8)] |
+ mov g_pw_8m, ecx |
+ |
+ LOAD_IF_USED 0, 1 ; load eax, ecx back |
+ %else |
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ |
+ x_offset, y_offset, dst, dst_stride, height, \ |
+ sse, g_bilin_filter, g_pw_8 |
+ %define h heightd |
+ |
+      ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax |
+ add esp, 4 ; restore esp |
+ |
+ lea ecx, [GLOBAL(bilin_filter_m)] |
+ mov g_bilin_filterm, ecx |
+ |
+ lea ecx, [GLOBAL(pw_8)] |
+ mov g_pw_8m, ecx |
+ |
+ LOAD_IF_USED 0, 1 ; load eax, ecx back |
+ %endif |
+ %else |
+ %if %2 == 1 ; avg |
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ |
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ |
+ x_offset, y_offset, \ |
+ dst, dst_stride, \ |
+ sec, sec_stride, \ |
+ height, sse |
+ %if ARCH_X86_64 |
+ %define h heightd |
+ %define sec_str sec_strideq |
+ %else |
+ %define h dword heightm |
+ %define sec_str sec_stridemp |
+ %endif |
+ %else |
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ |
+ x_offset, y_offset, dst, dst_stride, height, sse |
+ %define h heightd |
+ %endif |
+ |
+ %define bilin_filter bilin_filter_m |
+ %endif |
+%endif |
+ |
+ ASSERT %1 <= 16 ; m6 overflows if w > 16 |
+ pxor m6, m6 ; sum |
+ pxor m7, m7 ; sse |
+ |
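+  ; blocks narrower than 16 samples are processed two rows per loop
+  ; iteration below, so halve the row count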
+%if %1 < 16 |
+ sar h, 1 |
+%endif |
+ |
+ ; FIXME(rbultje) replace by jumptable? |
+ test x_offsetd, x_offsetd |
+ jnz .x_nonzero |
+ ; x_offset == 0 |
+ test y_offsetd, y_offsetd |
+ jnz .x_zero_y_nonzero |
+ |
+ ; x_offset == 0 && y_offset == 0 |
+.x_zero_y_zero_loop: |
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m2, [srcq + 16] |
+ mova m1, [dstq] |
+ mova m3, [dstq + 16] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m2, [secq+16] |
+%endif |
+ SUM_SSE m0, m1, m2, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*2] |
+ lea dstq, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m2, [srcq + src_strideq*2] |
+ mova m1, [dstq] |
+ mova m3, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m2, [secq + sec_str*2] |
+%endif |
+ SUM_SSE m0, m1, m2, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*4] |
+ lea dstq, [dstq + dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_zero_y_zero_loop |
+ STORE_AND_RET |
+ |
+.x_zero_y_nonzero: |
+ cmp y_offsetd, 8 |
+ jne .x_zero_y_nonhalf |
+ |
+ ; x_offset == 0 && y_offset == 0.5 |
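+  ; pavgw computes (a + b + 1) >> 1, which matches the bilinear filter at
+  ; the half-pel offset exactly: (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1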
+.x_zero_y_half_loop: |
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+16] |
+ movu m4, [srcq+src_strideq*2] |
+ movu m5, [srcq+src_strideq*2+16] |
+ mova m2, [dstq] |
+ mova m3, [dstq+16] |
+ pavgw m0, m4 |
+ pavgw m1, m5 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*2] |
+ lea dstq, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+src_strideq*2] |
+ movu m5, [srcq+src_strideq*4] |
+ mova m2, [dstq] |
+ mova m3, [dstq+dst_strideq*2] |
+ pavgw m0, m1 |
+ pavgw m1, m5 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*4] |
+ lea dstq, [dstq + dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_zero_y_half_loop |
+ STORE_AND_RET |
+ |
+.x_zero_y_nonhalf: |
+ ; x_offset == 0 && y_offset == bilin interpolation |
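+  ; on 64-bit PIC builds the filter table cannot be addressed RIP-relative
+  ; with a register index, so its address is first materialized in a
+  ; register (the sse argument register, per the bilin_filter define above);
+  ; the same applies to the %ifdef PIC blocks below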
+%ifdef PIC |
+ lea bilin_filter, [bilin_filter_m] |
+%endif |
+ shl y_offsetd, filter_idx_shift |
+%if ARCH_X86_64 && mmsize == 16 |
+ mova m8, [bilin_filter+y_offsetq] |
+ mova m9, [bilin_filter+y_offsetq+16] |
+ mova m10, [pw_8] |
+%define filter_y_a m8 |
+%define filter_y_b m9 |
+%define filter_rnd m10 |
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1 |
+; x_offset == 0. We can reuse x_offset reg.
+%define tempq x_offsetq |
+ add y_offsetq, g_bilin_filterm |
+%define filter_y_a [y_offsetq] |
+%define filter_y_b [y_offsetq+16] |
+ mov tempq, g_pw_8m |
+%define filter_rnd [tempq] |
+%else |
+ add y_offsetq, bilin_filter |
+%define filter_y_a [y_offsetq] |
+%define filter_y_b [y_offsetq+16] |
+%define filter_rnd [pw_8] |
+%endif |
+%endif |
+ |
+.x_zero_y_other_loop: |
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq + 16] |
+ movu m4, [srcq+src_strideq*2] |
+ movu m5, [srcq+src_strideq*2+16] |
+ mova m2, [dstq] |
+ mova m3, [dstq+16] |
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can |
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of |
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be |
+ ; slightly faster because of pmullw latency. It would also cut our rodata |
+ ; tables in half for this function, and save 1-2 registers on x86-64. |
+ pmullw m1, filter_y_a |
+ pmullw m5, filter_y_b |
+ paddw m1, filter_rnd |
+ pmullw m0, filter_y_a |
+ pmullw m4, filter_y_b |
+ paddw m0, filter_rnd |
+ paddw m1, m5 |
+ paddw m0, m4 |
+ psrlw m1, 4 |
+ psrlw m0, 4 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*2] |
+ lea dstq, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+src_strideq*2] |
+ movu m5, [srcq+src_strideq*4] |
+ mova m4, m1 |
+ mova m2, [dstq] |
+ mova m3, [dstq+dst_strideq*2] |
+ pmullw m1, filter_y_a |
+ pmullw m5, filter_y_b |
+ paddw m1, filter_rnd |
+ pmullw m0, filter_y_a |
+ pmullw m4, filter_y_b |
+ paddw m0, filter_rnd |
+ paddw m1, m5 |
+ paddw m0, m4 |
+ psrlw m1, 4 |
+ psrlw m0, 4 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*4] |
+ lea dstq, [dstq + dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_zero_y_other_loop |
+%undef filter_y_a |
+%undef filter_y_b |
+%undef filter_rnd |
+ STORE_AND_RET |
+ |
+.x_nonzero: |
+ cmp x_offsetd, 8 |
+ jne .x_nonhalf |
+ ; x_offset == 0.5 |
+ test y_offsetd, y_offsetd |
+ jnz .x_half_y_nonzero |
+ |
+ ; x_offset == 0.5 && y_offset == 0 |
+.x_half_y_zero_loop: |
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq + 16] |
+ movu m4, [srcq + 2] |
+ movu m5, [srcq + 18] |
+ mova m2, [dstq] |
+ mova m3, [dstq + 16] |
+ pavgw m0, m4 |
+ pavgw m1, m5 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*2] |
+ lea dstq, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq + src_strideq*2] |
+ movu m4, [srcq + 2] |
+ movu m5, [srcq + src_strideq*2 + 2] |
+ mova m2, [dstq] |
+ mova m3, [dstq + dst_strideq*2] |
+ pavgw m0, m4 |
+ pavgw m1, m5 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ |
+ lea srcq, [srcq + src_strideq*4] |
+ lea dstq, [dstq + dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_half_y_zero_loop |
+ STORE_AND_RET |
+ |
+.x_half_y_nonzero: |
+ cmp y_offsetd, 8 |
+ jne .x_half_y_nonhalf |
+ |
+ ; x_offset == 0.5 && y_offset == 0.5 |
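+  ; the horizontal average of the last row processed is kept in registers
+  ; and reused as the first row of the next iteration, so each source row
+  ; is loaded and averaged only once; chaining pavgw rounds after each pass,
+  ; which matches the two-pass rounding of the C reference filter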
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+16] |
+ movu m2, [srcq+2] |
+ movu m3, [srcq+18] |
+ lea srcq, [srcq + src_strideq*2] |
+ pavgw m0, m2 |
+ pavgw m1, m3 |
+.x_half_y_half_loop: |
+ movu m2, [srcq] |
+ movu m3, [srcq + 16] |
+ movu m4, [srcq + 2] |
+ movu m5, [srcq + 18] |
+ pavgw m2, m4 |
+ pavgw m3, m5 |
+ pavgw m0, m2 |
+ pavgw m1, m3 |
+ mova m4, [dstq] |
+ mova m5, [dstq + 16] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m4, m1, m5, m6, m7 |
+ mova m0, m2 |
+ mova m1, m3 |
+ |
+ lea srcq, [srcq + src_strideq*2] |
+ lea dstq, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m2, [srcq+2] |
+ lea srcq, [srcq + src_strideq*2] |
+ pavgw m0, m2 |
+.x_half_y_half_loop: |
+ movu m2, [srcq] |
+ movu m3, [srcq + src_strideq*2] |
+ movu m4, [srcq + 2] |
+ movu m5, [srcq + src_strideq*2 + 2] |
+ pavgw m2, m4 |
+ pavgw m3, m5 |
+ pavgw m0, m2 |
+ pavgw m2, m3 |
+ mova m4, [dstq] |
+ mova m5, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m2, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m4, m2, m5, m6, m7 |
+ mova m0, m3 |
+ |
+ lea srcq, [srcq + src_strideq*4] |
+ lea dstq, [dstq + dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_half_y_half_loop |
+ STORE_AND_RET |
+ |
+.x_half_y_nonhalf: |
+ ; x_offset == 0.5 && y_offset == bilin interpolation |
+%ifdef PIC |
+ lea bilin_filter, [bilin_filter_m] |
+%endif |
+ shl y_offsetd, filter_idx_shift |
+%if ARCH_X86_64 && mmsize == 16 |
+ mova m8, [bilin_filter+y_offsetq] |
+ mova m9, [bilin_filter+y_offsetq+16] |
+ mova m10, [pw_8] |
+%define filter_y_a m8 |
+%define filter_y_b m9 |
+%define filter_rnd m10 |
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg.
+%define tempq x_offsetq |
+ add y_offsetq, g_bilin_filterm |
+%define filter_y_a [y_offsetq] |
+%define filter_y_b [y_offsetq+16] |
+ mov tempq, g_pw_8m |
+%define filter_rnd [tempq] |
+%else |
+ add y_offsetq, bilin_filter |
+%define filter_y_a [y_offsetq] |
+%define filter_y_b [y_offsetq+16] |
+%define filter_rnd [pw_8] |
+%endif |
+%endif |
+ |
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+16] |
+ movu m2, [srcq+2] |
+ movu m3, [srcq+18] |
+ lea srcq, [srcq + src_strideq*2] |
+ pavgw m0, m2 |
+ pavgw m1, m3 |
+.x_half_y_other_loop: |
+ movu m2, [srcq] |
+ movu m3, [srcq+16] |
+ movu m4, [srcq+2] |
+ movu m5, [srcq+18] |
+ pavgw m2, m4 |
+ pavgw m3, m5 |
+ mova m4, m2 |
+ mova m5, m3 |
+ pmullw m1, filter_y_a |
+ pmullw m3, filter_y_b |
+ paddw m1, filter_rnd |
+ paddw m1, m3 |
+ pmullw m0, filter_y_a |
+ pmullw m2, filter_y_b |
+ paddw m0, filter_rnd |
+ psrlw m1, 4 |
+ paddw m0, m2 |
+ mova m2, [dstq] |
+ psrlw m0, 4 |
+ mova m3, [dstq+16] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ mova m0, m4 |
+ mova m1, m5 |
+ |
+ lea srcq, [srcq + src_strideq*2] |
+ lea dstq, [dstq + dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m2, [srcq+2] |
+ lea srcq, [srcq + src_strideq*2] |
+ pavgw m0, m2 |
+.x_half_y_other_loop: |
+ movu m2, [srcq] |
+ movu m3, [srcq+src_strideq*2] |
+ movu m4, [srcq+2] |
+ movu m5, [srcq+src_strideq*2+2] |
+ pavgw m2, m4 |
+ pavgw m3, m5 |
+ mova m4, m2 |
+ mova m5, m3 |
+ pmullw m4, filter_y_a |
+ pmullw m3, filter_y_b |
+ paddw m4, filter_rnd |
+ paddw m4, m3 |
+ pmullw m0, filter_y_a |
+ pmullw m2, filter_y_b |
+ paddw m0, filter_rnd |
+ psrlw m4, 4 |
+ paddw m0, m2 |
+ mova m2, [dstq] |
+ psrlw m0, 4 |
+ mova m3, [dstq+dst_strideq*2] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m4, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m2, m4, m3, m6, m7 |
+ mova m0, m5 |
+ |
+ lea srcq, [srcq + src_strideq*4] |
+ lea dstq, [dstq + dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_half_y_other_loop |
+%undef filter_y_a |
+%undef filter_y_b |
+%undef filter_rnd |
+ STORE_AND_RET |
+ |
+.x_nonhalf: |
+ test y_offsetd, y_offsetd |
+ jnz .x_nonhalf_y_nonzero |
+ |
+ ; x_offset == bilin interpolation && y_offset == 0 |
+%ifdef PIC |
+ lea bilin_filter, [bilin_filter_m] |
+%endif |
+ shl x_offsetd, filter_idx_shift |
+%if ARCH_X86_64 && mmsize == 16 |
+ mova m8, [bilin_filter+x_offsetq] |
+ mova m9, [bilin_filter+x_offsetq+16] |
+ mova m10, [pw_8] |
+%define filter_x_a m8 |
+%define filter_x_b m9 |
+%define filter_rnd m10 |
+%else ; x86-32 |
+%if ARCH_X86=1 && CONFIG_PIC=1 |
+; y_offset == 0. We can reuse y_offset reg. |
+%define tempq y_offsetq |
+ add x_offsetq, g_bilin_filterm |
+%define filter_x_a [x_offsetq] |
+%define filter_x_b [x_offsetq+16] |
+ mov tempq, g_pw_8m |
+%define filter_rnd [tempq] |
+%else |
+ add x_offsetq, bilin_filter |
+%define filter_x_a [x_offsetq] |
+%define filter_x_b [x_offsetq+16] |
+%define filter_rnd [pw_8] |
+%endif |
+%endif |
+ |
+.x_other_y_zero_loop: |
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+16] |
+ movu m2, [srcq+2] |
+ movu m3, [srcq+18] |
+ mova m4, [dstq] |
+ mova m5, [dstq+16] |
+ pmullw m1, filter_x_a |
+ pmullw m3, filter_x_b |
+ paddw m1, filter_rnd |
+ pmullw m0, filter_x_a |
+ pmullw m2, filter_x_b |
+ paddw m0, filter_rnd |
+ paddw m1, m3 |
+ paddw m0, m2 |
+ psrlw m1, 4 |
+ psrlw m0, 4 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m4, m1, m5, m6, m7 |
+ |
+ lea srcq, [srcq+src_strideq*2] |
+ lea dstq, [dstq+dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+src_strideq*2] |
+ movu m2, [srcq+2] |
+ movu m3, [srcq+src_strideq*2+2] |
+ mova m4, [dstq] |
+ mova m5, [dstq+dst_strideq*2] |
+ pmullw m1, filter_x_a |
+ pmullw m3, filter_x_b |
+ paddw m1, filter_rnd |
+ pmullw m0, filter_x_a |
+ pmullw m2, filter_x_b |
+ paddw m0, filter_rnd |
+ paddw m1, m3 |
+ paddw m0, m2 |
+ psrlw m1, 4 |
+ psrlw m0, 4 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m4, m1, m5, m6, m7 |
+ |
+ lea srcq, [srcq+src_strideq*4] |
+ lea dstq, [dstq+dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_other_y_zero_loop |
+%undef filter_x_a |
+%undef filter_x_b |
+%undef filter_rnd |
+ STORE_AND_RET |
+ |
+.x_nonhalf_y_nonzero: |
+ cmp y_offsetd, 8 |
+ jne .x_nonhalf_y_nonhalf |
+ |
+ ; x_offset == bilin interpolation && y_offset == 0.5 |
+%ifdef PIC |
+ lea bilin_filter, [bilin_filter_m] |
+%endif |
+ shl x_offsetd, filter_idx_shift |
+%if ARCH_X86_64 && mmsize == 16 |
+ mova m8, [bilin_filter+x_offsetq] |
+ mova m9, [bilin_filter+x_offsetq+16] |
+ mova m10, [pw_8] |
+%define filter_x_a m8 |
+%define filter_x_b m9 |
+%define filter_rnd m10 |
+%else ; x86-32 |
+%if ARCH_X86=1 && CONFIG_PIC=1 |
+; y_offset == 0.5. We can reuse y_offset reg. |
+%define tempq y_offsetq |
+ add x_offsetq, g_bilin_filterm |
+%define filter_x_a [x_offsetq] |
+%define filter_x_b [x_offsetq+16] |
+ mov tempq, g_pw_8m |
+%define filter_rnd [tempq] |
+%else |
+ add x_offsetq, bilin_filter |
+%define filter_x_a [x_offsetq] |
+%define filter_x_b [x_offsetq+16] |
+%define filter_rnd [pw_8] |
+%endif |
+%endif |
+ |
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m1, [srcq+16] |
+ movu m2, [srcq+2] |
+ movu m3, [srcq+18] |
+ pmullw m0, filter_x_a |
+ pmullw m2, filter_x_b |
+ paddw m0, filter_rnd |
+ pmullw m1, filter_x_a |
+ pmullw m3, filter_x_b |
+ paddw m1, filter_rnd |
+ paddw m0, m2 |
+ paddw m1, m3 |
+ psrlw m0, 4 |
+ psrlw m1, 4 |
+ lea srcq, [srcq+src_strideq*2] |
+.x_other_y_half_loop: |
+ movu m2, [srcq] |
+ movu m3, [srcq+16] |
+ movu m4, [srcq+2] |
+ movu m5, [srcq+18] |
+ pmullw m2, filter_x_a |
+ pmullw m4, filter_x_b |
+ paddw m2, filter_rnd |
+ pmullw m3, filter_x_a |
+ pmullw m5, filter_x_b |
+ paddw m3, filter_rnd |
+ paddw m2, m4 |
+ paddw m3, m5 |
+ mova m4, [dstq] |
+ mova m5, [dstq+16] |
+ psrlw m2, 4 |
+ psrlw m3, 4 |
+ pavgw m0, m2 |
+ pavgw m1, m3 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m4, m1, m5, m6, m7 |
+ mova m0, m2 |
+ mova m1, m3 |
+ |
+ lea srcq, [srcq+src_strideq*2] |
+ lea dstq, [dstq+dst_strideq*2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m2, [srcq+2] |
+ pmullw m0, filter_x_a |
+ pmullw m2, filter_x_b |
+ paddw m0, filter_rnd |
+ paddw m0, m2 |
+ psrlw m0, 4 |
+ lea srcq, [srcq+src_strideq*2] |
+.x_other_y_half_loop: |
+ movu m2, [srcq] |
+ movu m3, [srcq+src_strideq*2] |
+ movu m4, [srcq+2] |
+ movu m5, [srcq+src_strideq*2+2] |
+ pmullw m2, filter_x_a |
+ pmullw m4, filter_x_b |
+ paddw m2, filter_rnd |
+ pmullw m3, filter_x_a |
+ pmullw m5, filter_x_b |
+ paddw m3, filter_rnd |
+ paddw m2, m4 |
+ paddw m3, m5 |
+ mova m4, [dstq] |
+ mova m5, [dstq+dst_strideq*2] |
+ psrlw m2, 4 |
+ psrlw m3, 4 |
+ pavgw m0, m2 |
+ pavgw m2, m3 |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m2, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m4, m2, m5, m6, m7 |
+ mova m0, m3 |
+ |
+ lea srcq, [srcq+src_strideq*4] |
+ lea dstq, [dstq+dst_strideq*4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_other_y_half_loop |
+%undef filter_x_a |
+%undef filter_x_b |
+%undef filter_rnd |
+ STORE_AND_RET |
+ |
+.x_nonhalf_y_nonhalf: |
+; load the filters - this is the same as in the 8-bit version
+%ifdef PIC |
+ lea bilin_filter, [bilin_filter_m] |
+%endif |
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 |
+ shl y_offsetd, filter_idx_shift |
+%if ARCH_X86_64 && mmsize == 16 |
+ mova m8, [bilin_filter+x_offsetq] |
+ mova m9, [bilin_filter+x_offsetq+16] |
+ mova m10, [bilin_filter+y_offsetq] |
+ mova m11, [bilin_filter+y_offsetq+16] |
+ mova m12, [pw_8] |
+%define filter_x_a m8 |
+%define filter_x_b m9 |
+%define filter_y_a m10 |
+%define filter_y_b m11 |
+%define filter_rnd m12 |
+%else ; x86-32 |
+%if ARCH_X86=1 && CONFIG_PIC=1 |
+; In this case there is NO unused register: we reuse the src_stride register
+; as a temporary, so src_stride has to be reloaded from the stack whenever it
+; is needed.
+%define tempq src_strideq |
+ mov tempq, g_bilin_filterm |
+ add x_offsetq, tempq |
+ add y_offsetq, tempq |
+%define filter_x_a [x_offsetq] |
+%define filter_x_b [x_offsetq+16] |
+%define filter_y_a [y_offsetq] |
+%define filter_y_b [y_offsetq+16] |
+ |
+ mov tempq, g_pw_8m |
+%define filter_rnd [tempq] |
+%else |
+ add x_offsetq, bilin_filter |
+ add y_offsetq, bilin_filter |
+%define filter_x_a [x_offsetq] |
+%define filter_x_b [x_offsetq+16] |
+%define filter_y_a [y_offsetq] |
+%define filter_y_b [y_offsetq+16] |
+%define filter_rnd [pw_8] |
+%endif |
+%endif |
+; end of filter load
+ |
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation |
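+  ; two-pass scheme: each source row is horizontally filtered once and kept
+  ; in registers, then the vertical filter combines it with the next row's
+  ; horizontal result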
+%if %1 == 16 |
+ movu m0, [srcq] |
+ movu m2, [srcq+2] |
+ movu m1, [srcq+16] |
+ movu m3, [srcq+18] |
+ pmullw m0, filter_x_a |
+ pmullw m2, filter_x_b |
+ paddw m0, filter_rnd |
+ pmullw m1, filter_x_a |
+ pmullw m3, filter_x_b |
+ paddw m1, filter_rnd |
+ paddw m0, m2 |
+ paddw m1, m3 |
+ psrlw m0, 4 |
+ psrlw m1, 4 |
+ |
+ INC_SRC_BY_SRC_STRIDE |
+ |
+.x_other_y_other_loop: |
+ movu m2, [srcq] |
+ movu m4, [srcq+2] |
+ movu m3, [srcq+16] |
+ movu m5, [srcq+18] |
+ pmullw m2, filter_x_a |
+ pmullw m4, filter_x_b |
+ paddw m2, filter_rnd |
+ pmullw m3, filter_x_a |
+ pmullw m5, filter_x_b |
+ paddw m3, filter_rnd |
+ paddw m2, m4 |
+ paddw m3, m5 |
+ psrlw m2, 4 |
+ psrlw m3, 4 |
+ mova m4, m2 |
+ mova m5, m3 |
+ pmullw m0, filter_y_a |
+ pmullw m2, filter_y_b |
+ paddw m0, filter_rnd |
+ pmullw m1, filter_y_a |
+ pmullw m3, filter_y_b |
+ paddw m0, m2 |
+ paddw m1, filter_rnd |
+ mova m2, [dstq] |
+ paddw m1, m3 |
+ psrlw m0, 4 |
+ psrlw m1, 4 |
+ mova m3, [dstq+16] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m1, [secq+16] |
+%endif |
+ SUM_SSE m0, m2, m1, m3, m6, m7 |
+ mova m0, m4 |
+ mova m1, m5 |
+ |
+ INC_SRC_BY_SRC_STRIDE |
+ lea dstq, [dstq + dst_strideq * 2] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*2] |
+%endif |
+%else ; %1 < 16 |
+ movu m0, [srcq] |
+ movu m2, [srcq+2] |
+ pmullw m0, filter_x_a |
+ pmullw m2, filter_x_b |
+ paddw m0, filter_rnd |
+ paddw m0, m2 |
+ psrlw m0, 4 |
+ |
+ INC_SRC_BY_SRC_STRIDE |
+ |
+.x_other_y_other_loop: |
+ movu m2, [srcq] |
+ movu m4, [srcq+2] |
+ movu m3, [srcq+src_strideq*2] |
+ movu m5, [srcq+src_strideq*2+2] |
+ pmullw m2, filter_x_a |
+ pmullw m4, filter_x_b |
+ paddw m2, filter_rnd |
+ pmullw m3, filter_x_a |
+ pmullw m5, filter_x_b |
+ paddw m3, filter_rnd |
+ paddw m2, m4 |
+ paddw m3, m5 |
+ psrlw m2, 4 |
+ psrlw m3, 4 |
+ mova m4, m2 |
+ mova m5, m3 |
+ pmullw m0, filter_y_a |
+ pmullw m2, filter_y_b |
+ paddw m0, filter_rnd |
+ pmullw m4, filter_y_a |
+ pmullw m3, filter_y_b |
+ paddw m0, m2 |
+ paddw m4, filter_rnd |
+ mova m2, [dstq] |
+ paddw m4, m3 |
+ psrlw m0, 4 |
+ psrlw m4, 4 |
+ mova m3, [dstq+dst_strideq*2] |
+%if %2 == 1 ; avg |
+ pavgw m0, [secq] |
+ pavgw m4, [secq+sec_str*2] |
+%endif |
+ SUM_SSE m0, m2, m4, m3, m6, m7 |
+ mova m0, m5 |
+ |
+ INC_SRC_BY_SRC_2STRIDE |
+ lea dstq, [dstq + dst_strideq * 4] |
+%if %2 == 1 ; avg |
+ lea secq, [secq + sec_str*4] |
+%endif |
+%endif |
+ dec h |
+ jg .x_other_y_other_loop |
+%undef filter_x_a |
+%undef filter_x_b |
+%undef filter_y_a |
+%undef filter_y_b |
+%undef filter_rnd |
+ STORE_AND_RET |
+%endmacro |
+ |
+INIT_XMM sse2 |
+SUBPEL_VARIANCE 8 |
+SUBPEL_VARIANCE 16 |
+ |
+INIT_XMM sse2 |
+SUBPEL_VARIANCE 8, 1 |
+SUBPEL_VARIANCE 16, 1 |
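+
+; The instantiations above emit vp9_highbd_sub_pixel_variance{8,16}xh_sse2
+; and the corresponding _avg_ variants. Presumably the C-level wrappers
+; handle wider blocks by running the 16-wide kernel once per 16-sample
+; column and summing the partial SE/SSE results.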