Index: source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
diff --git a/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index 061b34d3fc6c3b9bcef7ba03d6b9a43989309626..b26383708f7da272f0a6def7a475210a4c248c6a 100644
--- a/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -8,297 +8,146 @@
 ; be found in the AUTHORS file in the root of the source tree.
 ;
 
-%define program_name vpx
-
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION .text
 
 %macro convolve_fn 1
 INIT_XMM sse2
-cglobal convolve_%1, 4, 7, 8, src, src_stride, dst, dst_stride, \
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                               fx, fxs, fy, fys, w, h
-  mov r4d, dword wm
-  cmp r4d, 4
+  mov    r4d, dword wm
+  cmp    r4d, 4
   je .w4
-  cmp r4d, 8
+  cmp    r4d, 8
   je .w8
-  cmp r4d, 16
+  cmp    r4d, 16
   je .w16
-  cmp r4d, 32
+  cmp    r4d, 32
   je .w32
 
-  ; 64xh
-  mov r4d, dword hm
-  shr r4d, 1 ; ASSUMPTION: hm is at least EVEN
-  sub r4d, 1
-
-  movu m0, [srcq]
-  movu m4, [srcq+src_strideq]
-  movu m1, [srcq+16]
-  movu m5, [srcq+src_strideq+16]
-  movu m2, [srcq+32]
-  movu m6, [srcq+src_strideq+32]
-  movu m3, [srcq+48]
-  movu m7, [srcq+src_strideq+48]
-
+  mov    r4d, dword hm
 .loop64:
-  prefetcht0 [srcq+64            ]
-  prefetcht0 [srcq+src_strideq+64]
-
-  lea srcq, [srcq+src_strideq*2]
-
+  movu   m0, [srcq]
+  movu   m1, [srcq+16]
+  movu   m2, [srcq+32]
+  movu   m3, [srcq+48]
+  add    srcq, src_strideq
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-
-  mova [dstq   ], m0
-  movu m0, [srcq]
-
-  mova [dstq+16], m1
-  movu m1, [srcq+16]
-
-  pavgb m2, [dstq+32]
-  mova [dstq+32], m2
-  movu m2, [srcq+32]
-  pavgb m3, [dstq+48]
-  mova [dstq+48], m3
-  movu m3, [srcq+48]
-  pavgb m4, [dstq+dst_strideq]
-
-  mova [dstq+dst_strideq], m4
-  movu m4, [srcq+src_strideq]
-
-  pavgb m5, [dstq+dst_strideq+16]
-  mova [dstq+dst_strideq+16], m5
-  movu m5, [srcq+src_strideq+16]
-  pavgb m6, [dstq+dst_strideq+32]
-  mova [dstq+dst_strideq+32], m6
-  movu m6, [srcq+src_strideq+32]
-  pavgb m7, [dstq+dst_strideq+48]
-  mova [dstq+dst_strideq+48], m7
-  movu m7, [srcq+src_strideq+48]
-
-  lea dstq, [dstq+dst_strideq*2]
-%else
-  mova [dstq   ], m0
-  movu m0, [srcq]
-
-  mova [dstq+16], m1
-  movu m1, [srcq+16]
-  mova [dstq+32], m2
-  movu m2, [srcq+32]
-  mova [dstq+48], m3
-  movu m3, [srcq+48]
-
-  mova [dstq+dst_strideq], m4
-  movu m4, [srcq+src_strideq]
-
-  mova [dstq+dst_strideq+16], m5
-  movu m5, [srcq+src_strideq+16]
-  mova [dstq+dst_strideq+32], m6
-  movu m6, [srcq+src_strideq+32]
-  mova [dstq+dst_strideq+48], m7
-  movu m7, [srcq+src_strideq+48]
-
-  lea dstq, [dstq+dst_strideq*2]
+  pavgb  m0, [dstq]
+  pavgb  m1, [dstq+16]
+  pavgb  m2, [dstq+32]
+  pavgb  m3, [dstq+48]
 %endif
-  dec r4d
+  mova   [dstq   ], m0
+  mova   [dstq+16], m1
+  mova   [dstq+32], m2
+  mova   [dstq+48], m3
+  add    dstq, dst_strideq
+  dec    r4d
   jnz .loop64
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-  pavgb m2, [dstq+32]
-  pavgb m3, [dstq+48]
-  pavgb m4, [dstq+dst_strideq]
-  pavgb m5, [dstq+dst_strideq+16]
-  pavgb m6, [dstq+dst_strideq+32]
-  pavgb m7, [dstq+dst_strideq+48]
-%endif
-  mova [dstq   ], m0
-  mova [dstq+16], m1
-  mova [dstq+32], m2
-  mova [dstq+48], m3
-
-  mova [dstq+dst_strideq   ], m4
-  mova [dstq+dst_strideq+16], m5
-  mova [dstq+dst_strideq+32], m6
-  mova [dstq+dst_strideq+48], m7
-
   RET
 
 .w32:
-  mov r4d, dword hm
-  sub r4d, 2
-
-  movu m0, [srcq]
-  movu m1, [srcq+16]
-  movu m2, [srcq+src_strideq]
-  movu m3, [srcq+src_strideq+16]
-
+  mov    r4d, dword hm
 .loop32:
-  prefetcht0 [srcq+64]
-  prefetcht0 [srcq+src_strideq+64]
-
-  lea srcq, [srcq+src_strideq*2]
+  movu   m0, [srcq]
+  movu   m1, [srcq+16]
+  movu   m2, [srcq+src_strideq]
+  movu   m3, [srcq+src_strideq+16]
+  lea    srcq, [srcq+src_strideq*2]
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-  pavgb m2, [dstq+dst_strideq]
-  pavgb m3, [dstq+dst_strideq+16]
+  pavgb  m0, [dstq]
+  pavgb  m1, [dstq            +16]
+  pavgb  m2, [dstq+dst_strideq]
+  pavgb  m3, [dstq+dst_strideq+16]
 %endif
-  mova [dstq], m0
-  movu m0, [srcq]
-
-  mova [dstq+16], m1
-  movu m1, [srcq+16]
-
-  mova [dstq+dst_strideq], m2
-  movu m2, [srcq+src_strideq]
-
-  mova [dstq+dst_strideq+16], m3
-  movu m3, [srcq+src_strideq+16]
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  sub r4d, 2
+  mova   [dstq               ], m0
+  mova   [dstq            +16], m1
+  mova   [dstq+dst_strideq   ], m2
+  mova   [dstq+dst_strideq+16], m3
+  lea    dstq, [dstq+dst_strideq*2]
+  sub    r4d, 2
   jnz .loop32
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-  pavgb m2, [dstq+dst_strideq]
-  pavgb m3, [dstq+dst_strideq+16]
-%endif
-  mova [dstq   ], m0
-  mova [dstq+16], m1
-
-  mova [dstq+dst_strideq   ], m2
-  mova [dstq+dst_strideq+16], m3
-
   RET
 
 .w16:
-  mov r4d, dword hm
-  sub r4d, 4
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
+  mov    r4d, dword hm
+  lea    r5q, [src_strideq*3]
+  lea    r6q, [dst_strideq*3]
 .loop16:
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*4]
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*2]
+  movu   m0, [srcq]
+  movu   m1, [srcq+src_strideq]
+  movu   m2, [srcq+src_strideq*2]
+  movu   m3, [srcq+r5q]
+  lea    srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
+  pavgb  m0, [dstq]
+  pavgb  m1, [dstq+dst_strideq]
+  pavgb  m2, [dstq+dst_strideq*2]
+  pavgb  m3, [dstq+r6q]
 %endif
-  mova [dstq            ], m0
-  mova [dstq+dst_strideq], m1
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
-  sub r4d, 2
+  mova   [dstq              ], m0
+  mova   [dstq+dst_strideq  ], m1
+  mova   [dstq+dst_strideq*2], m2
+  mova   [dstq+r6q          ], m3
+  lea    dstq, [dstq+dst_strideq*4]
+  sub    r4d, 4
   jnz .loop16
-
-  lea srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-%endif
-  mova [dstq            ], m0
-  mova [dstq+dst_strideq], m1
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-%endif
-
-  mova [dstq            ], m0
-  mova [dstq+dst_strideq], m1
-
   RET
 
 INIT_MMX sse
 .w8:
-  mov r4d, dword hm
-  sub r4d, 2
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
+  mov    r4d, dword hm
+  lea    r5q, [src_strideq*3]
+  lea    r6q, [dst_strideq*3]
.loop8:
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*4]
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*2]
-
+  movu   m0, [srcq]
+  movu   m1, [srcq+src_strideq]
+  movu   m2, [srcq+src_strideq*2]
+  movu   m3, [srcq+r5q]
+  lea    srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
+  pavgb  m0, [dstq]
+  pavgb  m1, [dstq+dst_strideq]
+  pavgb  m2, [dstq+dst_strideq*2]
+  pavgb  m3, [dstq+r6q]
 %endif
-  mova [dstq            ], m0
-  mova [dstq+dst_strideq], m1
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  sub r4d, 2
+  mova   [dstq              ], m0
+  mova   [dstq+dst_strideq  ], m1
+  mova   [dstq+dst_strideq*2], m2
+  mova   [dstq+r6q          ], m3
+  lea    dstq, [dstq+dst_strideq*4]
+  sub    r4d, 4
   jnz .loop8
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-%endif
-  mova [dstq            ], m0
-  mova [dstq+dst_strideq], m1
-
   RET
 
 .w4:
-  mov r4d, dword hm
-
-  lea r5q, [src_strideq*3]
-  lea r6q, [dst_strideq*3]
-
+  mov    r4d, dword hm
+  lea    r5q, [src_strideq*3]
+  lea    r6q, [dst_strideq*3]
 .loop4:
-  movh m0, [srcq]
-  movh m1, [srcq+src_strideq]
-  movh m2, [srcq+src_strideq*2]
-  movh m3, [srcq+r5q]
-
-  lea srcq, [srcq+src_strideq*4]
+  movh   m0, [srcq]
+  movh   m1, [srcq+src_strideq]
+  movh   m2, [srcq+src_strideq*2]
+  movh   m3, [srcq+r5q]
+  lea    srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  movh m4, [dstq]
-  movh m5, [dstq+dst_strideq]
-  movh m6, [dstq+dst_strideq*2]
-  movh m7, [dstq+r6q]
-
-  pavgb m0, m4
-  pavgb m1, m5
-  pavgb m2, m6
-  pavgb m3, m7
+  movh   m4, [dstq]
+  movh   m5, [dstq+dst_strideq]
+  movh   m6, [dstq+dst_strideq*2]
+  movh   m7, [dstq+r6q]
+  pavgb  m0, m4
+  pavgb  m1, m5
+  pavgb  m2, m6
+  pavgb  m3, m7
 %endif
-  movh [dstq              ], m0
-  movh [dstq+dst_strideq  ], m1
-  movh [dstq+dst_strideq*2], m2
-  movh [dstq+r6q          ], m3
-
-  lea dstq, [dstq+dst_strideq*4]
-
-  sub r4d, 4
+  movh   [dstq              ], m0
+  movh   [dstq+dst_strideq  ], m1
+  movh   [dstq+dst_strideq*2], m2
+  movh   [dstq+r6q          ], m3
+  lea    dstq, [dstq+dst_strideq*4]
+  sub    r4d, 4
   jnz .loop4
   RET
 %endmacro
 
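The rewritten loops are deliberately simple: each iteration loads one row (or four short rows), optionally folds in the destination with pavgb, stores, and advances both pointers; the old software-pipelined bodies, prefetcht0 hints, and separate loop epilogues are gone. Dropping the declared XMM count in the cglobal line from 8 to 4 also means the Win64 prologue no longer needs to save xmm6/xmm7 (the .w8/.w4 paths run under INIT_MMX, so their m4-m7 are MMX registers). As a reference model, here is a minimal C sketch of the two kernels. The *_model names and the trimmed-down signatures are illustrative only; libvpx's actual C reference lives in vpx_dsp/vpx_convolve.c as vpx_convolve_copy_c/vpx_convolve_avg_c, whose signatures also carry the filter arguments (fx/fxs/fy/fys above) that the copy and avg variants ignore.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* What the copy kernel computes: one w-byte memcpy per row, matching the
   movu/mova pairs in .loop64/.loop32/.loop16/.loop8/.loop4. */
static void convolve_copy_model(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                int w, int h) {
  int y;
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
}

/* What the avg kernel computes: pavgb is a rounded unsigned-byte average,
   (a + b + 1) >> 1 in each lane, so ties round up. */
static void convolve_avg_model(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);
    src += src_stride;
    dst += dst_stride;
  }
}

int main(void) {
  uint8_t src[4] = { 10, 20, 30, 40 };
  uint8_t dst[4] = { 0, 1, 2, 3 };
  convolve_avg_model(src, 4, dst, 4, 4, 1);
  printf("avg:  %d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); /* 5 11 16 22 */
  convolve_copy_model(src, 4, dst, 4, 4, 1);
  printf("copy: %d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); /* 10 20 30 40 */
  return 0;
}

One height caveat follows from the counters: .loop16, .loop8, and .loop4 subtract 4 per iteration and .loop32 subtracts 2, so those paths assume h is a multiple of 4 (respectively 2), while the new one-row .loop64 handles any h, which is why the old "; ASSUMPTION: hm is at least EVEN" comment could be deleted.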