| Index: source/patched-ffmpeg-mt/libavcodec/x86/vp56dsp.asm
|
| ===================================================================
|
| --- source/patched-ffmpeg-mt/libavcodec/x86/vp56dsp.asm (revision 0)
|
| +++ source/patched-ffmpeg-mt/libavcodec/x86/vp56dsp.asm (revision 0)
|
| @@ -0,0 +1,173 @@
|
| +;******************************************************************************
|
| +;* MMX/SSE2-optimized functions for the VP6 decoder
|
| +;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
|
| +;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
|
| +;*
|
| +;* This file is part of FFmpeg.
|
| +;*
|
| +;* FFmpeg is free software; you can redistribute it and/or
|
| +;* modify it under the terms of the GNU Lesser General Public
|
| +;* License as published by the Free Software Foundation; either
|
| +;* version 2.1 of the License, or (at your option) any later version.
|
| +;*
|
| +;* FFmpeg is distributed in the hope that it will be useful,
|
| +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| +;* Lesser General Public License for more details.
|
| +;*
|
| +;* You should have received a copy of the GNU Lesser General Public
|
| +;* License along with FFmpeg; if not, write to the Free Software
|
| +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| +;******************************************************************************
|
| +
|
| +%include "x86inc.asm"
|
| +%include "x86util.asm"
|
| +
|
| +cextern pw_64
|
| +
|
| +SECTION .text
|
| +
|
| +%macro DIAG4_MMX 6
|
| + movq m0, [%1+%2]
|
| + movq m1, [%1+%3]
|
| + movq m3, m0
|
| + movq m4, m1
|
| + punpcklbw m0, m7
|
| + punpcklbw m1, m7
|
| + punpckhbw m3, m7
|
| + punpckhbw m4, m7
|
| + pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
|
| + pmullw m1, [rsp+8*12] ; src[x ] * biweight [1]
|
| + pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
|
| + pmullw m4, [rsp+8*12] ; src[x ] * biweight [1]
|
| + paddw m0, m1
|
| + paddw m3, m4
|
| + movq m1, [%1+%4]
|
| + movq m2, [%1+%5]
|
| + movq m4, m1
|
| + movq m5, m2
|
| + punpcklbw m1, m7
|
| + punpcklbw m2, m7
|
| + punpckhbw m4, m7
|
| + punpckhbw m5, m7
|
| + pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
|
| + pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3]
|
| + pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
|
| + pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3]
|
| + paddw m1, m2
|
| + paddw m4, m5
|
| + paddsw m0, m1
|
| + paddsw m3, m4
|
| + paddsw m0, m6 ; Add 64
|
| + paddsw m3, m6 ; Add 64
|
| + psraw m0, 7
|
| + psraw m3, 7
|
| + packuswb m0, m3
|
| + movq [%6], m0
|
| +%endmacro
|
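DIAG4_MMX above and DIAG4_SSE2 below compute the same 4-tap weighted sum over eight pixels at once. Per output byte the arithmetic is roughly the following (a scalar C sketch; clip_uint8, w[] and o0..o3 are stand-ins for packuswb's saturation, the splatted weights, and the macro's %2..%5 offset arguments):

    static inline uint8_t clip_uint8(int a)
    {
        /* the saturation packuswb applies when packing words to bytes */
        return a < 0 ? 0 : a > 255 ? 255 : a;
    }

    /* one DIAG4 invocation, output pixels i = 0..7 */
    for (int i = 0; i < 8; i++)
        dst[i] = clip_uint8((src[i+o0]*w[0] + src[i+o1]*w[1] +
                             src[i+o2]*w[2] + src[i+o3]*w[3] + 64) >> 7);

The +64 / >>7 pair is round-to-nearest for weights stored in 7-bit fixed point, matching the paddsw with pw_64 and the psraw 7 in the macro.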
| +
|
| +%macro DIAG4_SSE2 6
|
| + movq m0, [%1+%2]
|
| + movq m1, [%1+%3]
|
| + punpcklbw m0, m7
|
| + punpcklbw m1, m7
|
| + pmullw m0, m4 ; src[x-8 ] * biweight [0]
|
| + pmullw m1, m5 ; src[x ] * biweight [1]
|
| + paddw m0, m1
|
| + movq m1, [%1+%4]
|
| + movq m2, [%1+%5]
|
| + punpcklbw m1, m7
|
| + punpcklbw m2, m7
|
| + pmullw m1, m6 ; src[x+8 ] * biweight [2]
|
| + pmullw m2, m3 ; src[x+16] * biweight [3]
|
| + paddw m1, m2
|
| + paddsw m0, m1
|
| + paddsw m0, [pw_64] ; Add 64
|
| + psraw m0, 7
|
| + packuswb m0, m0
|
| + movq [%6], m0
|
| +%endmacro
|
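The SSE2 variant handles the same eight pixels in one register half, so it needs no high/low unpack pair, and it keeps all four splatted weights in registers (m4, m5, m6, m3) instead of reloading them from the stack. In intrinsics terms one invocation is roughly this (a sketch; diag4_sse2 is a hypothetical name):

    #include <emmintrin.h>
    #include <stdint.h>

    static void diag4_sse2(const uint8_t *src, uint8_t *dst,
                           __m128i w0, __m128i w1, __m128i w2, __m128i w3,
                           int o0, int o1, int o2, int o3)
    {
        const __m128i zero = _mm_setzero_si128();
        const __m128i r64  = _mm_set1_epi16(64);  /* pw_64 */
        /* movq + punpcklbw: zero-extend 8 bytes to 8 words */
        __m128i a = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + o0)), zero);
        __m128i b = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + o1)), zero);
        __m128i c = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + o2)), zero);
        __m128i d = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + o3)), zero);
        /* pmullw/paddw within the pairs, then saturating paddsw across them */
        __m128i s = _mm_adds_epi16(_mm_add_epi16(_mm_mullo_epi16(a, w0),
                                                 _mm_mullo_epi16(b, w1)),
                                   _mm_add_epi16(_mm_mullo_epi16(c, w2),
                                                 _mm_mullo_epi16(d, w3)));
        s = _mm_srai_epi16(_mm_adds_epi16(s, r64), 7);     /* +64, >>7 */
        _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(s, s));
    }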
| +
|
| +%macro SPLAT4REGS_MMX 0
|
| + movq m5, m3
|
| + punpcklwd m3, m3
|
| + movq m4, m3
|
| + punpckldq m3, m3
|
| + punpckhdq m4, m4
|
| + punpckhwd m5, m5
|
| + movq m2, m5
|
| + punpckhdq m2, m2
|
| + punpckldq m5, m5
|
| + movq [rsp+8*11], m3
|
| + movq [rsp+8*12], m4
|
| + movq [rsp+8*13], m5
|
| + movq [rsp+8*14], m2
|
| +%endmacro
|
| +
|
| +%macro SPLAT4REGS_SSE2 0
|
| + pshuflw m4, m3, 0x0
|
| + pshuflw m5, m3, 0x55
|
| + pshuflw m6, m3, 0xAA
|
| + pshuflw m3, m3, 0xFF
|
| + punpcklqdq m4, m4
|
| + punpcklqdq m5, m5
|
| + punpcklqdq m6, m6
|
| + punpcklqdq m3, m3
|
| +%endmacro
|
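Both SPLAT4REGS variants broadcast the four 16-bit weights just loaded into m3 so that each weight fills an entire register. The SSE2 version maps directly onto intrinsics (a sketch):

    /* m3 = {w0, w1, w2, w3, x, x, x, x} as 16-bit words on entry */
    __m128i w0 = _mm_shufflelo_epi16(m3, 0x00);  /* w0 in all 4 low words */
    __m128i w1 = _mm_shufflelo_epi16(m3, 0x55);
    __m128i w2 = _mm_shufflelo_epi16(m3, 0xAA);
    __m128i w3 = _mm_shufflelo_epi16(m3, 0xFF);
    w0 = _mm_unpacklo_epi64(w0, w0);             /* duplicate the low qword */
    w1 = _mm_unpacklo_epi64(w1, w1);
    w2 = _mm_unpacklo_epi64(w2, w2);
    w3 = _mm_unpacklo_epi64(w3, w3);

The MMX version builds the same four splats with word/dword unpacks, but has to spill them to the stack slots at rsp+8*11..8*14 (reloaded by DIAG4_MMX above) since MMX only has eight 64-bit registers.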
| +
|
| +%macro vp6_filter_diag4 2
|
| +; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
|
| +; const int16_t h_weights[4], const int16_t v_weights[4])
|
| +cglobal vp6_filter_diag4_%1, 5, 7, %2
|
| + mov r5, rsp ; backup stack pointer
|
| + and rsp, ~(mmsize-1) ; align stack
|
| +%ifidn %1, sse2
|
| + sub rsp, 8*11
|
| +%else
|
| + sub rsp, 8*15
|
| + movq m6, [pw_64]
|
| +%endif
|
| +%ifdef ARCH_X86_64
|
| + movsxd r2, r2d
|
| +%endif
|
| +
|
| + sub r1, r2 ; start one row above the block (src -= stride)
|
| +
|
| + pxor m7, m7
|
| + movq m3, [r3]
|
| + SPLAT4REGS
|
| +
|
| + mov r3, rsp
|
| + mov r6, 11 ; 8 output rows + 3 extra for the 4-tap vertical filter
|
| +.nextrow
|
| + DIAG4 r1, -1, 0, 1, 2, r3 ; horizontal pass: taps at src[x-1..x+2]
|
| + add r3, 8
|
| + add r1, r2
|
| + dec r6
|
| + jnz .nextrow
|
| +
|
| + movq m3, [r4]
|
| + SPLAT4REGS
|
| +
|
| + lea r3, [rsp+8] ; second row of the scratch buffer
|
| + mov r6, 8 ; 8 output rows
|
| +.nextcol
|
| + DIAG4 r3, -8, 0, 8, 16, r0 ; vertical pass: taps one 8-byte scratch row apart
|
| + add r3, 8
|
| + add r0, r2
|
| + dec r6
|
| + jnz .nextcol
|
| +
|
| + mov rsp, r5 ; restore stack pointer
|
| + RET
|
| +%endmacro
|
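Putting the pieces together: the function runs a separable two-pass scheme. Eleven rows (one above the block, the eight output rows, two below) are filtered horizontally with taps at x-1..x+2 into an 8x11 byte scratch buffer on the aligned stack; the output block is then filtered vertically through that buffer, where consecutive rows sit 8 bytes apart. A minimal C model of the whole function (vp6_filter_diag4_ref is a hypothetical name, clip_uint8 as in the earlier sketch; like the asm, it packs the intermediate rows back to bytes):

    void vp6_filter_diag4_ref(uint8_t *dst, uint8_t *src, int stride,
                              const int16_t h[4], const int16_t v[4])
    {
        uint8_t tmp[8 * 11], *t = tmp;
        int x, y;

        src -= stride;                   /* start one row above the block */
        for (y = 0; y < 11; y++) {       /* horizontal pass */
            for (x = 0; x < 8; x++)
                t[x] = clip_uint8((src[x-1]*h[0] + src[x  ]*h[1] +
                                   src[x+1]*h[2] + src[x+2]*h[3] + 64) >> 7);
            src += stride;
            t   += 8;
        }

        t = tmp + 8;                     /* second scratch row */
        for (y = 0; y < 8; y++) {        /* vertical pass */
            for (x = 0; x < 8; x++)
                dst[x] = clip_uint8((t[x-8]*v[0] + t[x   ]*v[1] +
                                     t[x+8]*v[2] + t[x+16]*v[3] + 64) >> 7);
            dst += stride;
            t   += 8;
        }
    }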
| +
|
| +INIT_MMX
|
| +%define DIAG4 DIAG4_MMX
|
| +%define SPLAT4REGS SPLAT4REGS_MMX
|
| +vp6_filter_diag4 mmx, 0
|
| +
|
| +INIT_XMM
|
| +%define DIAG4 DIAG4_SSE2
|
| +%define SPLAT4REGS SPLAT4REGS_SSE2
|
| +vp6_filter_diag4 sse2, 8
|