source/libvpx/vp9/common/x86/vp9_idctllm_mmx.asm - Issue 11555023: libvpx: Add VP9 decoder.

Unified Diff: source/libvpx/vp9/common/x86/vp9_idctllm_mmx.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/libvpx/vp9/common/x86/vp9_idctllm_mmx.asm

===================================================================

--- source/libvpx/vp9/common/x86/vp9_idctllm_mmx.asm (revision 0)

+++ source/libvpx/vp9/common/x86/vp9_idctllm_mmx.asm (revision 0)

@@ -0,0 +1,241 @@

+; Use of this source code is governed by a BSD-style license

+; that can be found in the LICENSE file in the root of the source

+; tree. An additional intellectual property rights grant can be found

+; in the file PATENTS. All contributing project authors may

+; be found in the AUTHORS file in the root of the source tree.

+%include "third_party/x86inc/x86inc.asm"

+SECTION_RODATA

+align 16

+x_s1sqr2: times 4 dw 0x8A8C

+align 16

+x_c1sqr2less1: times 4 dw 0x4E7B

+align 16

+pw_16: times 4 dw 16

+SECTION .text

+; /****************************************************************************

+; * Notes:

+; *

+; * This implementation makes use of 16 bit fixed point version of two multiply

+; * constants:

+; * 1. sqrt(2) * cos (pi/8)

+; * 2. sqrt(2) * sin (pi/8)

+; * Because the first constant is bigger than 1, to maintain the same 16 bit

+; * fixed point precision as the second one, we use a trick of

+; * x * a = x + x*(a-1)

+; * so

+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).

+; *

+; * For the second constant, because of the 16bit version is 35468, which

+; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative

+; * number.

+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x

+; *

+; **************************************************************************/

+INIT_MMX

+;void short_idct4x4llm_mmx(short *input, short *output, int pitch)

+cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit

+ mova m0, [inpq +0]

+ mova m1, [inpq +8]

+ mova m2, [inpq+16]

+ mova m3, [inpq+24]

+ psubw m0, m2 ; b1= 0-2

+ paddw m2, m2 ;

+ mova m5, m1

+ paddw m2, m0 ; a1 =0+2

+ pmulhw m5, [x_s1sqr2] ;

+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)

+ mova m7, m3 ;

+ pmulhw m7, [x_c1sqr2less1] ;

+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)

+ psubw m7, m5 ; c1

+ mova m5, m1

+ mova m4, m3

+ pmulhw m5, [x_c1sqr2less1]

+ paddw m5, m1

+ pmulhw m3, [x_s1sqr2]

+ paddw m3, m4

+ paddw m3, m5 ; d1

+ mova m6, m2 ; a1

+ mova m4, m0 ; b1

+ paddw m2, m3 ;0

+ paddw m4, m7 ;1

+ psubw m0, m7 ;2

+ psubw m6, m3 ;3

+ mova m1, m2 ; 03 02 01 00

+ mova m3, m4 ; 23 22 21 20

+ punpcklwd m1, m0 ; 11 01 10 00

+ punpckhwd m2, m0 ; 13 03 12 02

+ punpcklwd m3, m6 ; 31 21 30 20

+ punpckhwd m4, m6 ; 33 23 32 22

+ mova m0, m1 ; 11 01 10 00

+ mova m5, m2 ; 13 03 12 02

+ punpckldq m0, m3 ; 30 20 10 00

+ punpckhdq m1, m3 ; 31 21 11 01

+ punpckldq m2, m4 ; 32 22 12 02

+ punpckhdq m5, m4 ; 33 23 13 03

+ mova m3, m5 ; 33 23 13 03

+ psubw m0, m2 ; b1= 0-2

+ paddw m2, m2 ;

+ mova m5, m1

+ paddw m2, m0 ; a1 =0+2

+ pmulhw m5, [x_s1sqr2] ;

+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)

+ mova m7, m3 ;

+ pmulhw m7, [x_c1sqr2less1] ;

+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)

+ psubw m7, m5 ; c1

+ mova m5, m1

+ mova m4, m3

+ pmulhw m5, [x_c1sqr2less1]

+ paddw m5, m1

+ pmulhw m3, [x_s1sqr2]

+ paddw m3, m4

+ paddw m3, m5 ; d1

+ paddw m0, [pw_16]

+ paddw m2, [pw_16]

+ mova m6, m2 ; a1

+ mova m4, m0 ; b1

+ paddw m2, m3 ;0

+ paddw m4, m7 ;1

+ psubw m0, m7 ;2

+ psubw m6, m3 ;3

+ psraw m2, 5

+ psraw m0, 5

+ psraw m4, 5

+ psraw m6, 5

+ mova m1, m2 ; 03 02 01 00

+ mova m3, m4 ; 23 22 21 20

+ punpcklwd m1, m0 ; 11 01 10 00

+ punpckhwd m2, m0 ; 13 03 12 02

+ punpcklwd m3, m6 ; 31 21 30 20

+ punpckhwd m4, m6 ; 33 23 32 22

+ mova m0, m1 ; 11 01 10 00

+ mova m5, m2 ; 13 03 12 02

+ punpckldq m0, m3 ; 30 20 10 00

+ punpckhdq m1, m3 ; 31 21 11 01

+ punpckldq m2, m4 ; 32 22 12 02

+ punpckhdq m5, m4 ; 33 23 13 03

+ mova [outq], m0

+ mova [outq+r2], m1

+ mova [outq+pitq*2], m2

+ add outq, pitq

+ mova [outq+pitq*2], m5

+ RET

+;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)

+cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit

+ movh m0, [inpq]

+ paddw m0, [pw_16]

+ psraw m0, 5

+ punpcklwd m0, m0

+ punpckldq m0, m0

+ mova [outq], m0

+ mova [outq+pitq], m0

+ mova [outq+pitq*2], m0

+ add r1, r2

+ mova [outq+pitq*2], m0

+ RET

+;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)

+cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride

+%if ARCH_X86_64

+ movsxd strideq, dword stridem

+%else

+ mov strideq, stridem

+%endif

+ pxor m0, m0

+ movh m5, in_dcq ; dc

+ paddw m5, [pw_16]

+ psraw m5, 5

+ punpcklwd m5, m5

+ punpckldq m5, m5

+ movh m1, [predq]

+ punpcklbw m1, m0

+ paddsw m1, m5

+ packuswb m1, m0 ; pack and unpack to saturate

+ movh [dstq], m1

+ movh m2, [predq+pitq]

+ punpcklbw m2, m0

+ paddsw m2, m5

+ packuswb m2, m0 ; pack and unpack to saturate

+ movh [dstq+strideq], m2

+ movh m3, [predq+2*pitq]

+ punpcklbw m3, m0

+ paddsw m3, m5

+ packuswb m3, m0 ; pack and unpack to saturate

+ movh [dstq+2*strideq], m3

+ add dstq, strideq

+ add predq, pitq

+ movh m4, [predq+2*pitq]

+ punpcklbw m4, m0

+ paddsw m4, m5

+ packuswb m4, m0 ; pack and unpack to saturate

+ movh [dstq+2*strideq], m4

+ RET

« libvpx.gyp ('K') | « source/libvpx/vp9/common/x86/vp9_idct_x86.h ('k') | source/libvpx/vp9/common/x86/vp9_idctllm_sse2.asm » ('j') | no next file with comments »