Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(281)

Unified Diff: source/libvpx/vp9/decoder/x86/vp9_dequantize_mmx.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: source/libvpx/vp9/decoder/x86/vp9_dequantize_mmx.asm
===================================================================
--- source/libvpx/vp9/decoder/x86/vp9_dequantize_mmx.asm (revision 0)
+++ source/libvpx/vp9/decoder/x86/vp9_dequantize_mmx.asm (revision 0)
@@ -0,0 +1,406 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2: times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B
+align 16
+pw_16: times 4 dw 16
+
+SECTION .text
+
+INIT_MMX
+
+
+;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
+ mova m1, [sqq]
+ pmullw m1, [arg3q+0] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+ 0], m1
+
+ mova m1, [sqq+8]
+ pmullw m1, [arg3q+8] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+ 8], m1
+
+ mova m1, [sqq+16]
+ pmullw m1, [arg3q+16] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+16], m1
+
+ mova m1, [sqq+24]
+ pmullw m1, [arg3q+24] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+24], m1
+ RET
+
+
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
+
+%if ARCH_X86_64
+ movsxd strideq, dword stridem
+ movsxd pitq, dword pitm
+%else
+ mov strideq, stridem
+ mov pitq, pitm
+%endif
+
+ mova m0, [inpq+ 0]
+ pmullw m0, [dqq]
+
+ mova m1, [inpq+ 8]
+ pmullw m1, [dqq+ 8]
+
+ mova m2, [inpq+16]
+ pmullw m2, [dqq+16]
+
+ mova m3, [inpq+24]
+ pmullw m3, [dqq+24]
+
+ pxor m7, m7
+ mova [inpq], m7
+ mova [inpq+8], m7
+ mova [inpq+16], m7
+ mova [inpq+24], m7
+
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ mova m3, m5 ; 33 23 13 03
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ paddw m0, [pw_16]
+
+ paddw m2, [pw_16]
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+ psraw m2, 5
+
+ psraw m0, 5
+ psraw m4, 5
+
+ psraw m6, 5
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ pxor m7, m7
+
+ movh m4, [predq]
+ punpcklbw m4, m7
+ paddsw m0, m4
+ packuswb m0, m7
+ movh [destq], m0
+
+ movh m4, [predq+pitq]
+ punpcklbw m4, m7
+ paddsw m1, m4
+ packuswb m1, m7
+ movh [destq+strideq], m1
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m2, m4
+ packuswb m2, m7
+ movh [destq+strideq*2], m2
+
+ add destq, strideq
+ add predq, pitq
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m5, m4
+ packuswb m5, m7
+ movh [destq+strideq*2], m5
+ RET
+
+
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
+
+%if ARCH_X86_64
+ movsxd strideq, dword stridem
+ movsxd pitq, dword pitm
+%else
+ mov strideq, stridem
+ mov pitq, pitm
+%endif
+
+ mov Dcq, Dcm
+ mova m0, [inpq+ 0]
+ pmullw m0, [dqq+ 0]
+
+ mova m1, [inpq+ 8]
+ pmullw m1, [dqq+ 8]
+
+ mova m2, [inpq+16]
+ pmullw m2, [dqq+16]
+
+ mova m3, [inpq+24]
+ pmullw m3, [dqq+24]
+
+ pxor m7, m7
+ mova [inpq+ 0], m7
+ mova [inpq+ 8], m7
+ mova [inpq+16], m7
+ mova [inpq+24], m7
+
+ ; move lower word of Dc to lower word of m0
+ psrlq m0, 16
+ psllq m0, 16
+ and Dcq, 0xFFFF ; If Dc < 0, we don't want the full dword precision.
+ movh m7, Dcq
+ por m0, m7
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ mova m3, m5 ; 33 23 13 03
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ paddw m0, [pw_16]
+
+ paddw m2, [pw_16]
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+ psraw m2, 5
+
+ psraw m0, 5
+ psraw m4, 5
+
+ psraw m6, 5
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ pxor m7, m7
+
+ movh m4, [predq]
+ punpcklbw m4, m7
+ paddsw m0, m4
+ packuswb m0, m7
+ movh [destq], m0
+
+ movh m4, [predq+pitq]
+ punpcklbw m4, m7
+ paddsw m1, m4
+ packuswb m1, m7
+ movh [destq+strideq], m1
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m2, m4
+ packuswb m2, m7
+ movh [destq+strideq*2], m2
+
+ add destq, strideq
+ add predq, pitq
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m5, m4
+ packuswb m5, m7
+ movh [destq+strideq*2], m5
+ RET
+

Powered by Google App Engine
This is Rietveld 408576698