Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_idctllm_mmx.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "third_party/x86inc/x86inc.asm"
12
SECTION_RODATA

; Fixed-point multipliers for pmulhw, replicated across the four 16-bit lanes
; of an MMX register. See the notes further down for how each one is applied.

align 16
x_s1sqr2:                               ; 0x8A8C = 35468: sqrt(2) * sin(pi/8)
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

align 16
x_c1sqr2less1:                          ; 0x4E7B = 20091: sqrt(2) * cos(pi/8) - 1
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

align 16
pw_16:                                  ; four words of 16
    dw 16, 16, 16, 16
20
21 SECTION .text
22
23
24 ; /****************************************************************************
25 ; * Notes:
26 ; *
27 ; * This implementation makes use of 16 bit fixed point version of two multiply
28 ; * constants:
29 ; * 1. sqrt(2) * cos (pi/8)
30 ; * 2. sqrt(2) * sin (pi/8)
31 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
32 ; * fixed point precision as the second one, we use a trick of
33 ; * x * a = x + x*(a-1)
34 ; * so
35 ; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
36 ; *
37 ; * For the second constant, the 16-bit version is 35468, which is bigger
38 ; * than 32768, so in a signed 16-bit multiply it becomes a negative
39 ; * number. Adding x back compensates for the wrap-around:
40 ; * (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
41 ; *
42 ; **************************************************************************/
43
44 INIT_MMX
45
;-----------------------------------------------------------------------------
; IDCT4X4LLM_1D round
;
; One 1-D pass of the 4-point transform, applied lane-wise to four 16-bit
; values at a time.
;
;   in:       m0, m1, m2, m3 = input vectors ip0, ip1, ip2, ip3
;   out:      m2, m0, m4, m6 = output vectors 0, 1, 2, 3 (in that order)
;   clobbers: m1, m3, m5, m7
;   %1:       0 -> plain pass
;             1 -> add pw_16 to a1/b1 before the final butterflies and
;                  arithmetic-shift every output right by 5
;
; Multiplications by the two constants use pmulhw followed by an add of the
; original value (see the notes at the top of the file).
;-----------------------------------------------------------------------------
%macro IDCT4X4LLM_1D 1
    psubw       m0, m2                  ; b1 = ip0 - ip2
    paddw       m2, m2                  ; 2 * ip2

    mova        m5, m1
    paddw       m2, m0                  ; a1 = ip0 + ip2

    pmulhw      m5, [x_s1sqr2]
    paddw       m5, m1                  ; ip1 * sin(pi/8) * sqrt(2)

    mova        m7, m3
    pmulhw      m7, [x_c1sqr2less1]

    paddw       m7, m3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       m7, m5                  ; t  = ip3*cos term - ip1*sin term

    mova        m5, m1
    mova        m4, m3

    pmulhw      m5, [x_c1sqr2less1]
    paddw       m5, m1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      m3, [x_s1sqr2]
    paddw       m3, m4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       m3, m5                  ; d1 = ip1*cos term + ip3*sin term
%if %1
    paddw       m0, [pw_16]             ; fold the rounding term into b1 ...
    paddw       m2, [pw_16]             ; ... and a1, so every output gets it
%endif
    mova        m6, m2                  ; a1
    mova        m4, m0                  ; b1

    paddw       m2, m3                  ; output 0 = a1 + d1
    paddw       m4, m7                  ; output 2 = b1 + t
    psubw       m0, m7                  ; output 1 = b1 - t
    psubw       m6, m3                  ; output 3 = a1 - d1
%if %1
    psraw       m2, 5
    psraw       m0, 5
    psraw       m4, 5
    psraw       m6, 5
%endif
%endmacro

;-----------------------------------------------------------------------------
; IDCT4X4LLM_TRANSPOSE
;
; 4x4 transpose of 16-bit words.
;
;   in:       m2, m0, m4, m6 = vectors 0, 1, 2, 3 (layout left by
;             IDCT4X4LLM_1D)
;   out:      m0, m1, m2, m5 = transposed vectors 0, 1, 2, 3
;   clobbers: m3, m4
;-----------------------------------------------------------------------------
%macro IDCT4X4LLM_TRANSPOSE 0
    mova        m1, m2                  ; 03 02 01 00
    mova        m3, m4                  ; 23 22 21 20

    punpcklwd   m1, m0                  ; 11 01 10 00
    punpckhwd   m2, m0                  ; 13 03 12 02

    punpcklwd   m3, m6                  ; 31 21 30 20
    punpckhwd   m4, m6                  ; 33 23 32 22

    mova        m0, m1                  ; 11 01 10 00
    mova        m5, m2                  ; 13 03 12 02

    punpckldq   m0, m3                  ; 30 20 10 00
    punpckhdq   m1, m3                  ; 31 21 11 01

    punpckldq   m2, m4                  ; 32 22 12 02
    punpckhdq   m5, m4                  ; 33 23 13 03
%endmacro

;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
;
; 4x4 inverse transform: a 1-D pass, a transpose, a second 1-D pass that adds
; 16 and shifts right by 5, and a final transpose.
;
;   input  - 16 coefficients, read as four 8-byte vectors at input+0/8/16/24
;   output - four 8-byte vectors are written at output + k*pitch, k = 0..3
;   pitch  - added directly to the output address between rows
cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
%if ARCH_X86_64
    ; pitch is a 32-bit int; the upper half of its register is not guaranteed
    ; to be meaningful, so widen it before using it in address arithmetic.
    movsxd      pitq, pitd
%endif
    mova        m0, [inpq+ 0]
    mova        m1, [inpq+ 8]
    mova        m2, [inpq+16]
    mova        m3, [inpq+24]

    ; first pass
    IDCT4X4LLM_1D 0
    IDCT4X4LLM_TRANSPOSE
    mova        m3, m5                  ; second pass expects ip3 in m3

    ; second pass, with rounding and >> 5
    IDCT4X4LLM_1D 1
    IDCT4X4LLM_TRANSPOSE

    mova        [outq], m0
    mova        [outq+pitq], m1
    mova        [outq+pitq*2], m2

    add         outq, pitq
    mova        [outq+pitq*2], m5       ; original output + 3*pitch
    RET
179
;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; DC-only variant: computes (input[0] + 16) >> 5 and writes that value into
; all four 16-bit lanes of four 8-byte rows at output + k*pitch, k = 0..3.
cglobal short_idct4x4llm_1_mmx, 3,3,0, inp, out, pit
%if ARCH_X86_64
    ; pitch is a 32-bit int; widen it before using it in address arithmetic.
    movsxd      pitq, pitd
%endif
    movh        m0, [inpq]              ; only word 0 survives the broadcast
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklwd   m0, m0
    punpckldq   m0, m0                  ; word 0 replicated to all four lanes

    mova        [outq], m0
    mova        [outq+pitq], m0

    mova        [outq+pitq*2], m0
    add         outq, pitq

    mova        [outq+pitq*2], m0       ; original output + 3*pitch
    RET
196
197
;-----------------------------------------------------------------------------
; DC_ONLY_ADD_ROW scratch, pred_addr, dst_addr
;
; Loads one group of prediction bytes, widens them to words, adds the
; broadcast DC term with signed saturation, packs back to bytes with unsigned
; saturation and stores the low half of the result.
;
;   expects:  m0 = 0, m5 = DC term in every word lane
;   clobbers: %1
;-----------------------------------------------------------------------------
%macro DC_ONLY_ADD_ROW 3
    movh        %1, %2
    punpcklbw   %1, m0                  ; bytes -> words (zero-extended)
    paddsw      %1, m5
    packuswb    %1, m0                  ; words -> bytes, clamped to 0..255
    movh        %3, %1
%endmacro

;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
;                          unsigned char *dst_ptr, int pitch, int stride)
;
; Adds (input_dc + 16) >> 5 to four rows of prediction bytes and stores the
; clamped result. Row k is read from pred_ptr + k*pitch and written to
; dst_ptr + k*stride, k = 0..3.
cglobal dc_only_idct_add_mmx, 4,5,0, in_dc, pred, dst, pit, stride
%if ARCH_X86_64
    ; pitch and stride are 32-bit ints; widen both before using them in
    ; address arithmetic.
    movsxd      pitq, pitd
    movsxd      strideq, dword stridem
%else
    mov         strideq, stridem
%endif
    pxor        m0, m0                  ; zero, used for unpack/pack

    movh        m5, in_dcq              ; dc
    paddw       m5, [pw_16]

    psraw       m5, 5

    punpcklwd   m5, m5
    punpckldq   m5, m5                  ; DC term in all four word lanes

    DC_ONLY_ADD_ROW m1, [predq],        [dstq]
    DC_ONLY_ADD_ROW m2, [predq+pitq],   [dstq+strideq]
    DC_ONLY_ADD_ROW m3, [predq+2*pitq], [dstq+2*strideq]

    add         dstq, strideq
    add         predq, pitq
    DC_ONLY_ADD_ROW m4, [predq+2*pitq], [dstq+2*strideq] ; row 3
    RET
241
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698