Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(230)

Side by Side Diff: source/libvpx/vp9/decoder/x86/vp9_dequantize_mmx.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "third_party/x86inc/x86inc.asm"
12
; Read-only constants. Each is a single 16-bit value replicated four times
; (times 4 dw), i.e. one 8-byte operand for the packed-word instructions below.
13 SECTION_RODATA
14 align 16
15 x_s1sqr2: times 4 dw 0x8A8C ; 35468 ~= sin(pi/8)*sqrt(2) * 65536; negative when read as a signed word
16 align 16
17 x_c1sqr2less1: times 4 dw 0x4E7B ; 20091 ~= (cos(pi/8)*sqrt(2) - 1) * 65536
18 align 16
19 pw_16: times 4 dw 16 ; rounding bias added before the final arithmetic shift right by 5
20
21 SECTION .text
22
23 INIT_MMX ; x86inc: the m#/mova/movh names used below refer to the MMX register set
24
25
; dequantize_b_impl_mmx: element-wise multiply of 16 words.
; For i in 0..15: dq[i] = low 16 bits of sq[i] * q[i] (pmullw).
; cglobal names the three arguments sq, dq and arg3; arg3 is the C parameter q.
; The 32 bytes are handled as four 8-byte groups at byte offsets 0, 8, 16, 24.
26 ;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
27 cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
28 mova m1, [sqq] ; m1 = sq[0..3]
29 pmullw m1, [arg3q+0] ; m1 = sq[0..3] * q[0..3] (low 16 bits of each product)
30 mova [dqq+ 0], m1 ; dq[0..3] = m1
31
32 mova m1, [sqq+8] ; m1 = sq[4..7]
33 pmullw m1, [arg3q+8] ; m1 = sq[4..7] * q[4..7]
34 mova [dqq+ 8], m1 ; dq[4..7] = m1
35
36 mova m1, [sqq+16] ; m1 = sq[8..11]
37 pmullw m1, [arg3q+16] ; m1 = sq[8..11] * q[8..11]
38 mova [dqq+16], m1 ; dq[8..11] = m1
39
40 mova m1, [sqq+24] ; m1 = sq[12..15]
41 pmullw m1, [arg3q+24] ; m1 = sq[12..15] * q[12..15]
42 mova [dqq+24], m1 ; dq[12..15] = m1
43 RET
44
45
; dequant_idct_add_mmx
; 1. Dequantize a 4x4 block of 16-bit coefficients: row r = input[4r..4r+3] * dq[4r..4r+3].
; 2. Zero the 32-byte input block.
; 3. Run a butterfly pass lane-wise across the four row registers, transpose the
;    4x4 words, run the same butterfly again with (x + 16) >> 5 rounding, and
;    transpose again.
; 4. Add each result row to 4 prediction bytes (rows pitch bytes apart) and store
;    4 bytes per row to dest (rows stride bytes apart), saturated to 0..255.
; In the transpose comments "rc" names the word at row r, column c, listed from
; the highest word of the register down to the lowest.
46 ;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
47 cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
48
; Only the first four arguments are auto-loaded by cglobal (4,6,0); pitch and
; stride are fetched here into the two remaining registers.
49 %if ARCH_X86_64
50 movsxd strideq, dword stridem ; sign-extend the 32-bit int for use in 64-bit addressing
51 movsxd pitq, dword pitm ; same for the prediction pitch
52 %else
53 mov strideq, stridem
54 mov pitq, pitm
55 %endif
56
57 mova m0, [inpq+ 0]
58 pmullw m0, [dqq] ; m0 = ip0 = dequantized row 0
59
60 mova m1, [inpq+ 8]
61 pmullw m1, [dqq+ 8] ; m1 = ip1 = dequantized row 1
62
63 mova m2, [inpq+16]
64 pmullw m2, [dqq+16] ; m2 = ip2 = dequantized row 2
65
66 mova m3, [inpq+24]
67 pmullw m3, [dqq+24] ; m3 = ip3 = dequantized row 3
68
69 pxor m7, m7 ; m7 = 0
70 mova [inpq], m7 ; clear the input block now that it is held in m0-m3
71 mova [inpq+8], m7
72 mova [inpq+16], m7
73 mova [inpq+24], m7
74
75
; ---- first butterfly pass (lane-wise over ip0..ip3) ----
76 psubw m0, m2 ; b1= 0-2
77 paddw m2, m2 ; m2 = 2*ip2
78
79 mova m5, m1
80 paddw m2, m0 ; a1 =0+2 (ip0 - ip2 + 2*ip2)
81
; 0x8A8C is negative as a signed word, so pmulhw yields x*(0x8A8C - 0x10000) >> 16;
; adding x back gives x * 0x8A8C / 65536.
82 pmulhw m5, [x_s1sqr2];
83 paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
84
85 mova m7, m3 ;
86 pmulhw m7, [x_c1sqr2less1]; m7 = ip3 * (cos(pi/8)*sqrt(2) - 1)
87
88 paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
89 psubw m7, m5 ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
90
91 mova m5, m1
92 mova m4, m3 ; keep ip3 for the add-back after pmulhw
93
94 pmulhw m5, [x_c1sqr2less1]
95 paddw m5, m1 ; ip1 * cos(pi/8) * sqrt(2)
96
97 pmulhw m3, [x_s1sqr2]
98 paddw m3, m4 ; ip3 * sin(pi/8) * sqrt(2)
99
100 paddw m3, m5 ; d1 = ip1*cos(pi/8)*sqrt(2) + ip3*sin(pi/8)*sqrt(2)
101 mova m6, m2 ; a1
102
103 mova m4, m0 ; b1
104 paddw m2, m3 ; m2 = a1 + d1, row 0 for the transpose below
105
106 paddw m4, m7 ; m4 = b1 + c1, used as row 2 by the transpose below
107 psubw m0, m7 ; m0 = b1 - c1, used as row 1 by the transpose below
108
109 psubw m6, m3 ; m6 = a1 - d1, row 3
110
; ---- 4x4 word transpose: rows (m2, m0, m4, m6) -> columns (m0, m1, m2, m5) ----
111 mova m1, m2 ; 03 02 01 00
112 mova m3, m4 ; 23 22 21 20
113
114 punpcklwd m1, m0 ; 11 01 10 00
115 punpckhwd m2, m0 ; 13 03 12 02
116
117 punpcklwd m3, m6 ; 31 21 30 20
118 punpckhwd m4, m6 ; 33 23 32 22
119
120 mova m0, m1 ; 11 01 10 00
121 mova m5, m2 ; 13 03 12 02
122
123 punpckldq m0, m3 ; 30 20 10 00
124 punpckhdq m1, m3 ; 31 21 11 01
125
126 punpckldq m2, m4 ; 32 22 12 02
127 punpckhdq m5, m4 ; 33 23 13 03
128
129 mova m3, m5 ; 33 23 13 03
130
; ---- second butterfly pass: same arithmetic on the transposed data in m0..m3 ----
131 psubw m0, m2 ; b1= 0-2
132 paddw m2, m2 ; m2 = 2 * (old m2)
133
134 mova m5, m1
135 paddw m2, m0 ; a1 =0+2
136
137 pmulhw m5, [x_s1sqr2];
138 paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
139
140 mova m7, m3 ;
141 pmulhw m7, [x_c1sqr2less1];
142
143 paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
144 psubw m7, m5 ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
145
146 mova m5, m1
147 mova m4, m3
148
149 pmulhw m5, [x_c1sqr2less1]
150 paddw m5, m1 ; ip1 * cos(pi/8) * sqrt(2)
151
152 pmulhw m3, [x_s1sqr2]
153 paddw m3, m4 ; ip3 * sin(pi/8) * sqrt(2)
154
155 paddw m3, m5 ; d1
; Adding the rounding bias to a1 and b1 once puts it into all four outputs,
; because each output is a1 +/- d1 or b1 +/- c1.
156 paddw m0, [pw_16] ; b1 += 16
157
158 paddw m2, [pw_16] ; a1 += 16
159 mova m6, m2 ; a1
160
161 mova m4, m0 ; b1
162 paddw m2, m3 ; m2 = a1 + d1, row 0 for the transpose below
163
164 paddw m4, m7 ; m4 = b1 + c1, used as row 2 by the transpose below
165 psubw m0, m7 ; m0 = b1 - c1, used as row 1 by the transpose below
166
167 psubw m6, m3 ; m6 = a1 - d1, row 3
168 psraw m2, 5 ; (x + 16) >> 5 on each of the four result registers
169
170 psraw m0, 5
171 psraw m4, 5
172
173 psraw m6, 5
174
; ---- second 4x4 word transpose; results end up in m0, m1, m2, m5 ----
175 mova m1, m2 ; 03 02 01 00
176 mova m3, m4 ; 23 22 21 20
177
178 punpcklwd m1, m0 ; 11 01 10 00
179 punpckhwd m2, m0 ; 13 03 12 02
180
181 punpcklwd m3, m6 ; 31 21 30 20
182 punpckhwd m4, m6 ; 33 23 32 22
183
184 mova m0, m1 ; 11 01 10 00
185 mova m5, m2 ; 13 03 12 02
186
187 punpckldq m0, m3 ; 30 20 10 00
188 punpckhdq m1, m3 ; 31 21 11 01
189
190 punpckldq m2, m4 ; 32 22 12 02
191 punpckhdq m5, m4 ; 33 23 13 03
192
; ---- add prediction and store: m0, m1, m2, m5 go to dest rows 0, 1, 2, 3 ----
193 pxor m7, m7 ; zero, used to widen bytes to words and as the upper half for packuswb
194
195 movh m4, [predq] ; prediction row 0 (movh is movd under x86inc INIT_MMX: 4 bytes)
196 punpcklbw m4, m7 ; zero-extend the 4 bytes to 4 words
197 paddsw m0, m4 ; residual + prediction, signed saturating
198 packuswb m0, m7 ; clamp each word to 0..255 and pack to bytes
199 movh [destq], m0 ; store dest row 0
200
201 movh m4, [predq+pitq] ; prediction row 1
202 punpcklbw m4, m7
203 paddsw m1, m4
204 packuswb m1, m7
205 movh [destq+strideq], m1 ; store dest row 1
206
207 movh m4, [predq+2*pitq] ; prediction row 2
208 punpcklbw m4, m7
209 paddsw m2, m4
210 packuswb m2, m7
211 movh [destq+strideq*2], m2 ; store dest row 2
212
; Advance both pointers by one row so that the *2 addressing below reaches row 3.
213 add destq, strideq
214 add predq, pitq
215
216 movh m4, [predq+2*pitq] ; prediction row 3 (pointer already advanced by one row)
217 punpcklbw m4, m7
218 paddsw m5, m4
219 packuswb m5, m7
220 movh [destq+strideq*2], m5 ; store dest row 3
221 RET
222
223
; dequant_dc_idct_add_mmx
; Same pipeline as the plain dequantize / inverse-transform / add-prediction
; routine in this file, except that after dequantization the lowest word of
; row 0 is replaced by the low 16 bits of the Dc argument:
; 1. Dequantize: row r = input[4r..4r+3] * dq[4r..4r+3]; zero the input block.
; 2. Overwrite the lowest word of row 0 with (Dc & 0xFFFF).
; 3. Butterfly pass, 4x4 word transpose, butterfly pass with (x + 16) >> 5
;    rounding, transpose again.
; 4. Add each result row to 4 prediction bytes (rows pitch bytes apart) and store
;    4 bytes per row to dest (rows stride bytes apart), saturated to 0..255.
; In the transpose comments "rc" names the word at row r, column c, listed from
; the highest word of the register down to the lowest.
224 ;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
225 cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
226
; Only the first four arguments are auto-loaded by cglobal (4,7,0); pitch,
; stride and Dc are fetched here into the three remaining registers.
227 %if ARCH_X86_64
228 movsxd strideq, dword stridem ; sign-extend the 32-bit int for use in 64-bit addressing
229 movsxd pitq, dword pitm ; same for the prediction pitch
230 %else
231 mov strideq, stridem
232 mov pitq, pitm
233 %endif
234
235 mov Dcq, Dcm ; fetch the Dc argument; only its low 16 bits survive the mask below
236 mova m0, [inpq+ 0]
237 pmullw m0, [dqq+ 0] ; m0 = ip0 = dequantized row 0
238
239 mova m1, [inpq+ 8]
240 pmullw m1, [dqq+ 8] ; m1 = ip1 = dequantized row 1
241
242 mova m2, [inpq+16]
243 pmullw m2, [dqq+16] ; m2 = ip2 = dequantized row 2
244
245 mova m3, [inpq+24]
246 pmullw m3, [dqq+24] ; m3 = ip3 = dequantized row 3
247
248 pxor m7, m7 ; m7 = 0
249 mova [inpq+ 0], m7 ; clear the input block now that it is held in m0-m3
250 mova [inpq+ 8], m7
251 mova [inpq+16], m7
252 mova [inpq+24], m7
253
254 ; replace the lowest word of m0 with the low word of Dc
255 psrlq m0, 16 ; shift the lowest word out ...
256 psllq m0, 16 ; ... and shift back: the lowest word of m0 is now zero
257 and Dcq, 0xFFFF ; keep only the low 16 bits; a negative Dc would otherwise OR set upper bits into the other words of m0
258 movh m7, Dcq ; m7 = masked Dc in the lowest word, zero elsewhere
259 por m0, m7 ; lowest word of m0 = Dc, other three words unchanged
; ---- first butterfly pass (lane-wise over ip0..ip3) ----
260 psubw m0, m2 ; b1= 0-2
261 paddw m2, m2 ; m2 = 2*ip2
262
263 mova m5, m1
264 paddw m2, m0 ; a1 =0+2 (ip0 - ip2 + 2*ip2)
265
; 0x8A8C is negative as a signed word, so pmulhw yields x*(0x8A8C - 0x10000) >> 16;
; adding x back gives x * 0x8A8C / 65536.
266 pmulhw m5, [x_s1sqr2];
267 paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
268
269 mova m7, m3 ;
270 pmulhw m7, [x_c1sqr2less1]; m7 = ip3 * (cos(pi/8)*sqrt(2) - 1)
271
272 paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
273 psubw m7, m5 ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
274
275 mova m5, m1
276 mova m4, m3 ; keep ip3 for the add-back after pmulhw
277
278 pmulhw m5, [x_c1sqr2less1]
279 paddw m5, m1 ; ip1 * cos(pi/8) * sqrt(2)
280
281 pmulhw m3, [x_s1sqr2]
282 paddw m3, m4 ; ip3 * sin(pi/8) * sqrt(2)
283
284 paddw m3, m5 ; d1 = ip1*cos(pi/8)*sqrt(2) + ip3*sin(pi/8)*sqrt(2)
285 mova m6, m2 ; a1
286
287 mova m4, m0 ; b1
288 paddw m2, m3 ; m2 = a1 + d1, row 0 for the transpose below
289
290 paddw m4, m7 ; m4 = b1 + c1, used as row 2 by the transpose below
291 psubw m0, m7 ; m0 = b1 - c1, used as row 1 by the transpose below
292
293 psubw m6, m3 ; m6 = a1 - d1, row 3
294
; ---- 4x4 word transpose: rows (m2, m0, m4, m6) -> columns (m0, m1, m2, m5) ----
295 mova m1, m2 ; 03 02 01 00
296 mova m3, m4 ; 23 22 21 20
297
298 punpcklwd m1, m0 ; 11 01 10 00
299 punpckhwd m2, m0 ; 13 03 12 02
300
301 punpcklwd m3, m6 ; 31 21 30 20
302 punpckhwd m4, m6 ; 33 23 32 22
303
304 mova m0, m1 ; 11 01 10 00
305 mova m5, m2 ; 13 03 12 02
306
307 punpckldq m0, m3 ; 30 20 10 00
308 punpckhdq m1, m3 ; 31 21 11 01
309
310 punpckldq m2, m4 ; 32 22 12 02
311 punpckhdq m5, m4 ; 33 23 13 03
312
313 mova m3, m5 ; 33 23 13 03
314
; ---- second butterfly pass: same arithmetic on the transposed data in m0..m3 ----
315 psubw m0, m2 ; b1= 0-2
316 paddw m2, m2 ; m2 = 2 * (old m2)
317
318 mova m5, m1
319 paddw m2, m0 ; a1 =0+2
320
321 pmulhw m5, [x_s1sqr2];
322 paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
323
324 mova m7, m3 ;
325 pmulhw m7, [x_c1sqr2less1];
326
327 paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
328 psubw m7, m5 ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
329
330 mova m5, m1
331 mova m4, m3
332
333 pmulhw m5, [x_c1sqr2less1]
334 paddw m5, m1 ; ip1 * cos(pi/8) * sqrt(2)
335
336 pmulhw m3, [x_s1sqr2]
337 paddw m3, m4 ; ip3 * sin(pi/8) * sqrt(2)
338
339 paddw m3, m5 ; d1
; Adding the rounding bias to a1 and b1 once puts it into all four outputs,
; because each output is a1 +/- d1 or b1 +/- c1.
340 paddw m0, [pw_16] ; b1 += 16
341
342 paddw m2, [pw_16] ; a1 += 16
343 mova m6, m2 ; a1
344
345 mova m4, m0 ; b1
346 paddw m2, m3 ; m2 = a1 + d1, row 0 for the transpose below
347
348 paddw m4, m7 ; m4 = b1 + c1, used as row 2 by the transpose below
349 psubw m0, m7 ; m0 = b1 - c1, used as row 1 by the transpose below
350
351 psubw m6, m3 ; m6 = a1 - d1, row 3
352 psraw m2, 5 ; (x + 16) >> 5 on each of the four result registers
353
354 psraw m0, 5
355 psraw m4, 5
356
357 psraw m6, 5
358
; ---- second 4x4 word transpose; results end up in m0, m1, m2, m5 ----
359 mova m1, m2 ; 03 02 01 00
360 mova m3, m4 ; 23 22 21 20
361
362 punpcklwd m1, m0 ; 11 01 10 00
363 punpckhwd m2, m0 ; 13 03 12 02
364
365 punpcklwd m3, m6 ; 31 21 30 20
366 punpckhwd m4, m6 ; 33 23 32 22
367
368 mova m0, m1 ; 11 01 10 00
369 mova m5, m2 ; 13 03 12 02
370
371 punpckldq m0, m3 ; 30 20 10 00
372 punpckhdq m1, m3 ; 31 21 11 01
373
374 punpckldq m2, m4 ; 32 22 12 02
375 punpckhdq m5, m4 ; 33 23 13 03
376
; ---- add prediction and store: m0, m1, m2, m5 go to dest rows 0, 1, 2, 3 ----
377 pxor m7, m7 ; zero, used to widen bytes to words and as the upper half for packuswb
378
379 movh m4, [predq] ; prediction row 0 (movh is movd under x86inc INIT_MMX: 4 bytes)
380 punpcklbw m4, m7 ; zero-extend the 4 bytes to 4 words
381 paddsw m0, m4 ; residual + prediction, signed saturating
382 packuswb m0, m7 ; clamp each word to 0..255 and pack to bytes
383 movh [destq], m0 ; store dest row 0
384
385 movh m4, [predq+pitq] ; prediction row 1
386 punpcklbw m4, m7
387 paddsw m1, m4
388 packuswb m1, m7
389 movh [destq+strideq], m1 ; store dest row 1
390
391 movh m4, [predq+2*pitq] ; prediction row 2
392 punpcklbw m4, m7
393 paddsw m2, m4
394 packuswb m2, m7
395 movh [destq+strideq*2], m2 ; store dest row 2
396
; Advance both pointers by one row so that the *2 addressing below reaches row 3.
397 add destq, strideq
398 add predq, pitq
399
400 movh m4, [predq+2*pitq] ; prediction row 3 (pointer already advanced by one row)
401 punpcklbw m4, m7
402 paddsw m5, m4
403 packuswb m5, m7
404 movh [destq+strideq*2], m5 ; store dest row 3
405 RET
406
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698