Chromium Code Reviews

Side by Side Diff: source/patched-ffmpeg-mt/libavcodec/x86/vp3dsp.asm

Issue 3384002: ffmpeg source update for sep 09 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/ffmpeg/
Patch Set: Created 10 years, 3 months ago
1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the VP3 decoder
3 ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
4 ;*
5 ;* This file is part of FFmpeg.
6 ;*
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
11 ;*
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
21
22 %include "x86inc.asm"
23 %include "x86util.asm"
24
25 ; MMX-optimized functions cribbed from the original VP3 source code.
26
27 SECTION_RODATA
28
29 vp3_idct_data: times 8 dw 64277
30 times 8 dw 60547
31 times 8 dw 54491
32 times 8 dw 46341
33 times 8 dw 36410
34 times 8 dw 25080
35 times 8 dw 12785
36
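For reference, the seven replicated words in vp3_idct_data are the IDCT cosine constants in unsigned 1.16 fixed point, i.e. round(cos(k*pi/16) * 65536) for k = 1..7. A small stand-alone C snippet (not part of the patch) that regenerates the table:

#include <math.h>
#include <stdio.h>

int main(void)
{
    const double pi = 3.14159265358979323846;
    /* prints 64277 60547 54491 46341 36410 25080 12785, one per line */
    for (int k = 1; k <= 7; k++)
        printf("%ld\n", lround(cos(k * pi / 16.0) * 65536.0));
    return 0;
}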
37 cextern pb_1
38 cextern pb_3
39 cextern pb_7
40 cextern pb_1F
41 cextern pb_81
42
43 cextern pw_8
44
45 cextern put_signed_pixels_clamped_mmx
46 cextern add_pixels_clamped_mmx
47
48 SECTION .text
49
50 ; this is off by one or two for some cases when filter_limit is greater than 63
51 ; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
52 ; out: p1 in mm4, p2 in mm3
53 %macro VP3_LOOP_FILTER 0
54 movq m7, m6
55 pand m6, [pb_7] ; p0&7
56 psrlw m7, 3
57 pand m7, [pb_1F] ; p0>>3
58 movq m3, m2 ; p2
59 pxor m2, m4
60 pand m2, [pb_1] ; (p2^p1)&1
61 movq m5, m2
62 paddb m2, m2
63 paddb m2, m5 ; 3*(p2^p1)&1
64 paddb m2, m6 ; extra bits lost in shifts
65 pcmpeqb m0, m0
66 pxor m1, m0 ; 255 - p3
67 pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
68 pxor m0, m4 ; 255 - p1
69 pavgb m0, m3 ; (256 + p2-p1) >> 1
70 paddb m1, [pb_3]
71 pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
72 pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
73 paddusb m7, m1 ; d+128+1
74 movq m6, [pb_81]
75 psubusb m6, m7
76 psubusb m7, [pb_81]
77
78 movq m5, [r2+516] ; flim
79 pminub m6, m5
80 pminub m7, m5
81 movq m0, m6
82 movq m1, m7
83 paddb m6, m6
84 paddb m7, m7
85 pminub m6, m5
86 pminub m7, m5
87 psubb m6, m0
88 psubb m7, m1
89 paddusb m4, m7
90 psubusb m4, m6
91 psubusb m3, m7
92 paddusb m3, m6
93 %endmacro
94
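In scalar terms, VP3_LOOP_FILTER computes the VP3 deblocking filter for one edge: a filter value built from the four pixels p0..p3 straddling the edge is bounded by the frame's filter limit (loaded from [r2+516]) and then added to p1 and subtracted from p2 with saturation. A rough C sketch of that behaviour, with hypothetical names and the standard bounding ramp (which, per the comment above, the MMX code only approximates when the limit exceeds 63):

#include <stdint.h>
#include <stdio.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* pass-through up to lim, ramp back down to zero between lim and 2*lim,
   zero beyond that */
static int bound(int f, int lim)
{
    int a = f < 0 ? -f : f;
    int s = f < 0 ? -1 : 1;
    if (a <= lim)     return f;
    if (a <= 2 * lim) return s * (2 * lim - a);
    return 0;
}

/* filter one column across the edge; only p1 and p2 are rewritten */
static void filter_column(int p0, uint8_t *p1, uint8_t *p2, int p3, int lim)
{
    int f = (p0 - p3 + 3 * (*p2 - *p1) + 4) >> 3;
    f = bound(f, lim);
    *p1 = clip_uint8(*p1 + f);
    *p2 = clip_uint8(*p2 - f);
}

int main(void)
{
    uint8_t p1 = 90, p2 = 120;
    filter_column(100, &p1, &p2, 110, 31);
    printf("p1=%d p2=%d\n", p1, p2);  /* 100 110: the step across the edge is smoothed */
    return 0;
}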
95 %macro STORE_4_WORDS 1
96 movd r2d, %1
97 mov [r0 -1], r2w
98 psrlq %1, 32
99 shr r2, 16
100 mov [r0+r1 -1], r2w
101 movd r2d, %1
102 mov [r0+r1*2-1], r2w
103 shr r2, 16
104 mov [r0+r3 -1], r2w
105 %endmacro
106
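STORE_4_WORDS then scatters the four 16-bit lanes of one MMX register to four consecutive rows, starting one byte to the left of the edge; this is how the filtered p1/p2 pairs are written back after the horizontal filter's transpose. A hypothetical scalar equivalent (assuming little-endian layout, as on x86):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* pairs[i] holds the two filtered bytes for row i, packed as they sit
   in the register; each pair lands at dst + i*stride - 1 */
void store_4_words(uint8_t *dst, ptrdiff_t stride, const uint16_t pairs[4])
{
    for (int i = 0; i < 4; i++)
        memcpy(dst + i * stride - 1, &pairs[i], sizeof(pairs[0]));
}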
107 INIT_MMX
108 cglobal vp3_v_loop_filter_mmx2, 3, 4
109 %ifdef ARCH_X86_64
110 movsxd r1, r1d
111 %endif
112 mov r3, r1
113 neg r1
114 movq m6, [r0+r1*2]
115 movq m4, [r0+r1 ]
116 movq m2, [r0 ]
117 movq m1, [r0+r3 ]
118
119 VP3_LOOP_FILTER
120
121 movq [r0+r1], m4
122 movq [r0 ], m3
123 RET
124
125 cglobal vp3_h_loop_filter_mmx2, 3, 4
126 %ifdef ARCH_X86_64
127 movsxd r1, r1d
128 %endif
129 lea r3, [r1*3]
130
131 movd m6, [r0 -2]
132 movd m4, [r0+r1 -2]
133 movd m2, [r0+r1*2-2]
134 movd m1, [r0+r3 -2]
135 lea r0, [r0+r1*4 ]
136 punpcklbw m6, [r0 -2]
137 punpcklbw m4, [r0+r1 -2]
138 punpcklbw m2, [r0+r1*2-2]
139 punpcklbw m1, [r0+r3 -2]
140 sub r0, r3
141 sub r0, r1
142
143 TRANSPOSE4x4B 6, 4, 2, 1, 0
144 VP3_LOOP_FILTER
145 SBUTTERFLY bw, 4, 3, 5
146
147 STORE_4_WORDS m4
148 lea r0, [r0+r1*4 ]
149 STORE_4_WORDS m3
150 RET
151
152 ; from original comments: The Macro does IDct on 4 1-D Dcts
153 %macro BeginIDCT 0
154 movq m2, I(3)
155 movq m6, C(3)
156 movq m4, m2
157 movq m7, J(5)
158 pmulhw m4, m6 ; r4 = c3*i3 - i3
159 movq m1, C(5)
160 pmulhw m6, m7 ; r6 = c3*i5 - i5
161 movq m5, m1
162 pmulhw m1, m2 ; r1 = c5*i3 - i3
163 movq m3, I(1)
164 pmulhw m5, m7 ; r5 = c5*i5 - i5
165 movq m0, C(1)
166 paddw m4, m2 ; r4 = c3*i3
167 paddw m6, m7 ; r6 = c3*i5
168 paddw m2, m1 ; r2 = c5*i3
169 movq m1, J(7)
170 paddw m7, m5 ; r7 = c5*i5
171 movq m5, m0 ; r5 = c1
172 pmulhw m0, m3 ; r0 = c1*i1 - i1
173 paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5
174 pmulhw m5, m1 ; r5 = c1*i7 - i7
175 movq m7, C(7)
176 psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3
177 paddw m0, m3 ; r0 = c1*i1
178 pmulhw m3, m7 ; r3 = c7*i1
179 movq m2, I(2)
180 pmulhw m7, m1 ; r7 = c7*i7
181 paddw m5, m1 ; r5 = c1*i7
182 movq m1, m2 ; r1 = i2
183 pmulhw m2, C(2) ; r2 = c2*i2 - i2
184 psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7
185 movq m5, J(6)
186 paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7
187 movq m7, m5 ; r7 = i6
188 psubsw m0, m4 ; r0 = A - C
189 pmulhw m5, C(2) ; r5 = c2*i6 - i6
190 paddw m2, m1 ; r2 = c2*i2
191 pmulhw m1, C(6) ; r1 = c6*i2
192 paddsw m4, m4 ; r4 = C + C
193 paddsw m4, m0 ; r4 = C. = A + C
194 psubsw m3, m6 ; r3 = B - D
195 paddw m5, m7 ; r5 = c2*i6
196 paddsw m6, m6 ; r6 = D + D
197 pmulhw m7, C(6) ; r7 = c6*i6
198 paddsw m6, m3 ; r6 = D. = B + D
199 movq I(1), m4 ; save C. at I(1)
200 psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6
201 movq m4, C(4)
202 movq m5, m3 ; r5 = B - D
203 pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D)
204 paddsw m7, m2 ; r7 = G = c2*i2 + c6*i6
205 movq I(2), m6 ; save D. at I(2)
206 movq m2, m0 ; r2 = A - C
207 movq m6, I(0)
208 pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C)
209 paddw m5, m3 ; r5 = B. = c4 * (B - D)
210 movq m3, J(4)
211 psubsw m5, m1 ; r5 = B.. = B. - H
212 paddw m2, m0 ; r2 = A. = c4 * (A - C)
213 psubsw m6, m3 ; r6 = i0 - i4
214 movq m0, m6
215 pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4)
216 paddsw m3, m3 ; r3 = i4 + i4
217 paddsw m1, m1 ; r1 = H + H
218 paddsw m3, m0 ; r3 = i0 + i4
219 paddsw m1, m5 ; r1 = H. = B. + H
220 pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4)
221 paddsw m6, m0 ; r6 = F = c4 * (i0 - i4)
222 psubsw m6, m2 ; r6 = F. = F - A.
223 paddsw m2, m2 ; r2 = A. + A.
224 movq m0, I(1) ; r0 = C.
225 paddsw m2, m6 ; r2 = A.. = F + A.
226 paddw m4, m3 ; r4 = E = c4 * (i0 + i4)
227 psubsw m2, m1 ; r2 = R2 = A.. - H.
228 %endmacro
229
230 ; RowIDCT gets ready to transpose
231 %macro RowIDCT 0
232 BeginIDCT
233 movq m3, I(2) ; r3 = D.
234 psubsw m4, m7 ; r4 = E. = E - G
235 paddsw m1, m1 ; r1 = H. + H.
236 paddsw m7, m7 ; r7 = G + G
237 paddsw m1, m2 ; r1 = R1 = A.. + H.
238 paddsw m7, m4 ; r7 = G. = E + G
239 psubsw m4, m3 ; r4 = R4 = E. - D.
240 paddsw m3, m3
241 psubsw m6, m5 ; r6 = R6 = F. - B..
242 paddsw m5, m5
243 paddsw m3, m4 ; r3 = R3 = E. + D.
244 paddsw m5, m6 ; r5 = R5 = F. + B..
245 psubsw m7, m0 ; r7 = R7 = G. - C.
246 paddsw m0, m0
247 movq I(1), m1 ; save R1
248 paddsw m0, m7 ; r0 = R0 = G. + C.
249 %endmacro
250
251 ; Column IDCT normalizes and stores final results
252 %macro ColumnIDCT 0
253 BeginIDCT
254 paddsw m2, OC_8 ; adjust R2 (and R1) for shift
255 paddsw m1, m1 ; r1 = H. + H.
256 paddsw m1, m2 ; r1 = R1 = A.. + H.
257 psraw m2, 4 ; r2 = NR2
258 psubsw m4, m7 ; r4 = E. = E - G
259 psraw m1, 4 ; r1 = NR1
260 movq m3, I(2) ; r3 = D.
261 paddsw m7, m7 ; r7 = G + G
262 movq I(2), m2 ; store NR2 at I2
263 paddsw m7, m4 ; r7 = G. = E + G
264 movq I(1), m1 ; store NR1 at I1
265 psubsw m4, m3 ; r4 = R4 = E. - D.
266 paddsw m4, OC_8 ; adjust R4 (and R3) for shift
267 paddsw m3, m3 ; r3 = D. + D.
268 paddsw m3, m4 ; r3 = R3 = E. + D.
269 psraw m4, 4 ; r4 = NR4
270 psubsw m6, m5 ; r6 = R6 = F. - B..
271 psraw m3, 4 ; r3 = NR3
272 paddsw m6, OC_8 ; adjust R6 (and R5) for shift
273 paddsw m5, m5 ; r5 = B.. + B..
274 paddsw m5, m6 ; r5 = R5 = F. + B..
275 psraw m6, 4 ; r6 = NR6
276 movq J(4), m4 ; store NR4 at J4
277 psraw m5, 4 ; r5 = NR5
278 movq I(3), m3 ; store NR3 at I3
279 psubsw m7, m0 ; r7 = R7 = G. - C.
280 paddsw m7, OC_8 ; adjust R7 (and R0) for shift
281 paddsw m0, m0 ; r0 = C. + C.
282 paddsw m0, m7 ; r0 = R0 = G. + C.
283 psraw m7, 4 ; r7 = NR7
284 movq J(6), m6 ; store NR6 at J6
285 psraw m0, 4 ; r0 = NR0
286 movq J(5), m5 ; store NR5 at J5
287 movq J(7), m7 ; store NR7 at J7
288 movq I(0), m0 ; store NR0 at I0
289 %endmacro
290
291 ; Following macro does two 4x4 transposes in place.
292 ;
293 ; At entry (we assume):
294 ;
295 ; r0 = a3 a2 a1 a0
296 ; I(1) = b3 b2 b1 b0
297 ; r2 = c3 c2 c1 c0
298 ; r3 = d3 d2 d1 d0
299 ;
300 ; r4 = e3 e2 e1 e0
301 ; r5 = f3 f2 f1 f0
302 ; r6 = g3 g2 g1 g0
303 ; r7 = h3 h2 h1 h0
304 ;
305 ; At exit, we have:
306 ;
307 ; I(0) = d0 c0 b0 a0
308 ; I(1) = d1 c1 b1 a1
309 ; I(2) = d2 c2 b2 a2
310 ; I(3) = d3 c3 b3 a3
311 ;
312 ; J(4) = h0 g0 f0 e0
313 ; J(5) = h1 g1 f1 e1
314 ; J(6) = h2 g2 f2 e2
315 ; J(7) = h3 g3 f3 e3
316 ;
317 ; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
318 ; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
319 ;
320 ; Since r1 is free at entry, we calculate the Js first.
321 %macro Transpose 0
322 movq m1, m4 ; r1 = e3 e2 e1 e0
323 punpcklwd m4, m5 ; r4 = f1 e1 f0 e0
324 movq I(0), m0 ; save a3 a2 a1 a0
325 punpckhwd m1, m5 ; r1 = f3 e3 f2 e2
326 movq m0, m6 ; r0 = g3 g2 g1 g0
327 punpcklwd m6, m7 ; r6 = h1 g1 h0 g0
328 movq m5, m4 ; r5 = f1 e1 f0 e0
329 punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4
330 punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5
331 movq m6, m1 ; r6 = f3 e3 f2 e2
332 movq J(4), m4
333 punpckhwd m0, m7 ; r0 = h3 g3 h2 g2
334 movq J(5), m5
335 punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7
336 movq m4, I(0) ; r4 = a3 a2 a1 a0
337 punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6
338 movq m5, I(1) ; r5 = b3 b2 b1 b0
339 movq m0, m4 ; r0 = a3 a2 a1 a0
340 movq J(7), m6
341 punpcklwd m0, m5 ; r0 = b1 a1 b0 a0
342 movq J(6), m1
343 punpckhwd m4, m5 ; r4 = b3 a3 b2 a2
344 movq m5, m2 ; r5 = c3 c2 c1 c0
345 punpcklwd m2, m3 ; r2 = d1 c1 d0 c0
346 movq m1, m0 ; r1 = b1 a1 b0 a0
347 punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0
348 punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1
349 movq m2, m4 ; r2 = b3 a3 b2 a2
350 movq I(0), m0
351 punpckhwd m5, m3 ; r5 = d3 c3 d2 c2
352 movq I(1), m1
353 punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3
354 punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2
355 movq I(3), m4
356 movq I(2), m2
357 %endmacro
358
359 %macro VP3_IDCT_mmx 1
360 ; eax = quantized input
361 ; ebx = dequantizer matrix
362 ; ecx = IDCT constants
363 ; M(I) = ecx + MaskOffset(0) + I * 8
364 ; C(I) = ecx + CosineOffset(32) + (I-1) * 8
365 ; edx = output
366 ; r0..r7 = mm0..mm7
367 %define OC_8 [pw_8]
368 %define C(x) [vp3_idct_data+16*(x-1)]
369
370 ; at this point, function has completed dequantization + dezigzag +
371 ; partial transposition; now do the idct itself
372 %define I(x) [%1+16* x ]
373 %define J(x) [%1+16*(x-4)+8]
374 RowIDCT
375 Transpose
376
377 %define I(x) [%1+16* x +64]
378 %define J(x) [%1+16*(x-4)+72]
379 RowIDCT
380 Transpose
381
382 %define I(x) [%1+16*x]
383 %define J(x) [%1+16*x]
384 ColumnIDCT
385
386 %define I(x) [%1+16*x+8]
387 %define J(x) [%1+16*x+8]
388 ColumnIDCT
389 %endmacro
390
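Between the I()/J() redefinitions, VP3_IDCT_mmx therefore makes four passes over the 8x8 block of 16-bit coefficients: a 1-D IDCT plus transpose over each horizontal half (rows 0-3, then rows 4-7), followed by the normalizing 1-D IDCT (the +8, >>4 from OC_8/psraw) over each vertical half. A compile-only C outline with hypothetical helper names, just to make the addressing explicit:

#include <stdint.h>

/* stand-ins for the RowIDCT+Transpose and ColumnIDCT expansions */
static void row_idct_and_transpose(int16_t *rows) { (void)rows; }
static void column_idct(int16_t *cols)            { (void)cols; }

void vp3_idct_mmx_outline(int16_t block[64])  /* 8 rows of 8 coefficients */
{
    row_idct_and_transpose(block);      /* rows 0-3:  I(x)=+16*x,    J(x)=+16*(x-4)+8  */
    row_idct_and_transpose(block + 32); /* rows 4-7:  I(x)=+16*x+64, J(x)=+16*(x-4)+72 */
    column_idct(block);                 /* cols 0-3:  I(x)=J(x)=+16*x   */
    column_idct(block + 4);             /* cols 4-7:  I(x)=J(x)=+16*x+8 */
}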
391 %macro VP3_1D_IDCT_SSE2 0
392 movdqa m2, I(3) ; xmm2 = i3
393 movdqa m6, C(3) ; xmm6 = c3
394 movdqa m4, m2 ; xmm4 = i3
395 movdqa m7, I(5) ; xmm7 = i5
396 pmulhw m4, m6 ; xmm4 = c3 * i3 - i3
397 movdqa m1, C(5) ; xmm1 = c5
398 pmulhw m6, m7 ; xmm6 = c3 * i5 - i5
399 movdqa m5, m1 ; xmm5 = c5
400 pmulhw m1, m2 ; xmm1 = c5 * i3 - i3
401 movdqa m3, I(1) ; xmm3 = i1
402 pmulhw m5, m7 ; xmm5 = c5 * i5 - i5
403 movdqa m0, C(1) ; xmm0 = c1
404 paddw m4, m2 ; xmm4 = c3 * i3
405 paddw m6, m7 ; xmm6 = c3 * i5
406 paddw m2, m1 ; xmm2 = c5 * i3
407 movdqa m1, I(7) ; xmm1 = i7
408 paddw m7, m5 ; xmm7 = c5 * i5
409 movdqa m5, m0 ; xmm5 = c1
410 pmulhw m0, m3 ; xmm0 = c1 * i1 - i1
411 paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C
412 pmulhw m5, m1 ; xmm5 = c1 * i7 - i7
413 movdqa m7, C(7) ; xmm7 = c7
414 psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D
415 paddw m0, m3 ; xmm0 = c1 * i1
416 pmulhw m3, m7 ; xmm3 = c7 * i1
417 movdqa m2, I(2) ; xmm2 = i2
418 pmulhw m7, m1 ; xmm7 = c7 * i7
419 paddw m5, m1 ; xmm5 = c1 * i7
420 movdqa m1, m2 ; xmm1 = i2
421 pmulhw m2, C(2) ; xmm2 = c2 * i2 - i2
422 psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B
423 movdqa m5, I(6) ; xmm5 = i6
424 paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A
425 movdqa m7, m5 ; xmm7 = i6
426 psubsw m0, m4 ; xmm0 = A - C
427 pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6
428 paddw m2, m1 ; xmm2 = i2 * c2
429 pmulhw m1, C(6) ; xmm1 = c6 * i2
430 paddsw m4, m4 ; xmm4 = C + C
431 paddsw m4, m0 ; xmm4 = A + C = C.
432 psubsw m3, m6 ; xmm3 = B - D
433 paddw m5, m7 ; xmm5 = c2 * i6
434 paddsw m6, m6 ; xmm6 = D + D
435 pmulhw m7, C(6) ; xmm7 = c6 * i6
436 paddsw m6, m3 ; xmm6 = B + D = D.
437 movdqa I(1), m4 ; Save C. at I(1)
438 psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H
439 movdqa m4, C(4) ; xmm4 = C4
440 movdqa m5, m3 ; xmm5 = B - D
441 pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D )
442 paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G
443 movdqa I(2), m6 ; save D. at I(2)
444 movdqa m2, m0 ; xmm2 = A - C
445 movdqa m6, I(0) ; xmm6 = i0
446 pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
447 paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B.
448 movdqa m3, I(4) ; xmm3 = i4
449 psubsw m5, m1 ; xmm5 = B. - H = B..
450 paddw m2, m0 ; xmm2 = c4 * ( A - C) = A.
451 psubsw m6, m3 ; xmm6 = i0 - i4
452 movdqa m0, m6 ; xmm0 = i0 - i4
453 pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F
454 paddsw m3, m3 ; xmm3 = i4 + i4
455 paddsw m1, m1 ; xmm1 = H + H
456 paddsw m3, m0 ; xmm3 = i0 + i4
457 paddsw m1, m5 ; xmm1 = B. + H = H.
458 pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
459 paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 )
460 psubsw m6, m2 ; xmm6 = F - A. = F.
461 paddsw m2, m2 ; xmm2 = A. + A.
462 movdqa m0, I(1) ; Load C. from I(1)
463 paddsw m2, m6 ; xmm2 = F + A. = A..
464 paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E
465 psubsw m2, m1 ; xmm2 = A.. - H. = R2
466 ADD(m2) ; Adjust R2 and R1 before shifting
467 paddsw m1, m1 ; xmm1 = H. + H.
468 paddsw m1, m2 ; xmm1 = A.. + H. = R1
469 SHIFT(m2) ; xmm2 = op2
470 psubsw m4, m7 ; xmm4 = E - G = E.
471 SHIFT(m1) ; xmm1 = op1
472 movdqa m3, I(2) ; Load D. from I(2)
473 paddsw m7, m7 ; xmm7 = G + G
474 paddsw m7, m4 ; xmm7 = E + G = G.
475 psubsw m4, m3 ; xmm4 = E. - D. = R4
476 ADD(m4) ; Adjust R4 and R3 before shifting
477 paddsw m3, m3 ; xmm3 = D. + D.
478 paddsw m3, m4 ; xmm3 = E. + D. = R3
479 SHIFT(m4) ; xmm4 = op4
480 psubsw m6, m5 ; xmm6 = F. - B..= R6
481 SHIFT(m3) ; xmm3 = op3
482 ADD(m6) ; Adjust R6 and R5 before shifting
483 paddsw m5, m5 ; xmm5 = B.. + B..
484 paddsw m5, m6 ; xmm5 = F. + B.. = R5
485 SHIFT(m6) ; xmm6 = op6
486 SHIFT(m5) ; xmm5 = op5
487 psubsw m7, m0 ; xmm7 = G. - C. = R7
488 ADD(m7) ; Adjust R7 and R0 before shifting
489 paddsw m0, m0 ; xmm0 = C. + C.
490 paddsw m0, m7 ; xmm0 = G. + C.
491 SHIFT(m7) ; xmm7 = op7
492 SHIFT(m0) ; xmm0 = op0
493 %endmacro
494
495 %macro PUT_BLOCK 8
496 movdqa O(0), m%1
497 movdqa O(1), m%2
498 movdqa O(2), m%3
499 movdqa O(3), m%4
500 movdqa O(4), m%5
501 movdqa O(5), m%6
502 movdqa O(6), m%7
503 movdqa O(7), m%8
504 %endmacro
505
506 %macro VP3_IDCT_sse2 1
507 %define I(x) [%1+16*x]
508 %define O(x) [%1+16*x]
509 %define C(x) [vp3_idct_data+16*(x-1)]
510 %define SHIFT(x)
511 %define ADD(x)
512 VP3_1D_IDCT_SSE2
513 %ifdef ARCH_X86_64
514 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
515 %else
516 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
517 %endif
518 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
519
520 %define SHIFT(x) psraw x, 4
521 %define ADD(x) paddsw x, [pw_8]
522 VP3_1D_IDCT_SSE2
523 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
524 %endmacro
525
526 %macro vp3_idct_funcs 3
527 cglobal vp3_idct_%1, 1, 1, %2
528 VP3_IDCT_%1 r0
529 RET
530
531 cglobal vp3_idct_put_%1, 3, %3, %2
532 VP3_IDCT_%1 r2
533 %ifdef ARCH_X86_64
534 mov r3, r2
535 mov r2, r1
536 mov r1, r0
537 mov r0, r3
538 %else
539 mov r0m, r2
540 mov r1m, r0
541 mov r2m, r1
542 %endif
543 %ifdef WIN64
544 call put_signed_pixels_clamped_mmx
545 RET
546 %else
547 jmp put_signed_pixels_clamped_mmx
548 %endif
549
550 cglobal vp3_idct_add_%1, 3, %3, %2
551 VP3_IDCT_%1 r2
552 %ifdef ARCH_X86_64
553 mov r3, r2
554 mov r2, r1
555 mov r1, r0
556 mov r0, r3
557 %else
558 mov r0m, r2
559 mov r1m, r0
560 mov r2m, r1
561 %endif
562 %ifdef WIN64
563 call add_pixels_clamped_mmx
564 RET
565 %else
566 jmp add_pixels_clamped_mmx
567 %endif
568 %endmacro
569
570 %ifdef ARCH_X86_64
571 %define REGS 4
572 %else
573 %define REGS 3
574 %endif
575 INIT_MMX
576 vp3_idct_funcs mmx, 0, REGS
577 INIT_XMM
578 vp3_idct_funcs sse2, 9, REGS
579 %undef REGS
580
581 %macro DC_ADD 0
582 movq m2, [r0 ]
583 movq m3, [r0+r1 ]
584 paddusb m2, m0
585 movq m4, [r0+r1*2]
586 paddusb m3, m0
587 movq m5, [r0+r3 ]
588 paddusb m4, m0
589 paddusb m5, m0
590 psubusb m2, m1
591 psubusb m3, m1
592 movq [r0 ], m2
593 psubusb m4, m1
594 movq [r0+r1 ], m3
595 psubusb m5, m1
596 movq [r0+r1*2], m4
597 movq [r0+r3 ], m5
598 %endmacro
599
600 INIT_MMX
601 cglobal vp3_idct_dc_add_mmx2, 3, 4
602 %ifdef ARCH_X86_64
603 movsxd r1, r1d
604 %endif
605 lea r3, [r1*3]
606 movsx r2, word [r2]
607 add r2, 15
608 sar r2, 5
609 movd m0, r2d
610 pshufw m0, m0, 0x0
611 pxor m1, m1
612 psubw m1, m0
613 packuswb m0, m0
614 packuswb m1, m1
615 DC_ADD
616 lea r0, [r0+r1*4]
617 DC_ADD
618 RET
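The DC-only path reduces to: round the quantized DC coefficient to its spatial value with (dc + 15) >> 5, splat it, and add it to all 64 pixels with clamping (negating the splat into a second register lets paddusb/psubusb emulate a signed add with unsigned saturation). A plain C sketch of the same operation, with hypothetical names:

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

void vp3_idct_dc_add_sketch(uint8_t *dest, ptrdiff_t stride, const int16_t *block)
{
    int dc = (block[0] + 15) >> 5;           /* movsx / add 15 / sar 5 */
    for (int y = 0; y < 8; y++) {            /* two DC_ADD passes of 4 rows each */
        for (int x = 0; x < 8; x++)
            dest[x] = clip_uint8(dest[x] + dc);
        dest += stride;
    }
}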