Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(28)

Side by Side Diff: source/patched-ffmpeg-mt/libavcodec/x86/h264_chromamc.asm

Issue 3384002: ffmpeg source update for sep 09 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/ffmpeg/
Patch Set: Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;******************************************************************************
2 ;* MMX/SSSE3-optimized functions for H264 chroma MC
3 ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
4 ;* 2005-2008 Loren Merritt
5 ;*
6 ;* This file is part of FFmpeg.
7 ;*
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
22
23 %include "x86inc.asm"
24 %include "x86util.asm"
25
26 SECTION_RODATA
27
; RV40 rounding-constant tables. Each entry is 4 identical words (8 bytes),
; so an entry can be loaded directly as an MMX vector; the macros below
; index them as [rnd_*_rv40 + rnd_bias*8], where rnd_bias is derived from
; the (mx,my) chroma fractions (see the rv40 paths in the MC macros).
; 2d table: rounding added before the >>6 in the bilinear (2-D) filter.
28 rnd_rv40_2d_tbl: times 4 dw 0
29 times 4 dw 16
30 times 4 dw 32
31 times 4 dw 16
32 times 4 dw 32
33 times 4 dw 28
34 times 4 dw 32
35 times 4 dw 28
36 times 4 dw 0
37 times 4 dw 32
38 times 4 dw 16
39 times 4 dw 32
40 times 4 dw 32
41 times 4 dw 28
42 times 4 dw 32
43 times 4 dw 28
; 1d table: rounding added before the >>3 in the 1-D filter
; (each value is the corresponding 2d entry >> 3).
44 rnd_rv40_1d_tbl: times 4 dw 0
45 times 4 dw 2
46 times 4 dw 4
47 times 4 dw 2
48 times 4 dw 4
49 times 4 dw 3
50 times 4 dw 4
51 times 4 dw 3
52 times 4 dw 0
53 times 4 dw 4
54 times 4 dw 2
55 times 4 dw 4
56 times 4 dw 4
57 times 4 dw 3
58 times 4 dw 4
59 times 4 dw 3
60
61 cextern pw_3
62 cextern pw_4
63 cextern pw_8
64 cextern pw_28
65 cextern pw_32
66 cextern pw_64
67
68 SECTION .text
69
; Straight 8-byte-wide copy (or average, when CHROMAMC_AVG is DIRECT_AVG)
; from src (r1) to dst (r0) for the mx == 0 && my == 0 case: no filtering.
; In:  r0 = dst, r1 = src, r2 = stride, r3d = row count
;      (loop exits via `sub r3d, 4 / jne`, so r3d is assumed to be a
;      multiple of 4 — TODO confirm against callers).
; Clobbers: r4 (holds stride*2), mm0, mm1.
70 %macro mv0_pixels_mc8 0
71 lea r4, [r2*2 ]
72 .next4rows
; rows 0 and 1
73 movq mm0, [r1 ]
74 movq mm1, [r1+r2]
75 CHROMAMC_AVG mm0, [r0 ]
76 CHROMAMC_AVG mm1, [r0+r2]
77 movq [r0 ], mm0
78 movq [r0+r2], mm1
79 add r0, r4
80 add r1, r4
; rows 2 and 3
81 movq mm0, [r1 ]
82 movq mm1, [r1+r2]
83 CHROMAMC_AVG mm0, [r0 ]
84 CHROMAMC_AVG mm1, [r0+r2]
85 add r1, r4
86 movq [r0 ], mm0
87 movq [r0+r2], mm1
88 add r0, r4
89 sub r3d, 4
90 jne .next4rows
91 %endmacro
92
; Generate a put/avg 8-wide chroma MC function using MMX registers.
; %1 = put/avg (via CHROMAMC_AVG), %2 = codec (h264/vc1/rv40 — selects the
; rounding constants rnd_1d_%2 / rnd_2d_%2), %3 = ISA name suffix.
; Args: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; Three paths: plain copy (mx==my==0), 1-D filter (exactly one of mx/my
; non-zero), and full 2-D bilinear. For rv40 an extra per-(mx,my) rounding
; bias indexes rnd_rv40_*_tbl (8-byte entries, hence rnd_bias*8).
93 %macro chroma_mc8_mmx_func 3
94 ; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1 */,
95 ; int stride, int h, int mx, int my)
96 cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
97 %ifdef ARCH_X86_64
98 movsxd r2, r2d
99 %endif
100 mov r6d, r5d
101 or r6d, r4d
102 jne .at_least_one_non_zero
103 ; mx == 0 AND my == 0 - no filter needed
104 mv0_pixels_mc8
105 REP_RET
106
107 .at_least_one_non_zero
108 %ifidn %2, rv40
109 %ifdef PIC
110 %define rnd_1d_rv40 r11
111 %define rnd_2d_rv40 r11
112 %else ; no-PIC
113 %define rnd_1d_rv40 rnd_rv40_1d_tbl
114 %define rnd_2d_rv40 rnd_rv40_2d_tbl
115 %endif
; rv40 rounding bias: rnd_bias = ((my & 6) * 4 + mx) >> 1
116 %ifdef ARCH_X86_64
117 mov r10, r5
118 and r10, 6 ; &~1 for mx/my=[0,7]
119 lea r10, [r10*4+r4]
120 sar r10d, 1
121 %define rnd_bias r10
122 %define dest_reg r0
123 %else ; x86-32
; no spare reg on x86-32: r0 (dst) is sacrificed for the bias and
; r5 takes over as the destination pointer (reloaded from r0m below)
124 mov r0, r5
125 and r0, 6 ; &~1 for mx/my=[0,7]
126 lea r0, [r0*4+r4]
127 sar r0d, 1
128 %define rnd_bias r0
129 %define dest_reg r5
130 %endif
131 %else ; vc1, h264
132 %define rnd_bias 0
133 %define dest_reg r0
134 %endif
135
136 test r5d, r5d
137 mov r6, 1
138 je .my_is_zero
139 test r4d, r4d
140 mov r6, r2 ; dxy = x ? 1 : stride
141 jne .both_non_zero
142 .my_is_zero
143 ; mx == 0 XOR my == 0 - 1 dimensional filter only
144 or r4d, r5d ; x + y
145
146 %ifidn %2, rv40
147 %ifdef PIC
148 lea r11, [rnd_rv40_1d_tbl]
149 %endif
150 %ifndef ARCH_X86_64
151 mov r5, r0m ; reload dst pointer into dest_reg (r5)
152 %endif
153 %endif
154
155 movd m5, r4d
156 movq m4, [pw_8]
157 movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
158 punpcklwd m5, m5
159 punpckldq m5, m5 ; mm5 = B = x
160 pxor m7, m7
161 psubw m4, m5 ; mm4 = A = 8-x
162
163 .next1drow
164 movq m0, [r1 ] ; mm0 = src[0..7]
165 movq m2, [r1+r6] ; mm2 = src[1..8]
166
; widen bytes to words: [m0,m1] = src[0..7], [m2,m3] = src[1..8]
167 movq m1, m0
168 movq m3, m2
169 punpcklbw m0, m7
170 punpckhbw m1, m7
171 punpcklbw m2, m7
172 punpckhbw m3, m7
173 pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
174 pmullw m1, m4
175 pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
176 pmullw m3, m5
177
178 paddw m0, m6
179 paddw m1, m6
180 paddw m0, m2
181 paddw m1, m3
182 psrlw m0, 3
183 psrlw m1, 3
184 packuswb m0, m1
185 CHROMAMC_AVG m0, [dest_reg]
186 movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
187
188 add dest_reg, r2
189 add r1, r2
190 dec r3d
191 jne .next1drow
192 REP_RET
193
194 .both_non_zero ; general case, bilinear
195 movd m4, r4d ; x
196 movd m6, r5d ; y
197 %ifidn %2, rv40
198 %ifdef PIC
199 lea r11, [rnd_rv40_2d_tbl]
200 %endif
201 %ifndef ARCH_X86_64
202 mov r5, r0m ; reload dst pointer into dest_reg (r5)
203 %endif
204 %endif
; spill A and D coefficient vectors to an aligned 16-byte stack slot
; (not enough MMX registers to keep all four coefficients live)
205 mov r6, rsp ; backup stack pointer
206 and rsp, ~(mmsize-1) ; align stack
207 sub rsp, 16 ; AA and DD
208
209 punpcklwd m4, m4
210 punpcklwd m6, m6
211 punpckldq m4, m4 ; mm4 = x words
212 punpckldq m6, m6 ; mm6 = y words
213 movq m5, m4
214 pmullw m4, m6 ; mm4 = x * y
215 psllw m5, 3
216 psllw m6, 3
217 movq m7, m5
218 paddw m7, m6
219 movq [rsp+8], m4 ; DD = x * y
220 psubw m5, m4 ; mm5 = B = 8x - xy
221 psubw m6, m4 ; mm6 = C = 8y - xy
222 paddw m4, [pw_64]
223 psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
224 pxor m7, m7
225 movq [rsp ], m4
226
; software pipeline: the current row is loaded before the loop /
; at the end of the previous iteration (m0/m1)
227 movq m0, [r1 ] ; mm0 = src[0..7]
228 movq m1, [r1+1] ; mm1 = src[1..8]
229 .next2drow
230 add r1, r2
231
232 movq m2, m0
233 movq m3, m1
234 punpckhbw m0, m7
235 punpcklbw m1, m7
236 punpcklbw m2, m7
237 punpckhbw m3, m7
238 pmullw m0, [rsp] ; A (spilled above)
239 pmullw m2, [rsp]
240 pmullw m1, m5
241 pmullw m3, m5
242 paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
243 paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]
244
; next row, offset 0: += C * src
245 movq m0, [r1]
246 movq m1, m0
247 punpcklbw m0, m7
248 punpckhbw m1, m7
249 pmullw m0, m6
250 pmullw m1, m6
251 paddw m2, m0
252 paddw m3, m1 ; [mm2,mm3] += C * src[0..7]
253
; next row, offset 1: += D * src (D spilled at [rsp+8])
254 movq m1, [r1+1]
255 movq m0, m1
256 movq m4, m1
257 punpcklbw m0, m7
258 punpckhbw m4, m7
259 pmullw m0, [rsp+8]
260 pmullw m4, [rsp+8]
261 paddw m2, m0
262 paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
263 movq m0, [r1] ; reload row for next iteration (m1 already holds [r1+1])
264
265 paddw m2, [rnd_2d_%2+rnd_bias*8]
266 paddw m3, [rnd_2d_%2+rnd_bias*8]
267 psrlw m2, 6
268 psrlw m3, 6
269 packuswb m2, m3
270 CHROMAMC_AVG m2, [dest_reg]
271 movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
272
273 add dest_reg, r2
274 dec r3d
275 jne .next2drow
276 mov rsp, r6 ; restore stack pointer
277 RET
278 %endmacro
279
; Generate a put/avg 4-wide chroma MC function (MMX), always the full 2-D
; bilinear filter. %1 = put/avg (via CHROMAMC_AVG4), %2 = codec, %3 = ISA
; suffix. Args: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; Processes 2 rows per loop iteration; the horizontal pass of each row is
; reused as the "previous row" of the next one (software pipelining), so
; each source row is filtered horizontally only once.
280 %macro chroma_mc4_mmx_func 3
281 cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
282 %ifdef ARCH_X86_64
283 movsxd r2, r2d
284 %endif
285 pxor m7, m7
286 movd m2, r4d ; x
287 movd m3, r5d ; y
288 movq m4, [pw_8]
289 movq m5, [pw_8]
; broadcast x and y to all 4 words; m4 = 8-x, m5 = 8-y
290 punpcklwd m2, m2
291 punpcklwd m3, m3
292 punpcklwd m2, m2
293 punpcklwd m3, m3
294 psubw m4, m2
295 psubw m5, m3
296
297 %ifidn %2, rv40
298 %ifdef PIC
299 lea r11, [rnd_rv40_2d_tbl]
300 %define rnd_2d_rv40 r11
301 %else
302 %define rnd_2d_rv40 rnd_rv40_2d_tbl
303 %endif
; rv40 rounding bias: rnd_bias = ((my & 6) * 4 + mx) >> 1
304 and r5, 6 ; &~1 for mx/my=[0,7]
305 lea r5, [r5*4+r4]
306 sar r5d, 1
307 %define rnd_bias r5
308 %else ; vc1, h264
309 %define rnd_bias 0
310 %endif
311
; prologue: horizontal filter of row 0 -> m6 = (8-x)*src[0..3] + x*src[1..4]
312 movd m0, [r1 ]
313 movd m6, [r1+1]
314 add r1, r2
315 punpcklbw m0, m7
316 punpcklbw m6, m7
317 pmullw m0, m4
318 pmullw m6, m2
319 paddw m6, m0
320
321 .next2rows
; first output row: horizontal filter of the next source row into m1,
; then blend with m6 (previous row) vertically
322 movd m0, [r1 ]
323 movd m1, [r1+1]
324 add r1, r2
325 punpcklbw m0, m7
326 punpcklbw m1, m7
327 pmullw m0, m4
328 pmullw m1, m2
329 paddw m1, m0
330 movq m0, m1 ; keep this row's horizontal result for the 2nd output row
331
332 pmullw m6, m5 ; (8-y) * prev row
333 pmullw m1, m3 ; y * cur row
334 paddw m6, [rnd_2d_%2+rnd_bias*8]
335 paddw m1, m6
336 psrlw m1, 6
337 packuswb m1, m1
338 CHROMAMC_AVG4 m1, m6, [r0]
339 movd [r0], m1
340 add r0, r2
341
; second output row: same scheme, roles of m6/m0 swapped
342 movd m6, [r1 ]
343 movd m1, [r1+1]
344 add r1, r2
345 punpcklbw m6, m7
346 punpcklbw m1, m7
347 pmullw m6, m4
348 pmullw m1, m2
349 paddw m1, m6
350 movq m6, m1 ; becomes "previous row" of the next iteration
351 pmullw m0, m5
352 pmullw m1, m3
353 paddw m0, [rnd_2d_%2+rnd_bias*8]
354 paddw m1, m0
355 psrlw m1, 6
356 packuswb m1, m1
357 CHROMAMC_AVG4 m1, m0, [r0]
358 movd [r0], m1
359 add r0, r2
360 sub r3d, 2
361 jnz .next2rows
362 REP_RET
363 %endmacro
364
; Generate a put/avg 2-wide chroma MC function (MMX2 — uses pshufw).
; %1 = put/avg (via CHROMAMC_AVG4), %2 = codec, %3 = ISA suffix.
; Args: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; Packs the four bilinear coefficients A=(8-x)(8-y), B=x(8-y), C=(8-x)y,
; D=xy into word pairs so one pmaddwd per row pair does the filtering.
; Only instantiated for h264 below, so rnd_2d_%2 needs no rv40 bias here.
365 %macro chroma_mc2_mmx_func 3
366 cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
367 %ifdef ARCH_X86_64
368 movsxd r2, r2d
369 %endif
370
; build packed coefficients in r4d/r5d:
371 mov r6d, r4d
372 shl r4d, 16
373 sub r4d, r6d
374 add r4d, 8 ; r4d = x<<16 | (8-x)  (i.e. x*65535+8)
375 imul r5d, r4d ; x*y<<16 | y*(8-x)
376 shl r4d, 3
377 sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
378
379 movd m5, r4d
380 movd m6, r5d
381 punpckldq m5, m5 ; mm5 = {A,B,A,B}
382 punpckldq m6, m6 ; mm6 = {C,D,C,D}
383 pxor m7, m7
; prologue: load and pre-shuffle row 0
384 movd m2, [r1]
385 punpcklbw m2, m7
386 pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2]
387
388 .nextrow
389 add r1, r2
390 movq m1, m2
391 pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
392 movd m0, [r1]
393 punpcklbw m0, m7
394 pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
395 movq m2, m0 ; keep next row for the following iteration
396 pmaddwd m0, m6
397 paddw m1, [rnd_2d_%2]
398 paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
399 psrlw m1, 6
400 packssdw m1, m7
401 packuswb m1, m7
402 CHROMAMC_AVG4 m1, m3, [r0]
403 movd r5d, m1
404 mov [r0], r5w ; store the 2 output pixels
405 add r0, r2
406 sub r3d, 1
407 jnz .nextrow
408 REP_RET
409 %endmacro
410
; Per-codec rounding constants: h264 rounds with 4 (1-D, >>3) / 32 (2-D,
; >>6); vc1 uses the "no rounding" variants 3 / 28. rv40 uses the tables
; at the top of the file instead.
411 %define rnd_1d_h264 pw_4
412 %define rnd_2d_h264 pw_32
413 %define rnd_1d_vc1 pw_3
414 %define rnd_2d_vc1 pw_28
415
; CHROMAMC_AVG/CHROMAMC_AVG4 plug-ins selected before each instantiation:
; NOTHING    - put variant: discard the dst operand, no averaging
416 %macro NOTHING 2-3
417 %endmacro
; DIRECT_AVG - avg variant: average %1 with a register/memory operand
418 %macro DIRECT_AVG 2
419 PAVG %1, %2
420 %endmacro
; COPY_AVG   - avg variant for <8-byte stores: load dst (%3) into scratch
;              register %2 first, then average
421 %macro COPY_AVG 3
422 movd %2, %3
423 PAVG %1, %2
424 %endmacro
425
; --- put variants: no averaging ---
426 INIT_MMX
427 %define CHROMAMC_AVG NOTHING
428 %define CHROMAMC_AVG4 NOTHING
429 chroma_mc8_mmx_func put, h264, mmx_rnd
430 chroma_mc8_mmx_func put, vc1, mmx_nornd
431 chroma_mc8_mmx_func put, rv40, mmx
432 chroma_mc4_mmx_func put, h264, mmx
433 chroma_mc4_mmx_func put, rv40, mmx
434 chroma_mc2_mmx_func put, h264, mmx2
435
; --- avg variants, MMX2: average with dst via pavgb ---
436 %define CHROMAMC_AVG DIRECT_AVG
437 %define CHROMAMC_AVG4 COPY_AVG
438 %define PAVG pavgb
439 chroma_mc8_mmx_func avg, h264, mmx2_rnd
440 chroma_mc8_mmx_func avg, vc1, mmx2_nornd
441 chroma_mc8_mmx_func avg, rv40, mmx2
442 chroma_mc4_mmx_func avg, h264, mmx2
443 chroma_mc4_mmx_func avg, rv40, mmx2
444 chroma_mc2_mmx_func avg, h264, mmx2
445
; --- avg variants, 3DNow: same bodies, pavgusb instead of pavgb ---
446 %define PAVG pavgusb
447 chroma_mc8_mmx_func avg, h264, 3dnow_rnd
448 chroma_mc8_mmx_func avg, vc1, 3dnow_nornd
449 chroma_mc8_mmx_func avg, rv40, 3dnow
450 chroma_mc4_mmx_func avg, h264, 3dnow
451 chroma_mc4_mmx_func avg, rv40, 3dnow
452
; Generate a put/avg 8-wide chroma MC function using SSSE3 (XMM regs,
; pmaddubsw). %1 = put/avg, %2 = codec (h264/vc1 only here), %3 = suffix.
; Args: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; Coefficient pairs are packed as bytes (hi<<8 | lo) so pmaddubsw computes
; both taps at once on interleaved source bytes. Processes 2 rows per
; iteration; h is assumed even (loop is `sub r3d, 2 / jg`).
453 %macro chroma_mc8_ssse3_func 3
454 cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
455 %ifdef ARCH_X86_64
456 movsxd r2, r2d
457 %endif
458 mov r6d, r5d
459 or r6d, r4d
460 jne .at_least_one_non_zero
461 ; mx == 0 AND my == 0 - no filter needed
462 mv0_pixels_mc8
463 REP_RET
464
465 .at_least_one_non_zero
466 test r5d, r5d
467 je .my_is_zero
468 test r4d, r4d
469 je .mx_is_zero
470
471 ; general case, bilinear
472 mov r6d, r4d
473 shl r4d, 8
474 sub r4, r6
475 add r4, 8 ; x*255+8 = x<<8 | (8-x)
476 mov r6, 8
477 sub r6d, r5d
478 imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
479 imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
480
; broadcast the two packed coefficient pairs to all 16 bytes
481 movd m7, r6d ; m7 = {B,A} pairs (top row taps)
482 movd m6, r4d ; m6 = {D,C} pairs (bottom row taps)
483 movdqa m5, [rnd_2d_%2]
484 pshuflw m7, m7, 0
485 pshuflw m6, m6, 0
486 movlhps m7, m7
487 movlhps m6, m6
488
; prologue: interleave src[0..7]/src[1..8] of row 0 (reused each iteration)
489 movq m0, [r1 ]
490 movq m1, [r1 +1]
491 punpcklbw m0, m1
492 add r1, r2
493 .next2rows
494 movq m1, [r1 ]
495 movq m2, [r1 +1]
496 movq m3, [r1+r2 ]
497 movq m4, [r1+r2+1]
498 lea r1, [r1+r2*2]
499 punpcklbw m1, m2
500 punpcklbw m3, m4
501 movdqa m2, m1
502 movdqa m4, m3
503 pmaddubsw m0, m7 ; top-row contribution, output row 0
504 pmaddubsw m1, m6 ; bottom-row contribution, output row 0
505 pmaddubsw m2, m7 ; top-row contribution, output row 1
506 pmaddubsw m3, m6 ; bottom-row contribution, output row 1
507 paddw m0, m5 ; + rounding
508 paddw m2, m5
509 paddw m1, m0
510 paddw m3, m2
511 movdqa m0, m4 ; carry last interleaved row into next iteration
512 psrlw m1, 6
513 psrlw m3, 6
514 %ifidn %1, avg
515 movq m2, [r0 ]
516 movhps m2, [r0+r2]
517 %endif
518 packuswb m1, m3
519 CHROMAMC_AVG m1, m2
520 movq [r0 ], m1
521 movhps [r0+r2], m1
522 sub r3d, 2
523 lea r0, [r0+r2*2]
524 jg .next2rows
525 REP_RET
526
527 .my_is_zero
; horizontal-only 1-D filter: coefficients x<<8 | (8-x)
528 mov r5d, r4d
529 shl r4d, 8
530 add r4, 8
531 sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
532 movd m7, r4d
533 movq m6, [rnd_1d_%2]
534 pshuflw m7, m7, 0
535 movlhps m6, m6
536 movlhps m7, m7
537
538 .next2xrows
539 movq m0, [r1 ]
540 movq m1, [r1 +1]
541 movq m2, [r1+r2 ]
542 movq m3, [r1+r2+1]
543 punpcklbw m0, m1
544 punpcklbw m2, m3
545 pmaddubsw m0, m7
546 pmaddubsw m2, m7
547 %ifidn %1, avg
548 movq m4, [r0 ]
549 movhps m4, [r0+r2]
550 %endif
551 paddw m0, m6
552 paddw m2, m6
553 psrlw m0, 3
554 psrlw m2, 3
555 packuswb m0, m2
556 CHROMAMC_AVG m0, m4
557 movq [r0 ], m0
558 movhps [r0+r2], m0
559 sub r3d, 2
560 lea r0, [r0+r2*2]
561 lea r1, [r1+r2*2]
562 jg .next2xrows
563 REP_RET
564
565 .mx_is_zero
; vertical-only 1-D filter: coefficients y<<8 | (8-y), interleaving
; vertically adjacent rows instead of horizontally adjacent pixels
566 mov r4d, r5d
567 shl r5d, 8
568 add r5, 8
569 sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
570 movd m7, r5d
571 movq m6, [rnd_1d_%2]
572 pshuflw m7, m7, 0
573 movlhps m6, m6
574 movlhps m7, m7
575
576 .next2yrows
577 movq m0, [r1 ]
578 movq m1, [r1+r2 ]
579 movdqa m2, m1
580 movq m3, [r1+r2*2]
581 punpcklbw m0, m1
582 punpcklbw m2, m3
583 pmaddubsw m0, m7
584 pmaddubsw m2, m7
585 %ifidn %1, avg
586 movq m4, [r0 ]
587 movhps m4, [r0+r2]
588 %endif
589 paddw m0, m6
590 paddw m2, m6
591 psrlw m0, 3
592 psrlw m2, 3
593 packuswb m0, m2
594 CHROMAMC_AVG m0, m4
595 movq [r0 ], m0
596 movhps [r0+r2], m0
597 sub r3d, 2
598 lea r0, [r0+r2*2]
599 lea r1, [r1+r2*2]
600 jg .next2yrows
601 REP_RET
602 %endmacro
603
; Generate a put/avg 4-wide chroma MC function using SSSE3 pmaddubsw on
; MMX registers (INIT_MMX at instantiation). Always the full 2-D bilinear
; path. %1 = put/avg, %2 = codec (h264 only here), %3 = suffix.
; Args: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; 2 rows per iteration; h assumed even (`sub r3d, 2 / jg`).
604 %macro chroma_mc4_ssse3_func 3
605 cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
606 %ifdef ARCH_X86_64
607 movsxd r2, r2d
608 %endif
; pack coefficient pairs as bytes for pmaddubsw (cf. mc8 ssse3 above)
609 mov r6, r4
610 shl r4d, 8
611 sub r4d, r6d
612 add r4d, 8 ; x*255+8 = x<<8 | (8-x)
613 mov r6, 8
614 sub r6d, r5d
615 imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
616 imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
617
618 movd m7, r6d ; top-row taps
619 movd m6, r4d ; bottom-row taps
620 movq m5, [pw_32] ; h264 2-D rounding constant
621 pshufw m7, m7, 0
622 pshufw m6, m6, 0
623
; prologue: interleaved row 0, reused as "previous row" in the loop
624 movd m0, [r1 ]
625 punpcklbw m0, [r1 +1]
626 add r1, r2
627 .next2rows
628 movd m1, [r1 ]
629 movd m3, [r1+r2 ]
630 punpcklbw m1, [r1 +1]
631 punpcklbw m3, [r1+r2+1]
632 lea r1, [r1+r2*2]
633 movq m2, m1
634 movq m4, m3
635 pmaddubsw m0, m7
636 pmaddubsw m1, m6
637 pmaddubsw m2, m7
638 pmaddubsw m3, m6
639 paddw m0, m5
640 paddw m2, m5
641 paddw m1, m0
642 paddw m3, m2
643 movq m0, m4 ; carry last interleaved row into next iteration
644 psrlw m1, 6
645 psrlw m3, 6
646 packuswb m1, m1
647 packuswb m3, m3
648 CHROMAMC_AVG m1, [r0 ]
649 CHROMAMC_AVG m3, [r0+r2]
650 movd [r0 ], m1
651 movd [r0+r2], m3
652 sub r3d, 2
653 lea r0, [r0+r2*2]
654 jg .next2rows
655 REP_RET
656 %endmacro
657
; SSSE3 instantiations. mc8 uses XMM registers; mc4 uses SSSE3 integer
; ops on MMX registers, hence the INIT_MMX switch between them.
658 %define CHROMAMC_AVG NOTHING
659 INIT_XMM
660 chroma_mc8_ssse3_func put, h264, ssse3_rnd
661 chroma_mc8_ssse3_func put, vc1, ssse3_nornd
662 INIT_MMX
663 chroma_mc4_ssse3_func put, h264, ssse3
664
; avg variants: average with dst via pavgb
665 %define CHROMAMC_AVG DIRECT_AVG
666 %define PAVG pavgb
667 INIT_XMM
668 chroma_mc8_ssse3_func avg, h264, ssse3_rnd
669 chroma_mc8_ssse3_func avg, vc1, ssse3_nornd
670 INIT_MMX
671 chroma_mc4_ssse3_func avg, h264, ssse3
OLDNEW
« no previous file with comments | « source/patched-ffmpeg-mt/libavcodec/x86/fft_sse.c ('k') | source/patched-ffmpeg-mt/libavcodec/x86/h264_deblock.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698