Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(562)

Side by Side Diff: source/patched-ffmpeg-mt/libavcodec/x86/fft_mmx.asm

Issue 3384002: ffmpeg source update for sep 09 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/ffmpeg/
Patch Set: Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ;****************************************************************************** 1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations 2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt 3 ;* Copyright (c) 2008 Loren Merritt
4 ;* 4 ;*
5 ;* This algorithm (though not any of the implementation details) is 5 ;* This algorithm (though not any of the implementation details) is
6 ;* based on libdjbfft by D. J. Bernstein. 6 ;* based on libdjbfft by D. J. Bernstein.
7 ;* 7 ;*
8 ;* This file is part of FFmpeg. 8 ;* This file is part of FFmpeg.
9 ;* 9 ;*
10 ;* FFmpeg is free software; you can redistribute it and/or 10 ;* FFmpeg is free software; you can redistribute it and/or
(...skipping 11 matching lines...) Expand all
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;****************************************************************************** 23 ;******************************************************************************
24 24
25 ; These functions are not individually interchangeable with the C versions. 25 ; These functions are not individually interchangeable with the C versions.
26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results 26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
27 ; in blocks as convenient to the vector size. 27 ; in blocks as convenient to the vector size.
28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) 28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
29 29
30 %include "x86inc.asm" 30 %include "x86inc.asm"
31 31
; Select the reservation directive used for pointer-sized struc members:
; resq (8 bytes) when ARCH_X86_64 is defined, resd (4 bytes) otherwise.
32 %ifdef ARCH_X86_64
33 %define pointer resq
34 %else
35 %define pointer resd
36 %endif
37
; Field offsets of FFTContext as consumed by the assembly in this file.
; NOTE(review): presumably this has to mirror the member order and sizes of
; the C-side FFTContext struct -- confirm against the C header on any change.
38 struc FFTContext
; 32-bit field; imdct_half_sse loads it and uses it as the FFT_DISPATCH index
39 .nbits: resd 1
40 .reverse: resd 1
; pointer-sized field; imdct_half_sse reads 16-bit entries through it
41 .revtab: pointer 1
42 .tmpbuf: pointer 1
; 32-bit field; imdct_half_sse uses it as a byte offset/count
43 .mdctsize: resd 1
44 .mdctbits: resd 1
; pointer-sized fields; imdct_half_sse hands them (offset by mdctsize/2 bytes)
; to PREROTATER and POSROTATESHUF as the tcos/tsin arguments
45 .tcos: pointer 1
46 .tsin: pointer 1
47 endstruc
48
32 SECTION_RODATA 49 SECTION_RODATA
33 50
34 %define M_SQRT1_2 0.70710678118654752440 51 %define M_SQRT1_2 0.70710678118654752440
35 ps_root2: times 4 dd M_SQRT1_2 52 ps_root2: times 4 dd M_SQRT1_2
36 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 53 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
37 ps_m1p1: dd 1<<31, 0 54 ps_m1p1: dd 1<<31, 0
38 55
39 %assign i 16 56 %assign i 16
40 %rep 13 57 %rep 13
41 cextern cos_ %+ i 58 cextern cos_ %+ i
(...skipping 379 matching lines...) Expand 10 before | Expand all | Expand 10 after
421 DECL_PASS pass_interleave_3dn, PASS_BIG 0 438 DECL_PASS pass_interleave_3dn, PASS_BIG 0
422 %define pass_3dn2 pass_3dn 439 %define pass_3dn2 pass_3dn
423 %define pass_interleave_3dn2 pass_interleave_3dn 440 %define pass_interleave_3dn2 pass_interleave_3dn
424 441
425 %ifdef PIC 442 %ifdef PIC
426 %define SECTION_REL - $$ 443 %define SECTION_REL - $$
427 %else 444 %else
428 %define SECTION_REL 445 %define SECTION_REL
429 %endif 446 %endif
430 447
; FFT_DISPATCH suffix, index
;   %1  text appended to "dispatch_tab" to name the jump table to use
;   %2  register name without a size suffix ("q" is appended here) that holds
;       the table index; entry (index - 2) is fetched, gprsize bytes per entry
;       (the tables built by DECL_FFT begin with the fft4 entry)
; Performs an indirect call through the selected entry. r2 is always
; overwritten with the call target, and r3 as well when PIC is defined,
; before the call is made.
448 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
449 lea r2, [dispatch_tab%1]
450 mov r2, [r2 + (%2q-2)*gprsize]
451 %ifdef PIC
; under PIC the table entries are stored relative to the section start
; (SECTION_REL expands to "- $$"), so add the section base back
452 lea r3, [$$]
453 add r2, r3
454 %endif
455 call r2
456 %endmacro ; FFT_DISPATCH
457
431 %macro DECL_FFT 2-3 ; nbits, cpu, suffix 458 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
432 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL 459 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
433 %if %1==5 460 %if %1==5
434 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL 461 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
435 %endif 462 %endif
436 463
437 %assign n 1<<%1 464 %assign n 1<<%1
438 %rep 17-%1 465 %rep 17-%1
439 %assign n2 n/2 466 %assign n2 n/2
440 %assign n4 n/4 467 %assign n4 n/4
(...skipping 16 matching lines...) Expand all
457 %undef n 484 %undef n
458 485
459 align 8 486 align 8
460 dispatch_tab%3%2: pointer list_of_fft 487 dispatch_tab%3%2: pointer list_of_fft
461 488
462 section .text 489 section .text
463 490
464 ; On x86_32, this function does the register saving and restoring for all of fft. 491 ; On x86_32, this function does the register saving and restoring for all of fft.
465 ; The others pass args in registers and don't spill anything. 492 ; The others pass args in registers and don't spill anything.
466 cglobal fft_dispatch%3%2, 2,5,8, z, nbits 493 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
467 lea r2, [dispatch_tab%3%2] 494 FFT_DISPATCH %3%2, nbits
468 mov r2, [r2 + (nbitsq-2)*gprsize]
469 %ifdef PIC
470 lea r3, [$$]
471 add r2, r3
472 %endif
473 call r2
474 RET 495 RET
475 %endmacro ; DECL_FFT 496 %endmacro ; DECL_FFT
476 497
477 DECL_FFT 5, _sse 498 DECL_FFT 5, _sse
478 DECL_FFT 5, _sse, _interleave 499 DECL_FFT 5, _sse, _interleave
479 DECL_FFT 4, _3dn 500 DECL_FFT 4, _3dn
480 DECL_FFT 4, _3dn, _interleave 501 DECL_FFT 4, _3dn, _interleave
481 DECL_FFT 4, _3dn2 502 DECL_FFT 4, _3dn2
482 DECL_FFT 4, _3dn2, _interleave 503 DECL_FFT 4, _3dn2, _interleave
483 504
; NOTE(review): INIT_XMM comes from x86inc.asm, which is not visible here.
; Presumably it (or earlier code in this file) defines macros named after the
; mnemonics below, and the %undefs make the following macros assemble the
; plain SSE instructions with explicit xmmN operands -- confirm in x86inc.asm.
505 INIT_XMM
506 %undef mulps
507 %undef addps
508 %undef subps
509 %undef unpcklps
510 %undef unpckhps
511
; PREROTATER: loads two 16-byte input vectors and rotates them by the
; cos/sin table values.
;   %1, %2  index registers (described as -2*k and 2*k by the parameter list)
;   %3      input base, %4 cos table base, %5 sin table base
; Results are left in xmm0 and xmm1; xmm0-xmm5 are all overwritten.
; The movaps loads require [%3+%2*4] and [%3+%1*4-0x10] to be 16-byte aligned.
512 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
; a = [%3+%2*4], b = [%3+%1*4-0x10]
513 movaps xmm0, [%3+%2*4]
514 movaps xmm1, [%3+%1*4-0x10]
515 movaps xmm2, xmm0
; xmm0 = {a0, a2, b0, b2}  (even-numbered elements)
516 shufps xmm0, xmm1, 0x88
; xmm1 = {b3, b1, a3, a1}  (odd-numbered elements, taken in reverse order)
517 shufps xmm1, xmm2, 0x77
; xmm4 = cos values: low half from index %2, high half from index %1
; xmm5 = sin values: same addressing on the sin table
518 movlps xmm4, [%4+%2*2]
519 movlps xmm5, [%5+%2*2+0x0]
520 movhps xmm4, [%4+%1*2-0x8]
521 movhps xmm5, [%5+%1*2-0x8]
522 movaps xmm2, xmm0
523 movaps xmm3, xmm1
; with E = even vector and O = odd vector from above:
;   xmm1 = O*cos - E*sin
;   xmm2 = E*cos + O*sin
524 mulps xmm0, xmm5
525 mulps xmm1, xmm4
526 mulps xmm2, xmm4
527 mulps xmm3, xmm5
528 subps xmm1, xmm0
529 addps xmm2, xmm3
; interleave the two result vectors lane by lane:
; xmm1 receives the pairs from lanes 0-1, xmm0 the pairs from lanes 2-3
530 movaps xmm0, xmm1
531 unpcklps xmm1, xmm2
532 unpckhps xmm0, xmm2
533 %endmacro
534
; CMUL: element-wise rotation of two adjacent 16-byte vectors by table values.
;   %1      byte index register
;   %2, %3  destination xmm registers
;   %4      data base, %5 and %6 table bases (POSROTATESHUF passes its
;           tcos argument as %5 and its tsin argument as %6)
; With a = [%4+%1*2], b = [%4+%1*2+0x10], c = [%5+%1], s = [%6+%1]:
;   %2 = b*s - a*c
;   %3 = a*s + b*c
; xmm6 and xmm7 are used as temporaries and are overwritten.
535 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
536 movaps xmm6, [%4+%1*2]
537 movaps %2, [%4+%1*2+0x10]
538 movaps %3, xmm6
539 movaps xmm7, %2
540 mulps xmm6, [%5+%1]
541 mulps %2, [%6+%1]
542 mulps %3, [%6+%1]
543 mulps xmm7, [%5+%1]
544 subps %2, xmm6
545 addps %3, xmm7
546 %endmacro
547
; POSROTATESHUF: loop that rotates (via CMUL) the 32-byte groups at byte
; offsets %1*2 and %2*2 from %3, shuffles the results together and stores
; them back over the same four 16-byte slots.
;   %1, %2  index registers; both are modified (%1 += 0x10, %2 -= 0x10 per pass)
;   %3      data base, %4 / %5 table bases forwarded to CMUL
; The loop repeats while the signed result of the final add to %1 is negative.
; Overwrites xmm0-xmm7. Defines the local label .post, so it can be expanded
; at most once under any given non-local label.
548 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
549 .post:
550 CMUL %1, xmm0, xmm1, %3, %4, %5
551 CMUL %2, xmm4, xmm5, %3, %4, %5
; 0x1b reverses the order of the four elements in xmm1 and in xmm5
552 shufps xmm1, xmm1, 0x1b
553 shufps xmm5, xmm5, 0x1b
; interleave the k-side first result (xmm4) with the reversed j-side second result (xmm1)
554 movaps xmm6, xmm4
555 unpckhps xmm4, xmm1
556 unpcklps xmm6, xmm1
; interleave the j-side first result (xmm0) with the reversed k-side second result (xmm5)
557 movaps xmm2, xmm0
558 unpcklps xmm0, xmm5
559 unpckhps xmm2, xmm5
560 movaps [%3+%2*2], xmm6
561 movaps [%3+%2*2+0x10], xmm4
562 movaps [%3+%1*2], xmm0
563 movaps [%3+%1*2+0x10], xmm2
564 sub %2, 0x10
; the flags from this add drive the jl below
565 add %1, 0x10
566 jl .post
567 %endmacro
568
; imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
; Three stages: a pre-rotation loop that scatters PREROTATER results into
; output through the revtab indices, an FFT call through FFT_DISPATCH _sse on
; output, and a final POSROTATESHUF pass over output.
; NOTE(review): assumes x86inc's cglobal places the three arguments in
; r0, r1, r2 in declaration order -- x86inc.asm is not visible here.
569 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
570 %ifdef ARCH_X86_64
; x86-64: each table pointer gets its own register; r12-r14 are saved here
; and restored by the pops at the end of the function
571 %define rrevtab r10
572 %define rtcos r11
573 %define rtsin r12
574 push r12
575 push r13
576 push r14
577 %else
; x86-32: rrevtab and rtsin share r6, so the table pointers are spilled to the
; stack below and reloaded when needed
578 %define rrevtab r6
579 %define rtsin r6
580 %define rtcos r5
581 %endif
; r3 = s->mdctsize; r2 = input advanced by mdctsize bytes
582 mov r3d, [r0+FFTContext.mdctsize]
583 add r2, r3
; r3 = mdctsize/2; both trig table pointers are advanced by that many bytes
584 shr r3, 1
585 mov rtcos, [r0+FFTContext.tcos]
586 mov rtsin, [r0+FFTContext.tsin]
587 add rtcos, r3
588 add rtsin, r3
589 %ifndef ARCH_X86_64
590 push rtcos
591 push rtsin
592 %endif
; r3 = mdctsize/4; rrevtab = s->revtab advanced by that many bytes
593 shr r3, 1
594 mov rrevtab, [r0+FFTContext.revtab]
595 add rrevtab, r3
596 %ifndef ARCH_X86_64
; x86-32 stack layout from here on: [esp] = rrevtab, [esp+4] = rtsin, [esp+8] = rtcos
597 push rrevtab
598 %endif
599
; r3 is the downward-counting loop index, r4 is kept equal to -r3
600 sub r3, 4
601 %ifdef ARCH_X86_64
602 xor r4, r4
603 sub r4, r3
604 %endif
605 .pre:
606 %ifndef ARCH_X86_64
; x86-32 recomputes r4 and reloads the trig pointers every pass because
; r4, r5 and r6 are overwritten further down in the loop body
607 ;unspill
608 xor r4, r4
609 sub r4, r3
610 mov rtsin, [esp+4]
611 mov rtcos, [esp+8]
612 %endif
613
; leaves its two result vectors in xmm0 and xmm1
614 PREROTATER r4, r3, r2, rtcos, rtsin
615 %ifdef ARCH_X86_64
; fetch four 16-bit revtab entries and store each 8-byte half of xmm0/xmm1
; at output + entry*8
616 movzx r5, word [rrevtab+r4-4]
617 movzx r6, word [rrevtab+r4-2]
618 movzx r13, word [rrevtab+r3]
619 movzx r14, word [rrevtab+r3+2]
620 movlps [r1+r5 *8], xmm0
621 movhps [r1+r6 *8], xmm0
622 movlps [r1+r13*8], xmm1
623 movhps [r1+r14*8], xmm1
624 add r4, 4
625 %else
; same scatter with fewer registers: reload rrevtab into r6 and reuse r4/r5
; as scratch for the table entries
626 mov r6, [esp]
627 movzx r5, word [r6+r4-4]
628 movzx r4, word [r6+r4-2]
629 movlps [r1+r5*8], xmm0
630 movhps [r1+r4*8], xmm0
631 movzx r5, word [r6+r3]
632 movzx r4, word [r6+r3+2]
633 movlps [r1+r5*8], xmm1
634 movhps [r1+r4*8], xmm1
635 %endif
; loop until r3 becomes negative
636 sub r3, 4
637 jns .pre
638
; keep s in r5 and output in r6, then set r0 = output and r1 = s->nbits
; for the dispatched FFT routine
639 mov r5, r0
640 mov r6, r1
641 mov r0, r1
642 mov r1d, [r5+FFTContext.nbits]
643
; NOTE(review): the code after this call relies on r5 and r6 (and on x86-64
; rtcos/rtsin in r11/r12) surviving it; FFT_DISPATCH's header says it clobbers
; 5 GPRs -- confirm the dispatched routines touch nothing beyond r0-r4.
644 FFT_DISPATCH _sse, r1
645
; r6 = output advanced by mdctsize bytes; r0 = mdctsize/2
646 mov r0d, [r5+FFTContext.mdctsize]
647 add r6, r0
648 shr r0, 1
649 %ifndef ARCH_X86_64
; x86-32: remap the trig pointers onto r2/r3 and reload them from the spill slots
650 %define rtcos r2
651 %define rtsin r3
652 mov rtcos, [esp+8]
653 mov rtsin, [esp+4]
654 %endif
; r0 = -(mdctsize/2), r1 = mdctsize/2 - 16: the two indices POSROTATESHUF
; steps toward each other by 0x10 per pass
655 neg r0
656 mov r1, -16
657 sub r1, r0
658 POSROTATESHUF r0, r1, r6, rtcos, rtsin
659 %ifdef ARCH_X86_64
660 pop r14
661 pop r13
662 pop r12
663 %else
; discard the three spilled pointers pushed during setup
664 add esp, 12
665 %endif
666 RET
OLDNEW
« no previous file with comments | « source/patched-ffmpeg-mt/libavcodec/x86/fft_3dn2.c ('k') | source/patched-ffmpeg-mt/libavcodec/x86/fft_sse.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698