| OLD | NEW |
| 1 ;****************************************************************************** | 1 ;****************************************************************************** |
| 2 ;* FFT transform with SSE/3DNow optimizations | 2 ;* FFT transform with SSE/3DNow optimizations |
| 3 ;* Copyright (c) 2008 Loren Merritt | 3 ;* Copyright (c) 2008 Loren Merritt |
| 4 ;* | 4 ;* |
| 5 ;* This algorithm (though not any of the implementation details) is | 5 ;* This algorithm (though not any of the implementation details) is |
| 6 ;* based on libdjbfft by D. J. Bernstein. | 6 ;* based on libdjbfft by D. J. Bernstein. |
| 7 ;* | 7 ;* |
| 8 ;* This file is part of FFmpeg. | 8 ;* This file is part of FFmpeg. |
| 9 ;* | 9 ;* |
| 10 ;* FFmpeg is free software; you can redistribute it and/or | 10 ;* FFmpeg is free software; you can redistribute it and/or |
| (...skipping 11 matching lines...) Expand all Loading... |
| 22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 23 ;****************************************************************************** | 23 ;****************************************************************************** |
| 24 | 24 |
| 25 ; These functions are not individually interchangeable with the C versions. | 25 ; These functions are not individually interchangeable with the C versions. |
| 26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results | 26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results |
| 27 ; in blocks as convenient to the vector size. | 27 ; in blocks as convenient to the vector size. |
| 28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) | 28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) |
| 29 | 29 |
| 30 %include "x86inc.asm" | 30 %include "x86inc.asm" |
| 31 | 31 |
; "pointer" reserves one pointer-sized field: a qword when ARCH_X86_64 is
; defined, a dword otherwise.
| 32 %ifdef ARCH_X86_64 |
| 33 %define pointer resq |
| 34 %else |
| 35 %define pointer resd |
| 36 %endif |
| 37 |
; Field offsets of FFTContext as seen from assembly. imdct_half_sse reads
; .nbits, .revtab, .mdctsize, .tcos and .tsin through these offsets.
; Each offset is the running sum of the preceding field sizes; no padding is
; inserted here.
; NOTE(review): the field order and sizes presumably mirror the C-side
; struct FFTContext -- confirm against the C header whenever either changes.
| 38 struc FFTContext |
| 39 .nbits: resd 1 |
| 40 .reverse: resd 1 |
| 41 .revtab: pointer 1 |
| 42 .tmpbuf: pointer 1 |
| 43 .mdctsize: resd 1 |
| 44 .mdctbits: resd 1 |
| 45 .tcos: pointer 1 |
| 46 .tsin: pointer 1 |
| 47 endstruc |
| 48 |
| 32 SECTION_RODATA | 49 SECTION_RODATA |
| 33 | 50 |
| 34 %define M_SQRT1_2 0.70710678118654752440 | 51 %define M_SQRT1_2 0.70710678118654752440 |
| 35 ps_root2: times 4 dd M_SQRT1_2 | 52 ps_root2: times 4 dd M_SQRT1_2 |
| 36 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | 53 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 |
| 37 ps_m1p1: dd 1<<31, 0 | 54 ps_m1p1: dd 1<<31, 0 |
| 38 | 55 |
| 39 %assign i 16 | 56 %assign i 16 |
| 40 %rep 13 | 57 %rep 13 |
| 41 cextern cos_ %+ i | 58 cextern cos_ %+ i |
| (...skipping 379 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 421 DECL_PASS pass_interleave_3dn, PASS_BIG 0 | 438 DECL_PASS pass_interleave_3dn, PASS_BIG 0 |
| 422 %define pass_3dn2 pass_3dn | 439 %define pass_3dn2 pass_3dn |
| 423 %define pass_interleave_3dn2 pass_interleave_3dn | 440 %define pass_interleave_3dn2 pass_interleave_3dn |
| 424 | 441 |
; SECTION_REL is appended to every entry of the dispatch tables built by
; DECL_FFT. With PIC defined it expands to "- $$", so each table entry is an
; offset from the start of the section instead of an absolute address; the
; dispatch code adds the address of $$ back before calling. Without PIC it
; expands to nothing and the entries are used as-is.
| 425 %ifdef PIC | 442 %ifdef PIC |
| 426 %define SECTION_REL - $$ | 443 %define SECTION_REL - $$ |
| 427 %else | 444 %else |
| 428 %define SECTION_REL | 445 %define SECTION_REL |
| 429 %endif | 446 %endif |
| 430 | 447 |
; FFT_DISPATCH suffix, nbits_reg
;   %1 = suffix appended to "dispatch_tab" to select the table
;   %2 = register name without a size suffix (used as %2q) holding nbits
; Loads entry (nbits - 2) of the selected table and calls it. DECL_FFT starts
; each table with fft4 and fft8, so index 0 corresponds to nbits == 2.
; This macro itself writes r2 (and r3 when PIC is defined); the clobber count
; in the header below presumably also covers what the called routine uses.
| 448 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs |
; r2 = address of the table, then r2 = table[nbits - 2]
| 449 lea r2, [dispatch_tab%1] |
| 450 mov r2, [r2 + (%2q-2)*gprsize] |
| 451 %ifdef PIC |
; Under PIC the table holds section-relative offsets (see SECTION_REL), so
; the section base must be added to obtain the call target.
| 452 lea r3, [$$] |
| 453 add r2, r3 |
| 454 %endif |
| 455 call r2 |
| 456 %endmacro ; FFT_DISPATCH |
| 457 |
| 431 %macro DECL_FFT 2-3 ; nbits, cpu, suffix | 458 %macro DECL_FFT 2-3 ; nbits, cpu, suffix |
| 432 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL | 459 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL |
| 433 %if %1==5 | 460 %if %1==5 |
| 434 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL | 461 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL |
| 435 %endif | 462 %endif |
| 436 | 463 |
| 437 %assign n 1<<%1 | 464 %assign n 1<<%1 |
| 438 %rep 17-%1 | 465 %rep 17-%1 |
| 439 %assign n2 n/2 | 466 %assign n2 n/2 |
| 440 %assign n4 n/4 | 467 %assign n4 n/4 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 457 %undef n | 484 %undef n |
| 458 | 485 |
| 459 align 8 | 486 align 8 |
| 460 dispatch_tab%3%2: pointer list_of_fft | 487 dispatch_tab%3%2: pointer list_of_fft |
| 461 | 488 |
| 462 section .text | 489 section .text |
| 463 | 490 |
| 464 ; On x86_32, this function does the register saving and restoring for all of fft
. | 491 ; On x86_32, this function does the register saving and restoring for all of fft
. |
| 465 ; The others pass args in registers and don't spill anything. | 492 ; The others pass args in registers and don't spill anything. |
| 466 cglobal fft_dispatch%3%2, 2,5,8, z, nbits | 493 cglobal fft_dispatch%3%2, 2,5,8, z, nbits |
| 467 lea r2, [dispatch_tab%3%2] | 494 FFT_DISPATCH %3%2, nbits |
| 468 mov r2, [r2 + (nbitsq-2)*gprsize] | |
| 469 %ifdef PIC | |
| 470 lea r3, [$$] | |
| 471 add r2, r3 | |
| 472 %endif | |
| 473 call r2 | |
| 474 RET | 495 RET |
| 475 %endmacro ; DECL_FFT | 496 %endmacro ; DECL_FFT |
| 476 | 497 |
; Instantiate DECL_FFT (arguments: nbits, cpu, suffix) for each cpu flavour,
; once without a suffix and once with the _interleave suffix. The SSE variants
; pass nbits = 5, the 3DNow variants pass nbits = 4.
| 477 DECL_FFT 5, _sse | 498 DECL_FFT 5, _sse |
| 478 DECL_FFT 5, _sse, _interleave | 499 DECL_FFT 5, _sse, _interleave |
| 479 DECL_FFT 4, _3dn | 500 DECL_FFT 4, _3dn |
| 480 DECL_FFT 4, _3dn, _interleave | 501 DECL_FFT 4, _3dn, _interleave |
| 481 DECL_FFT 4, _3dn2 | 502 DECL_FFT 4, _3dn2 |
| 482 DECL_FFT 4, _3dn2, _interleave | 503 DECL_FFT 4, _3dn2, _interleave |
| 483 | 504 |
; Switch the x86inc register abstraction to XMM registers and remove any
; preprocessor definitions of the five mnemonics below, so that the IMDCT
; macros that follow assemble the plain SSE instructions.
; NOTE(review): presumably these names were %define'd earlier in the file
; (in a part not visible here) for the 3DNow code paths -- confirm.
| 505 INIT_XMM |
| 506 %undef mulps |
| 507 %undef addps |
| 508 %undef subps |
| 509 %undef unpcklps |
| 510 %undef unpckhps |
| 511 |
; PREROTATER: pre-rotation step used by the .pre loop of imdct_half_sse.
;   %1, %2 = index registers (header: -2*k and 2*k)
;   %3     = input base (header: input+n4)
;   %4, %5 = cos / sin table bases (header: tcos+n8, tsin+n8)
; Results are left in xmm1 and xmm0 (see the end of the macro).
; Clobbers xmm0-xmm5. The movaps loads require 16-byte aligned addresses.
| 512 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 |
; Load four floats at %3+%2*4 and the four floats ending at %3+%1*4.
| 513 movaps xmm0, [%3+%2*4] |
| 514 movaps xmm1, [%3+%1*4-0x10] |
| 515 movaps xmm2, xmm0 |
; xmm0 = { A[0], A[2], B[0], B[2] } and xmm1 = { B[3], B[1], A[3], A[1] },
; where A is the first load and B the second.
| 516 shufps xmm0, xmm1, 0x88 |
| 517 shufps xmm1, xmm2, 0x77 |
; xmm4 = cos factors, xmm5 = sin factors: low half from the %2 side,
; high half from the %1 side.
| 518 movlps xmm4, [%4+%2*2] |
| 519 movlps xmm5, [%5+%2*2+0x0] |
| 520 movhps xmm4, [%4+%1*2-0x8] |
| 521 movhps xmm5, [%5+%1*2-0x8] |
; With e = xmm0 and o = xmm1 as shuffled above:
;   xmm1 = o*cos - e*sin
;   xmm2 = e*cos + o*sin
| 522 movaps xmm2, xmm0 |
| 523 movaps xmm3, xmm1 |
| 524 mulps xmm0, xmm5 |
| 525 mulps xmm1, xmm4 |
| 526 mulps xmm2, xmm4 |
| 527 mulps xmm3, xmm5 |
| 528 subps xmm1, xmm0 |
| 529 addps xmm2, xmm3 |
; Interleave the two result vectors element-wise:
;   xmm1 = pairs built from elements 0 and 1, xmm0 = pairs from elements 2 and 3.
| 530 movaps xmm0, xmm1 |
| 531 unpcklps xmm1, xmm2 |
| 532 unpckhps xmm0, xmm2 |
| 533 %endmacro |
| 534 |
; CMUL: multiply one block of data by the cos/sin tables.
;   %1     = byte index register (j)
;   %2, %3 = output XMM registers
;   %4     = data base, %5 = cos table base, %6 = sin table base
;            (POSROTATESHUF passes its own arguments 3, 4, 5 here)
; With A = [%4+%1*2], B = [%4+%1*2+0x10], c = [%5+%1], s = [%6+%1]:
;   %2 = B*s - A*c
;   %3 = A*s + B*c
; Clobbers xmm6 and xmm7. All memory operands must be 16-byte aligned
; (movaps, and mulps with a memory source).
; NOTE(review): A and B are presumably the real and imaginary blocks of the
; split layout described at the top of the file -- confirm.
| 535 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 |
| 536 movaps xmm6, [%4+%1*2] |
| 537 movaps %2, [%4+%1*2+0x10] |
| 538 movaps %3, xmm6 |
| 539 movaps xmm7, %2 |
| 540 mulps xmm6, [%5+%1] |
| 541 mulps %2, [%6+%1] |
| 542 mulps %3, [%6+%1] |
| 543 mulps xmm7, [%5+%1] |
| 544 subps %2, xmm6 |
| 545 addps %3, xmm7 |
| 546 %endmacro |
| 547 |
; POSROTATESHUF: post-rotation and reordering loop of imdct_half_sse.
;   %1 = j, byte index counting up in steps of 0x10; the loop runs while j < 0
;   %2 = k, byte index counting down in steps of 0x10
;   %3 = data base (header: z+n8), %4 / %5 = cos / sin table bases
; Each iteration rotates the block at j and the block at k with CMUL, then
; writes both blocks back in place with their elements interleaved.
; Clobbers xmm0-xmm7 and modifies %1 and %2.
; Note: ".post" is an ordinary local label, not a macro-local (%%) one, so
; this macro can only be expanded once under a given non-local label.
| 548 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
| 549 .post: |
; (xmm0, xmm1) = rotated block at j, (xmm4, xmm5) = rotated block at k
| 550 CMUL %1, xmm0, xmm1, %3, %4, %5 |
| 551 CMUL %2, xmm4, xmm5, %3, %4, %5 |
; 0x1b reverses the order of the four elements.
| 552 shufps xmm1, xmm1, 0x1b |
| 553 shufps xmm5, xmm5, 0x1b |
; Interleave the k-side first result with the reversed j-side second result
; (xmm6 = low pairs, xmm4 = high pairs), and the j-side first result with the
; reversed k-side second result (xmm0 = low pairs, xmm2 = high pairs).
| 554 movaps xmm6, xmm4 |
| 555 unpckhps xmm4, xmm1 |
| 556 unpcklps xmm6, xmm1 |
| 557 movaps xmm2, xmm0 |
| 558 unpcklps xmm0, xmm5 |
| 559 unpckhps xmm2, xmm5 |
; Store over the same two blocks that were read by CMUL.
| 560 movaps [%3+%2*2], xmm6 |
| 561 movaps [%3+%2*2+0x10], xmm4 |
| 562 movaps [%3+%1*2], xmm0 |
| 563 movaps [%3+%1*2+0x10], xmm2 |
; Advance both indices; the flags tested by jl come from the add on j.
| 564 sub %2, 0x10 |
| 565 add %1, 0x10 |
| 566 jl .post |
| 567 %endmacro |
| 568 |
; imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
; Declared through cglobal with 3 arguments, 7 GPRs and 8 XMM registers.
; On entry r0 = s, r1 = output, r2 = input.
; Three stages:
;   1. .pre loop: PREROTATER on the input, results scattered into output at
;      the positions given by s->revtab.
;   2. FFT_DISPATCH _sse on output with s->nbits.
;   3. POSROTATESHUF: post-rotation and reordering of output in place.
; x86_64: revtab/tcos/tsin are kept in r10/r11/r12, and r12-r14 are saved and
;         restored by hand around the body.
; x86_32: tcos, tsin and revtab are spilled to the stack and reloaded inside
;         the loop; rrevtab and rtsin both alias r6.
; NOTE(review): r10/r11 are hard-coded; confirm they never alias r0-r6 under
; any register mapping x86inc.asm uses for this build (e.g. WIN64).
| 569 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
*input |
| 570 %ifdef ARCH_X86_64 |
| 571 %define rrevtab r10 |
| 572 %define rtcos r11 |
| 573 %define rtsin r12 |
| 574 push r12 |
| 575 push r13 |
| 576 push r14 |
| 577 %else |
| 578 %define rrevtab r6 |
| 579 %define rtsin r6 |
| 580 %define rtcos r5 |
| 581 %endif |
; r3 = s->mdctsize; r2 = input + mdctsize (PREROTATER's header calls this
; input+n4); then r3 = mdctsize/2 is added to both table pointers.
| 582 mov r3d, [r0+FFTContext.mdctsize] |
| 583 add r2, r3 |
| 584 shr r3, 1 |
| 585 mov rtcos, [r0+FFTContext.tcos] |
| 586 mov rtsin, [r0+FFTContext.tsin] |
| 587 add rtcos, r3 |
| 588 add rtsin, r3 |
| 589 %ifndef ARCH_X86_64 |
; Spill the adjusted table pointers; r6 (rtsin) is reused for revtab below.
| 590 push rtcos |
| 591 push rtsin |
| 592 %endif |
; r3 = mdctsize/4; rrevtab = s->revtab + r3 bytes.
| 593 shr r3, 1 |
| 594 mov rrevtab, [r0+FFTContext.revtab] |
| 595 add rrevtab, r3 |
| 596 %ifndef ARCH_X86_64 |
; After this push: [esp] = revtab, [esp+4] = tsin, [esp+8] = tcos.
| 597 push rrevtab |
| 598 %endif |
| 599 |
; r3 = descending loop index; r4 = -r3 is the mirrored, ascending index.
| 600 sub r3, 4 |
| 601 %ifdef ARCH_X86_64 |
| 602 xor r4, r4 |
| 603 sub r4, r3 |
| 604 %endif |
| 605 .pre: |
| 606 %ifndef ARCH_X86_64 |
; r4 is recomputed every iteration here because it is overwritten with a
; revtab entry further down; rtsin/rtcos are reloaded from their spill slots.
| 607 ;unspill |
| 608 xor r4, r4 |
| 609 sub r4, r3 |
| 610 mov rtsin, [esp+4] |
| 611 mov rtcos, [esp+8] |
| 612 %endif |
| 613 |
| 614 PREROTATER r4, r3, r2, rtcos, rtsin |
| 615 %ifdef ARCH_X86_64 |
; Fetch four 16-bit revtab entries and store the four 8-byte results from
; xmm0/xmm1 at output + entry*8.
| 616 movzx r5, word [rrevtab+r4-4] |
| 617 movzx r6, word [rrevtab+r4-2] |
| 618 movzx r13, word [rrevtab+r3] |
| 619 movzx r14, word [rrevtab+r3+2] |
| 620 movlps [r1+r5 *8], xmm0 |
| 621 movhps [r1+r6 *8], xmm0 |
| 622 movlps [r1+r13*8], xmm1 |
| 623 movhps [r1+r14*8], xmm1 |
| 624 add r4, 4 |
| 625 %else |
; Same scatter as the x86_64 path, done two entries at a time with r5/r4 as
; scratch and revtab reloaded from [esp] into r6.
| 626 mov r6, [esp] |
| 627 movzx r5, word [r6+r4-4] |
| 628 movzx r4, word [r6+r4-2] |
| 629 movlps [r1+r5*8], xmm0 |
| 630 movhps [r1+r4*8], xmm0 |
| 631 movzx r5, word [r6+r3] |
| 632 movzx r4, word [r6+r3+2] |
| 633 movlps [r1+r5*8], xmm1 |
| 634 movhps [r1+r4*8], xmm1 |
| 635 %endif |
; Loop until r3 becomes negative.
| 636 sub r3, 4 |
| 637 jns .pre |
| 638 |
; Keep s in r5 and output in r6 across the dispatch, then set up the call
; arguments: r0 = output, r1 = s->nbits.
; NOTE(review): relies on the dispatched routines leaving r5/r6 intact;
; FFT_DISPATCH documents 5 clobbered GPRs, presumably r0-r4 -- confirm.
| 639 mov r5, r0 |
| 640 mov r6, r1 |
| 641 mov r0, r1 |
| 642 mov r1d, [r5+FFTContext.nbits] |
| 643 |
| 644 FFT_DISPATCH _sse, r1 |
| 645 |
; r6 = output + mdctsize (POSROTATESHUF's header calls this z+n8);
; r0 = mdctsize/2.
| 646 mov r0d, [r5+FFTContext.mdctsize] |
| 647 add r6, r0 |
| 648 shr r0, 1 |
| 649 %ifndef ARCH_X86_64 |
; r5/r6 are in use, so reload the spilled table pointers into r2/r3.
| 650 %define rtcos r2 |
| 651 %define rtsin r3 |
| 652 mov rtcos, [esp+8] |
| 653 mov rtsin, [esp+4] |
| 654 %endif |
; j = r0 = -mdctsize/2 (counts up towards 0);
; k = r1 = -16 - j = mdctsize/2 - 16 (counts down).
| 655 neg r0 |
| 656 mov r1, -16 |
| 657 sub r1, r0 |
| 658 POSROTATESHUF r0, r1, r6, rtcos, rtsin |
| 659 %ifdef ARCH_X86_64 |
; Restore the registers pushed at entry, in reverse order.
| 660 pop r14 |
| 661 pop r13 |
| 662 pop r12 |
| 663 %else |
; Drop the three 4-byte spill slots (tcos, tsin, revtab).
| 664 add esp, 12 |
| 665 %endif |
| 666 RET |
| OLD | NEW |