source/patched-ffmpeg-mt/libavcodec/x86/fft_mmx.asm - Issue 3384002: ffmpeg source update for sep 09

Side by Side Diff: source/patched-ffmpeg-mt/libavcodec/x86/fft_mmx.asm

Issue 3384002: ffmpeg source update for sep 09 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/ffmpeg/

Patch Set: Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 ;******************************************************************************	1 ;******************************************************************************

2 ;* FFT transform with SSE/3DNow optimizations	2 ;* FFT transform with SSE/3DNow optimizations

3 ;* Copyright (c) 2008 Loren Merritt	3 ;* Copyright (c) 2008 Loren Merritt

4 ;*	4 ;*

5 ;* This algorithm (though not any of the implementation details) is	5 ;* This algorithm (though not any of the implementation details) is

6 ;* based on libdjbfft by D. J. Bernstein.	6 ;* based on libdjbfft by D. J. Bernstein.

7 ;*	7 ;*

8 ;* This file is part of FFmpeg.	8 ;* This file is part of FFmpeg.

9 ;*	9 ;*

10 ;* FFmpeg is free software; you can redistribute it and/or	10 ;* FFmpeg is free software; you can redistribute it and/or

(...skipping 11 matching lines...) Expand all Loading...
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA	22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

23 ;******************************************************************************	23 ;******************************************************************************

24	24

25 ; These functions are not individually interchangeable with the C versions.	25 ; These functions are not individually interchangeable with the C versions.

26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results	26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results

27 ; in blocks as convenient to the vector size.	27 ; in blocks as convenient to the vector size.

28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)	28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

29	29

30 %include "x86inc.asm"	30 %include "x86inc.asm"

31	31

	32 %ifdef ARCH_X86_64

	33 %define pointer resq

	34 %else

	35 %define pointer resd

	36 %endif

	37

	38 struc FFTContext

	39 .nbits: resd 1

	40 .reverse: resd 1

	41 .revtab: pointer 1

	42 .tmpbuf: pointer 1

	43 .mdctsize: resd 1

	44 .mdctbits: resd 1

	45 .tcos: pointer 1

	46 .tsin: pointer 1

	47 endstruc

	48

32 SECTION_RODATA	49 SECTION_RODATA

33	50

34 %define M_SQRT1_2 0.70710678118654752440	51 %define M_SQRT1_2 0.70710678118654752440

35 ps_root2: times 4 dd M_SQRT1_2	52 ps_root2: times 4 dd M_SQRT1_2

36 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2	53 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2

37 ps_m1p1: dd 1<<31, 0	54 ps_m1p1: dd 1<<31, 0

38	55

39 %assign i 16	56 %assign i 16

40 %rep 13	57 %rep 13

41 cextern cos_ %+ i	58 cextern cos_ %+ i

(...skipping 379 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
421 DECL_PASS pass_interleave_3dn, PASS_BIG 0	438 DECL_PASS pass_interleave_3dn, PASS_BIG 0

422 %define pass_3dn2 pass_3dn	439 %define pass_3dn2 pass_3dn

423 %define pass_interleave_3dn2 pass_interleave_3dn	440 %define pass_interleave_3dn2 pass_interleave_3dn

424	441

425 %ifdef PIC	442 %ifdef PIC

426 %define SECTION_REL - $$	443 %define SECTION_REL - $$

427 %else	444 %else

428 %define SECTION_REL	445 %define SECTION_REL

429 %endif	446 %endif

430	447

	448 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs

	449 lea r2, [dispatch_tab%1]

	450 mov r2, [r2 + (%2q-2)*gprsize]

	451 %ifdef PIC

	452 lea r3, [$$]

	453 add r2, r3

	454 %endif

	455 call r2

	456 %endmacro ; FFT_DISPATCH

	457

431 %macro DECL_FFT 2-3 ; nbits, cpu, suffix	458 %macro DECL_FFT 2-3 ; nbits, cpu, suffix

432 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL	459 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL

433 %if %1==5	460 %if %1==5

434 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL	461 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL

435 %endif	462 %endif

436	463

437 %assign n 1<<%1	464 %assign n 1<<%1

438 %rep 17-%1	465 %rep 17-%1

439 %assign n2 n/2	466 %assign n2 n/2

440 %assign n4 n/4	467 %assign n4 n/4

(...skipping 16 matching lines...) Expand all Loading...
457 %undef n	484 %undef n

458	485

459 align 8	486 align 8

460 dispatch_tab%3%2: pointer list_of_fft	487 dispatch_tab%3%2: pointer list_of_fft

461	488

462 section .text	489 section .text

463	490

464 ; On x86_32, this function does the register saving and restoring for all of fft.	491 ; On x86_32, this function does the register saving and restoring for all of fft.

465 ; The others pass args in registers and don't spill anything.	492 ; The others pass args in registers and don't spill anything.

466 cglobal fft_dispatch%3%2, 2,5,8, z, nbits	493 cglobal fft_dispatch%3%2, 2,5,8, z, nbits

467 lea r2, [dispatch_tab%3%2]	494 FFT_DISPATCH %3%2, nbits

468 mov r2, [r2 + (nbitsq-2)*gprsize]

469 %ifdef PIC

470 lea r3, [$$]

471 add r2, r3

472 %endif

473 call r2

474 RET	495 RET

475 %endmacro ; DECL_FFT	496 %endmacro ; DECL_FFT

476	497

477 DECL_FFT 5, _sse	498 DECL_FFT 5, _sse

478 DECL_FFT 5, _sse, _interleave	499 DECL_FFT 5, _sse, _interleave

479 DECL_FFT 4, _3dn	500 DECL_FFT 4, _3dn

480 DECL_FFT 4, _3dn, _interleave	501 DECL_FFT 4, _3dn, _interleave

481 DECL_FFT 4, _3dn2	502 DECL_FFT 4, _3dn2

482 DECL_FFT 4, _3dn2, _interleave	503 DECL_FFT 4, _3dn2, _interleave

483	504

	505 INIT_XMM

	506 %undef mulps

	507 %undef addps

	508 %undef subps

	509 %undef unpcklps

	510 %undef unpckhps

	511

	512 %macro PREROTATER 5 ;-2k, 2k, input+n4, tcos+n8, tsin+n8

	513 movaps xmm0, [%3+%2*4]

	514 movaps xmm1, [%3+%1*4-0x10]

	515 movaps xmm2, xmm0

	516 shufps xmm0, xmm1, 0x88

	517 shufps xmm1, xmm2, 0x77

	518 movlps xmm4, [%4+%2*2]

	519 movlps xmm5, [%5+%2*2+0x0]

	520 movhps xmm4, [%4+%1*2-0x8]

	521 movhps xmm5, [%5+%1*2-0x8]

	522 movaps xmm2, xmm0

	523 movaps xmm3, xmm1

	524 mulps xmm0, xmm5

	525 mulps xmm1, xmm4

	526 mulps xmm2, xmm4

	527 mulps xmm3, xmm5

	528 subps xmm1, xmm0

	529 addps xmm2, xmm3

	530 movaps xmm0, xmm1

	531 unpcklps xmm1, xmm2

	532 unpckhps xmm0, xmm2

	533 %endmacro

	534

	535 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5

	536 movaps xmm6, [%4+%1*2]

	537 movaps %2, [%4+%1*2+0x10]

	538 movaps %3, xmm6

	539 movaps xmm7, %2

	540 mulps xmm6, [%5+%1]

	541 mulps %2, [%6+%1]

	542 mulps %3, [%6+%1]

	543 mulps xmm7, [%5+%1]

	544 subps %2, xmm6

	545 addps %3, xmm7

	546 %endmacro

	547

	548 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8

	549 .post:

	550 CMUL %1, xmm0, xmm1, %3, %4, %5

	551 CMUL %2, xmm4, xmm5, %3, %4, %5

	552 shufps xmm1, xmm1, 0x1b

	553 shufps xmm5, xmm5, 0x1b

	554 movaps xmm6, xmm4

	555 unpckhps xmm4, xmm1

	556 unpcklps xmm6, xmm1

	557 movaps xmm2, xmm0

	558 unpcklps xmm0, xmm5

	559 unpckhps xmm2, xmm5

	560 movaps [%3+%2*2], xmm6

	561 movaps [%3+%2*2+0x10], xmm4

	562 movaps [%3+%1*2], xmm0

	563 movaps [%3+%1*2+0x10], xmm2

	564 sub %2, 0x10

	565 add %1, 0x10

	566 jl .post

	567 %endmacro

	568

	569 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input

	570 %ifdef ARCH_X86_64

	571 %define rrevtab r10

	572 %define rtcos r11

	573 %define rtsin r12

	574 push r12

	575 push r13

	576 push r14

	577 %else

	578 %define rrevtab r6

	579 %define rtsin r6

	580 %define rtcos r5

	581 %endif

	582 mov r3d, [r0+FFTContext.mdctsize]

	583 add r2, r3

	584 shr r3, 1

	585 mov rtcos, [r0+FFTContext.tcos]

	586 mov rtsin, [r0+FFTContext.tsin]

	587 add rtcos, r3

	588 add rtsin, r3

	589 %ifndef ARCH_X86_64

	590 push rtcos

	591 push rtsin

	592 %endif

	593 shr r3, 1

	594 mov rrevtab, [r0+FFTContext.revtab]

	595 add rrevtab, r3

	596 %ifndef ARCH_X86_64

	597 push rrevtab

	598 %endif

	599

	600 sub r3, 4

	601 %ifdef ARCH_X86_64

	602 xor r4, r4

	603 sub r4, r3

	604 %endif

	605 .pre:

	606 %ifndef ARCH_X86_64

	607 ;unspill

	608 xor r4, r4

	609 sub r4, r3

	610 mov rtsin, [esp+4]

	611 mov rtcos, [esp+8]

	612 %endif

	613

	614 PREROTATER r4, r3, r2, rtcos, rtsin

	615 %ifdef ARCH_X86_64

	616 movzx r5, word [rrevtab+r4-4]

	617 movzx r6, word [rrevtab+r4-2]

	618 movzx r13, word [rrevtab+r3]

	619 movzx r14, word [rrevtab+r3+2]

	620 movlps [r1+r5 *8], xmm0

	621 movhps [r1+r6 *8], xmm0

	622 movlps [r1+r13*8], xmm1

	623 movhps [r1+r14*8], xmm1

	624 add r4, 4

	625 %else

	626 mov r6, [esp]

	627 movzx r5, word [r6+r4-4]

	628 movzx r4, word [r6+r4-2]

	629 movlps [r1+r5*8], xmm0

	630 movhps [r1+r4*8], xmm0

	631 movzx r5, word [r6+r3]

	632 movzx r4, word [r6+r3+2]

	633 movlps [r1+r5*8], xmm1

	634 movhps [r1+r4*8], xmm1

	635 %endif

	636 sub r3, 4

	637 jns .pre

	638

	639 mov r5, r0

	640 mov r6, r1

	641 mov r0, r1

	642 mov r1d, [r5+FFTContext.nbits]

	643

	644 FFT_DISPATCH _sse, r1

	645

	646 mov r0d, [r5+FFTContext.mdctsize]

	647 add r6, r0

	648 shr r0, 1

	649 %ifndef ARCH_X86_64

	650 %define rtcos r2

	651 %define rtsin r3

	652 mov rtcos, [esp+8]

	653 mov rtsin, [esp+4]

	654 %endif

	655 neg r0

	656 mov r1, -16

	657 sub r1, r0

	658 POSROTATESHUF r0, r1, r6, rtcos, rtsin

	659 %ifdef ARCH_X86_64

	660 pop r14

	661 pop r13

	662 pop r12

	663 %else

	664 add esp, 12

	665 %endif

	666 RET

OLD	NEW

« no previous file with comments | « source/patched-ffmpeg-mt/libavcodec/x86/fft_3dn2.c ('k') | source/patched-ffmpeg-mt/libavcodec/x86/fft_sse.c » ('j') | no next file with comments »