| Index: celt/mdct.c
|
| diff --git a/celt/mdct.c b/celt/mdct.c
|
| index 16a36c692f9b35b50971516ec249f51d1c71216c..90a214ad0e617a7d258f2aa61a07739d6ef9d7c4 100644
|
| --- a/celt/mdct.c
|
| +++ b/celt/mdct.c
|
| @@ -109,12 +109,14 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
|
| int N, N2, N4;
|
| kiss_twiddle_scalar sine;
|
| VARDECL(kiss_fft_scalar, f);
|
| + VARDECL(kiss_fft_scalar, f2);
|
| SAVE_STACK;
|
| N = l->n;
|
| N >>= shift;
|
| N2 = N>>1;
|
| N4 = N>>2;
|
| ALLOC(f, N2, kiss_fft_scalar);
|
| + ALLOC(f2, N2, kiss_fft_scalar);
|
| /* sin(x) ~= x here */
|
| #ifdef FIXED_POINT
|
| sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
|
| @@ -131,7 +133,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
|
| kiss_fft_scalar * OPUS_RESTRICT yp = f;
|
| const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
|
| const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
|
| - for(i=0;i<(overlap>>2);i++)
|
| + for(i=0;i<((overlap+3)>>2);i++)
|
| {
|
| /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
|
| *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
|
| @@ -143,7 +145,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
|
| }
|
| wp1 = window;
|
| wp2 = window+overlap-1;
|
| - for(;i<N4-(overlap>>2);i++)
|
| + for(;i<N4-((overlap+3)>>2);i++)
|
| {
|
| /* Real part arranged as a-bR, Imag part arranged as -c-dR */
|
| *yp++ = *xp2;
|
| @@ -180,12 +182,12 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
|
| }
|
|
|
| /* N/4 complex FFT, down-scales by 4/N */
|
| - opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)in);
|
| + opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
|
|
|
| /* Post-rotate */
|
| {
|
| /* Temp pointers to make it really clear to the compiler what we're doing */
|
| - const kiss_fft_scalar * OPUS_RESTRICT fp = in;
|
| + const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
|
| kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
|
| kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
|
| const kiss_twiddle_scalar *t = &l->trig[0];
|
| @@ -212,14 +214,12 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
|
| int i;
|
| int N, N2, N4;
|
| kiss_twiddle_scalar sine;
|
| - VARDECL(kiss_fft_scalar, f);
|
| VARDECL(kiss_fft_scalar, f2);
|
| SAVE_STACK;
|
| N = l->n;
|
| N >>= shift;
|
| N2 = N>>1;
|
| N4 = N>>2;
|
| - ALLOC(f, N2, kiss_fft_scalar);
|
| ALLOC(f2, N2, kiss_fft_scalar);
|
| /* sin(x) ~= x here */
|
| #ifdef FIXED_POINT
|
| @@ -249,81 +249,60 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
|
| }
|
|
|
| /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
|
| - opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)f);
|
| + opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
|
|
|
| - /* Post-rotate */
|
| + /* Post-rotate and de-shuffle from both ends of the buffer at once to make
|
| + it in-place. */
|
| {
|
| - kiss_fft_scalar * OPUS_RESTRICT fp = f;
|
| + kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
|
| + kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
|
| const kiss_twiddle_scalar *t = &l->trig[0];
|
| -
|
| - for(i=0;i<N4;i++)
|
| + /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
|
| + middle pair will be computed twice. */
|
| + for(i=0;i<(N4+1)>>1;i++)
|
| {
|
| kiss_fft_scalar re, im, yr, yi;
|
| - re = fp[0];
|
| - im = fp[1];
|
| + kiss_twiddle_scalar t0, t1;
|
| + re = yp0[0];
|
| + im = yp0[1];
|
| + t0 = t[i<<shift];
|
| + t1 = t[(N4-i)<<shift];
|
| /* We'd scale up by 2 here, but instead it's done when mixing the windows */
|
| - yr = S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
|
| - yi = S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
|
| + yr = S_MUL(re,t0) - S_MUL(im,t1);
|
| + yi = S_MUL(im,t0) + S_MUL(re,t1);
|
| + re = yp1[0];
|
| + im = yp1[1];
|
| /* works because the cos is nearly one */
|
| - *fp++ = yr - S_MUL(yi,sine);
|
| - *fp++ = yi + S_MUL(yr,sine);
|
| - }
|
| - }
|
| - /* De-shuffle the components for the middle of the window only */
|
| - {
|
| - const kiss_fft_scalar * OPUS_RESTRICT fp1 = f;
|
| - const kiss_fft_scalar * OPUS_RESTRICT fp2 = f+N2-1;
|
| - kiss_fft_scalar * OPUS_RESTRICT yp = f2;
|
| - for(i = 0; i < N4; i++)
|
| - {
|
| - *yp++ =-*fp1;
|
| - *yp++ = *fp2;
|
| - fp1 += 2;
|
| - fp2 -= 2;
|
| + yp0[0] = -(yr - S_MUL(yi,sine));
|
| + yp1[1] = yi + S_MUL(yr,sine);
|
| +
|
| + t0 = t[(N4-i-1)<<shift];
|
| + t1 = t[(i+1)<<shift];
|
| + /* We'd scale up by 2 here, but instead it's done when mixing the windows */
|
| + yr = S_MUL(re,t0) - S_MUL(im,t1);
|
| + yi = S_MUL(im,t0) + S_MUL(re,t1);
|
| + /* works because the cos is nearly one */
|
| + yp1[0] = -(yr - S_MUL(yi,sine));
|
| + yp0[1] = yi + S_MUL(yr,sine);
|
| + yp0 += 2;
|
| + yp1 -= 2;
|
| }
|
| }
|
| - out -= (N2-overlap)>>1;
|
| +
|
| /* Mirror on both sides for TDAC */
|
| {
|
| - kiss_fft_scalar * OPUS_RESTRICT fp1 = f2+N4-1;
|
| - kiss_fft_scalar * OPUS_RESTRICT xp1 = out+N2-1;
|
| - kiss_fft_scalar * OPUS_RESTRICT yp1 = out+N4-overlap/2;
|
| - const opus_val16 * OPUS_RESTRICT wp1 = window;
|
| - const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
|
| - for(i = 0; i< N4-overlap/2; i++)
|
| - {
|
| - *xp1 = *fp1;
|
| - xp1--;
|
| - fp1--;
|
| - }
|
| - for(; i < N4; i++)
|
| - {
|
| - kiss_fft_scalar x1;
|
| - x1 = *fp1--;
|
| - *yp1++ +=-MULT16_32_Q15(*wp1, x1);
|
| - *xp1-- += MULT16_32_Q15(*wp2, x1);
|
| - wp1++;
|
| - wp2--;
|
| - }
|
| - }
|
| - {
|
| - kiss_fft_scalar * OPUS_RESTRICT fp2 = f2+N4;
|
| - kiss_fft_scalar * OPUS_RESTRICT xp2 = out+N2;
|
| - kiss_fft_scalar * OPUS_RESTRICT yp2 = out+N-1-(N4-overlap/2);
|
| + kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
|
| + kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
|
| const opus_val16 * OPUS_RESTRICT wp1 = window;
|
| const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
|
| - for(i = 0; i< N4-overlap/2; i++)
|
| - {
|
| - *xp2 = *fp2;
|
| - xp2++;
|
| - fp2++;
|
| - }
|
| - for(; i < N4; i++)
|
| +
|
| + for(i = 0; i < overlap/2; i++)
|
| {
|
| - kiss_fft_scalar x2;
|
| - x2 = *fp2++;
|
| - *yp2-- = MULT16_32_Q15(*wp1, x2);
|
| - *xp2++ = MULT16_32_Q15(*wp2, x2);
|
| + kiss_fft_scalar x1, x2;
|
| + x1 = *xp1;
|
| + x2 = *yp1;
|
| + *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
|
| + *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
|
| wp1++;
|
| wp2--;
|
| }
|
|
|