Index: celt/mdct.c |
diff --git a/celt/mdct.c b/celt/mdct.c |
index 16a36c692f9b35b50971516ec249f51d1c71216c..90a214ad0e617a7d258f2aa61a07739d6ef9d7c4 100644 |
--- a/celt/mdct.c |
+++ b/celt/mdct.c |
@@ -109,12 +109,14 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar |
int N, N2, N4; |
kiss_twiddle_scalar sine; |
VARDECL(kiss_fft_scalar, f); |
+ VARDECL(kiss_fft_scalar, f2); |
SAVE_STACK; |
N = l->n; |
N >>= shift; |
N2 = N>>1; |
N4 = N>>2; |
ALLOC(f, N2, kiss_fft_scalar); |
+ ALLOC(f2, N2, kiss_fft_scalar); |
/* sin(x) ~= x here */ |
#ifdef FIXED_POINT |
sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; |
@@ -131,7 +133,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar |
kiss_fft_scalar * OPUS_RESTRICT yp = f; |
const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); |
const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; |
- for(i=0;i<(overlap>>2);i++) |
+ for(i=0;i<((overlap+3)>>2);i++) |
{ |
/* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ |
*yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); |
@@ -143,7 +145,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar |
} |
wp1 = window; |
wp2 = window+overlap-1; |
- for(;i<N4-(overlap>>2);i++) |
+ for(;i<N4-((overlap+3)>>2);i++) |
{ |
/* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
*yp++ = *xp2; |
@@ -180,12 +182,12 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar |
} |
/* N/4 complex FFT, down-scales by 4/N */ |
- opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)in); |
+ opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2); |
/* Post-rotate */ |
{ |
/* Temp pointers to make it really clear to the compiler what we're doing */ |
- const kiss_fft_scalar * OPUS_RESTRICT fp = in; |
+ const kiss_fft_scalar * OPUS_RESTRICT fp = f2; |
kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); |
const kiss_twiddle_scalar *t = &l->trig[0]; |
@@ -212,14 +214,12 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala |
int i; |
int N, N2, N4; |
kiss_twiddle_scalar sine; |
- VARDECL(kiss_fft_scalar, f); |
VARDECL(kiss_fft_scalar, f2); |
SAVE_STACK; |
N = l->n; |
N >>= shift; |
N2 = N>>1; |
N4 = N>>2; |
- ALLOC(f, N2, kiss_fft_scalar); |
ALLOC(f2, N2, kiss_fft_scalar); |
/* sin(x) ~= x here */ |
#ifdef FIXED_POINT |
@@ -249,81 +249,60 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala |
} |
/* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */ |
- opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)f); |
+ opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1))); |
- /* Post-rotate */ |
+ /* Post-rotate and de-shuffle in place, working from both ends of the buffer: |
+ each pass reads a front pair and a back pair before overwriting either. */ |
{ |
- kiss_fft_scalar * OPUS_RESTRICT fp = f; |
+ kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); |
+ kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; |
const kiss_twiddle_scalar *t = &l->trig[0]; |
- |
- for(i=0;i<N4;i++) |
+ /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the |
+ middle pair will be computed twice. */ |
+ for(i=0;i<(N4+1)>>1;i++) |
{ |
kiss_fft_scalar re, im, yr, yi; |
- re = fp[0]; |
- im = fp[1]; |
+ kiss_twiddle_scalar t0, t1; |
+ re = yp0[0]; |
+ im = yp0[1]; |
+ t0 = t[i<<shift]; |
+ t1 = t[(N4-i)<<shift]; |
/* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
- yr = S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]); |
- yi = S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]); |
+ yr = S_MUL(re,t0) - S_MUL(im,t1); |
+ yi = S_MUL(im,t0) + S_MUL(re,t1); |
+ re = yp1[0]; |
+ im = yp1[1]; |
/* works because the cos is nearly one */ |
- *fp++ = yr - S_MUL(yi,sine); |
- *fp++ = yi + S_MUL(yr,sine); |
- } |
- } |
- /* De-shuffle the components for the middle of the window only */ |
- { |
- const kiss_fft_scalar * OPUS_RESTRICT fp1 = f; |
- const kiss_fft_scalar * OPUS_RESTRICT fp2 = f+N2-1; |
- kiss_fft_scalar * OPUS_RESTRICT yp = f2; |
- for(i = 0; i < N4; i++) |
- { |
- *yp++ =-*fp1; |
- *yp++ = *fp2; |
- fp1 += 2; |
- fp2 -= 2; |
+ yp0[0] = -(yr - S_MUL(yi,sine)); |
+ yp1[1] = yi + S_MUL(yr,sine); |
+ |
+ t0 = t[(N4-i-1)<<shift]; |
+ t1 = t[(i+1)<<shift]; |
+ /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
+ yr = S_MUL(re,t0) - S_MUL(im,t1); |
+ yi = S_MUL(im,t0) + S_MUL(re,t1); |
+ /* works because the cos is nearly one */ |
+ yp1[0] = -(yr - S_MUL(yi,sine)); |
+ yp0[1] = yi + S_MUL(yr,sine); |
+ yp0 += 2; |
+ yp1 -= 2; |
} |
} |
- out -= (N2-overlap)>>1; |
+ |
/* Mirror on both sides for TDAC */ |
{ |
- kiss_fft_scalar * OPUS_RESTRICT fp1 = f2+N4-1; |
- kiss_fft_scalar * OPUS_RESTRICT xp1 = out+N2-1; |
- kiss_fft_scalar * OPUS_RESTRICT yp1 = out+N4-overlap/2; |
- const opus_val16 * OPUS_RESTRICT wp1 = window; |
- const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; |
- for(i = 0; i< N4-overlap/2; i++) |
- { |
- *xp1 = *fp1; |
- xp1--; |
- fp1--; |
- } |
- for(; i < N4; i++) |
- { |
- kiss_fft_scalar x1; |
- x1 = *fp1--; |
- *yp1++ +=-MULT16_32_Q15(*wp1, x1); |
- *xp1-- += MULT16_32_Q15(*wp2, x1); |
- wp1++; |
- wp2--; |
- } |
- } |
- { |
- kiss_fft_scalar * OPUS_RESTRICT fp2 = f2+N4; |
- kiss_fft_scalar * OPUS_RESTRICT xp2 = out+N2; |
- kiss_fft_scalar * OPUS_RESTRICT yp2 = out+N-1-(N4-overlap/2); |
+ kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; |
+ kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
const opus_val16 * OPUS_RESTRICT wp1 = window; |
const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; |
- for(i = 0; i< N4-overlap/2; i++) |
- { |
- *xp2 = *fp2; |
- xp2++; |
- fp2++; |
- } |
- for(; i < N4; i++) |
+ |
+ for(i = 0; i < overlap/2; i++) |
{ |
- kiss_fft_scalar x2; |
- x2 = *fp2++; |
- *yp2-- = MULT16_32_Q15(*wp1, x2); |
- *xp2++ = MULT16_32_Q15(*wp2, x2); |
+ kiss_fft_scalar x1, x2; |
+ x1 = *xp1; |
+ x2 = *yp1; |
+ *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); |
+ *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); |
wp1++; |
wp2--; |
} |