| Index: celt/celt_lpc.c | 
| diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c | 
| index d2addbf24b2323e0368242df94605c8a3a46c12d..7ffe90a357ed482b98c7b0bcc94ccf3f96f9ed81 100644 | 
| --- a/celt/celt_lpc.c | 
| +++ b/celt/celt_lpc.c | 
| @@ -32,6 +32,7 @@ | 
| #include "celt_lpc.h" | 
| #include "stack_alloc.h" | 
| #include "mathops.h" | 
| +#include "pitch.h" | 
|  | 
| void _celt_lpc( | 
| opus_val16       *_lpc, /* out: [0...p-1] LPC coefficients      */ | 
| @@ -87,42 +88,71 @@ int          p | 
| #endif | 
| } | 
|  | 
| -void celt_fir(const opus_val16 *x, | 
| +void celt_fir(const opus_val16 *_x, /* in: [0...N-1] input samples */ | 
| const opus_val16 *num, | 
| -         opus_val16 *y, | 
| +         opus_val16 *_y, /* out: [0...N-1] filtered samples */ | 
| int N, | 
| int ord, | 
| opus_val16 *mem) | 
| { | 
| int i,j; | 
| +   VARDECL(opus_val16, rnum); /* num[] in reverse order */ | 
| +   VARDECL(opus_val16, x); /* ord history samples from mem[], then the N inputs */ | 
| +   SAVE_STACK; | 
|  | 
| +   ALLOC(rnum, ord, opus_val16); | 
| +   ALLOC(x, N+ord, opus_val16); | 
| +   for(i=0;i<ord;i++) | 
| +      rnum[i] = num[ord-i-1]; | 
| +   for(i=0;i<ord;i++) | 
| +      x[i] = mem[ord-i-1]; /* mem[0] is the most recent sample, so it lands at x[ord-1] */ | 
| +   for (i=0;i<N;i++) | 
| +      x[i+ord]=_x[i]; | 
| +   for(i=0;i<ord;i++) | 
| +      mem[i] = x[N+ord-i-1]; /* newest ord samples of x[]: same as _x[N-i-1] when N>=ord, and never reads before _x[0] when N<ord */ | 
| +#ifdef SMALL_FOOTPRINT | 
| for (i=0;i<N;i++) | 
| { | 
| -      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT); | 
| +      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); | 
| for (j=0;j<ord;j++) | 
| { | 
| -         sum += MULT16_16(num[j],mem[j]); | 
| -      } | 
| -      for (j=ord-1;j>=1;j--) | 
| -      { | 
| -         mem[j]=mem[j-1]; | 
| +         sum = MAC16_16(sum,rnum[j],x[i+j]); /* x[i+j], j<ord, are the ord samples preceding input i */ | 
| } | 
| -      mem[0] = x[i]; | 
| -      y[i] = ROUND16(sum, SIG_SHIFT); | 
| +      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); | 
| } | 
| +#else | 
| +   for (i=0;i<N-3;i+=4) /* four outputs per xcorr_kernel() call */ | 
| +   { | 
| +      opus_val32 sum[4]={0,0,0,0}; | 
| +      xcorr_kernel(rnum, x+i, sum, ord); | 
| +      _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0], SIG_SHIFT))); | 
| +      _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT))); | 
| +      _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT))); | 
| +      _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3], SIG_SHIFT))); | 
| +   } | 
| +   for (;i<N;i++) /* leftover outputs when N is not a multiple of 4 */ | 
| +   { | 
| +      opus_val32 sum = 0; | 
| +      for (j=0;j<ord;j++) | 
| +         sum = MAC16_16(sum,rnum[j],x[i+j]); | 
| +      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); | 
| +   } | 
| +#endif | 
| +   RESTORE_STACK; | 
| } | 
|  | 
| -void celt_iir(const opus_val32 *x, | 
| +void celt_iir(const opus_val32 *_x, /* in: [0...N-1] input samples */ | 
| const opus_val16 *den, | 
| -         opus_val32 *y, | 
| +         opus_val32 *_y, /* out: [0...N-1] filtered samples */ | 
| int N, | 
| int ord, | 
| opus_val16 *mem) | 
| { | 
| +#ifdef SMALL_FOOTPRINT | 
| int i,j; | 
| for (i=0;i<N;i++) | 
| { | 
| -      opus_val32 sum = x[i]; | 
| +      opus_val32 sum = _x[i]; | 
| for (j=0;j<ord;j++) | 
| { | 
| sum -= MULT16_16(den[j],mem[j]); | 
| @@ -132,11 +162,65 @@ void celt_iir(const opus_val32 *x, | 
| mem[j]=mem[j-1]; | 
| } | 
| mem[0] = ROUND16(sum,SIG_SHIFT); | 
| -      y[i] = sum; | 
| +      _y[i] = sum; | 
| } | 
| +#else | 
| +   int i,j; | 
| +   VARDECL(opus_val16, rden); /* den[] in reverse order */ | 
| +   VARDECL(opus_val16, y); /* NEGATED output history: ord samples from mem[], then the N new outputs */ | 
| +   SAVE_STACK; | 
| + | 
| +   celt_assert((ord&3)==0); | 
| +   ALLOC(rden, ord, opus_val16); | 
| +   ALLOC(y, N+ord, opus_val16); | 
| +   for(i=0;i<ord;i++) | 
| +      rden[i] = den[ord-i-1]; | 
| +   for(i=0;i<ord;i++) | 
| +      y[i] = -mem[ord-i-1]; /* stored negated so that accumulating rden*y subtracts the feedback */ | 
| +   for(;i<N+ord;i++) | 
| +      y[i]=0; /* outputs not computed yet must add nothing to the kernel sums */ | 
| +   for (i=0;i<N-3;i+=4) | 
| +   { | 
| +      /* Unroll by 4 as if it were an FIR filter */ | 
| +      opus_val32 sum[4]; | 
| +      sum[0]=_x[i]; | 
| +      sum[1]=_x[i+1]; | 
| +      sum[2]=_x[i+2]; | 
| +      sum[3]=_x[i+3]; | 
| +      xcorr_kernel(rden, y+i, sum, ord); | 
| + | 
| +      /* Patch up the result to compensate for the fact that this is an IIR */ | 
| +      y[i+ord  ] = -ROUND16(sum[0],SIG_SHIFT); | 
| +      _y[i  ] = sum[0]; | 
| +      sum[1] = MAC16_16(sum[1], y[i+ord  ], den[0]); | 
| +      y[i+ord+1] = -ROUND16(sum[1],SIG_SHIFT); | 
| +      _y[i+1] = sum[1]; | 
| +      sum[2] = MAC16_16(sum[2], y[i+ord+1], den[0]); | 
| +      sum[2] = MAC16_16(sum[2], y[i+ord  ], den[1]); | 
| +      y[i+ord+2] = -ROUND16(sum[2],SIG_SHIFT); | 
| +      _y[i+2] = sum[2]; | 
| + | 
| +      sum[3] = MAC16_16(sum[3], y[i+ord+2], den[0]); | 
| +      sum[3] = MAC16_16(sum[3], y[i+ord+1], den[1]); | 
| +      sum[3] = MAC16_16(sum[3], y[i+ord  ], den[2]); | 
| +      y[i+ord+3] = -ROUND16(sum[3],SIG_SHIFT); | 
| +      _y[i+3] = sum[3]; | 
| +   } | 
| +   for (;i<N;i++) /* leftover outputs when N is not a multiple of 4 */ | 
| +   { | 
| +      opus_val32 sum = _x[i]; | 
| +      for (j=0;j<ord;j++) | 
| +         sum = MAC16_16(sum, rden[j], y[i+j]); /* y[] is negated, so adding here subtracts the feedback */ | 
| +      y[i+ord] = -ROUND16(sum,SIG_SHIFT); /* keep the negated-history convention used above */ | 
| +      _y[i] = sum; | 
| +   } | 
| +   for(i=0;i<ord;i++) | 
| +      mem[i] = -y[N+ord-i-1]; /* rounded 16-bit history, as in the SMALL_FOOTPRINT branch; stays inside y[] even when N<ord */ | 
| +   RESTORE_STACK; | 
| +#endif | 
| } | 
|  | 
| -void _celt_autocorr( | 
| +int _celt_autocorr( /* returns the net right-shift applied to the ac[] values; always 0 unless FIXED_POINT */ | 
| const opus_val16 *x,   /*  in: [0...n-1] samples x   */ | 
| opus_val32       *ac,  /* out: [0...lag-1] ac values */ | 
| const opus_val16       *window, | 
| @@ -146,43 +230,79 @@ void _celt_autocorr( | 
| ) | 
| { | 
| opus_val32 d; | 
| -   int i; | 
| +   int i, k; | 
| +   int fastN=n-lag; /* length passed to celt_pitch_xcorr(); the loop after that call adds the terms from k+fastN up to n-1 */ | 
| +   int shift; | 
| +   const opus_val16 *xptr; /* signal actually correlated: x itself, or the scratch copy xx */ | 
| VARDECL(opus_val16, xx); | 
| SAVE_STACK; | 
| ALLOC(xx, n, opus_val16); | 
| celt_assert(n>0); | 
| celt_assert(overlap>=0); | 
| -   for (i=0;i<n;i++) | 
| -      xx[i] = x[i]; | 
| -   for (i=0;i<overlap;i++) | 
| +   if (overlap == 0) /* nothing to window: read the input in place and skip the copy */ | 
| { | 
| -      xx[i] = MULT16_16_Q15(x[i],window[i]); | 
| -      xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]); | 
| +      xptr = x; | 
| +   } else { | 
| +      for (i=0;i<n;i++) | 
| +         xx[i] = x[i]; | 
| +      for (i=0;i<overlap;i++) /* window[] is applied to the first and (mirrored) last overlap samples */ | 
| +      { | 
| +         xx[i] = MULT16_16_Q15(x[i],window[i]); | 
| +         xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]); | 
| +      } | 
| +      xptr = xx; | 
| } | 
| +   shift=0; /* stays 0 when FIXED_POINT is not defined */ | 
| #ifdef FIXED_POINT | 
| { | 
| -      opus_val32 ac0=0; | 
| -      int shift; | 
| -      for(i=0;i<n;i++) | 
| -         ac0 += SHR32(MULT16_16(xx[i],xx[i]),9); | 
| -      ac0 += 1+n; | 
| +      opus_val32 ac0; /* coarse energy estimate, used only to choose shift */ | 
| +      ac0 = 1+(n<<7); | 
| +      if (n&1) ac0 += SHR32(MULT16_16(xptr[0],xptr[0]),9); /* odd n: take sample 0 alone so the loop can go two at a time */ | 
| +      for(i=(n&1);i<n;i+=2) | 
| +      { | 
| +         ac0 += SHR32(MULT16_16(xptr[i],xptr[i]),9); | 
| +         ac0 += SHR32(MULT16_16(xptr[i+1],xptr[i+1]),9); | 
| +      } | 
|  | 
| shift = celt_ilog2(ac0)-30+10; | 
| -      shift = (shift+1)/2; | 
| -      for(i=0;i<n;i++) | 
| -         xx[i] = VSHR32(xx[i], shift); | 
| +      shift = (shift)/2; /* per-sample shift; doubled again after the products are formed */ | 
| +      if (shift>0) /* only ever scale the input down, never up */ | 
| +      { | 
| +         for(i=0;i<n;i++) | 
| +            xx[i] = PSHR32(xptr[i], shift); | 
| +         xptr = xx; | 
| +      } else | 
| +         shift = 0; | 
| } | 
| #endif | 
| -   while (lag>=0) | 
| +   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1); /* NOTE(review): presumably fills ac[k] with the first fastN products for each lag k - confirm in pitch.h */ | 
| +   for (k=0;k<=lag;k++) /* ac[] receives lag+1 values, indices 0..lag */ | 
| { | 
| -      for (i = lag, d = 0; i < n; i++) | 
| -         d += xx[i] * xx[i-lag]; | 
| -      ac[lag] = d; | 
| -      /*printf ("%f ", ac[lag]);*/ | 
| -      lag--; | 
| +      for (i = k+fastN, d = 0; i < n; i++) /* remaining products for lag k */ | 
| +         d = MAC16_16(d, xptr[i], xptr[i-k]); | 
| +      ac[k] += d; | 
| } | 
| -   /*printf ("\n");*/ | 
| -   ac[0] += 10; | 
| +#ifdef FIXED_POINT | 
| +   shift = 2*shift; /* a product of two shifted samples carries twice the per-sample shift */ | 
| +   if (shift<=0) /* shift was clamped to >= 0 above, so this is the "input was not scaled" case */ | 
| +      ac[0] += SHL32((opus_int32)1, -shift); | 
| +   if (ac[0] < 268435456) /* 2^28: scale every ac[] value up */ | 
| +   { | 
| +      int shift2 = 29 - EC_ILOG(ac[0]); /* NOTE(review): presumably brings ac[0] into [2^28, 2^29) - depends on EC_ILOG semantics, confirm */ | 
| +      for (i=0;i<=lag;i++) | 
| +         ac[i] = SHL32(ac[i], shift2); | 
| +      shift -= shift2; | 
| +   } else if (ac[0] >= 536870912) /* 2^29: scale every ac[] value down */ | 
| +   { | 
| +      int shift2=1; | 
| +      if (ac[0] >= 1073741824) /* 2^30: one more bit */ | 
| +         shift2++; | 
| +      for (i=0;i<=lag;i++) | 
| +         ac[i] = SHR32(ac[i], shift2); | 
| +      shift += shift2; | 
| +   } | 
| +#endif | 
|  | 
| RESTORE_STACK; | 
| +   return shift; | 
| } | 
|  |