Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(16)

Side by Side Diff: third_party/opus/src/src/analysis.c

Issue 2962373002: [Opus] Update to v1.2.1 (Closed)
Patch Set: Pre-increment instead of post-increment Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/opus/src/src/analysis.h ('k') | third_party/opus/src/src/mlp_data.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* Copyright (c) 2011 Xiph.Org Foundation 1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */ 2 Written by Jean-Marc Valin */
3 /* 3 /*
4 Redistribution and use in source and binary forms, with or without 4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions 5 modification, are permitted provided that the following conditions
6 are met: 6 are met:
7 7
8 - Redistributions of source code must retain the above copyright 8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer. 9 notice, this list of conditions and the following disclaimer.
10 10
(...skipping 11 matching lines...) Expand all
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */ 26 */
27 27
28 #ifdef HAVE_CONFIG_H 28 #ifdef HAVE_CONFIG_H
29 #include "config.h" 29 #include "config.h"
30 #endif 30 #endif
31 31
32 #define ANALYSIS_C
33
34 #include <stdio.h>
35
36 #include "mathops.h"
32 #include "kiss_fft.h" 37 #include "kiss_fft.h"
33 #include "celt.h" 38 #include "celt.h"
34 #include "modes.h" 39 #include "modes.h"
35 #include "arch.h" 40 #include "arch.h"
36 #include "quant_bands.h" 41 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h" 42 #include "analysis.h"
39 #include "mlp.h" 43 #include "mlp.h"
40 #include "stack_alloc.h" 44 #include "stack_alloc.h"
45 #include "float_cast.h"
41 46
42 #ifndef M_PI 47 #ifndef M_PI
43 #define M_PI 3.141592653 48 #define M_PI 3.141592653
44 #endif 49 #endif
45 50
51 #ifndef DISABLE_FLOAT_API
52
46 static const float dct_table[128] = { 53 static const float dct_table[128] = {
47 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500 00f, 0.250000f, 54 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500 00f, 0.250000f,
48 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500 00f, 0.250000f, 55 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500 00f, 0.250000f,
49 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.1026 31f, 0.034654f, 56 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.1026 31f, 0.034654f,
50 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.3383 30f,-0.351851f, 57 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.3383 30f,-0.351851f,
51 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.2939 69f,-0.346760f, 58 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.2939 69f,-0.346760f,
52 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.2939 69f, 0.346760f, 59 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.2939 69f, 0.346760f,
53 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.2733 00f,-0.102631f, 60 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.2733 00f,-0.102631f,
54 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.2242 92f,-0.338330f, 61 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.2242 92f,-0.338330f,
55 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.1352 99f, 0.326641f, 62 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.1352 99f, 0.326641f,
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
89 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627 f, 0.904508f, 96 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627 f, 0.904508f,
90 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703 f, 0.933013f, 97 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703 f, 0.933013f,
91 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072 f, 0.956773f, 98 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072 f, 0.956773f,
92 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465 f, 0.975528f, 99 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465 f, 0.975528f,
93 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671 f, 0.989074f, 100 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671 f, 0.989074f,
94 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534 f, 0.997261f, 101 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534 f, 0.997261f,
95 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957 f, 1.000000f, 102 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957 f, 1.000000f,
96 }; 103 };
97 104
98 static const int tbands[NB_TBANDS+1] = { 105 static const int tbands[NB_TBANDS+1] = {
99 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 12 0 106 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
100 }; 107 };
101 108
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9 109 #define NB_TONAL_SKIP_BANDS 9
111 110
112 #define cA 0.43157974f 111 static opus_val32 silk_resampler_down2_hp(
113 #define cB 0.67848403f 112 opus_val32 *S, /* I/O State vector [ 2 ] */
114 #define cC 0.08595542f 113 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
115 #define cE ((float)M_PI/2) 114 const opus_val32 *in, /* I Input signal [ len ] */
116 static OPUS_INLINE float fast_atan2f(float y, float x) { 115 int inLen /* I Number of input samples */
117 float x2, y2; 116 )
118 /* Should avoid underflow on the values we'll get */ 117 {
119 if (ABS16(x)+ABS16(y)<1e-9f) 118 int k, len2 = inLen/2;
120 { 119 opus_val32 in32, out32, out32_hp, Y, X;
121 x*=1e12f; 120 opus_val64 hp_ener = 0;
122 y*=1e12f; 121 /* Internal variables and state are in Q10 format */
123 } 122 for( k = 0; k < len2; k++ ) {
124 x2 = x*x; 123 /* Convert to Q10 */
125 y2 = y*y; 124 in32 = in[ 2 * k ];
126 if(x2<y2){ 125
127 float den = (y2 + cB*x2) * (y2 + cC*x2); 126 /* All-pass section for even input sample */
128 if (den!=0) 127 Y = SUB32( in32, S[ 0 ] );
129 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE); 128 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
130 else 129 out32 = ADD32( S[ 0 ], X );
131 return (y<0 ? -cE : cE); 130 S[ 0 ] = ADD32( in32, X );
132 }else{ 131 out32_hp = out32;
133 float den = (x2 + cB*y2) * (x2 + cC*y2); 132 /* Convert to Q10 */
134 if (den!=0) 133 in32 = in[ 2 * k + 1 ];
135 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); 134
136 else 135 /* All-pass section for odd input sample, and add to output of previous section */
137 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); 136 Y = SUB32( in32, S[ 1 ] );
138 } 137 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
138 out32 = ADD32( out32, S[ 1 ] );
139 out32 = ADD32( out32, X );
140 S[ 1 ] = ADD32( in32, X );
141
142 Y = SUB32( -in32, S[ 2 ] );
143 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
144 out32_hp = ADD32( out32_hp, S[ 2 ] );
145 out32_hp = ADD32( out32_hp, X );
146 S[ 2 ] = ADD32( -in32, X );
147
148 hp_ener += out32_hp*(opus_val64)out32_hp;
149 /* Add, convert back to int16 and store to output */
150 out[ k ] = HALF32(out32);
151 }
152 #ifdef FIXED_POINT
153 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
154 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
155 #endif
156 return (opus_val32)hp_ener;
139 } 157 }
140 158
141 void tonality_analysis_init(TonalityAnalysisState *tonal) 159 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opu s_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, in t Fs)
160 {
161 VARDECL(opus_val32, tmp);
162 opus_val32 scale;
163 int j;
164 opus_val32 ret = 0;
165 SAVE_STACK;
166
167 if (subframe==0) return 0;
168 if (Fs == 48000)
169 {
170 subframe *= 2;
171 offset *= 2;
172 } else if (Fs == 16000) {
173 subframe = subframe*2/3;
174 offset = offset*2/3;
175 }
176 ALLOC(tmp, subframe, opus_val32);
177
178 downmix(_x, tmp, subframe, offset, c1, c2, C);
179 #ifdef FIXED_POINT
180 scale = (1<<SIG_SHIFT);
181 #else
182 scale = 1.f/32768;
183 #endif
184 if (c2==-2)
185 scale /= C;
186 else if (c2>-1)
187 scale /= 2;
188 for (j=0;j<subframe;j++)
189 tmp[j] *= scale;
190 if (Fs == 48000)
191 {
192 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
193 } else if (Fs == 24000) {
194 OPUS_COPY(y, tmp, subframe);
195 } else if (Fs == 16000) {
196 VARDECL(opus_val32, tmp3x);
197 ALLOC(tmp3x, 3*subframe, opus_val32);
198 /* Don't do this at home! This resampler is horrible and it's only (barely)
199 usable for the purpose of the analysis because we don't care about all
200 the aliasing between 8 kHz and 12 kHz. */
201 for (j=0;j<subframe;j++)
202 {
203 tmp3x[3*j] = tmp[j];
204 tmp3x[3*j+1] = tmp[j];
205 tmp3x[3*j+2] = tmp[j];
206 }
207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
208 }
209 RESTORE_STACK;
210 return ret;
211 }
212
213 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
142 { 214 {
143 /* Initialize reusable fields. */ 215 /* Initialize reusable fields. */
144 tonal->arch = opus_select_arch(); 216 tonal->arch = opus_select_arch();
217 tonal->Fs = Fs;
145 /* Clear remaining fields. */ 218 /* Clear remaining fields. */
146 tonality_analysis_reset(tonal); 219 tonality_analysis_reset(tonal);
147 } 220 }
148 221
149 void tonality_analysis_reset(TonalityAnalysisState *tonal) 222 void tonality_analysis_reset(TonalityAnalysisState *tonal)
150 { 223 {
151 /* Clear non-reusable fields. */ 224 /* Clear non-reusable fields. */
152 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; 225 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); 226 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
227 tonal->music_confidence = .9f;
228 tonal->speech_confidence = .1f;
154 } 229 }
155 230
156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) 231 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
157 { 232 {
158 int pos; 233 int pos;
159 int curr_lookahead; 234 int curr_lookahead;
160 float psum; 235 float psum;
236 float tonality_max;
237 float tonality_avg;
238 int tonality_count;
161 int i; 239 int i;
162 240
163 pos = tonal->read_pos; 241 pos = tonal->read_pos;
164 curr_lookahead = tonal->write_pos-tonal->read_pos; 242 curr_lookahead = tonal->write_pos-tonal->read_pos;
165 if (curr_lookahead<0) 243 if (curr_lookahead<0)
166 curr_lookahead += DETECT_SIZE; 244 curr_lookahead += DETECT_SIZE;
167 245
168 if (len > 480 && pos != tonal->write_pos) 246 /* On long frames, look at the second analysis window rather than the first. */
247 if (len > tonal->Fs/50 && pos != tonal->write_pos)
169 { 248 {
170 pos++; 249 pos++;
171 if (pos==DETECT_SIZE) 250 if (pos==DETECT_SIZE)
172 pos=0; 251 pos=0;
173 } 252 }
174 if (pos == tonal->write_pos) 253 if (pos == tonal->write_pos)
175 pos--; 254 pos--;
176 if (pos<0) 255 if (pos<0)
177 pos = DETECT_SIZE-1; 256 pos = DETECT_SIZE-1;
178 OPUS_COPY(info_out, &tonal->info[pos], 1); 257 OPUS_COPY(info_out, &tonal->info[pos], 1);
179 tonal->read_subframe += len/120; 258 tonality_max = tonality_avg = info_out->tonality;
180 while (tonal->read_subframe>=4) 259 tonality_count = 1;
260 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
261 for (i=0;i<3;i++)
181 { 262 {
182 tonal->read_subframe -= 4; 263 pos++;
264 if (pos==DETECT_SIZE)
265 pos = 0;
266 if (pos == tonal->write_pos)
267 break;
268 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
269 tonality_avg += tonal->info[pos].tonality;
270 tonality_count++;
271 }
272 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
273 tonal->read_subframe += len/(tonal->Fs/400);
274 while (tonal->read_subframe>=8)
275 {
276 tonal->read_subframe -= 8;
183 tonal->read_pos++; 277 tonal->read_pos++;
184 } 278 }
185 if (tonal->read_pos>=DETECT_SIZE) 279 if (tonal->read_pos>=DETECT_SIZE)
186 tonal->read_pos-=DETECT_SIZE; 280 tonal->read_pos-=DETECT_SIZE;
187 281
188 /* Compensate for the delay in the features themselves. 282 /* The -1 is to compensate for the delay in the features themselves. */
189 FIXME: Need a better estimate the 10 I just made up */ 283 curr_lookahead = IMAX(curr_lookahead-1, 0);
190 curr_lookahead = IMAX(curr_lookahead-10, 0);
191 284
192 psum=0; 285 psum=0;
193 /* Summing the probability of transition patterns that involve music at 286 /* Summing the probability of transition patterns that involve music at
194 time (DETECT_SIZE-curr_lookahead-1) */ 287 time (DETECT_SIZE-curr_lookahead-1) */
195 for (i=0;i<DETECT_SIZE-curr_lookahead;i++) 288 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196 psum += tonal->pmusic[i]; 289 psum += tonal->pmusic[i];
197 for (;i<DETECT_SIZE;i++) 290 for (;i<DETECT_SIZE;i++)
198 psum += tonal->pspeech[i]; 291 psum += tonal->pspeech[i];
199 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; 292 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ 293 /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
201 294
202 info_out->music_prob = psum; 295 info_out->music_prob = psum;
203 } 296 }
204 297
298 static const float std_feature_bias[9] = {
299 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
300 2.163313f, 1.260756f, 1.116868f, 1.918795f
301 };
302
303 #define LEAKAGE_OFFSET 2.5f
304 #define LEAKAGE_SLOPE 2.f
305
306 #ifdef FIXED_POINT
307 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
308 compensate for that in the energy. */
309 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
310 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
311 #else
312 #define SCALE_ENER(e) (e)
313 #endif
314
205 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt _mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix) 315 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt _mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
206 { 316 {
207 int i, b; 317 int i, b;
208 const kiss_fft_state *kfft; 318 const kiss_fft_state *kfft;
209 VARDECL(kiss_fft_cpx, in); 319 VARDECL(kiss_fft_cpx, in);
210 VARDECL(kiss_fft_cpx, out); 320 VARDECL(kiss_fft_cpx, out);
211 int N = 480, N2=240; 321 int N = 480, N2=240;
212 float * OPUS_RESTRICT A = tonal->angle; 322 float * OPUS_RESTRICT A = tonal->angle;
213 float * OPUS_RESTRICT dA = tonal->d_angle; 323 float * OPUS_RESTRICT dA = tonal->d_angle;
214 float * OPUS_RESTRICT d2A = tonal->d2_angle; 324 float * OPUS_RESTRICT d2A = tonal->d2_angle;
(...skipping 13 matching lines...) Expand all
228 float relativeE; 338 float relativeE;
229 float frame_probs[2]; 339 float frame_probs[2];
230 float alpha, alphaE, alphaE2; 340 float alpha, alphaE, alphaE2;
231 float frame_loudness; 341 float frame_loudness;
232 float bandwidth_mask; 342 float bandwidth_mask;
233 int bandwidth=0; 343 int bandwidth=0;
234 float maxE = 0; 344 float maxE = 0;
235 float noise_floor; 345 float noise_floor;
236 int remaining; 346 int remaining;
237 AnalysisInfo *info; 347 AnalysisInfo *info;
348 float hp_ener;
349 float tonality2[240];
350 float midE[8];
351 float spec_variability=0;
352 float band_log2[NB_TBANDS+1];
353 float leakage_from[NB_TBANDS+1];
354 float leakage_to[NB_TBANDS+1];
238 SAVE_STACK; 355 SAVE_STACK;
239 356
240 tonal->last_transition++; 357 alpha = 1.f/IMIN(10, 1+tonal->count);
241 alpha = 1.f/IMIN(20, 1+tonal->count); 358 alphaE = 1.f/IMIN(25, 1+tonal->count);
242 alphaE = 1.f/IMIN(50, 1+tonal->count); 359 alphaE2 = 1.f/IMIN(500, 1+tonal->count);
243 alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
244 360
245 if (tonal->count<4) 361 if (tonal->Fs == 48000)
246 tonal->music_prob = .5; 362 {
363 /* len and offset are now at 24 kHz. */
364 len/= 2;
365 offset /= 2;
366 } else if (tonal->Fs == 16000) {
367 len = 3*len/2;
368 offset = 3*offset/2;
369 }
370
371 if (tonal->count<4) {
372 if (tonal->application == OPUS_APPLICATION_VOIP)
373 tonal->music_prob = .1f;
374 else
375 tonal->music_prob = .625f;
376 }
247 kfft = celt_mode->mdct.kfft[0]; 377 kfft = celt_mode->mdct.kfft[0];
248 if (tonal->count==0) 378 if (tonal->count==0)
249 tonal->mem_fill = 240; 379 tonal->mem_fill = 240;
250 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal ->mem_fill), offset, c1, c2, C); 380 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
381 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
382 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal ->Fs);
251 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) 383 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
252 { 384 {
253 tonal->mem_fill += len; 385 tonal->mem_fill += len;
254 /* Don't have enough to update the analysis */ 386 /* Don't have enough to update the analysis */
255 RESTORE_STACK; 387 RESTORE_STACK;
256 return; 388 return;
257 } 389 }
390 hp_ener = tonal->hp_ener_accum;
258 info = &tonal->info[tonal->write_pos++]; 391 info = &tonal->info[tonal->write_pos++];
259 if (tonal->write_pos>=DETECT_SIZE) 392 if (tonal->write_pos>=DETECT_SIZE)
260 tonal->write_pos-=DETECT_SIZE; 393 tonal->write_pos-=DETECT_SIZE;
261 394
262 ALLOC(in, 480, kiss_fft_cpx); 395 ALLOC(in, 480, kiss_fft_cpx);
263 ALLOC(out, 480, kiss_fft_cpx); 396 ALLOC(out, 480, kiss_fft_cpx);
264 ALLOC(tonality, 240, float); 397 ALLOC(tonality, 240, float);
265 ALLOC(noisiness, 240, float); 398 ALLOC(noisiness, 240, float);
266 for (i=0;i<N2;i++) 399 for (i=0;i<N2;i++)
267 { 400 {
268 float w = analysis_window[i]; 401 float w = analysis_window[i];
269 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); 402 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
270 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); 403 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
271 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); 404 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
272 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); 405 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
273 } 406 }
274 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); 407 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
275 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); 408 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
276 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->me m_fill, c1, c2, C); 409 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
410 &tonal->inmem[240], tonal->downmix_state, remaining,
411 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
277 tonal->mem_fill = 240 + remaining; 412 tonal->mem_fill = 240 + remaining;
278 opus_fft(kfft, in, out, tonal->arch); 413 opus_fft(kfft, in, out, tonal->arch);
279 #ifndef FIXED_POINT 414 #ifndef FIXED_POINT
280 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */ 415 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
281 if (celt_isnan(out[0].r)) 416 if (celt_isnan(out[0].r))
282 { 417 {
283 info->valid = 0; 418 info->valid = 0;
284 RESTORE_STACK; 419 RESTORE_STACK;
285 return; 420 return;
286 } 421 }
(...skipping 11 matching lines...) Expand all
298 X2i = (float)out[N-i].r-out[i].r; 433 X2i = (float)out[N-i].r-out[i].r;
299 434
300 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); 435 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
301 d_angle = angle - A[i]; 436 d_angle = angle - A[i];
302 d2_angle = d_angle - dA[i]; 437 d2_angle = d_angle - dA[i];
303 438
304 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); 439 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
305 d_angle2 = angle2 - angle; 440 d_angle2 = angle2 - angle;
306 d2_angle2 = d_angle2 - d_angle; 441 d2_angle2 = d_angle2 - d_angle;
307 442
308 mod1 = d2_angle - (float)floor(.5+d2_angle); 443 mod1 = d2_angle - (float)float2int(d2_angle);
309 noisiness[i] = ABS16(mod1); 444 noisiness[i] = ABS16(mod1);
310 mod1 *= mod1; 445 mod1 *= mod1;
311 mod1 *= mod1; 446 mod1 *= mod1;
312 447
313 mod2 = d2_angle2 - (float)floor(.5+d2_angle2); 448 mod2 = d2_angle2 - (float)float2int(d2_angle2);
314 noisiness[i] += ABS16(mod2); 449 noisiness[i] += ABS16(mod2);
315 mod2 *= mod2; 450 mod2 *= mod2;
316 mod2 *= mod2; 451 mod2 *= mod2;
317 452
318 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); 453 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
454 /* This introduces an extra delay of 2 frames in the detection. */
319 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; 455 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
456 /* No delay on this detection, but it's less reliable. */
457 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
320 458
321 A[i] = angle2; 459 A[i] = angle2;
322 dA[i] = d_angle2; 460 dA[i] = d_angle2;
323 d2A[i] = mod2; 461 d2A[i] = mod2;
324 } 462 }
325 463 for (i=2;i<N2-1;i++)
464 {
465 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
466 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
467 }
326 frame_tonality = 0; 468 frame_tonality = 0;
327 max_frame_tonality = 0; 469 max_frame_tonality = 0;
328 /*tw_sum = 0;*/ 470 /*tw_sum = 0;*/
329 info->activity = 0; 471 info->activity = 0;
330 frame_noisiness = 0; 472 frame_noisiness = 0;
331 frame_stationarity = 0; 473 frame_stationarity = 0;
332 if (!tonal->count) 474 if (!tonal->count)
333 { 475 {
334 for (b=0;b<NB_TBANDS;b++) 476 for (b=0;b<NB_TBANDS;b++)
335 { 477 {
336 tonal->lowE[b] = 1e10; 478 tonal->lowE[b] = 1e10;
337 tonal->highE[b] = -1e10; 479 tonal->highE[b] = -1e10;
338 } 480 }
339 } 481 }
340 relativeE = 0; 482 relativeE = 0;
341 frame_loudness = 0; 483 frame_loudness = 0;
484 /* The energy of the very first band is special because of DC. */
485 {
486 float E = 0;
487 float X1r, X2r;
488 X1r = 2*(float)out[0].r;
489 X2r = 2*(float)out[0].i;
490 E = X1r*X1r + X2r*X2r;
491 for (i=1;i<4;i++)
492 {
493 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
494 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
495 E += binE;
496 }
497 E = SCALE_ENER(E);
498 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
499 }
342 for (b=0;b<NB_TBANDS;b++) 500 for (b=0;b<NB_TBANDS;b++)
343 { 501 {
344 float E=0, tE=0, nE=0; 502 float E=0, tE=0, nE=0;
345 float L1, L2; 503 float L1, L2;
346 float stationarity; 504 float stationarity;
347 for (i=tbands[b];i<tbands[b+1];i++) 505 for (i=tbands[b];i<tbands[b+1];i++)
348 { 506 {
349 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 507 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
350 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 508 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
351 #ifdef FIXED_POINT 509 binE = SCALE_ENER(binE);
352 /* FIXME: It's probably best to change the BFCC filter initial state i nstead */
353 binE *= 5.55e-17f;
354 #endif
355 E += binE; 510 E += binE;
356 tE += binE*tonality[i]; 511 tE += binE*MAX32(0, tonality[i]);
357 nE += binE*2.f*(.5f-noisiness[i]); 512 nE += binE*2.f*(.5f-noisiness[i]);
358 } 513 }
359 #ifndef FIXED_POINT 514 #ifndef FIXED_POINT
360 /* Check for extreme band energies that could cause NaNs later. */ 515 /* Check for extreme band energies that could cause NaNs later. */
361 if (!(E<1e9f) || celt_isnan(E)) 516 if (!(E<1e9f) || celt_isnan(E))
362 { 517 {
363 info->valid = 0; 518 info->valid = 0;
364 RESTORE_STACK; 519 RESTORE_STACK;
365 return; 520 return;
366 } 521 }
367 #endif 522 #endif
368 523
369 tonal->E[tonal->E_count][b] = E; 524 tonal->E[tonal->E_count][b] = E;
370 frame_noisiness += nE/(1e-15f+E); 525 frame_noisiness += nE/(1e-15f+E);
371 526
372 frame_loudness += (float)sqrt(E+1e-10f); 527 frame_loudness += (float)sqrt(E+1e-10f);
373 logE[b] = (float)log(E+1e-10f); 528 logE[b] = (float)log(E+1e-10f);
374 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); 529 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
375 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); 530 tonal->logE[tonal->E_count][b] = logE[b];
376 if (tonal->highE[b] < tonal->lowE[b]+1.f) 531 if (tonal->count==0)
532 tonal->highE[b] = tonal->lowE[b] = logE[b];
533 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
377 { 534 {
378 tonal->highE[b]+=.5f; 535 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
379 tonal->lowE[b]-=.5f; 536 tonal->highE[b] -= .01f;
537 else
538 tonal->lowE[b] += .01f;
380 } 539 }
381 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE [b]); 540 if (logE[b] > tonal->highE[b])
541 {
542 tonal->highE[b] = logE[b];
543 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
544 } else if (logE[b] < tonal->lowE[b])
545 {
546 tonal->lowE[b] = logE[b];
547 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
548 }
549 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->l owE[b]));
382 550
383 L1=L2=0; 551 L1=L2=0;
384 for (i=0;i<NB_FRAMES;i++) 552 for (i=0;i<NB_FRAMES;i++)
385 { 553 {
386 L1 += (float)sqrt(tonal->E[i][b]); 554 L1 += (float)sqrt(tonal->E[i][b]);
387 L2 += tonal->E[i][b]; 555 L2 += tonal->E[i][b];
388 } 556 }
389 557
390 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); 558 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
391 stationarity *= stationarity; 559 stationarity *= stationarity;
(...skipping 11 matching lines...) Expand all
403 frame_tonality += band_tonality[b]; 571 frame_tonality += band_tonality[b];
404 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) 572 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
405 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; 573 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
406 #endif 574 #endif
407 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*f rame_tonality); 575 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*f rame_tonality);
408 slope += band_tonality[b]*(b-8); 576 slope += band_tonality[b]*(b-8);
409 /*printf("%f %f ", band_tonality[b], stationarity);*/ 577 /*printf("%f %f ", band_tonality[b], stationarity);*/
410 tonal->prev_band_tonality[b] = band_tonality[b]; 578 tonal->prev_band_tonality[b] = band_tonality[b];
411 } 579 }
412 580
581 leakage_from[0] = band_log2[0];
582 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
583 for (b=1;b<NB_TBANDS+1;b++)
584 {
585 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
586 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
587 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OF FSET);
588 }
589 for (b=NB_TBANDS-2;b>=0;b--)
590 {
591 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
592 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
593 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
594 }
595 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
596 for (b=0;b<NB_TBANDS+1;b++)
597 {
598 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
599 represents the boost needed to overcome the amount of analysis leakage
600 cause in a weaker band b by louder neighbouring bands.
601 The second, based on leakage_from[], applies to a loud band b for
602 which the quantization noise causes synthesis leakage to the weaker
603 neighbouring bands. */
604 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
605 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
606 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
607 }
608 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
609
610 for (i=0;i<NB_FRAMES;i++)
611 {
612 int j;
613 float mindist = 1e15f;
614 for (j=0;j<NB_FRAMES;j++)
615 {
616 int k;
617 float dist=0;
618 for (k=0;k<NB_TBANDS;k++)
619 {
620 float tmp;
621 tmp = tonal->logE[i][k] - tonal->logE[j][k];
622 dist += tmp*tmp;
623 }
624 if (j!=i)
625 mindist = MIN32(mindist, dist);
626 }
627 spec_variability += mindist;
628 }
629 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
413 bandwidth_mask = 0; 630 bandwidth_mask = 0;
414 bandwidth = 0; 631 bandwidth = 0;
415 maxE = 0; 632 maxE = 0;
416 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); 633 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
417 #ifdef FIXED_POINT
418 noise_floor *= 1<<(15+SIG_SHIFT);
419 #endif
420 noise_floor *= noise_floor; 634 noise_floor *= noise_floor;
421 for (b=0;b<NB_TOT_BANDS;b++) 635 for (b=0;b<NB_TBANDS;b++)
422 { 636 {
423 float E=0; 637 float E=0;
424 int band_start, band_end; 638 int band_start, band_end;
425 /* Keep a margin of 300 Hz for aliasing */ 639 /* Keep a margin of 300 Hz for aliasing */
426 band_start = extra_bands[b]; 640 band_start = tbands[b];
427 band_end = extra_bands[b+1]; 641 band_end = tbands[b+1];
428 for (i=band_start;i<band_end;i++) 642 for (i=band_start;i<band_end;i++)
429 { 643 {
430 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 644 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
431 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 645 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
432 E += binE; 646 E += binE;
433 } 647 }
648 E = SCALE_ENER(E);
434 maxE = MAX32(maxE, E); 649 maxE = MAX32(maxE, E);
435 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); 650 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
436 E = MAX32(E, tonal->meanE[b]); 651 E = MAX32(E, tonal->meanE[b]);
437 /* Use a simple follower with 13 dB/Bark slope for spreading function */ 652 /* Use a simple follower with 13 dB/Bark slope for spreading function */
438 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); 653 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
439 /* Consider the band "active" only if all these conditions are met: 654 /* Consider the band "active" only if all these conditions are met:
440 1) less than 10 dB below the simple follower 655 1) less than 10 dB below the simple follower
441 2) less than 90 dB below the peak band (maximal masking possible considering 656 2) less than 90 dB below the peak band (maximal masking possible considering
442 both the ATH and the loudness-dependent slope of the spreading function) 657 both the ATH and the loudness-dependent slope of the spreading function)
443 3) above the PCM quantization noise floor 658 3) above the PCM quantization noise floor
659 We use b+1 because the first CELT band isn't included in tbands[]
444 */ 660 */
445 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-ban d_start)) 661 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-ban d_start))
446 bandwidth = b; 662 bandwidth = b+1;
663 }
664 /* Special case for the last two bands, for which we don't have spectrum but only
665 the energy above 12 kHz. */
666 if (tonal->Fs == 48000) {
667 float ratio;
668 float E = hp_ener*(1.f/(240*240));
669 ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f;
670 #ifdef FIXED_POINT
671 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
672 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
673 #endif
674 maxE = MAX32(maxE, E);
675 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
676 E = MAX32(E, tonal->meanE[b]);
677 /* Use a simple follower with 13 dB/Bark slope for spreading function */
678 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
679 if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
680 bandwidth = 20;
681 /* This detector is unreliable, so if the bandwidth is close to SWB, assu me it's FB. */
682 if (bandwidth >= 17)
683 bandwidth = 20;
447 } 684 }
448 if (tonal->count<=2) 685 if (tonal->count<=2)
449 bandwidth = 20; 686 bandwidth = 20;
450 frame_loudness = 20*(float)log10(frame_loudness); 687 frame_loudness = 20*(float)log10(frame_loudness);
451 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); 688 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
452 tonal->lowECount *= (1-alphaE); 689 tonal->lowECount *= (1-alphaE);
453 if (frame_loudness < tonal->Etracker-30) 690 if (frame_loudness < tonal->Etracker-30)
454 tonal->lowECount += alphaE; 691 tonal->lowECount += alphaE;
455 692
456 for (i=0;i<8;i++) 693 for (i=0;i<8;i++)
457 { 694 {
458 float sum=0; 695 float sum=0;
459 for (b=0;b<16;b++) 696 for (b=0;b<16;b++)
460 sum += dct_table[i*16+b]*logE[b]; 697 sum += dct_table[i*16+b]*logE[b];
461 BFCC[i] = sum; 698 BFCC[i] = sum;
462 } 699 }
700 for (i=0;i<8;i++)
701 {
702 float sum=0;
703 for (b=0;b<16;b++)
704 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
705 midE[i] = sum;
706 }
463 707
464 frame_stationarity /= NB_TBANDS; 708 frame_stationarity /= NB_TBANDS;
465 relativeE /= NB_TBANDS; 709 relativeE /= NB_TBANDS;
466 if (tonal->count<10) 710 if (tonal->count<10)
467 relativeE = .5; 711 relativeE = .5f;
468 frame_noisiness /= NB_TBANDS; 712 frame_noisiness /= NB_TBANDS;
469 #if 1 713 #if 1
470 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; 714 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
471 #else 715 #else
472 info->activity = .5*(1+frame_noisiness-frame_stationarity); 716 info->activity = .5*(1+frame_noisiness-frame_stationarity);
473 #endif 717 #endif
474 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); 718 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
475 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); 719 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
476 tonal->prev_tonality = frame_tonality; 720 tonal->prev_tonality = frame_tonality;
477 721
478 slope /= 8*8; 722 slope /= 8*8;
479 info->tonality_slope = slope; 723 info->tonality_slope = slope;
480 724
481 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; 725 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
482 tonal->count++; 726 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
483 info->tonality = frame_tonality; 727 info->tonality = frame_tonality;
484 728
485 for (i=0;i<4;i++) 729 for (i=0;i<4;i++)
486 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem [i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; 730 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem [i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
487 731
488 for (i=0;i<4;i++) 732 for (i=0;i<4;i++)
489 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; 733 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
490 734
491 for (i=0;i<4;i++) 735 for (i=0;i<4;i++)
492 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->m em[i]-tonal->mem[i+16]); 736 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->m em[i]-tonal->mem[i+16]);
493 for (i=0;i<3;i++) 737 for (i=0;i<3;i++)
494 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->m em[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; 738 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->m em[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
495 739
496 if (tonal->count > 5) 740 if (tonal->count > 5)
497 { 741 {
498 for (i=0;i<9;i++) 742 for (i=0;i<9;i++)
499 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i ]; 743 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i ];
500 } 744 }
745 for (i=0;i<4;i++)
746 features[i] = BFCC[i]-midE[i];
501 747
502 for (i=0;i<8;i++) 748 for (i=0;i<8;i++)
503 { 749 {
504 tonal->mem[i+24] = tonal->mem[i+16]; 750 tonal->mem[i+24] = tonal->mem[i+16];
505 tonal->mem[i+16] = tonal->mem[i+8]; 751 tonal->mem[i+16] = tonal->mem[i+8];
506 tonal->mem[i+8] = tonal->mem[i]; 752 tonal->mem[i+8] = tonal->mem[i];
507 tonal->mem[i] = BFCC[i]; 753 tonal->mem[i] = BFCC[i];
508 } 754 }
509 for (i=0;i<9;i++) 755 for (i=0;i<9;i++)
510 features[11+i] = (float)sqrt(tonal->std[i]); 756 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
511 features[20] = info->tonality; 757 features[18] = spec_variability - 0.78f;
512 features[21] = info->activity; 758 features[20] = info->tonality - 0.154723f;
513 features[22] = frame_stationarity; 759 features[21] = info->activity - 0.724643f;
514 features[23] = info->tonality_slope; 760 features[22] = frame_stationarity - 0.743717f;
515 features[24] = tonal->lowECount; 761 features[23] = info->tonality_slope + 0.069216f;
762 features[24] = tonal->lowECount - 0.067930f;
516 763
517 #ifndef DISABLE_FLOAT_API
518 mlp_process(&net, features, frame_probs); 764 mlp_process(&net, features, frame_probs);
519 frame_probs[0] = .5f*(frame_probs[0]+1); 765 frame_probs[0] = .5f*(frame_probs[0]+1);
520 /* Curve fitting between the MLP probability and the actual probability */ 766 /* Curve fitting between the MLP probability and the actual probability */
521 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)po w(frame_probs[0], 10); 767 /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float) pow(frame_probs[0], 10);*/
522 /* Probability of active audio (as opposed to silence) */ 768 /* Probability of active audio (as opposed to silence) */
523 frame_probs[1] = .5f*frame_probs[1]+.5f; 769 frame_probs[1] = .5f*frame_probs[1]+.5f;
524 /* Consider that silence has a 50-50 probability. */ 770 frame_probs[1] *= frame_probs[1];
525 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526 771
527 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ 772 /* Probability of speech or music vs noise */
773 info->activity_probability = frame_probs[1];
774
775 /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
528 { 776 {
529 /* Probability of state transition */ 777 /* Probability of state transition */
530 float tau; 778 float tau;
531 /* Represents independence of the MLP probabilities, where 779 /* Represents independence of the MLP probabilities, where
532 beta=1 means fully independent. */ 780 beta=1 means fully independent. */
533 float beta; 781 float beta;
534 /* Denormalized probability of speech (p0) and music (p1) after update */ 782 /* Denormalized probability of speech (p0) and music (p1) after update */
535 float p0, p1; 783 float p0, p1;
536 /* Probabilities for "all speech" and "all music" */ 784 /* Probabilities for "all speech" and "all music" */
537 float s0, m0; 785 float s0, m0;
538 /* Probability sum for renormalisation */ 786 /* Probability sum for renormalisation */
539 float psum; 787 float psum;
540 /* Instantaneous probability of speech and music, with beta pre-applied. */ 788 /* Instantaneous probability of speech and music, with beta pre-applied. */
541 float speech0; 789 float speech0;
542 float music0; 790 float music0;
543 float p, q; 791 float p, q;
544 792
793 /* More silence transitions for speech than for music. */
794 tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
795 p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
796 q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
797 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
798 /* p0 and p1 are the probabilities of speech and music at this frame
799 using only information from previous frame and applying the
800 state transition model */
801 p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
802 p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
803 /* We apply the current probability with exponent beta to work around
804 the fact that the probability estimates aren't independent. */
805 p0 *= (float)pow(1-frame_probs[1], beta);
806 p1 *= (float)pow(frame_probs[1], beta);
807 /* Normalise the probabilities to get the Marokv probability of music. */
808 tonal->vad_prob = p1/(p0+p1);
809 info->vad_prob = tonal->vad_prob;
810 /* Consider that silence has a 50-50 probability of being speech or music . */
811 frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f ;
812
545 /* One transition every 3 minutes of active audio */ 813 /* One transition every 3 minutes of active audio */
546 tau = .00005f*frame_probs[1]; 814 tau = .0001f;
547 /* Adapt beta based on how "unexpected" the new prob is */ 815 /* Adapt beta based on how "unexpected" the new prob is */
548 p = MAX16(.05f,MIN16(.95f,frame_probs[0])); 816 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
549 q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); 817 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
550 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); 818 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
551 /* p0 and p1 are the probabilities of speech and music at this frame 819 /* p0 and p1 are the probabilities of speech and music at this frame
552 using only information from previous frame and applying the 820 using only information from previous frame and applying the
553 state transition model */ 821 state transition model */
554 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; 822 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
555 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; 823 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
556 /* We apply the current probability with exponent beta to work around 824 /* We apply the current probability with exponent beta to work around
557 the fact that the probability estimates aren't independent. */ 825 the fact that the probability estimates aren't independent. */
558 p0 *= (float)pow(1-frame_probs[0], beta); 826 p0 *= (float)pow(1-frame_probs[0], beta);
559 p1 *= (float)pow(frame_probs[0], beta); 827 p1 *= (float)pow(frame_probs[0], beta);
560 /* Normalise the probabilities to get the Marokv probability of music. */ 828 /* Normalise the probabilities to get the Marokv probability of music. */
561 tonal->music_prob = p1/(p0+p1); 829 tonal->music_prob = p1/(p0+p1);
562 info->music_prob = tonal->music_prob; 830 info->music_prob = tonal->music_prob;
563 831
832 /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_pr ob, tonal->vad_prob);*/
564 /* This chunk of code deals with delayed decision. */ 833 /* This chunk of code deals with delayed decision. */
565 psum=1e-20f; 834 psum=1e-20f;
566 /* Instantaneous probability of speech and music, with beta pre-applied. */ 835 /* Instantaneous probability of speech and music, with beta pre-applied. */
567 speech0 = (float)pow(1-frame_probs[0], beta); 836 speech0 = (float)pow(1-frame_probs[0], beta);
568 music0 = (float)pow(frame_probs[0], beta); 837 music0 = (float)pow(frame_probs[0], beta);
569 if (tonal->count==1) 838 if (tonal->count==1)
570 { 839 {
571 tonal->pspeech[0]=.5; 840 if (tonal->application == OPUS_APPLICATION_VOIP)
572 tonal->pmusic [0]=.5; 841 tonal->pmusic[0] = .1f;
842 else
843 tonal->pmusic[0] = .625f;
844 tonal->pspeech[0] = 1-tonal->pmusic[0];
573 } 845 }
574 /* Updated probability of having only speech (s0) or only music (m0), 846 /* Updated probability of having only speech (s0) or only music (m0),
575 before considering the new observation. */ 847 before considering the new observation. */
576 s0 = tonal->pspeech[0] + tonal->pspeech[1]; 848 s0 = tonal->pspeech[0] + tonal->pspeech[1];
577 m0 = tonal->pmusic [0] + tonal->pmusic [1]; 849 m0 = tonal->pmusic [0] + tonal->pmusic [1];
578 /* Updates s0 and m0 with instantaneous probability. */ 850 /* Updates s0 and m0 with instantaneous probability. */
579 tonal->pspeech[0] = s0*(1-tau)*speech0; 851 tonal->pspeech[0] = s0*(1-tau)*speech0;
580 tonal->pmusic [0] = m0*(1-tau)*music0; 852 tonal->pmusic [0] = m0*(1-tau)*music0;
581 /* Propagate the transition probabilities */ 853 /* Propagate the transition probabilities */
582 for (i=1;i<DETECT_SIZE-1;i++) 854 for (i=1;i<DETECT_SIZE-1;i++)
(...skipping 29 matching lines...) Expand all
612 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); 884 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
613 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->m usic_confidence); 885 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->m usic_confidence);
614 } 886 }
615 if (tonal->music_prob<.1) 887 if (tonal->music_prob<.1)
616 { 888 {
617 float adapt; 889 float adapt;
618 adapt = 1.f/(++tonal->speech_confidence_count); 890 adapt = 1.f/(++tonal->speech_confidence_count);
619 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun t, 500); 891 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun t, 500);
620 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s peech_confidence); 892 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s peech_confidence);
621 } 893 }
622 } else {
623 if (tonal->music_confidence_count==0)
624 tonal->music_confidence = .9f;
625 if (tonal->speech_confidence_count==0)
626 tonal->speech_confidence = .1f;
627 } 894 }
628 } 895 }
629 if (tonal->last_music != (tonal->music_prob>.5f))
630 tonal->last_transition=0;
631 tonal->last_music = tonal->music_prob>.5f; 896 tonal->last_music = tonal->music_prob>.5f;
632 #else 897 #ifdef MLP_TRAINING
633 info->music_prob = 0; 898 for (i=0;i<25;i++)
899 printf("%f ", features[i]);
900 printf("\n");
634 #endif 901 #endif
635 /*for (i=0;i<25;i++)
636 printf("%f ", features[i]);
637 printf("\n");*/
638 902
639 info->bandwidth = bandwidth; 903 info->bandwidth = bandwidth;
904 tonal->prev_bandwidth = bandwidth;
640 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ 905 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
641 info->noisiness = frame_noisiness; 906 info->noisiness = frame_noisiness;
642 info->valid = 1; 907 info->valid = 1;
643 RESTORE_STACK; 908 RESTORE_STACK;
644 } 909 }
645 910
646 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co nst void *analysis_pcm, 911 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co nst void *analysis_pcm,
647 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs, 912 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
648 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_inf o) 913 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_inf o)
649 { 914 {
650 int offset; 915 int offset;
651 int pcm_len; 916 int pcm_len;
652 917
918 analysis_frame_size -= analysis_frame_size&1;
653 if (analysis_pcm != NULL) 919 if (analysis_pcm != NULL)
654 { 920 {
655 /* Avoid overflow/wrap-around of the analysis buffer */ 921 /* Avoid overflow/wrap-around of the analysis buffer */
656 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); 922 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
657 923
658 pcm_len = analysis_frame_size - analysis->analysis_offset; 924 pcm_len = analysis_frame_size - analysis->analysis_offset;
659 offset = analysis->analysis_offset; 925 offset = analysis->analysis_offset;
660 do { 926 while (pcm_len>0) {
661 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len) , offset, c1, c2, C, lsb_depth, downmix); 927 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_le n), offset, c1, c2, C, lsb_depth, downmix);
662 offset += 480; 928 offset += Fs/50;
663 pcm_len -= 480; 929 pcm_len -= Fs/50;
664 } while (pcm_len>0); 930 }
665 analysis->analysis_offset = analysis_frame_size; 931 analysis->analysis_offset = analysis_frame_size;
666 932
667 analysis->analysis_offset -= frame_size; 933 analysis->analysis_offset -= frame_size;
668 } 934 }
669 935
670 analysis_info->valid = 0; 936 analysis_info->valid = 0;
671 tonality_get_info(analysis, analysis_info, frame_size); 937 tonality_get_info(analysis, analysis_info, frame_size);
672 } 938 }
939
940 #endif /* DISABLE_FLOAT_API */
OLDNEW
« no previous file with comments | « third_party/opus/src/src/analysis.h ('k') | third_party/opus/src/src/mlp_data.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698