OLD | NEW |
1 /* Copyright (c) 2011 Xiph.Org Foundation | 1 /* Copyright (c) 2011 Xiph.Org Foundation |
2 Written by Jean-Marc Valin */ | 2 Written by Jean-Marc Valin */ |
3 /* | 3 /* |
4 Redistribution and use in source and binary forms, with or without | 4 Redistribution and use in source and binary forms, with or without |
5 modification, are permitted provided that the following conditions | 5 modification, are permitted provided that the following conditions |
6 are met: | 6 are met: |
7 | 7 |
8 - Redistributions of source code must retain the above copyright | 8 - Redistributions of source code must retain the above copyright |
9 notice, this list of conditions and the following disclaimer. | 9 notice, this list of conditions and the following disclaimer. |
10 | 10 |
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
108 /*static const float tweight[NB_TBANDS+1] = { | 108 /*static const float tweight[NB_TBANDS+1] = { |
109 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 | 109 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 |
110 };*/ | 110 };*/ |
111 | 111 |
112 #define NB_TONAL_SKIP_BANDS 9 | 112 #define NB_TONAL_SKIP_BANDS 9 |
113 | 113 |
114 #define cA 0.43157974f | 114 #define cA 0.43157974f |
115 #define cB 0.67848403f | 115 #define cB 0.67848403f |
116 #define cC 0.08595542f | 116 #define cC 0.08595542f |
117 #define cE ((float)M_PI/2) | 117 #define cE ((float)M_PI/2) |
118 static inline float fast_atan2f(float y, float x) { | 118 static OPUS_INLINE float fast_atan2f(float y, float x) { |
119 float x2, y2; | 119 float x2, y2; |
120 /* Should avoid underflow on the values we'll get */ | 120 /* Should avoid underflow on the values we'll get */ |
121 if (ABS16(x)+ABS16(y)<1e-9f) | 121 if (ABS16(x)+ABS16(y)<1e-9f) |
122 { | 122 { |
123 x*=1e12f; | 123 x*=1e12f; |
124 y*=1e12f; | 124 y*=1e12f; |
125 } | 125 } |
126 x2 = x*x; | 126 x2 = x*x; |
127 y2 = y*y; | 127 y2 = y*y; |
128 if(x2<y2){ | 128 if(x2<y2){ |
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
246 if (tonal->write_pos>=DETECT_SIZE) | 246 if (tonal->write_pos>=DETECT_SIZE) |
247 tonal->write_pos-=DETECT_SIZE; | 247 tonal->write_pos-=DETECT_SIZE; |
248 | 248 |
249 ALLOC(in, 480, kiss_fft_cpx); | 249 ALLOC(in, 480, kiss_fft_cpx); |
250 ALLOC(out, 480, kiss_fft_cpx); | 250 ALLOC(out, 480, kiss_fft_cpx); |
251 ALLOC(tonality, 240, float); | 251 ALLOC(tonality, 240, float); |
252 ALLOC(noisiness, 240, float); | 252 ALLOC(noisiness, 240, float); |
253 for (i=0;i<N2;i++) | 253 for (i=0;i<N2;i++) |
254 { | 254 { |
255 float w = analysis_window[i]; | 255 float w = analysis_window[i]; |
256 in[i].r = w*tonal->inmem[i]; | 256 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); |
257 in[i].i = w*tonal->inmem[N2+i]; | 257 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); |
258 in[N-i-1].r = w*tonal->inmem[N-i-1]; | 258 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); |
259 in[N-i-1].i = w*tonal->inmem[N+N2-i-1]; | 259 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); |
260 } | 260 } |
261 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); | 261 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); |
262 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); | 262 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); |
263 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->me
m_fill, c1, c2, C); | 263 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->me
m_fill, c1, c2, C); |
264 tonal->mem_fill = 240 + remaining; | 264 tonal->mem_fill = 240 + remaining; |
265 opus_fft(kfft, in, out); | 265 opus_fft(kfft, in, out); |
266 | 266 |
267 for (i=1;i<N2;i++) | 267 for (i=1;i<N2;i++) |
268 { | 268 { |
269 float X1r, X2r, X1i, X2i; | 269 float X1r, X2r, X1i, X2i; |
270 float angle, d_angle, d2_angle; | 270 float angle, d_angle, d2_angle; |
271 float angle2, d_angle2, d2_angle2; | 271 float angle2, d_angle2, d2_angle2; |
272 float mod1, mod2, avg_mod; | 272 float mod1, mod2, avg_mod; |
273 X1r = out[i].r+out[N-i].r; | 273 X1r = (float)out[i].r+out[N-i].r; |
274 X1i = out[i].i-out[N-i].i; | 274 X1i = (float)out[i].i-out[N-i].i; |
275 X2r = out[i].i+out[N-i].i; | 275 X2r = (float)out[i].i+out[N-i].i; |
276 X2i = out[N-i].r-out[i].r; | 276 X2i = (float)out[N-i].r-out[i].r; |
277 | 277 |
278 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); | 278 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); |
279 d_angle = angle - A[i]; | 279 d_angle = angle - A[i]; |
280 d2_angle = d_angle - dA[i]; | 280 d2_angle = d_angle - dA[i]; |
281 | 281 |
282 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); | 282 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); |
283 d_angle2 = angle2 - angle; | 283 d_angle2 = angle2 - angle; |
284 d2_angle2 = d_angle2 - d_angle; | 284 d2_angle2 = d_angle2 - d_angle; |
285 | 285 |
286 mod1 = d2_angle - (float)floor(.5+d2_angle); | 286 mod1 = d2_angle - (float)floor(.5+d2_angle); |
(...skipping 23 matching lines...) Expand all Loading... |
310 if (!tonal->count) | 310 if (!tonal->count) |
311 { | 311 { |
312 for (b=0;b<NB_TBANDS;b++) | 312 for (b=0;b<NB_TBANDS;b++) |
313 { | 313 { |
314 tonal->lowE[b] = 1e10; | 314 tonal->lowE[b] = 1e10; |
315 tonal->highE[b] = -1e10; | 315 tonal->highE[b] = -1e10; |
316 } | 316 } |
317 } | 317 } |
318 relativeE = 0; | 318 relativeE = 0; |
319 frame_loudness = 0; | 319 frame_loudness = 0; |
320 bandwidth_mask = 0; | |
321 for (b=0;b<NB_TBANDS;b++) | 320 for (b=0;b<NB_TBANDS;b++) |
322 { | 321 { |
323 float E=0, tE=0, nE=0; | 322 float E=0, tE=0, nE=0; |
324 float L1, L2; | 323 float L1, L2; |
325 float stationarity; | 324 float stationarity; |
326 for (i=tbands[b];i<tbands[b+1];i++) | 325 for (i=tbands[b];i<tbands[b+1];i++) |
327 { | 326 { |
328 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r | 327 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
329 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; | 328 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
330 #ifdef FIXED_POINT | 329 #ifdef FIXED_POINT |
331 /* FIXME: It's probably best to change the BFCC filter initial state i
nstead */ | 330 /* FIXME: It's probably best to change the BFCC filter initial state i
nstead */ |
332 binE *= 5.55e-17f; | 331 binE *= 5.55e-17f; |
333 #endif | 332 #endif |
334 E += binE; | 333 E += binE; |
335 tE += binE*tonality[i]; | 334 tE += binE*tonality[i]; |
336 nE += binE*2.f*(.5f-noisiness[i]); | 335 nE += binE*2.f*(.5f-noisiness[i]); |
337 } | 336 } |
338 tonal->E[tonal->E_count][b] = E; | 337 tonal->E[tonal->E_count][b] = E; |
339 frame_noisiness += nE/(1e-15f+E); | 338 frame_noisiness += nE/(1e-15f+E); |
340 | 339 |
341 frame_loudness += sqrt(E+1e-10f); | 340 frame_loudness += (float)sqrt(E+1e-10f); |
342 logE[b] = (float)log(E+1e-10f); | 341 logE[b] = (float)log(E+1e-10f); |
343 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); | 342 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); |
344 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); | 343 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); |
345 if (tonal->highE[b] < tonal->lowE[b]+1.f) | 344 if (tonal->highE[b] < tonal->lowE[b]+1.f) |
346 { | 345 { |
347 tonal->highE[b]+=.5f; | 346 tonal->highE[b]+=.5f; |
348 tonal->lowE[b]-=.5f; | 347 tonal->lowE[b]-=.5f; |
349 } | 348 } |
350 relativeE += (logE[b]-tonal->lowE[b])/(1e-15+tonal->highE[b]-tonal->lowE[
b]); | 349 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE
[b]); |
351 | 350 |
352 L1=L2=0; | 351 L1=L2=0; |
353 for (i=0;i<NB_FRAMES;i++) | 352 for (i=0;i<NB_FRAMES;i++) |
354 { | 353 { |
355 L1 += sqrt(tonal->E[i][b]); | 354 L1 += (float)sqrt(tonal->E[i][b]); |
356 L2 += tonal->E[i][b]; | 355 L2 += tonal->E[i][b]; |
357 } | 356 } |
358 | 357 |
359 stationarity = MIN16(0.99f,L1/sqrt(1e-15+NB_FRAMES*L2)); | 358 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); |
360 stationarity *= stationarity; | 359 stationarity *= stationarity; |
361 stationarity *= stationarity; | 360 stationarity *= stationarity; |
362 frame_stationarity += stationarity; | 361 frame_stationarity += stationarity; |
363 /*band_tonality[b] = tE/(1e-15+E)*/; | 362 /*band_tonality[b] = tE/(1e-15+E)*/; |
364 band_tonality[b] = MAX16(tE/(1e-15+E), stationarity*tonal->prev_band_tona
lity[b]); | 363 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_ton
ality[b]); |
365 #if 0 | 364 #if 0 |
366 if (b>=NB_TONAL_SKIP_BANDS) | 365 if (b>=NB_TONAL_SKIP_BANDS) |
367 { | 366 { |
368 frame_tonality += tweight[b]*band_tonality[b]; | 367 frame_tonality += tweight[b]*band_tonality[b]; |
369 tw_sum += tweight[b]; | 368 tw_sum += tweight[b]; |
370 } | 369 } |
371 #else | 370 #else |
372 frame_tonality += band_tonality[b]; | 371 frame_tonality += band_tonality[b]; |
373 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) | 372 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) |
374 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; | 373 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
469 } | 468 } |
470 | 469 |
471 for (i=0;i<8;i++) | 470 for (i=0;i<8;i++) |
472 { | 471 { |
473 tonal->mem[i+24] = tonal->mem[i+16]; | 472 tonal->mem[i+24] = tonal->mem[i+16]; |
474 tonal->mem[i+16] = tonal->mem[i+8]; | 473 tonal->mem[i+16] = tonal->mem[i+8]; |
475 tonal->mem[i+8] = tonal->mem[i]; | 474 tonal->mem[i+8] = tonal->mem[i]; |
476 tonal->mem[i] = BFCC[i]; | 475 tonal->mem[i] = BFCC[i]; |
477 } | 476 } |
478 for (i=0;i<9;i++) | 477 for (i=0;i<9;i++) |
479 features[11+i] = sqrt(tonal->std[i]); | 478 features[11+i] = (float)sqrt(tonal->std[i]); |
480 features[20] = info->tonality; | 479 features[20] = info->tonality; |
481 features[21] = info->activity; | 480 features[21] = info->activity; |
482 features[22] = frame_stationarity; | 481 features[22] = frame_stationarity; |
483 features[23] = info->tonality_slope; | 482 features[23] = info->tonality_slope; |
484 features[24] = tonal->lowECount; | 483 features[24] = tonal->lowECount; |
485 | 484 |
486 #ifndef DISABLE_FLOAT_API | 485 #ifndef DISABLE_FLOAT_API |
487 mlp_process(&net, features, frame_probs); | 486 mlp_process(&net, features, frame_probs); |
488 frame_probs[0] = .5f*(frame_probs[0]+1); | 487 frame_probs[0] = .5f*(frame_probs[0]+1); |
489 /* Curve fitting between the MLP probability and the actual probability */ | 488 /* Curve fitting between the MLP probability and the actual probability */ |
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
590 adapt = 1.f/(++tonal->speech_confidence_count); | 589 adapt = 1.f/(++tonal->speech_confidence_count); |
591 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun
t, 500); | 590 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun
t, 500); |
592 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s
peech_confidence); | 591 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s
peech_confidence); |
593 } | 592 } |
594 } else { | 593 } else { |
595 if (tonal->music_confidence_count==0) | 594 if (tonal->music_confidence_count==0) |
596 tonal->music_confidence = .9f; | 595 tonal->music_confidence = .9f; |
597 if (tonal->speech_confidence_count==0) | 596 if (tonal->speech_confidence_count==0) |
598 tonal->speech_confidence = .1f; | 597 tonal->speech_confidence = .1f; |
599 } | 598 } |
600 psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psu
m)); | |
601 } | 599 } |
602 if (tonal->last_music != (tonal->music_prob>.5f)) | 600 if (tonal->last_music != (tonal->music_prob>.5f)) |
603 tonal->last_transition=0; | 601 tonal->last_transition=0; |
604 tonal->last_music = tonal->music_prob>.5f; | 602 tonal->last_music = tonal->music_prob>.5f; |
605 #else | 603 #else |
606 info->music_prob = 0; | 604 info->music_prob = 0; |
607 #endif | 605 #endif |
608 /*for (i=0;i<25;i++) | 606 /*for (i=0;i<25;i++) |
609 printf("%f ", features[i]); | 607 printf("%f ", features[i]); |
610 printf("\n");*/ | 608 printf("\n");*/ |
(...skipping 27 matching lines...) Expand all Loading... |
638 pcm_len -= 480; | 636 pcm_len -= 480; |
639 } while (pcm_len>0); | 637 } while (pcm_len>0); |
640 analysis->analysis_offset = analysis_frame_size; | 638 analysis->analysis_offset = analysis_frame_size; |
641 | 639 |
642 analysis->analysis_offset -= frame_size; | 640 analysis->analysis_offset -= frame_size; |
643 } | 641 } |
644 | 642 |
645 analysis_info->valid = 0; | 643 analysis_info->valid = 0; |
646 tonality_get_info(analysis, analysis_info, frame_size); | 644 tonality_get_info(analysis, analysis_info, frame_size); |
647 } | 645 } |
OLD | NEW |