Index: third_party/opus/src/src/analysis.c |
diff --git a/third_party/opus/src/src/analysis.c b/third_party/opus/src/src/analysis.c |
index 663431a436a1c81a9ba73ff2a6f2d7fffaebb158..f4160e4b4ed5c114cf7d610dbd97a1569060a077 100644 |
--- a/third_party/opus/src/src/analysis.c |
+++ b/third_party/opus/src/src/analysis.c |
@@ -29,20 +29,27 @@ |
#include "config.h" |
#endif |
+#define ANALYSIS_C |
+ |
+#include <stdio.h> |
+ |
+#include "mathops.h" |
#include "kiss_fft.h" |
#include "celt.h" |
#include "modes.h" |
#include "arch.h" |
#include "quant_bands.h" |
-#include <stdio.h> |
#include "analysis.h" |
#include "mlp.h" |
#include "stack_alloc.h" |
+#include "float_cast.h" |
#ifndef M_PI |
#define M_PI 3.141592653 |
#endif |
+#ifndef DISABLE_FLOAT_API |
+ |
static const float dct_table[128] = { |
0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, |
0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, |
@@ -96,52 +103,118 @@ static const float analysis_window[240] = { |
}; |
static const int tbands[NB_TBANDS+1] = { |
- 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120 |
+ 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240 |
}; |
-static const int extra_bands[NB_TOT_BANDS+1] = { |
- 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200 |
-}; |
- |
-/*static const float tweight[NB_TBANDS+1] = { |
- .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 |
-};*/ |
- |
#define NB_TONAL_SKIP_BANDS 9 |
-#define cA 0.43157974f |
-#define cB 0.67848403f |
-#define cC 0.08595542f |
-#define cE ((float)M_PI/2) |
-static OPUS_INLINE float fast_atan2f(float y, float x) { |
- float x2, y2; |
- /* Should avoid underflow on the values we'll get */ |
- if (ABS16(x)+ABS16(y)<1e-9f) |
+static opus_val32 silk_resampler_down2_hp( |
+ opus_val32 *S, /* I/O State vector [ 3 ]: S[0], S[1] and S[2] are all read and updated */ |
+ opus_val32 *out, /* O Output signal [ floor(inLen/2) ] */ |
+ const opus_val32 *in, /* I Input signal [ inLen ] */ |
+ int inLen /* I Number of input samples */ |
+) |
+{ |
+ int k, len2 = inLen/2; |
+ opus_val32 in32, out32, out32_hp, Y, X; |
+ opus_val64 hp_ener = 0; |
+ /* 2:1 decimator built from all-pass sections; also returns the accumulated energy of the out32_hp branch. Samples and state are used as passed in (no Q10 conversion is done here). */ |
+ for( k = 0; k < len2; k++ ) { |
+ /* Even input sample, used as-is */ |
+ in32 = in[ 2 * k ]; |
+ |
+ /* All-pass section for even input sample */ |
+ Y = SUB32( in32, S[ 0 ] ); |
+ X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y); |
+ out32 = ADD32( S[ 0 ], X ); |
+ S[ 0 ] = ADD32( in32, X ); |
+ out32_hp = out32; |
+ /* Odd input sample, used as-is */ |
+ in32 = in[ 2 * k + 1 ]; |
+ |
+ /* All-pass section for odd input sample, and add to output of previous section */ |
+ Y = SUB32( in32, S[ 1 ] ); |
+ X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); |
+ out32 = ADD32( out32, S[ 1 ] ); |
+ out32 = ADD32( out32, X ); |
+ S[ 1 ] = ADD32( in32, X ); |
+ /* Same odd-sample section applied to the negated sample with its own state S[2]; the result is added to out32_hp only */ |
+ Y = SUB32( -in32, S[ 2 ] ); |
+ X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); |
+ out32_hp = ADD32( out32_hp, S[ 2 ] ); |
+ out32_hp = ADD32( out32_hp, X ); |
+ S[ 2 ] = ADD32( -in32, X ); |
+ |
+ hp_ener += out32_hp*(opus_val64)out32_hp; |
+ /* Store HALF32() of the summed all-pass outputs to the output (no int16 conversion happens here) */ |
+ out[ k ] = HALF32(out32); |
+ } |
+#ifdef FIXED_POINT |
+ /* len2 can be up to 480, so we shift by 8 more to make it fit. */ |
+ hp_ener = hp_ener >> (2*SIG_SHIFT + 8); |
+#endif |
+ return (opus_val32)hp_ener; |
+} |
+ |
+static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs) |
+{ |
+ VARDECL(opus_val32, tmp); |
+ opus_val32 scale; |
+ int j; |
+ opus_val32 ret = 0; |
+ SAVE_STACK; |
+ |
+ if (subframe==0) return 0; |
+ if (Fs == 48000) |
{ |
- x*=1e12f; |
- y*=1e12f; |
+ subframe *= 2; |
+ offset *= 2; |
+ } else if (Fs == 16000) { |
+ subframe = subframe*2/3; |
+ offset = offset*2/3; |
} |
- x2 = x*x; |
- y2 = y*y; |
- if(x2<y2){ |
- float den = (y2 + cB*x2) * (y2 + cC*x2); |
- if (den!=0) |
- return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE); |
- else |
- return (y<0 ? -cE : cE); |
- }else{ |
- float den = (x2 + cB*y2) * (x2 + cC*y2); |
- if (den!=0) |
- return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); |
- else |
- return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); |
+ ALLOC(tmp, subframe, opus_val32); |
+ |
+ downmix(_x, tmp, subframe, offset, c1, c2, C); |
+#ifdef FIXED_POINT |
+ scale = (1<<SIG_SHIFT); |
+#else |
+ scale = 1.f/32768; |
+#endif |
+ if (c2==-2) |
+ scale /= C; |
+ else if (c2>-1) |
+ scale /= 2; |
+ for (j=0;j<subframe;j++) |
+ tmp[j] *= scale; |
+ if (Fs == 48000) |
+ { |
+ ret = silk_resampler_down2_hp(S, y, tmp, subframe); |
+ } else if (Fs == 24000) { |
+ OPUS_COPY(y, tmp, subframe); |
+ } else if (Fs == 16000) { |
+ VARDECL(opus_val32, tmp3x); |
+ ALLOC(tmp3x, 3*subframe, opus_val32); |
+ /* Don't do this at home! This resampler is horrible and it's only (barely) |
+ usable for the purpose of the analysis because we don't care about all |
+ the aliasing between 8 kHz and 12 kHz. */ |
+ for (j=0;j<subframe;j++) |
+ { |
+ tmp3x[3*j] = tmp[j]; |
+ tmp3x[3*j+1] = tmp[j]; |
+ tmp3x[3*j+2] = tmp[j]; |
+ } |
+ silk_resampler_down2_hp(S, y, tmp3x, 3*subframe); |
} |
+ RESTORE_STACK; |
+ return ret; |
} |
-void tonality_analysis_init(TonalityAnalysisState *tonal) |
+void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs) |
{ |
/* Initialize reusable fields. */ |
tonal->arch = opus_select_arch(); |
+ tonal->Fs = Fs; |
/* Clear remaining fields. */ |
tonality_analysis_reset(tonal); |
} |
@@ -151,6 +224,8 @@ void tonality_analysis_reset(TonalityAnalysisState *tonal) |
/* Clear non-reusable fields. */ |
char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; |
OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); |
+ tonal->music_confidence = .9f; |
+ tonal->speech_confidence = .1f; |
} |
void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) |
@@ -158,6 +233,9 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int |
int pos; |
int curr_lookahead; |
float psum; |
+ float tonality_max; |
+ float tonality_avg; |
+ int tonality_count; |
int i; |
pos = tonal->read_pos; |
@@ -165,7 +243,8 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int |
if (curr_lookahead<0) |
curr_lookahead += DETECT_SIZE; |
- if (len > 480 && pos != tonal->write_pos) |
+ /* On long frames, look at the second analysis window rather than the first. */ |
+ if (len > tonal->Fs/50 && pos != tonal->write_pos) |
{ |
pos++; |
if (pos==DETECT_SIZE) |
@@ -176,18 +255,32 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int |
if (pos<0) |
pos = DETECT_SIZE-1; |
OPUS_COPY(info_out, &tonal->info[pos], 1); |
- tonal->read_subframe += len/120; |
- while (tonal->read_subframe>=4) |
+ tonality_max = tonality_avg = info_out->tonality; |
+ tonality_count = 1; |
+ /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */ |
+ for (i=0;i<3;i++) |
{ |
- tonal->read_subframe -= 4; |
+ pos++; |
+ if (pos==DETECT_SIZE) |
+ pos = 0; |
+ if (pos == tonal->write_pos) |
+ break; |
+ tonality_max = MAX32(tonality_max, tonal->info[pos].tonality); |
+ tonality_avg += tonal->info[pos].tonality; |
+ tonality_count++; |
+ } |
+ info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f); |
+ tonal->read_subframe += len/(tonal->Fs/400); |
+ while (tonal->read_subframe>=8) |
+ { |
+ tonal->read_subframe -= 8; |
tonal->read_pos++; |
} |
if (tonal->read_pos>=DETECT_SIZE) |
tonal->read_pos-=DETECT_SIZE; |
- /* Compensate for the delay in the features themselves. |
- FIXME: Need a better estimate the 10 I just made up */ |
- curr_lookahead = IMAX(curr_lookahead-10, 0); |
+ /* The -1 is to compensate for the delay in the features themselves. */ |
+ curr_lookahead = IMAX(curr_lookahead-1, 0); |
psum=0; |
/* Summing the probability of transition patterns that involve music at |
@@ -197,11 +290,28 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int |
for (;i<DETECT_SIZE;i++) |
psum += tonal->pspeech[i]; |
psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; |
- /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ |
+ /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/ |
info_out->music_prob = psum; |
} |
+static const float std_feature_bias[9] = { |
+ 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f, |
+ 2.163313f, 1.260756f, 1.116868f, 1.918795f |
+}; |
+ |
+#define LEAKAGE_OFFSET 2.5f |
+#define LEAKAGE_SLOPE 2.f |
+ |
+#ifdef FIXED_POINT |
+/* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to |
+ compensate for that in the energy. */ |
+#define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT))) |
+#define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e)) |
+#else |
+#define SCALE_ENER(e) (e) |
+#endif |
+ |
static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix) |
{ |
int i, b; |
@@ -235,19 +345,41 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
float noise_floor; |
int remaining; |
AnalysisInfo *info; |
+ float hp_ener; |
+ float tonality2[240]; |
+ float midE[8]; |
+ float spec_variability=0; |
+ float band_log2[NB_TBANDS+1]; |
+ float leakage_from[NB_TBANDS+1]; |
+ float leakage_to[NB_TBANDS+1]; |
SAVE_STACK; |
- tonal->last_transition++; |
- alpha = 1.f/IMIN(20, 1+tonal->count); |
- alphaE = 1.f/IMIN(50, 1+tonal->count); |
- alphaE2 = 1.f/IMIN(1000, 1+tonal->count); |
+ alpha = 1.f/IMIN(10, 1+tonal->count); |
+ alphaE = 1.f/IMIN(25, 1+tonal->count); |
+ alphaE2 = 1.f/IMIN(500, 1+tonal->count); |
+ |
+ if (tonal->Fs == 48000) |
+ { |
+ /* len and offset are now at 24 kHz. */ |
+ len/= 2; |
+ offset /= 2; |
+ } else if (tonal->Fs == 16000) { |
+ len = 3*len/2; |
+ offset = 3*offset/2; |
+ } |
- if (tonal->count<4) |
- tonal->music_prob = .5; |
+ if (tonal->count<4) { |
+ if (tonal->application == OPUS_APPLICATION_VOIP) |
+ tonal->music_prob = .1f; |
+ else |
+ tonal->music_prob = .625f; |
+ } |
kfft = celt_mode->mdct.kfft[0]; |
if (tonal->count==0) |
tonal->mem_fill = 240; |
- downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C); |
+ tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x, |
+ &tonal->inmem[tonal->mem_fill], tonal->downmix_state, |
+ IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs); |
if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) |
{ |
tonal->mem_fill += len; |
@@ -255,6 +387,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
RESTORE_STACK; |
return; |
} |
+ hp_ener = tonal->hp_ener_accum; |
info = &tonal->info[tonal->write_pos++]; |
if (tonal->write_pos>=DETECT_SIZE) |
tonal->write_pos-=DETECT_SIZE; |
@@ -273,7 +406,9 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
} |
OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); |
remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); |
- downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C); |
+ tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x, |
+ &tonal->inmem[240], tonal->downmix_state, remaining, |
+ offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs); |
tonal->mem_fill = 240 + remaining; |
opus_fft(kfft, in, out, tonal->arch); |
#ifndef FIXED_POINT |
@@ -305,24 +440,31 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
d_angle2 = angle2 - angle; |
d2_angle2 = d_angle2 - d_angle; |
- mod1 = d2_angle - (float)floor(.5+d2_angle); |
+ mod1 = d2_angle - (float)float2int(d2_angle); |
noisiness[i] = ABS16(mod1); |
mod1 *= mod1; |
mod1 *= mod1; |
- mod2 = d2_angle2 - (float)floor(.5+d2_angle2); |
+ mod2 = d2_angle2 - (float)float2int(d2_angle2); |
noisiness[i] += ABS16(mod2); |
mod2 *= mod2; |
mod2 *= mod2; |
- avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); |
+ avg_mod = .25f*(d2A[i]+mod1+2*mod2); |
+ /* This introduces an extra delay of 2 frames in the detection. */ |
tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; |
+ /* No delay on this detection, but it's less reliable. */ |
+ tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f; |
A[i] = angle2; |
dA[i] = d_angle2; |
d2A[i] = mod2; |
} |
- |
+ for (i=2;i<N2-1;i++) |
+ { |
+ float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1])); |
+ tonality[i] = .9f*MAX32(tonality[i], tt-.1f); |
+ } |
frame_tonality = 0; |
max_frame_tonality = 0; |
/*tw_sum = 0;*/ |
@@ -339,6 +481,22 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
} |
relativeE = 0; |
frame_loudness = 0; |
+ /* The energy of the very first band is special because of DC. */ |
+ { |
+ float E = 0; |
+ float X1r, X2r; |
+ X1r = 2*(float)out[0].r; |
+ X2r = 2*(float)out[0].i; |
+ E = X1r*X1r + X2r*X2r; |
+ for (i=1;i<4;i++) |
+ { |
+ float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
+ + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
+ E += binE; |
+ } |
+ E = SCALE_ENER(E); |
+ band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f); |
+ } |
for (b=0;b<NB_TBANDS;b++) |
{ |
float E=0, tE=0, nE=0; |
@@ -348,12 +506,9 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
{ |
float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
+ out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
-#ifdef FIXED_POINT |
- /* FIXME: It's probably best to change the BFCC filter initial state instead */ |
- binE *= 5.55e-17f; |
-#endif |
+ binE = SCALE_ENER(binE); |
E += binE; |
- tE += binE*tonality[i]; |
+ tE += binE*MAX32(0, tonality[i]); |
nE += binE*2.f*(.5f-noisiness[i]); |
} |
#ifndef FIXED_POINT |
@@ -371,14 +526,27 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
frame_loudness += (float)sqrt(E+1e-10f); |
logE[b] = (float)log(E+1e-10f); |
- tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); |
- tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); |
- if (tonal->highE[b] < tonal->lowE[b]+1.f) |
+ band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f); |
+ tonal->logE[tonal->E_count][b] = logE[b]; |
+ if (tonal->count==0) |
+ tonal->highE[b] = tonal->lowE[b] = logE[b]; |
+ if (tonal->highE[b] > tonal->lowE[b] + 7.5) |
+ { |
+ if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b]) |
+ tonal->highE[b] -= .01f; |
+ else |
+ tonal->lowE[b] += .01f; |
+ } |
+ if (logE[b] > tonal->highE[b]) |
{ |
- tonal->highE[b]+=.5f; |
- tonal->lowE[b]-=.5f; |
+ tonal->highE[b] = logE[b]; |
+ tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]); |
+ } else if (logE[b] < tonal->lowE[b]) |
+ { |
+ tonal->lowE[b] = logE[b]; |
+ tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]); |
} |
- relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]); |
+ relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b])); |
L1=L2=0; |
for (i=0;i<NB_FRAMES;i++) |
@@ -410,27 +578,74 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
tonal->prev_band_tonality[b] = band_tonality[b]; |
} |
+ leakage_from[0] = band_log2[0]; |
+ leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET; |
+ for (b=1;b<NB_TBANDS+1;b++) |
+ { |
+ float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4; |
+ leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]); |
+ leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET); |
+ } |
+ for (b=NB_TBANDS-2;b>=0;b--) |
+ { |
+ float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4; |
+ leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]); |
+ leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]); |
+ } |
+ celt_assert(NB_TBANDS+1 <= LEAK_BANDS); |
+ for (b=0;b<NB_TBANDS+1;b++) |
+ { |
+ /* leak_boost[] is made up of two terms. The first, based on leakage_to[], |
+ represents the boost needed to overcome the amount of analysis leakage |
+ caused in a weaker band b by louder neighbouring bands. |
+ The second, based on leakage_from[], applies to a loud band b for |
+ which the quantization noise causes synthesis leakage to the weaker |
+ neighbouring bands. */ |
+ float boost = MAX16(0, leakage_to[b] - band_log2[b]) + |
+ MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET)); |
+ info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost)); |
+ } |
+ for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0; |
+ |
+ for (i=0;i<NB_FRAMES;i++) |
+ { |
+ int j; |
+ float mindist = 1e15f; |
+ for (j=0;j<NB_FRAMES;j++) |
+ { |
+ int k; |
+ float dist=0; |
+ for (k=0;k<NB_TBANDS;k++) |
+ { |
+ float tmp; |
+ tmp = tonal->logE[i][k] - tonal->logE[j][k]; |
+ dist += tmp*tmp; |
+ } |
+ if (j!=i) |
+ mindist = MIN32(mindist, dist); |
+ } |
+ spec_variability += mindist; |
+ } |
+ spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS); |
bandwidth_mask = 0; |
bandwidth = 0; |
maxE = 0; |
noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); |
-#ifdef FIXED_POINT |
- noise_floor *= 1<<(15+SIG_SHIFT); |
-#endif |
noise_floor *= noise_floor; |
- for (b=0;b<NB_TOT_BANDS;b++) |
+ for (b=0;b<NB_TBANDS;b++) |
{ |
float E=0; |
int band_start, band_end; |
/* Keep a margin of 300 Hz for aliasing */ |
- band_start = extra_bands[b]; |
- band_end = extra_bands[b+1]; |
+ band_start = tbands[b]; |
+ band_end = tbands[b+1]; |
for (i=band_start;i<band_end;i++) |
{ |
float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
+ out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
E += binE; |
} |
+ E = SCALE_ENER(E); |
maxE = MAX32(maxE, E); |
tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); |
E = MAX32(E, tonal->meanE[b]); |
@@ -441,14 +656,36 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
2) less than 90 dB below the peak band (maximal masking possible considering |
both the ATH and the loudness-dependent slope of the spreading function) |
3) above the PCM quantization noise floor |
+ We use b+1 because the first CELT band isn't included in tbands[] |
*/ |
if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start)) |
- bandwidth = b; |
+ bandwidth = b+1; |
+ } |
+ /* Special case for the last two bands, for which we don't have spectrum but only |
+ the energy above 12 kHz. */ |
+ if (tonal->Fs == 48000) { |
+ float ratio; |
+ float E = hp_ener*(1.f/(240*240)); |
+ ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f; |
+#ifdef FIXED_POINT |
+ /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */ |
+ E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE); |
+#endif |
+ maxE = MAX32(maxE, E); |
+ tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); |
+ E = MAX32(E, tonal->meanE[b]); |
+ /* Use a simple follower with 13 dB/Bark slope for spreading function */ |
+ bandwidth_mask = MAX32(.05f*bandwidth_mask, E); |
+ if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160) |
+ bandwidth = 20; |
+ /* This detector is unreliable, so if the bandwidth is close to SWB, assume it's FB. */ |
+ if (bandwidth >= 17) |
+ bandwidth = 20; |
} |
if (tonal->count<=2) |
bandwidth = 20; |
frame_loudness = 20*(float)log10(frame_loudness); |
- tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); |
+ tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness); |
tonal->lowECount *= (1-alphaE); |
if (frame_loudness < tonal->Etracker-30) |
tonal->lowECount += alphaE; |
@@ -460,11 +697,18 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
sum += dct_table[i*16+b]*logE[b]; |
BFCC[i] = sum; |
} |
+ for (i=0;i<8;i++) |
+ { |
+ float sum=0; |
+ for (b=0;b<16;b++) |
+ sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]); |
+ midE[i] = sum; |
+ } |
frame_stationarity /= NB_TBANDS; |
relativeE /= NB_TBANDS; |
if (tonal->count<10) |
- relativeE = .5; |
+ relativeE = .5f; |
frame_noisiness /= NB_TBANDS; |
#if 1 |
info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; |
@@ -479,7 +723,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
info->tonality_slope = slope; |
tonal->E_count = (tonal->E_count+1)%NB_FRAMES; |
- tonal->count++; |
+ tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX); |
info->tonality = frame_tonality; |
for (i=0;i<4;i++) |
@@ -498,6 +742,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
for (i=0;i<9;i++) |
tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; |
} |
+ for (i=0;i<4;i++) |
+ features[i] = BFCC[i]-midE[i]; |
for (i=0;i<8;i++) |
{ |
@@ -507,24 +753,26 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
tonal->mem[i] = BFCC[i]; |
} |
for (i=0;i<9;i++) |
- features[11+i] = (float)sqrt(tonal->std[i]); |
- features[20] = info->tonality; |
- features[21] = info->activity; |
- features[22] = frame_stationarity; |
- features[23] = info->tonality_slope; |
- features[24] = tonal->lowECount; |
+ features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i]; |
+ features[18] = spec_variability - 0.78f; |
+ features[20] = info->tonality - 0.154723f; |
+ features[21] = info->activity - 0.724643f; |
+ features[22] = frame_stationarity - 0.743717f; |
+ features[23] = info->tonality_slope + 0.069216f; |
+ features[24] = tonal->lowECount - 0.067930f; |
-#ifndef DISABLE_FLOAT_API |
mlp_process(&net, features, frame_probs); |
frame_probs[0] = .5f*(frame_probs[0]+1); |
/* Curve fitting between the MLP probability and the actual probability */ |
- frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10); |
+ /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/ |
/* Probability of active audio (as opposed to silence) */ |
frame_probs[1] = .5f*frame_probs[1]+.5f; |
- /* Consider that silence has a 50-50 probability. */ |
- frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f; |
+ frame_probs[1] *= frame_probs[1]; |
+ |
+ /* Probability of speech or music vs noise */ |
+ info->activity_probability = frame_probs[1]; |
- /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ |
+ /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/ |
{ |
/* Probability of state transition */ |
float tau; |
@@ -542,12 +790,32 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
float music0; |
float p, q; |
+ /* More silence transitions for speech than for music. */ |
+ tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob); |
+ p = MAX16(.05f,MIN16(.95f,frame_probs[1])); |
+ q = MAX16(.05f,MIN16(.95f,tonal->vad_prob)); |
+ beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); |
+ /* p0 and p1 are the probabilities of silence and active audio at this frame |
+ using only information from previous frame and applying the |
+ state transition model */ |
+ p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau; |
+ p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau; |
+ /* We apply the current probability with exponent beta to work around |
+ the fact that the probability estimates aren't independent. */ |
+ p0 *= (float)pow(1-frame_probs[1], beta); |
+ p1 *= (float)pow(frame_probs[1], beta); |
+ /* Normalise the probabilities to get the Markov probability of active audio. */ |
+ tonal->vad_prob = p1/(p0+p1); |
+ info->vad_prob = tonal->vad_prob; |
+ /* Consider that silence has a 50-50 probability of being speech or music. */ |
+ frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f; |
+ |
/* One transition every 3 minutes of active audio */ |
- tau = .00005f*frame_probs[1]; |
+ tau = .0001f; |
/* Adapt beta based on how "unexpected" the new prob is */ |
p = MAX16(.05f,MIN16(.95f,frame_probs[0])); |
q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); |
- beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); |
+ beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); |
/* p0 and p1 are the probabilities of speech and music at this frame |
using only information from previous frame and applying the |
state transition model */ |
@@ -561,6 +829,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
tonal->music_prob = p1/(p0+p1); |
info->music_prob = tonal->music_prob; |
+ /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/ |
/* This chunk of code deals with delayed decision. */ |
psum=1e-20f; |
/* Instantaneous probability of speech and music, with beta pre-applied. */ |
@@ -568,8 +837,11 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
music0 = (float)pow(frame_probs[0], beta); |
if (tonal->count==1) |
{ |
- tonal->pspeech[0]=.5; |
- tonal->pmusic [0]=.5; |
+ if (tonal->application == OPUS_APPLICATION_VOIP) |
+ tonal->pmusic[0] = .1f; |
+ else |
+ tonal->pmusic[0] = .625f; |
+ tonal->pspeech[0] = 1-tonal->pmusic[0]; |
} |
/* Updated probability of having only speech (s0) or only music (m0), |
before considering the new observation. */ |
@@ -619,24 +891,17 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt |
tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); |
tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); |
} |
- } else { |
- if (tonal->music_confidence_count==0) |
- tonal->music_confidence = .9f; |
- if (tonal->speech_confidence_count==0) |
- tonal->speech_confidence = .1f; |
} |
} |
- if (tonal->last_music != (tonal->music_prob>.5f)) |
- tonal->last_transition=0; |
tonal->last_music = tonal->music_prob>.5f; |
-#else |
- info->music_prob = 0; |
-#endif |
- /*for (i=0;i<25;i++) |
+#ifdef MLP_TRAINING |
+ for (i=0;i<25;i++) |
printf("%f ", features[i]); |
- printf("\n");*/ |
+ printf("\n"); |
+#endif |
info->bandwidth = bandwidth; |
+ tonal->prev_bandwidth = bandwidth; |
/*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ |
info->noisiness = frame_noisiness; |
info->valid = 1; |
@@ -650,18 +915,19 @@ void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co |
int offset; |
int pcm_len; |
+ analysis_frame_size -= analysis_frame_size&1; |
if (analysis_pcm != NULL) |
{ |
/* Avoid overflow/wrap-around of the analysis buffer */ |
- analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); |
+ analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size); |
pcm_len = analysis_frame_size - analysis->analysis_offset; |
offset = analysis->analysis_offset; |
- do { |
- tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix); |
- offset += 480; |
- pcm_len -= 480; |
- } while (pcm_len>0); |
+ while (pcm_len>0) { |
+ tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix); |
+ offset += Fs/50; |
+ pcm_len -= Fs/50; |
+ } |
analysis->analysis_offset = analysis_frame_size; |
analysis->analysis_offset -= frame_size; |
@@ -670,3 +936,5 @@ void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co |
analysis_info->valid = 0; |
tonality_get_info(analysis, analysis_info, frame_size); |
} |
+ |
+#endif /* DISABLE_FLOAT_API */ |