OLD | NEW |
1 /* Copyright (c) 2011 Xiph.Org Foundation | 1 /* Copyright (c) 2011 Xiph.Org Foundation |
2 Written by Jean-Marc Valin */ | 2 Written by Jean-Marc Valin */ |
3 /* | 3 /* |
4 Redistribution and use in source and binary forms, with or without | 4 Redistribution and use in source and binary forms, with or without |
5 modification, are permitted provided that the following conditions | 5 modification, are permitted provided that the following conditions |
6 are met: | 6 are met: |
7 | 7 |
8 - Redistributions of source code must retain the above copyright | 8 - Redistributions of source code must retain the above copyright |
9 notice, this list of conditions and the following disclaimer. | 9 notice, this list of conditions and the following disclaimer. |
10 | 10 |
(...skipping 11 matching lines...) Expand all Loading... |
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 */ | 26 */ |
27 | 27 |
28 #ifdef HAVE_CONFIG_H | 28 #ifdef HAVE_CONFIG_H |
29 #include "config.h" | 29 #include "config.h" |
30 #endif | 30 #endif |
31 | 31 |
| 32 #define ANALYSIS_C |
| 33 |
| 34 #include <stdio.h> |
| 35 |
| 36 #include "mathops.h" |
32 #include "kiss_fft.h" | 37 #include "kiss_fft.h" |
33 #include "celt.h" | 38 #include "celt.h" |
34 #include "modes.h" | 39 #include "modes.h" |
35 #include "arch.h" | 40 #include "arch.h" |
36 #include "quant_bands.h" | 41 #include "quant_bands.h" |
37 #include <stdio.h> | |
38 #include "analysis.h" | 42 #include "analysis.h" |
39 #include "mlp.h" | 43 #include "mlp.h" |
40 #include "stack_alloc.h" | 44 #include "stack_alloc.h" |
| 45 #include "float_cast.h" |
41 | 46 |
42 #ifndef M_PI | 47 #ifndef M_PI |
43 #define M_PI 3.141592653 | 48 #define M_PI 3.141592653 |
44 #endif | 49 #endif |
45 | 50 |
| 51 #ifndef DISABLE_FLOAT_API |
| 52 |
46 static const float dct_table[128] = { | 53 static const float dct_table[128] = { |
47 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, | 54 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, |
48 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, | 55 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, |
49 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.1026
31f, 0.034654f, | 56 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.1026
31f, 0.034654f, |
50 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.3383
30f,-0.351851f, | 57 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.3383
30f,-0.351851f, |
51 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.2939
69f,-0.346760f, | 58 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.2939
69f,-0.346760f, |
52 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.2939
69f, 0.346760f, | 59 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.2939
69f, 0.346760f, |
53 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.2733
00f,-0.102631f, | 60 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.2733
00f,-0.102631f, |
54 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.2242
92f,-0.338330f, | 61 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.2242
92f,-0.338330f, |
55 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.1352
99f, 0.326641f, | 62 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.1352
99f, 0.326641f, |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
89 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627
f, 0.904508f, | 96 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627
f, 0.904508f, |
90 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703
f, 0.933013f, | 97 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703
f, 0.933013f, |
91 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072
f, 0.956773f, | 98 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072
f, 0.956773f, |
92 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465
f, 0.975528f, | 99 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465
f, 0.975528f, |
93 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671
f, 0.989074f, | 100 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671
f, 0.989074f, |
94 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534
f, 0.997261f, | 101 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534
f, 0.997261f, |
95 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957
f, 1.000000f, | 102 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957
f, 1.000000f, |
96 }; | 103 }; |
97 | 104 |
98 static const int tbands[NB_TBANDS+1] = { | 105 static const int tbands[NB_TBANDS+1] = { |
99 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 12
0 | 106 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192,
240 |
100 }; | 107 }; |
101 | 108 |
102 static const int extra_bands[NB_TOT_BANDS+1] = { | |
103 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96,
120, 160, 200 | |
104 }; | |
105 | |
106 /*static const float tweight[NB_TBANDS+1] = { | |
107 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 | |
108 };*/ | |
109 | |
110 #define NB_TONAL_SKIP_BANDS 9 | 109 #define NB_TONAL_SKIP_BANDS 9 |
111 | 110 |
112 #define cA 0.43157974f | 111 static opus_val32 silk_resampler_down2_hp( |
113 #define cB 0.67848403f | 112 opus_val32 *S, /* I/O State vector [ 2 ]
*/ |
114 #define cC 0.08595542f | 113 opus_val32 *out, /* O Output signal [ floo
r(len/2) ] */ |
115 #define cE ((float)M_PI/2) | 114 const opus_val32 *in, /* I Input signal [ len ]
*/ |
116 static OPUS_INLINE float fast_atan2f(float y, float x) { | 115 int inLen /* I Number of input samp
les */ |
117 float x2, y2; | 116 ) |
118 /* Should avoid underflow on the values we'll get */ | 117 { |
119 if (ABS16(x)+ABS16(y)<1e-9f) | 118 int k, len2 = inLen/2; |
120 { | 119 opus_val32 in32, out32, out32_hp, Y, X; |
121 x*=1e12f; | 120 opus_val64 hp_ener = 0; |
122 y*=1e12f; | 121 /* Internal variables and state are in Q10 format */ |
123 } | 122 for( k = 0; k < len2; k++ ) { |
124 x2 = x*x; | 123 /* Convert to Q10 */ |
125 y2 = y*y; | 124 in32 = in[ 2 * k ]; |
126 if(x2<y2){ | 125 |
127 float den = (y2 + cB*x2) * (y2 + cC*x2); | 126 /* All-pass section for even input sample */ |
128 if (den!=0) | 127 Y = SUB32( in32, S[ 0 ] ); |
129 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE); | 128 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y); |
130 else | 129 out32 = ADD32( S[ 0 ], X ); |
131 return (y<0 ? -cE : cE); | 130 S[ 0 ] = ADD32( in32, X ); |
132 }else{ | 131 out32_hp = out32; |
133 float den = (x2 + cB*y2) * (x2 + cC*y2); | 132 /* Convert to Q10 */ |
134 if (den!=0) | 133 in32 = in[ 2 * k + 1 ]; |
135 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); | 134 |
136 else | 135 /* All-pass section for odd input sample, and add to output of previous
section */ |
137 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); | 136 Y = SUB32( in32, S[ 1 ] ); |
138 } | 137 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); |
| 138 out32 = ADD32( out32, S[ 1 ] ); |
| 139 out32 = ADD32( out32, X ); |
| 140 S[ 1 ] = ADD32( in32, X ); |
| 141 |
| 142 Y = SUB32( -in32, S[ 2 ] ); |
| 143 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); |
| 144 out32_hp = ADD32( out32_hp, S[ 2 ] ); |
| 145 out32_hp = ADD32( out32_hp, X ); |
| 146 S[ 2 ] = ADD32( -in32, X ); |
| 147 |
| 148 hp_ener += out32_hp*(opus_val64)out32_hp; |
| 149 /* Add, convert back to int16 and store to output */ |
| 150 out[ k ] = HALF32(out32); |
| 151 } |
| 152 #ifdef FIXED_POINT |
| 153 /* len2 can be up to 480, so we shift by 8 more to make it fit. */ |
| 154 hp_ener = hp_ener >> (2*SIG_SHIFT + 8); |
| 155 #endif |
| 156 return (opus_val32)hp_ener; |
139 } | 157 } |
140 | 158 |
141 void tonality_analysis_init(TonalityAnalysisState *tonal) | 159 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opu
s_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, in
t Fs) |
| 160 { |
| 161 VARDECL(opus_val32, tmp); |
| 162 opus_val32 scale; |
| 163 int j; |
| 164 opus_val32 ret = 0; |
| 165 SAVE_STACK; |
| 166 |
| 167 if (subframe==0) return 0; |
| 168 if (Fs == 48000) |
| 169 { |
| 170 subframe *= 2; |
| 171 offset *= 2; |
| 172 } else if (Fs == 16000) { |
| 173 subframe = subframe*2/3; |
| 174 offset = offset*2/3; |
| 175 } |
| 176 ALLOC(tmp, subframe, opus_val32); |
| 177 |
| 178 downmix(_x, tmp, subframe, offset, c1, c2, C); |
| 179 #ifdef FIXED_POINT |
| 180 scale = (1<<SIG_SHIFT); |
| 181 #else |
| 182 scale = 1.f/32768; |
| 183 #endif |
| 184 if (c2==-2) |
| 185 scale /= C; |
| 186 else if (c2>-1) |
| 187 scale /= 2; |
| 188 for (j=0;j<subframe;j++) |
| 189 tmp[j] *= scale; |
| 190 if (Fs == 48000) |
| 191 { |
| 192 ret = silk_resampler_down2_hp(S, y, tmp, subframe); |
| 193 } else if (Fs == 24000) { |
| 194 OPUS_COPY(y, tmp, subframe); |
| 195 } else if (Fs == 16000) { |
| 196 VARDECL(opus_val32, tmp3x); |
| 197 ALLOC(tmp3x, 3*subframe, opus_val32); |
| 198 /* Don't do this at home! This resampler is horrible and it's only (barely
) |
| 199 usable for the purpose of the analysis because we don't care about all |
| 200 the aliasing between 8 kHz and 12 kHz. */ |
| 201 for (j=0;j<subframe;j++) |
| 202 { |
| 203 tmp3x[3*j] = tmp[j]; |
| 204 tmp3x[3*j+1] = tmp[j]; |
| 205 tmp3x[3*j+2] = tmp[j]; |
| 206 } |
| 207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe); |
| 208 } |
| 209 RESTORE_STACK; |
| 210 return ret; |
| 211 } |
| 212 |
| 213 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs) |
142 { | 214 { |
143 /* Initialize reusable fields. */ | 215 /* Initialize reusable fields. */ |
144 tonal->arch = opus_select_arch(); | 216 tonal->arch = opus_select_arch(); |
| 217 tonal->Fs = Fs; |
145 /* Clear remaining fields. */ | 218 /* Clear remaining fields. */ |
146 tonality_analysis_reset(tonal); | 219 tonality_analysis_reset(tonal); |
147 } | 220 } |
148 | 221 |
149 void tonality_analysis_reset(TonalityAnalysisState *tonal) | 222 void tonality_analysis_reset(TonalityAnalysisState *tonal) |
150 { | 223 { |
151 /* Clear non-reusable fields. */ | 224 /* Clear non-reusable fields. */ |
152 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; | 225 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; |
153 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); | 226 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); |
| 227 tonal->music_confidence = .9f; |
| 228 tonal->speech_confidence = .1f; |
154 } | 229 } |
155 | 230 |
156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
len) | 231 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
len) |
157 { | 232 { |
158 int pos; | 233 int pos; |
159 int curr_lookahead; | 234 int curr_lookahead; |
160 float psum; | 235 float psum; |
| 236 float tonality_max; |
| 237 float tonality_avg; |
| 238 int tonality_count; |
161 int i; | 239 int i; |
162 | 240 |
163 pos = tonal->read_pos; | 241 pos = tonal->read_pos; |
164 curr_lookahead = tonal->write_pos-tonal->read_pos; | 242 curr_lookahead = tonal->write_pos-tonal->read_pos; |
165 if (curr_lookahead<0) | 243 if (curr_lookahead<0) |
166 curr_lookahead += DETECT_SIZE; | 244 curr_lookahead += DETECT_SIZE; |
167 | 245 |
168 if (len > 480 && pos != tonal->write_pos) | 246 /* On long frames, look at the second analysis window rather than the first.
*/ |
| 247 if (len > tonal->Fs/50 && pos != tonal->write_pos) |
169 { | 248 { |
170 pos++; | 249 pos++; |
171 if (pos==DETECT_SIZE) | 250 if (pos==DETECT_SIZE) |
172 pos=0; | 251 pos=0; |
173 } | 252 } |
174 if (pos == tonal->write_pos) | 253 if (pos == tonal->write_pos) |
175 pos--; | 254 pos--; |
176 if (pos<0) | 255 if (pos<0) |
177 pos = DETECT_SIZE-1; | 256 pos = DETECT_SIZE-1; |
178 OPUS_COPY(info_out, &tonal->info[pos], 1); | 257 OPUS_COPY(info_out, &tonal->info[pos], 1); |
179 tonal->read_subframe += len/120; | 258 tonality_max = tonality_avg = info_out->tonality; |
180 while (tonal->read_subframe>=4) | 259 tonality_count = 1; |
| 260 /* If possible, look ahead for a tone to compensate for the delay in the tone
detector. */ |
| 261 for (i=0;i<3;i++) |
181 { | 262 { |
182 tonal->read_subframe -= 4; | 263 pos++; |
| 264 if (pos==DETECT_SIZE) |
| 265 pos = 0; |
| 266 if (pos == tonal->write_pos) |
| 267 break; |
| 268 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality); |
| 269 tonality_avg += tonal->info[pos].tonality; |
| 270 tonality_count++; |
| 271 } |
| 272 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f); |
| 273 tonal->read_subframe += len/(tonal->Fs/400); |
| 274 while (tonal->read_subframe>=8) |
| 275 { |
| 276 tonal->read_subframe -= 8; |
183 tonal->read_pos++; | 277 tonal->read_pos++; |
184 } | 278 } |
185 if (tonal->read_pos>=DETECT_SIZE) | 279 if (tonal->read_pos>=DETECT_SIZE) |
186 tonal->read_pos-=DETECT_SIZE; | 280 tonal->read_pos-=DETECT_SIZE; |
187 | 281 |
188 /* Compensate for the delay in the features themselves. | 282 /* The -1 is to compensate for the delay in the features themselves. */ |
189 FIXME: Need a better estimate the 10 I just made up */ | 283 curr_lookahead = IMAX(curr_lookahead-1, 0); |
190 curr_lookahead = IMAX(curr_lookahead-10, 0); | |
191 | 284 |
192 psum=0; | 285 psum=0; |
193 /* Summing the probability of transition patterns that involve music at | 286 /* Summing the probability of transition patterns that involve music at |
194 time (DETECT_SIZE-curr_lookahead-1) */ | 287 time (DETECT_SIZE-curr_lookahead-1) */ |
195 for (i=0;i<DETECT_SIZE-curr_lookahead;i++) | 288 for (i=0;i<DETECT_SIZE-curr_lookahead;i++) |
196 psum += tonal->pmusic[i]; | 289 psum += tonal->pmusic[i]; |
197 for (;i<DETECT_SIZE;i++) | 290 for (;i<DETECT_SIZE;i++) |
198 psum += tonal->pspeech[i]; | 291 psum += tonal->pspeech[i]; |
199 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; | 292 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; |
200 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ | 293 /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob,
info_out->activity_probability, info_out->tonality);*/ |
201 | 294 |
202 info_out->music_prob = psum; | 295 info_out->music_prob = psum; |
203 } | 296 } |
204 | 297 |
| 298 static const float std_feature_bias[9] = { |
| 299 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f, |
| 300 2.163313f, 1.260756f, 1.116868f, 1.918795f |
| 301 }; |
| 302 |
| 303 #define LEAKAGE_OFFSET 2.5f |
| 304 #define LEAKAGE_SLOPE 2.f |
| 305 |
| 306 #ifdef FIXED_POINT |
| 307 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to |
| 308 compensate for that in the energy. */ |
| 309 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT))) |
| 310 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e)) |
| 311 #else |
| 312 #define SCALE_ENER(e) (e) |
| 313 #endif |
| 314 |
205 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth,
downmix_func downmix) | 315 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth,
downmix_func downmix) |
206 { | 316 { |
207 int i, b; | 317 int i, b; |
208 const kiss_fft_state *kfft; | 318 const kiss_fft_state *kfft; |
209 VARDECL(kiss_fft_cpx, in); | 319 VARDECL(kiss_fft_cpx, in); |
210 VARDECL(kiss_fft_cpx, out); | 320 VARDECL(kiss_fft_cpx, out); |
211 int N = 480, N2=240; | 321 int N = 480, N2=240; |
212 float * OPUS_RESTRICT A = tonal->angle; | 322 float * OPUS_RESTRICT A = tonal->angle; |
213 float * OPUS_RESTRICT dA = tonal->d_angle; | 323 float * OPUS_RESTRICT dA = tonal->d_angle; |
214 float * OPUS_RESTRICT d2A = tonal->d2_angle; | 324 float * OPUS_RESTRICT d2A = tonal->d2_angle; |
(...skipping 13 matching lines...) Expand all Loading... |
228 float relativeE; | 338 float relativeE; |
229 float frame_probs[2]; | 339 float frame_probs[2]; |
230 float alpha, alphaE, alphaE2; | 340 float alpha, alphaE, alphaE2; |
231 float frame_loudness; | 341 float frame_loudness; |
232 float bandwidth_mask; | 342 float bandwidth_mask; |
233 int bandwidth=0; | 343 int bandwidth=0; |
234 float maxE = 0; | 344 float maxE = 0; |
235 float noise_floor; | 345 float noise_floor; |
236 int remaining; | 346 int remaining; |
237 AnalysisInfo *info; | 347 AnalysisInfo *info; |
| 348 float hp_ener; |
| 349 float tonality2[240]; |
| 350 float midE[8]; |
| 351 float spec_variability=0; |
| 352 float band_log2[NB_TBANDS+1]; |
| 353 float leakage_from[NB_TBANDS+1]; |
| 354 float leakage_to[NB_TBANDS+1]; |
238 SAVE_STACK; | 355 SAVE_STACK; |
239 | 356 |
240 tonal->last_transition++; | 357 alpha = 1.f/IMIN(10, 1+tonal->count); |
241 alpha = 1.f/IMIN(20, 1+tonal->count); | 358 alphaE = 1.f/IMIN(25, 1+tonal->count); |
242 alphaE = 1.f/IMIN(50, 1+tonal->count); | 359 alphaE2 = 1.f/IMIN(500, 1+tonal->count); |
243 alphaE2 = 1.f/IMIN(1000, 1+tonal->count); | |
244 | 360 |
245 if (tonal->count<4) | 361 if (tonal->Fs == 48000) |
246 tonal->music_prob = .5; | 362 { |
| 363 /* len and offset are now at 24 kHz. */ |
| 364 len/= 2; |
| 365 offset /= 2; |
| 366 } else if (tonal->Fs == 16000) { |
| 367 len = 3*len/2; |
| 368 offset = 3*offset/2; |
| 369 } |
| 370 |
| 371 if (tonal->count<4) { |
| 372 if (tonal->application == OPUS_APPLICATION_VOIP) |
| 373 tonal->music_prob = .1f; |
| 374 else |
| 375 tonal->music_prob = .625f; |
| 376 } |
247 kfft = celt_mode->mdct.kfft[0]; | 377 kfft = celt_mode->mdct.kfft[0]; |
248 if (tonal->count==0) | 378 if (tonal->count==0) |
249 tonal->mem_fill = 240; | 379 tonal->mem_fill = 240; |
250 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal
->mem_fill), offset, c1, c2, C); | 380 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x, |
| 381 &tonal->inmem[tonal->mem_fill], tonal->downmix_state, |
| 382 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal
->Fs); |
251 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) | 383 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) |
252 { | 384 { |
253 tonal->mem_fill += len; | 385 tonal->mem_fill += len; |
254 /* Don't have enough to update the analysis */ | 386 /* Don't have enough to update the analysis */ |
255 RESTORE_STACK; | 387 RESTORE_STACK; |
256 return; | 388 return; |
257 } | 389 } |
| 390 hp_ener = tonal->hp_ener_accum; |
258 info = &tonal->info[tonal->write_pos++]; | 391 info = &tonal->info[tonal->write_pos++]; |
259 if (tonal->write_pos>=DETECT_SIZE) | 392 if (tonal->write_pos>=DETECT_SIZE) |
260 tonal->write_pos-=DETECT_SIZE; | 393 tonal->write_pos-=DETECT_SIZE; |
261 | 394 |
262 ALLOC(in, 480, kiss_fft_cpx); | 395 ALLOC(in, 480, kiss_fft_cpx); |
263 ALLOC(out, 480, kiss_fft_cpx); | 396 ALLOC(out, 480, kiss_fft_cpx); |
264 ALLOC(tonality, 240, float); | 397 ALLOC(tonality, 240, float); |
265 ALLOC(noisiness, 240, float); | 398 ALLOC(noisiness, 240, float); |
266 for (i=0;i<N2;i++) | 399 for (i=0;i<N2;i++) |
267 { | 400 { |
268 float w = analysis_window[i]; | 401 float w = analysis_window[i]; |
269 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); | 402 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); |
270 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); | 403 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); |
271 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); | 404 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); |
272 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); | 405 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); |
273 } | 406 } |
274 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); | 407 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); |
275 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); | 408 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); |
276 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->me
m_fill, c1, c2, C); | 409 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x, |
| 410 &tonal->inmem[240], tonal->downmix_state, remaining, |
| 411 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs); |
277 tonal->mem_fill = 240 + remaining; | 412 tonal->mem_fill = 240 + remaining; |
278 opus_fft(kfft, in, out, tonal->arch); | 413 opus_fft(kfft, in, out, tonal->arch); |
279 #ifndef FIXED_POINT | 414 #ifndef FIXED_POINT |
280 /* If there's any NaN on the input, the entire output will be NaN, so we onl
y need to check one value. */ | 415 /* If there's any NaN on the input, the entire output will be NaN, so we onl
y need to check one value. */ |
281 if (celt_isnan(out[0].r)) | 416 if (celt_isnan(out[0].r)) |
282 { | 417 { |
283 info->valid = 0; | 418 info->valid = 0; |
284 RESTORE_STACK; | 419 RESTORE_STACK; |
285 return; | 420 return; |
286 } | 421 } |
(...skipping 11 matching lines...) Expand all Loading... |
298 X2i = (float)out[N-i].r-out[i].r; | 433 X2i = (float)out[N-i].r-out[i].r; |
299 | 434 |
300 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); | 435 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); |
301 d_angle = angle - A[i]; | 436 d_angle = angle - A[i]; |
302 d2_angle = d_angle - dA[i]; | 437 d2_angle = d_angle - dA[i]; |
303 | 438 |
304 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); | 439 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); |
305 d_angle2 = angle2 - angle; | 440 d_angle2 = angle2 - angle; |
306 d2_angle2 = d_angle2 - d_angle; | 441 d2_angle2 = d_angle2 - d_angle; |
307 | 442 |
308 mod1 = d2_angle - (float)floor(.5+d2_angle); | 443 mod1 = d2_angle - (float)float2int(d2_angle); |
309 noisiness[i] = ABS16(mod1); | 444 noisiness[i] = ABS16(mod1); |
310 mod1 *= mod1; | 445 mod1 *= mod1; |
311 mod1 *= mod1; | 446 mod1 *= mod1; |
312 | 447 |
313 mod2 = d2_angle2 - (float)floor(.5+d2_angle2); | 448 mod2 = d2_angle2 - (float)float2int(d2_angle2); |
314 noisiness[i] += ABS16(mod2); | 449 noisiness[i] += ABS16(mod2); |
315 mod2 *= mod2; | 450 mod2 *= mod2; |
316 mod2 *= mod2; | 451 mod2 *= mod2; |
317 | 452 |
318 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); | 453 avg_mod = .25f*(d2A[i]+mod1+2*mod2); |
| 454 /* This introduces an extra delay of 2 frames in the detection. */ |
319 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; | 455 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; |
| 456 /* No delay on this detection, but it's less reliable. */ |
| 457 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f; |
320 | 458 |
321 A[i] = angle2; | 459 A[i] = angle2; |
322 dA[i] = d_angle2; | 460 dA[i] = d_angle2; |
323 d2A[i] = mod2; | 461 d2A[i] = mod2; |
324 } | 462 } |
325 | 463 for (i=2;i<N2-1;i++) |
| 464 { |
| 465 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1])); |
| 466 tonality[i] = .9f*MAX32(tonality[i], tt-.1f); |
| 467 } |
326 frame_tonality = 0; | 468 frame_tonality = 0; |
327 max_frame_tonality = 0; | 469 max_frame_tonality = 0; |
328 /*tw_sum = 0;*/ | 470 /*tw_sum = 0;*/ |
329 info->activity = 0; | 471 info->activity = 0; |
330 frame_noisiness = 0; | 472 frame_noisiness = 0; |
331 frame_stationarity = 0; | 473 frame_stationarity = 0; |
332 if (!tonal->count) | 474 if (!tonal->count) |
333 { | 475 { |
334 for (b=0;b<NB_TBANDS;b++) | 476 for (b=0;b<NB_TBANDS;b++) |
335 { | 477 { |
336 tonal->lowE[b] = 1e10; | 478 tonal->lowE[b] = 1e10; |
337 tonal->highE[b] = -1e10; | 479 tonal->highE[b] = -1e10; |
338 } | 480 } |
339 } | 481 } |
340 relativeE = 0; | 482 relativeE = 0; |
341 frame_loudness = 0; | 483 frame_loudness = 0; |
| 484 /* The energy of the very first band is special because of DC. */ |
| 485 { |
| 486 float E = 0; |
| 487 float X1r, X2r; |
| 488 X1r = 2*(float)out[0].r; |
| 489 X2r = 2*(float)out[0].i; |
| 490 E = X1r*X1r + X2r*X2r; |
| 491 for (i=1;i<4;i++) |
| 492 { |
| 493 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
| 494 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
| 495 E += binE; |
| 496 } |
| 497 E = SCALE_ENER(E); |
| 498 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f); |
| 499 } |
342 for (b=0;b<NB_TBANDS;b++) | 500 for (b=0;b<NB_TBANDS;b++) |
343 { | 501 { |
344 float E=0, tE=0, nE=0; | 502 float E=0, tE=0, nE=0; |
345 float L1, L2; | 503 float L1, L2; |
346 float stationarity; | 504 float stationarity; |
347 for (i=tbands[b];i<tbands[b+1];i++) | 505 for (i=tbands[b];i<tbands[b+1];i++) |
348 { | 506 { |
349 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r | 507 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
350 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; | 508 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
351 #ifdef FIXED_POINT | 509 binE = SCALE_ENER(binE); |
352 /* FIXME: It's probably best to change the BFCC filter initial state i
nstead */ | |
353 binE *= 5.55e-17f; | |
354 #endif | |
355 E += binE; | 510 E += binE; |
356 tE += binE*tonality[i]; | 511 tE += binE*MAX32(0, tonality[i]); |
357 nE += binE*2.f*(.5f-noisiness[i]); | 512 nE += binE*2.f*(.5f-noisiness[i]); |
358 } | 513 } |
359 #ifndef FIXED_POINT | 514 #ifndef FIXED_POINT |
360 /* Check for extreme band energies that could cause NaNs later. */ | 515 /* Check for extreme band energies that could cause NaNs later. */ |
361 if (!(E<1e9f) || celt_isnan(E)) | 516 if (!(E<1e9f) || celt_isnan(E)) |
362 { | 517 { |
363 info->valid = 0; | 518 info->valid = 0; |
364 RESTORE_STACK; | 519 RESTORE_STACK; |
365 return; | 520 return; |
366 } | 521 } |
367 #endif | 522 #endif |
368 | 523 |
369 tonal->E[tonal->E_count][b] = E; | 524 tonal->E[tonal->E_count][b] = E; |
370 frame_noisiness += nE/(1e-15f+E); | 525 frame_noisiness += nE/(1e-15f+E); |
371 | 526 |
372 frame_loudness += (float)sqrt(E+1e-10f); | 527 frame_loudness += (float)sqrt(E+1e-10f); |
373 logE[b] = (float)log(E+1e-10f); | 528 logE[b] = (float)log(E+1e-10f); |
374 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); | 529 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f); |
375 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); | 530 tonal->logE[tonal->E_count][b] = logE[b]; |
376 if (tonal->highE[b] < tonal->lowE[b]+1.f) | 531 if (tonal->count==0) |
| 532 tonal->highE[b] = tonal->lowE[b] = logE[b]; |
| 533 if (tonal->highE[b] > tonal->lowE[b] + 7.5) |
377 { | 534 { |
378 tonal->highE[b]+=.5f; | 535 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b]) |
379 tonal->lowE[b]-=.5f; | 536 tonal->highE[b] -= .01f; |
| 537 else |
| 538 tonal->lowE[b] += .01f; |
380 } | 539 } |
381 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE
[b]); | 540 if (logE[b] > tonal->highE[b]) |
| 541 { |
| 542 tonal->highE[b] = logE[b]; |
| 543 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]); |
| 544 } else if (logE[b] < tonal->lowE[b]) |
| 545 { |
| 546 tonal->lowE[b] = logE[b]; |
| 547 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]); |
| 548 } |
| 549 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->l
owE[b])); |
382 | 550 |
383 L1=L2=0; | 551 L1=L2=0; |
384 for (i=0;i<NB_FRAMES;i++) | 552 for (i=0;i<NB_FRAMES;i++) |
385 { | 553 { |
386 L1 += (float)sqrt(tonal->E[i][b]); | 554 L1 += (float)sqrt(tonal->E[i][b]); |
387 L2 += tonal->E[i][b]; | 555 L2 += tonal->E[i][b]; |
388 } | 556 } |
389 | 557 |
390 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); | 558 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); |
391 stationarity *= stationarity; | 559 stationarity *= stationarity; |
(...skipping 11 matching lines...) Expand all Loading... |
403 frame_tonality += band_tonality[b]; | 571 frame_tonality += band_tonality[b]; |
404 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) | 572 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) |
405 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; | 573 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; |
406 #endif | 574 #endif |
407 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*f
rame_tonality); | 575 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*f
rame_tonality); |
408 slope += band_tonality[b]*(b-8); | 576 slope += band_tonality[b]*(b-8); |
409 /*printf("%f %f ", band_tonality[b], stationarity);*/ | 577 /*printf("%f %f ", band_tonality[b], stationarity);*/ |
410 tonal->prev_band_tonality[b] = band_tonality[b]; | 578 tonal->prev_band_tonality[b] = band_tonality[b]; |
411 } | 579 } |
412 | 580 |
| 581 leakage_from[0] = band_log2[0]; |
| 582 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET; |
| 583 for (b=1;b<NB_TBANDS+1;b++) |
| 584 { |
| 585 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4; |
| 586 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]); |
| 587 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OF
FSET); |
| 588 } |
| 589 for (b=NB_TBANDS-2;b>=0;b--) |
| 590 { |
| 591 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4; |
| 592 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]); |
| 593 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]); |
| 594 } |
| 595 celt_assert(NB_TBANDS+1 <= LEAK_BANDS); |
| 596 for (b=0;b<NB_TBANDS+1;b++) |
| 597 { |
| 598 /* leak_boost[] is made up of two terms. The first, based on leakage_to[]
, |
| 599 represents the boost needed to overcome the amount of analysis leakage |
| 600 cause in a weaker band b by louder neighbouring bands. |
| 601 The second, based on leakage_from[], applies to a loud band b for |
| 602 which the quantization noise causes synthesis leakage to the weaker |
| 603 neighbouring bands. */ |
| 604 float boost = MAX16(0, leakage_to[b] - band_log2[b]) + |
| 605 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET)); |
| 606 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost)); |
| 607 } |
| 608 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0; |
| 609 |
| 610 for (i=0;i<NB_FRAMES;i++) |
| 611 { |
| 612 int j; |
| 613 float mindist = 1e15f; |
| 614 for (j=0;j<NB_FRAMES;j++) |
| 615 { |
| 616 int k; |
| 617 float dist=0; |
| 618 for (k=0;k<NB_TBANDS;k++) |
| 619 { |
| 620 float tmp; |
| 621 tmp = tonal->logE[i][k] - tonal->logE[j][k]; |
| 622 dist += tmp*tmp; |
| 623 } |
| 624 if (j!=i) |
| 625 mindist = MIN32(mindist, dist); |
| 626 } |
| 627 spec_variability += mindist; |
| 628 } |
| 629 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS); |
413 bandwidth_mask = 0; | 630 bandwidth_mask = 0; |
414 bandwidth = 0; | 631 bandwidth = 0; |
415 maxE = 0; | 632 maxE = 0; |
416 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); | 633 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); |
417 #ifdef FIXED_POINT | |
418 noise_floor *= 1<<(15+SIG_SHIFT); | |
419 #endif | |
420 noise_floor *= noise_floor; | 634 noise_floor *= noise_floor; |
421 for (b=0;b<NB_TOT_BANDS;b++) | 635 for (b=0;b<NB_TBANDS;b++) |
422 { | 636 { |
423 float E=0; | 637 float E=0; |
424 int band_start, band_end; | 638 int band_start, band_end; |
425 /* Keep a margin of 300 Hz for aliasing */ | 639 /* Keep a margin of 300 Hz for aliasing */ |
426 band_start = extra_bands[b]; | 640 band_start = tbands[b]; |
427 band_end = extra_bands[b+1]; | 641 band_end = tbands[b+1]; |
428 for (i=band_start;i<band_end;i++) | 642 for (i=band_start;i<band_end;i++) |
429 { | 643 { |
430 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r | 644 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
431 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; | 645 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
432 E += binE; | 646 E += binE; |
433 } | 647 } |
| 648 E = SCALE_ENER(E); |
434 maxE = MAX32(maxE, E); | 649 maxE = MAX32(maxE, E); |
435 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); | 650 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); |
436 E = MAX32(E, tonal->meanE[b]); | 651 E = MAX32(E, tonal->meanE[b]); |
437 /* Use a simple follower with 13 dB/Bark slope for spreading function */ | 652 /* Use a simple follower with 13 dB/Bark slope for spreading function */ |
438 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); | 653 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); |
439 /* Consider the band "active" only if all these conditions are met: | 654 /* Consider the band "active" only if all these conditions are met: |
440 1) less than 10 dB below the simple follower | 655 1) less than 10 dB below the simple follower |
441 2) less than 90 dB below the peak band (maximal masking possible consi
dering | 656 2) less than 90 dB below the peak band (maximal masking possible consi
dering |
442 both the ATH and the loudness-dependent slope of the spreading func
tion) | 657 both the ATH and the loudness-dependent slope of the spreading func
tion) |
443 3) above the PCM quantization noise floor | 658 3) above the PCM quantization noise floor |
| 659 We use b+1 because the first CELT band isn't included in tbands[] |
444 */ | 660 */ |
445 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-ban
d_start)) | 661 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-ban
d_start)) |
446 bandwidth = b; | 662 bandwidth = b+1; |
| 663 } |
| 664 /* Special case for the last two bands, for which we don't have spectrum but
only |
| 665 the energy above 12 kHz. */ |
| 666 if (tonal->Fs == 48000) { |
| 667 float ratio; |
| 668 float E = hp_ener*(1.f/(240*240)); |
| 669 ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f; |
| 670 #ifdef FIXED_POINT |
| 671 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */ |
| 672 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE); |
| 673 #endif |
| 674 maxE = MAX32(maxE, E); |
| 675 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); |
| 676 E = MAX32(E, tonal->meanE[b]); |
| 677 /* Use a simple follower with 13 dB/Bark slope for spreading function */ |
| 678 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); |
| 679 if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160) |
| 680 bandwidth = 20; |
| 681 /* This detector is unreliable, so if the bandwidth is close to SWB, assu
me it's FB. */ |
| 682 if (bandwidth >= 17) |
| 683 bandwidth = 20; |
447 } | 684 } |
448 if (tonal->count<=2) | 685 if (tonal->count<=2) |
449 bandwidth = 20; | 686 bandwidth = 20; |
450 frame_loudness = 20*(float)log10(frame_loudness); | 687 frame_loudness = 20*(float)log10(frame_loudness); |
451 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); | 688 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness); |
452 tonal->lowECount *= (1-alphaE); | 689 tonal->lowECount *= (1-alphaE); |
453 if (frame_loudness < tonal->Etracker-30) | 690 if (frame_loudness < tonal->Etracker-30) |
454 tonal->lowECount += alphaE; | 691 tonal->lowECount += alphaE; |
455 | 692 |
456 for (i=0;i<8;i++) | 693 for (i=0;i<8;i++) |
457 { | 694 { |
458 float sum=0; | 695 float sum=0; |
459 for (b=0;b<16;b++) | 696 for (b=0;b<16;b++) |
460 sum += dct_table[i*16+b]*logE[b]; | 697 sum += dct_table[i*16+b]*logE[b]; |
461 BFCC[i] = sum; | 698 BFCC[i] = sum; |
462 } | 699 } |
| 700 for (i=0;i<8;i++) |
| 701 { |
| 702 float sum=0; |
| 703 for (b=0;b<16;b++) |
| 704 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]); |
| 705 midE[i] = sum; |
| 706 } |
463 | 707 |
464 frame_stationarity /= NB_TBANDS; | 708 frame_stationarity /= NB_TBANDS; |
465 relativeE /= NB_TBANDS; | 709 relativeE /= NB_TBANDS; |
466 if (tonal->count<10) | 710 if (tonal->count<10) |
467 relativeE = .5; | 711 relativeE = .5f; |
468 frame_noisiness /= NB_TBANDS; | 712 frame_noisiness /= NB_TBANDS; |
469 #if 1 | 713 #if 1 |
470 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; | 714 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; |
471 #else | 715 #else |
472 info->activity = .5*(1+frame_noisiness-frame_stationarity); | 716 info->activity = .5*(1+frame_noisiness-frame_stationarity); |
473 #endif | 717 #endif |
474 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); | 718 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); |
475 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); | 719 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); |
476 tonal->prev_tonality = frame_tonality; | 720 tonal->prev_tonality = frame_tonality; |
477 | 721 |
478 slope /= 8*8; | 722 slope /= 8*8; |
479 info->tonality_slope = slope; | 723 info->tonality_slope = slope; |
480 | 724 |
481 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; | 725 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; |
482 tonal->count++; | 726 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX); |
483 info->tonality = frame_tonality; | 727 info->tonality = frame_tonality; |
484 | 728 |
485 for (i=0;i<4;i++) | 729 for (i=0;i<4;i++) |
486 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem
[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; | 730 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem
[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; |
487 | 731 |
488 for (i=0;i<4;i++) | 732 for (i=0;i<4;i++) |
489 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; | 733 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; |
490 | 734 |
491 for (i=0;i<4;i++) | 735 for (i=0;i<4;i++) |
492 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->m
em[i]-tonal->mem[i+16]); | 736 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->m
em[i]-tonal->mem[i+16]); |
493 for (i=0;i<3;i++) | 737 for (i=0;i<3;i++) |
494 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->m
em[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; | 738 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->m
em[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; |
495 | 739 |
496 if (tonal->count > 5) | 740 if (tonal->count > 5) |
497 { | 741 { |
498 for (i=0;i<9;i++) | 742 for (i=0;i<9;i++) |
499 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i
]; | 743 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i
]; |
500 } | 744 } |
| 745 for (i=0;i<4;i++) |
| 746 features[i] = BFCC[i]-midE[i]; |
501 | 747 |
502 for (i=0;i<8;i++) | 748 for (i=0;i<8;i++) |
503 { | 749 { |
504 tonal->mem[i+24] = tonal->mem[i+16]; | 750 tonal->mem[i+24] = tonal->mem[i+16]; |
505 tonal->mem[i+16] = tonal->mem[i+8]; | 751 tonal->mem[i+16] = tonal->mem[i+8]; |
506 tonal->mem[i+8] = tonal->mem[i]; | 752 tonal->mem[i+8] = tonal->mem[i]; |
507 tonal->mem[i] = BFCC[i]; | 753 tonal->mem[i] = BFCC[i]; |
508 } | 754 } |
509 for (i=0;i<9;i++) | 755 for (i=0;i<9;i++) |
510 features[11+i] = (float)sqrt(tonal->std[i]); | 756 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i]; |
511 features[20] = info->tonality; | 757 features[18] = spec_variability - 0.78f; |
512 features[21] = info->activity; | 758 features[20] = info->tonality - 0.154723f; |
513 features[22] = frame_stationarity; | 759 features[21] = info->activity - 0.724643f; |
514 features[23] = info->tonality_slope; | 760 features[22] = frame_stationarity - 0.743717f; |
515 features[24] = tonal->lowECount; | 761 features[23] = info->tonality_slope + 0.069216f; |
| 762 features[24] = tonal->lowECount - 0.067930f; |
516 | 763 |
517 #ifndef DISABLE_FLOAT_API | |
518 mlp_process(&net, features, frame_probs); | 764 mlp_process(&net, features, frame_probs); |
519 frame_probs[0] = .5f*(frame_probs[0]+1); | 765 frame_probs[0] = .5f*(frame_probs[0]+1); |
520 /* Curve fitting between the MLP probability and the actual probability */ | 766 /* Curve fitting between the MLP probability and the actual probability */ |
521 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)po
w(frame_probs[0], 10); | 767 /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)
pow(frame_probs[0], 10);*/ |
522 /* Probability of active audio (as opposed to silence) */ | 768 /* Probability of active audio (as opposed to silence) */ |
523 frame_probs[1] = .5f*frame_probs[1]+.5f; | 769 frame_probs[1] = .5f*frame_probs[1]+.5f; |
524 /* Consider that silence has a 50-50 probability. */ | 770 frame_probs[1] *= frame_probs[1]; |
525 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f; | |
526 | 771 |
527 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ | 772 /* Probability of speech or music vs noise */ |
| 773 info->activity_probability = frame_probs[1]; |
| 774 |
| 775 /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/ |
528 { | 776 { |
529 /* Probability of state transition */ | 777 /* Probability of state transition */ |
530 float tau; | 778 float tau; |
531 /* Represents independence of the MLP probabilities, where | 779 /* Represents independence of the MLP probabilities, where |
532 beta=1 means fully independent. */ | 780 beta=1 means fully independent. */ |
533 float beta; | 781 float beta; |
534 /* Denormalized probability of speech (p0) and music (p1) after update */ | 782 /* Denormalized probability of speech (p0) and music (p1) after update */ |
535 float p0, p1; | 783 float p0, p1; |
536 /* Probabilities for "all speech" and "all music" */ | 784 /* Probabilities for "all speech" and "all music" */ |
537 float s0, m0; | 785 float s0, m0; |
538 /* Probability sum for renormalisation */ | 786 /* Probability sum for renormalisation */ |
539 float psum; | 787 float psum; |
540 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ | 788 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ |
541 float speech0; | 789 float speech0; |
542 float music0; | 790 float music0; |
543 float p, q; | 791 float p, q; |
544 | 792 |
| 793 /* More silence transitions for speech than for music. */ |
| 794 tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob); |
| 795 p = MAX16(.05f,MIN16(.95f,frame_probs[1])); |
| 796 q = MAX16(.05f,MIN16(.95f,tonal->vad_prob)); |
| 797 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); |
| 798 /* p0 and p1 are the probabilities of speech and music at this frame |
| 799 using only information from previous frame and applying the |
| 800 state transition model */ |
| 801 p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau; |
| 802 p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau; |
| 803 /* We apply the current probability with exponent beta to work around |
| 804 the fact that the probability estimates aren't independent. */ |
| 805 p0 *= (float)pow(1-frame_probs[1], beta); |
| 806 p1 *= (float)pow(frame_probs[1], beta); |
| 807 /* Normalise the probabilities to get the Markov probability of music. */ |
| 808 tonal->vad_prob = p1/(p0+p1); |
| 809 info->vad_prob = tonal->vad_prob; |
| 810 /* Consider that silence has a 50-50 probability of being speech or music
. */ |
| 811 frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f
; |
| 812 |
545 /* One transition every 3 minutes of active audio */ | 813 /* One transition every 3 minutes of active audio */ |
546 tau = .00005f*frame_probs[1]; | 814 tau = .0001f; |
547 /* Adapt beta based on how "unexpected" the new prob is */ | 815 /* Adapt beta based on how "unexpected" the new prob is */ |
548 p = MAX16(.05f,MIN16(.95f,frame_probs[0])); | 816 p = MAX16(.05f,MIN16(.95f,frame_probs[0])); |
549 q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); | 817 q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); |
550 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); | 818 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); |
551 /* p0 and p1 are the probabilities of speech and music at this frame | 819 /* p0 and p1 are the probabilities of speech and music at this frame |
552 using only information from previous frame and applying the | 820 using only information from previous frame and applying the |
553 state transition model */ | 821 state transition model */ |
554 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; | 822 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; |
555 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; | 823 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; |
556 /* We apply the current probability with exponent beta to work around | 824 /* We apply the current probability with exponent beta to work around |
557 the fact that the probability estimates aren't independent. */ | 825 the fact that the probability estimates aren't independent. */ |
558 p0 *= (float)pow(1-frame_probs[0], beta); | 826 p0 *= (float)pow(1-frame_probs[0], beta); |
559 p1 *= (float)pow(frame_probs[0], beta); | 827 p1 *= (float)pow(frame_probs[0], beta); |
560 /* Normalise the probabilities to get the Markov probability of music. */ | 828 /* Normalise the probabilities to get the Markov probability of music. */ |
561 tonal->music_prob = p1/(p0+p1); | 829 tonal->music_prob = p1/(p0+p1); |
562 info->music_prob = tonal->music_prob; | 830 info->music_prob = tonal->music_prob; |
563 | 831 |
| 832 /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_pr
ob, tonal->vad_prob);*/ |
564 /* This chunk of code deals with delayed decision. */ | 833 /* This chunk of code deals with delayed decision. */ |
565 psum=1e-20f; | 834 psum=1e-20f; |
566 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ | 835 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ |
567 speech0 = (float)pow(1-frame_probs[0], beta); | 836 speech0 = (float)pow(1-frame_probs[0], beta); |
568 music0 = (float)pow(frame_probs[0], beta); | 837 music0 = (float)pow(frame_probs[0], beta); |
569 if (tonal->count==1) | 838 if (tonal->count==1) |
570 { | 839 { |
571 tonal->pspeech[0]=.5; | 840 if (tonal->application == OPUS_APPLICATION_VOIP) |
572 tonal->pmusic [0]=.5; | 841 tonal->pmusic[0] = .1f; |
| 842 else |
| 843 tonal->pmusic[0] = .625f; |
| 844 tonal->pspeech[0] = 1-tonal->pmusic[0]; |
573 } | 845 } |
574 /* Updated probability of having only speech (s0) or only music (m0), | 846 /* Updated probability of having only speech (s0) or only music (m0), |
575 before considering the new observation. */ | 847 before considering the new observation. */ |
576 s0 = tonal->pspeech[0] + tonal->pspeech[1]; | 848 s0 = tonal->pspeech[0] + tonal->pspeech[1]; |
577 m0 = tonal->pmusic [0] + tonal->pmusic [1]; | 849 m0 = tonal->pmusic [0] + tonal->pmusic [1]; |
578 /* Updates s0 and m0 with instantaneous probability. */ | 850 /* Updates s0 and m0 with instantaneous probability. */ |
579 tonal->pspeech[0] = s0*(1-tau)*speech0; | 851 tonal->pspeech[0] = s0*(1-tau)*speech0; |
580 tonal->pmusic [0] = m0*(1-tau)*music0; | 852 tonal->pmusic [0] = m0*(1-tau)*music0; |
581 /* Propagate the transition probabilities */ | 853 /* Propagate the transition probabilities */ |
582 for (i=1;i<DETECT_SIZE-1;i++) | 854 for (i=1;i<DETECT_SIZE-1;i++) |
(...skipping 29 matching lines...) Expand all Loading... |
612 tonal->music_confidence_count = IMIN(tonal->music_confidence_count,
500); | 884 tonal->music_confidence_count = IMIN(tonal->music_confidence_count,
500); |
613 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->m
usic_confidence); | 885 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->m
usic_confidence); |
614 } | 886 } |
615 if (tonal->music_prob<.1) | 887 if (tonal->music_prob<.1) |
616 { | 888 { |
617 float adapt; | 889 float adapt; |
618 adapt = 1.f/(++tonal->speech_confidence_count); | 890 adapt = 1.f/(++tonal->speech_confidence_count); |
619 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun
t, 500); | 891 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun
t, 500); |
620 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s
peech_confidence); | 892 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s
peech_confidence); |
621 } | 893 } |
622 } else { | |
623 if (tonal->music_confidence_count==0) | |
624 tonal->music_confidence = .9f; | |
625 if (tonal->speech_confidence_count==0) | |
626 tonal->speech_confidence = .1f; | |
627 } | 894 } |
628 } | 895 } |
629 if (tonal->last_music != (tonal->music_prob>.5f)) | |
630 tonal->last_transition=0; | |
631 tonal->last_music = tonal->music_prob>.5f; | 896 tonal->last_music = tonal->music_prob>.5f; |
632 #else | 897 #ifdef MLP_TRAINING |
633 info->music_prob = 0; | 898 for (i=0;i<25;i++) |
| 899 printf("%f ", features[i]); |
| 900 printf("\n"); |
634 #endif | 901 #endif |
635 /*for (i=0;i<25;i++) | |
636 printf("%f ", features[i]); | |
637 printf("\n");*/ | |
638 | 902 |
639 info->bandwidth = bandwidth; | 903 info->bandwidth = bandwidth; |
| 904 tonal->prev_bandwidth = bandwidth; |
640 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ | 905 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ |
641 info->noisiness = frame_noisiness; | 906 info->noisiness = frame_noisiness; |
642 info->valid = 1; | 907 info->valid = 1; |
643 RESTORE_STACK; | 908 RESTORE_STACK; |
644 } | 909 } |
645 | 910 |
646 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co
nst void *analysis_pcm, | 911 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co
nst void *analysis_pcm, |
647 int analysis_frame_size, int frame_size, int c1, int c2, int C,
opus_int32 Fs, | 912 int analysis_frame_size, int frame_size, int c1, int c2, int C,
opus_int32 Fs, |
648 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_inf
o) | 913 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_inf
o) |
649 { | 914 { |
650 int offset; | 915 int offset; |
651 int pcm_len; | 916 int pcm_len; |
652 | 917 |
| 918 analysis_frame_size -= analysis_frame_size&1; |
653 if (analysis_pcm != NULL) | 919 if (analysis_pcm != NULL) |
654 { | 920 { |
655 /* Avoid overflow/wrap-around of the analysis buffer */ | 921 /* Avoid overflow/wrap-around of the analysis buffer */ |
656 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); | 922 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size); |
657 | 923 |
658 pcm_len = analysis_frame_size - analysis->analysis_offset; | 924 pcm_len = analysis_frame_size - analysis->analysis_offset; |
659 offset = analysis->analysis_offset; | 925 offset = analysis->analysis_offset; |
660 do { | 926 while (pcm_len>0) { |
661 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len)
, offset, c1, c2, C, lsb_depth, downmix); | 927 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_le
n), offset, c1, c2, C, lsb_depth, downmix); |
662 offset += 480; | 928 offset += Fs/50; |
663 pcm_len -= 480; | 929 pcm_len -= Fs/50; |
664 } while (pcm_len>0); | 930 } |
665 analysis->analysis_offset = analysis_frame_size; | 931 analysis->analysis_offset = analysis_frame_size; |
666 | 932 |
667 analysis->analysis_offset -= frame_size; | 933 analysis->analysis_offset -= frame_size; |
668 } | 934 } |
669 | 935 |
670 analysis_info->valid = 0; | 936 analysis_info->valid = 0; |
671 tonality_get_info(analysis, analysis_info, frame_size); | 937 tonality_get_info(analysis, analysis_info, frame_size); |
672 } | 938 } |
| 939 |
| 940 #endif /* DISABLE_FLOAT_API */ |
OLD | NEW |