| OLD | NEW |
| 1 /* Copyright (c) 2011 Xiph.Org Foundation | 1 /* Copyright (c) 2011 Xiph.Org Foundation |
| 2 Written by Jean-Marc Valin */ | 2 Written by Jean-Marc Valin */ |
| 3 /* | 3 /* |
| 4 Redistribution and use in source and binary forms, with or without | 4 Redistribution and use in source and binary forms, with or without |
| 5 modification, are permitted provided that the following conditions | 5 modification, are permitted provided that the following conditions |
| 6 are met: | 6 are met: |
| 7 | 7 |
| 8 - Redistributions of source code must retain the above copyright | 8 - Redistributions of source code must retain the above copyright |
| 9 notice, this list of conditions and the following disclaimer. | 9 notice, this list of conditions and the following disclaimer. |
| 10 | 10 |
| (...skipping 11 matching lines...) |
| 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 26 */ | 26 */ |
| 27 | 27 |
| 28 #ifdef HAVE_CONFIG_H | 28 #ifdef HAVE_CONFIG_H |
| 29 #include "config.h" | 29 #include "config.h" |
| 30 #endif | 30 #endif |
| 31 | 31 |
| 32 #define ANALYSIS_C |
| 33 |
| 34 #include <stdio.h> |
| 35 |
| 36 #include "mathops.h" |
| 32 #include "kiss_fft.h" | 37 #include "kiss_fft.h" |
| 33 #include "celt.h" | 38 #include "celt.h" |
| 34 #include "modes.h" | 39 #include "modes.h" |
| 35 #include "arch.h" | 40 #include "arch.h" |
| 36 #include "quant_bands.h" | 41 #include "quant_bands.h" |
| 37 #include <stdio.h> | |
| 38 #include "analysis.h" | 42 #include "analysis.h" |
| 39 #include "mlp.h" | 43 #include "mlp.h" |
| 40 #include "stack_alloc.h" | 44 #include "stack_alloc.h" |
| 45 #include "float_cast.h" |
| 41 | 46 |
| 42 #ifndef M_PI | 47 #ifndef M_PI |
| 43 #define M_PI 3.141592653 | 48 #define M_PI 3.141592653 |
| 44 #endif | 49 #endif |
| 45 | 50 |
| 51 #ifndef DISABLE_FLOAT_API |
| 52 |
| 46 static const float dct_table[128] = { | 53 static const float dct_table[128] = { |
| 47 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, | 54 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, |
| 48 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, | 55 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.2500
00f, 0.250000f, |
| 49 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.1026
31f, 0.034654f, | 56 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.1026
31f, 0.034654f, |
| 50 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.3383
30f,-0.351851f, | 57 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.3383
30f,-0.351851f, |
| 51 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.2939
69f,-0.346760f, | 58 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.2939
69f,-0.346760f, |
| 52 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.2939
69f, 0.346760f, | 59 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.2939
69f, 0.346760f, |
| 53 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.2733
00f,-0.102631f, | 60 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.2733
00f,-0.102631f, |
| 54 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.2242
92f,-0.338330f, | 61 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.2242
92f,-0.338330f, |
| 55 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.1352
99f, 0.326641f, | 62 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.1352
99f, 0.326641f, |
| (...skipping 33 matching lines...) |
| 89 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627
f, 0.904508f, | 96 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627
f, 0.904508f, |
| 90 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703
f, 0.933013f, | 97 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703
f, 0.933013f, |
| 91 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072
f, 0.956773f, | 98 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072
f, 0.956773f, |
| 92 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465
f, 0.975528f, | 99 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465
f, 0.975528f, |
| 93 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671
f, 0.989074f, | 100 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671
f, 0.989074f, |
| 94 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534
f, 0.997261f, | 101 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534
f, 0.997261f, |
| 95 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957
f, 1.000000f, | 102 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957
f, 1.000000f, |
| 96 }; | 103 }; |
| 97 | 104 |
| 98 static const int tbands[NB_TBANDS+1] = { | 105 static const int tbands[NB_TBANDS+1] = { |
| 99 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 12
0 | 106 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192,
240 |
| 100 }; | 107 }; |
| 101 | 108 |
| 102 static const int extra_bands[NB_TOT_BANDS+1] = { | |
| 103 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96,
120, 160, 200 | |
| 104 }; | |
| 105 | |
| 106 /*static const float tweight[NB_TBANDS+1] = { | |
| 107 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 | |
| 108 };*/ | |
| 109 | |
| 110 #define NB_TONAL_SKIP_BANDS 9 | 109 #define NB_TONAL_SKIP_BANDS 9 |
| 111 | 110 |
| 112 #define cA 0.43157974f | 111 static opus_val32 silk_resampler_down2_hp( |
| 113 #define cB 0.67848403f | 112 opus_val32 *S, /* I/O State vector [ 3 ]
*/ |
| 114 #define cC 0.08595542f | 113 opus_val32 *out, /* O Output signal [ floo
r(len/2) ] */ |
| 115 #define cE ((float)M_PI/2) | 114 const opus_val32 *in, /* I Input signal [ len ]
*/ |
| 116 static OPUS_INLINE float fast_atan2f(float y, float x) { | 115 int inLen /* I Number of input samp
les */ |
| 117 float x2, y2; | 116 ) |
| 118 /* Should avoid underflow on the values we'll get */ | 117 { |
| 119 if (ABS16(x)+ABS16(y)<1e-9f) | 118 int k, len2 = inLen/2; |
| 120 { | 119 opus_val32 in32, out32, out32_hp, Y, X; |
| 121 x*=1e12f; | 120 opus_val64 hp_ener = 0; |
| 122 y*=1e12f; | 121 /* Internal variables and state keep the same scaling as the input */ |
| 123 } | 122 for( k = 0; k < len2; k++ ) { |
| 124 x2 = x*x; | 123 /* Load the even input sample (no Q10 conversion is applied) */ |
| 125 y2 = y*y; | 124 in32 = in[ 2 * k ]; |
| 126 if(x2<y2){ | 125 |
| 127 float den = (y2 + cB*x2) * (y2 + cC*x2); | 126 /* All-pass section for even input sample */ |
| 128 if (den!=0) | 127 Y = SUB32( in32, S[ 0 ] ); |
| 129 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE); | 128 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y); |
| 130 else | 129 out32 = ADD32( S[ 0 ], X ); |
| 131 return (y<0 ? -cE : cE); | 130 S[ 0 ] = ADD32( in32, X ); |
| 132 }else{ | 131 out32_hp = out32; |
| 133 float den = (x2 + cB*y2) * (x2 + cC*y2); | 132 /* Load the odd input sample (no Q10 conversion is applied) */ |
| 134 if (den!=0) | 133 in32 = in[ 2 * k + 1 ]; |
| 135 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); | 134 |
| 136 else | 135 /* All-pass section for odd input sample, and add to output of previous
section */ |
| 137 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); | 136 Y = SUB32( in32, S[ 1 ] ); |
| 138 } | 137 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); |
| 138 out32 = ADD32( out32, S[ 1 ] ); |
| 139 out32 = ADD32( out32, X ); |
| 140 S[ 1 ] = ADD32( in32, X ); |
| 141 |
| 142 Y = SUB32( -in32, S[ 2 ] ); |
| 143 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); |
| 144 out32_hp = ADD32( out32_hp, S[ 2 ] ); |
| 145 out32_hp = ADD32( out32_hp, X ); |
| 146 S[ 2 ] = ADD32( -in32, X ); |
| 147 |
| 148 hp_ener += out32_hp*(opus_val64)out32_hp; |
| 149 /* Apply HALF32 to the summed all-pass outputs and store as opus_val32 */ |
| 150 out[ k ] = HALF32(out32); |
| 151 } |
| 152 #ifdef FIXED_POINT |
| 153 /* len2 can be up to 480, so we shift by 8 more to make it fit. */ |
| 154 hp_ener = hp_ener >> (2*SIG_SHIFT + 8); |
| 155 #endif |
| 156 return (opus_val32)hp_ener; |
| 139 } | 157 } |
| 140 | 158 |
| 141 void tonality_analysis_init(TonalityAnalysisState *tonal) | 159 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opu
s_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, in
t Fs) |
| 160 { |
| 161 VARDECL(opus_val32, tmp); |
| 162 opus_val32 scale; |
| 163 int j; |
| 164 opus_val32 ret = 0; |
| 165 SAVE_STACK; |
| 166 |
| 167 if (subframe==0) return 0; |
| 168 if (Fs == 48000) |
| 169 { |
| 170 subframe *= 2; |
| 171 offset *= 2; |
| 172 } else if (Fs == 16000) { |
| 173 subframe = subframe*2/3; |
| 174 offset = offset*2/3; |
| 175 } |
| 176 ALLOC(tmp, subframe, opus_val32); |
| 177 |
| 178 downmix(_x, tmp, subframe, offset, c1, c2, C); |
| 179 #ifdef FIXED_POINT |
| 180 scale = (1<<SIG_SHIFT); |
| 181 #else |
| 182 scale = 1.f/32768; |
| 183 #endif |
| 184 if (c2==-2) |
| 185 scale /= C; |
| 186 else if (c2>-1) |
| 187 scale /= 2; |
| 188 for (j=0;j<subframe;j++) |
| 189 tmp[j] *= scale; |
| 190 if (Fs == 48000) |
| 191 { |
| 192 ret = silk_resampler_down2_hp(S, y, tmp, subframe); |
| 193 } else if (Fs == 24000) { |
| 194 OPUS_COPY(y, tmp, subframe); |
| 195 } else if (Fs == 16000) { |
| 196 VARDECL(opus_val32, tmp3x); |
| 197 ALLOC(tmp3x, 3*subframe, opus_val32); |
| 198 /* Don't do this at home! This resampler is horrible and it's only (barely
) |
| 199 usable for the purpose of the analysis because we don't care about all |
| 200 the aliasing between 8 kHz and 12 kHz. */ |
| 201 for (j=0;j<subframe;j++) |
| 202 { |
| 203 tmp3x[3*j] = tmp[j]; |
| 204 tmp3x[3*j+1] = tmp[j]; |
| 205 tmp3x[3*j+2] = tmp[j]; |
| 206 } |
| 207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe); |
| 208 } |
| 209 RESTORE_STACK; |
| 210 return ret; |
| 211 } |
| 212 |
| 213 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs) |
| 142 { | 214 { |
| 143 /* Initialize reusable fields. */ | 215 /* Initialize reusable fields. */ |
| 144 tonal->arch = opus_select_arch(); | 216 tonal->arch = opus_select_arch(); |
| 217 tonal->Fs = Fs; |
| 145 /* Clear remaining fields. */ | 218 /* Clear remaining fields. */ |
| 146 tonality_analysis_reset(tonal); | 219 tonality_analysis_reset(tonal); |
| 147 } | 220 } |
| 148 | 221 |
| 149 void tonality_analysis_reset(TonalityAnalysisState *tonal) | 222 void tonality_analysis_reset(TonalityAnalysisState *tonal) |
| 150 { | 223 { |
| 151 /* Clear non-reusable fields. */ | 224 /* Clear non-reusable fields. */ |
| 152 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; | 225 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; |
| 153 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); | 226 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); |
| 227 tonal->music_confidence = .9f; |
| 228 tonal->speech_confidence = .1f; |
| 154 } | 229 } |
| 155 | 230 |
| 156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
len) | 231 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
len) |
| 157 { | 232 { |
| 158 int pos; | 233 int pos; |
| 159 int curr_lookahead; | 234 int curr_lookahead; |
| 160 float psum; | 235 float psum; |
| 236 float tonality_max; |
| 237 float tonality_avg; |
| 238 int tonality_count; |
| 161 int i; | 239 int i; |
| 162 | 240 |
| 163 pos = tonal->read_pos; | 241 pos = tonal->read_pos; |
| 164 curr_lookahead = tonal->write_pos-tonal->read_pos; | 242 curr_lookahead = tonal->write_pos-tonal->read_pos; |
| 165 if (curr_lookahead<0) | 243 if (curr_lookahead<0) |
| 166 curr_lookahead += DETECT_SIZE; | 244 curr_lookahead += DETECT_SIZE; |
| 167 | 245 |
| 168 if (len > 480 && pos != tonal->write_pos) | 246 /* On long frames, look at the second analysis window rather than the first.
*/ |
| 247 if (len > tonal->Fs/50 && pos != tonal->write_pos) |
| 169 { | 248 { |
| 170 pos++; | 249 pos++; |
| 171 if (pos==DETECT_SIZE) | 250 if (pos==DETECT_SIZE) |
| 172 pos=0; | 251 pos=0; |
| 173 } | 252 } |
| 174 if (pos == tonal->write_pos) | 253 if (pos == tonal->write_pos) |
| 175 pos--; | 254 pos--; |
| 176 if (pos<0) | 255 if (pos<0) |
| 177 pos = DETECT_SIZE-1; | 256 pos = DETECT_SIZE-1; |
| 178 OPUS_COPY(info_out, &tonal->info[pos], 1); | 257 OPUS_COPY(info_out, &tonal->info[pos], 1); |
| 179 tonal->read_subframe += len/120; | 258 tonality_max = tonality_avg = info_out->tonality; |
| 180 while (tonal->read_subframe>=4) | 259 tonality_count = 1; |
| 260 /* If possible, look ahead for a tone to compensate for the delay in the tone
detector. */ |
| 261 for (i=0;i<3;i++) |
| 181 { | 262 { |
| 182 tonal->read_subframe -= 4; | 263 pos++; |
| 264 if (pos==DETECT_SIZE) |
| 265 pos = 0; |
| 266 if (pos == tonal->write_pos) |
| 267 break; |
| 268 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality); |
| 269 tonality_avg += tonal->info[pos].tonality; |
| 270 tonality_count++; |
| 271 } |
| 272 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f); |
| 273 tonal->read_subframe += len/(tonal->Fs/400); |
| 274 while (tonal->read_subframe>=8) |
| 275 { |
| 276 tonal->read_subframe -= 8; |
| 183 tonal->read_pos++; | 277 tonal->read_pos++; |
| 184 } | 278 } |
| 185 if (tonal->read_pos>=DETECT_SIZE) | 279 if (tonal->read_pos>=DETECT_SIZE) |
| 186 tonal->read_pos-=DETECT_SIZE; | 280 tonal->read_pos-=DETECT_SIZE; |
| 187 | 281 |
| 188 /* Compensate for the delay in the features themselves. | 282 /* The -1 is to compensate for the delay in the features themselves. */ |
| 189 FIXME: Need a better estimate the 10 I just made up */ | 283 curr_lookahead = IMAX(curr_lookahead-1, 0); |
| 190 curr_lookahead = IMAX(curr_lookahead-10, 0); | |
| 191 | 284 |
| 192 psum=0; | 285 psum=0; |
| 193 /* Summing the probability of transition patterns that involve music at | 286 /* Summing the probability of transition patterns that involve music at |
| 194 time (DETECT_SIZE-curr_lookahead-1) */ | 287 time (DETECT_SIZE-curr_lookahead-1) */ |
| 195 for (i=0;i<DETECT_SIZE-curr_lookahead;i++) | 288 for (i=0;i<DETECT_SIZE-curr_lookahead;i++) |
| 196 psum += tonal->pmusic[i]; | 289 psum += tonal->pmusic[i]; |
| 197 for (;i<DETECT_SIZE;i++) | 290 for (;i<DETECT_SIZE;i++) |
| 198 psum += tonal->pspeech[i]; | 291 psum += tonal->pspeech[i]; |
| 199 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; | 292 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; |
| 200 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ | 293 /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob,
info_out->activity_probability, info_out->tonality);*/ |
| 201 | 294 |
| 202 info_out->music_prob = psum; | 295 info_out->music_prob = psum; |
| 203 } | 296 } |
| 204 | 297 |
| 298 static const float std_feature_bias[9] = { |
| 299 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f, |
| 300 2.163313f, 1.260756f, 1.116868f, 1.918795f |
| 301 }; |
| 302 |
| 303 #define LEAKAGE_OFFSET 2.5f |
| 304 #define LEAKAGE_SLOPE 2.f |
| 305 |
| 306 #ifdef FIXED_POINT |
| 307 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to |
| 308 compensate for that in the energy. */ |
| 309 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT))) |
| 310 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e)) |
| 311 #else |
| 312 #define SCALE_ENER(e) (e) |
| 313 #endif |
| 314 |
| 205 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth,
downmix_func downmix) | 315 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth,
downmix_func downmix) |
| 206 { | 316 { |
| 207 int i, b; | 317 int i, b; |
| 208 const kiss_fft_state *kfft; | 318 const kiss_fft_state *kfft; |
| 209 VARDECL(kiss_fft_cpx, in); | 319 VARDECL(kiss_fft_cpx, in); |
| 210 VARDECL(kiss_fft_cpx, out); | 320 VARDECL(kiss_fft_cpx, out); |
| 211 int N = 480, N2=240; | 321 int N = 480, N2=240; |
| 212 float * OPUS_RESTRICT A = tonal->angle; | 322 float * OPUS_RESTRICT A = tonal->angle; |
| 213 float * OPUS_RESTRICT dA = tonal->d_angle; | 323 float * OPUS_RESTRICT dA = tonal->d_angle; |
| 214 float * OPUS_RESTRICT d2A = tonal->d2_angle; | 324 float * OPUS_RESTRICT d2A = tonal->d2_angle; |
| (...skipping 13 matching lines...) |
| 228 float relativeE; | 338 float relativeE; |
| 229 float frame_probs[2]; | 339 float frame_probs[2]; |
| 230 float alpha, alphaE, alphaE2; | 340 float alpha, alphaE, alphaE2; |
| 231 float frame_loudness; | 341 float frame_loudness; |
| 232 float bandwidth_mask; | 342 float bandwidth_mask; |
| 233 int bandwidth=0; | 343 int bandwidth=0; |
| 234 float maxE = 0; | 344 float maxE = 0; |
| 235 float noise_floor; | 345 float noise_floor; |
| 236 int remaining; | 346 int remaining; |
| 237 AnalysisInfo *info; | 347 AnalysisInfo *info; |
| 348 float hp_ener; |
| 349 float tonality2[240]; |
| 350 float midE[8]; |
| 351 float spec_variability=0; |
| 352 float band_log2[NB_TBANDS+1]; |
| 353 float leakage_from[NB_TBANDS+1]; |
| 354 float leakage_to[NB_TBANDS+1]; |
| 238 SAVE_STACK; | 355 SAVE_STACK; |
| 239 | 356 |
| 240 tonal->last_transition++; | 357 alpha = 1.f/IMIN(10, 1+tonal->count); |
| 241 alpha = 1.f/IMIN(20, 1+tonal->count); | 358 alphaE = 1.f/IMIN(25, 1+tonal->count); |
| 242 alphaE = 1.f/IMIN(50, 1+tonal->count); | 359 alphaE2 = 1.f/IMIN(500, 1+tonal->count); |
| 243 alphaE2 = 1.f/IMIN(1000, 1+tonal->count); | |
| 244 | 360 |
| 245 if (tonal->count<4) | 361 if (tonal->Fs == 48000) |
| 246 tonal->music_prob = .5; | 362 { |
| 363 /* len and offset are now at 24 kHz. */ |
| 364 len/= 2; |
| 365 offset /= 2; |
| 366 } else if (tonal->Fs == 16000) { |
| 367 len = 3*len/2; |
| 368 offset = 3*offset/2; |
| 369 } |
| 370 |
| 371 if (tonal->count<4) { |
| 372 if (tonal->application == OPUS_APPLICATION_VOIP) |
| 373 tonal->music_prob = .1f; |
| 374 else |
| 375 tonal->music_prob = .625f; |
| 376 } |
| 247 kfft = celt_mode->mdct.kfft[0]; | 377 kfft = celt_mode->mdct.kfft[0]; |
| 248 if (tonal->count==0) | 378 if (tonal->count==0) |
| 249 tonal->mem_fill = 240; | 379 tonal->mem_fill = 240; |
| 250 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal
->mem_fill), offset, c1, c2, C); | 380 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x, |
| 381 &tonal->inmem[tonal->mem_fill], tonal->downmix_state, |
| 382 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal
->Fs); |
| 251 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) | 383 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) |
| 252 { | 384 { |
| 253 tonal->mem_fill += len; | 385 tonal->mem_fill += len; |
| 254 /* Don't have enough to update the analysis */ | 386 /* Don't have enough to update the analysis */ |
| 255 RESTORE_STACK; | 387 RESTORE_STACK; |
| 256 return; | 388 return; |
| 257 } | 389 } |
| 390 hp_ener = tonal->hp_ener_accum; |
| 258 info = &tonal->info[tonal->write_pos++]; | 391 info = &tonal->info[tonal->write_pos++]; |
| 259 if (tonal->write_pos>=DETECT_SIZE) | 392 if (tonal->write_pos>=DETECT_SIZE) |
| 260 tonal->write_pos-=DETECT_SIZE; | 393 tonal->write_pos-=DETECT_SIZE; |
| 261 | 394 |
| 262 ALLOC(in, 480, kiss_fft_cpx); | 395 ALLOC(in, 480, kiss_fft_cpx); |
| 263 ALLOC(out, 480, kiss_fft_cpx); | 396 ALLOC(out, 480, kiss_fft_cpx); |
| 264 ALLOC(tonality, 240, float); | 397 ALLOC(tonality, 240, float); |
| 265 ALLOC(noisiness, 240, float); | 398 ALLOC(noisiness, 240, float); |
| 266 for (i=0;i<N2;i++) | 399 for (i=0;i<N2;i++) |
| 267 { | 400 { |
| 268 float w = analysis_window[i]; | 401 float w = analysis_window[i]; |
| 269 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); | 402 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); |
| 270 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); | 403 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); |
| 271 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); | 404 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); |
| 272 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); | 405 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); |
| 273 } | 406 } |
| 274 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); | 407 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); |
| 275 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); | 408 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); |
| 276 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->me
m_fill, c1, c2, C); | 409 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x, |
| 410 &tonal->inmem[240], tonal->downmix_state, remaining, |
| 411 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs); |
| 277 tonal->mem_fill = 240 + remaining; | 412 tonal->mem_fill = 240 + remaining; |
| 278 opus_fft(kfft, in, out, tonal->arch); | 413 opus_fft(kfft, in, out, tonal->arch); |
| 279 #ifndef FIXED_POINT | 414 #ifndef FIXED_POINT |
| 280 /* If there's any NaN on the input, the entire output will be NaN, so we onl
y need to check one value. */ | 415 /* If there's any NaN on the input, the entire output will be NaN, so we onl
y need to check one value. */ |
| 281 if (celt_isnan(out[0].r)) | 416 if (celt_isnan(out[0].r)) |
| 282 { | 417 { |
| 283 info->valid = 0; | 418 info->valid = 0; |
| 284 RESTORE_STACK; | 419 RESTORE_STACK; |
| 285 return; | 420 return; |
| 286 } | 421 } |
| (...skipping 11 matching lines...) |
| 298 X2i = (float)out[N-i].r-out[i].r; | 433 X2i = (float)out[N-i].r-out[i].r; |
| 299 | 434 |
| 300 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); | 435 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); |
| 301 d_angle = angle - A[i]; | 436 d_angle = angle - A[i]; |
| 302 d2_angle = d_angle - dA[i]; | 437 d2_angle = d_angle - dA[i]; |
| 303 | 438 |
| 304 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); | 439 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); |
| 305 d_angle2 = angle2 - angle; | 440 d_angle2 = angle2 - angle; |
| 306 d2_angle2 = d_angle2 - d_angle; | 441 d2_angle2 = d_angle2 - d_angle; |
| 307 | 442 |
| 308 mod1 = d2_angle - (float)floor(.5+d2_angle); | 443 mod1 = d2_angle - (float)float2int(d2_angle); |
| 309 noisiness[i] = ABS16(mod1); | 444 noisiness[i] = ABS16(mod1); |
| 310 mod1 *= mod1; | 445 mod1 *= mod1; |
| 311 mod1 *= mod1; | 446 mod1 *= mod1; |
| 312 | 447 |
| 313 mod2 = d2_angle2 - (float)floor(.5+d2_angle2); | 448 mod2 = d2_angle2 - (float)float2int(d2_angle2); |
| 314 noisiness[i] += ABS16(mod2); | 449 noisiness[i] += ABS16(mod2); |
| 315 mod2 *= mod2; | 450 mod2 *= mod2; |
| 316 mod2 *= mod2; | 451 mod2 *= mod2; |
| 317 | 452 |
| 318 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); | 453 avg_mod = .25f*(d2A[i]+mod1+2*mod2); |
| 454 /* This introduces an extra delay of 2 frames in the detection. */ |
| 319 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; | 455 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; |
| 456 /* No delay on this detection, but it's less reliable. */ |
| 457 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f; |
| 320 | 458 |
| 321 A[i] = angle2; | 459 A[i] = angle2; |
| 322 dA[i] = d_angle2; | 460 dA[i] = d_angle2; |
| 323 d2A[i] = mod2; | 461 d2A[i] = mod2; |
| 324 } | 462 } |
| 325 | 463 for (i=2;i<N2-1;i++) |
| 464 { |
| 465 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1])); |
| 466 tonality[i] = .9f*MAX32(tonality[i], tt-.1f); |
| 467 } |
| 326 frame_tonality = 0; | 468 frame_tonality = 0; |
| 327 max_frame_tonality = 0; | 469 max_frame_tonality = 0; |
| 328 /*tw_sum = 0;*/ | 470 /*tw_sum = 0;*/ |
| 329 info->activity = 0; | 471 info->activity = 0; |
| 330 frame_noisiness = 0; | 472 frame_noisiness = 0; |
| 331 frame_stationarity = 0; | 473 frame_stationarity = 0; |
| 332 if (!tonal->count) | 474 if (!tonal->count) |
| 333 { | 475 { |
| 334 for (b=0;b<NB_TBANDS;b++) | 476 for (b=0;b<NB_TBANDS;b++) |
| 335 { | 477 { |
| 336 tonal->lowE[b] = 1e10; | 478 tonal->lowE[b] = 1e10; |
| 337 tonal->highE[b] = -1e10; | 479 tonal->highE[b] = -1e10; |
| 338 } | 480 } |
| 339 } | 481 } |
| 340 relativeE = 0; | 482 relativeE = 0; |
| 341 frame_loudness = 0; | 483 frame_loudness = 0; |
| 484 /* The energy of the very first band is special because of DC. */ |
| 485 { |
| 486 float E = 0; |
| 487 float X1r, X2r; |
| 488 X1r = 2*(float)out[0].r; |
| 489 X2r = 2*(float)out[0].i; |
| 490 E = X1r*X1r + X2r*X2r; |
| 491 for (i=1;i<4;i++) |
| 492 { |
| 493 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
| 494 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
| 495 E += binE; |
| 496 } |
| 497 E = SCALE_ENER(E); |
| 498 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f); |
| 499 } |
| 342 for (b=0;b<NB_TBANDS;b++) | 500 for (b=0;b<NB_TBANDS;b++) |
| 343 { | 501 { |
| 344 float E=0, tE=0, nE=0; | 502 float E=0, tE=0, nE=0; |
| 345 float L1, L2; | 503 float L1, L2; |
| 346 float stationarity; | 504 float stationarity; |
| 347 for (i=tbands[b];i<tbands[b+1];i++) | 505 for (i=tbands[b];i<tbands[b+1];i++) |
| 348 { | 506 { |
| 349 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r | 507 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
| 350 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; | 508 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
| 351 #ifdef FIXED_POINT | 509 binE = SCALE_ENER(binE); |
| 352 /* FIXME: It's probably best to change the BFCC filter initial state i
nstead */ | |
| 353 binE *= 5.55e-17f; | |
| 354 #endif | |
| 355 E += binE; | 510 E += binE; |
| 356 tE += binE*tonality[i]; | 511 tE += binE*MAX32(0, tonality[i]); |
| 357 nE += binE*2.f*(.5f-noisiness[i]); | 512 nE += binE*2.f*(.5f-noisiness[i]); |
| 358 } | 513 } |
| 359 #ifndef FIXED_POINT | 514 #ifndef FIXED_POINT |
| 360 /* Check for extreme band energies that could cause NaNs later. */ | 515 /* Check for extreme band energies that could cause NaNs later. */ |
| 361 if (!(E<1e9f) || celt_isnan(E)) | 516 if (!(E<1e9f) || celt_isnan(E)) |
| 362 { | 517 { |
| 363 info->valid = 0; | 518 info->valid = 0; |
| 364 RESTORE_STACK; | 519 RESTORE_STACK; |
| 365 return; | 520 return; |
| 366 } | 521 } |
| 367 #endif | 522 #endif |
| 368 | 523 |
| 369 tonal->E[tonal->E_count][b] = E; | 524 tonal->E[tonal->E_count][b] = E; |
| 370 frame_noisiness += nE/(1e-15f+E); | 525 frame_noisiness += nE/(1e-15f+E); |
| 371 | 526 |
| 372 frame_loudness += (float)sqrt(E+1e-10f); | 527 frame_loudness += (float)sqrt(E+1e-10f); |
| 373 logE[b] = (float)log(E+1e-10f); | 528 logE[b] = (float)log(E+1e-10f); |
| 374 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); | 529 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f); |
| 375 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); | 530 tonal->logE[tonal->E_count][b] = logE[b]; |
| 376 if (tonal->highE[b] < tonal->lowE[b]+1.f) | 531 if (tonal->count==0) |
| 532 tonal->highE[b] = tonal->lowE[b] = logE[b]; |
| 533 if (tonal->highE[b] > tonal->lowE[b] + 7.5) |
| 377 { | 534 { |
| 378 tonal->highE[b]+=.5f; | 535 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b]) |
| 379 tonal->lowE[b]-=.5f; | 536 tonal->highE[b] -= .01f; |
| 537 else |
| 538 tonal->lowE[b] += .01f; |
| 380 } | 539 } |
| 381 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE
[b]); | 540 if (logE[b] > tonal->highE[b]) |
| 541 { |
| 542 tonal->highE[b] = logE[b]; |
| 543 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]); |
| 544 } else if (logE[b] < tonal->lowE[b]) |
| 545 { |
| 546 tonal->lowE[b] = logE[b]; |
| 547 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]); |
| 548 } |
| 549 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->l
owE[b])); |
| 382 | 550 |
| 383 L1=L2=0; | 551 L1=L2=0; |
| 384 for (i=0;i<NB_FRAMES;i++) | 552 for (i=0;i<NB_FRAMES;i++) |
| 385 { | 553 { |
| 386 L1 += (float)sqrt(tonal->E[i][b]); | 554 L1 += (float)sqrt(tonal->E[i][b]); |
| 387 L2 += tonal->E[i][b]; | 555 L2 += tonal->E[i][b]; |
| 388 } | 556 } |
| 389 | 557 |
| 390 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); | 558 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); |
| 391 stationarity *= stationarity; | 559 stationarity *= stationarity; |
| (...skipping 11 matching lines...) |
| 403 frame_tonality += band_tonality[b]; | 571 frame_tonality += band_tonality[b]; |
| 404 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) | 572 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) |
| 405 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; | 573 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; |
| 406 #endif | 574 #endif |
| 407 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*f
rame_tonality); | 575 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*f
rame_tonality); |
| 408 slope += band_tonality[b]*(b-8); | 576 slope += band_tonality[b]*(b-8); |
| 409 /*printf("%f %f ", band_tonality[b], stationarity);*/ | 577 /*printf("%f %f ", band_tonality[b], stationarity);*/ |
| 410 tonal->prev_band_tonality[b] = band_tonality[b]; | 578 tonal->prev_band_tonality[b] = band_tonality[b]; |
| 411 } | 579 } |
| 412 | 580 |
| 581 leakage_from[0] = band_log2[0]; |
| 582 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET; |
| 583 for (b=1;b<NB_TBANDS+1;b++) |
| 584 { |
| 585 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4; |
| 586 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]); |
| 587 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OF
FSET); |
| 588 } |
| 589 for (b=NB_TBANDS-2;b>=0;b--) |
| 590 { |
| 591 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4; |
| 592 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]); |
| 593 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]); |
| 594 } |
| 595 celt_assert(NB_TBANDS+1 <= LEAK_BANDS); |
| 596 for (b=0;b<NB_TBANDS+1;b++) |
| 597 { |
| 598 /* leak_boost[] is made up of two terms. The first, based on leakage_to[]
, |
| 599 represents the boost needed to overcome the amount of analysis leakage |
| 600 caused in a weaker band b by louder neighbouring bands. |
| 601 The second, based on leakage_from[], applies to a loud band b for |
| 602 which the quantization noise causes synthesis leakage to the weaker |
| 603 neighbouring bands. */ |
| 604 float boost = MAX16(0, leakage_to[b] - band_log2[b]) + |
| 605 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET)); |
| 606 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost)); |
| 607 } |
| 608 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0; |
| 609 |
| 610 for (i=0;i<NB_FRAMES;i++) |
| 611 { |
| 612 int j; |
| 613 float mindist = 1e15f; |
| 614 for (j=0;j<NB_FRAMES;j++) |
| 615 { |
| 616 int k; |
| 617 float dist=0; |
| 618 for (k=0;k<NB_TBANDS;k++) |
| 619 { |
| 620 float tmp; |
| 621 tmp = tonal->logE[i][k] - tonal->logE[j][k]; |
| 622 dist += tmp*tmp; |
| 623 } |
| 624 if (j!=i) |
| 625 mindist = MIN32(mindist, dist); |
| 626 } |
| 627 spec_variability += mindist; |
| 628 } |
| 629 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS); |
| 413 bandwidth_mask = 0; | 630 bandwidth_mask = 0; |
| 414 bandwidth = 0; | 631 bandwidth = 0; |
| 415 maxE = 0; | 632 maxE = 0; |
| 416 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); | 633 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); |
| 417 #ifdef FIXED_POINT | |
| 418 noise_floor *= 1<<(15+SIG_SHIFT); | |
| 419 #endif | |
| 420 noise_floor *= noise_floor; | 634 noise_floor *= noise_floor; |
| 421 for (b=0;b<NB_TOT_BANDS;b++) | 635 for (b=0;b<NB_TBANDS;b++) |
| 422 { | 636 { |
| 423 float E=0; | 637 float E=0; |
| 424 int band_start, band_end; | 638 int band_start, band_end; |
| 425 /* Keep a margin of 300 Hz for aliasing */ | 639 /* Keep a margin of 300 Hz for aliasing */ |
| 426 band_start = extra_bands[b]; | 640 band_start = tbands[b]; |
| 427 band_end = extra_bands[b+1]; | 641 band_end = tbands[b+1]; |
| 428 for (i=band_start;i<band_end;i++) | 642 for (i=band_start;i<band_end;i++) |
| 429 { | 643 { |
| 430 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r | 644 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r |
| 431 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; | 645 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; |
| 432 E += binE; | 646 E += binE; |
| 433 } | 647 } |
| 648 E = SCALE_ENER(E); |
| 434 maxE = MAX32(maxE, E); | 649 maxE = MAX32(maxE, E); |
| 435 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); | 650 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); |
| 436 E = MAX32(E, tonal->meanE[b]); | 651 E = MAX32(E, tonal->meanE[b]); |
| 437 /* Use a simple follower with 13 dB/Bark slope for spreading function */ | 652 /* Use a simple follower with 13 dB/Bark slope for spreading function */ |
| 438 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); | 653 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); |
| 439 /* Consider the band "active" only if all these conditions are met: | 654 /* Consider the band "active" only if all these conditions are met: |
| 440 1) less than 10 dB below the simple follower | 655 1) less than 10 dB below the simple follower |
| 441 2) less than 90 dB below the peak band (maximal masking possible consi
dering | 656 2) less than 90 dB below the peak band (maximal masking possible consi
dering |
| 442 both the ATH and the loudness-dependent slope of the spreading func
tion) | 657 both the ATH and the loudness-dependent slope of the spreading func
tion) |
| 443 3) above the PCM quantization noise floor | 658 3) above the PCM quantization noise floor |
| 659 We use b+1 because the first CELT band isn't included in tbands[] |
| 444 */ | 660 */ |
| 445 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-ban
d_start)) | 661 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-ban
d_start)) |
| 446 bandwidth = b; | 662 bandwidth = b+1; |
| 663 } |
| 664 /* Special case for the last two bands, for which we don't have spectrum but
only |
| 665 the energy above 12 kHz. */ |
| 666 if (tonal->Fs == 48000) { |
| 667 float ratio; |
| 668 float E = hp_ener*(1.f/(240*240)); |
| 669 ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f; |
| 670 #ifdef FIXED_POINT |
| 671 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */ |
| 672 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE); |
| 673 #endif |
| 674 maxE = MAX32(maxE, E); |
| 675 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); |
| 676 E = MAX32(E, tonal->meanE[b]); |
| 677 /* Use a simple follower with 13 dB/Bark slope for spreading function */ |
| 678 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); |
| 679 if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160) |
| 680 bandwidth = 20; |
| 681 /* This detector is unreliable, so if the bandwidth is close to SWB, assu
me it's FB. */ |
| 682 if (bandwidth >= 17) |
| 683 bandwidth = 20; |
| 447 } | 684 } |
| 448 if (tonal->count<=2) | 685 if (tonal->count<=2) |
| 449 bandwidth = 20; | 686 bandwidth = 20; |
| 450 frame_loudness = 20*(float)log10(frame_loudness); | 687 frame_loudness = 20*(float)log10(frame_loudness); |
| 451 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); | 688 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness); |
| 452 tonal->lowECount *= (1-alphaE); | 689 tonal->lowECount *= (1-alphaE); |
| 453 if (frame_loudness < tonal->Etracker-30) | 690 if (frame_loudness < tonal->Etracker-30) |
| 454 tonal->lowECount += alphaE; | 691 tonal->lowECount += alphaE; |
| 455 | 692 |
| 456 for (i=0;i<8;i++) | 693 for (i=0;i<8;i++) |
| 457 { | 694 { |
| 458 float sum=0; | 695 float sum=0; |
| 459 for (b=0;b<16;b++) | 696 for (b=0;b<16;b++) |
| 460 sum += dct_table[i*16+b]*logE[b]; | 697 sum += dct_table[i*16+b]*logE[b]; |
| 461 BFCC[i] = sum; | 698 BFCC[i] = sum; |
| 462 } | 699 } |
| 700 for (i=0;i<8;i++) |
| 701 { |
| 702 float sum=0; |
| 703 for (b=0;b<16;b++) |
| 704 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]); |
| 705 midE[i] = sum; |
| 706 } |
| 463 | 707 |
| 464 frame_stationarity /= NB_TBANDS; | 708 frame_stationarity /= NB_TBANDS; |
| 465 relativeE /= NB_TBANDS; | 709 relativeE /= NB_TBANDS; |
| 466 if (tonal->count<10) | 710 if (tonal->count<10) |
| 467 relativeE = .5; | 711 relativeE = .5f; |
| 468 frame_noisiness /= NB_TBANDS; | 712 frame_noisiness /= NB_TBANDS; |
| 469 #if 1 | 713 #if 1 |
| 470 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; | 714 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; |
| 471 #else | 715 #else |
| 472 info->activity = .5*(1+frame_noisiness-frame_stationarity); | 716 info->activity = .5*(1+frame_noisiness-frame_stationarity); |
| 473 #endif | 717 #endif |
| 474 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); | 718 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); |
| 475 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); | 719 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); |
| 476 tonal->prev_tonality = frame_tonality; | 720 tonal->prev_tonality = frame_tonality; |
| 477 | 721 |
| 478 slope /= 8*8; | 722 slope /= 8*8; |
| 479 info->tonality_slope = slope; | 723 info->tonality_slope = slope; |
| 480 | 724 |
| 481 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; | 725 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; |
| 482 tonal->count++; | 726 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX); |
| 483 info->tonality = frame_tonality; | 727 info->tonality = frame_tonality; |
| 484 | 728 |
| 485 for (i=0;i<4;i++) | 729 for (i=0;i<4;i++) |
| 486 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem
[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; | 730 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem
[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; |
| 487 | 731 |
| 488 for (i=0;i<4;i++) | 732 for (i=0;i<4;i++) |
| 489 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; | 733 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; |
| 490 | 734 |
| 491 for (i=0;i<4;i++) | 735 for (i=0;i<4;i++) |
| 492 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->m
em[i]-tonal->mem[i+16]); | 736 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->m
em[i]-tonal->mem[i+16]); |
| 493 for (i=0;i<3;i++) | 737 for (i=0;i<3;i++) |
| 494 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->m
em[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; | 738 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->m
em[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; |
| 495 | 739 |
| 496 if (tonal->count > 5) | 740 if (tonal->count > 5) |
| 497 { | 741 { |
| 498 for (i=0;i<9;i++) | 742 for (i=0;i<9;i++) |
| 499 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i
]; | 743 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i
]; |
| 500 } | 744 } |
| 745 for (i=0;i<4;i++) |
| 746 features[i] = BFCC[i]-midE[i]; |
| 501 | 747 |
| 502 for (i=0;i<8;i++) | 748 for (i=0;i<8;i++) |
| 503 { | 749 { |
| 504 tonal->mem[i+24] = tonal->mem[i+16]; | 750 tonal->mem[i+24] = tonal->mem[i+16]; |
| 505 tonal->mem[i+16] = tonal->mem[i+8]; | 751 tonal->mem[i+16] = tonal->mem[i+8]; |
| 506 tonal->mem[i+8] = tonal->mem[i]; | 752 tonal->mem[i+8] = tonal->mem[i]; |
| 507 tonal->mem[i] = BFCC[i]; | 753 tonal->mem[i] = BFCC[i]; |
| 508 } | 754 } |
| 509 for (i=0;i<9;i++) | 755 for (i=0;i<9;i++) |
| 510 features[11+i] = (float)sqrt(tonal->std[i]); | 756 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i]; |
| 511 features[20] = info->tonality; | 757 features[18] = spec_variability - 0.78f; |
| 512 features[21] = info->activity; | 758 features[20] = info->tonality - 0.154723f; |
| 513 features[22] = frame_stationarity; | 759 features[21] = info->activity - 0.724643f; |
| 514 features[23] = info->tonality_slope; | 760 features[22] = frame_stationarity - 0.743717f; |
| 515 features[24] = tonal->lowECount; | 761 features[23] = info->tonality_slope + 0.069216f; |
| 762 features[24] = tonal->lowECount - 0.067930f; |
| 516 | 763 |
| 517 #ifndef DISABLE_FLOAT_API | |
| 518 mlp_process(&net, features, frame_probs); | 764 mlp_process(&net, features, frame_probs); |
| 519 frame_probs[0] = .5f*(frame_probs[0]+1); | 765 frame_probs[0] = .5f*(frame_probs[0]+1); |
| 520 /* Curve fitting between the MLP probability and the actual probability */ | 766 /* Curve fitting between the MLP probability and the actual probability */ |
| 521 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)po
w(frame_probs[0], 10); | 767 /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)
pow(frame_probs[0], 10);*/ |
| 522 /* Probability of active audio (as opposed to silence) */ | 768 /* Probability of active audio (as opposed to silence) */ |
| 523 frame_probs[1] = .5f*frame_probs[1]+.5f; | 769 frame_probs[1] = .5f*frame_probs[1]+.5f; |
| 524 /* Consider that silence has a 50-50 probability. */ | 770 frame_probs[1] *= frame_probs[1]; |
| 525 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f; | |
| 526 | 771 |
| 527 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ | 772 /* Probability of speech or music vs noise */ |
| 773 info->activity_probability = frame_probs[1]; |
| 774 |
| 775 /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/ |
| 528 { | 776 { |
| 529 /* Probability of state transition */ | 777 /* Probability of state transition */ |
| 530 float tau; | 778 float tau; |
| 531 /* Represents independence of the MLP probabilities, where | 779 /* Represents independence of the MLP probabilities, where |
| 532 beta=1 means fully independent. */ | 780 beta=1 means fully independent. */ |
| 533 float beta; | 781 float beta; |
| 534 /* Denormalized probability of speech (p0) and music (p1) after update */ | 782 /* Denormalized probability of speech (p0) and music (p1) after update */ |
| 535 float p0, p1; | 783 float p0, p1; |
| 536 /* Probabilities for "all speech" and "all music" */ | 784 /* Probabilities for "all speech" and "all music" */ |
| 537 float s0, m0; | 785 float s0, m0; |
| 538 /* Probability sum for renormalisation */ | 786 /* Probability sum for renormalisation */ |
| 539 float psum; | 787 float psum; |
| 540 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ | 788 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ |
| 541 float speech0; | 789 float speech0; |
| 542 float music0; | 790 float music0; |
| 543 float p, q; | 791 float p, q; |
| 544 | 792 |
| 793 /* More silence transitions for speech than for music. */ |
| 794 tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob); |
| 795 p = MAX16(.05f,MIN16(.95f,frame_probs[1])); |
| 796 q = MAX16(.05f,MIN16(.95f,tonal->vad_prob)); |
| 797 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); |
| 798 /* p0 and p1 are the probabilities of silence and active audio at this frame |
| 799 using only information from the previous frame and applying the |
| 800 state transition model */ |
| 801 p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau; |
| 802 p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau; |
| 803 /* We apply the current probability with exponent beta to work around |
| 804 the fact that the probability estimates aren't independent. */ |
| 805 p0 *= (float)pow(1-frame_probs[1], beta); |
| 806 p1 *= (float)pow(frame_probs[1], beta); |
| 807 /* Normalise the probabilities to get the Markov probability of active audio. */ |
| 808 tonal->vad_prob = p1/(p0+p1); |
| 809 info->vad_prob = tonal->vad_prob; |
| 810 /* Consider that silence has a 50-50 probability of being speech or music
. */ |
| 811 frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f
; |
| 812 |
| 545 /* One transition every 3 minutes of active audio */ | 813 /* One transition every 3 minutes of active audio */ |
| 546 tau = .00005f*frame_probs[1]; | 814 tau = .0001f; |
| 547 /* Adapt beta based on how "unexpected" the new prob is */ | 815 /* Adapt beta based on how "unexpected" the new prob is */ |
| 548 p = MAX16(.05f,MIN16(.95f,frame_probs[0])); | 816 p = MAX16(.05f,MIN16(.95f,frame_probs[0])); |
| 549 q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); | 817 q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); |
| 550 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); | 818 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); |
| 551 /* p0 and p1 are the probabilities of speech and music at this frame | 819 /* p0 and p1 are the probabilities of speech and music at this frame |
| 552 using only information from previous frame and applying the | 820 using only information from previous frame and applying the |
| 553 state transition model */ | 821 state transition model */ |
| 554 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; | 822 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; |
| 555 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; | 823 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; |
| 556 /* We apply the current probability with exponent beta to work around | 824 /* We apply the current probability with exponent beta to work around |
| 557 the fact that the probability estimates aren't independent. */ | 825 the fact that the probability estimates aren't independent. */ |
| 558 p0 *= (float)pow(1-frame_probs[0], beta); | 826 p0 *= (float)pow(1-frame_probs[0], beta); |
| 559 p1 *= (float)pow(frame_probs[0], beta); | 827 p1 *= (float)pow(frame_probs[0], beta); |
| 560 /* Normalise the probabilities to get the Markov probability of music. */ | 828 /* Normalise the probabilities to get the Markov probability of music. */ |
| 561 tonal->music_prob = p1/(p0+p1); | 829 tonal->music_prob = p1/(p0+p1); |
| 562 info->music_prob = tonal->music_prob; | 830 info->music_prob = tonal->music_prob; |
| 563 | 831 |
| 832 /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_pr
ob, tonal->vad_prob);*/ |
| 564 /* This chunk of code deals with delayed decision. */ | 833 /* This chunk of code deals with delayed decision. */ |
| 565 psum=1e-20f; | 834 psum=1e-20f; |
| 566 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ | 835 /* Instantaneous probability of speech and music, with beta pre-applied.
*/ |
| 567 speech0 = (float)pow(1-frame_probs[0], beta); | 836 speech0 = (float)pow(1-frame_probs[0], beta); |
| 568 music0 = (float)pow(frame_probs[0], beta); | 837 music0 = (float)pow(frame_probs[0], beta); |
| 569 if (tonal->count==1) | 838 if (tonal->count==1) |
| 570 { | 839 { |
| 571 tonal->pspeech[0]=.5; | 840 if (tonal->application == OPUS_APPLICATION_VOIP) |
| 572 tonal->pmusic [0]=.5; | 841 tonal->pmusic[0] = .1f; |
| 842 else |
| 843 tonal->pmusic[0] = .625f; |
| 844 tonal->pspeech[0] = 1-tonal->pmusic[0]; |
| 573 } | 845 } |
| 574 /* Updated probability of having only speech (s0) or only music (m0), | 846 /* Updated probability of having only speech (s0) or only music (m0), |
| 575 before considering the new observation. */ | 847 before considering the new observation. */ |
| 576 s0 = tonal->pspeech[0] + tonal->pspeech[1]; | 848 s0 = tonal->pspeech[0] + tonal->pspeech[1]; |
| 577 m0 = tonal->pmusic [0] + tonal->pmusic [1]; | 849 m0 = tonal->pmusic [0] + tonal->pmusic [1]; |
| 578 /* Updates s0 and m0 with instantaneous probability. */ | 850 /* Updates s0 and m0 with instantaneous probability. */ |
| 579 tonal->pspeech[0] = s0*(1-tau)*speech0; | 851 tonal->pspeech[0] = s0*(1-tau)*speech0; |
| 580 tonal->pmusic [0] = m0*(1-tau)*music0; | 852 tonal->pmusic [0] = m0*(1-tau)*music0; |
| 581 /* Propagate the transition probabilities */ | 853 /* Propagate the transition probabilities */ |
| 582 for (i=1;i<DETECT_SIZE-1;i++) | 854 for (i=1;i<DETECT_SIZE-1;i++) |
| (...skipping 29 matching lines...) Expand all Loading... |
| 612 tonal->music_confidence_count = IMIN(tonal->music_confidence_count,
500); | 884 tonal->music_confidence_count = IMIN(tonal->music_confidence_count,
500); |
| 613 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->m
usic_confidence); | 885 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->m
usic_confidence); |
| 614 } | 886 } |
| 615 if (tonal->music_prob<.1) | 887 if (tonal->music_prob<.1) |
| 616 { | 888 { |
| 617 float adapt; | 889 float adapt; |
| 618 adapt = 1.f/(++tonal->speech_confidence_count); | 890 adapt = 1.f/(++tonal->speech_confidence_count); |
| 619 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun
t, 500); | 891 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_coun
t, 500); |
| 620 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s
peech_confidence); | 892 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->s
peech_confidence); |
| 621 } | 893 } |
| 622 } else { | |
| 623 if (tonal->music_confidence_count==0) | |
| 624 tonal->music_confidence = .9f; | |
| 625 if (tonal->speech_confidence_count==0) | |
| 626 tonal->speech_confidence = .1f; | |
| 627 } | 894 } |
| 628 } | 895 } |
| 629 if (tonal->last_music != (tonal->music_prob>.5f)) | |
| 630 tonal->last_transition=0; | |
| 631 tonal->last_music = tonal->music_prob>.5f; | 896 tonal->last_music = tonal->music_prob>.5f; |
| 632 #else | 897 #ifdef MLP_TRAINING |
| 633 info->music_prob = 0; | 898 for (i=0;i<25;i++) |
| 899 printf("%f ", features[i]); |
| 900 printf("\n"); |
| 634 #endif | 901 #endif |
| 635 /*for (i=0;i<25;i++) | |
| 636 printf("%f ", features[i]); | |
| 637 printf("\n");*/ | |
| 638 | 902 |
| 639 info->bandwidth = bandwidth; | 903 info->bandwidth = bandwidth; |
| 904 tonal->prev_bandwidth = bandwidth; |
| 640 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ | 905 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ |
| 641 info->noisiness = frame_noisiness; | 906 info->noisiness = frame_noisiness; |
| 642 info->valid = 1; | 907 info->valid = 1; |
| 643 RESTORE_STACK; | 908 RESTORE_STACK; |
| 644 } | 909 } |
| 645 | 910 |
| 646 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co
nst void *analysis_pcm, | 911 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co
nst void *analysis_pcm, |
| 647 int analysis_frame_size, int frame_size, int c1, int c2, int C,
opus_int32 Fs, | 912 int analysis_frame_size, int frame_size, int c1, int c2, int C,
opus_int32 Fs, |
| 648 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_inf
o) | 913 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_inf
o) |
| 649 { | 914 { |
| 650 int offset; | 915 int offset; |
| 651 int pcm_len; | 916 int pcm_len; |
| 652 | 917 |
| 918 analysis_frame_size -= analysis_frame_size&1; |
| 653 if (analysis_pcm != NULL) | 919 if (analysis_pcm != NULL) |
| 654 { | 920 { |
| 655 /* Avoid overflow/wrap-around of the analysis buffer */ | 921 /* Avoid overflow/wrap-around of the analysis buffer */ |
| 656 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); | 922 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size); |
| 657 | 923 |
| 658 pcm_len = analysis_frame_size - analysis->analysis_offset; | 924 pcm_len = analysis_frame_size - analysis->analysis_offset; |
| 659 offset = analysis->analysis_offset; | 925 offset = analysis->analysis_offset; |
| 660 do { | 926 while (pcm_len>0) { |
| 661 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len)
, offset, c1, c2, C, lsb_depth, downmix); | 927 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_le
n), offset, c1, c2, C, lsb_depth, downmix); |
| 662 offset += 480; | 928 offset += Fs/50; |
| 663 pcm_len -= 480; | 929 pcm_len -= Fs/50; |
| 664 } while (pcm_len>0); | 930 } |
| 665 analysis->analysis_offset = analysis_frame_size; | 931 analysis->analysis_offset = analysis_frame_size; |
| 666 | 932 |
| 667 analysis->analysis_offset -= frame_size; | 933 analysis->analysis_offset -= frame_size; |
| 668 } | 934 } |
| 669 | 935 |
| 670 analysis_info->valid = 0; | 936 analysis_info->valid = 0; |
| 671 tonality_get_info(analysis, analysis_info, frame_size); | 937 tonality_get_info(analysis, analysis_info, frame_size); |
| 672 } | 938 } |
| 939 |
| 940 #endif /* DISABLE_FLOAT_API */ |
| OLD | NEW |