OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 // |
| 5 // To know more about the algorithm used and the original code which this is |
| 6 // based on, see |
| 7 // https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef |
| 8 |
| 9 #include "chrome/browser/speech/endpointer/energy_endpointer.h" |
| 10 |
| 11 #include "base/logging.h" |
| 12 #include <math.h> |
| 13 #include <vector> |
| 14 |
| 15 namespace { |
| 16 |
// Returns the RMS of |samples| after the mean (DC offset) has been removed,
// i.e. sqrt(mean(x^2) - mean(x)^2), which is the population standard
// deviation of the signal. Returns 0 when |num_samples| is not positive.
float RMS(const int16* samples, int num_samples) {
  // Without this guard the divisions below evaluate 0/0 and the resulting NaN
  // is handed back to the caller.
  if (num_samples <= 0)
    return 0.0f;

  int64 ssq_int64 = 0;
  int64 sum_int64 = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64 sample = samples[i];
    sum_int64 += sample;
    ssq_int64 += sample * sample;
  }

  // Switch to floating point only once the exact integer sums are known.
  const double mean = static_cast<double>(sum_int64) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64) / num_samples;

  // Rounding can leave a tiny negative difference; clamp it so that sqrt()
  // never receives a negative argument.
  double variance = mean_square - (mean * mean);
  if (variance < 0.0)
    variance = 0.0;
  return static_cast<float>(sqrt(variance));
}
| 31 |
// Converts a duration in seconds to microseconds; the value is rounded by
// adding one half and truncating.
int64 Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64>(microseconds + 0.5);
}
| 35 |
| 36 } // namespace |
| 37 |
| 38 namespace speech_input { |
| 39 |
| 40 // Stores threshold-crossing histories for making decisions about the speech |
| 41 // state. |
| 42 class EnergyEndpointer::HistoryRing { |
| 43 public: |
| 44 HistoryRing() {} |
| 45 |
| 46 // Resets the ring to |size| elements each with state |initial_state| |
| 47 void SetRing(int size, bool initial_state); |
| 48 |
| 49 // Inserts a new entry into the ring and drops the oldest entry. |
| 50 void Insert(int64 time_us, bool decision); |
| 51 |
| 52 // Returns the time in microseconds of the most recently added entry. |
| 53 int64 EndTime() const; |
| 54 |
| 55 // Returns the sum of all intervals during which 'decision' is true within |
| 56 // the time in seconds specified by 'duration'. The returned interval is |
| 57 // in seconds. |
| 58 float RingSum(float duration_sec); |
| 59 |
| 60 private: |
| 61 struct DecisionPoint { |
| 62 int64 time_us; |
| 63 bool decision; |
| 64 }; |
| 65 |
| 66 std::vector<DecisionPoint> decision_points_; |
| 67 int insertion_index_; // Index at which the next item gets added/inserted. |
| 68 |
| 69 DISALLOW_COPY_AND_ASSIGN(HistoryRing); |
| 70 }; |
| 71 |
| 72 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { |
| 73 insertion_index_ = 0; |
| 74 decision_points_.clear(); |
| 75 DecisionPoint init = { -1, initial_state }; |
| 76 decision_points_.resize(size, init); |
| 77 } |
| 78 |
| 79 void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) { |
| 80 decision_points_[insertion_index_].time_us = time_us; |
| 81 decision_points_[insertion_index_].decision = decision; |
| 82 insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); |
| 83 } |
| 84 |
| 85 int64 EnergyEndpointer::HistoryRing::EndTime() const { |
| 86 int ind = insertion_index_ - 1; |
| 87 if (ind < 0) |
| 88 ind = decision_points_.size() - 1; |
| 89 return decision_points_[ind].time_us; |
| 90 } |
| 91 |
| 92 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { |
| 93 if (!decision_points_.size()) |
| 94 return 0.0; |
| 95 |
| 96 int64 sum_us = 0; |
| 97 int ind = insertion_index_ - 1; |
| 98 if (ind < 0) |
| 99 ind = decision_points_.size() - 1; |
| 100 int64 end_us = decision_points_[ind].time_us; |
| 101 bool is_on = decision_points_[ind].decision; |
| 102 int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec)); |
| 103 if (start_us < 0) |
| 104 start_us = 0; |
| 105 size_t n_summed = 1; // n points ==> (n-1) intervals |
| 106 while ((decision_points_[ind].time_us > start_us) && |
| 107 (n_summed < decision_points_.size())) { |
| 108 --ind; |
| 109 if (ind < 0) |
| 110 ind = decision_points_.size() - 1; |
| 111 if (is_on) |
| 112 sum_us += end_us - decision_points_[ind].time_us; |
| 113 is_on = decision_points_[ind].decision; |
| 114 end_us = decision_points_[ind].time_us; |
| 115 n_summed++; |
| 116 } |
| 117 |
| 118 return 1.0e-6f * sum_us; // Returns total time that was super threshold. |
| 119 } |
| 120 |
| 121 EnergyEndpointer::EnergyEndpointer() |
| 122 : endpointer_time_us_(0), |
| 123 max_window_dur_(4.0), |
| 124 history_(new HistoryRing()) { |
| 125 } |
| 126 |
| 127 EnergyEndpointer::~EnergyEndpointer() { |
| 128 } |
| 129 |
| 130 int EnergyEndpointer::TimeToFrame(float time) const { |
| 131 return static_cast<int32>(0.5 + (time / params_.frame_period())); |
| 132 } |
| 133 |
| 134 void EnergyEndpointer::Restart(bool reset_threshold) { |
| 135 status_ = EP_PRE_SPEECH; |
| 136 user_input_start_time_us_ = 0; |
| 137 |
| 138 if (reset_threshold) { |
| 139 decision_threshold_ = params_.decision_threshold(); |
| 140 rms_adapt_ = decision_threshold_; |
| 141 noise_level_ = params_.decision_threshold() / 2.0f; |
| 142 frame_counter_ = 0; // Used for rapid initial update of levels. |
| 143 } |
| 144 |
| 145 // Set up the memories to hold the history windows. |
| 146 history_->SetRing(TimeToFrame(max_window_dur_), false); |
| 147 |
| 148 // Flag that indicates that current input should be used for |
| 149 // estimating the environment. The user has not yet started input |
| 150 // by e.g. pressed the push-to-talk button. By default, this is |
| 151 // false for backward compatibility. |
| 152 estimating_environment_ = false; |
| 153 } |
| 154 |
| 155 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { |
| 156 params_ = params; |
| 157 |
| 158 // Find the longest history interval to be used, and make the ring |
| 159 // large enough to accommodate that number of frames. NOTE: This |
| 160 // depends upon ep_frame_period being set correctly in the factory |
| 161 // that did this instantiation. |
| 162 max_window_dur_ = params_.onset_window(); |
| 163 if (params_.speech_on_window() > max_window_dur_) |
| 164 max_window_dur_ = params_.speech_on_window(); |
| 165 if (params_.offset_window() > max_window_dur_) |
| 166 max_window_dur_ = params_.offset_window(); |
| 167 Restart(true); |
| 168 |
| 169 offset_confirm_dur_sec_ = params_.offset_window() - |
| 170 params_.offset_confirm_dur(); |
| 171 if (offset_confirm_dur_sec_ < 0.0) |
| 172 offset_confirm_dur_sec_ = 0.0; |
| 173 |
| 174 user_input_start_time_us_ = 0; |
| 175 |
| 176 // Flag that indicates that current input should be used for |
| 177 // estimating the environment. The user has not yet started input |
| 178 // by e.g. pressed the push-to-talk button. By default, this is |
| 179 // false for backward compatibility. |
| 180 estimating_environment_ = false; |
| 181 // The initial value of the noise and speech levels is inconsequential. |
| 182 // The level of the first frame will overwrite these values. |
| 183 noise_level_ = params_.decision_threshold() / 2.0f; |
| 184 fast_update_frames_ = |
| 185 static_cast<int64>(params_.fast_update_dur() / params_.frame_period()); |
| 186 |
| 187 frame_counter_ = 0; // Used for rapid initial update of levels. |
| 188 |
| 189 sample_rate_ = params_.sample_rate(); |
| 190 start_lag_ = static_cast<int>(sample_rate_ / |
| 191 params_.max_fundamental_frequency()); |
| 192 end_lag_ = static_cast<int>(sample_rate_ / |
| 193 params_.min_fundamental_frequency()); |
| 194 } |
| 195 |
| 196 void EnergyEndpointer::StartSession() { |
| 197 Restart(true); |
| 198 } |
| 199 |
| 200 void EnergyEndpointer::EndSession() { |
| 201 status_ = EP_POST_SPEECH; |
| 202 } |
| 203 |
| 204 void EnergyEndpointer::SetEnvironmentEstimationMode() { |
| 205 Restart(true); |
| 206 estimating_environment_ = true; |
| 207 } |
| 208 |
| 209 void EnergyEndpointer::SetUserInputMode() { |
| 210 estimating_environment_ = false; |
| 211 user_input_start_time_us_ = endpointer_time_us_; |
| 212 } |
| 213 |
| 214 void EnergyEndpointer::ProcessAudioFrame(int64 time_us, |
| 215 const int16* samples, |
| 216 int num_samples) { |
| 217 endpointer_time_us_ = time_us; |
| 218 float rms = RMS(samples, num_samples); |
| 219 |
| 220 // Check that this is user input audio vs. pre-input adaptation audio. |
| 221 // Input audio starts when the user indicates start of input, by e.g. |
| 222 // pressing push-to-talk. Audio recieved prior to that is used to update |
| 223 // noise and speech level estimates. |
| 224 if (!estimating_environment_) { |
| 225 bool decision = false; |
| 226 if ((endpointer_time_us_ - user_input_start_time_us_) < |
| 227 Secs2Usecs(params_.contamination_rejection_period())) { |
| 228 decision = false; |
| 229 DLOG(INFO) << "decision: forced to false, time: " << endpointer_time_us_; |
| 230 } else { |
| 231 decision = (rms > decision_threshold_); |
| 232 } |
| 233 DLOG(INFO) << "endpointer_time: " << endpointer_time_us_ |
| 234 << " user_input_start_time: " << user_input_start_time_us_ |
| 235 << " FA reject period " |
| 236 << Secs2Usecs(params_.contamination_rejection_period()) |
| 237 << " decision: " << (decision ? "SPEECH +++" : "SIL ------"); |
| 238 |
| 239 history_->Insert(endpointer_time_us_, decision); |
| 240 |
| 241 switch (status_) { |
| 242 case EP_PRE_SPEECH: |
| 243 if (history_->RingSum(params_.onset_window()) > |
| 244 params_.onset_detect_dur()) { |
| 245 status_ = EP_POSSIBLE_ONSET; |
| 246 } |
| 247 break; |
| 248 |
| 249 case EP_POSSIBLE_ONSET: { |
| 250 float tsum = history_->RingSum(params_.onset_window()); |
| 251 if (tsum > params_.onset_confirm_dur()) { |
| 252 status_ = EP_SPEECH_PRESENT; |
| 253 } else { // If signal is not maintained, drop back to pre-speech. |
| 254 if (tsum <= params_.onset_detect_dur()) |
| 255 status_ = EP_PRE_SPEECH; |
| 256 } |
| 257 break; |
| 258 } |
| 259 |
| 260 case EP_SPEECH_PRESENT: { |
| 261 // To induce hysteresis in the state residency, we allow a |
| 262 // smaller residency time in the on_ring, than was required to |
| 263 // enter the SPEECH_PERSENT state. |
| 264 float on_time = history_->RingSum(params_.speech_on_window()); |
| 265 if (on_time < params_.on_maintain_dur()) |
| 266 status_ = EP_POSSIBLE_OFFSET; |
| 267 break; |
| 268 } |
| 269 |
| 270 case EP_POSSIBLE_OFFSET: |
| 271 if (history_->RingSum(params_.offset_window()) <= |
| 272 offset_confirm_dur_sec_) { |
| 273 // Note that this offset time may be beyond the end |
| 274 // of the input buffer in a real-time system. It will be up |
| 275 // to the RecognizerSession to decide what to do. |
| 276 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. |
| 277 } else { // If speech picks up again we allow return to SPEECH_PRESENT. |
| 278 if (history_->RingSum(params_.speech_on_window()) >= |
| 279 params_.on_maintain_dur()) |
| 280 status_ = EP_SPEECH_PRESENT; |
| 281 } |
| 282 break; |
| 283 |
| 284 default: |
| 285 LOG(WARNING) << "Invalid case in switch: " << status_; |
| 286 break; |
| 287 } |
| 288 |
| 289 // If this is a quiet, non-speech region, slowly adapt the detection |
| 290 // threshold to be about 6dB above the average RMS. |
| 291 if ((!decision) && (status_ == EP_PRE_SPEECH)) { |
| 292 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); |
| 293 rms_adapt_ = decision_threshold_; |
| 294 } else { |
| 295 // If this is in a speech region, adapt the decision threshold to |
| 296 // be about 10dB below the average RMS. If the noise level is high, |
| 297 // the threshold is pushed up. |
| 298 // Adaptation up to a higher level is 5 times faster than decay to |
| 299 // a lower level. |
| 300 if ((status_ == EP_SPEECH_PRESENT) && decision) { |
| 301 if (rms_adapt_ > rms) { |
| 302 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); |
| 303 } else { |
| 304 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); |
| 305 } |
| 306 float target_threshold = 0.3f * rms_adapt_ + noise_level_; |
| 307 decision_threshold_ = (.90f * decision_threshold_) + |
| 308 (0.10f * target_threshold); |
| 309 } |
| 310 } |
| 311 |
| 312 // Set a floor |
| 313 if (decision_threshold_ <params_.min_decision_threshold()) |
| 314 decision_threshold_ = params_.min_decision_threshold(); |
| 315 } |
| 316 |
| 317 // Update speech and noise levels. |
| 318 UpdateLevels(rms); |
| 319 ++frame_counter_; |
| 320 } |
| 321 |
| 322 void EnergyEndpointer::UpdateLevels(float rms) { |
| 323 // Update quickly initially. We assume this is noise and that |
| 324 // speech is 6dB above the noise. |
| 325 if (frame_counter_ < fast_update_frames_) { |
| 326 // Alpha increases from 0 to (k-1)/k where k is the number of time |
| 327 // steps in the initial adaptation period. |
| 328 float alpha = static_cast<float>(frame_counter_) / |
| 329 static_cast<float>(fast_update_frames_); |
| 330 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); |
| 331 DLOG(INFO) << "FAST UPDATE, frame_counter_ " << frame_counter_ |
| 332 << "fast_update_frames_ " << fast_update_frames_; |
| 333 } else { |
| 334 // Update Noise level. The noise level adapts quickly downward, but |
| 335 // slowly upward. The noise_level_ parameter is not currently used |
| 336 // for threshold adaptation. It is used for UI feedback. |
| 337 if (noise_level_ < rms) |
| 338 noise_level_ = (0.999f * noise_level_) + (0.001f * rms); |
| 339 else |
| 340 noise_level_ = (0.95f * noise_level_) + (0.05f * rms); |
| 341 } |
| 342 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { |
| 343 decision_threshold_ = noise_level_ * 2; // 6dB above noise level. |
| 344 // Set a floor |
| 345 if (decision_threshold_ < params_.min_decision_threshold()) |
| 346 decision_threshold_ = params_.min_decision_threshold(); |
| 347 } |
| 348 } |
| 349 |
| 350 EpStatus EnergyEndpointer::Status(int64* status_time) const { |
| 351 *status_time = history_->EndTime(); |
| 352 return status_; |
| 353 } |
| 354 |
| 355 } // namespace speech_input |
OLD | NEW |