Index: chrome/browser/speech/endpointer/energy_endpointer.cc |
diff --git a/chrome/browser/speech/endpointer/energy_endpointer.cc b/chrome/browser/speech/endpointer/energy_endpointer.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..44ca4ddb22190f94f66d651bd77741be0807d92b |
--- /dev/null |
+++ b/chrome/browser/speech/endpointer/energy_endpointer.cc |
@@ -0,0 +1,355 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+// |
+// To know more about the algorithm used and the original code which this is |
+// based of, see |
+// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef |
+ |
+#include "chrome/browser/speech/endpointer/energy_endpointer.h" |
+ |
+#include "base/logging.h" |
+#include <math.h> |
+#include <vector> |
+ |
+namespace { |
+ |
+// Returns the RMS (quadratic mean) of the input signal. |
+float RMS(const int16* samples, int num_samples) { |
+ int64 ssq_int64 = 0; |
+ int64 sum_int64 = 0; |
+ for (int i = 0; i < num_samples; ++i) { |
+ sum_int64 += samples[i]; |
+ ssq_int64 += samples[i] * samples[i]; |
+ } |
+ // now convert to floats. |
+ double sum = static_cast<double>(sum_int64); |
+ sum /= num_samples; |
+ double ssq = static_cast<double>(ssq_int64); |
+ return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); |
+} |
+ |
+int64 Secs2Usecs(float seconds) { |
+ return static_cast<int64>(0.5 + (1.0e6 * seconds)); |
+} |
+ |
+} // namespace |
+ |
+namespace speech_input { |
+ |
+// Stores threshold-crossing histories for making decisions about the speech |
+// state. |
+class EnergyEndpointer::HistoryRing { |
+ public: |
+ HistoryRing() {} |
+ |
+ // Resets the ring to |size| elements each with state |initial_state| |
+ void SetRing(int size, bool initial_state); |
+ |
+ // Inserts a new entry into the ring and drops the oldest entry. |
+ void Insert(int64 time_us, bool decision); |
+ |
+ // Returns the time in microseconds of the most recently added entry. |
+ int64 EndTime() const; |
+ |
+ // Returns the sum of all intervals during which 'decision' is true within |
+ // the time in seconds specified by 'duration'. The returned interval is |
+ // in seconds. |
+ float RingSum(float duration_sec); |
+ |
+ private: |
+ struct DecisionPoint { |
+ int64 time_us; |
+ bool decision; |
+ }; |
+ |
+ std::vector<DecisionPoint> decision_points_; |
+ int insertion_index_; // Index at which the next item gets added/inserted. |
+ |
+ DISALLOW_COPY_AND_ASSIGN(HistoryRing); |
+}; |
+ |
+void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { |
+ insertion_index_ = 0; |
+ decision_points_.clear(); |
+ DecisionPoint init = { -1, initial_state }; |
+ decision_points_.resize(size, init); |
+} |
+ |
+void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) { |
+ decision_points_[insertion_index_].time_us = time_us; |
+ decision_points_[insertion_index_].decision = decision; |
+ insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); |
+} |
+ |
+int64 EnergyEndpointer::HistoryRing::EndTime() const { |
+ int ind = insertion_index_ - 1; |
+ if (ind < 0) |
+ ind = decision_points_.size() - 1; |
+ return decision_points_[ind].time_us; |
+} |
+ |
+float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { |
+ if (!decision_points_.size()) |
+ return 0.0; |
+ |
+ int64 sum_us = 0; |
+ int ind = insertion_index_ - 1; |
+ if (ind < 0) |
+ ind = decision_points_.size() - 1; |
+ int64 end_us = decision_points_[ind].time_us; |
+ bool is_on = decision_points_[ind].decision; |
+ int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec)); |
+ if (start_us < 0) |
+ start_us = 0; |
+ size_t n_summed = 1; // n points ==> (n-1) intervals |
+ while ((decision_points_[ind].time_us > start_us) && |
+ (n_summed < decision_points_.size())) { |
+ --ind; |
+ if (ind < 0) |
+ ind = decision_points_.size() - 1; |
+ if (is_on) |
+ sum_us += end_us - decision_points_[ind].time_us; |
+ is_on = decision_points_[ind].decision; |
+ end_us = decision_points_[ind].time_us; |
+ n_summed++; |
+ } |
+ |
+ return 1.0e-6f * sum_us; // Returns total time that was super threshold. |
+} |
+ |
+EnergyEndpointer::EnergyEndpointer() |
+ : endpointer_time_us_(0), |
+ max_window_dur_(4.0), |
+ history_(new HistoryRing()) { |
+} |
+ |
+EnergyEndpointer::~EnergyEndpointer() { |
+} |
+ |
+int EnergyEndpointer::TimeToFrame(float time) const { |
+ return static_cast<int32>(0.5 + (time / params_.frame_period())); |
+} |
+ |
+void EnergyEndpointer::Restart(bool reset_threshold) { |
+ status_ = EP_PRE_SPEECH; |
+ user_input_start_time_us_ = 0; |
+ |
+ if (reset_threshold) { |
+ decision_threshold_ = params_.decision_threshold(); |
+ rms_adapt_ = decision_threshold_; |
+ noise_level_ = params_.decision_threshold() / 2.0f; |
+ frame_counter_ = 0; // Used for rapid initial update of levels. |
+ } |
+ |
+ // Set up the memories to hold the history windows. |
+ history_->SetRing(TimeToFrame(max_window_dur_), false); |
+ |
+ // Flag that indicates that current input should be used for |
+ // estimating the environment. The user has not yet started input |
+ // by e.g. pressed the push-to-talk button. By default, this is |
+ // false for backward compatibility. |
+ estimating_environment_ = false; |
+} |
+ |
+void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { |
+ params_ = params; |
+ |
+ // Find the longest history interval to be used, and make the ring |
+ // large enough to accommodate that number of frames. NOTE: This |
+ // depends upon ep_frame_period being set correctly in the factory |
+ // that did this instantiation. |
+ max_window_dur_ = params_.onset_window(); |
+ if (params_.speech_on_window() > max_window_dur_) |
+ max_window_dur_ = params_.speech_on_window(); |
+ if (params_.offset_window() > max_window_dur_) |
+ max_window_dur_ = params_.offset_window(); |
+ Restart(true); |
+ |
+ offset_confirm_dur_sec_ = params_.offset_window() - |
+ params_.offset_confirm_dur(); |
+ if (offset_confirm_dur_sec_ < 0.0) |
+ offset_confirm_dur_sec_ = 0.0; |
+ |
+ user_input_start_time_us_ = 0; |
+ |
+ // Flag that indicates that current input should be used for |
+ // estimating the environment. The user has not yet started input |
+ // by e.g. pressed the push-to-talk button. By default, this is |
+ // false for backward compatibility. |
+ estimating_environment_ = false; |
+ // The initial value of the noise and speech levels is inconsequential. |
+ // The level of the first frame will overwrite these values. |
+ noise_level_ = params_.decision_threshold() / 2.0f; |
+ fast_update_frames_ = |
+ static_cast<int64>(params_.fast_update_dur() / params_.frame_period()); |
+ |
+ frame_counter_ = 0; // Used for rapid initial update of levels. |
+ |
+ sample_rate_ = params_.sample_rate(); |
+ start_lag_ = static_cast<int>(sample_rate_ / |
+ params_.max_fundamental_frequency()); |
+ end_lag_ = static_cast<int>(sample_rate_ / |
+ params_.min_fundamental_frequency()); |
+} |
+ |
+void EnergyEndpointer::StartSession() { |
+ Restart(true); |
+} |
+ |
+void EnergyEndpointer::EndSession() { |
+ status_ = EP_POST_SPEECH; |
+} |
+ |
+void EnergyEndpointer::SetEnvironmentEstimationMode() { |
+ Restart(true); |
+ estimating_environment_ = true; |
+} |
+ |
+void EnergyEndpointer::SetUserInputMode() { |
+ estimating_environment_ = false; |
+ user_input_start_time_us_ = endpointer_time_us_; |
+} |
+ |
+void EnergyEndpointer::ProcessAudioFrame(int64 time_us, |
+ const int16* samples, |
+ int num_samples) { |
+ endpointer_time_us_ = time_us; |
+ float rms = RMS(samples, num_samples); |
+ |
+ // Check that this is user input audio vs. pre-input adaptation audio. |
+ // Input audio starts when the user indicates start of input, by e.g. |
+ // pressing push-to-talk. Audio recieved prior to that is used to update |
+ // noise and speech level estimates. |
+ if (!estimating_environment_) { |
+ bool decision = false; |
+ if ((endpointer_time_us_ - user_input_start_time_us_) < |
+ Secs2Usecs(params_.contamination_rejection_period())) { |
+ decision = false; |
+ DLOG(INFO) << "decision: forced to false, time: " << endpointer_time_us_; |
+ } else { |
+ decision = (rms > decision_threshold_); |
+ } |
+ DLOG(INFO) << "endpointer_time: " << endpointer_time_us_ |
+ << " user_input_start_time: " << user_input_start_time_us_ |
+ << " FA reject period " |
+ << Secs2Usecs(params_.contamination_rejection_period()) |
+ << " decision: " << (decision ? "SPEECH +++" : "SIL ------"); |
+ |
+ history_->Insert(endpointer_time_us_, decision); |
+ |
+ switch (status_) { |
+ case EP_PRE_SPEECH: |
+ if (history_->RingSum(params_.onset_window()) > |
+ params_.onset_detect_dur()) { |
+ status_ = EP_POSSIBLE_ONSET; |
+ } |
+ break; |
+ |
+ case EP_POSSIBLE_ONSET: { |
+ float tsum = history_->RingSum(params_.onset_window()); |
+ if (tsum > params_.onset_confirm_dur()) { |
+ status_ = EP_SPEECH_PRESENT; |
+ } else { // If signal is not maintained, drop back to pre-speech. |
+ if (tsum <= params_.onset_detect_dur()) |
+ status_ = EP_PRE_SPEECH; |
+ } |
+ break; |
+ } |
+ |
+ case EP_SPEECH_PRESENT: { |
+ // To induce hysteresis in the state residency, we allow a |
+ // smaller residency time in the on_ring, than was required to |
+ // enter the SPEECH_PERSENT state. |
+ float on_time = history_->RingSum(params_.speech_on_window()); |
+ if (on_time < params_.on_maintain_dur()) |
+ status_ = EP_POSSIBLE_OFFSET; |
+ break; |
+ } |
+ |
+ case EP_POSSIBLE_OFFSET: |
+ if (history_->RingSum(params_.offset_window()) <= |
+ offset_confirm_dur_sec_) { |
+ // Note that this offset time may be beyond the end |
+ // of the input buffer in a real-time system. It will be up |
+ // to the RecognizerSession to decide what to do. |
+ status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. |
+ } else { // If speech picks up again we allow return to SPEECH_PRESENT. |
+ if (history_->RingSum(params_.speech_on_window()) >= |
+ params_.on_maintain_dur()) |
+ status_ = EP_SPEECH_PRESENT; |
+ } |
+ break; |
+ |
+ default: |
+ LOG(WARNING) << "Invalid case in switch: " << status_; |
+ break; |
+ } |
+ |
+ // If this is a quiet, non-speech region, slowly adapt the detection |
+ // threshold to be about 6dB above the average RMS. |
+ if ((!decision) && (status_ == EP_PRE_SPEECH)) { |
+ decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); |
+ rms_adapt_ = decision_threshold_; |
+ } else { |
+ // If this is in a speech region, adapt the decision threshold to |
+ // be about 10dB below the average RMS. If the noise level is high, |
+ // the threshold is pushed up. |
+ // Adaptation up to a higher level is 5 times faster than decay to |
+ // a lower level. |
+ if ((status_ == EP_SPEECH_PRESENT) && decision) { |
+ if (rms_adapt_ > rms) { |
+ rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); |
+ } else { |
+ rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); |
+ } |
+ float target_threshold = 0.3f * rms_adapt_ + noise_level_; |
+ decision_threshold_ = (.90f * decision_threshold_) + |
+ (0.10f * target_threshold); |
+ } |
+ } |
+ |
+ // Set a floor |
+ if (decision_threshold_ <params_.min_decision_threshold()) |
+ decision_threshold_ = params_.min_decision_threshold(); |
+ } |
+ |
+ // Update speech and noise levels. |
+ UpdateLevels(rms); |
+ ++frame_counter_; |
+} |
+ |
+void EnergyEndpointer::UpdateLevels(float rms) { |
+ // Update quickly initially. We assume this is noise and that |
+ // speech is 6dB above the noise. |
+ if (frame_counter_ < fast_update_frames_) { |
+ // Alpha increases from 0 to (k-1)/k where k is the number of time |
+ // steps in the initial adaptation period. |
+ float alpha = static_cast<float>(frame_counter_) / |
+ static_cast<float>(fast_update_frames_); |
+ noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); |
+ DLOG(INFO) << "FAST UPDATE, frame_counter_ " << frame_counter_ |
+ << "fast_update_frames_ " << fast_update_frames_; |
+ } else { |
+ // Update Noise level. The noise level adapts quickly downward, but |
+ // slowly upward. The noise_level_ parameter is not currently used |
+ // for threshold adaptation. It is used for UI feedback. |
+ if (noise_level_ < rms) |
+ noise_level_ = (0.999f * noise_level_) + (0.001f * rms); |
+ else |
+ noise_level_ = (0.95f * noise_level_) + (0.05f * rms); |
+ } |
+ if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { |
+ decision_threshold_ = noise_level_ * 2; // 6dB above noise level. |
+ // Set a floor |
+ if (decision_threshold_ < params_.min_decision_threshold()) |
+ decision_threshold_ = params_.min_decision_threshold(); |
+ } |
+} |
+ |
+EpStatus EnergyEndpointer::Status(int64* status_time) const { |
+ *status_time = history_->EndTime(); |
+ return status_; |
+} |
+ |
+} // namespace speech |