Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(65)

Unified Diff: chrome/browser/speech/endpointer/energy_endpointer.cc

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)
Patch Set: Merged with latest. Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/speech/endpointer/energy_endpointer.cc
diff --git a/chrome/browser/speech/endpointer/energy_endpointer.cc b/chrome/browser/speech/endpointer/energy_endpointer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..44ca4ddb22190f94f66d651bd77741be0807d92b
--- /dev/null
+++ b/chrome/browser/speech/endpointer/energy_endpointer.cc
@@ -0,0 +1,355 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To know more about the algorithm used and the original code which this is
+// based of, see
+// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
+
+#include "chrome/browser/speech/endpointer/energy_endpointer.h"
+
+#include "base/logging.h"
+#include <math.h>
+#include <vector>
+
+namespace {
+
+// Returns the RMS (quadratic mean) of the input signal.
+float RMS(const int16* samples, int num_samples) {
+ int64 ssq_int64 = 0;
+ int64 sum_int64 = 0;
+ for (int i = 0; i < num_samples; ++i) {
+ sum_int64 += samples[i];
+ ssq_int64 += samples[i] * samples[i];
+ }
+ // now convert to floats.
+ double sum = static_cast<double>(sum_int64);
+ sum /= num_samples;
+ double ssq = static_cast<double>(ssq_int64);
+ return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
+}
+
+int64 Secs2Usecs(float seconds) {
+ return static_cast<int64>(0.5 + (1.0e6 * seconds));
+}
+
+} // namespace
+
+namespace speech_input {
+
+// Stores threshold-crossing histories for making decisions about the speech
+// state.
+class EnergyEndpointer::HistoryRing {
+ public:
+ HistoryRing() {}
+
+ // Resets the ring to |size| elements each with state |initial_state|
+ void SetRing(int size, bool initial_state);
+
+ // Inserts a new entry into the ring and drops the oldest entry.
+ void Insert(int64 time_us, bool decision);
+
+ // Returns the time in microseconds of the most recently added entry.
+ int64 EndTime() const;
+
+ // Returns the sum of all intervals during which 'decision' is true within
+ // the time in seconds specified by 'duration'. The returned interval is
+ // in seconds.
+ float RingSum(float duration_sec);
+
+ private:
+ struct DecisionPoint {
+ int64 time_us;
+ bool decision;
+ };
+
+ std::vector<DecisionPoint> decision_points_;
+ int insertion_index_; // Index at which the next item gets added/inserted.
+
+ DISALLOW_COPY_AND_ASSIGN(HistoryRing);
+};
+
+void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
+ insertion_index_ = 0;
+ decision_points_.clear();
+ DecisionPoint init = { -1, initial_state };
+ decision_points_.resize(size, init);
+}
+
+void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) {
+ decision_points_[insertion_index_].time_us = time_us;
+ decision_points_[insertion_index_].decision = decision;
+ insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
+}
+
+int64 EnergyEndpointer::HistoryRing::EndTime() const {
+ int ind = insertion_index_ - 1;
+ if (ind < 0)
+ ind = decision_points_.size() - 1;
+ return decision_points_[ind].time_us;
+}
+
+float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
+ if (!decision_points_.size())
+ return 0.0;
+
+ int64 sum_us = 0;
+ int ind = insertion_index_ - 1;
+ if (ind < 0)
+ ind = decision_points_.size() - 1;
+ int64 end_us = decision_points_[ind].time_us;
+ bool is_on = decision_points_[ind].decision;
+ int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec));
+ if (start_us < 0)
+ start_us = 0;
+ size_t n_summed = 1; // n points ==> (n-1) intervals
+ while ((decision_points_[ind].time_us > start_us) &&
+ (n_summed < decision_points_.size())) {
+ --ind;
+ if (ind < 0)
+ ind = decision_points_.size() - 1;
+ if (is_on)
+ sum_us += end_us - decision_points_[ind].time_us;
+ is_on = decision_points_[ind].decision;
+ end_us = decision_points_[ind].time_us;
+ n_summed++;
+ }
+
+ return 1.0e-6f * sum_us; // Returns total time that was super threshold.
+}
+
+EnergyEndpointer::EnergyEndpointer()
+ : endpointer_time_us_(0),
+ max_window_dur_(4.0),
+ history_(new HistoryRing()) {
+}
+
+EnergyEndpointer::~EnergyEndpointer() {
+}
+
+int EnergyEndpointer::TimeToFrame(float time) const {
+ return static_cast<int32>(0.5 + (time / params_.frame_period()));
+}
+
+void EnergyEndpointer::Restart(bool reset_threshold) {
+ status_ = EP_PRE_SPEECH;
+ user_input_start_time_us_ = 0;
+
+ if (reset_threshold) {
+ decision_threshold_ = params_.decision_threshold();
+ rms_adapt_ = decision_threshold_;
+ noise_level_ = params_.decision_threshold() / 2.0f;
+ frame_counter_ = 0; // Used for rapid initial update of levels.
+ }
+
+ // Set up the memories to hold the history windows.
+ history_->SetRing(TimeToFrame(max_window_dur_), false);
+
+ // Flag that indicates that current input should be used for
+ // estimating the environment. The user has not yet started input
+ // by e.g. pressed the push-to-talk button. By default, this is
+ // false for backward compatibility.
+ estimating_environment_ = false;
+}
+
+void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
+ params_ = params;
+
+ // Find the longest history interval to be used, and make the ring
+ // large enough to accommodate that number of frames. NOTE: This
+ // depends upon ep_frame_period being set correctly in the factory
+ // that did this instantiation.
+ max_window_dur_ = params_.onset_window();
+ if (params_.speech_on_window() > max_window_dur_)
+ max_window_dur_ = params_.speech_on_window();
+ if (params_.offset_window() > max_window_dur_)
+ max_window_dur_ = params_.offset_window();
+ Restart(true);
+
+ offset_confirm_dur_sec_ = params_.offset_window() -
+ params_.offset_confirm_dur();
+ if (offset_confirm_dur_sec_ < 0.0)
+ offset_confirm_dur_sec_ = 0.0;
+
+ user_input_start_time_us_ = 0;
+
+ // Flag that indicates that current input should be used for
+ // estimating the environment. The user has not yet started input
+ // by e.g. pressed the push-to-talk button. By default, this is
+ // false for backward compatibility.
+ estimating_environment_ = false;
+ // The initial value of the noise and speech levels is inconsequential.
+ // The level of the first frame will overwrite these values.
+ noise_level_ = params_.decision_threshold() / 2.0f;
+ fast_update_frames_ =
+ static_cast<int64>(params_.fast_update_dur() / params_.frame_period());
+
+ frame_counter_ = 0; // Used for rapid initial update of levels.
+
+ sample_rate_ = params_.sample_rate();
+ start_lag_ = static_cast<int>(sample_rate_ /
+ params_.max_fundamental_frequency());
+ end_lag_ = static_cast<int>(sample_rate_ /
+ params_.min_fundamental_frequency());
+}
+
+void EnergyEndpointer::StartSession() {
+ Restart(true);
+}
+
+void EnergyEndpointer::EndSession() {
+ status_ = EP_POST_SPEECH;
+}
+
+void EnergyEndpointer::SetEnvironmentEstimationMode() {
+ Restart(true);
+ estimating_environment_ = true;
+}
+
+void EnergyEndpointer::SetUserInputMode() {
+ estimating_environment_ = false;
+ user_input_start_time_us_ = endpointer_time_us_;
+}
+
+void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
+ const int16* samples,
+ int num_samples) {
+ endpointer_time_us_ = time_us;
+ float rms = RMS(samples, num_samples);
+
+ // Check that this is user input audio vs. pre-input adaptation audio.
+ // Input audio starts when the user indicates start of input, by e.g.
+ // pressing push-to-talk. Audio recieved prior to that is used to update
+ // noise and speech level estimates.
+ if (!estimating_environment_) {
+ bool decision = false;
+ if ((endpointer_time_us_ - user_input_start_time_us_) <
+ Secs2Usecs(params_.contamination_rejection_period())) {
+ decision = false;
+ DLOG(INFO) << "decision: forced to false, time: " << endpointer_time_us_;
+ } else {
+ decision = (rms > decision_threshold_);
+ }
+ DLOG(INFO) << "endpointer_time: " << endpointer_time_us_
+ << " user_input_start_time: " << user_input_start_time_us_
+ << " FA reject period "
+ << Secs2Usecs(params_.contamination_rejection_period())
+ << " decision: " << (decision ? "SPEECH +++" : "SIL ------");
+
+ history_->Insert(endpointer_time_us_, decision);
+
+ switch (status_) {
+ case EP_PRE_SPEECH:
+ if (history_->RingSum(params_.onset_window()) >
+ params_.onset_detect_dur()) {
+ status_ = EP_POSSIBLE_ONSET;
+ }
+ break;
+
+ case EP_POSSIBLE_ONSET: {
+ float tsum = history_->RingSum(params_.onset_window());
+ if (tsum > params_.onset_confirm_dur()) {
+ status_ = EP_SPEECH_PRESENT;
+ } else { // If signal is not maintained, drop back to pre-speech.
+ if (tsum <= params_.onset_detect_dur())
+ status_ = EP_PRE_SPEECH;
+ }
+ break;
+ }
+
+ case EP_SPEECH_PRESENT: {
+ // To induce hysteresis in the state residency, we allow a
+ // smaller residency time in the on_ring, than was required to
+ // enter the SPEECH_PERSENT state.
+ float on_time = history_->RingSum(params_.speech_on_window());
+ if (on_time < params_.on_maintain_dur())
+ status_ = EP_POSSIBLE_OFFSET;
+ break;
+ }
+
+ case EP_POSSIBLE_OFFSET:
+ if (history_->RingSum(params_.offset_window()) <=
+ offset_confirm_dur_sec_) {
+ // Note that this offset time may be beyond the end
+ // of the input buffer in a real-time system. It will be up
+ // to the RecognizerSession to decide what to do.
+ status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
+ } else { // If speech picks up again we allow return to SPEECH_PRESENT.
+ if (history_->RingSum(params_.speech_on_window()) >=
+ params_.on_maintain_dur())
+ status_ = EP_SPEECH_PRESENT;
+ }
+ break;
+
+ default:
+ LOG(WARNING) << "Invalid case in switch: " << status_;
+ break;
+ }
+
+ // If this is a quiet, non-speech region, slowly adapt the detection
+ // threshold to be about 6dB above the average RMS.
+ if ((!decision) && (status_ == EP_PRE_SPEECH)) {
+ decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
+ rms_adapt_ = decision_threshold_;
+ } else {
+ // If this is in a speech region, adapt the decision threshold to
+ // be about 10dB below the average RMS. If the noise level is high,
+ // the threshold is pushed up.
+ // Adaptation up to a higher level is 5 times faster than decay to
+ // a lower level.
+ if ((status_ == EP_SPEECH_PRESENT) && decision) {
+ if (rms_adapt_ > rms) {
+ rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
+ } else {
+ rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
+ }
+ float target_threshold = 0.3f * rms_adapt_ + noise_level_;
+ decision_threshold_ = (.90f * decision_threshold_) +
+ (0.10f * target_threshold);
+ }
+ }
+
+ // Set a floor
+ if (decision_threshold_ <params_.min_decision_threshold())
+ decision_threshold_ = params_.min_decision_threshold();
+ }
+
+ // Update speech and noise levels.
+ UpdateLevels(rms);
+ ++frame_counter_;
+}
+
+void EnergyEndpointer::UpdateLevels(float rms) {
+ // Update quickly initially. We assume this is noise and that
+ // speech is 6dB above the noise.
+ if (frame_counter_ < fast_update_frames_) {
+ // Alpha increases from 0 to (k-1)/k where k is the number of time
+ // steps in the initial adaptation period.
+ float alpha = static_cast<float>(frame_counter_) /
+ static_cast<float>(fast_update_frames_);
+ noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
+ DLOG(INFO) << "FAST UPDATE, frame_counter_ " << frame_counter_
+ << "fast_update_frames_ " << fast_update_frames_;
+ } else {
+ // Update Noise level. The noise level adapts quickly downward, but
+ // slowly upward. The noise_level_ parameter is not currently used
+ // for threshold adaptation. It is used for UI feedback.
+ if (noise_level_ < rms)
+ noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
+ else
+ noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
+ }
+ if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
+ decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
+ // Set a floor
+ if (decision_threshold_ < params_.min_decision_threshold())
+ decision_threshold_ = params_.min_decision_threshold();
+ }
+}
+
+EpStatus EnergyEndpointer::Status(int64* status_time) const {
+ *status_time = history_->EndTime();
+ return status_;
+}
+
+} // namespace speech

Powered by Google App Engine
This is Rietveld 408576698