chrome/browser/speech/endpointer/energy_endpointer.cc - Issue 3117026: Add an endpointer for detecting end of speech.

Side by Side Diff: chrome/browser/speech/endpointer/energy_endpointer.cc

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)

Patch Set: Merged with latest. Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « chrome/browser/speech/endpointer/energy_endpointer.h ('k') | chrome/browser/speech/endpointer/energy_endpointer_params.h » ('j') | chrome/browser/speech/speech_recognizer.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4 //

	5 // To know more about the algorithm used and the original code which this is

	6 // based on, see

	7 // https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef

	8

	9 #include "chrome/browser/speech/endpointer/energy_endpointer.h"

	10

	11 #include "base/logging.h"

	12 #include <math.h>

	13 #include <vector>

	14

	15 namespace {

	16

	17 // Returns the RMS (quadratic mean) of the input signal.

	18 float RMS(const int16* samples, int num_samples) {

	19 int64 ssq_int64 = 0;

	20 int64 sum_int64 = 0;

	21 for (int i = 0; i < num_samples; ++i) {

	22 sum_int64 += samples[i];

	23 ssq_int64 += samples[i] * samples[i];

	24 }

	25 // now convert to floats.

	26 double sum = static_cast<double>(sum_int64);

	27 sum /= num_samples;

	28 double ssq = static_cast<double>(ssq_int64);

	29 return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));

	30 }

	31

	32 int64 Secs2Usecs(float seconds) {

	33 return static_cast<int64>(0.5 + (1.0e6 * seconds));

	34 }

	35

	36 } // namespace

	37

	38 namespace speech_input {

	39

	40 // Stores threshold-crossing histories for making decisions about the speech

	41 // state.

	42 class EnergyEndpointer::HistoryRing {

	43 public:

	44 HistoryRing() {}

	45

	46 // Resets the ring to \|size\| elements each with state \|initial_state\|

	47 void SetRing(int size, bool initial_state);

	48

	49 // Inserts a new entry into the ring and drops the oldest entry.

	50 void Insert(int64 time_us, bool decision);

	51

	52 // Returns the time in microseconds of the most recently added entry.

	53 int64 EndTime() const;

	54

	55 // Returns the sum of all intervals during which 'decision' is true within

	56 // the time in seconds specified by 'duration'. The returned interval is

	57 // in seconds.

	58 float RingSum(float duration_sec);

	59

	60 private:

	61 struct DecisionPoint {

	62 int64 time_us;

	63 bool decision;

	64 };

	65

	66 std::vector<DecisionPoint> decision_points_;

	67 int insertion_index_; // Index at which the next item gets added/inserted.

	68

	69 DISALLOW_COPY_AND_ASSIGN(HistoryRing);

	70 };

	71

	72 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {

	73 insertion_index_ = 0;

	74 decision_points_.clear();

	75 DecisionPoint init = { -1, initial_state };

	76 decision_points_.resize(size, init);

	77 }

	78

	79 void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) {

	80 decision_points_[insertion_index_].time_us = time_us;

	81 decision_points_[insertion_index_].decision = decision;

	82 insertion_index_ = (insertion_index_ + 1) % decision_points_.size();

	83 }

	84

	85 int64 EnergyEndpointer::HistoryRing::EndTime() const {

	86 int ind = insertion_index_ - 1;

	87 if (ind < 0)

	88 ind = decision_points_.size() - 1;

	89 return decision_points_[ind].time_us;

	90 }

	91

	92 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {

	93 if (!decision_points_.size())

	94 return 0.0;

	95

	96 int64 sum_us = 0;

	97 int ind = insertion_index_ - 1;

	98 if (ind < 0)

	99 ind = decision_points_.size() - 1;

	100 int64 end_us = decision_points_[ind].time_us;

	101 bool is_on = decision_points_[ind].decision;

	102 int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec));

	103 if (start_us < 0)

	104 start_us = 0;

	105 size_t n_summed = 1; // n points ==> (n-1) intervals

	106 while ((decision_points_[ind].time_us > start_us) &&

	107 (n_summed < decision_points_.size())) {

	108 --ind;

	109 if (ind < 0)

	110 ind = decision_points_.size() - 1;

	111 if (is_on)

	112 sum_us += end_us - decision_points_[ind].time_us;

	113 is_on = decision_points_[ind].decision;

	114 end_us = decision_points_[ind].time_us;

	115 n_summed++;

	116 }

	117

	118 return 1.0e-6f * sum_us; // Returns total time that was super threshold.

	119 }

	120

	121 EnergyEndpointer::EnergyEndpointer()

	122 : endpointer_time_us_(0),

	123 max_window_dur_(4.0),

	124 history_(new HistoryRing()) {

	125 }

	126

	127 EnergyEndpointer::~EnergyEndpointer() {

	128 }

	129

	130 int EnergyEndpointer::TimeToFrame(float time) const {

	131 return static_cast<int32>(0.5 + (time / params_.frame_period()));

	132 }

	133

	134 void EnergyEndpointer::Restart(bool reset_threshold) {

	135 status_ = EP_PRE_SPEECH;

	136 user_input_start_time_us_ = 0;

	137

	138 if (reset_threshold) {

	139 decision_threshold_ = params_.decision_threshold();

	140 rms_adapt_ = decision_threshold_;

	141 noise_level_ = params_.decision_threshold() / 2.0f;

	142 frame_counter_ = 0; // Used for rapid initial update of levels.

	143 }

	144

	145 // Set up the memories to hold the history windows.

	146 history_->SetRing(TimeToFrame(max_window_dur_), false);

	147

	148 // Flag that indicates that current input should be used for

	149 // estimating the environment. The user has not yet started input

	150 // by e.g. pressed the push-to-talk button. By default, this is

	151 // false for backward compatibility.

	152 estimating_environment_ = false;

	153 }

	154

	155 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {

	156 params_ = params;

	157

	158 // Find the longest history interval to be used, and make the ring

	159 // large enough to accommodate that number of frames. NOTE: This

	160 // depends upon ep_frame_period being set correctly in the factory

	161 // that did this instantiation.

	162 max_window_dur_ = params_.onset_window();

	163 if (params_.speech_on_window() > max_window_dur_)

	164 max_window_dur_ = params_.speech_on_window();

	165 if (params_.offset_window() > max_window_dur_)

	166 max_window_dur_ = params_.offset_window();

	167 Restart(true);

	168

	169 offset_confirm_dur_sec_ = params_.offset_window() -

	170 params_.offset_confirm_dur();

	171 if (offset_confirm_dur_sec_ < 0.0)

	172 offset_confirm_dur_sec_ = 0.0;

	173

	174 user_input_start_time_us_ = 0;

	175

	176 // Flag that indicates that current input should be used for

	177 // estimating the environment. The user has not yet started input

	178 // by e.g. pressed the push-to-talk button. By default, this is

	179 // false for backward compatibility.

	180 estimating_environment_ = false;

	181 // The initial value of the noise and speech levels is inconsequential.

	182 // The level of the first frame will overwrite these values.

	183 noise_level_ = params_.decision_threshold() / 2.0f;

	184 fast_update_frames_ =

	185 static_cast<int64>(params_.fast_update_dur() / params_.frame_period());

	186

	187 frame_counter_ = 0; // Used for rapid initial update of levels.

	188

	189 sample_rate_ = params_.sample_rate();

	190 start_lag_ = static_cast<int>(sample_rate_ /

	191 params_.max_fundamental_frequency());

	192 end_lag_ = static_cast<int>(sample_rate_ /

	193 params_.min_fundamental_frequency());

	194 }

	195

	196 void EnergyEndpointer::StartSession() {

	197 Restart(true);

	198 }

	199

	200 void EnergyEndpointer::EndSession() {

	201 status_ = EP_POST_SPEECH;

	202 }

	203

	204 void EnergyEndpointer::SetEnvironmentEstimationMode() {

	205 Restart(true);

	206 estimating_environment_ = true;

	207 }

	208

	209 void EnergyEndpointer::SetUserInputMode() {

	210 estimating_environment_ = false;

	211 user_input_start_time_us_ = endpointer_time_us_;

	212 }

	213

	214 void EnergyEndpointer::ProcessAudioFrame(int64 time_us,

	215 const int16* samples,

	216 int num_samples) {

	217 endpointer_time_us_ = time_us;

	218 float rms = RMS(samples, num_samples);

	219

	220 // Check that this is user input audio vs. pre-input adaptation audio.

	221 // Input audio starts when the user indicates start of input, by e.g.

	222 // pressing push-to-talk. Audio recieved prior to that is used to update

	223 // noise and speech level estimates.

	224 if (!estimating_environment_) {

	225 bool decision = false;

	226 if ((endpointer_time_us_ - user_input_start_time_us_) <

	227 Secs2Usecs(params_.contamination_rejection_period())) {

	228 decision = false;

	229 DLOG(INFO) << "decision: forced to false, time: " << endpointer_time_us_;

	230 } else {

	231 decision = (rms > decision_threshold_);

	232 }

	233 DLOG(INFO) << "endpointer_time: " << endpointer_time_us_

	234 << " user_input_start_time: " << user_input_start_time_us_

	235 << " FA reject period "

	236 << Secs2Usecs(params_.contamination_rejection_period())

	237 << " decision: " << (decision ? "SPEECH +++" : "SIL ------");

	238

	239 history_->Insert(endpointer_time_us_, decision);

	240

	241 switch (status_) {

	242 case EP_PRE_SPEECH:

	243 if (history_->RingSum(params_.onset_window()) >

	244 params_.onset_detect_dur()) {

	245 status_ = EP_POSSIBLE_ONSET;

	246 }

	247 break;

	248

	249 case EP_POSSIBLE_ONSET: {

	250 float tsum = history_->RingSum(params_.onset_window());

	251 if (tsum > params_.onset_confirm_dur()) {

	252 status_ = EP_SPEECH_PRESENT;

	253 } else { // If signal is not maintained, drop back to pre-speech.

	254 if (tsum <= params_.onset_detect_dur())

	255 status_ = EP_PRE_SPEECH;

	256 }

	257 break;

	258 }

	259

	260 case EP_SPEECH_PRESENT: {

	261 // To induce hysteresis in the state residency, we allow a

	262 // smaller residency time in the on_ring, than was required to

	263 // enter the SPEECH_PERSENT state.

	264 float on_time = history_->RingSum(params_.speech_on_window());

	265 if (on_time < params_.on_maintain_dur())

	266 status_ = EP_POSSIBLE_OFFSET;

	267 break;

	268 }

	269

	270 case EP_POSSIBLE_OFFSET:

	271 if (history_->RingSum(params_.offset_window()) <=

	272 offset_confirm_dur_sec_) {

	273 // Note that this offset time may be beyond the end

	274 // of the input buffer in a real-time system. It will be up

	275 // to the RecognizerSession to decide what to do.

	276 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.

	277 } else { // If speech picks up again we allow return to SPEECH_PRESENT.

	278 if (history_->RingSum(params_.speech_on_window()) >=

	279 params_.on_maintain_dur())

	280 status_ = EP_SPEECH_PRESENT;

	281 }

	282 break;

	283

	284 default:

	285 LOG(WARNING) << "Invalid case in switch: " << status_;

	286 break;

	287 }

	288

	289 // If this is a quiet, non-speech region, slowly adapt the detection

	290 // threshold to be about 6dB above the average RMS.

	291 if ((!decision) && (status_ == EP_PRE_SPEECH)) {

	292 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);

	293 rms_adapt_ = decision_threshold_;

	294 } else {

	295 // If this is in a speech region, adapt the decision threshold to

	296 // be about 10dB below the average RMS. If the noise level is high,

	297 // the threshold is pushed up.

	298 // Adaptation up to a higher level is 5 times faster than decay to

	299 // a lower level.

	300 if ((status_ == EP_SPEECH_PRESENT) && decision) {

	301 if (rms_adapt_ > rms) {

	302 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);

	303 } else {

	304 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);

	305 }

	306 float target_threshold = 0.3f * rms_adapt_ + noise_level_;

	307 decision_threshold_ = (.90f * decision_threshold_) +

	308 (0.10f * target_threshold);

	309 }

	310 }

	311

	312 // Set a floor

	313 if (decision_threshold_ <params_.min_decision_threshold())

	314 decision_threshold_ = params_.min_decision_threshold();

	315 }

	316

	317 // Update speech and noise levels.

	318 UpdateLevels(rms);

	319 ++frame_counter_;

	320 }

	321

	322 void EnergyEndpointer::UpdateLevels(float rms) {

	323 // Update quickly initially. We assume this is noise and that

	324 // speech is 6dB above the noise.

	325 if (frame_counter_ < fast_update_frames_) {

	326 // Alpha increases from 0 to (k-1)/k where k is the number of time

	327 // steps in the initial adaptation period.

	328 float alpha = static_cast<float>(frame_counter_) /

	329 static_cast<float>(fast_update_frames_);

	330 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);

	331 DLOG(INFO) << "FAST UPDATE, frame_counter_ " << frame_counter_

	332 << "fast_update_frames_ " << fast_update_frames_;

	333 } else {

	334 // Update Noise level. The noise level adapts quickly downward, but

	335 // slowly upward. The noise_level_ parameter is not currently used

	336 // for threshold adaptation. It is used for UI feedback.

	337 if (noise_level_ < rms)

	338 noise_level_ = (0.999f * noise_level_) + (0.001f * rms);

	339 else

	340 noise_level_ = (0.95f * noise_level_) + (0.05f * rms);

	341 }

	342 if (estimating_environment_ \|\| (frame_counter_ < fast_update_frames_)) {

	343 decision_threshold_ = noise_level_ * 2; // 6dB above noise level.

	344 // Set a floor

	345 if (decision_threshold_ < params_.min_decision_threshold())

	346 decision_threshold_ = params_.min_decision_threshold();

	347 }

	348 }

	349

	350 EpStatus EnergyEndpointer::Status(int64* status_time) const {

	351 *status_time = history_->EndTime();

	352 return status_;

	353 }

	354

	355 } // namespace speech_input

OLD	NEW