Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(52)

Side by Side Diff: chrome/browser/speech/endpointer/energy_endpointer.cc

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)
Patch Set: Merged with latest. Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // To know more about the algorithm used and the original code which this is
6 // based on, see
7 // https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
8
9 #include "chrome/browser/speech/endpointer/energy_endpointer.h"
10
11 #include "base/logging.h"
12 #include <math.h>
13 #include <vector>
14
15 namespace {
16
// Returns the root-mean-square of |samples| measured about their mean, i.e.
// the standard deviation of the frame, so a constant (DC) offset contributes
// nothing to the result.
//
// |samples| points at |num_samples| 16-bit PCM values. Returns 0 when there
// are no samples; the unguarded formula would evaluate 0/0 and hand a NaN to
// the caller.
float RMS(const int16* samples, int num_samples) {
  if (!samples || num_samples <= 0)
    return 0.0f;

  // Accumulate in 64-bit integers so the sums stay exact for any frame length.
  int64 sum_int64 = 0;
  int64 ssq_int64 = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64 sample = samples[i];
    sum_int64 += sample;
    ssq_int64 += sample * sample;
  }

  const double mean = static_cast<double>(sum_int64) / num_samples;
  const double variance =
      (static_cast<double>(ssq_int64) / num_samples) - (mean * mean);

  // Floating-point rounding can push a near-zero variance slightly below
  // zero, and sqrt() of a negative value is NaN.
  if (variance <= 0.0)
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}
31
// Converts a duration in seconds to microseconds. Half a microsecond is added
// before the truncating cast.
int64 Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64>(microseconds + 0.5);
}
35
36 } // namespace
37
38 namespace speech_input {
39
40 // Stores threshold-crossing histories for making decisions about the speech
41 // state.
42 class EnergyEndpointer::HistoryRing {
43 public:
44 HistoryRing() {}
45
46 // Resets the ring to |size| elements each with state |initial_state|
47 void SetRing(int size, bool initial_state);
48
49 // Inserts a new entry into the ring and drops the oldest entry.
50 void Insert(int64 time_us, bool decision);
51
52 // Returns the time in microseconds of the most recently added entry.
53 int64 EndTime() const;
54
55 // Returns the sum of all intervals during which 'decision' is true within
56 // the time in seconds specified by 'duration'. The returned interval is
57 // in seconds.
58 float RingSum(float duration_sec);
59
60 private:
61 struct DecisionPoint {
62 int64 time_us;
63 bool decision;
64 };
65
66 std::vector<DecisionPoint> decision_points_;
67 int insertion_index_; // Index at which the next item gets added/inserted.
68
69 DISALLOW_COPY_AND_ASSIGN(HistoryRing);
70 };
71
72 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
73 insertion_index_ = 0;
74 decision_points_.clear();
75 DecisionPoint init = { -1, initial_state };
76 decision_points_.resize(size, init);
77 }
78
79 void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) {
80 decision_points_[insertion_index_].time_us = time_us;
81 decision_points_[insertion_index_].decision = decision;
82 insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
83 }
84
85 int64 EnergyEndpointer::HistoryRing::EndTime() const {
86 int ind = insertion_index_ - 1;
87 if (ind < 0)
88 ind = decision_points_.size() - 1;
89 return decision_points_[ind].time_us;
90 }
91
92 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
93 if (!decision_points_.size())
94 return 0.0;
95
96 int64 sum_us = 0;
97 int ind = insertion_index_ - 1;
98 if (ind < 0)
99 ind = decision_points_.size() - 1;
100 int64 end_us = decision_points_[ind].time_us;
101 bool is_on = decision_points_[ind].decision;
102 int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec));
103 if (start_us < 0)
104 start_us = 0;
105 size_t n_summed = 1; // n points ==> (n-1) intervals
106 while ((decision_points_[ind].time_us > start_us) &&
107 (n_summed < decision_points_.size())) {
108 --ind;
109 if (ind < 0)
110 ind = decision_points_.size() - 1;
111 if (is_on)
112 sum_us += end_us - decision_points_[ind].time_us;
113 is_on = decision_points_[ind].decision;
114 end_us = decision_points_[ind].time_us;
115 n_summed++;
116 }
117
118 return 1.0e-6f * sum_us; // Returns total time that was super threshold.
119 }
120
121 EnergyEndpointer::EnergyEndpointer()
122 : endpointer_time_us_(0),
123 max_window_dur_(4.0),
124 history_(new HistoryRing()) {
125 }
126
127 EnergyEndpointer::~EnergyEndpointer() {
128 }
129
130 int EnergyEndpointer::TimeToFrame(float time) const {
131 return static_cast<int32>(0.5 + (time / params_.frame_period()));
132 }
133
134 void EnergyEndpointer::Restart(bool reset_threshold) {
135 status_ = EP_PRE_SPEECH;
136 user_input_start_time_us_ = 0;
137
138 if (reset_threshold) {
139 decision_threshold_ = params_.decision_threshold();
140 rms_adapt_ = decision_threshold_;
141 noise_level_ = params_.decision_threshold() / 2.0f;
142 frame_counter_ = 0; // Used for rapid initial update of levels.
143 }
144
145 // Set up the memories to hold the history windows.
146 history_->SetRing(TimeToFrame(max_window_dur_), false);
147
148 // Flag that indicates that current input should be used for
149 // estimating the environment. The user has not yet started input
150 // by e.g. pressed the push-to-talk button. By default, this is
151 // false for backward compatibility.
152 estimating_environment_ = false;
153 }
154
155 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
156 params_ = params;
157
158 // Find the longest history interval to be used, and make the ring
159 // large enough to accommodate that number of frames. NOTE: This
160 // depends upon ep_frame_period being set correctly in the factory
161 // that did this instantiation.
162 max_window_dur_ = params_.onset_window();
163 if (params_.speech_on_window() > max_window_dur_)
164 max_window_dur_ = params_.speech_on_window();
165 if (params_.offset_window() > max_window_dur_)
166 max_window_dur_ = params_.offset_window();
167 Restart(true);
168
169 offset_confirm_dur_sec_ = params_.offset_window() -
170 params_.offset_confirm_dur();
171 if (offset_confirm_dur_sec_ < 0.0)
172 offset_confirm_dur_sec_ = 0.0;
173
174 user_input_start_time_us_ = 0;
175
176 // Flag that indicates that current input should be used for
177 // estimating the environment. The user has not yet started input
178 // by e.g. pressed the push-to-talk button. By default, this is
179 // false for backward compatibility.
180 estimating_environment_ = false;
181 // The initial value of the noise and speech levels is inconsequential.
182 // The level of the first frame will overwrite these values.
183 noise_level_ = params_.decision_threshold() / 2.0f;
184 fast_update_frames_ =
185 static_cast<int64>(params_.fast_update_dur() / params_.frame_period());
186
187 frame_counter_ = 0; // Used for rapid initial update of levels.
188
189 sample_rate_ = params_.sample_rate();
190 start_lag_ = static_cast<int>(sample_rate_ /
191 params_.max_fundamental_frequency());
192 end_lag_ = static_cast<int>(sample_rate_ /
193 params_.min_fundamental_frequency());
194 }
195
196 void EnergyEndpointer::StartSession() {
197 Restart(true);
198 }
199
200 void EnergyEndpointer::EndSession() {
201 status_ = EP_POST_SPEECH;
202 }
203
204 void EnergyEndpointer::SetEnvironmentEstimationMode() {
205 Restart(true);
206 estimating_environment_ = true;
207 }
208
209 void EnergyEndpointer::SetUserInputMode() {
210 estimating_environment_ = false;
211 user_input_start_time_us_ = endpointer_time_us_;
212 }
213
214 void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
215 const int16* samples,
216 int num_samples) {
217 endpointer_time_us_ = time_us;
218 float rms = RMS(samples, num_samples);
219
220 // Check that this is user input audio vs. pre-input adaptation audio.
221 // Input audio starts when the user indicates start of input, by e.g.
222 // pressing push-to-talk. Audio recieved prior to that is used to update
223 // noise and speech level estimates.
224 if (!estimating_environment_) {
225 bool decision = false;
226 if ((endpointer_time_us_ - user_input_start_time_us_) <
227 Secs2Usecs(params_.contamination_rejection_period())) {
228 decision = false;
229 DLOG(INFO) << "decision: forced to false, time: " << endpointer_time_us_;
230 } else {
231 decision = (rms > decision_threshold_);
232 }
233 DLOG(INFO) << "endpointer_time: " << endpointer_time_us_
234 << " user_input_start_time: " << user_input_start_time_us_
235 << " FA reject period "
236 << Secs2Usecs(params_.contamination_rejection_period())
237 << " decision: " << (decision ? "SPEECH +++" : "SIL ------");
238
239 history_->Insert(endpointer_time_us_, decision);
240
241 switch (status_) {
242 case EP_PRE_SPEECH:
243 if (history_->RingSum(params_.onset_window()) >
244 params_.onset_detect_dur()) {
245 status_ = EP_POSSIBLE_ONSET;
246 }
247 break;
248
249 case EP_POSSIBLE_ONSET: {
250 float tsum = history_->RingSum(params_.onset_window());
251 if (tsum > params_.onset_confirm_dur()) {
252 status_ = EP_SPEECH_PRESENT;
253 } else { // If signal is not maintained, drop back to pre-speech.
254 if (tsum <= params_.onset_detect_dur())
255 status_ = EP_PRE_SPEECH;
256 }
257 break;
258 }
259
260 case EP_SPEECH_PRESENT: {
261 // To induce hysteresis in the state residency, we allow a
262 // smaller residency time in the on_ring, than was required to
263 // enter the SPEECH_PERSENT state.
264 float on_time = history_->RingSum(params_.speech_on_window());
265 if (on_time < params_.on_maintain_dur())
266 status_ = EP_POSSIBLE_OFFSET;
267 break;
268 }
269
270 case EP_POSSIBLE_OFFSET:
271 if (history_->RingSum(params_.offset_window()) <=
272 offset_confirm_dur_sec_) {
273 // Note that this offset time may be beyond the end
274 // of the input buffer in a real-time system. It will be up
275 // to the RecognizerSession to decide what to do.
276 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
277 } else { // If speech picks up again we allow return to SPEECH_PRESENT.
278 if (history_->RingSum(params_.speech_on_window()) >=
279 params_.on_maintain_dur())
280 status_ = EP_SPEECH_PRESENT;
281 }
282 break;
283
284 default:
285 LOG(WARNING) << "Invalid case in switch: " << status_;
286 break;
287 }
288
289 // If this is a quiet, non-speech region, slowly adapt the detection
290 // threshold to be about 6dB above the average RMS.
291 if ((!decision) && (status_ == EP_PRE_SPEECH)) {
292 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
293 rms_adapt_ = decision_threshold_;
294 } else {
295 // If this is in a speech region, adapt the decision threshold to
296 // be about 10dB below the average RMS. If the noise level is high,
297 // the threshold is pushed up.
298 // Adaptation up to a higher level is 5 times faster than decay to
299 // a lower level.
300 if ((status_ == EP_SPEECH_PRESENT) && decision) {
301 if (rms_adapt_ > rms) {
302 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
303 } else {
304 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
305 }
306 float target_threshold = 0.3f * rms_adapt_ + noise_level_;
307 decision_threshold_ = (.90f * decision_threshold_) +
308 (0.10f * target_threshold);
309 }
310 }
311
312 // Set a floor
313 if (decision_threshold_ <params_.min_decision_threshold())
314 decision_threshold_ = params_.min_decision_threshold();
315 }
316
317 // Update speech and noise levels.
318 UpdateLevels(rms);
319 ++frame_counter_;
320 }
321
322 void EnergyEndpointer::UpdateLevels(float rms) {
323 // Update quickly initially. We assume this is noise and that
324 // speech is 6dB above the noise.
325 if (frame_counter_ < fast_update_frames_) {
326 // Alpha increases from 0 to (k-1)/k where k is the number of time
327 // steps in the initial adaptation period.
328 float alpha = static_cast<float>(frame_counter_) /
329 static_cast<float>(fast_update_frames_);
330 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
331 DLOG(INFO) << "FAST UPDATE, frame_counter_ " << frame_counter_
332 << "fast_update_frames_ " << fast_update_frames_;
333 } else {
334 // Update Noise level. The noise level adapts quickly downward, but
335 // slowly upward. The noise_level_ parameter is not currently used
336 // for threshold adaptation. It is used for UI feedback.
337 if (noise_level_ < rms)
338 noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
339 else
340 noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
341 }
342 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
343 decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
344 // Set a floor
345 if (decision_threshold_ < params_.min_decision_threshold())
346 decision_threshold_ = params_.min_decision_threshold();
347 }
348 }
349
350 EpStatus EnergyEndpointer::Status(int64* status_time) const {
351 *status_time = history_->EndTime();
352 return status_;
353 }
354
355 } // namespace speech_input
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698