Chromium Code Reviews

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 9835049: Speech refactoring: Reimplemented speech_recognizer as a FSM. (CL1.5) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Minor style fixes. Created 8 years, 8 months ago
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/browser/speech/speech_recognizer_impl.h" 5 #include "content/browser/speech/speech_recognizer_impl.h"
6 6
7 #include "base/basictypes.h"
7 #include "base/bind.h" 8 #include "base/bind.h"
8 #include "base/time.h" 9 #include "base/time.h"
9 #include "content/browser/browser_main_loop.h" 10 #include "content/browser/browser_main_loop.h"
10 #include "content/browser/speech/audio_buffer.h" 11 #include "content/browser/speech/audio_buffer.h"
11 #include "content/browser/speech/google_one_shot_remote_engine.h" 12 #include "content/browser/speech/google_one_shot_remote_engine.h"
12 #include "content/public/browser/browser_thread.h" 13 #include "content/public/browser/browser_thread.h"
13 #include "content/public/browser/speech_recognition_event_listener.h" 14 #include "content/public/browser/speech_recognition_event_listener.h"
14 #include "content/public/browser/speech_recognizer.h" 15 #include "content/public/browser/speech_recognizer.h"
15 #include "content/public/common/speech_recognition_error.h" 16 #include "content/public/common/speech_recognition_error.h"
16 #include "content/public/common/speech_recognition_result.h" 17 #include "content/public/common/speech_recognition_result.h"
17 #include "net/url_request/url_request_context_getter.h" 18 #include "net/url_request/url_request_context_getter.h"
18 19
20 #define BIND(x) base::Bind(&SpeechRecognizerImpl::x, this)
hans 2012/04/02 16:05:59 Hmm, not super happy about this macro and the use of a transition table here.
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 Reverted to switch-style FSM as agreed.
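For reference, a minimal sketch of what the agreed switch-style dispatch could look like in place of the fsm[state][event] table; the function name ExecuteTransitionAndGetNextState and the exact case grouping are illustrative, not the committed follow-up patch. Only STATE_IDLE and STATE_STARTING are spelled out; the remaining states would follow the same pattern as the table below.

    // Hypothetical sketch: replaces the BIND macro and the transition table
    // with an explicit switch over (state, event) pairs.
    SpeechRecognizerImpl::FSMState
    SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
        const FSMEventArgs& event_args) {
      const FSMEvent event = event_args.event;
      switch (state_) {
        case STATE_IDLE:
          // Only EVENT_START has an effect; every other event is ignored.
          return (event == EVENT_START) ? StartRecording(event_args)
                                        : DoNothing(event_args);
        case STATE_STARTING:
          switch (event) {
            case EVENT_AUDIO_DATA:
              return StartRecognitionEngine(event_args);
            case EVENT_ABORT:
            case EVENT_STOP_CAPTURE:
            case EVENT_ENGINE_ERROR:
            case EVENT_AUDIO_ERROR:
              return Abort(event_args);
            default:
              break;  // EVENT_START and EVENT_ENGINE_RESULT are unfeasible.
          }
          break;
        // ... remaining states elided; they mirror the table built in
        // InitializeFSM() below.
        default:
          break;
      }
      NOTREACHED() << "Unfeasible event " << event << " in state " << state_;
      return state_;
    }

The switch trades the table's compactness for locality: every legal (state, event) transition is readable in one place, with no indirection through base::Callback.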
21
19 using content::BrowserMainLoop; 22 using content::BrowserMainLoop;
20 using content::BrowserThread; 23 using content::BrowserThread;
21 using content::SpeechRecognitionError; 24 using content::SpeechRecognitionError;
22 using content::SpeechRecognitionEventListener; 25 using content::SpeechRecognitionEventListener;
23 using content::SpeechRecognitionResult; 26 using content::SpeechRecognitionResult;
24 using content::SpeechRecognizer; 27 using content::SpeechRecognizer;
25 using media::AudioInputController; 28 using media::AudioInputController;
26 29
27 namespace { 30 namespace {
28 31
(...skipping 12 matching lines...)
41 44
42 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) 45 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
43 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; 46 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;
44 47
45 // Returns true if more than 5% of the samples are at min or max value. 48 // Returns true if more than 5% of the samples are at min or max value.
46 bool DetectClipping(const speech::AudioChunk& chunk) { 49 bool DetectClipping(const speech::AudioChunk& chunk) {
47 const int num_samples = chunk.NumSamples(); 50 const int num_samples = chunk.NumSamples();
48 const int16* samples = chunk.SamplesData16(); 51 const int16* samples = chunk.SamplesData16();
49 const int kThreshold = num_samples / 20; 52 const int kThreshold = num_samples / 20;
50 int clipping_samples = 0; 53 int clipping_samples = 0;
54
51 for (int i = 0; i < num_samples; ++i) { 55 for (int i = 0; i < num_samples; ++i) {
52 if (samples[i] <= -32767 || samples[i] >= 32767) { 56 if (samples[i] <= -32767 || samples[i] >= 32767) {
53 if (++clipping_samples > kThreshold) 57 if (++clipping_samples > kThreshold)
54 return true; 58 return true;
55 } 59 }
56 } 60 }
57 return false; 61 return false;
58 } 62 }
59 63
60 } // namespace 64 } // namespace
61 65
62 SpeechRecognizer* SpeechRecognizer::Create( 66 SpeechRecognizer* SpeechRecognizer::Create(
63 SpeechRecognitionEventListener* listener, 67 SpeechRecognitionEventListener* listener,
64 int caller_id, 68 int caller_id,
65 const std::string& language, 69 const std::string& language,
66 const std::string& grammar, 70 const std::string& grammar,
67 net::URLRequestContextGetter* context_getter, 71 net::URLRequestContextGetter* context_getter,
68 bool filter_profanities, 72 bool filter_profanities,
69 const std::string& hardware_info, 73 const std::string& hardware_info,
70 const std::string& origin_url) { 74 const std::string& origin_url) {
75 speech::GoogleOneShotRemoteEngineConfig google_sr_config;
76 google_sr_config.language = language;
77 google_sr_config.grammar = grammar;
78 google_sr_config.audio_sample_rate =
79 speech::SpeechRecognizerImpl::kAudioSampleRate;
80 google_sr_config.audio_num_bits_per_sample =
81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;
82 google_sr_config.filter_profanities = filter_profanities;
83 google_sr_config.hardware_info = hardware_info;
84 google_sr_config.origin_url = origin_url;
85
86 speech::GoogleOneShotRemoteEngine* google_sr_engine =
87 new speech::GoogleOneShotRemoteEngine(context_getter);
88 google_sr_engine->SetConfig(google_sr_config);
89
71 return new speech::SpeechRecognizerImpl(listener, 90 return new speech::SpeechRecognizerImpl(listener,
72 caller_id, 91 caller_id,
73 language, 92 google_sr_engine);
74 grammar,
75 context_getter,
76 filter_profanities,
77 hardware_info,
78 origin_url);
79 } 93 }
80 94
81 namespace speech { 95 namespace speech {
82 96
83 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; 97 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
84 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; 98 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;
85 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; 99 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
86 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; 100 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
87 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; 101 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
88 102
103 COMPILE_ASSERT((SpeechRecognizerImpl::kNumBitsPerAudioSample & 0x7) == 0,
hans 2012/04/02 16:05:59 I think using the % operator instead of & would make the intent clearer.
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 Done.
104 kNumBitsPerAudioSample_must_be_a_multiple_of_8);
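The agreed fix is a one-character change; a sketch of the revised assert (names unchanged, not the actual next patch set):

    // The modulo form states the divisibility requirement directly; the
    // compiler still folds it to a constant at compile time.
    COMPILE_ASSERT((SpeechRecognizerImpl::kNumBitsPerAudioSample % 8) == 0,
                   kNumBitsPerAudioSample_must_be_a_multiple_of_8);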
105
89 SpeechRecognizerImpl::SpeechRecognizerImpl( 106 SpeechRecognizerImpl::SpeechRecognizerImpl(
90 SpeechRecognitionEventListener* listener, 107 SpeechRecognitionEventListener* listener,
91 int caller_id, 108 int caller_id,
92 const std::string& language, 109 SpeechRecognitionEngine* engine)
93 const std::string& grammar,
94 net::URLRequestContextGetter* context_getter,
95 bool filter_profanities,
96 const std::string& hardware_info,
97 const std::string& origin_url)
98 : listener_(listener), 110 : listener_(listener),
99 testing_audio_manager_(NULL), 111 testing_audio_manager_(NULL),
112 recognition_engine_(engine),
100 endpointer_(kAudioSampleRate), 113 endpointer_(kAudioSampleRate),
101 context_getter_(context_getter),
102 caller_id_(caller_id), 114 caller_id_(caller_id),
103 language_(language), 115 in_event_dispatching_(false),
104 grammar_(grammar), 116 state_(STATE_IDLE) {
105 filter_profanities_(filter_profanities),
106 hardware_info_(hardware_info),
107 origin_url_(origin_url),
108 num_samples_recorded_(0),
109 audio_level_(0.0f) {
110 DCHECK(listener_ != NULL); 117 DCHECK(listener_ != NULL);
118 DCHECK(recognition_engine_ != NULL);
119 InitializeFSM();
111 endpointer_.set_speech_input_complete_silence_length( 120 endpointer_.set_speech_input_complete_silence_length(
112 base::Time::kMicrosecondsPerSecond / 2); 121 base::Time::kMicrosecondsPerSecond / 2);
113 endpointer_.set_long_speech_input_complete_silence_length( 122 endpointer_.set_long_speech_input_complete_silence_length(
114 base::Time::kMicrosecondsPerSecond); 123 base::Time::kMicrosecondsPerSecond);
115 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); 124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
116 endpointer_.StartSession(); 125 endpointer_.StartSession();
126 recognition_engine_->set_delegate(this);
117 } 127 }
118 128
119 SpeechRecognizerImpl::~SpeechRecognizerImpl() { 129 SpeechRecognizerImpl::~SpeechRecognizerImpl() {
120 // Recording should have stopped earlier due to the endpointer or
121 // |StopRecording| being called.
122 DCHECK(!audio_controller_.get());
123 DCHECK(!recognition_engine_.get() ||
124 !recognition_engine_->IsRecognitionPending());
125 endpointer_.EndSession(); 130 endpointer_.EndSession();
126 } 131 }
127 132
133 // ------- Methods that trigger Finite State Machine (FSM) events ------------
134
135 // NOTE: all the external events and request should be enqueued (PostTask), even
hans 2012/04/02 16:05:59 s/request/requests/ ?
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 Done.
136 // if they come from the same (IO) thread, in order to preserve the relationship
137 // of causality between events and avoid interleaved event processing due to
138 // synchronous callbacks.
139
128 void SpeechRecognizerImpl::StartRecognition() { 140 void SpeechRecognizerImpl::StartRecognition() {
141 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
142 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
143 this, EVENT_START, FSMEventArgs()));
144 }
145
146 void SpeechRecognizerImpl::AbortRecognition() {
147 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
148 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
149 this, EVENT_ABORT, FSMEventArgs()));
150 }
151
152 void SpeechRecognizerImpl::StopAudioCapture() {
153 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
154 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
155 this, EVENT_STOP_CAPTURE,
156 FSMEventArgs()));
157 }
158
159 bool SpeechRecognizerImpl::IsActive() const {
160 // Checking the FSM state from another thread (thus, while the FSM is
161 // potentially concurrently evolving) is meaningless.
162 // If you're doing it, probably you have some design issues.
hans 2012/04/02 16:05:59 i'm not sure this comment adds much.. i think the first sentence says it all.
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 Agree, removed the last line.
129 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 163 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
130 DCHECK(!audio_controller_.get()); 164 return state_ != STATE_IDLE;
131 DCHECK(!recognition_engine_.get() || 165 }
132 !recognition_engine_->IsRecognitionPending()); 166
133 167 bool SpeechRecognizerImpl::IsCapturingAudio() const {
134 // The endpointer needs to estimate the environment/background noise before 168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive().
135 // starting to treat the audio as user input. In |HandleOnData| we wait until 169 const bool is_capturing_audio = state_ >= STATE_STARTING &&
136 // such time has passed before switching to user input mode. 170 state_ <= STATE_RECOGNIZING;
137 endpointer_.SetEnvironmentEstimationMode(); 171 DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
138 172 (!is_capturing_audio && audio_controller_.get() == NULL));
173 return is_capturing_audio;
174 }
175
176 // Invoked in the audio thread.
177 void SpeechRecognizerImpl::OnError(AudioInputController* controller,
178 int error_code) {
179 FSMEventArgs args;
180 args.audio_error_code = error_code;
181 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
182 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
183 this, EVENT_AUDIO_ERROR, args));
184 }
185
186 void SpeechRecognizerImpl::OnData(AudioInputController* controller,
187 const uint8* data, uint32 size) {
 188 if (size == 0) // This can happen when audio capture stops; it is normal.
189 return;
190
191 FSMEventArgs args;
192 args.audio_data = new AudioChunk(data, static_cast<size_t>(size),
193 kNumBitsPerAudioSample / 8);
194 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
195 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
196 this, EVENT_AUDIO_DATA, args));
197 }
198
199 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
200
201 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
202 const content::SpeechRecognitionResult& result) {
203 FSMEventArgs args;
204 args.engine_result = result;
205 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
206 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
207 this, EVENT_ENGINE_RESULT, args));
208 }
209
210 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
211 const content::SpeechRecognitionError& error) {
212 FSMEventArgs args;
213 args.engine_error = error;
214 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
215 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
216 this, EVENT_ENGINE_ERROR, args));
217 }
218
219 // ----------------------- Core FSM implementation ---------------------------
220 // TODO(primiano) After the changes in the media package (r129173), this class
221 // slightly violates the SpeechRecognitionEventListener interface contract. In
 222 // particular, it is no longer true that this class can be freed after the
223 // OnRecognitionEnd event, since the audio_controller_.Close() asynchronous
 224 // call can still be in progress after the end event. Currently, it does not
225 // represent a problem for the browser itself, since since refcounting protects
hans 2012/04/02 16:05:59 s/since since/since/
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 Done.
226 // us against such race conditions. However, we should fix this in the next CLs.
227 // For instance, tests are currently working just because the
 228 // TestAudioInputController does not close asynchronously as the real controller
 229 // does, but they will become flaky once TestAudioInputController is fixed.
230
231 void SpeechRecognizerImpl::InitializeFSM() {
232 fsm[STATE_IDLE][EVENT_ABORT] = BIND(DoNothing);
233 fsm[STATE_IDLE][EVENT_START] = BIND(StartRecording);
234 fsm[STATE_IDLE][EVENT_STOP_CAPTURE] = BIND(DoNothing);
235 fsm[STATE_IDLE][EVENT_AUDIO_DATA] = BIND(DoNothing);
236 fsm[STATE_IDLE][EVENT_ENGINE_RESULT] = BIND(DoNothing);
237 fsm[STATE_IDLE][EVENT_ENGINE_ERROR] = BIND(DoNothing);
238 fsm[STATE_IDLE][EVENT_AUDIO_ERROR] = BIND(DoNothing);
239
240 fsm[STATE_STARTING][EVENT_ABORT] = BIND(Abort);
241 fsm[STATE_STARTING][EVENT_START] = kUnfeasibleTransition;
242 fsm[STATE_STARTING][EVENT_STOP_CAPTURE] = BIND(Abort);
243 fsm[STATE_STARTING][EVENT_AUDIO_DATA] = BIND(StartRecognitionEngine);
244 fsm[STATE_STARTING][EVENT_ENGINE_RESULT] = kUnfeasibleTransition;
245 fsm[STATE_STARTING][EVENT_ENGINE_ERROR] = BIND(Abort);
246 fsm[STATE_STARTING][EVENT_AUDIO_ERROR] = BIND(Abort);
247
248 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ABORT] = BIND(Abort);
249 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_START] = kUnfeasibleTransition;
250 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_STOP_CAPTURE] =
251 BIND(StopCaptureAndWaitResult);
252 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_AUDIO_DATA] =
253 BIND(WaitEnvironmentEstimationCompletion);
254 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ENGINE_RESULT] =
255 BIND(ProcessIntermediateResult);
256 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ENGINE_ERROR] = BIND(Abort);
257 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_AUDIO_ERROR] = BIND(Abort);
258
259 fsm[STATE_WAITING_FOR_SPEECH][EVENT_ABORT] = BIND(Abort);
260 fsm[STATE_WAITING_FOR_SPEECH][EVENT_START] = kUnfeasibleTransition;
261 fsm[STATE_WAITING_FOR_SPEECH][EVENT_STOP_CAPTURE] =
262 BIND(StopCaptureAndWaitResult);
263 fsm[STATE_WAITING_FOR_SPEECH][EVENT_AUDIO_DATA] =
264 BIND(DetectUserSpeechOrTimeout);
265 fsm[STATE_WAITING_FOR_SPEECH][EVENT_ENGINE_RESULT] =
266 BIND(ProcessIntermediateResult);
267 fsm[STATE_WAITING_FOR_SPEECH][EVENT_ENGINE_ERROR] = BIND(Abort);
268 fsm[STATE_WAITING_FOR_SPEECH][EVENT_AUDIO_ERROR] = BIND(Abort);
269
270 fsm[STATE_RECOGNIZING][EVENT_ABORT] = BIND(Abort);
271 fsm[STATE_RECOGNIZING][EVENT_START] = kUnfeasibleTransition;
272 fsm[STATE_RECOGNIZING][EVENT_STOP_CAPTURE] = BIND(StopCaptureAndWaitResult);
273 fsm[STATE_RECOGNIZING][EVENT_AUDIO_DATA] = BIND(DetectEndOfSpeech);
274 fsm[STATE_RECOGNIZING][EVENT_ENGINE_RESULT] = BIND(ProcessIntermediateResult);
275 fsm[STATE_RECOGNIZING][EVENT_ENGINE_ERROR] = BIND(Abort);
276 fsm[STATE_RECOGNIZING][EVENT_AUDIO_ERROR] = BIND(Abort);
277
278 fsm[STATE_WAITING_FINAL_RESULT][EVENT_ABORT] = BIND(Abort);
279 fsm[STATE_WAITING_FINAL_RESULT][EVENT_START] = kUnfeasibleTransition;
280 fsm[STATE_WAITING_FINAL_RESULT][EVENT_STOP_CAPTURE] = BIND(DoNothing);
281 fsm[STATE_WAITING_FINAL_RESULT][EVENT_AUDIO_DATA] = BIND(DoNothing);
282 fsm[STATE_WAITING_FINAL_RESULT][EVENT_ENGINE_RESULT] =
283 BIND(ProcessFinalResult);
284 fsm[STATE_WAITING_FINAL_RESULT][EVENT_ENGINE_ERROR] = BIND(Abort);
285 fsm[STATE_WAITING_FINAL_RESULT][EVENT_AUDIO_ERROR] = BIND(Abort);
286 }
287
288 void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) {
289 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
290 DCHECK_LE(event, EVENT_MAX);
291 DCHECK_LE(state_, STATE_MAX);
292
293 // Event dispatching must be sequential, otherwise it will break all the rules
294 // and the assumptions of the finite state automata model.
295 DCHECK(!in_event_dispatching_);
296 in_event_dispatching_ = true;
297
298 // Guard against the delegate freeing us until we finish processing the event.
299 scoped_refptr<SpeechRecognizerImpl> me(this);
300
301 args.event = event;
302
303 if (event == EVENT_AUDIO_DATA) {
304 DCHECK(args.audio_data.get() != NULL);
305 ProcessAudioPipeline(*(args.audio_data.get()));
hans 2012/04/02 16:05:59 I think you can just do ProcessAudioPipeline(*args.audio_data) here.
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 Done.
306 }
307
308 // The audio pipeline must be processed before the event dispatch, otherwise
309 // it would take actions according to the future state instead of the current.
310 const TransitionFunction& transition = fsm[state_][event];
hans 2012/04/02 16:05:59 i liked the switch-case better
Satish 2012/04/02 21:57:09 I was thinking earlier that a table would be appealing.
 311 if (transition.Equals(kUnfeasibleTransition)) {
312 NOTREACHED() << "Unfeasible event " << event << " in state " << state_;
313 } else {
314 state_ = transition.Run(args);
315 }
316
317 in_event_dispatching_ = false;
318 }
319
320 // ----------- Contract for all the FSM evolution functions below -------------
321 // - Are guaranteed to be executed in the IO thread;
322 // - Are guaranteed to be not reentrant (themselves and each other);
323 // - event_args members are guaranteed to be stable during the call;
324 // - The class won't be freed in the meanwhile due to callbacks;
325 // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.
326
327 // TODO(primiano) the audio pipeline is currently serial. However, the
328 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
 329 // We should profile the execution to see whether it would be worth it.
330 void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
331 const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
332 state_ <= STATE_RECOGNIZING;
333 const bool route_to_sr_engine = route_to_endpointer;
334 const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
335 state_ <= STATE_RECOGNIZING;
336 const bool clip_detected = DetectClipping(raw_audio);
337 float rms = 0;
338
339 num_samples_recorded_ += raw_audio.NumSamples();
340
341 if (route_to_endpointer) {
342 endpointer_.ProcessAudio(raw_audio, &rms);
343 }
344 if (route_to_vumeter) {
345 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|.
346 UpdateSignalAndNoiseLevels(rms, clip_detected);
347 }
348 if (route_to_sr_engine) {
349 DCHECK(recognition_engine_.get());
350 recognition_engine_->TakeAudioChunk(raw_audio);
351 }
352 }
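To make the TODO above concrete, a purely illustrative sketch of the task-posting shape such a parallelization could take. ProcessAudioPipelineParallel and RunEndpointerAndMeter are hypothetical names, base::WorkerPool comes from base/threading/worker_pool.h, and a real change would additionally need to make the endpointer thread-safe and synchronize its results back before DetectUserSpeechOrTimeout/DetectEndOfSpeech read them, none of which this CL does:

    #include "base/threading/worker_pool.h"

    // Illustrative only: feed the recognition engine immediately on the IO
    // thread, and run the clipper->endpointer->vumeter chain concurrently.
    void SpeechRecognizerImpl::ProcessAudioPipelineParallel(
        scoped_refptr<AudioChunk> raw_audio) {
      // The engine upload is latency-sensitive, so it goes first.
      recognition_engine_->TakeAudioChunk(*raw_audio);
      // Hypothetical helper doing the endpointer/vumeter work; |this| and
      // |raw_audio| are refcounted, so the bound task keeps both alive.
      base::WorkerPool::PostTask(
          FROM_HERE,
          base::Bind(&SpeechRecognizerImpl::RunEndpointerAndMeter, this,
                     raw_audio),
          false /* task_is_slow */);
    }

As the TODO says, only a profile can tell whether this wins anything: the audio chunks are small, so task-posting overhead could easily dominate.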
353
354 SpeechRecognizerImpl::FSMState
355 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
356 DCHECK(recognition_engine_.get());
357 DCHECK(!IsCapturingAudio());
139 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? 358 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
140 testing_audio_manager_ : 359 testing_audio_manager_ :
141 BrowserMainLoop::GetAudioManager(); 360 BrowserMainLoop::GetAudioManager();
142 const int samples_per_packet = kAudioSampleRate * 361 DCHECK(audio_manager != NULL);
143 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; 362
363 VLOG(1) << "SpeechRecognizerImpl starting audio capture.";
364 num_samples_recorded_ = 0;
365 audio_level_ = 0;
366 listener_->OnRecognitionStart(caller_id_);
367
368 if (!audio_manager->HasAudioInputDevices()) {
369 return AbortWithError(SpeechRecognitionError(
370 content::SPEECH_RECOGNITION_ERROR_AUDIO,
371 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
372 }
373
374 if (audio_manager->IsRecordingInProcess()) {
375 return AbortWithError(SpeechRecognitionError(
376 content::SPEECH_RECOGNITION_ERROR_AUDIO,
377 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE));
378 }
379
380 const int samples_per_packet = (kAudioSampleRate *
381 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;
144 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, 382 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
145 kAudioSampleRate, kNumBitsPerAudioSample, 383 kAudioSampleRate, kNumBitsPerAudioSample,
146 samples_per_packet); 384 samples_per_packet);
147 audio_controller_ = AudioInputController::Create(audio_manager, this, params); 385 audio_controller_ = AudioInputController::Create(audio_manager, this, params);
148 DCHECK(audio_controller_.get()); 386
149 VLOG(1) << "SpeechRecognizer starting record."; 387 if (audio_controller_.get() == NULL) {
150 num_samples_recorded_ = 0; 388 return AbortWithError(
389 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
390 }
391
392 // The endpointer needs to estimate the environment/background noise before
393 // starting to treat the audio as user input. We wait in the state
394 // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching
395 // to user input mode.
396 endpointer_.SetEnvironmentEstimationMode();
151 audio_controller_->Record(); 397 audio_controller_->Record();
152 } 398 return STATE_STARTING;
153 399 }
154 void SpeechRecognizerImpl::AbortRecognition() { 400
155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 401 SpeechRecognizerImpl::FSMState
156 DCHECK(audio_controller_.get() || recognition_engine_.get()); 402 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
157 403 // This is the first audio packet captured, so the recognition engine is
158 // Stop recording if required. 404 // started and the delegate notifies about the event.
hans 2012/04/02 16:05:59 s/notifies/notified/
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 Done.
159 if (audio_controller_.get()) { 405 DCHECK(recognition_engine_.get());
406 recognition_engine_->StartRecognition();
407 listener_->OnAudioStart(caller_id_);
408
409 // This is a little hack, since TakeAudioChunk() is already called by
410 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
411 // the first audio chunk captured after opening the audio device.
412 recognition_engine_->TakeAudioChunk(*(event_args.audio_data));
413 return STATE_ESTIMATING_ENVIRONMENT;
414 }
415
416 SpeechRecognizerImpl::FSMState
417 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
418 DCHECK(endpointer_.IsEstimatingEnvironment());
419 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
420 endpointer_.SetUserInputMode();
421 listener_->OnEnvironmentEstimationComplete(caller_id_);
422 return STATE_WAITING_FOR_SPEECH;
423 } else {
424 return STATE_ESTIMATING_ENVIRONMENT;
425 }
426 }
427
428 SpeechRecognizerImpl::FSMState
429 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
430 if (endpointer_.DidStartReceivingSpeech()) {
431 listener_->OnSoundStart(caller_id_);
432 return STATE_RECOGNIZING;
433 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
434 return AbortWithError(
435 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));
436 } else {
437 return STATE_WAITING_FOR_SPEECH;
438 }
439 }
440
441 SpeechRecognizerImpl::FSMState
442 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
443 if (endpointer_.speech_input_complete()) {
444 return StopCaptureAndWaitResult(event_args);
445 } else {
446 return STATE_RECOGNIZING;
447 }
448 }
449
450 SpeechRecognizerImpl::FSMState
451 SpeechRecognizerImpl::StopCaptureAndWaitResult(const FSMEventArgs&) {
452 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);
453
454 VLOG(1) << "Concluding recognition";
455 CloseAudioControllerAsynchronously();
456 recognition_engine_->AudioChunksEnded();
457
458 if (state_ > STATE_WAITING_FOR_SPEECH)
459 listener_->OnSoundEnd(caller_id_);
460
461 listener_->OnAudioEnd(caller_id_);
462 return STATE_WAITING_FINAL_RESULT;
463 }
464
465 SpeechRecognizerImpl::FSMState
466 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {
 467 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the absence
 468 // of other specific error sources (i.e., when it was an explicit abort request).
469 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers
470 // and currently would cause an exception. JS will probably need it in future.
471 if (event_args.event == EVENT_AUDIO_ERROR) {
472 return AbortWithError(
473 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
474 } else if (event_args.event == EVENT_ENGINE_ERROR) {
475 return AbortWithError(event_args.engine_error);
476 }
477 return AbortWithError(NULL);
478 }
479
480 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
481 const SpeechRecognitionError& error) {
482 return AbortWithError(&error);
483 }
484
485 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
486 const SpeechRecognitionError* error) {
487 if (IsCapturingAudio())
160 CloseAudioControllerAsynchronously(); 488 CloseAudioControllerAsynchronously();
161 } 489
162 490 VLOG(1) << "SpeechRecognizerImpl canceling recognition. ";
163 VLOG(1) << "SpeechRecognizer canceling recognition."; 491
164 recognition_engine_.reset(); 492 // The recognition engine is initialized only after STATE_STARTING.
165 } 493 if (state_ > STATE_STARTING) {
166 494 DCHECK(recognition_engine_.get());
167 void SpeechRecognizerImpl::StopAudioCapture() { 495 recognition_engine_->EndRecognition();
168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 496 }
169 497
170 // If audio recording has already stopped and we are in recognition phase, 498 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
171 // silently ignore any more calls to stop recording. 499 listener_->OnSoundEnd(caller_id_);
172 if (!audio_controller_.get()) 500
173 return; 501 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
174 502 listener_->OnAudioEnd(caller_id_);
175 CloseAudioControllerAsynchronously(); 503
176 listener_->OnSoundEnd(caller_id_); 504 if (error != NULL)
177 listener_->OnAudioEnd(caller_id_); 505 listener_->OnRecognitionError(caller_id_, *error);
hans 2012/04/02 16:05:59 just a thought (maybe for the future).. i wonder w
Primiano Tucci (use gerrit) 2012/04/03 10:16:39 We should think on the implications that it might
178 506
179 // If we haven't got any audio yet end the recognition sequence here. 507 listener_->OnRecognitionEnd(caller_id_);
180 if (recognition_engine_ == NULL) { 508
181 // Guard against the listener freeing us until we finish our job. 509 return STATE_IDLE;
182 scoped_refptr<SpeechRecognizerImpl> me(this); 510 }
183 listener_->OnRecognitionEnd(caller_id_); 511
184 } else { 512 SpeechRecognizerImpl::FSMState
185 recognition_engine_->AudioChunksEnded(); 513 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {
186 } 514 // This is in preparation for future speech recognition functions.
187 } 515 NOTREACHED();
188 516 return state_;
189 // Invoked in the audio thread. 517 }
190 void SpeechRecognizerImpl::OnError(AudioInputController* controller, 518
191 int error_code) { 519 SpeechRecognizerImpl::FSMState
192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 520 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
193 base::Bind(&SpeechRecognizerImpl::HandleOnError, 521 const SpeechRecognitionResult& result = event_args.engine_result;
194 this, error_code)); 522 VLOG(1) << "Got valid result";
195 } 523 recognition_engine_->EndRecognition();
196
197 void SpeechRecognizerImpl::HandleOnError(int error_code) {
198 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
199
200 // Check if we are still recording before canceling recognition, as
201 // recording might have been stopped after this error was posted to the queue
202 // by |OnError|.
203 if (!audio_controller_.get())
204 return;
205
206 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);
207 }
208
209 void SpeechRecognizerImpl::OnData(AudioInputController* controller,
210 const uint8* data, uint32 size) {
211 if (size == 0) // This could happen when recording stops and is normal.
212 return;
213 scoped_refptr<AudioChunk> raw_audio(
214 new AudioChunk(data,
215 static_cast<size_t>(size),
216 kNumBitsPerAudioSample / 8));
217 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
218 base::Bind(&SpeechRecognizerImpl::HandleOnData,
219 this, raw_audio));
220 }
221
222 void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) {
223 // Check if we are still recording and if not discard this buffer, as
224 // recording might have been stopped after this buffer was posted to the queue
225 // by |OnData|.
226 if (!audio_controller_.get())
227 return;
228
229 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();
230
231 float rms;
232 endpointer_.ProcessAudio(*raw_audio, &rms);
233 bool did_clip = DetectClipping(*raw_audio);
234 num_samples_recorded_ += raw_audio->NumSamples();
235
236 if (recognition_engine_ == NULL) {
237 // This was the first audio packet recorded, so start a request to the
238 // server to send the data and inform the listener.
239 listener_->OnAudioStart(caller_id_);
240 GoogleOneShotRemoteEngineConfig google_sr_config;
241 google_sr_config.language = language_;
242 google_sr_config.grammar = grammar_;
243 google_sr_config.audio_sample_rate = kAudioSampleRate;
244 google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample;
245 google_sr_config.filter_profanities = filter_profanities_;
246 google_sr_config.hardware_info = hardware_info_;
247 google_sr_config.origin_url = origin_url_;
248 GoogleOneShotRemoteEngine* google_sr_engine =
249 new GoogleOneShotRemoteEngine(context_getter_.get());
250 google_sr_engine->SetConfig(google_sr_config);
251 recognition_engine_.reset(google_sr_engine);
252 recognition_engine_->set_delegate(this);
253 recognition_engine_->StartRecognition();
254 }
255
256 recognition_engine_->TakeAudioChunk(*raw_audio);
257
258 if (endpointer_.IsEstimatingEnvironment()) {
259 // Check if we have gathered enough audio for the endpointer to do
260 // environment estimation and should move on to detect speech/end of speech.
261 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
262 kAudioSampleRate) / 1000) {
263 endpointer_.SetUserInputMode();
264 listener_->OnEnvironmentEstimationComplete(caller_id_);
265 }
266 return; // No more processing since we are still estimating environment.
267 }
268
269 // Check if we have waited too long without hearing any speech.
270 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();
271 if (!speech_was_heard_after_packet &&
272 num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) {
273 InformErrorAndAbortRecognition(
274 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
275 return;
276 }
277
278 if (!speech_was_heard_before_packet && speech_was_heard_after_packet)
279 listener_->OnSoundStart(caller_id_);
280
281 // Calculate the input volume to display in the UI, smoothing towards the
282 // new level.
283 float level = (rms - kAudioMeterMinDb) /
284 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
285 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
286 if (level > audio_level_) {
287 audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
288 } else {
289 audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
290 }
291
292 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
293 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
294 noise_level = std::min(std::max(0.0f, noise_level),
295 kAudioMeterRangeMaxUnclipped);
296
297 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,
298 noise_level);
299
300 if (endpointer_.speech_input_complete())
301 StopAudioCapture();
302 }
303
304 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
305
306 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
307 const content::SpeechRecognitionResult& result) {
308 // Guard against the listener freeing us until we finish our job.
309 scoped_refptr<SpeechRecognizerImpl> me(this);
310 listener_->OnRecognitionResult(caller_id_, result); 524 listener_->OnRecognitionResult(caller_id_, result);
311 listener_->OnRecognitionEnd(caller_id_); 525 listener_->OnRecognitionEnd(caller_id_);
312 } 526 return STATE_IDLE;
313 527 }
314 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( 528
315 const content::SpeechRecognitionError& error) { 529 SpeechRecognizerImpl::FSMState
316 InformErrorAndAbortRecognition(error.code); 530 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
317 } 531 return state_; // Just keep the current state.
318
319 void SpeechRecognizerImpl::InformErrorAndAbortRecognition(
320 content::SpeechRecognitionErrorCode error) {
321 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);
322 AbortRecognition();
323
324 // Guard against the listener freeing us until we finish our job.
325 scoped_refptr<SpeechRecognizerImpl> me(this);
326 listener_->OnRecognitionError(caller_id_, error);
327 } 532 }
328 533
329 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { 534 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
330 VLOG(1) << "SpeechRecognizer stopping record."; 535 DCHECK(IsCapturingAudio());
536 VLOG(1) << "SpeechRecognizerImpl stopping audio capture.";
331 // Issues a Close on the audio controller, passing an empty callback. The only 537 // Issues a Close on the audio controller, passing an empty callback. The only
332 // purpose of such callback is to keep the audio controller refcounted until 538 // purpose of such callback is to keep the audio controller refcounted until
333 // Close has completed (in the audio thread) and automatically destroy it 539 // Close has completed (in the audio thread) and automatically destroy it
334 // afterwards (upon return from OnAudioClosed). 540 // afterwards (upon return from OnAudioClosed).
335 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, 541 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
336 this, audio_controller_)); 542 this, audio_controller_));
337 audio_controller_ = NULL; // The controller is still refcounted by Bind. 543 audio_controller_ = NULL; // The controller is still refcounted by Bind.
338 } 544 }
339 545
340 bool SpeechRecognizerImpl::IsActive() const { 546 int SpeechRecognizerImpl::GetElapsedTimeMs() const {
341 return (recognition_engine_.get() != NULL); 547 return (num_samples_recorded_ * 1000) / kAudioSampleRate;
342 } 548 }
343 549
344 bool SpeechRecognizerImpl::IsCapturingAudio() const { 550 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
345 return (audio_controller_.get() != NULL); 551 bool clip_detected) {
552 // Calculate the input volume to display in the UI, smoothing towards the
553 // new level.
554 // TODO(primiano) Do we really need all this floating point arith here?
555 // Perhaps it might be quite expensive on mobile.
556 float level = (rms - kAudioMeterMinDb) /
557 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
558 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
559 if (level > audio_level_) {
560 audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
561 } else {
562 audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
563 }
564
565 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
566 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
567 noise_level = std::min(std::max(0.0f, noise_level),
568 kAudioMeterRangeMaxUnclipped);
569
570 listener_->OnAudioLevelsChange(
571 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level);
346 } 572 }
347 573
348 const SpeechRecognitionEngine& 574 const SpeechRecognitionEngine&
349 SpeechRecognizerImpl::recognition_engine() const { 575 SpeechRecognizerImpl::recognition_engine() const {
350 return *(recognition_engine_.get()); 576 return *(recognition_engine_.get());
351 } 577 }
352 578
353 void SpeechRecognizerImpl::SetAudioManagerForTesting( 579 void SpeechRecognizerImpl::SetAudioManagerForTesting(
354 AudioManager* audio_manager) { 580 AudioManager* audio_manager) {
355 testing_audio_manager_ = audio_manager; 581 testing_audio_manager_ = audio_manager;
356 } 582 }
357 583
584 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs()
585 : audio_error_code(0),
586 audio_data(NULL),
587 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {
588 }
589
590 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
591 }
358 592
359 } // namespace speech 593 } // namespace speech