OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
6 | 6 |
7 #include "base/basictypes.h" | |
7 #include "base/bind.h" | 8 #include "base/bind.h" |
8 #include "base/time.h" | 9 #include "base/time.h" |
9 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
10 #include "content/browser/speech/audio_buffer.h" | 11 #include "content/browser/speech/audio_buffer.h" |
11 #include "content/browser/speech/google_one_shot_remote_engine.h" | 12 #include "content/browser/speech/google_one_shot_remote_engine.h" |
12 #include "content/public/browser/browser_thread.h" | 13 #include "content/public/browser/browser_thread.h" |
13 #include "content/public/browser/speech_recognition_event_listener.h" | 14 #include "content/public/browser/speech_recognition_event_listener.h" |
14 #include "content/public/browser/speech_recognizer.h" | 15 #include "content/public/browser/speech_recognizer.h" |
15 #include "content/public/common/speech_recognition_error.h" | 16 #include "content/public/common/speech_recognition_error.h" |
16 #include "content/public/common/speech_recognition_result.h" | 17 #include "content/public/common/speech_recognition_result.h" |
17 #include "net/url_request/url_request_context_getter.h" | 18 #include "net/url_request/url_request_context_getter.h" |
18 | 19 |
20 #define BIND(x) base::Bind(&SpeechRecognizerImpl::x, this) | |
hans, 2012/04/02 16:05:59: Hmm, not super happy about this macro and the use …
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: Reverted to switch-style FSM as agreed.
| |
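Note: the BIND macro above is just shorthand for the base::Bind boilerplate that fills the transition table in InitializeFSM() further down; an entry such as fsm[STATE_IDLE][EVENT_START] = BIND(StartRecording) expands to:

    fsm[STATE_IDLE][EVENT_START] =
        base::Bind(&SpeechRecognizerImpl::StartRecording, this);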
21 | |
19 using content::BrowserMainLoop; | 22 using content::BrowserMainLoop; |
20 using content::BrowserThread; | 23 using content::BrowserThread; |
21 using content::SpeechRecognitionError; | 24 using content::SpeechRecognitionError; |
22 using content::SpeechRecognitionEventListener; | 25 using content::SpeechRecognitionEventListener; |
23 using content::SpeechRecognitionResult; | 26 using content::SpeechRecognitionResult; |
24 using content::SpeechRecognizer; | 27 using content::SpeechRecognizer; |
25 using media::AudioInputController; | 28 using media::AudioInputController; |
26 | 29 |
27 namespace { | 30 namespace { |
28 | 31 |
(...skipping 12 matching lines...) | |
41 | 44 |
42 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) | 45 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) |
43 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; | 46 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; |
44 | 47 |
45 // Returns true if more than 5% of the samples are at min or max value. | 48 // Returns true if more than 5% of the samples are at min or max value. |
46 bool DetectClipping(const speech::AudioChunk& chunk) { | 49 bool DetectClipping(const speech::AudioChunk& chunk) { |
47 const int num_samples = chunk.NumSamples(); | 50 const int num_samples = chunk.NumSamples(); |
48 const int16* samples = chunk.SamplesData16(); | 51 const int16* samples = chunk.SamplesData16(); |
49 const int kThreshold = num_samples / 20; | 52 const int kThreshold = num_samples / 20; |
50 int clipping_samples = 0; | 53 int clipping_samples = 0; |
54 | |
51 for (int i = 0; i < num_samples; ++i) { | 55 for (int i = 0; i < num_samples; ++i) { |
52 if (samples[i] <= -32767 || samples[i] >= 32767) { | 56 if (samples[i] <= -32767 || samples[i] >= 32767) { |
53 if (++clipping_samples > kThreshold) | 57 if (++clipping_samples > kThreshold) |
54 return true; | 58 return true; |
55 } | 59 } |
56 } | 60 } |
57 return false; | 61 return false; |
58 } | 62 } |
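A note on the constants in DetectClipping(): kThreshold = num_samples / 20 encodes the 5% figure from the comment above, and ±32767 are the extremes of the int16 sample range (the <= / >= comparisons ensure that both -32768 and -32767 count as clipped samples).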
59 | 63 |
60 } // namespace | 64 } // namespace |
61 | 65 |
62 SpeechRecognizer* SpeechRecognizer::Create( | 66 SpeechRecognizer* SpeechRecognizer::Create( |
63 SpeechRecognitionEventListener* listener, | 67 SpeechRecognitionEventListener* listener, |
64 int caller_id, | 68 int caller_id, |
65 const std::string& language, | 69 const std::string& language, |
66 const std::string& grammar, | 70 const std::string& grammar, |
67 net::URLRequestContextGetter* context_getter, | 71 net::URLRequestContextGetter* context_getter, |
68 bool filter_profanities, | 72 bool filter_profanities, |
69 const std::string& hardware_info, | 73 const std::string& hardware_info, |
70 const std::string& origin_url) { | 74 const std::string& origin_url) { |
75 speech::GoogleOneShotRemoteEngineConfig google_sr_config; | |
76 google_sr_config.language = language; | |
77 google_sr_config.grammar = grammar; | |
78 google_sr_config.audio_sample_rate = | |
79 speech::SpeechRecognizerImpl::kAudioSampleRate; | |
80 google_sr_config.audio_num_bits_per_sample = | |
81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; | |
82 google_sr_config.filter_profanities = filter_profanities; | |
83 google_sr_config.hardware_info = hardware_info; | |
84 google_sr_config.origin_url = origin_url; | |
85 | |
86 speech::GoogleOneShotRemoteEngine* google_sr_engine = | |
87 new speech::GoogleOneShotRemoteEngine(context_getter); | |
88 google_sr_engine->SetConfig(google_sr_config); | |
89 | |
71 return new speech::SpeechRecognizerImpl(listener, | 90 return new speech::SpeechRecognizerImpl(listener, |
72 caller_id, | 91 caller_id, |
73 language, | 92 google_sr_engine); |
74 grammar, | |
75 context_getter, | |
76 filter_profanities, | |
77 hardware_info, | |
78 origin_url); | |
79 } | 93 } |
80 | 94 |
81 namespace speech { | 95 namespace speech { |
82 | 96 |
83 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 97 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
84 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; | 98 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; |
85 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 99 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
86 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; | 100 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
87 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 101 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
88 | 102 |
103 COMPILE_ASSERT((SpeechRecognizerImpl::kNumBitsPerAudioSample & 0x7) == 0, | |
hans, 2012/04/02 16:05:59: I think using the % operator instead of & would make …
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: Done.
| |
104 kNumBitsPerAudioSample_must_be_a_multiple_of_8); | |
105 | |
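Per the "Done" above, the follow-up patch set presumably adopted the modulo form hans suggested; the equivalent assert would read (a sketch of the agreed change, not code from this patch set):

    COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
                   kNumBitsPerAudioSample_must_be_a_multiple_of_8);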
89 SpeechRecognizerImpl::SpeechRecognizerImpl( | 106 SpeechRecognizerImpl::SpeechRecognizerImpl( |
90 SpeechRecognitionEventListener* listener, | 107 SpeechRecognitionEventListener* listener, |
91 int caller_id, | 108 int caller_id, |
92 const std::string& language, | 109 SpeechRecognitionEngine* engine) |
93 const std::string& grammar, | |
94 net::URLRequestContextGetter* context_getter, | |
95 bool filter_profanities, | |
96 const std::string& hardware_info, | |
97 const std::string& origin_url) | |
98 : listener_(listener), | 110 : listener_(listener), |
99 testing_audio_manager_(NULL), | 111 testing_audio_manager_(NULL), |
112 recognition_engine_(engine), | |
100 endpointer_(kAudioSampleRate), | 113 endpointer_(kAudioSampleRate), |
101 context_getter_(context_getter), | |
102 caller_id_(caller_id), | 114 caller_id_(caller_id), |
103 language_(language), | 115 in_event_dispatching_(false), |
104 grammar_(grammar), | 116 state_(STATE_IDLE) { |
105 filter_profanities_(filter_profanities), | |
106 hardware_info_(hardware_info), | |
107 origin_url_(origin_url), | |
108 num_samples_recorded_(0), | |
109 audio_level_(0.0f) { | |
110 DCHECK(listener_ != NULL); | 117 DCHECK(listener_ != NULL); |
118 DCHECK(recognition_engine_ != NULL); | |
119 InitializeFSM(); | |
111 endpointer_.set_speech_input_complete_silence_length( | 120 endpointer_.set_speech_input_complete_silence_length( |
112 base::Time::kMicrosecondsPerSecond / 2); | 121 base::Time::kMicrosecondsPerSecond / 2); |
113 endpointer_.set_long_speech_input_complete_silence_length( | 122 endpointer_.set_long_speech_input_complete_silence_length( |
114 base::Time::kMicrosecondsPerSecond); | 123 base::Time::kMicrosecondsPerSecond); |
115 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
116 endpointer_.StartSession(); | 125 endpointer_.StartSession(); |
126 recognition_engine_->set_delegate(this); | |
117 } | 127 } |
118 | 128 |
119 SpeechRecognizerImpl::~SpeechRecognizerImpl() { | 129 SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
120 // Recording should have stopped earlier due to the endpointer or | |
121 // |StopRecording| being called. | |
122 DCHECK(!audio_controller_.get()); | |
123 DCHECK(!recognition_engine_.get() || | |
124 !recognition_engine_->IsRecognitionPending()); | |
125 endpointer_.EndSession(); | 130 endpointer_.EndSession(); |
126 } | 131 } |
127 | 132 |
133 // ------- Methods that trigger Finite State Machine (FSM) events ------------ | |
134 | |
135 // NOTE: all the external events and request should be enqueued (PostTask), even | |
hans, 2012/04/02 16:05:59: s/request/requests/ ?
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: Done.
| |
136 // if they come from the same (IO) thread, in order to preserve the relationship | |
137 // of causality between events and avoid interleaved event processing due to | |
138 // synchronous callbacks. | |
139 | |
128 void SpeechRecognizerImpl::StartRecognition() { | 140 void SpeechRecognizerImpl::StartRecognition() { |
141 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
142 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
143 this, EVENT_START, FSMEventArgs())); | |
144 } | |
145 | |
146 void SpeechRecognizerImpl::AbortRecognition() { | |
147 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
148 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
149 this, EVENT_ABORT, FSMEventArgs())); | |
150 } | |
151 | |
152 void SpeechRecognizerImpl::StopAudioCapture() { | |
153 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
154 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
155 this, EVENT_STOP_CAPTURE, | |
156 FSMEventArgs())); | |
157 } | |
158 | |
159 bool SpeechRecognizerImpl::IsActive() const { | |
160 // Checking the FSM state from another thread (thus, while the FSM is | |
161 // potentially concurrently evolving) is meaningless. | |
162 // If you're doing it, probably you have some design issues. | |
hans, 2012/04/02 16:05:59: i'm not sure this comment adds much.. i think the …
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: Agree, removed the last line.
| |
129 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 163 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
130 DCHECK(!audio_controller_.get()); | 164 return state_ != STATE_IDLE; |
131 DCHECK(!recognition_engine_.get() || | 165 } |
132 !recognition_engine_->IsRecognitionPending()); | 166 |
133 | 167 bool SpeechRecognizerImpl::IsCapturingAudio() const { |
134 // The endpointer needs to estimate the environment/background noise before | 168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
135 // starting to treat the audio as user input. In |HandleOnData| we wait until | 169 const bool is_capturing_audio = state_ >= STATE_STARTING && |
136 // such time has passed before switching to user input mode. | 170 state_ <= STATE_RECOGNIZING; |
137 endpointer_.SetEnvironmentEstimationMode(); | 171 DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || |
138 | 172 (!is_capturing_audio && audio_controller_.get() == NULL)); |
173 return is_capturing_audio; | |
174 } | |
175 | |
176 // Invoked in the audio thread. | |
177 void SpeechRecognizerImpl::OnError(AudioInputController* controller, | |
178 int error_code) { | |
179 FSMEventArgs args; | |
180 args.audio_error_code = error_code; | |
181 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
182 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
183 this, EVENT_AUDIO_ERROR, args)); | |
184 } | |
185 | |
186 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | |
187 const uint8* data, uint32 size) { | |
188 if (size == 0) // This could happen when audio capture stops and is normal. | |
189 return; | |
190 | |
191 FSMEventArgs args; | |
192 args.audio_data = new AudioChunk(data, static_cast<size_t>(size), | |
193 kNumBitsPerAudioSample / 8); | |
194 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
195 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
196 this, EVENT_AUDIO_DATA, args)); | |
197 } | |
198 | |
199 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | |
200 | |
201 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( | |
202 const content::SpeechRecognitionResult& result) { | |
203 FSMEventArgs args; | |
204 args.engine_result = result; | |
205 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
206 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
207 this, EVENT_ENGINE_RESULT, args)); | |
208 } | |
209 | |
210 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( | |
211 const content::SpeechRecognitionError& error) { | |
212 FSMEventArgs args; | |
213 args.engine_error = error; | |
214 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
215 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
216 this, EVENT_ENGINE_ERROR, args)); | |
217 } | |
218 | |
219 // ----------------------- Core FSM implementation --------------------------- | |
220 // TODO(primiano) After the changes in the media package (r129173), this class | |
221 // slightly violates the SpeechRecognitionEventListener interface contract. In | |
222 // particular, it is not true anymore that this class can be freed after the | |
223 // OnRecognitionEnd event, since the audio_controller_.Close() asynchronous | |
224 // call can be still in progress after the end event. Currently, it does not | |
225 // represent a problem for the browser itself, since since refcounting protects | |
hans, 2012/04/02 16:05:59: s/since since/since/
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: Done.
| |
226 // us against such race conditions. However, we should fix this in the next CLs. | |
227 // For instance, tests are currently working just because the | |
228 // TestAudioInputController is not closing asynchronously as the real controller | |
229 // does, but they will become flaky if TestAudioInputController will be fixed. | |
230 | |
231 void SpeechRecognizerImpl::InitializeFSM() { | |
232 fsm[STATE_IDLE][EVENT_ABORT] = BIND(DoNothing); | |
233 fsm[STATE_IDLE][EVENT_START] = BIND(StartRecording); | |
234 fsm[STATE_IDLE][EVENT_STOP_CAPTURE] = BIND(DoNothing); | |
235 fsm[STATE_IDLE][EVENT_AUDIO_DATA] = BIND(DoNothing); | |
236 fsm[STATE_IDLE][EVENT_ENGINE_RESULT] = BIND(DoNothing); | |
237 fsm[STATE_IDLE][EVENT_ENGINE_ERROR] = BIND(DoNothing); | |
238 fsm[STATE_IDLE][EVENT_AUDIO_ERROR] = BIND(DoNothing); | |
239 | |
240 fsm[STATE_STARTING][EVENT_ABORT] = BIND(Abort); | |
241 fsm[STATE_STARTING][EVENT_START] = kUnfeasibleTransition; | |
242 fsm[STATE_STARTING][EVENT_STOP_CAPTURE] = BIND(Abort); | |
243 fsm[STATE_STARTING][EVENT_AUDIO_DATA] = BIND(StartRecognitionEngine); | |
244 fsm[STATE_STARTING][EVENT_ENGINE_RESULT] = kUnfeasibleTransition; | |
245 fsm[STATE_STARTING][EVENT_ENGINE_ERROR] = BIND(Abort); | |
246 fsm[STATE_STARTING][EVENT_AUDIO_ERROR] = BIND(Abort); | |
247 | |
248 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ABORT] = BIND(Abort); | |
249 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_START] = kUnfeasibleTransition; | |
250 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_STOP_CAPTURE] = | |
251 BIND(StopCaptureAndWaitResult); | |
252 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_AUDIO_DATA] = | |
253 BIND(WaitEnvironmentEstimationCompletion); | |
254 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ENGINE_RESULT] = | |
255 BIND(ProcessIntermediateResult); | |
256 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ENGINE_ERROR] = BIND(Abort); | |
257 fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_AUDIO_ERROR] = BIND(Abort); | |
258 | |
259 fsm[STATE_WAITING_FOR_SPEECH][EVENT_ABORT] = BIND(Abort); | |
260 fsm[STATE_WAITING_FOR_SPEECH][EVENT_START] = kUnfeasibleTransition; | |
261 fsm[STATE_WAITING_FOR_SPEECH][EVENT_STOP_CAPTURE] = | |
262 BIND(StopCaptureAndWaitResult); | |
263 fsm[STATE_WAITING_FOR_SPEECH][EVENT_AUDIO_DATA] = | |
264 BIND(DetectUserSpeechOrTimeout); | |
265 fsm[STATE_WAITING_FOR_SPEECH][EVENT_ENGINE_RESULT] = | |
266 BIND(ProcessIntermediateResult); | |
267 fsm[STATE_WAITING_FOR_SPEECH][EVENT_ENGINE_ERROR] = BIND(Abort); | |
268 fsm[STATE_WAITING_FOR_SPEECH][EVENT_AUDIO_ERROR] = BIND(Abort); | |
269 | |
270 fsm[STATE_RECOGNIZING][EVENT_ABORT] = BIND(Abort); | |
271 fsm[STATE_RECOGNIZING][EVENT_START] = kUnfeasibleTransition; | |
272 fsm[STATE_RECOGNIZING][EVENT_STOP_CAPTURE] = BIND(StopCaptureAndWaitResult); | |
273 fsm[STATE_RECOGNIZING][EVENT_AUDIO_DATA] = BIND(DetectEndOfSpeech); | |
274 fsm[STATE_RECOGNIZING][EVENT_ENGINE_RESULT] = BIND(ProcessIntermediateResult); | |
275 fsm[STATE_RECOGNIZING][EVENT_ENGINE_ERROR] = BIND(Abort); | |
276 fsm[STATE_RECOGNIZING][EVENT_AUDIO_ERROR] = BIND(Abort); | |
277 | |
278 fsm[STATE_WAITING_FINAL_RESULT][EVENT_ABORT] = BIND(Abort); | |
279 fsm[STATE_WAITING_FINAL_RESULT][EVENT_START] = kUnfeasibleTransition; | |
280 fsm[STATE_WAITING_FINAL_RESULT][EVENT_STOP_CAPTURE] = BIND(DoNothing); | |
281 fsm[STATE_WAITING_FINAL_RESULT][EVENT_AUDIO_DATA] = BIND(DoNothing); | |
282 fsm[STATE_WAITING_FINAL_RESULT][EVENT_ENGINE_RESULT] = | |
283 BIND(ProcessFinalResult); | |
284 fsm[STATE_WAITING_FINAL_RESULT][EVENT_ENGINE_ERROR] = BIND(Abort); | |
285 fsm[STATE_WAITING_FINAL_RESULT][EVENT_AUDIO_ERROR] = BIND(Abort); | |
286 } | |
287 | |
288 void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) { | |
289 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
290 DCHECK_LE(event, EVENT_MAX); | |
291 DCHECK_LE(state_, STATE_MAX); | |
292 | |
293 // Event dispatching must be sequential, otherwise it will break all the rules | |
294 // and the assumptions of the finite state automata model. | |
295 DCHECK(!in_event_dispatching_); | |
296 in_event_dispatching_ = true; | |
297 | |
298 // Guard against the delegate freeing us until we finish processing the event. | |
299 scoped_refptr<SpeechRecognizerImpl> me(this); | |
300 | |
301 args.event = event; | |
302 | |
303 if (event == EVENT_AUDIO_DATA) { | |
304 DCHECK(args.audio_data.get() != NULL); | |
305 ProcessAudioPipeline(*(args.audio_data.get())); | |
hans, 2012/04/02 16:05:59: I think you can just do ProcessAudioPipeline(*args.audio_data).
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: Done.
| |
306 } | |
307 | |
308 // The audio pipeline must be processed before the event dispatch, otherwise | |
309 // it would take actions according to the future state instead of the current. | |
310 const TransitionFunction& transition = fsm[state_][event]; | |
hans, 2012/04/02 16:05:59: i liked the switch-case better
Satish, 2012/04/02 21:57:09: I was thinking earlier that a table would be appealing …
| |
311 if(transition.Equals(kUnfeasibleTransition)) { | |
312 NOTREACHED() << "Unfeasible event " << event << " in state " << state_; | |
313 } else { | |
314 state_ = transition.Run(args); | |
315 } | |
316 | |
317 in_event_dispatching_ = false; | |
318 } | |
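For reference, the switch-style dispatch that hans preferred, and that the author agreed to revert to (see the BIND thread near the top), could look roughly like the sketch below. The helper name ExecuteTransitionAndGetNextState and the exact case layout are illustrative assumptions, not code from this patch set; only the first two states are shown.

    // Sketch: nested switches in place of the fsm[state_][event] table lookup.
    SpeechRecognizerImpl::FSMState
    SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
        const FSMEventArgs& event_args) {
      switch (state_) {
        case STATE_IDLE:
          switch (event_args.event) {
            case EVENT_START:
              return StartRecording(event_args);
            default:  // Any other event is a no-op while idle.
              return DoNothing(event_args);
          }
        case STATE_STARTING:
          switch (event_args.event) {
            case EVENT_AUDIO_DATA:
              return StartRecognitionEngine(event_args);
            case EVENT_ABORT:
            case EVENT_STOP_CAPTURE:
            case EVENT_ENGINE_ERROR:
            case EVENT_AUDIO_ERROR:
              return Abort(event_args);
            default:
              break;  // EVENT_START / EVENT_ENGINE_RESULT are unfeasible here.
          }
          break;
        default:
          // The remaining states would mirror the table entries above.
          break;
      }
      NOTREACHED() << "Unfeasible event " << event_args.event
                   << " in state " << state_;
      return state_;
    }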
319 | |
320 // ----------- Contract for all the FSM evolution functions below ------------- | |
321 // - Are guaranteed to be executed in the IO thread; | |
322 // - Are guaranteed to be not reentrant (themselves and each other); | |
323 // - event_args members are guaranteed to be stable during the call; | |
324 // - The class won't be freed in the meanwhile due to callbacks; | |
325 // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. | |
326 | |
327 // TODO(primiano) the audio pipeline is currently serial. However, the | |
328 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. | |
329 // We should profile the execution to see if it would be worth or not. | |
330 void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { | |
331 const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && | |
332 state_ <= STATE_RECOGNIZING; | |
333 const bool route_to_sr_engine = route_to_endpointer; | |
334 const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && | |
335 state_ <= STATE_RECOGNIZING; | |
336 const bool clip_detected = DetectClipping(raw_audio); | |
337 float rms = 0; | |
338 | |
339 num_samples_recorded_ += raw_audio.NumSamples(); | |
340 | |
341 if (route_to_endpointer) { | |
342 endpointer_.ProcessAudio(raw_audio, &rms); | |
343 } | |
344 if (route_to_vumeter) { | |
345 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. | |
346 UpdateSignalAndNoiseLevels(rms, clip_detected); | |
347 } | |
348 if (route_to_sr_engine) { | |
349 DCHECK(recognition_engine_.get()); | |
350 recognition_engine_->TakeAudioChunk(raw_audio); | |
351 } | |
352 } | |
353 | |
354 SpeechRecognizerImpl::FSMState | |
355 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { | |
356 DCHECK(recognition_engine_.get()); | |
357 DCHECK(!IsCapturingAudio()); | |
139 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? | 358 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
140 testing_audio_manager_ : | 359 testing_audio_manager_ : |
141 BrowserMainLoop::GetAudioManager(); | 360 BrowserMainLoop::GetAudioManager(); |
142 const int samples_per_packet = kAudioSampleRate * | 361 DCHECK(audio_manager != NULL); |
143 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; | 362 |
363 VLOG(1) << "SpeechRecognizerImpl starting audio capture."; | |
364 num_samples_recorded_ = 0; | |
365 audio_level_ = 0; | |
366 listener_->OnRecognitionStart(caller_id_); | |
367 | |
368 if (!audio_manager->HasAudioInputDevices()) { | |
369 return AbortWithError(SpeechRecognitionError( | |
370 content::SPEECH_RECOGNITION_ERROR_AUDIO, | |
371 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); | |
372 } | |
373 | |
374 if (audio_manager->IsRecordingInProcess()) { | |
375 return AbortWithError(SpeechRecognitionError( | |
376 content::SPEECH_RECOGNITION_ERROR_AUDIO, | |
377 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); | |
378 } | |
379 | |
380 const int samples_per_packet = (kAudioSampleRate * | |
381 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; | |
144 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, | 382 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
145 kAudioSampleRate, kNumBitsPerAudioSample, | 383 kAudioSampleRate, kNumBitsPerAudioSample, |
146 samples_per_packet); | 384 samples_per_packet); |
147 audio_controller_ = AudioInputController::Create(audio_manager, this, params); | 385 audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
148 DCHECK(audio_controller_.get()); | 386 |
149 VLOG(1) << "SpeechRecognizer starting record."; | 387 if (audio_controller_.get() == NULL) { |
150 num_samples_recorded_ = 0; | 388 return AbortWithError( |
389 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); | |
390 } | |
391 | |
392 // The endpointer needs to estimate the environment/background noise before | |
393 // starting to treat the audio as user input. We wait in the state | |
394 // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching | |
395 // to user input mode. | |
396 endpointer_.SetEnvironmentEstimationMode(); | |
151 audio_controller_->Record(); | 397 audio_controller_->Record(); |
152 } | 398 return STATE_STARTING; |
153 | 399 } |
154 void SpeechRecognizerImpl::AbortRecognition() { | 400 |
155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 401 SpeechRecognizerImpl::FSMState |
156 DCHECK(audio_controller_.get() || recognition_engine_.get()); | 402 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { |
157 | 403 // This is the first audio packet captured, so the recognition engine is |
158 // Stop recording if required. | 404 // started and the delegate notifies about the event. |
hans, 2012/04/02 16:05:59: s/notifies/notified/
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: Done.
| |
159 if (audio_controller_.get()) { | 405 DCHECK(recognition_engine_.get()); |
406 recognition_engine_->StartRecognition(); | |
407 listener_->OnAudioStart(caller_id_); | |
408 | |
409 // This is a little hack, since TakeAudioChunk() is already called by | |
410 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping | |
411 // the first audio chunk captured after opening the audio device. | |
412 recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); | |
413 return STATE_ESTIMATING_ENVIRONMENT; | |
414 } | |
415 | |
416 SpeechRecognizerImpl::FSMState | |
417 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { | |
418 DCHECK(endpointer_.IsEstimatingEnvironment()); | |
419 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { | |
420 endpointer_.SetUserInputMode(); | |
421 listener_->OnEnvironmentEstimationComplete(caller_id_); | |
422 return STATE_WAITING_FOR_SPEECH; | |
423 } else { | |
424 return STATE_ESTIMATING_ENVIRONMENT; | |
425 } | |
426 } | |
427 | |
428 SpeechRecognizerImpl::FSMState | |
429 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { | |
430 if (endpointer_.DidStartReceivingSpeech()) { | |
431 listener_->OnSoundStart(caller_id_); | |
432 return STATE_RECOGNIZING; | |
433 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { | |
434 return AbortWithError( | |
435 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); | |
436 } else { | |
437 return STATE_WAITING_FOR_SPEECH; | |
438 } | |
439 } | |
440 | |
441 SpeechRecognizerImpl::FSMState | |
442 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { | |
443 if (endpointer_.speech_input_complete()) { | |
444 return StopCaptureAndWaitResult(event_args); | |
445 } else { | |
446 return STATE_RECOGNIZING; | |
447 } | |
448 } | |
449 | |
450 SpeechRecognizerImpl::FSMState | |
451 SpeechRecognizerImpl::StopCaptureAndWaitResult(const FSMEventArgs&) { | |
452 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); | |
453 | |
454 VLOG(1) << "Concluding recognition"; | |
455 CloseAudioControllerAsynchronously(); | |
456 recognition_engine_->AudioChunksEnded(); | |
457 | |
458 if (state_ > STATE_WAITING_FOR_SPEECH) | |
459 listener_->OnSoundEnd(caller_id_); | |
460 | |
461 listener_->OnAudioEnd(caller_id_); | |
462 return STATE_WAITING_FINAL_RESULT; | |
463 } | |
464 | |
465 SpeechRecognizerImpl::FSMState | |
466 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) { | |
467 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of | |
468 // other specific error sources (so that it was an explicit abort request). | |
469 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers | |
470 // and currently would cause an exception. JS will probably need it in future. | |
471 if (event_args.event == EVENT_AUDIO_ERROR) { | |
472 return AbortWithError( | |
473 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); | |
474 } else if (event_args.event == EVENT_ENGINE_ERROR) { | |
475 return AbortWithError(event_args.engine_error); | |
476 } | |
477 return AbortWithError(NULL); | |
478 } | |
479 | |
480 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( | |
481 const SpeechRecognitionError& error) { | |
482 return AbortWithError(&error); | |
483 } | |
484 | |
485 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( | |
486 const SpeechRecognitionError* error) { | |
487 if (IsCapturingAudio()) | |
160 CloseAudioControllerAsynchronously(); | 488 CloseAudioControllerAsynchronously(); |
161 } | 489 |
162 | 490 VLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
163 VLOG(1) << "SpeechRecognizer canceling recognition."; | 491 |
164 recognition_engine_.reset(); | 492 // The recognition engine is initialized only after STATE_STARTING. |
165 } | 493 if (state_ > STATE_STARTING) { |
166 | 494 DCHECK(recognition_engine_.get()); |
167 void SpeechRecognizerImpl::StopAudioCapture() { | 495 recognition_engine_->EndRecognition(); |
168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 496 } |
169 | 497 |
170 // If audio recording has already stopped and we are in recognition phase, | 498 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
171 // silently ignore any more calls to stop recording. | 499 listener_->OnSoundEnd(caller_id_); |
172 if (!audio_controller_.get()) | 500 |
173 return; | 501 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
174 | 502 listener_->OnAudioEnd(caller_id_); |
175 CloseAudioControllerAsynchronously(); | 503 |
176 listener_->OnSoundEnd(caller_id_); | 504 if (error != NULL) |
177 listener_->OnAudioEnd(caller_id_); | 505 listener_->OnRecognitionError(caller_id_, *error); |
hans, 2012/04/02 16:05:59: just a thought (maybe for the future).. i wonder w…
Primiano Tucci (use gerrit), 2012/04/03 10:16:39: We should think on the implications that it might …
| |
178 | 506 |
179 // If we haven't got any audio yet end the recognition sequence here. | 507 listener_->OnRecognitionEnd(caller_id_); |
180 if (recognition_engine_ == NULL) { | 508 |
181 // Guard against the listener freeing us until we finish our job. | 509 return STATE_IDLE; |
182 scoped_refptr<SpeechRecognizerImpl> me(this); | 510 } |
183 listener_->OnRecognitionEnd(caller_id_); | 511 |
184 } else { | 512 SpeechRecognizerImpl::FSMState |
185 recognition_engine_->AudioChunksEnded(); | 513 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { |
186 } | 514 // This is in preparation for future speech recognition functions. |
187 } | 515 NOTREACHED(); |
188 | 516 return state_; |
189 // Invoked in the audio thread. | 517 } |
190 void SpeechRecognizerImpl::OnError(AudioInputController* controller, | 518 |
191 int error_code) { | 519 SpeechRecognizerImpl::FSMState |
192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 520 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { |
193 base::Bind(&SpeechRecognizerImpl::HandleOnError, | 521 const SpeechRecognitionResult& result = event_args.engine_result; |
194 this, error_code)); | 522 VLOG(1) << "Got valid result"; |
195 } | 523 recognition_engine_->EndRecognition(); |
196 | |
197 void SpeechRecognizerImpl::HandleOnError(int error_code) { | |
198 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; | |
199 | |
200 // Check if we are still recording before canceling recognition, as | |
201 // recording might have been stopped after this error was posted to the queue | |
202 // by |OnError|. | |
203 if (!audio_controller_.get()) | |
204 return; | |
205 | |
206 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); | |
207 } | |
208 | |
209 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | |
210 const uint8* data, uint32 size) { | |
211 if (size == 0) // This could happen when recording stops and is normal. | |
212 return; | |
213 scoped_refptr<AudioChunk> raw_audio( | |
214 new AudioChunk(data, | |
215 static_cast<size_t>(size), | |
216 kNumBitsPerAudioSample / 8)); | |
217 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
218 base::Bind(&SpeechRecognizerImpl::HandleOnData, | |
219 this, raw_audio)); | |
220 } | |
221 | |
222 void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) { | |
223 // Check if we are still recording and if not discard this buffer, as | |
224 // recording might have been stopped after this buffer was posted to the queue | |
225 // by |OnData|. | |
226 if (!audio_controller_.get()) | |
227 return; | |
228 | |
229 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); | |
230 | |
231 float rms; | |
232 endpointer_.ProcessAudio(*raw_audio, &rms); | |
233 bool did_clip = DetectClipping(*raw_audio); | |
234 num_samples_recorded_ += raw_audio->NumSamples(); | |
235 | |
236 if (recognition_engine_ == NULL) { | |
237 // This was the first audio packet recorded, so start a request to the | |
238 // server to send the data and inform the listener. | |
239 listener_->OnAudioStart(caller_id_); | |
240 GoogleOneShotRemoteEngineConfig google_sr_config; | |
241 google_sr_config.language = language_; | |
242 google_sr_config.grammar = grammar_; | |
243 google_sr_config.audio_sample_rate = kAudioSampleRate; | |
244 google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample; | |
245 google_sr_config.filter_profanities = filter_profanities_; | |
246 google_sr_config.hardware_info = hardware_info_; | |
247 google_sr_config.origin_url = origin_url_; | |
248 GoogleOneShotRemoteEngine* google_sr_engine = | |
249 new GoogleOneShotRemoteEngine(context_getter_.get()); | |
250 google_sr_engine->SetConfig(google_sr_config); | |
251 recognition_engine_.reset(google_sr_engine); | |
252 recognition_engine_->set_delegate(this); | |
253 recognition_engine_->StartRecognition(); | |
254 } | |
255 | |
256 recognition_engine_->TakeAudioChunk(*raw_audio); | |
257 | |
258 if (endpointer_.IsEstimatingEnvironment()) { | |
259 // Check if we have gathered enough audio for the endpointer to do | |
260 // environment estimation and should move on to detect speech/end of speech. | |
261 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | |
262 kAudioSampleRate) / 1000) { | |
263 endpointer_.SetUserInputMode(); | |
264 listener_->OnEnvironmentEstimationComplete(caller_id_); | |
265 } | |
266 return; // No more processing since we are still estimating environment. | |
267 } | |
268 | |
269 // Check if we have waited too long without hearing any speech. | |
270 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); | |
271 if (!speech_was_heard_after_packet && | |
272 num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) { | |
273 InformErrorAndAbortRecognition( | |
274 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); | |
275 return; | |
276 } | |
277 | |
278 if (!speech_was_heard_before_packet && speech_was_heard_after_packet) | |
279 listener_->OnSoundStart(caller_id_); | |
280 | |
281 // Calculate the input volume to display in the UI, smoothing towards the | |
282 // new level. | |
283 float level = (rms - kAudioMeterMinDb) / | |
284 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
285 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); | |
286 if (level > audio_level_) { | |
287 audio_level_ += (level - audio_level_) * kUpSmoothingFactor; | |
288 } else { | |
289 audio_level_ += (level - audio_level_) * kDownSmoothingFactor; | |
290 } | |
291 | |
292 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | |
293 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
294 noise_level = std::min(std::max(0.0f, noise_level), | |
295 kAudioMeterRangeMaxUnclipped); | |
296 | |
297 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, | |
298 noise_level); | |
299 | |
300 if (endpointer_.speech_input_complete()) | |
301 StopAudioCapture(); | |
302 } | |
303 | |
304 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | |
305 | |
306 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( | |
307 const content::SpeechRecognitionResult& result) { | |
308 // Guard against the listener freeing us until we finish our job. | |
309 scoped_refptr<SpeechRecognizerImpl> me(this); | |
310 listener_->OnRecognitionResult(caller_id_, result); | 524 listener_->OnRecognitionResult(caller_id_, result); |
311 listener_->OnRecognitionEnd(caller_id_); | 525 listener_->OnRecognitionEnd(caller_id_); |
312 } | 526 return STATE_IDLE; |
313 | 527 } |
314 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( | 528 |
315 const content::SpeechRecognitionError& error) { | 529 SpeechRecognizerImpl::FSMState |
316 InformErrorAndAbortRecognition(error.code); | 530 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { |
317 } | 531 return state_; // Just keep the current state. |
318 | |
319 void SpeechRecognizerImpl::InformErrorAndAbortRecognition( | |
320 content::SpeechRecognitionErrorCode error) { | |
321 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); | |
322 AbortRecognition(); | |
323 | |
324 // Guard against the listener freeing us until we finish our job. | |
325 scoped_refptr<SpeechRecognizerImpl> me(this); | |
326 listener_->OnRecognitionError(caller_id_, error); | |
327 } | 532 } |
328 | 533 |
329 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { | 534 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { |
330 VLOG(1) << "SpeechRecognizer stopping record."; | 535 DCHECK(IsCapturingAudio()); |
536 VLOG(1) << "SpeechRecognizerImpl stopping audio capture."; | |
331 // Issues a Close on the audio controller, passing an empty callback. The only | 537 // Issues a Close on the audio controller, passing an empty callback. The only |
332 // purpose of such callback is to keep the audio controller refcounted until | 538 // purpose of such callback is to keep the audio controller refcounted until |
333 // Close has completed (in the audio thread) and automatically destroy it | 539 // Close has completed (in the audio thread) and automatically destroy it |
334 // afterwards (upon return from OnAudioClosed). | 540 // afterwards (upon return from OnAudioClosed). |
335 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, | 541 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, |
336 this, audio_controller_)); | 542 this, audio_controller_)); |
337 audio_controller_ = NULL; // The controller is still refcounted by Bind. | 543 audio_controller_ = NULL; // The controller is still refcounted by Bind. |
338 } | 544 } |
339 | 545 |
340 bool SpeechRecognizerImpl::IsActive() const { | 546 int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
341 return (recognition_engine_.get() != NULL); | 547 return (num_samples_recorded_ * 1000) / kAudioSampleRate; |
342 } | 548 } |
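Since kAudioSampleRate is fixed at 16000 Hz, GetElapsedTimeMs() is a simple proportion: after 4800 recorded samples, (4800 * 1000) / 16000 = 300 ms, which is exactly the kEndpointerEstimationTimeMs window that WaitEnvironmentEstimationCompletion() waits for.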
343 | 549 |
344 bool SpeechRecognizerImpl::IsCapturingAudio() const { | 550 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, |
345 return (audio_controller_.get() != NULL); | 551 bool clip_detected) { |
552 // Calculate the input volume to display in the UI, smoothing towards the | |
553 // new level. | |
554 // TODO(primiano) Do we really need all this floating point arith here? | |
555 // Perhaps it might be quite expensive on mobile. | |
556 float level = (rms - kAudioMeterMinDb) / | |
557 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
558 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); | |
559 if (level > audio_level_) { | |
560 audio_level_ += (level - audio_level_) * kUpSmoothingFactor; | |
561 } else { | |
562 audio_level_ += (level - audio_level_) * kDownSmoothingFactor; | |
563 } | |
564 | |
565 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | |
566 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
567 noise_level = std::min(std::max(0.0f, noise_level), | |
568 kAudioMeterRangeMaxUnclipped); | |
569 | |
570 listener_->OnAudioLevelsChange( | |
571 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level); | |
346 } | 572 } |
347 | 573 |
348 const SpeechRecognitionEngine& | 574 const SpeechRecognitionEngine& |
349 SpeechRecognizerImpl::recognition_engine() const { | 575 SpeechRecognizerImpl::recognition_engine() const { |
350 return *(recognition_engine_.get()); | 576 return *(recognition_engine_.get()); |
351 } | 577 } |
352 | 578 |
353 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 579 void SpeechRecognizerImpl::SetAudioManagerForTesting( |
354 AudioManager* audio_manager) { | 580 AudioManager* audio_manager) { |
355 testing_audio_manager_ = audio_manager; | 581 testing_audio_manager_ = audio_manager; |
356 } | 582 } |
357 | 583 |
584 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs() | |
585 : audio_error_code(0), | |
586 audio_data(NULL), | |
587 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) { | |
588 } | |
589 | |
590 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | |
591 } | |
358 | 592 |
359 } // namespace speech | 593 } // namespace speech |