OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
6 | 6 |
7 #include "base/basictypes.h" | |
7 #include "base/bind.h" | 8 #include "base/bind.h" |
8 #include "base/time.h" | 9 #include "base/time.h" |
9 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
10 #include "content/browser/speech/audio_buffer.h" | 11 #include "content/browser/speech/audio_buffer.h" |
11 #include "content/browser/speech/google_one_shot_remote_engine.h" | 12 #include "content/browser/speech/google_one_shot_remote_engine.h" |
12 #include "content/public/browser/browser_thread.h" | 13 #include "content/public/browser/browser_thread.h" |
13 #include "content/public/browser/speech_recognition_event_listener.h" | 14 #include "content/public/browser/speech_recognition_event_listener.h" |
14 #include "content/public/browser/speech_recognizer.h" | 15 #include "content/public/browser/speech_recognizer.h" |
15 #include "content/public/common/speech_recognition_error.h" | 16 #include "content/public/common/speech_recognition_error.h" |
16 #include "content/public/common/speech_recognition_result.h" | 17 #include "content/public/common/speech_recognition_result.h" |
17 #include "net/url_request/url_request_context_getter.h" | 18 #include "net/url_request/url_request_context_getter.h" |
18 | 19 |
19 using content::BrowserMainLoop; | 20 using content::BrowserMainLoop; |
20 using content::BrowserThread; | 21 using content::BrowserThread; |
21 using content::SpeechRecognitionError; | 22 using content::SpeechRecognitionError; |
22 using content::SpeechRecognitionEventListener; | 23 using content::SpeechRecognitionEventListener; |
23 using content::SpeechRecognitionResult; | 24 using content::SpeechRecognitionResult; |
24 using content::SpeechRecognizer; | 25 using content::SpeechRecognizer; |
25 using media::AudioInputController; | 26 using media::AudioInputController; |
26 using media::AudioManager; | 27 using media::AudioManager; |
28 using media::AudioParameters; | |
27 | 29 |
28 namespace { | 30 namespace { |
29 | 31 |
30 // The following constants are related to the volume level indicator shown in | 32 // The following constants are related to the volume level indicator shown in |
31 // the UI for recorded audio. | 33 // the UI for recorded audio. |
32 // Multiplier used when new volume is greater than previous level. | 34 // Multiplier used when new volume is greater than previous level. |
33 const float kUpSmoothingFactor = 1.0f; | 35 const float kUpSmoothingFactor = 1.0f; |
34 // Multiplier used when new volume is less than previous level. | 36 // Multiplier used when new volume is less than previous level. |
35 const float kDownSmoothingFactor = 0.7f; | 37 const float kDownSmoothingFactor = 0.7f; |
36 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. | 38 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. |
37 const float kAudioMeterMaxDb = 90.31f; | 39 const float kAudioMeterMaxDb = 90.31f; |
38 // This value corresponds to RMS dB for an int16 whose 6 most significant | 40 // This value corresponds to RMS dB for an int16 whose 6 most significant |
39 // bits are 0. Values lower than this display as an empty level meter. | 41 // bits are 0. Values lower than this display as an empty level meter. |
40 const float kAudioMeterMinDb = 30.0f; | 42 const float kAudioMeterMinDb = 30.0f; |
41 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; | 43 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; |
42 | 44 |
43 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) | 45 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) |
44 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; | 46 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; |
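A quick sketch of the mapping these constants define (the same arithmetic that UpdateSignalAndNoiseLevels() applies further down); the helper name is hypothetical and not part of this CL:

```cpp
#include <algorithm>  // std::min / std::max.

// Hypothetical helper: converts an RMS reading in dB into the
// [0, kAudioMeterRangeMaxUnclipped] range drawn by the level meter.
float ClampedMeterLevel(float rms_db) {
  float level = (rms_db - kAudioMeterMinDb) /
                (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  return std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
}
```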
45 | 47 |
46 // Returns true if more than 5% of the samples are at min or max value. | 48 // Returns true if more than 5% of the samples are at min or max value. |
47 bool DetectClipping(const speech::AudioChunk& chunk) { | 49 bool DetectClipping(const speech::AudioChunk& chunk) { |
48 const int num_samples = chunk.NumSamples(); | 50 const int num_samples = chunk.NumSamples(); |
49 const int16* samples = chunk.SamplesData16(); | 51 const int16* samples = chunk.SamplesData16(); |
50 const int kThreshold = num_samples / 20; | 52 const int kThreshold = num_samples / 20; |
51 int clipping_samples = 0; | 53 int clipping_samples = 0; |
54 | |
52 for (int i = 0; i < num_samples; ++i) { | 55 for (int i = 0; i < num_samples; ++i) { |
53 if (samples[i] <= -32767 || samples[i] >= 32767) { | 56 if (samples[i] <= -32767 || samples[i] >= 32767) { |
54 if (++clipping_samples > kThreshold) | 57 if (++clipping_samples > kThreshold) |
55 return true; | 58 return true; |
56 } | 59 } |
57 } | 60 } |
58 return false; | 61 return false; |
59 } | 62 } |
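Worked example of the 5% threshold: at the 16 kHz sample rate used by this recognizer, a 100 ms chunk holds 1600 samples, so kThreshold is 80 and DetectClipping() reports clipping once an 81st sample at +/-32767 is seen.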
60 | 63 |
61 } // namespace | 64 } // namespace |
62 | 65 |
63 SpeechRecognizer* SpeechRecognizer::Create( | 66 SpeechRecognizer* SpeechRecognizer::Create( |
64 SpeechRecognitionEventListener* listener, | 67 SpeechRecognitionEventListener* listener, |
65 int caller_id, | 68 int caller_id, |
66 const std::string& language, | 69 const std::string& language, |
67 const std::string& grammar, | 70 const std::string& grammar, |
68 net::URLRequestContextGetter* context_getter, | 71 net::URLRequestContextGetter* context_getter, |
69 bool filter_profanities, | 72 bool filter_profanities, |
70 const std::string& hardware_info, | 73 const std::string& hardware_info, |
71 const std::string& origin_url) { | 74 const std::string& origin_url) { |
75 speech::GoogleOneShotRemoteEngineConfig remote_engine_config; | |
76 remote_engine_config.language = language; | |
77 remote_engine_config.grammar = grammar; | |
78 remote_engine_config.audio_sample_rate = | |
79 speech::SpeechRecognizerImpl::kAudioSampleRate; | |
80 remote_engine_config.audio_num_bits_per_sample = | |
81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; | |
82 remote_engine_config.filter_profanities = filter_profanities; | |
83 remote_engine_config.hardware_info = hardware_info; | |
84 remote_engine_config.origin_url = origin_url; | |
85 | |
86 // SpeechRecognizerImpl takes ownership of google_remote_engine. | |
87 speech::GoogleOneShotRemoteEngine* google_remote_engine = | |
88 new speech::GoogleOneShotRemoteEngine(context_getter); | |
89 google_remote_engine->SetConfig(remote_engine_config); | |
90 | |
72 return new speech::SpeechRecognizerImpl(listener, | 91 return new speech::SpeechRecognizerImpl(listener, |
73 caller_id, | 92 caller_id, |
74 language, | 93 google_remote_engine); |
75 grammar, | |
76 context_getter, | |
77 filter_profanities, | |
78 hardware_info, | |
79 origin_url); | |
80 } | 94 } |
81 | 95 |
82 namespace speech { | 96 namespace speech { |
83 | 97 |
84 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 98 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
85 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; | 99 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; |
86 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 100 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
87 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; | 101 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
88 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 102 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
89 | 103 |
104 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, | |
105 kNumBitsPerAudioSample_must_be_a_multiple_of_8); | |
106 | |
90 SpeechRecognizerImpl::SpeechRecognizerImpl( | 107 SpeechRecognizerImpl::SpeechRecognizerImpl( |
91 SpeechRecognitionEventListener* listener, | 108 SpeechRecognitionEventListener* listener, |
92 int caller_id, | 109 int caller_id, |
93 const std::string& language, | 110 SpeechRecognitionEngine* engine) |
94 const std::string& grammar, | |
95 net::URLRequestContextGetter* context_getter, | |
96 bool filter_profanities, | |
97 const std::string& hardware_info, | |
98 const std::string& origin_url) | |
99 : listener_(listener), | 111 : listener_(listener), |
100 testing_audio_manager_(NULL), | 112 testing_audio_manager_(NULL), |
113 recognition_engine_(engine), | |
101 endpointer_(kAudioSampleRate), | 114 endpointer_(kAudioSampleRate), |
102 context_getter_(context_getter), | |
103 caller_id_(caller_id), | 115 caller_id_(caller_id), |
104 language_(language), | 116 is_dispatching_event_(false), |
105 grammar_(grammar), | 117 state_(STATE_IDLE) { |
106 filter_profanities_(filter_profanities), | |
107 hardware_info_(hardware_info), | |
108 origin_url_(origin_url), | |
109 num_samples_recorded_(0), | |
110 audio_level_(0.0f) { | |
111 DCHECK(listener_ != NULL); | 118 DCHECK(listener_ != NULL); |
119 DCHECK(recognition_engine_ != NULL); | |
112 endpointer_.set_speech_input_complete_silence_length( | 120 endpointer_.set_speech_input_complete_silence_length( |
113 base::Time::kMicrosecondsPerSecond / 2); | 121 base::Time::kMicrosecondsPerSecond / 2); |
114 endpointer_.set_long_speech_input_complete_silence_length( | 122 endpointer_.set_long_speech_input_complete_silence_length( |
115 base::Time::kMicrosecondsPerSecond); | 123 base::Time::kMicrosecondsPerSecond); |
116 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
117 endpointer_.StartSession(); | 125 endpointer_.StartSession(); |
126 recognition_engine_->set_delegate(this); | |
118 } | 127 } |
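In other words, given the endpointer setters above: the required trailing silence before input is considered complete grows from 0.5 s to 1 s once the utterance has lasted longer than 3 s.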
119 | 128 |
120 SpeechRecognizerImpl::~SpeechRecognizerImpl() { | 129 SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
121 // Recording should have stopped earlier due to the endpointer or | |
122 // |StopRecording| being called. | |
123 DCHECK(!audio_controller_.get()); | |
124 DCHECK(!recognition_engine_.get() || | |
125 !recognition_engine_->IsRecognitionPending()); | |
126 endpointer_.EndSession(); | 130 endpointer_.EndSession(); |
127 } | 131 } |
128 | 132 |
133 // ------- Methods that trigger Finite State Machine (FSM) events ------------ | |
134 | |
135 // NOTE: All external events and requests should be enqueued (PostTask), even | |
136 // if they come from the same (IO) thread, in order to preserve the causal | |
137 // ordering of events and avoid interleaved event processing due to | |
138 // synchronous callbacks. | |
139 | |
129 void SpeechRecognizerImpl::StartRecognition() { | 140 void SpeechRecognizerImpl::StartRecognition() { |
141 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
142 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
143 this, FSMEventArgs(EVENT_START))); | |
144 } | |
145 | |
146 void SpeechRecognizerImpl::AbortRecognition() { | |
147 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
148 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
149 this, FSMEventArgs(EVENT_ABORT))); | |
150 } | |
151 | |
152 void SpeechRecognizerImpl::StopAudioCapture() { | |
153 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
154 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
155 this, FSMEventArgs(EVENT_STOP_CAPTURE))); | |
156 } | |
157 | |
158 bool SpeechRecognizerImpl::IsActive() const { | |
159 // Checking the FSM state from another thread (thus, while the FSM is | |
160 // potentially concurrently evolving) is meaningless. | |
130 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 161 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
131 DCHECK(!audio_controller_.get()); | 162 return state_ != STATE_IDLE; |
132 DCHECK(!recognition_engine_.get() || | 163 } |
133 !recognition_engine_->IsRecognitionPending()); | 164 |
134 | 165 bool SpeechRecognizerImpl::IsCapturingAudio() const { |
135 // The endpointer needs to estimate the environment/background noise before | 166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
136 // starting to treat the audio as user input. In |HandleOnData| we wait until | 167 const bool is_capturing_audio = state_ >= STATE_STARTING && |
137 // such time has passed before switching to user input mode. | 168 state_ <= STATE_RECOGNIZING; |
138 endpointer_.SetEnvironmentEstimationMode(); | 169 DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || |
139 | 170 (!is_capturing_audio && audio_controller_.get() == NULL)); |
140 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? | 171 return is_capturing_audio; |
141 testing_audio_manager_ : BrowserMainLoop::GetAudioManager(); | |
142 const int samples_per_packet = kAudioSampleRate * | |
143 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; | |
144 media::AudioParameters params( | |
145 media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, | |
146 kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet); | |
147 audio_controller_ = AudioInputController::Create(audio_manager, this, params); | |
148 DCHECK(audio_controller_.get()); | |
149 VLOG(1) << "SpeechRecognizer starting record."; | |
150 num_samples_recorded_ = 0; | |
151 audio_controller_->Record(); | |
152 } | |
153 | |
154 void SpeechRecognizerImpl::AbortRecognition() { | |
155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
156 DCHECK(audio_controller_.get() || recognition_engine_.get()); | |
157 | |
158 // Stop recording if required. | |
159 if (audio_controller_.get()) { | |
160 CloseAudioControllerAsynchronously(); | |
161 } | |
162 | |
163 VLOG(1) << "SpeechRecognizer canceling recognition."; | |
164 recognition_engine_.reset(); | |
165 } | |
166 | |
167 void SpeechRecognizerImpl::StopAudioCapture() { | |
168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
169 | |
170 // If audio recording has already stopped and we are in the recognition | |
171 // phase, silently ignore any further calls to stop recording. | |
172 if (!audio_controller_.get()) | |
173 return; | |
174 | |
175 CloseAudioControllerAsynchronously(); | |
176 listener_->OnSoundEnd(caller_id_); | |
177 listener_->OnAudioEnd(caller_id_); | |
178 | |
179 // If we haven't received any audio yet, end the recognition sequence here. | |
180 if (recognition_engine_ == NULL) { | |
181 // Guard against the listener freeing us until we finish our job. | |
182 scoped_refptr<SpeechRecognizerImpl> me(this); | |
183 listener_->OnRecognitionEnd(caller_id_); | |
184 } else { | |
185 recognition_engine_->AudioChunksEnded(); | |
186 } | |
187 } | 172 } |
188 | 173 |
189 // Invoked in the audio thread. | 174 // Invoked in the audio thread. |
190 void SpeechRecognizerImpl::OnError(AudioInputController* controller, | 175 void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
191 int error_code) { | 176 int error_code) { |
192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 177 FSMEventArgs event_args(EVENT_AUDIO_ERROR); |
193 base::Bind(&SpeechRecognizerImpl::HandleOnError, | 178 event_args.audio_error_code = error_code; |
194 this, error_code)); | 179 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
195 } | 180 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
196 | 181 this, event_args)); |
197 void SpeechRecognizerImpl::HandleOnError(int error_code) { | |
198 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; | |
199 | |
200 // Check if we are still recording before canceling recognition, as | |
201 // recording might have been stopped after this error was posted to the queue | |
202 // by |OnError|. | |
203 if (!audio_controller_.get()) | |
204 return; | |
205 | |
206 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); | |
207 } | 182 } |
208 | 183 |
209 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | 184 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
210 const uint8* data, uint32 size) { | 185 const uint8* data, uint32 size) { |
211 if (size == 0) // This can happen when recording stops; it is normal. | 186 if (size == 0) // This can happen when audio capture stops; it is normal. |
212 return; | 187 return; |
213 scoped_refptr<AudioChunk> raw_audio( | 188 |
214 new AudioChunk(data, | 189 FSMEventArgs event_args(EVENT_AUDIO_DATA); |
215 static_cast<size_t>(size), | 190 event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size), |
216 kNumBitsPerAudioSample / 8)); | 191 kNumBitsPerAudioSample / 8); |
217 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
218 base::Bind(&SpeechRecognizerImpl::HandleOnData, | 193 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
219 this, raw_audio)); | 194 this, event_args)); |
220 } | |
221 | |
222 void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) { | |
223 // Check if we are still recording and if not discard this buffer, as | |
224 // recording might have been stopped after this buffer was posted to the queue | |
225 // by |OnData|. | |
226 if (!audio_controller_.get()) | |
227 return; | |
228 | |
229 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); | |
230 | |
231 float rms; | |
232 endpointer_.ProcessAudio(*raw_audio, &rms); | |
233 bool did_clip = DetectClipping(*raw_audio); | |
234 num_samples_recorded_ += raw_audio->NumSamples(); | |
235 | |
236 if (recognition_engine_ == NULL) { | |
237 // This was the first audio packet recorded, so start a request to the | |
238 // server to send the data and inform the listener. | |
239 listener_->OnAudioStart(caller_id_); | |
240 GoogleOneShotRemoteEngineConfig google_sr_config; | |
241 google_sr_config.language = language_; | |
242 google_sr_config.grammar = grammar_; | |
243 google_sr_config.audio_sample_rate = kAudioSampleRate; | |
244 google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample; | |
245 google_sr_config.filter_profanities = filter_profanities_; | |
246 google_sr_config.hardware_info = hardware_info_; | |
247 google_sr_config.origin_url = origin_url_; | |
248 GoogleOneShotRemoteEngine* google_sr_engine = | |
249 new GoogleOneShotRemoteEngine(context_getter_.get()); | |
250 google_sr_engine->SetConfig(google_sr_config); | |
251 recognition_engine_.reset(google_sr_engine); | |
252 recognition_engine_->set_delegate(this); | |
253 recognition_engine_->StartRecognition(); | |
254 } | |
255 | |
256 recognition_engine_->TakeAudioChunk(*raw_audio); | |
257 | |
258 if (endpointer_.IsEstimatingEnvironment()) { | |
259 // Check if we have gathered enough audio for the endpointer to do | |
260 // environment estimation and should move on to detect speech/end of speech. | |
261 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | |
262 kAudioSampleRate) / 1000) { | |
263 endpointer_.SetUserInputMode(); | |
264 listener_->OnEnvironmentEstimationComplete(caller_id_); | |
265 } | |
266 return; // No more processing since we are still estimating environment. | |
267 } | |
268 | |
269 // Check if we have waited too long without hearing any speech. | |
270 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); | |
271 if (!speech_was_heard_after_packet && | |
272 num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) { | |
273 InformErrorAndAbortRecognition( | |
274 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); | |
275 return; | |
276 } | |
277 | |
278 if (!speech_was_heard_before_packet && speech_was_heard_after_packet) | |
279 listener_->OnSoundStart(caller_id_); | |
280 | |
281 // Calculate the input volume to display in the UI, smoothing towards the | |
282 // new level. | |
283 float level = (rms - kAudioMeterMinDb) / | |
284 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
285 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); | |
286 if (level > audio_level_) { | |
287 audio_level_ += (level - audio_level_) * kUpSmoothingFactor; | |
288 } else { | |
289 audio_level_ += (level - audio_level_) * kDownSmoothingFactor; | |
290 } | |
291 | |
292 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | |
293 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
294 noise_level = std::min(std::max(0.0f, noise_level), | |
295 kAudioMeterRangeMaxUnclipped); | |
296 | |
297 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, | |
298 noise_level); | |
299 | |
300 if (endpointer_.speech_input_complete()) | |
301 StopAudioCapture(); | |
302 } | 195 } |
303 | 196 |
304 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | 197 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
305 | 198 |
306 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( | 199 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( |
307 const content::SpeechRecognitionResult& result) { | 200 const content::SpeechRecognitionResult& result) { |
308 // Guard against the listener freeing us until we finish our job. | 201 FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
202 event_args.engine_result = result; | |
203 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
204 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
205 this, event_args)); | |
206 } | |
207 | |
208 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( | |
209 const content::SpeechRecognitionError& error) { | |
210 FSMEventArgs event_args(EVENT_ENGINE_ERROR); | |
211 event_args.engine_error = error; | |
212 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
213 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
214 this, event_args)); | |
215 } | |
216 | |
217 // ----------------------- Core FSM implementation --------------------------- | |
218 // TODO(primiano) After the changes in the media package (r129173), this class | |
219 // slightly violates the SpeechRecognitionEventListener interface contract. In | |
220 // particular, it is no longer true that this class can be freed after the | |
221 // OnRecognitionEnd event, since the asynchronous audio_controller_.Close() | |
222 // call can still be in progress after the end event. Currently, this is not a | |
223 // problem for the browser itself, since refcounting protects us against such | |
224 // race conditions. However, we should fix this in the next CLs. For instance, | |
225 // tests currently pass only because TestAudioInputController does not close | |
226 // asynchronously as the real controller does, but they will become flaky when | |
227 // TestAudioInputController is fixed. | |
228 | |
229 void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { | |
230 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
231 DCHECK_LE(event_args.event, EVENT_MAX); | |
232 DCHECK_LE(state_, STATE_MAX); | |
233 | |
234 // Event dispatching must be sequential; otherwise it would break the rules | |
235 // and assumptions of the finite state automaton model. | |
236 DCHECK(!is_dispatching_event_); | |
237 is_dispatching_event_ = true; | |
238 | |
239 // Guard against the delegate freeing us until we finish processing the event. | |
309 scoped_refptr<SpeechRecognizerImpl> me(this); | 240 scoped_refptr<SpeechRecognizerImpl> me(this); |
241 | |
242 if (event_args.event == EVENT_AUDIO_DATA) { | |
243 DCHECK(event_args.audio_data.get() != NULL); | |
244 ProcessAudioPipeline(*event_args.audio_data); | |
245 } | |
246 | |
247 // The audio pipeline must be processed before the event dispatch; otherwise | |
248 // the FSM would act according to the future state instead of the current one. | |
249 state_ = ExecuteTransitionAndGetNextState(event_args); | |
250 | |
251 is_dispatching_event_ = false; | |
252 } | |
253 | |
254 SpeechRecognizerImpl::FSMState | |
255 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( | |
256 const FSMEventArgs& event_args) { | |
257 const FSMEvent event = event_args.event; | |
258 switch (state_) { | |
259 case STATE_IDLE: | |
260 switch (event) { | |
261 // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and | |
262 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. | |
263 case EVENT_ABORT: | |
264 return DoNothing(event_args); | |
265 case EVENT_START: | |
266 return StartRecording(event_args); | |
267 case EVENT_STOP_CAPTURE: // Corner cases related to queued messages | |
268 case EVENT_AUDIO_DATA: // being dispatched late. | |
269 case EVENT_ENGINE_RESULT: | |
270 case EVENT_ENGINE_ERROR: | |
271 case EVENT_AUDIO_ERROR: | |
272 return DoNothing(event_args); | |
273 } | |
274 break; | |
275 case STATE_STARTING: | |
276 switch (event) { | |
277 case EVENT_ABORT: | |
278 return Abort(event_args); | |
279 case EVENT_START: | |
280 return NotFeasible(event_args); | |
281 case EVENT_STOP_CAPTURE: | |
282 return Abort(event_args); | |
283 case EVENT_AUDIO_DATA: | |
284 return StartRecognitionEngine(event_args); | |
285 case EVENT_ENGINE_RESULT: | |
286 return NotFeasible(event_args); | |
287 case EVENT_ENGINE_ERROR: | |
288 case EVENT_AUDIO_ERROR: | |
289 return Abort(event_args); | |
290 } | |
291 break; | |
292 case STATE_ESTIMATING_ENVIRONMENT: | |
293 switch (event) { | |
294 case EVENT_ABORT: | |
295 return Abort(event_args); | |
296 case EVENT_START: | |
297 return NotFeasible(event_args); | |
298 case EVENT_STOP_CAPTURE: | |
299 return StopCaptureAndWaitForResult(event_args); | |
300 case EVENT_AUDIO_DATA: | |
301 return WaitEnvironmentEstimationCompletion(event_args); | |
302 case EVENT_ENGINE_RESULT: | |
303 return ProcessIntermediateResult(event_args); | |
304 case EVENT_ENGINE_ERROR: | |
305 case EVENT_AUDIO_ERROR: | |
306 return Abort(event_args); | |
307 } | |
308 break; | |
309 case STATE_WAITING_FOR_SPEECH: | |
310 switch (event) { | |
311 case EVENT_ABORT: | |
312 return Abort(event_args); | |
313 case EVENT_START: | |
314 return NotFeasible(event_args); | |
315 case EVENT_STOP_CAPTURE: | |
316 return StopCaptureAndWaitForResult(event_args); | |
317 case EVENT_AUDIO_DATA: | |
318 return DetectUserSpeechOrTimeout(event_args); | |
319 case EVENT_ENGINE_RESULT: | |
320 return ProcessIntermediateResult(event_args); | |
321 case EVENT_ENGINE_ERROR: | |
322 case EVENT_AUDIO_ERROR: | |
323 return Abort(event_args); | |
324 } | |
325 break; | |
326 case STATE_RECOGNIZING: | |
327 switch (event) { | |
328 case EVENT_ABORT: | |
329 return Abort(event_args); | |
330 case EVENT_START: | |
331 return NotFeasible(event_args); | |
332 case EVENT_STOP_CAPTURE: | |
333 return StopCaptureAndWaitForResult(event_args); | |
334 case EVENT_AUDIO_DATA: | |
335 return DetectEndOfSpeech(event_args); | |
336 case EVENT_ENGINE_RESULT: | |
337 return ProcessIntermediateResult(event_args); | |
338 case EVENT_ENGINE_ERROR: | |
339 case EVENT_AUDIO_ERROR: | |
340 return Abort(event_args); | |
341 } | |
342 break; | |
343 case STATE_WAITING_FINAL_RESULT: | |
344 switch (event) { | |
345 case EVENT_ABORT: | |
346 return Abort(event_args); | |
347 case EVENT_START: | |
348 return NotFeasible(event_args); | |
349 case EVENT_STOP_CAPTURE: | |
350 case EVENT_AUDIO_DATA: | |
351 return DoNothing(event_args); | |
352 case EVENT_ENGINE_RESULT: | |
353 return ProcessFinalResult(event_args); | |
354 case EVENT_ENGINE_ERROR: | |
355 case EVENT_AUDIO_ERROR: | |
356 return Abort(event_args); | |
357 } | |
358 break; | |
359 } | |
360 return NotFeasible(event_args); | |
361 } | |
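Reading the transition table above, the intended happy path is: EVENT_START takes STATE_IDLE to STATE_STARTING; the first EVENT_AUDIO_DATA starts the engine and enters STATE_ESTIMATING_ENVIRONMENT; after estimation the FSM waits in STATE_WAITING_FOR_SPEECH, advances to STATE_RECOGNIZING once speech is detected, and moves to STATE_WAITING_FINAL_RESULT when capture stops; a final EVENT_ENGINE_RESULT returns it to STATE_IDLE.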
362 | |
363 // ----------- Contract for all the FSM evolution functions below ------------- | |
364 // - Are guaranteed to be executed in the IO thread; | |
365 // - Are guaranteed to be not reentrant (themselves and each other); | |
366 // - event_args members are guaranteed to be stable during the call; | |
367 // - The class won't be freed in the meanwhile due to callbacks; | |
368 // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. | |
369 | |
370 // TODO(primiano) the audio pipeline is currently serial. However, the | |
371 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. | |
372 // We should profile the execution to see whether it would be worthwhile. | |
373 void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { | |
374 const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && | |
375 state_ <= STATE_RECOGNIZING; | |
376 const bool route_to_sr_engine = route_to_endpointer; | |
377 const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && | |
378 state_ <= STATE_RECOGNIZING; | |
379 const bool clip_detected = DetectClipping(raw_audio); | |
380 float rms = 0; | |
bulach (2012/04/12 16:20:16): nit: 0.0f
Primiano Tucci (use gerrit) (2012/04/12 17:38:05): Done.
381 | |
382 num_samples_recorded_ += raw_audio.NumSamples(); | |
383 | |
384 if (route_to_endpointer) | |
385 endpointer_.ProcessAudio(raw_audio, &rms); | |
386 | |
387 if (route_to_vumeter) { | |
388 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. | |
389 UpdateSignalAndNoiseLevels(rms, clip_detected); | |
390 } | |
391 if (route_to_sr_engine) { | |
392 DCHECK(recognition_engine_.get() != NULL); | |
393 recognition_engine_->TakeAudioChunk(raw_audio); | |
394 } | |
395 } | |
396 | |
397 SpeechRecognizerImpl::FSMState | |
398 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { | |
399 DCHECK(recognition_engine_.get() != NULL); | |
400 DCHECK(!IsCapturingAudio()); | |
401 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? | |
402 testing_audio_manager_ : | |
403 BrowserMainLoop::GetAudioManager(); | |
404 DCHECK(audio_manager != NULL); | |
405 | |
406 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; | |
407 num_samples_recorded_ = 0; | |
408 audio_level_ = 0; | |
409 listener_->OnRecognitionStart(caller_id_); | |
410 | |
411 if (!audio_manager->HasAudioInputDevices()) { | |
412 return AbortWithError(SpeechRecognitionError( | |
413 content::SPEECH_RECOGNITION_ERROR_AUDIO, | |
414 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); | |
415 } | |
416 | |
417 if (audio_manager->IsRecordingInProcess()) { | |
418 return AbortWithError(SpeechRecognitionError( | |
419 content::SPEECH_RECOGNITION_ERROR_AUDIO, | |
420 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); | |
421 } | |
422 | |
423 const int samples_per_packet = (kAudioSampleRate * | |
424 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; | |
425 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, | |
426 kAudioSampleRate, kNumBitsPerAudioSample, | |
427 samples_per_packet); | |
428 audio_controller_ = AudioInputController::Create(audio_manager, this, params); | |
429 | |
430 if (audio_controller_.get() == NULL) { | |
431 return AbortWithError( | |
432 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); | |
433 } | |
434 | |
435 // The endpointer needs to estimate the environment/background noise before | |
436 // starting to treat the audio as user input. We wait in the state | |
437 // ESTIMATING_ENVIRONMENT until that interval has elapsed before switching | |
438 // to user input mode. | |
439 endpointer_.SetEnvironmentEstimationMode(); | |
440 audio_controller_->Record(); | |
441 return STATE_STARTING; | |
442 } | |
443 | |
444 SpeechRecognizerImpl::FSMState | |
445 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { | |
446 // This is the first audio packet captured, so the recognition engine is | |
447 // started and the delegate notified about the event. | |
448 DCHECK(recognition_engine_.get() != NULL); | |
449 recognition_engine_->StartRecognition(); | |
450 listener_->OnAudioStart(caller_id_); | |
451 | |
452 // This is a little hack, since TakeAudioChunk() is already called by | |
453 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping | |
454 // the first audio chunk captured after opening the audio device. | |
455 recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); | |
456 return STATE_ESTIMATING_ENVIRONMENT; | |
457 } | |
458 | |
459 SpeechRecognizerImpl::FSMState | |
460 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { | |
461 DCHECK(endpointer_.IsEstimatingEnvironment()); | |
462 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { | |
463 endpointer_.SetUserInputMode(); | |
464 listener_->OnEnvironmentEstimationComplete(caller_id_); | |
465 return STATE_WAITING_FOR_SPEECH; | |
466 } else { | |
467 return STATE_ESTIMATING_ENVIRONMENT; | |
468 } | |
469 } | |
470 | |
471 SpeechRecognizerImpl::FSMState | |
472 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { | |
473 if (endpointer_.DidStartReceivingSpeech()) { | |
474 listener_->OnSoundStart(caller_id_); | |
475 return STATE_RECOGNIZING; | |
476 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { | |
477 return AbortWithError( | |
478 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); | |
479 } | |
480 return STATE_WAITING_FOR_SPEECH; | |
481 } | |
482 | |
483 SpeechRecognizerImpl::FSMState | |
484 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { | |
485 if (endpointer_.speech_input_complete()) { | |
486 return StopCaptureAndWaitForResult(event_args); | |
487 } | |
488 return STATE_RECOGNIZING; | |
489 } | |
490 | |
491 SpeechRecognizerImpl::FSMState | |
492 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { | |
493 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); | |
494 | |
495 DVLOG(1) << "Concluding recognition"; | |
496 CloseAudioControllerAsynchronously(); | |
497 recognition_engine_->AudioChunksEnded(); | |
498 | |
499 if (state_ > STATE_WAITING_FOR_SPEECH) | |
500 listener_->OnSoundEnd(caller_id_); | |
501 | |
502 listener_->OnAudioEnd(caller_id_); | |
503 return STATE_WAITING_FINAL_RESULT; | |
504 } | |
505 | |
506 SpeechRecognizerImpl::FSMState | |
507 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) { | |
508 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the absence | |
509 // of other specific error sources (that is, when the abort was an explicit | |
510 // request). However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught | |
511 // by ChromeSpeechRecognitionManagerDelegate and would cause an exception. | |
512 // JS support will probably need it in the future. | |
513 if (event_args.event == EVENT_AUDIO_ERROR) { | |
514 return AbortWithError( | |
515 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); | |
516 } else if (event_args.event == EVENT_ENGINE_ERROR) { | |
517 return AbortWithError(event_args.engine_error); | |
518 } | |
519 return AbortWithError(NULL); | |
520 } | |
521 | |
522 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( | |
523 const SpeechRecognitionError& error) { | |
524 return AbortWithError(&error); | |
525 } | |
526 | |
527 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( | |
528 const SpeechRecognitionError* error) { | |
529 if (IsCapturingAudio()) | |
530 CloseAudioControllerAsynchronously(); | |
531 | |
532 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; | |
533 | |
534 // The recognition engine is initialized only after STATE_STARTING. | |
535 if (state_ > STATE_STARTING) { | |
536 DCHECK(recognition_engine_.get() != NULL); | |
537 recognition_engine_->EndRecognition(); | |
538 } | |
539 | |
540 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) | |
541 listener_->OnSoundEnd(caller_id_); | |
542 | |
543 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) | |
544 listener_->OnAudioEnd(caller_id_); | |
545 | |
546 if (error != NULL) | |
547 listener_->OnRecognitionError(caller_id_, *error); | |
548 | |
549 listener_->OnRecognitionEnd(caller_id_); | |
550 | |
551 return STATE_IDLE; | |
552 } | |
553 | |
554 SpeechRecognizerImpl::FSMState | |
555 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { | |
556 // This is in preparation for future speech recognition functions. | |
557 NOTREACHED(); | |
558 return state_; | |
559 } | |
560 | |
561 SpeechRecognizerImpl::FSMState | |
562 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { | |
563 const SpeechRecognitionResult& result = event_args.engine_result; | |
564 DVLOG(1) << "Got valid result"; | |
565 recognition_engine_->EndRecognition(); | |
310 listener_->OnRecognitionResult(caller_id_, result); | 566 listener_->OnRecognitionResult(caller_id_, result); |
311 listener_->OnRecognitionEnd(caller_id_); | 567 listener_->OnRecognitionEnd(caller_id_); |
312 } | 568 return STATE_IDLE; |
313 | 569 } |
314 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( | 570 |
315 const content::SpeechRecognitionError& error) { | 571 SpeechRecognizerImpl::FSMState |
316 InformErrorAndAbortRecognition(error.code); | 572 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { |
317 } | 573 return state_; // Just keep the current state. |
318 | 574 } |
319 void SpeechRecognizerImpl::InformErrorAndAbortRecognition( | 575 |
320 content::SpeechRecognitionErrorCode error) { | 576 SpeechRecognizerImpl::FSMState |
321 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); | 577 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { |
322 AbortRecognition(); | 578 NOTREACHED() << "Unfeasible event " << event_args.event |
323 | 579 << " in state " << state_; |
324 // Guard against the listener freeing us until we finish our job. | 580 return state_; |
325 scoped_refptr<SpeechRecognizerImpl> me(this); | |
326 listener_->OnRecognitionError(caller_id_, error); | |
327 } | 581 } |
328 | 582 |
329 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { | 583 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { |
330 VLOG(1) << "SpeechRecognizer stopping record."; | 584 DCHECK(IsCapturingAudio()); |
585 DVLOG(1) << "SpeechRecognizerImpl stopping audio capture."; | |
331 // Issues a Close on the audio controller, passing an empty callback. The only | 586 // Issues a Close on the audio controller, passing an empty callback. The only |
332 // purpose of such callback is to keep the audio controller refcounted until | 587 // purpose of such callback is to keep the audio controller refcounted until |
333 // Close has completed (in the audio thread) and automatically destroy it | 588 // Close has completed (in the audio thread) and automatically destroy it |
334 // afterwards (upon return from OnAudioClosed). | 589 // afterwards (upon return from OnAudioClosed). |
335 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, | 590 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, |
336 this, audio_controller_)); | 591 this, audio_controller_)); |
337 audio_controller_ = NULL; // The controller is still refcounted by Bind. | 592 audio_controller_ = NULL; // The controller is still refcounted by Bind. |
338 } | 593 } |
339 | 594 |
340 bool SpeechRecognizerImpl::IsActive() const { | 595 int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
341 return (recognition_engine_.get() != NULL); | 596 return (num_samples_recorded_ * 1000) / kAudioSampleRate; |
342 } | 597 } |
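For example, with kAudioSampleRate = 16000, GetElapsedTimeMs() reports (4800 * 1000) / 16000 = 300 ms after 4800 samples, exactly the kEndpointerEstimationTimeMs at which WaitEnvironmentEstimationCompletion() switches the endpointer to user input mode.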
343 | 598 |
344 bool SpeechRecognizerImpl::IsCapturingAudio() const { | 599 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, |
345 return (audio_controller_.get() != NULL); | 600 bool clip_detected) { |
601 // Calculate the input volume to display in the UI, smoothing towards the | |
602 // new level. | |
603 // TODO(primiano) Do we really need all this floating-point arithmetic here? | |
604 // It might be quite expensive on mobile. | |
605 float level = (rms - kAudioMeterMinDb) / | |
606 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
607 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); | |
608 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : | |
609 kDownSmoothingFactor; | |
610 audio_level_ += (level - audio_level_) * smoothing_factor; | |
611 | |
612 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | |
613 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
614 noise_level = std::min(std::max(0.0f, noise_level), | |
615 kAudioMeterRangeMaxUnclipped); | |
616 | |
617 listener_->OnAudioLevelsChange( | |
618 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level); | |
346 } | 619 } |
347 | 620 |
348 const SpeechRecognitionEngine& | 621 const SpeechRecognitionEngine& |
349 SpeechRecognizerImpl::recognition_engine() const { | 622 SpeechRecognizerImpl::recognition_engine() const { |
350 return *(recognition_engine_.get()); | 623 return *(recognition_engine_.get()); |
351 } | 624 } |
352 | 625 |
353 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 626 void SpeechRecognizerImpl::SetAudioManagerForTesting( |
354 AudioManager* audio_manager) { | 627 AudioManager* audio_manager) { |
355 testing_audio_manager_ = audio_manager; | 628 testing_audio_manager_ = audio_manager; |
356 } | 629 } |
357 | 630 |
631 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | |
632 : event(event_value), | |
633 audio_error_code(0), | |
634 audio_data(NULL), | |
635 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) { | |
636 } | |
637 | |
638 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | |
639 } | |
358 | 640 |
359 } // namespace speech | 641 } // namespace speech |