OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
6 | 6 |
| 7 #include "base/basictypes.h" |
7 #include "base/bind.h" | 8 #include "base/bind.h" |
8 #include "base/time.h" | 9 #include "base/time.h" |
9 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
10 #include "content/browser/speech/audio_buffer.h" | 11 #include "content/browser/speech/audio_buffer.h" |
11 #include "content/browser/speech/google_one_shot_remote_engine.h" | 12 #include "content/browser/speech/google_one_shot_remote_engine.h" |
12 #include "content/public/browser/browser_thread.h" | 13 #include "content/public/browser/browser_thread.h" |
13 #include "content/public/browser/speech_recognition_event_listener.h" | 14 #include "content/public/browser/speech_recognition_event_listener.h" |
14 #include "content/public/browser/speech_recognizer.h" | 15 #include "content/public/browser/speech_recognizer.h" |
15 #include "content/public/common/speech_recognition_error.h" | 16 #include "content/public/common/speech_recognition_error.h" |
16 #include "content/public/common/speech_recognition_result.h" | 17 #include "content/public/common/speech_recognition_result.h" |
17 #include "net/url_request/url_request_context_getter.h" | 18 #include "net/url_request/url_request_context_getter.h" |
18 | 19 |
19 using content::BrowserMainLoop; | 20 using content::BrowserMainLoop; |
20 using content::BrowserThread; | 21 using content::BrowserThread; |
21 using content::SpeechRecognitionError; | 22 using content::SpeechRecognitionError; |
22 using content::SpeechRecognitionEventListener; | 23 using content::SpeechRecognitionEventListener; |
23 using content::SpeechRecognitionResult; | 24 using content::SpeechRecognitionResult; |
24 using content::SpeechRecognizer; | 25 using content::SpeechRecognizer; |
25 using media::AudioInputController; | 26 using media::AudioInputController; |
26 using media::AudioManager; | 27 using media::AudioManager; |
| 28 using media::AudioParameters; |
27 | 29 |
28 namespace { | 30 namespace { |
29 | 31 |
30 // The following constants are related to the volume level indicator shown in | 32 // The following constants are related to the volume level indicator shown in |
31 // the UI for recorded audio. | 33 // the UI for recorded audio. |
32 // Multiplier used when new volume is greater than previous level. | 34 // Multiplier used when new volume is greater than previous level. |
33 const float kUpSmoothingFactor = 1.0f; | 35 const float kUpSmoothingFactor = 1.0f; |
34 // Multiplier used when new volume is less than previous level. | 36 // Multiplier used when new volume is less than previous level. |
35 const float kDownSmoothingFactor = 0.7f; | 37 const float kDownSmoothingFactor = 0.7f; |
36 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. | 38 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. |
37 const float kAudioMeterMaxDb = 90.31f; | 39 const float kAudioMeterMaxDb = 90.31f; |
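| // (Presumably 20 * log10(2^15) ~= 90.31 dB, i.e. full scale for int16 samples.) |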
38 // This value corresponds to RMS dB for int16 with the 6 most-significant bits = 0. | 40 // This value corresponds to RMS dB for int16 with the 6 most-significant bits = 0. |
39 // Values lower than this will display as an empty level meter. | 41 // Values lower than this will display as an empty level meter. |
40 const float kAudioMeterMinDb = 30.0f; | 42 const float kAudioMeterMinDb = 30.0f; |
41 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; | 43 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; |
42 | 44 |
43 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) | 45 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) |
44 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; | 46 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; |
45 | 47 |
46 // Returns true if more than 5% of the samples are at min or max value. | 48 // Returns true if more than 5% of the samples are at min or max value. |
47 bool DetectClipping(const speech::AudioChunk& chunk) { | 49 bool DetectClipping(const speech::AudioChunk& chunk) { |
48 const int num_samples = chunk.NumSamples(); | 50 const int num_samples = chunk.NumSamples(); |
49 const int16* samples = chunk.SamplesData16(); | 51 const int16* samples = chunk.SamplesData16(); |
50 const int kThreshold = num_samples / 20; | 52 const int kThreshold = num_samples / 20; |
51 int clipping_samples = 0; | 53 int clipping_samples = 0; |
| 54 |
52 for (int i = 0; i < num_samples; ++i) { | 55 for (int i = 0; i < num_samples; ++i) { |
53 if (samples[i] <= -32767 || samples[i] >= 32767) { | 56 if (samples[i] <= -32767 || samples[i] >= 32767) { |
54 if (++clipping_samples > kThreshold) | 57 if (++clipping_samples > kThreshold) |
55 return true; | 58 return true; |
56 } | 59 } |
57 } | 60 } |
58 return false; | 61 return false; |
59 } | 62 } |
60 | 63 |
61 } // namespace | 64 } // namespace |
62 | 65 |
63 SpeechRecognizer* SpeechRecognizer::Create( | 66 SpeechRecognizer* SpeechRecognizer::Create( |
64 SpeechRecognitionEventListener* listener, | 67 SpeechRecognitionEventListener* listener, |
65 int caller_id, | 68 int caller_id, |
66 const std::string& language, | 69 const std::string& language, |
67 const std::string& grammar, | 70 const std::string& grammar, |
68 net::URLRequestContextGetter* context_getter, | 71 net::URLRequestContextGetter* context_getter, |
69 bool filter_profanities, | 72 bool filter_profanities, |
70 const std::string& hardware_info, | 73 const std::string& hardware_info, |
71 const std::string& origin_url) { | 74 const std::string& origin_url) { |
| 75 speech::GoogleOneShotRemoteEngineConfig remote_engine_config; |
| 76 remote_engine_config.language = language; |
| 77 remote_engine_config.grammar = grammar; |
| 78 remote_engine_config.audio_sample_rate = |
| 79 speech::SpeechRecognizerImpl::kAudioSampleRate; |
| 80 remote_engine_config.audio_num_bits_per_sample = |
| 81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; |
| 82 remote_engine_config.filter_profanities = filter_profanities; |
| 83 remote_engine_config.hardware_info = hardware_info; |
| 84 remote_engine_config.origin_url = origin_url; |
| 85 |
| 86 // SpeechRecognizerImpl takes ownership of google_remote_engine. |
| 87 speech::GoogleOneShotRemoteEngine* google_remote_engine = |
| 88 new speech::GoogleOneShotRemoteEngine(context_getter); |
| 89 google_remote_engine->SetConfig(remote_engine_config); |
| 90 |
72 return new speech::SpeechRecognizerImpl(listener, | 91 return new speech::SpeechRecognizerImpl(listener, |
73 caller_id, | 92 caller_id, |
74 language, | 93 google_remote_engine); |
75 grammar, | |
76 context_getter, | |
77 filter_profanities, | |
78 hardware_info, | |
79 origin_url); | |
80 } | 94 } |
81 | 95 |
82 namespace speech { | 96 namespace speech { |
83 | 97 |
84 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 98 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
85 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; | 99 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; |
86 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 100 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
87 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; | 101 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
88 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 102 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
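| // (300 ms corresponds to 4800 samples at kAudioSampleRate; elapsed time is |
| // measured in samples, see GetElapsedTimeMs().) |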
89 | 103 |
| 104 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, |
| 105 kNumBitsPerAudioSample_must_be_a_multiple_of_8); |
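| // (AudioChunk below is constructed with kNumBitsPerAudioSample / 8 bytes per |
| // sample, so samples must span a whole number of bytes.) |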
| 106 |
90 SpeechRecognizerImpl::SpeechRecognizerImpl( | 107 SpeechRecognizerImpl::SpeechRecognizerImpl( |
91 SpeechRecognitionEventListener* listener, | 108 SpeechRecognitionEventListener* listener, |
92 int caller_id, | 109 int caller_id, |
93 const std::string& language, | 110 SpeechRecognitionEngine* engine) |
94 const std::string& grammar, | |
95 net::URLRequestContextGetter* context_getter, | |
96 bool filter_profanities, | |
97 const std::string& hardware_info, | |
98 const std::string& origin_url) | |
99 : listener_(listener), | 111 : listener_(listener), |
100 testing_audio_manager_(NULL), | 112 testing_audio_manager_(NULL), |
| 113 recognition_engine_(engine), |
101 endpointer_(kAudioSampleRate), | 114 endpointer_(kAudioSampleRate), |
102 context_getter_(context_getter), | |
103 caller_id_(caller_id), | 115 caller_id_(caller_id), |
104 language_(language), | 116 is_dispatching_event_(false), |
105 grammar_(grammar), | 117 state_(STATE_IDLE) { |
106 filter_profanities_(filter_profanities), | |
107 hardware_info_(hardware_info), | |
108 origin_url_(origin_url), | |
109 num_samples_recorded_(0), | |
110 audio_level_(0.0f) { | |
111 DCHECK(listener_ != NULL); | 118 DCHECK(listener_ != NULL); |
| 119 DCHECK(recognition_engine_ != NULL); |
112 endpointer_.set_speech_input_complete_silence_length( | 120 endpointer_.set_speech_input_complete_silence_length( |
113 base::Time::kMicrosecondsPerSecond / 2); | 121 base::Time::kMicrosecondsPerSecond / 2); |
114 endpointer_.set_long_speech_input_complete_silence_length( | 122 endpointer_.set_long_speech_input_complete_silence_length( |
115 base::Time::kMicrosecondsPerSecond); | 123 base::Time::kMicrosecondsPerSecond); |
116 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
117 endpointer_.StartSession(); | 125 endpointer_.StartSession(); |
| 126 recognition_engine_->set_delegate(this); |
118 } | 127 } |
119 | 128 |
120 SpeechRecognizerImpl::~SpeechRecognizerImpl() { | 129 SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
121 // Recording should have stopped earlier due to the endpointer or | |
122 // |StopRecording| being called. | |
123 DCHECK(!audio_controller_.get()); | |
124 DCHECK(!recognition_engine_.get() || | |
125 !recognition_engine_->IsRecognitionPending()); | |
126 endpointer_.EndSession(); | 130 endpointer_.EndSession(); |
127 } | 131 } |
128 | 132 |
| 133 // ------- Methods that trigger Finite State Machine (FSM) events ------------ |
| 134 |
| 135 // NOTE: All the external events and requests should be enqueued (PostTask), even |
| 136 // if they come from the same (IO) thread, in order to preserve the relationship |
| 137 // of causality between events and avoid interleaved event processing due to |
| 138 // synchronous callbacks. |
| 139 |
129 void SpeechRecognizerImpl::StartRecognition() { | 140 void SpeechRecognizerImpl::StartRecognition() { |
| 141 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 142 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 143 this, FSMEventArgs(EVENT_START))); |
| 144 } |
| 145 |
| 146 void SpeechRecognizerImpl::AbortRecognition() { |
| 147 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 148 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 149 this, FSMEventArgs(EVENT_ABORT))); |
| 150 } |
| 151 |
| 152 void SpeechRecognizerImpl::StopAudioCapture() { |
| 153 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 154 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 155 this, FSMEventArgs(EVENT_STOP_CAPTURE))); |
| 156 } |
| 157 |
| 158 bool SpeechRecognizerImpl::IsActive() const { |
| 159 // Checking the FSM state from another thread (thus, while the FSM is |
| 160 // potentially concurrently evolving) is meaningless. |
130 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 161 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
131 DCHECK(!audio_controller_.get()); | 162 return state_ != STATE_IDLE; |
132 DCHECK(!recognition_engine_.get() || | 163 } |
133 !recognition_engine_->IsRecognitionPending()); | 164 |
134 | 165 bool SpeechRecognizerImpl::IsCapturingAudio() const { |
135 // The endpointer needs to estimate the environment/background noise before | 166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
136 // starting to treat the audio as user input. In |HandleOnData| we wait until | 167 const bool is_capturing_audio = state_ >= STATE_STARTING && |
137 // such time has passed before switching to user input mode. | 168 state_ <= STATE_RECOGNIZING; |
138 endpointer_.SetEnvironmentEstimationMode(); | 169 DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || |
139 | 170 (!is_capturing_audio && audio_controller_.get() == NULL)); |
140 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? | 171 return is_capturing_audio; |
141 testing_audio_manager_ : BrowserMainLoop::GetAudioManager(); | |
142 const int samples_per_packet = kAudioSampleRate * | |
143 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; | |
144 media::AudioParameters params( | |
145 media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, | |
146 kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet); | |
147 audio_controller_ = AudioInputController::Create(audio_manager, this, params); | |
148 DCHECK(audio_controller_.get()); | |
149 VLOG(1) << "SpeechRecognizer starting record."; | |
150 num_samples_recorded_ = 0; | |
151 audio_controller_->Record(); | |
152 } | |
153 | |
154 void SpeechRecognizerImpl::AbortRecognition() { | |
155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
156 DCHECK(audio_controller_.get() || recognition_engine_.get()); | |
157 | |
158 // Stop recording if required. | |
159 if (audio_controller_.get()) { | |
160 CloseAudioControllerAsynchronously(); | |
161 } | |
162 | |
163 VLOG(1) << "SpeechRecognizer canceling recognition."; | |
164 recognition_engine_.reset(); | |
165 } | |
166 | |
167 void SpeechRecognizerImpl::StopAudioCapture() { | |
168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
169 | |
170 // If audio recording has already stopped and we are in recognition phase, | |
171 // silently ignore any more calls to stop recording. | |
172 if (!audio_controller_.get()) | |
173 return; | |
174 | |
175 CloseAudioControllerAsynchronously(); | |
176 listener_->OnSoundEnd(caller_id_); | |
177 listener_->OnAudioEnd(caller_id_); | |
178 | |
179 // If we haven't got any audio yet end the recognition sequence here. | |
180 if (recognition_engine_ == NULL) { | |
181 // Guard against the listener freeing us until we finish our job. | |
182 scoped_refptr<SpeechRecognizerImpl> me(this); | |
183 listener_->OnRecognitionEnd(caller_id_); | |
184 } else { | |
185 recognition_engine_->AudioChunksEnded(); | |
186 } | |
187 } | 172 } |
188 | 173 |
189 // Invoked in the audio thread. | 174 // Invoked in the audio thread. |
190 void SpeechRecognizerImpl::OnError(AudioInputController* controller, | 175 void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
191 int error_code) { | 176 int error_code) { |
192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 177 FSMEventArgs event_args(EVENT_AUDIO_ERROR); |
193 base::Bind(&SpeechRecognizerImpl::HandleOnError, | 178 event_args.audio_error_code = error_code; |
194 this, error_code)); | 179 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
195 } | 180 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
196 | 181 this, event_args)); |
197 void SpeechRecognizerImpl::HandleOnError(int error_code) { | |
198 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; | |
199 | |
200 // Check if we are still recording before canceling recognition, as | |
201 // recording might have been stopped after this error was posted to the queue | |
202 // by |OnError|. | |
203 if (!audio_controller_.get()) | |
204 return; | |
205 | |
206 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); | |
207 } | 182 } |
208 | 183 |
209 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | 184 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
210 const uint8* data, uint32 size) { | 185 const uint8* data, uint32 size) { |
211 if (size == 0) // This could happen when recording stops and is normal. | 186 if (size == 0) // This can happen when audio capture stops; it is normal. |
212 return; | 187 return; |
213 scoped_refptr<AudioChunk> raw_audio( | 188 |
214 new AudioChunk(data, | 189 FSMEventArgs event_args(EVENT_AUDIO_DATA); |
215 static_cast<size_t>(size), | 190 event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size), |
216 kNumBitsPerAudioSample / 8)); | 191 kNumBitsPerAudioSample / 8); |
217 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
218 base::Bind(&SpeechRecognizerImpl::HandleOnData, | 193 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
219 this, raw_audio)); | 194 this, event_args)); |
220 } | |
221 | |
222 void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) { | |
223 // Check if we are still recording and if not discard this buffer, as | |
224 // recording might have been stopped after this buffer was posted to the queue | |
225 // by |OnData|. | |
226 if (!audio_controller_.get()) | |
227 return; | |
228 | |
229 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); | |
230 | |
231 float rms; | |
232 endpointer_.ProcessAudio(*raw_audio, &rms); | |
233 bool did_clip = DetectClipping(*raw_audio); | |
234 num_samples_recorded_ += raw_audio->NumSamples(); | |
235 | |
236 if (recognition_engine_ == NULL) { | |
237 // This was the first audio packet recorded, so start a request to the | |
238 // server to send the data and inform the listener. | |
239 listener_->OnAudioStart(caller_id_); | |
240 GoogleOneShotRemoteEngineConfig google_sr_config; | |
241 google_sr_config.language = language_; | |
242 google_sr_config.grammar = grammar_; | |
243 google_sr_config.audio_sample_rate = kAudioSampleRate; | |
244 google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample; | |
245 google_sr_config.filter_profanities = filter_profanities_; | |
246 google_sr_config.hardware_info = hardware_info_; | |
247 google_sr_config.origin_url = origin_url_; | |
248 GoogleOneShotRemoteEngine* google_sr_engine = | |
249 new GoogleOneShotRemoteEngine(context_getter_.get()); | |
250 google_sr_engine->SetConfig(google_sr_config); | |
251 recognition_engine_.reset(google_sr_engine); | |
252 recognition_engine_->set_delegate(this); | |
253 recognition_engine_->StartRecognition(); | |
254 } | |
255 | |
256 recognition_engine_->TakeAudioChunk(*raw_audio); | |
257 | |
258 if (endpointer_.IsEstimatingEnvironment()) { | |
259 // Check if we have gathered enough audio for the endpointer to do | |
260 // environment estimation and should move on to detect speech/end of speech. | |
261 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | |
262 kAudioSampleRate) / 1000) { | |
263 endpointer_.SetUserInputMode(); | |
264 listener_->OnEnvironmentEstimationComplete(caller_id_); | |
265 } | |
266 return; // No more processing since we are still estimating environment. | |
267 } | |
268 | |
269 // Check if we have waited too long without hearing any speech. | |
270 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); | |
271 if (!speech_was_heard_after_packet && | |
272 num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) { | |
273 InformErrorAndAbortRecognition( | |
274 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); | |
275 return; | |
276 } | |
277 | |
278 if (!speech_was_heard_before_packet && speech_was_heard_after_packet) | |
279 listener_->OnSoundStart(caller_id_); | |
280 | |
281 // Calculate the input volume to display in the UI, smoothing towards the | |
282 // new level. | |
283 float level = (rms - kAudioMeterMinDb) / | |
284 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
285 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); | |
286 if (level > audio_level_) { | |
287 audio_level_ += (level - audio_level_) * kUpSmoothingFactor; | |
288 } else { | |
289 audio_level_ += (level - audio_level_) * kDownSmoothingFactor; | |
290 } | |
291 | |
292 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | |
293 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
294 noise_level = std::min(std::max(0.0f, noise_level), | |
295 kAudioMeterRangeMaxUnclipped); | |
296 | |
297 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, | |
298 noise_level); | |
299 | |
300 if (endpointer_.speech_input_complete()) | |
301 StopAudioCapture(); | |
302 } | 195 } |
303 | 196 |
304 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | 197 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
305 | 198 |
306 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( | 199 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( |
307 const content::SpeechRecognitionResult& result) { | 200 const content::SpeechRecognitionResult& result) { |
308 // Guard against the listener freeing us until we finish our job. | 201 FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
| 202 event_args.engine_result = result; |
| 203 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 204 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 205 this, event_args)); |
| 206 } |
| 207 |
| 208 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( |
| 209 const content::SpeechRecognitionError& error) { |
| 210 FSMEventArgs event_args(EVENT_ENGINE_ERROR); |
| 211 event_args.engine_error = error; |
| 212 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 213 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 214 this, event_args)); |
| 215 } |
| 216 |
| 217 // ----------------------- Core FSM implementation --------------------------- |
| 218 // TODO(primiano) After the changes in the media package (r129173), this class |
| 219 // slightly violates the SpeechRecognitionEventListener interface contract. In |
| 220 // particular, it is not true anymore that this class can be freed after the |
| 221 // OnRecognitionEnd event, since the audio_controller_.Close() asynchronous |
| 222 call can still be in progress after the end event. Currently, it does not |
| 223 // represent a problem for the browser itself, since refcounting protects us |
| 224 // against such race conditions. However, we should fix this in the next CLs. |
| 225 // For instance, tests are currently working just because the |
| 226 // TestAudioInputController is not closing asynchronously as the real controller |
| 227 does, but they will become flaky once TestAudioInputController is fixed. |
| 228 |
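| // For reference, the expected FSM evolution in the absence of errors and |
| // aborts (a sketch derived from the transition table below): |
| //   IDLE --START--> STARTING --AUDIO_DATA--> ESTIMATING_ENVIRONMENT |
| //   --estimation done--> WAITING_FOR_SPEECH --speech detected--> RECOGNIZING |
| //   --end of speech--> WAITING_FINAL_RESULT --ENGINE_RESULT--> IDLE |
| |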
| 229 void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { |
| 230 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 231 DCHECK_LE(event_args.event, EVENT_MAX_VALUE); |
| 232 DCHECK_LE(state_, STATE_MAX_VALUE); |
| 233 |
| 234 // Event dispatching must be sequential; otherwise it would break the rules |
| 235 // and assumptions of the finite state automaton model. |
| 236 DCHECK(!is_dispatching_event_); |
| 237 is_dispatching_event_ = true; |
| 238 |
| 239 // Guard against the delegate freeing us until we finish processing the event. |
309 scoped_refptr<SpeechRecognizerImpl> me(this); | 240 scoped_refptr<SpeechRecognizerImpl> me(this); |
| 241 |
| 242 if (event_args.event == EVENT_AUDIO_DATA) { |
| 243 DCHECK(event_args.audio_data.get() != NULL); |
| 244 ProcessAudioPipeline(*event_args.audio_data); |
| 245 } |
| 246 |
| 247 // The audio pipeline must be processed before the event dispatch, otherwise |
| 248 // it would take actions based on the future state instead of the current one. |
| 249 state_ = ExecuteTransitionAndGetNextState(event_args); |
| 250 |
| 251 is_dispatching_event_ = false; |
| 252 } |
| 253 |
| 254 SpeechRecognizerImpl::FSMState |
| 255 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( |
| 256 const FSMEventArgs& event_args) { |
| 257 const FSMEvent event = event_args.event; |
| 258 switch (state_) { |
| 259 case STATE_IDLE: |
| 260 switch (event) { |
| 261 // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and |
| 262 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. |
| 263 case EVENT_ABORT: |
| 264 return DoNothing(event_args); |
| 265 case EVENT_START: |
| 266 return StartRecording(event_args); |
| 267 case EVENT_STOP_CAPTURE: // Corner cases related to queued messages |
| 268 case EVENT_AUDIO_DATA: // being dispatched late. |
| 269 case EVENT_ENGINE_RESULT: |
| 270 case EVENT_ENGINE_ERROR: |
| 271 case EVENT_AUDIO_ERROR: |
| 272 return DoNothing(event_args); |
| 273 } |
| 274 break; |
| 275 case STATE_STARTING: |
| 276 switch (event) { |
| 277 case EVENT_ABORT: |
| 278 return Abort(event_args); |
| 279 case EVENT_START: |
| 280 return NotFeasible(event_args); |
| 281 case EVENT_STOP_CAPTURE: |
| 282 return Abort(event_args); |
| 283 case EVENT_AUDIO_DATA: |
| 284 return StartRecognitionEngine(event_args); |
| 285 case EVENT_ENGINE_RESULT: |
| 286 return NotFeasible(event_args); |
| 287 case EVENT_ENGINE_ERROR: |
| 288 case EVENT_AUDIO_ERROR: |
| 289 return Abort(event_args); |
| 290 } |
| 291 break; |
| 292 case STATE_ESTIMATING_ENVIRONMENT: |
| 293 switch (event) { |
| 294 case EVENT_ABORT: |
| 295 return Abort(event_args); |
| 296 case EVENT_START: |
| 297 return NotFeasible(event_args); |
| 298 case EVENT_STOP_CAPTURE: |
| 299 return StopCaptureAndWaitForResult(event_args); |
| 300 case EVENT_AUDIO_DATA: |
| 301 return WaitEnvironmentEstimationCompletion(event_args); |
| 302 case EVENT_ENGINE_RESULT: |
| 303 return ProcessIntermediateResult(event_args); |
| 304 case EVENT_ENGINE_ERROR: |
| 305 case EVENT_AUDIO_ERROR: |
| 306 return Abort(event_args); |
| 307 } |
| 308 break; |
| 309 case STATE_WAITING_FOR_SPEECH: |
| 310 switch (event) { |
| 311 case EVENT_ABORT: |
| 312 return Abort(event_args); |
| 313 case EVENT_START: |
| 314 return NotFeasible(event_args); |
| 315 case EVENT_STOP_CAPTURE: |
| 316 return StopCaptureAndWaitForResult(event_args); |
| 317 case EVENT_AUDIO_DATA: |
| 318 return DetectUserSpeechOrTimeout(event_args); |
| 319 case EVENT_ENGINE_RESULT: |
| 320 return ProcessIntermediateResult(event_args); |
| 321 case EVENT_ENGINE_ERROR: |
| 322 case EVENT_AUDIO_ERROR: |
| 323 return Abort(event_args); |
| 324 } |
| 325 break; |
| 326 case STATE_RECOGNIZING: |
| 327 switch (event) { |
| 328 case EVENT_ABORT: |
| 329 return Abort(event_args); |
| 330 case EVENT_START: |
| 331 return NotFeasible(event_args); |
| 332 case EVENT_STOP_CAPTURE: |
| 333 return StopCaptureAndWaitForResult(event_args); |
| 334 case EVENT_AUDIO_DATA: |
| 335 return DetectEndOfSpeech(event_args); |
| 336 case EVENT_ENGINE_RESULT: |
| 337 return ProcessIntermediateResult(event_args); |
| 338 case EVENT_ENGINE_ERROR: |
| 339 case EVENT_AUDIO_ERROR: |
| 340 return Abort(event_args); |
| 341 } |
| 342 break; |
| 343 case STATE_WAITING_FINAL_RESULT: |
| 344 switch (event) { |
| 345 case EVENT_ABORT: |
| 346 return Abort(event_args); |
| 347 case EVENT_START: |
| 348 return NotFeasible(event_args); |
| 349 case EVENT_STOP_CAPTURE: |
| 350 case EVENT_AUDIO_DATA: |
| 351 return DoNothing(event_args); |
| 352 case EVENT_ENGINE_RESULT: |
| 353 return ProcessFinalResult(event_args); |
| 354 case EVENT_ENGINE_ERROR: |
| 355 case EVENT_AUDIO_ERROR: |
| 356 return Abort(event_args); |
| 357 } |
| 358 break; |
| 359 } |
| 360 return NotFeasible(event_args); |
| 361 } |
| 362 |
| 363 // ----------- Contract for all the FSM evolution functions below ------------- |
| 364 // - Are guaranteed to be executed on the IO thread; |
| 365 // - Are guaranteed not to be reentrant (with themselves or with each other); |
| 366 // - event_args members are guaranteed to be stable during the call; |
| 367 // - The class won't be freed in the meantime due to callbacks; |
| 368 // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. |
| 369 |
| 370 // TODO(primiano) The audio pipeline is currently serial. However, the |
| 371 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. |
| 372 // We should profile the execution to see whether it would be worth it. |
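| // Audio routing per FSM state (as implemented by the checks below): |
| //   STATE_STARTING: no routing (samples are only counted); |
| //   STATE_ESTIMATING_ENVIRONMENT: endpointer + recognition engine; |
| //   STATE_WAITING_FOR_SPEECH and STATE_RECOGNIZING: endpointer + engine + |
| //   vu-meter. |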
| 373 void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { |
| 374 const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && |
| 375 state_ <= STATE_RECOGNIZING; |
| 376 const bool route_to_sr_engine = route_to_endpointer; |
| 377 const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && |
| 378 state_ <= STATE_RECOGNIZING; |
| 379 const bool clip_detected = DetectClipping(raw_audio); |
| 380 float rms = 0.0f; |
| 381 |
| 382 num_samples_recorded_ += raw_audio.NumSamples(); |
| 383 |
| 384 if (route_to_endpointer) |
| 385 endpointer_.ProcessAudio(raw_audio, &rms); |
| 386 |
| 387 if (route_to_vumeter) { |
| 388 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. |
| 389 UpdateSignalAndNoiseLevels(rms, clip_detected); |
| 390 } |
| 391 if (route_to_sr_engine) { |
| 392 DCHECK(recognition_engine_.get() != NULL); |
| 393 recognition_engine_->TakeAudioChunk(raw_audio); |
| 394 } |
| 395 } |
| 396 |
| 397 SpeechRecognizerImpl::FSMState |
| 398 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
| 399 DCHECK(recognition_engine_.get() != NULL); |
| 400 DCHECK(!IsCapturingAudio()); |
| 401 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
| 402 testing_audio_manager_ : |
| 403 BrowserMainLoop::GetAudioManager(); |
| 404 DCHECK(audio_manager != NULL); |
| 405 |
| 406 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
| 407 num_samples_recorded_ = 0; |
| 408 audio_level_ = 0; |
| 409 listener_->OnRecognitionStart(caller_id_); |
| 410 |
| 411 if (!audio_manager->HasAudioInputDevices()) { |
| 412 return AbortWithError(SpeechRecognitionError( |
| 413 content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| 414 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
| 415 } |
| 416 |
| 417 if (audio_manager->IsRecordingInProcess()) { |
| 418 return AbortWithError(SpeechRecognitionError( |
| 419 content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| 420 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); |
| 421 } |
| 422 |
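| // E.g., assuming the engine asks for 100 ms chunks (the actual duration |
| // comes from GetDesiredAudioChunkDurationMs()), samples_per_packet would be |
| // (16000 * 100) / 1000 = 1600. |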
| 423 const int samples_per_packet = (kAudioSampleRate * |
| 424 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; |
| 425 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
| 426 kAudioSampleRate, kNumBitsPerAudioSample, |
| 427 samples_per_packet); |
| 428 audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
| 429 |
| 430 if (audio_controller_.get() == NULL) { |
| 431 return AbortWithError( |
| 432 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
| 433 } |
| 434 |
| 435 // The endpointer needs to estimate the environment/background noise before |
| 436 // starting to treat the audio as user input. We wait in the state |
| 437 // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching |
| 438 // to user input mode. |
| 439 endpointer_.SetEnvironmentEstimationMode(); |
| 440 audio_controller_->Record(); |
| 441 return STATE_STARTING; |
| 442 } |
| 443 |
| 444 SpeechRecognizerImpl::FSMState |
| 445 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { |
| 446 // This is the first audio packet captured, so the recognition engine is |
| 447 // started and the delegate notified about the event. |
| 448 DCHECK(recognition_engine_.get() != NULL); |
| 449 recognition_engine_->StartRecognition(); |
| 450 listener_->OnAudioStart(caller_id_); |
| 451 |
| 452 // This is a little hack: TakeAudioChunk() is normally called by |
| 453 // ProcessAudioPipeline(), but not while still in STATE_STARTING, so this |
| 454 // first chunk is forwarded here explicitly rather than being dropped. |
| 455 recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); |
| 456 return STATE_ESTIMATING_ENVIRONMENT; |
| 457 } |
| 458 |
| 459 SpeechRecognizerImpl::FSMState |
| 460 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { |
| 461 DCHECK(endpointer_.IsEstimatingEnvironment()); |
| 462 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { |
| 463 endpointer_.SetUserInputMode(); |
| 464 listener_->OnEnvironmentEstimationComplete(caller_id_); |
| 465 return STATE_WAITING_FOR_SPEECH; |
| 466 } else { |
| 467 return STATE_ESTIMATING_ENVIRONMENT; |
| 468 } |
| 469 } |
| 470 |
| 471 SpeechRecognizerImpl::FSMState |
| 472 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { |
| 473 if (endpointer_.DidStartReceivingSpeech()) { |
| 474 listener_->OnSoundStart(caller_id_); |
| 475 return STATE_RECOGNIZING; |
| 476 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { |
| 477 return AbortWithError( |
| 478 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); |
| 479 } |
| 480 return STATE_WAITING_FOR_SPEECH; |
| 481 } |
| 482 |
| 483 SpeechRecognizerImpl::FSMState |
| 484 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { |
| 485 if (endpointer_.speech_input_complete()) { |
| 486 return StopCaptureAndWaitForResult(event_args); |
| 487 } |
| 488 return STATE_RECOGNIZING; |
| 489 } |
| 490 |
| 491 SpeechRecognizerImpl::FSMState |
| 492 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { |
| 493 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); |
| 494 |
| 495 DVLOG(1) << "Concluding recognition"; |
| 496 CloseAudioControllerAsynchronously(); |
| 497 recognition_engine_->AudioChunksEnded(); |
| 498 |
| 499 if (state_ > STATE_WAITING_FOR_SPEECH) |
| 500 listener_->OnSoundEnd(caller_id_); |
| 501 |
| 502 listener_->OnAudioEnd(caller_id_); |
| 503 return STATE_WAITING_FINAL_RESULT; |
| 504 } |
| 505 |
| 506 SpeechRecognizerImpl::FSMState |
| 507 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) { |
| 508 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the absence |
| 509 // of more specific error sources (that is, when the abort was explicitly |
| 510 // requested). However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently |
| 511 // handled by ChromeSpeechRecognitionManagerDelegate and would cause an |
| 512 // exception. JS support will probably need it in the future. |
| 513 if (event_args.event == EVENT_AUDIO_ERROR) { |
| 514 return AbortWithError( |
| 515 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
| 516 } else if (event_args.event == EVENT_ENGINE_ERROR) { |
| 517 return AbortWithError(event_args.engine_error); |
| 518 } |
| 519 return AbortWithError(NULL); |
| 520 } |
| 521 |
| 522 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( |
| 523 const SpeechRecognitionError& error) { |
| 524 return AbortWithError(&error); |
| 525 } |
| 526 |
| 527 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( |
| 528 const SpeechRecognitionError* error) { |
| 529 if (IsCapturingAudio()) |
| 530 CloseAudioControllerAsynchronously(); |
| 531 |
| 532 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
| 533 |
| 534 // The recognition engine is started only after STATE_STARTING. |
| 535 if (state_ > STATE_STARTING) { |
| 536 DCHECK(recognition_engine_.get() != NULL); |
| 537 recognition_engine_->EndRecognition(); |
| 538 } |
| 539 |
| 540 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
| 541 listener_->OnSoundEnd(caller_id_); |
| 542 |
| 543 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
| 544 listener_->OnAudioEnd(caller_id_); |
| 545 |
| 546 if (error != NULL) |
| 547 listener_->OnRecognitionError(caller_id_, *error); |
| 548 |
| 549 listener_->OnRecognitionEnd(caller_id_); |
| 550 |
| 551 return STATE_IDLE; |
| 552 } |
| 553 |
| 554 SpeechRecognizerImpl::FSMState |
| 555 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { |
| 556 // This is in preparation for future speech recognition functions. |
| 557 NOTREACHED(); |
| 558 return state_; |
| 559 } |
| 560 |
| 561 SpeechRecognizerImpl::FSMState |
| 562 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { |
| 563 const SpeechRecognitionResult& result = event_args.engine_result; |
| 564 DVLOG(1) << "Got valid result"; |
| 565 recognition_engine_->EndRecognition(); |
310 listener_->OnRecognitionResult(caller_id_, result); | 566 listener_->OnRecognitionResult(caller_id_, result); |
311 listener_->OnRecognitionEnd(caller_id_); | 567 listener_->OnRecognitionEnd(caller_id_); |
312 } | 568 return STATE_IDLE; |
313 | 569 } |
314 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( | 570 |
315 const content::SpeechRecognitionError& error) { | 571 SpeechRecognizerImpl::FSMState |
316 InformErrorAndAbortRecognition(error.code); | 572 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { |
317 } | 573 return state_; // Just keep the current state. |
318 | 574 } |
319 void SpeechRecognizerImpl::InformErrorAndAbortRecognition( | 575 |
320 content::SpeechRecognitionErrorCode error) { | 576 SpeechRecognizerImpl::FSMState |
321 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); | 577 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { |
322 AbortRecognition(); | 578 NOTREACHED() << "Unfeasible event " << event_args.event |
323 | 579 << " in state " << state_; |
324 // Guard against the listener freeing us until we finish our job. | 580 return state_; |
325 scoped_refptr<SpeechRecognizerImpl> me(this); | |
326 listener_->OnRecognitionError(caller_id_, error); | |
327 } | 581 } |
328 | 582 |
329 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { | 583 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { |
330 VLOG(1) << "SpeechRecognizer stopping record."; | 584 DCHECK(IsCapturingAudio()); |
| 585 DVLOG(1) << "SpeechRecognizerImpl stopping audio capture."; |
331 // Issues a Close on the audio controller, passing an empty callback. The only | 586 // Issues a Close on the audio controller, passing an empty callback. The only |
332 // purpose of such a callback is to keep the audio controller refcounted until | 587 // purpose of such a callback is to keep the audio controller refcounted until |
333 // Close has completed (in the audio thread) and automatically destroy it | 588 // Close has completed (in the audio thread) and automatically destroy it |
334 // afterwards (upon return from OnAudioClosed). | 589 // afterwards (upon return from OnAudioClosed). |
335 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, | 590 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, |
336 this, audio_controller_)); | 591 this, audio_controller_)); |
337 audio_controller_ = NULL; // The controller is still refcounted by Bind. | 592 audio_controller_ = NULL; // The controller is still refcounted by Bind. |
338 } | 593 } |
339 | 594 |
340 bool SpeechRecognizerImpl::IsActive() const { | 595 int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
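| // E.g., 8000 recorded samples at 16000 Hz -> (8000 * 1000) / 16000 = 500 ms. |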
341 return (recognition_engine_.get() != NULL); | 596 return (num_samples_recorded_ * 1000) / kAudioSampleRate; |
342 } | 597 } |
343 | 598 |
344 bool SpeechRecognizerImpl::IsCapturingAudio() const { | 599 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, |
345 return (audio_controller_.get() != NULL); | 600 bool clip_detected) { |
| 601 // Calculate the input volume to display in the UI, smoothing towards the |
| 602 // new level. |
| 603 // TODO(primiano) Do we really need all this floating point arithmetic here? |
| 604 // It might be quite expensive on mobile. |
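| // E.g., an |rms| of 60 dB maps to (60 - 30) / (60.31 / (47/48)) ~= 0.49 |
| // before smoothing. |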
| 605 float level = (rms - kAudioMeterMinDb) / |
| 606 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 607 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
| 608 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : |
| 609 kDownSmoothingFactor; |
| 610 audio_level_ += (level - audio_level_) * smoothing_factor; |
| 611 |
| 612 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| 613 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 614 noise_level = std::min(std::max(0.0f, noise_level), |
| 615 kAudioMeterRangeMaxUnclipped); |
| 616 |
| 617 listener_->OnAudioLevelsChange( |
| 618 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level); |
346 } | 619 } |
347 | 620 |
348 const SpeechRecognitionEngine& | 621 const SpeechRecognitionEngine& |
349 SpeechRecognizerImpl::recognition_engine() const { | 622 SpeechRecognizerImpl::recognition_engine() const { |
350 return *(recognition_engine_.get()); | 623 return *(recognition_engine_.get()); |
351 } | 624 } |
352 | 625 |
353 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 626 void SpeechRecognizerImpl::SetAudioManagerForTesting( |
354 AudioManager* audio_manager) { | 627 AudioManager* audio_manager) { |
355 testing_audio_manager_ = audio_manager; | 628 testing_audio_manager_ = audio_manager; |
356 } | 629 } |
357 | 630 |
| 631 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 632 : event(event_value), |
| 633 audio_error_code(0), |
| 634 audio_data(NULL), |
| 635 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) { |
| 636 } |
| 637 |
| 638 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| 639 } |
358 | 640 |
359 } // namespace speech | 641 } // namespace speech |