Chromium Code Reviews

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 9835049: Speech refactoring: Reimplemented speech_recognizer as a FSM. (CL1.5) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Added CONTENT_EXPORT on GoogleOneShotRemoteEngineConfig to address compilation issues on win. Created 8 years, 8 months ago
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/browser/speech/speech_recognizer_impl.h" 5 #include "content/browser/speech/speech_recognizer_impl.h"
6 6
7 #include "base/basictypes.h"
7 #include "base/bind.h" 8 #include "base/bind.h"
8 #include "base/time.h" 9 #include "base/time.h"
9 #include "content/browser/browser_main_loop.h" 10 #include "content/browser/browser_main_loop.h"
10 #include "content/browser/speech/audio_buffer.h" 11 #include "content/browser/speech/audio_buffer.h"
11 #include "content/browser/speech/google_one_shot_remote_engine.h" 12 #include "content/browser/speech/google_one_shot_remote_engine.h"
12 #include "content/public/browser/browser_thread.h" 13 #include "content/public/browser/browser_thread.h"
13 #include "content/public/browser/speech_recognition_event_listener.h" 14 #include "content/public/browser/speech_recognition_event_listener.h"
14 #include "content/public/browser/speech_recognizer.h" 15 #include "content/public/browser/speech_recognizer.h"
15 #include "content/public/common/speech_recognition_error.h" 16 #include "content/public/common/speech_recognition_error.h"
16 #include "content/public/common/speech_recognition_result.h" 17 #include "content/public/common/speech_recognition_result.h"
17 #include "net/url_request/url_request_context_getter.h" 18 #include "net/url_request/url_request_context_getter.h"
18 19
19 using content::BrowserMainLoop; 20 using content::BrowserMainLoop;
20 using content::BrowserThread; 21 using content::BrowserThread;
21 using content::SpeechRecognitionError; 22 using content::SpeechRecognitionError;
22 using content::SpeechRecognitionEventListener; 23 using content::SpeechRecognitionEventListener;
23 using content::SpeechRecognitionResult; 24 using content::SpeechRecognitionResult;
24 using content::SpeechRecognizer; 25 using content::SpeechRecognizer;
25 using media::AudioInputController; 26 using media::AudioInputController;
26 using media::AudioManager; 27 using media::AudioManager;
28 using media::AudioParameters;
27 29
28 namespace { 30 namespace {
29 31
30 // The following constants are related to the volume level indicator shown in 32 // The following constants are related to the volume level indicator shown in
31 // the UI for recorded audio. 33 // the UI for recorded audio.
32 // Multiplier used when new volume is greater than previous level. 34 // Multiplier used when new volume is greater than previous level.
33 const float kUpSmoothingFactor = 1.0f; 35 const float kUpSmoothingFactor = 1.0f;
34 // Multiplier used when new volume is less than previous level. 36 // Multiplier used when new volume is less than previous level.
35 const float kDownSmoothingFactor = 0.7f; 37 const float kDownSmoothingFactor = 0.7f;
36 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. 38 // RMS dB value of a maximum (unclipped) sine wave for int16 samples.
37 const float kAudioMeterMaxDb = 90.31f; 39 const float kAudioMeterMaxDb = 90.31f;
38 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0. 40 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.
39 // Values lower than this will display as empty level-meter. 41 // Values lower than this will display as empty level-meter.
40 const float kAudioMeterMinDb = 30.0f; 42 const float kAudioMeterMinDb = 30.0f;
41 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; 43 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;
42 44
43 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) 45 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
44 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; 46 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;
45 47
46 // Returns true if more than 5% of the samples are at min or max value. 48 // Returns true if more than 5% of the samples are at min or max value.
47 bool DetectClipping(const speech::AudioChunk& chunk) { 49 bool DetectClipping(const speech::AudioChunk& chunk) {
48 const int num_samples = chunk.NumSamples(); 50 const int num_samples = chunk.NumSamples();
49 const int16* samples = chunk.SamplesData16(); 51 const int16* samples = chunk.SamplesData16();
50 const int kThreshold = num_samples / 20; 52 const int kThreshold = num_samples / 20;
51 int clipping_samples = 0; 53 int clipping_samples = 0;
54
52 for (int i = 0; i < num_samples; ++i) { 55 for (int i = 0; i < num_samples; ++i) {
53 if (samples[i] <= -32767 || samples[i] >= 32767) { 56 if (samples[i] <= -32767 || samples[i] >= 32767) {
54 if (++clipping_samples > kThreshold) 57 if (++clipping_samples > kThreshold)
55 return true; 58 return true;
56 } 59 }
57 } 60 }
58 return false; 61 return false;
59 } 62 }
60 63
61 } // namespace 64 } // namespace
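
For readers who want to poke at the 5% heuristic above in isolation, here is a self-contained sketch that reproduces DetectClipping() outside Chromium (std::int16_t stands in for Chromium's int16, and the synthetic buffers are hypothetical):

  // clipping_demo.cc -- standalone sketch of the DetectClipping() heuristic.
  #include <cstdint>
  #include <iostream>
  #include <vector>

  bool DetectClipping(const std::vector<std::int16_t>& samples) {
    const int num_samples = static_cast<int>(samples.size());
    const int kThreshold = num_samples / 20;  // 5% of the samples.
    int clipping_samples = 0;
    for (int i = 0; i < num_samples; ++i) {
      if (samples[i] <= -32767 || samples[i] >= 32767) {
        if (++clipping_samples > kThreshold)
          return true;  // Early out: the buffer already counts as clipped.
      }
    }
    return false;
  }

  int main() {
    std::vector<std::int16_t> chunk(1000, 1000);  // Quiet buffer.
    std::cout << DetectClipping(chunk) << "\n";   // Prints 0: no clipping.
    for (int i = 0; i < 60; ++i)                  // Drive 6% of the samples
      chunk[i] = 32767;                           // to full scale.
    std::cout << DetectClipping(chunk) << "\n";   // Prints 1: clipped.
  }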
62 65
63 SpeechRecognizer* SpeechRecognizer::Create( 66 SpeechRecognizer* SpeechRecognizer::Create(
64 SpeechRecognitionEventListener* listener, 67 SpeechRecognitionEventListener* listener,
65 int caller_id, 68 int caller_id,
66 const std::string& language, 69 const std::string& language,
67 const std::string& grammar, 70 const std::string& grammar,
68 net::URLRequestContextGetter* context_getter, 71 net::URLRequestContextGetter* context_getter,
69 bool filter_profanities, 72 bool filter_profanities,
70 const std::string& hardware_info, 73 const std::string& hardware_info,
71 const std::string& origin_url) { 74 const std::string& origin_url) {
75 speech::GoogleOneShotRemoteEngineConfig remote_engine_config;
76 remote_engine_config.language = language;
77 remote_engine_config.grammar = grammar;
78 remote_engine_config.audio_sample_rate =
79 speech::SpeechRecognizerImpl::kAudioSampleRate;
80 remote_engine_config.audio_num_bits_per_sample =
81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;
82 remote_engine_config.filter_profanities = filter_profanities;
83 remote_engine_config.hardware_info = hardware_info;
84 remote_engine_config.origin_url = origin_url;
85
86 // SpeechRecognizerImpl takes ownership of google_remote_engine.
87 speech::GoogleOneShotRemoteEngine* google_remote_engine =
88 new speech::GoogleOneShotRemoteEngine(context_getter);
89 google_remote_engine->SetConfig(remote_engine_config);
90
72 return new speech::SpeechRecognizerImpl(listener, 91 return new speech::SpeechRecognizerImpl(listener,
73 caller_id, 92 caller_id,
74 language, 93 google_remote_engine);
75 grammar,
76 context_getter,
77 filter_profanities,
78 hardware_info,
79 origin_url);
80 } 94 }
81 95
82 namespace speech { 96 namespace speech {
83 97
84 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; 98 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
85 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; 99 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;
86 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; 100 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
87 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; 101 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
88 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; 102 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
89 103
104 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
105 kNumBitsPerAudioSample_must_be_a_multiple_of_8);
106
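The COMPILE_ASSERT above comes from base/basictypes.h (newly included by this patch set) and is Chromium's pre-C++11 compile-time assertion macro; in C++11 and later the equivalent check would be the one-liner:

  static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
                "kNumBitsPerAudioSample must be a multiple of 8");
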
90 SpeechRecognizerImpl::SpeechRecognizerImpl( 107 SpeechRecognizerImpl::SpeechRecognizerImpl(
91 SpeechRecognitionEventListener* listener, 108 SpeechRecognitionEventListener* listener,
92 int caller_id, 109 int caller_id,
93 const std::string& language, 110 SpeechRecognitionEngine* engine)
94 const std::string& grammar,
95 net::URLRequestContextGetter* context_getter,
96 bool filter_profanities,
97 const std::string& hardware_info,
98 const std::string& origin_url)
99 : listener_(listener), 111 : listener_(listener),
100 testing_audio_manager_(NULL), 112 testing_audio_manager_(NULL),
113 recognition_engine_(engine),
101 endpointer_(kAudioSampleRate), 114 endpointer_(kAudioSampleRate),
102 context_getter_(context_getter),
103 caller_id_(caller_id), 115 caller_id_(caller_id),
104 language_(language), 116 is_dispatching_event_(false),
105 grammar_(grammar), 117 state_(STATE_IDLE) {
106 filter_profanities_(filter_profanities),
107 hardware_info_(hardware_info),
108 origin_url_(origin_url),
109 num_samples_recorded_(0),
110 audio_level_(0.0f) {
111 DCHECK(listener_ != NULL); 118 DCHECK(listener_ != NULL);
119 DCHECK(recognition_engine_ != NULL);
112 endpointer_.set_speech_input_complete_silence_length( 120 endpointer_.set_speech_input_complete_silence_length(
113 base::Time::kMicrosecondsPerSecond / 2); 121 base::Time::kMicrosecondsPerSecond / 2);
114 endpointer_.set_long_speech_input_complete_silence_length( 122 endpointer_.set_long_speech_input_complete_silence_length(
115 base::Time::kMicrosecondsPerSecond); 123 base::Time::kMicrosecondsPerSecond);
116 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); 124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
117 endpointer_.StartSession(); 125 endpointer_.StartSession();
126 recognition_engine_->set_delegate(this);
118 } 127 }
119 128
120 SpeechRecognizerImpl::~SpeechRecognizerImpl() { 129 SpeechRecognizerImpl::~SpeechRecognizerImpl() {
121 // Recording should have stopped earlier due to the endpointer or
122 // |StopRecording| being called.
123 DCHECK(!audio_controller_.get());
124 DCHECK(!recognition_engine_.get() ||
125 !recognition_engine_->IsRecognitionPending());
126 endpointer_.EndSession(); 130 endpointer_.EndSession();
127 } 131 }
128 132
133 // ------- Methods that trigger Finite State Machine (FSM) events ------------
134
135 // NOTE: All the external events and requests should be enqueued (PostTask),
136 // even when they come from the same (IO) thread, in order to preserve the
137 // causal ordering of events and to avoid interleaved event processing due to
138 // synchronous callbacks.
139
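As a toy, Chromium-free illustration of this enqueue-then-dispatch discipline (all names below are hypothetical; in the real code BrowserThread::PostTask plays the role of Post() and DispatchEvent() plays the role of the Drain() loop body):

  // fsm_queue_demo.cc -- why even same-thread triggers go through a queue.
  #include <iostream>
  #include <queue>

  class ToyFsm {
   public:
    // External triggers never touch the state directly; they enqueue events.
    void Post(int event) { queue_.push(event); }

    // Events are consumed strictly in arrival order, so a synchronous
    // callback firing mid-dispatch cannot interleave its own processing.
    void Drain() {
      while (!queue_.empty()) {
        const int event = queue_.front();
        queue_.pop();
        std::cout << "dispatching event " << event << "\n";
      }
    }

   private:
    std::queue<int> queue_;
  };

  int main() {
    ToyFsm fsm;
    fsm.Post(1);  // e.g. EVENT_START
    fsm.Post(2);  // e.g. EVENT_STOP_CAPTURE, queued behind EVENT_START.
    fsm.Drain();  // Processes 1 then 2, preserving causal order.
  }
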
129 void SpeechRecognizerImpl::StartRecognition() { 140 void SpeechRecognizerImpl::StartRecognition() {
141 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
142 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
143 this, FSMEventArgs(EVENT_START)));
144 }
145
146 void SpeechRecognizerImpl::AbortRecognition() {
147 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
148 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
149 this, FSMEventArgs(EVENT_ABORT)));
150 }
151
152 void SpeechRecognizerImpl::StopAudioCapture() {
153 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
154 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
155 this, FSMEventArgs(EVENT_STOP_CAPTURE)));
156 }
157
158 bool SpeechRecognizerImpl::IsActive() const {
159 // Checking the FSM state from another thread (that is, while the FSM is
160 // potentially evolving concurrently) is meaningless.
130 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 161 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
131 DCHECK(!audio_controller_.get()); 162 return state_ != STATE_IDLE;
132 DCHECK(!recognition_engine_.get() || 163 }
133 !recognition_engine_->IsRecognitionPending()); 164
134 165 bool SpeechRecognizerImpl::IsCapturingAudio() const {
135 // The endpointer needs to estimate the environment/background noise before 166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive().
136 // starting to treat the audio as user input. In |HandleOnData| we wait until 167 const bool is_capturing_audio = state_ >= STATE_STARTING &&
137 // such time has passed before switching to user input mode. 168 state_ <= STATE_RECOGNIZING;
138 endpointer_.SetEnvironmentEstimationMode(); 169 DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
139 170 (!is_capturing_audio && audio_controller_.get() == NULL));
140 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? 171 return is_capturing_audio;
141 testing_audio_manager_ : BrowserMainLoop::GetAudioManager();
142 const int samples_per_packet = kAudioSampleRate *
143 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000;
144 media::AudioParameters params(
145 media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
146 kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet);
147 audio_controller_ = AudioInputController::Create(audio_manager, this, params);
148 DCHECK(audio_controller_.get());
149 VLOG(1) << "SpeechRecognizer starting record.";
150 num_samples_recorded_ = 0;
151 audio_controller_->Record();
152 }
153
154 void SpeechRecognizerImpl::AbortRecognition() {
155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
156 DCHECK(audio_controller_.get() || recognition_engine_.get());
157
158 // Stop recording if required.
159 if (audio_controller_.get()) {
160 CloseAudioControllerAsynchronously();
161 }
162
163 VLOG(1) << "SpeechRecognizer canceling recognition.";
164 recognition_engine_.reset();
165 }
166
167 void SpeechRecognizerImpl::StopAudioCapture() {
168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
169
170 // If audio recording has already stopped and we are in recognition phase,
171 // silently ignore any more calls to stop recording.
172 if (!audio_controller_.get())
173 return;
174
175 CloseAudioControllerAsynchronously();
176 listener_->OnSoundEnd(caller_id_);
177 listener_->OnAudioEnd(caller_id_);
178
179 // If we haven't got any audio yet end the recognition sequence here.
180 if (recognition_engine_ == NULL) {
181 // Guard against the listener freeing us until we finish our job.
182 scoped_refptr<SpeechRecognizerImpl> me(this);
183 listener_->OnRecognitionEnd(caller_id_);
184 } else {
185 recognition_engine_->AudioChunksEnded();
186 }
187 } 172 }
188 173
189 // Invoked in the audio thread. 174 // Invoked in the audio thread.
190 void SpeechRecognizerImpl::OnError(AudioInputController* controller, 175 void SpeechRecognizerImpl::OnError(AudioInputController* controller,
191 int error_code) { 176 int error_code) {
192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 177 FSMEventArgs event_args(EVENT_AUDIO_ERROR);
193 base::Bind(&SpeechRecognizerImpl::HandleOnError, 178 event_args.audio_error_code = error_code;
194 this, error_code)); 179 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
195 } 180 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
196 181 this, event_args));
197 void SpeechRecognizerImpl::HandleOnError(int error_code) {
198 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
199
200 // Check if we are still recording before canceling recognition, as
201 // recording might have been stopped after this error was posted to the queue
202 // by |OnError|.
203 if (!audio_controller_.get())
204 return;
205
206 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);
207 } 182 }
208 183
209 void SpeechRecognizerImpl::OnData(AudioInputController* controller, 184 void SpeechRecognizerImpl::OnData(AudioInputController* controller,
210 const uint8* data, uint32 size) { 185 const uint8* data, uint32 size) {
211 if (size == 0) // This could happen when recording stops and is normal. 186 if (size == 0) // This can happen when audio capture stops; it is normal.
212 return; 187 return;
213 scoped_refptr<AudioChunk> raw_audio( 188
214 new AudioChunk(data, 189 FSMEventArgs event_args(EVENT_AUDIO_DATA);
215 static_cast<size_t>(size), 190 event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size),
216 kNumBitsPerAudioSample / 8)); 191 kNumBitsPerAudioSample / 8);
217 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
218 base::Bind(&SpeechRecognizerImpl::HandleOnData, 193 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
219 this, raw_audio)); 194 this, event_args));
220 }
221
222 void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) {
223 // Check if we are still recording and if not discard this buffer, as
224 // recording might have been stopped after this buffer was posted to the queue
225 // by |OnData|.
226 if (!audio_controller_.get())
227 return;
228
229 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();
230
231 float rms;
232 endpointer_.ProcessAudio(*raw_audio, &rms);
233 bool did_clip = DetectClipping(*raw_audio);
234 num_samples_recorded_ += raw_audio->NumSamples();
235
236 if (recognition_engine_ == NULL) {
237 // This was the first audio packet recorded, so start a request to the
238 // server to send the data and inform the listener.
239 listener_->OnAudioStart(caller_id_);
240 GoogleOneShotRemoteEngineConfig google_sr_config;
241 google_sr_config.language = language_;
242 google_sr_config.grammar = grammar_;
243 google_sr_config.audio_sample_rate = kAudioSampleRate;
244 google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample;
245 google_sr_config.filter_profanities = filter_profanities_;
246 google_sr_config.hardware_info = hardware_info_;
247 google_sr_config.origin_url = origin_url_;
248 GoogleOneShotRemoteEngine* google_sr_engine =
249 new GoogleOneShotRemoteEngine(context_getter_.get());
250 google_sr_engine->SetConfig(google_sr_config);
251 recognition_engine_.reset(google_sr_engine);
252 recognition_engine_->set_delegate(this);
253 recognition_engine_->StartRecognition();
254 }
255
256 recognition_engine_->TakeAudioChunk(*raw_audio);
257
258 if (endpointer_.IsEstimatingEnvironment()) {
259 // Check if we have gathered enough audio for the endpointer to do
260 // environment estimation and should move on to detect speech/end of speech.
261 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
262 kAudioSampleRate) / 1000) {
263 endpointer_.SetUserInputMode();
264 listener_->OnEnvironmentEstimationComplete(caller_id_);
265 }
266 return; // No more processing since we are still estimating environment.
267 }
268
269 // Check if we have waited too long without hearing any speech.
270 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();
271 if (!speech_was_heard_after_packet &&
272 num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) {
273 InformErrorAndAbortRecognition(
274 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
275 return;
276 }
277
278 if (!speech_was_heard_before_packet && speech_was_heard_after_packet)
279 listener_->OnSoundStart(caller_id_);
280
281 // Calculate the input volume to display in the UI, smoothing towards the
282 // new level.
283 float level = (rms - kAudioMeterMinDb) /
284 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
285 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
286 if (level > audio_level_) {
287 audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
288 } else {
289 audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
290 }
291
292 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
293 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
294 noise_level = std::min(std::max(0.0f, noise_level),
295 kAudioMeterRangeMaxUnclipped);
296
297 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,
298 noise_level);
299
300 if (endpointer_.speech_input_complete())
301 StopAudioCapture();
302 } 195 }
303 196
304 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} 197 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
305 198
306 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( 199 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
307 const content::SpeechRecognitionResult& result) { 200 const content::SpeechRecognitionResult& result) {
308 // Guard against the listener freeing us until we finish our job. 201 FSMEventArgs event_args(EVENT_ENGINE_RESULT);
202 event_args.engine_result = result;
203 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
204 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
205 this, event_args));
206 }
207
208 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
209 const content::SpeechRecognitionError& error) {
210 FSMEventArgs event_args(EVENT_ENGINE_ERROR);
211 event_args.engine_error = error;
212 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
213 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
214 this, event_args));
215 }
216
217 // ----------------------- Core FSM implementation ---------------------------
218 // TODO(primiano) After the changes in the media package (r129173), this class
219 // slightly violates the SpeechRecognitionEventListener interface contract. In
220 // particular, it is not true anymore that this class can be freed after the
221 // OnRecognitionEnd event, since the audio_controller_.Close() asynchronous
222 // call can be still in progress after the end event. Currently, it does not
223 // represent a problem for the browser itself, since refcounting protects us
224 // against such race conditions. However, we should fix this in the next CLs.
225 // For instance, tests are currently working just because the
226 // TestAudioInputController is not closing asynchronously as the real controller
227 // does, but they will become flaky if TestAudioInputController will be fixed.
228
229 void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
230 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
231 DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
232 DCHECK_LE(state_, STATE_MAX_VALUE);
233
234 // Event dispatching must be sequential; otherwise it would break the rules
235 // and assumptions of the finite state automaton model.
236 DCHECK(!is_dispatching_event_);
237 is_dispatching_event_ = true;
238
239 // Guard against the delegate freeing us until we finish processing the event.
309 scoped_refptr<SpeechRecognizerImpl> me(this); 240 scoped_refptr<SpeechRecognizerImpl> me(this);
241
242 if (event_args.event == EVENT_AUDIO_DATA) {
243 DCHECK(event_args.audio_data.get() != NULL);
244 ProcessAudioPipeline(*event_args.audio_data);
245 }
246
247 // The audio pipeline must be processed before the event dispatch; otherwise
248 // it would act on the future state instead of the current one.
249 state_ = ExecuteTransitionAndGetNextState(event_args);
250
251 is_dispatching_event_ = false;
252 }
253
254 SpeechRecognizerImpl::FSMState
255 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
256 const FSMEventArgs& event_args) {
257 const FSMEvent event = event_args.event;
258 switch (state_) {
259 case STATE_IDLE:
260 switch (event) {
261 // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and
262 // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
263 case EVENT_ABORT:
264 return DoNothing(event_args);
265 case EVENT_START:
266 return StartRecording(event_args);
267 case EVENT_STOP_CAPTURE: // Corner cases related to queued messages
268 case EVENT_AUDIO_DATA: // being dispatched late.
269 case EVENT_ENGINE_RESULT:
270 case EVENT_ENGINE_ERROR:
271 case EVENT_AUDIO_ERROR:
272 return DoNothing(event_args);
273 }
274 break;
275 case STATE_STARTING:
276 switch (event) {
277 case EVENT_ABORT:
278 return Abort(event_args);
279 case EVENT_START:
280 return NotFeasible(event_args);
281 case EVENT_STOP_CAPTURE:
282 return Abort(event_args);
283 case EVENT_AUDIO_DATA:
284 return StartRecognitionEngine(event_args);
285 case EVENT_ENGINE_RESULT:
286 return NotFeasible(event_args);
287 case EVENT_ENGINE_ERROR:
288 case EVENT_AUDIO_ERROR:
289 return Abort(event_args);
290 }
291 break;
292 case STATE_ESTIMATING_ENVIRONMENT:
293 switch (event) {
294 case EVENT_ABORT:
295 return Abort(event_args);
296 case EVENT_START:
297 return NotFeasible(event_args);
298 case EVENT_STOP_CAPTURE:
299 return StopCaptureAndWaitForResult(event_args);
300 case EVENT_AUDIO_DATA:
301 return WaitEnvironmentEstimationCompletion(event_args);
302 case EVENT_ENGINE_RESULT:
303 return ProcessIntermediateResult(event_args);
304 case EVENT_ENGINE_ERROR:
305 case EVENT_AUDIO_ERROR:
306 return Abort(event_args);
307 }
308 break;
309 case STATE_WAITING_FOR_SPEECH:
310 switch (event) {
311 case EVENT_ABORT:
312 return Abort(event_args);
313 case EVENT_START:
314 return NotFeasible(event_args);
315 case EVENT_STOP_CAPTURE:
316 return StopCaptureAndWaitForResult(event_args);
317 case EVENT_AUDIO_DATA:
318 return DetectUserSpeechOrTimeout(event_args);
319 case EVENT_ENGINE_RESULT:
320 return ProcessIntermediateResult(event_args);
321 case EVENT_ENGINE_ERROR:
322 case EVENT_AUDIO_ERROR:
323 return Abort(event_args);
324 }
325 break;
326 case STATE_RECOGNIZING:
327 switch (event) {
328 case EVENT_ABORT:
329 return Abort(event_args);
330 case EVENT_START:
331 return NotFeasible(event_args);
332 case EVENT_STOP_CAPTURE:
333 return StopCaptureAndWaitForResult(event_args);
334 case EVENT_AUDIO_DATA:
335 return DetectEndOfSpeech(event_args);
336 case EVENT_ENGINE_RESULT:
337 return ProcessIntermediateResult(event_args);
338 case EVENT_ENGINE_ERROR:
339 case EVENT_AUDIO_ERROR:
340 return Abort(event_args);
341 }
342 break;
343 case STATE_WAITING_FINAL_RESULT:
344 switch (event) {
345 case EVENT_ABORT:
346 return Abort(event_args);
347 case EVENT_START:
348 return NotFeasible(event_args);
349 case EVENT_STOP_CAPTURE:
350 case EVENT_AUDIO_DATA:
351 return DoNothing(event_args);
352 case EVENT_ENGINE_RESULT:
353 return ProcessFinalResult(event_args);
354 case EVENT_ENGINE_ERROR:
355 case EVENT_AUDIO_ERROR:
356 return Abort(event_args);
357 }
358 break;
359 }
360 return NotFeasible(event_args);
361 }
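
Traced through the switch above, the nominal successful session reads: STATE_IDLE -(EVENT_START)-> STATE_STARTING -(EVENT_AUDIO_DATA)-> STATE_ESTIMATING_ENVIRONMENT -(EVENT_AUDIO_DATA, once the estimation window has elapsed)-> STATE_WAITING_FOR_SPEECH -(EVENT_AUDIO_DATA, speech detected)-> STATE_RECOGNIZING -(EVENT_AUDIO_DATA with endpointer completion, or an explicit EVENT_STOP_CAPTURE)-> STATE_WAITING_FINAL_RESULT -(EVENT_ENGINE_RESULT)-> STATE_IDLE.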
362
363 // ----------- Contract for all the FSM evolution functions below -------------
364 // - They are guaranteed to be executed on the IO thread;
365 // - They are guaranteed not to be reentrant (with themselves or each other);
366 // - event_args members are guaranteed to be stable during the call;
367 // - The class won't be freed in the meantime due to callbacks;
368 // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.
369
370 // TODO(primiano) The audio pipeline is currently serial. However, the
371 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
372 // We should profile the execution to see whether it would be worthwhile.
373 void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
374 const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
375 state_ <= STATE_RECOGNIZING;
376 const bool route_to_sr_engine = route_to_endpointer;
377 const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
378 state_ <= STATE_RECOGNIZING;
379 const bool clip_detected = DetectClipping(raw_audio);
380 float rms = 0.0f;
381
382 num_samples_recorded_ += raw_audio.NumSamples();
383
384 if (route_to_endpointer)
385 endpointer_.ProcessAudio(raw_audio, &rms);
386
387 if (route_to_vumeter) {
388 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|.
389 UpdateSignalAndNoiseLevels(rms, clip_detected);
390 }
391 if (route_to_sr_engine) {
392 DCHECK(recognition_engine_.get() != NULL);
393 recognition_engine_->TakeAudioChunk(raw_audio);
394 }
395 }
396
397 SpeechRecognizerImpl::FSMState
398 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
399 DCHECK(recognition_engine_.get() != NULL);
400 DCHECK(!IsCapturingAudio());
401 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
402 testing_audio_manager_ :
403 BrowserMainLoop::GetAudioManager();
404 DCHECK(audio_manager != NULL);
405
406 DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
407 num_samples_recorded_ = 0;
408 audio_level_ = 0;
409 listener_->OnRecognitionStart(caller_id_);
410
411 if (!audio_manager->HasAudioInputDevices()) {
412 return AbortWithError(SpeechRecognitionError(
413 content::SPEECH_RECOGNITION_ERROR_AUDIO,
414 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
415 }
416
417 if (audio_manager->IsRecordingInProcess()) {
418 return AbortWithError(SpeechRecognitionError(
419 content::SPEECH_RECOGNITION_ERROR_AUDIO,
420 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE));
421 }
422
423 const int samples_per_packet = (kAudioSampleRate *
424 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;
425 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
426 kAudioSampleRate, kNumBitsPerAudioSample,
427 samples_per_packet);
428 audio_controller_ = AudioInputController::Create(audio_manager, this, params);
429
430 if (audio_controller_.get() == NULL) {
431 return AbortWithError(
432 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
433 }
434
435 // The endpointer needs to estimate the environment/background noise before
436 // starting to treat the audio as user input. We wait in the state
437 // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching
438 // to user input mode.
439 endpointer_.SetEnvironmentEstimationMode();
440 audio_controller_->Record();
441 return STATE_STARTING;
442 }
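
To make the packet sizing concrete, here is the arithmetic as a standalone snippet. Only kAudioSampleRate and kNumBitsPerAudioSample come from this file; the 100 ms chunk duration is a hypothetical stand-in for whatever GetDesiredAudioChunkDurationMs() returns:

  #include <iostream>
  int main() {
    const int kAudioSampleRate = 16000;     // From SpeechRecognizerImpl.
    const int kNumBitsPerAudioSample = 16;  // From SpeechRecognizerImpl.
    const int chunk_duration_ms = 100;      // Hypothetical engine value.
    const int samples_per_packet =
        (kAudioSampleRate * chunk_duration_ms) / 1000;
    const int bytes_per_packet =
        samples_per_packet * (kNumBitsPerAudioSample / 8);
    std::cout << samples_per_packet << " samples, "  // 1600 samples,
              << bytes_per_packet << " bytes\n";     // 3200 bytes per packet.
  }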
443
444 SpeechRecognizerImpl::FSMState
445 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
446 // This is the first audio packet captured, so the recognition engine is
447 // started and the delegate notified about the event.
448 DCHECK(recognition_engine_.get() != NULL);
449 recognition_engine_->StartRecognition();
450 listener_->OnAudioStart(caller_id_);
451
452 // This is a small hack, since TakeAudioChunk() has already been called by
453 // ProcessAudioPipeline(). It is the best tradeoff, short of dropping the
454 // first audio chunk captured after opening the audio device.
455 recognition_engine_->TakeAudioChunk(*(event_args.audio_data));
456 return STATE_ESTIMATING_ENVIRONMENT;
457 }
458
459 SpeechRecognizerImpl::FSMState
460 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
461 DCHECK(endpointer_.IsEstimatingEnvironment());
462 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
463 endpointer_.SetUserInputMode();
464 listener_->OnEnvironmentEstimationComplete(caller_id_);
465 return STATE_WAITING_FOR_SPEECH;
466 } else {
467 return STATE_ESTIMATING_ENVIRONMENT;
468 }
469 }
470
471 SpeechRecognizerImpl::FSMState
472 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
473 if (endpointer_.DidStartReceivingSpeech()) {
474 listener_->OnSoundStart(caller_id_);
475 return STATE_RECOGNIZING;
476 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
477 return AbortWithError(
478 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));
479 }
480 return STATE_WAITING_FOR_SPEECH;
481 }
482
483 SpeechRecognizerImpl::FSMState
484 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
485 if (endpointer_.speech_input_complete()) {
486 return StopCaptureAndWaitForResult(event_args);
487 }
488 return STATE_RECOGNIZING;
489 }
490
491 SpeechRecognizerImpl::FSMState
492 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
493 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);
494
495 DVLOG(1) << "Concluding recognition";
496 CloseAudioControllerAsynchronously();
497 recognition_engine_->AudioChunksEnded();
498
499 if (state_ > STATE_WAITING_FOR_SPEECH)
500 listener_->OnSoundEnd(caller_id_);
501
502 listener_->OnAudioEnd(caller_id_);
503 return STATE_WAITING_FINAL_RESULT;
504 }
505
506 SpeechRecognizerImpl::FSMState
507 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {
508 // TODO(primiano) This should raise SPEECH_RECOGNITION_ERROR_ABORTED in the
509 // absence of more specific error sources (that is, on an explicit abort
510 // request). However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught
511 // by ChromeSpeechRecognitionManagerDelegate and would cause an exception.
512 // JS support will probably need it in the future.
513 if (event_args.event == EVENT_AUDIO_ERROR) {
514 return AbortWithError(
515 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
516 } else if (event_args.event == EVENT_ENGINE_ERROR) {
517 return AbortWithError(event_args.engine_error);
518 }
519 return AbortWithError(NULL);
520 }
521
522 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
523 const SpeechRecognitionError& error) {
524 return AbortWithError(&error);
525 }
526
527 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
528 const SpeechRecognitionError* error) {
529 if (IsCapturingAudio())
530 CloseAudioControllerAsynchronously();
531
532 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";
533
534 // The recognition engine is initialized only after STATE_STARTING.
535 if (state_ > STATE_STARTING) {
536 DCHECK(recognition_engine_.get() != NULL);
537 recognition_engine_->EndRecognition();
538 }
539
540 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
541 listener_->OnSoundEnd(caller_id_);
542
543 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
544 listener_->OnAudioEnd(caller_id_);
545
546 if (error != NULL)
547 listener_->OnRecognitionError(caller_id_, *error);
548
549 listener_->OnRecognitionEnd(caller_id_);
550
551 return STATE_IDLE;
552 }
553
554 SpeechRecognizerImpl::FSMState
555 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {
556 // This is in preparation for future speech recognition functions.
557 NOTREACHED();
558 return state_;
559 }
560
561 SpeechRecognizerImpl::FSMState
562 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
563 const SpeechRecognitionResult& result = event_args.engine_result;
564 DVLOG(1) << "Got valid result";
565 recognition_engine_->EndRecognition();
310 listener_->OnRecognitionResult(caller_id_, result); 566 listener_->OnRecognitionResult(caller_id_, result);
311 listener_->OnRecognitionEnd(caller_id_); 567 listener_->OnRecognitionEnd(caller_id_);
312 } 568 return STATE_IDLE;
313 569 }
314 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( 570
315 const content::SpeechRecognitionError& error) { 571 SpeechRecognizerImpl::FSMState
316 InformErrorAndAbortRecognition(error.code); 572 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
317 } 573 return state_; // Just keep the current state.
318 574 }
319 void SpeechRecognizerImpl::InformErrorAndAbortRecognition( 575
320 content::SpeechRecognitionErrorCode error) { 576 SpeechRecognizerImpl::FSMState
321 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); 577 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
322 AbortRecognition(); 578 NOTREACHED() << "Unfeasible event " << event_args.event
323 579 << " in state " << state_;
324 // Guard against the listener freeing us until we finish our job. 580 return state_;
325 scoped_refptr<SpeechRecognizerImpl> me(this);
326 listener_->OnRecognitionError(caller_id_, error);
327 } 581 }
328 582
329 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { 583 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
330 VLOG(1) << "SpeechRecognizer stopping record."; 584 DCHECK(IsCapturingAudio());
585 DVLOG(1) << "SpeechRecognizerImpl stopping audio capture.";
331 // Issues a Close on the audio controller, passing an empty callback. The only 586 // Issues a Close on the audio controller, passing an empty callback. The only
332 // purpose of this callback is to keep the audio controller refcounted until 587 // purpose of this callback is to keep the audio controller refcounted until
333 // Close has completed (on the audio thread) and then destroy it automatically 588 // Close has completed (on the audio thread) and then destroy it automatically
334 // (upon return from OnAudioClosed). 589 // (upon return from OnAudioClosed).
335 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, 590 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
336 this, audio_controller_)); 591 this, audio_controller_));
337 audio_controller_ = NULL; // The controller is still refcounted by Bind. 592 audio_controller_ = NULL; // The controller is still refcounted by Bind.
338 } 593 }
339 594
340 bool SpeechRecognizerImpl::IsActive() const { 595 int SpeechRecognizerImpl::GetElapsedTimeMs() const {
341 return (recognition_engine_.get() != NULL); 596 return (num_samples_recorded_ * 1000) / kAudioSampleRate;
342 } 597 }
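
Since elapsed time is derived purely from the sample count, the FSM's time thresholds translate directly into sample counts at kAudioSampleRate = 16000: the 300 ms environment-estimation window corresponds to 4800 recorded samples ((4800 * 1000) / 16000 = 300), and the 8000 ms no-speech timeout corresponds to 128000 samples.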
343 598
344 bool SpeechRecognizerImpl::IsCapturingAudio() const { 599 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
345 return (audio_controller_.get() != NULL); 600 bool clip_detected) {
601 // Calculate the input volume to display in the UI, smoothing towards the
602 // new level.
603 // TODO(primiano) Do we really need all this floating-point arithmetic here?
604 // It might be quite expensive on mobile.
605 float level = (rms - kAudioMeterMinDb) /
606 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
607 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
608 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
609 kDownSmoothingFactor;
610 audio_level_ += (level - audio_level_) * smoothing_factor;
611
612 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
613 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
614 noise_level = std::min(std::max(0.0f, noise_level),
615 kAudioMeterRangeMaxUnclipped);
616
617 listener_->OnAudioLevelsChange(
618 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level);
346 } 619 }
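
A worked example of the meter mapping above, runnable standalone (the constants mirror those at the top of this file; the 60 dB input is an arbitrary mid-range value):

  #include <algorithm>
  #include <iostream>
  int main() {
    const float kAudioMeterMinDb = 30.0f;
    const float kAudioMeterDbRange = 90.31f - 30.0f;           // 60.31 dB span.
    const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;  // ~0.979.
    const float rms = 60.0f;  // Hypothetical mid-range RMS dB reading.
    float level = (rms - kAudioMeterMinDb) /
                  (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
    level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
    std::cout << level << "\n";  // ~0.49: the meter sits near half scale.
  }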
347 620
348 const SpeechRecognitionEngine& 621 const SpeechRecognitionEngine&
349 SpeechRecognizerImpl::recognition_engine() const { 622 SpeechRecognizerImpl::recognition_engine() const {
350 return *(recognition_engine_.get()); 623 return *(recognition_engine_.get());
351 } 624 }
352 625
353 void SpeechRecognizerImpl::SetAudioManagerForTesting( 626 void SpeechRecognizerImpl::SetAudioManagerForTesting(
354 AudioManager* audio_manager) { 627 AudioManager* audio_manager) {
355 testing_audio_manager_ = audio_manager; 628 testing_audio_manager_ = audio_manager;
356 } 629 }
357 630
631 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
632 : event(event_value),
633 audio_error_code(0),
634 audio_data(NULL),
635 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {
636 }
637
638 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
639 }
358 640
359 } // namespace speech 641 } // namespace speech