Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(293)

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 9835049: Speech refactoring: Reimplemented speech_recognizer as a FSM. (CL1.5) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Rebased from master due to renames in media:: package. Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/browser/speech/speech_recognizer_impl.h" 5 #include "content/browser/speech/speech_recognizer_impl.h"
6 6
7 #include "base/basictypes.h"
7 #include "base/bind.h" 8 #include "base/bind.h"
8 #include "base/time.h" 9 #include "base/time.h"
9 #include "content/browser/browser_main_loop.h" 10 #include "content/browser/browser_main_loop.h"
10 #include "content/browser/speech/audio_buffer.h" 11 #include "content/browser/speech/audio_buffer.h"
11 #include "content/browser/speech/google_one_shot_remote_engine.h" 12 #include "content/browser/speech/google_one_shot_remote_engine.h"
12 #include "content/public/browser/browser_thread.h" 13 #include "content/public/browser/browser_thread.h"
13 #include "content/public/browser/speech_recognition_event_listener.h" 14 #include "content/public/browser/speech_recognition_event_listener.h"
14 #include "content/public/browser/speech_recognizer.h" 15 #include "content/public/browser/speech_recognizer.h"
15 #include "content/public/common/speech_recognition_error.h" 16 #include "content/public/common/speech_recognition_error.h"
16 #include "content/public/common/speech_recognition_result.h" 17 #include "content/public/common/speech_recognition_result.h"
17 #include "net/url_request/url_request_context_getter.h" 18 #include "net/url_request/url_request_context_getter.h"
18 19
20 #define NOT_FEASIBLE() do { NOTREACHED(); return state_; } while(0)
bulach 2012/04/04 15:38:17 nit: we avoid using macros as much as possible.. t
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Ok. Turned into a regular function like the others
21
19 using content::BrowserMainLoop; 22 using content::BrowserMainLoop;
20 using content::BrowserThread; 23 using content::BrowserThread;
21 using content::SpeechRecognitionError; 24 using content::SpeechRecognitionError;
22 using content::SpeechRecognitionEventListener; 25 using content::SpeechRecognitionEventListener;
23 using content::SpeechRecognitionResult; 26 using content::SpeechRecognitionResult;
24 using content::SpeechRecognizer; 27 using content::SpeechRecognizer;
25 using media::AudioInputController; 28 using media::AudioInputController;
26 using media::AudioManager; 29 using media::AudioManager;
30 using media::AudioParameters;
27 31
28 namespace { 32 namespace {
29 33
30 // The following constants are related to the volume level indicator shown in 34 // The following constants are related to the volume level indicator shown in
31 // the UI for recorded audio. 35 // the UI for recorded audio.
32 // Multiplier used when new volume is greater than previous level. 36 // Multiplier used when new volume is greater than previous level.
33 const float kUpSmoothingFactor = 1.0f; 37 const float kUpSmoothingFactor = 1.0f;
34 // Multiplier used when new volume is lesser than previous level. 38 // Multiplier used when new volume is lesser than previous level.
35 const float kDownSmoothingFactor = 0.7f; 39 const float kDownSmoothingFactor = 0.7f;
36 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. 40 // RMS dB value of a maximum (unclipped) sine wave for int16 samples.
37 const float kAudioMeterMaxDb = 90.31f; 41 const float kAudioMeterMaxDb = 90.31f;
38 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0. 42 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.
39 // Values lower than this will display as empty level-meter. 43 // Values lower than this will display as empty level-meter.
40 const float kAudioMeterMinDb = 30.0f; 44 const float kAudioMeterMinDb = 30.0f;
41 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; 45 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;
42 46
43 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) 47 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
44 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; 48 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;
45 49
46 // Returns true if more than 5% of the samples are at min or max value. 50 // Returns true if more than 5% of the samples are at min or max value.
47 bool DetectClipping(const speech::AudioChunk& chunk) { 51 bool DetectClipping(const speech::AudioChunk& chunk) {
48 const int num_samples = chunk.NumSamples(); 52 const int num_samples = chunk.NumSamples();
49 const int16* samples = chunk.SamplesData16(); 53 const int16* samples = chunk.SamplesData16();
50 const int kThreshold = num_samples / 20; 54 const int kThreshold = num_samples / 20;
51 int clipping_samples = 0; 55 int clipping_samples = 0;
56
52 for (int i = 0; i < num_samples; ++i) { 57 for (int i = 0; i < num_samples; ++i) {
53 if (samples[i] <= -32767 || samples[i] >= 32767) { 58 if (samples[i] <= -32767 || samples[i] >= 32767) {
54 if (++clipping_samples > kThreshold) 59 if (++clipping_samples > kThreshold)
55 return true; 60 return true;
56 } 61 }
57 } 62 }
58 return false; 63 return false;
59 } 64 }
60 65
61 } // namespace 66 } // namespace
62 67
63 SpeechRecognizer* SpeechRecognizer::Create( 68 SpeechRecognizer* SpeechRecognizer::Create(
64 SpeechRecognitionEventListener* listener, 69 SpeechRecognitionEventListener* listener,
65 int caller_id, 70 int caller_id,
66 const std::string& language, 71 const std::string& language,
67 const std::string& grammar, 72 const std::string& grammar,
68 net::URLRequestContextGetter* context_getter, 73 net::URLRequestContextGetter* context_getter,
69 bool filter_profanities, 74 bool filter_profanities,
70 const std::string& hardware_info, 75 const std::string& hardware_info,
71 const std::string& origin_url) { 76 const std::string& origin_url) {
77 speech::GoogleOneShotRemoteEngineConfig google_sr_config;
bulach 2012/04/04 15:38:17 nit: prefer to call "remote_engine_config"
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
78 google_sr_config.language = language;
79 google_sr_config.grammar = grammar;
80 google_sr_config.audio_sample_rate =
81 speech::SpeechRecognizerImpl::kAudioSampleRate;
82 google_sr_config.audio_num_bits_per_sample =
83 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;
84 google_sr_config.filter_profanities = filter_profanities;
85 google_sr_config.hardware_info = hardware_info;
86 google_sr_config.origin_url = origin_url;
87
88 speech::GoogleOneShotRemoteEngine* google_sr_engine =
bulach 2012/04/04 15:38:17 nit: remote_engine. also, just to clarify could a
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
89 new speech::GoogleOneShotRemoteEngine(context_getter);
90 google_sr_engine->SetConfig(google_sr_config);
91
72 return new speech::SpeechRecognizerImpl(listener, 92 return new speech::SpeechRecognizerImpl(listener,
73 caller_id, 93 caller_id,
74 language, 94 google_sr_engine);
75 grammar,
76 context_getter,
77 filter_profanities,
78 hardware_info,
79 origin_url);
80 } 95 }
81 96
82 namespace speech { 97 namespace speech {
83 98
84 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; 99 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
85 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; 100 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;
86 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; 101 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
87 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; 102 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
88 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; 103 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
89 104
105 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
106 kNumBitsPerAudioSample_must_be_a_multiple_of_8);
107
90 SpeechRecognizerImpl::SpeechRecognizerImpl( 108 SpeechRecognizerImpl::SpeechRecognizerImpl(
91 SpeechRecognitionEventListener* listener, 109 SpeechRecognitionEventListener* listener,
92 int caller_id, 110 int caller_id,
93 const std::string& language, 111 SpeechRecognitionEngine* engine)
94 const std::string& grammar,
95 net::URLRequestContextGetter* context_getter,
96 bool filter_profanities,
97 const std::string& hardware_info,
98 const std::string& origin_url)
99 : listener_(listener), 112 : listener_(listener),
100 testing_audio_manager_(NULL), 113 testing_audio_manager_(NULL),
114 recognition_engine_(engine),
101 endpointer_(kAudioSampleRate), 115 endpointer_(kAudioSampleRate),
102 context_getter_(context_getter),
103 caller_id_(caller_id), 116 caller_id_(caller_id),
104 language_(language), 117 in_event_dispatching_(false),
105 grammar_(grammar), 118 state_(STATE_IDLE) {
106 filter_profanities_(filter_profanities),
107 hardware_info_(hardware_info),
108 origin_url_(origin_url),
109 num_samples_recorded_(0),
110 audio_level_(0.0f) {
111 DCHECK(listener_ != NULL); 119 DCHECK(listener_ != NULL);
120 DCHECK(recognition_engine_ != NULL);
112 endpointer_.set_speech_input_complete_silence_length( 121 endpointer_.set_speech_input_complete_silence_length(
113 base::Time::kMicrosecondsPerSecond / 2); 122 base::Time::kMicrosecondsPerSecond / 2);
114 endpointer_.set_long_speech_input_complete_silence_length( 123 endpointer_.set_long_speech_input_complete_silence_length(
115 base::Time::kMicrosecondsPerSecond); 124 base::Time::kMicrosecondsPerSecond);
116 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); 125 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
117 endpointer_.StartSession(); 126 endpointer_.StartSession();
127 recognition_engine_->set_delegate(this);
118 } 128 }
119 129
120 SpeechRecognizerImpl::~SpeechRecognizerImpl() { 130 SpeechRecognizerImpl::~SpeechRecognizerImpl() {
121 // Recording should have stopped earlier due to the endpointer or
122 // |StopRecording| being called.
123 DCHECK(!audio_controller_.get());
124 DCHECK(!recognition_engine_.get() ||
125 !recognition_engine_->IsRecognitionPending());
126 endpointer_.EndSession(); 131 endpointer_.EndSession();
127 } 132 }
128 133
134 // ------- Methods that trigger Finite State Machine (FSM) events ------------
135
136 // NOTE: all the external events and requests should be enqueued (PostTask), even
137 // if they come from the same (IO) thread, in order to preserve the relationship
138 // of causality between events and avoid interleaved event processing due to
139 // synchronous callbacks.
140
129 void SpeechRecognizerImpl::StartRecognition() { 141 void SpeechRecognizerImpl::StartRecognition() {
142 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
143 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
144 this, FSMEventArgs(EVENT_START)));
145 }
146
147 void SpeechRecognizerImpl::AbortRecognition() {
148 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
149 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
150 this, FSMEventArgs(EVENT_ABORT)));
151 }
152
153 void SpeechRecognizerImpl::StopAudioCapture() {
154 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
155 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
156 this, FSMEventArgs(EVENT_STOP_CAPTURE)));
157 }
158
159 bool SpeechRecognizerImpl::IsActive() const {
160 // Checking the FSM state from another thread (thus, while the FSM is
161 // potentially concurrently evolving) is meaningless.
130 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 162 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
131 DCHECK(!audio_controller_.get()); 163 return state_ != STATE_IDLE;
132 DCHECK(!recognition_engine_.get() || 164 }
133 !recognition_engine_->IsRecognitionPending()); 165
134 166 bool SpeechRecognizerImpl::IsCapturingAudio() const {
135 // The endpointer needs to estimate the environment/background noise before 167 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive().
136 // starting to treat the audio as user input. In |HandleOnData| we wait until 168 const bool is_capturing_audio = state_ >= STATE_STARTING &&
137 // such time has passed before switching to user input mode. 169 state_ <= STATE_RECOGNIZING;
138 endpointer_.SetEnvironmentEstimationMode(); 170 DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
139 171 (!is_capturing_audio && audio_controller_.get() == NULL));
140 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? 172 return is_capturing_audio;
141 testing_audio_manager_ : BrowserMainLoop::GetAudioManager();
142 const int samples_per_packet = kAudioSampleRate *
143 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000;
144 media::AudioParameters params(
145 media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
146 kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet);
147 audio_controller_ = AudioInputController::Create(audio_manager, this, params);
148 DCHECK(audio_controller_.get());
149 VLOG(1) << "SpeechRecognizer starting record.";
150 num_samples_recorded_ = 0;
151 audio_controller_->Record();
152 }
153
154 void SpeechRecognizerImpl::AbortRecognition() {
155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
156 DCHECK(audio_controller_.get() || recognition_engine_.get());
157
158 // Stop recording if required.
159 if (audio_controller_.get()) {
160 CloseAudioControllerAsynchronously();
161 }
162
163 VLOG(1) << "SpeechRecognizer canceling recognition.";
164 recognition_engine_.reset();
165 }
166
167 void SpeechRecognizerImpl::StopAudioCapture() {
168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
169
170 // If audio recording has already stopped and we are in recognition phase,
171 // silently ignore any more calls to stop recording.
172 if (!audio_controller_.get())
173 return;
174
175 CloseAudioControllerAsynchronously();
176 listener_->OnSoundEnd(caller_id_);
177 listener_->OnAudioEnd(caller_id_);
178
179 // If we haven't got any audio yet end the recognition sequence here.
180 if (recognition_engine_ == NULL) {
181 // Guard against the listener freeing us until we finish our job.
182 scoped_refptr<SpeechRecognizerImpl> me(this);
183 listener_->OnRecognitionEnd(caller_id_);
184 } else {
185 recognition_engine_->AudioChunksEnded();
186 }
187 } 173 }
188 174
189 // Invoked in the audio thread. 175 // Invoked in the audio thread.
190 void SpeechRecognizerImpl::OnError(AudioInputController* controller, 176 void SpeechRecognizerImpl::OnError(AudioInputController* controller,
191 int error_code) { 177 int error_code) {
192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 178 FSMEventArgs event_args(EVENT_AUDIO_ERROR);
193 base::Bind(&SpeechRecognizerImpl::HandleOnError, 179 event_args.audio_error_code = error_code;
194 this, error_code)); 180 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
195 } 181 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
196 182 this, event_args));
197 void SpeechRecognizerImpl::HandleOnError(int error_code) {
198 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
199
200 // Check if we are still recording before canceling recognition, as
201 // recording might have been stopped after this error was posted to the queue
202 // by |OnError|.
203 if (!audio_controller_.get())
204 return;
205
206 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);
207 } 183 }
208 184
209 void SpeechRecognizerImpl::OnData(AudioInputController* controller, 185 void SpeechRecognizerImpl::OnData(AudioInputController* controller,
210 const uint8* data, uint32 size) { 186 const uint8* data, uint32 size) {
211 if (size == 0) // This could happen when recording stops and is normal. 187 if (size == 0) // This could happen when audio capture stops and is normal.
212 return; 188 return;
213 scoped_refptr<AudioChunk> raw_audio( 189
214 new AudioChunk(data, 190 FSMEventArgs event_args(EVENT_AUDIO_DATA);
215 static_cast<size_t>(size), 191 event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size),
216 kNumBitsPerAudioSample / 8)); 192 kNumBitsPerAudioSample / 8);
217 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 193 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
218 base::Bind(&SpeechRecognizerImpl::HandleOnData, 194 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
219 this, raw_audio)); 195 this, event_args));
220 }
221
222 void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) {
223 // Check if we are still recording and if not discard this buffer, as
224 // recording might have been stopped after this buffer was posted to the queue
225 // by |OnData|.
226 if (!audio_controller_.get())
227 return;
228
229 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();
230
231 float rms;
232 endpointer_.ProcessAudio(*raw_audio, &rms);
233 bool did_clip = DetectClipping(*raw_audio);
234 num_samples_recorded_ += raw_audio->NumSamples();
235
236 if (recognition_engine_ == NULL) {
237 // This was the first audio packet recorded, so start a request to the
238 // server to send the data and inform the listener.
239 listener_->OnAudioStart(caller_id_);
240 GoogleOneShotRemoteEngineConfig google_sr_config;
241 google_sr_config.language = language_;
242 google_sr_config.grammar = grammar_;
243 google_sr_config.audio_sample_rate = kAudioSampleRate;
244 google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample;
245 google_sr_config.filter_profanities = filter_profanities_;
246 google_sr_config.hardware_info = hardware_info_;
247 google_sr_config.origin_url = origin_url_;
248 GoogleOneShotRemoteEngine* google_sr_engine =
249 new GoogleOneShotRemoteEngine(context_getter_.get());
250 google_sr_engine->SetConfig(google_sr_config);
251 recognition_engine_.reset(google_sr_engine);
252 recognition_engine_->set_delegate(this);
253 recognition_engine_->StartRecognition();
254 }
255
256 recognition_engine_->TakeAudioChunk(*raw_audio);
257
258 if (endpointer_.IsEstimatingEnvironment()) {
259 // Check if we have gathered enough audio for the endpointer to do
260 // environment estimation and should move on to detect speech/end of speech.
261 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
262 kAudioSampleRate) / 1000) {
263 endpointer_.SetUserInputMode();
264 listener_->OnEnvironmentEstimationComplete(caller_id_);
265 }
266 return; // No more processing since we are still estimating environment.
267 }
268
269 // Check if we have waited too long without hearing any speech.
270 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();
271 if (!speech_was_heard_after_packet &&
272 num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) {
273 InformErrorAndAbortRecognition(
274 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
275 return;
276 }
277
278 if (!speech_was_heard_before_packet && speech_was_heard_after_packet)
279 listener_->OnSoundStart(caller_id_);
280
281 // Calculate the input volume to display in the UI, smoothing towards the
282 // new level.
283 float level = (rms - kAudioMeterMinDb) /
284 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
285 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
286 if (level > audio_level_) {
287 audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
288 } else {
289 audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
290 }
291
292 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
293 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
294 noise_level = std::min(std::max(0.0f, noise_level),
295 kAudioMeterRangeMaxUnclipped);
296
297 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,
298 noise_level);
299
300 if (endpointer_.speech_input_complete())
301 StopAudioCapture();
302 } 196 }
303 197
304 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} 198 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
305 199
306 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( 200 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
307 const content::SpeechRecognitionResult& result) { 201 const content::SpeechRecognitionResult& result) {
308 // Guard against the listener freeing us until we finish our job. 202 FSMEventArgs event_args(EVENT_ENGINE_RESULT);
203 event_args.engine_result = result;
204 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
205 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
206 this, event_args));
207 }
208
209 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
210 const content::SpeechRecognitionError& error) {
211 FSMEventArgs event_args(EVENT_ENGINE_ERROR);
212 event_args.engine_error = error;
213 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
214 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
215 this, event_args));
216 }
217
218 // ----------------------- Core FSM implementation ---------------------------
219 // TODO(primiano) After the changes in the media package (r129173), this class
220 // slightly violates the SpeechRecognitionEventListener interface contract. In
221 // particular, it is not true anymore that this class can be freed after the
222 // OnRecognitionEnd event, since the audio_controller_.Close() asynchronous
223 // call can be still in progress after the end event. Currently, it does not
224 // represent a problem for the browser itself, since refcounting protects us
225 // against such race conditions. However, we should fix this in the next CLs.
226 // For instance, tests are currently working just because the
227 // TestAudioInputController is not closing asynchronously as the real controller
228 // does, but they will become flaky if TestAudioInputController will be fixed.
229
230 void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
231 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
232 DCHECK_LE(event_args.event, EVENT_MAX);
233 DCHECK_LE(state_, STATE_MAX);
234
235 // Event dispatching must be sequential, otherwise it will break all the rules
236 // and the assumptions of the finite state automata model.
237 DCHECK(!in_event_dispatching_);
238 in_event_dispatching_ = true;
239
240 // Guard against the delegate freeing us until we finish processing the event.
309 scoped_refptr<SpeechRecognizerImpl> me(this); 241 scoped_refptr<SpeechRecognizerImpl> me(this);
242
243 if (event_args.event == EVENT_AUDIO_DATA) {
244 DCHECK(event_args.audio_data.get() != NULL);
245 ProcessAudioPipeline(*event_args.audio_data);
246 }
247
248 // The audio pipeline must be processed before the event dispatch, otherwise
249 // it would take actions according to the future state instead of the current.
250 state_ = ExecuteTransitionAndGetNextState(event_args);
251
252 in_event_dispatching_ = false;
253 }
254
255 SpeechRecognizerImpl::FSMState
256 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
257 const FSMEventArgs& event_args) {
258 const FSMEvent event = event_args.event;
259 switch (state_) {
260 case STATE_IDLE:
261 switch (event) {
262 // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and
263 // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
264 case EVENT_ABORT:
265 return DoNothing(event_args);
266 case EVENT_START:
267 return StartRecording(event_args);
268 case EVENT_STOP_CAPTURE:
269 return DoNothing(event_args);
270 case EVENT_AUDIO_DATA:
271 return DoNothing(event_args); // Corner cases related to queued
272 case EVENT_ENGINE_RESULT: // messages being lately dispatched.
273 return DoNothing(event_args);
274 case EVENT_ENGINE_ERROR:
275 return DoNothing(event_args);
276 case EVENT_AUDIO_ERROR:
277 return DoNothing(event_args);
bulach 2012/04/04 15:38:17 I find this is a bit hard to follow.. would it be
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Hmm, the point is that is not obvious whether the
278 }
279 break;
280 case STATE_STARTING:
281 switch (event) {
282 case EVENT_ABORT:
283 return Abort(event_args);
284 case EVENT_START:
285 NOT_FEASIBLE();
286 case EVENT_STOP_CAPTURE:
287 return Abort(event_args);
288 case EVENT_AUDIO_DATA:
289 return StartRecognitionEngine(event_args);
290 case EVENT_ENGINE_RESULT:
291 NOT_FEASIBLE();
292 case EVENT_ENGINE_ERROR:
293 return Abort(event_args);
294 case EVENT_AUDIO_ERROR:
295 return Abort(event_args);
bulach 2012/04/04 15:38:17 ditto here... maybe something like: case EVENT_AUD
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Grouped adjacent cases ending with the same action
296 }
297 break;
298 case STATE_ESTIMATING_ENVIRONMENT:
299 switch (event) {
300 case EVENT_ABORT:
301 return Abort(event_args);
302 case EVENT_START:
303 NOT_FEASIBLE();
304 case EVENT_STOP_CAPTURE:
305 return StopCaptureAndWaitForResult(event_args);
306 case EVENT_AUDIO_DATA:
307 return WaitEnvironmentEstimationCompletion(event_args);
308 case EVENT_ENGINE_RESULT:
309 return ProcessIntermediateResult(event_args);
310 case EVENT_ENGINE_ERROR:
311 return Abort(event_args);
312 case EVENT_AUDIO_ERROR:
313 return Abort(event_args);
314 }
315 break;
316 case STATE_WAITING_FOR_SPEECH:
317 switch (event) {
318 case EVENT_ABORT:
319 return Abort(event_args);
320 case EVENT_START:
321 NOT_FEASIBLE();
322 case EVENT_STOP_CAPTURE:
323 return StopCaptureAndWaitForResult(event_args);
324 case EVENT_AUDIO_DATA:
325 return DetectUserSpeechOrTimeout(event_args);
326 case EVENT_ENGINE_RESULT:
327 return ProcessIntermediateResult(event_args);
328 case EVENT_ENGINE_ERROR:
329 return Abort(event_args);
330 case EVENT_AUDIO_ERROR:
331 return Abort(event_args);
332 }
333 break;
334 case STATE_RECOGNIZING:
335 switch (event) {
336 case EVENT_ABORT:
337 return Abort(event_args);
338 case EVENT_START:
339 NOT_FEASIBLE();
340 case EVENT_STOP_CAPTURE:
341 return StopCaptureAndWaitForResult(event_args);
342 case EVENT_AUDIO_DATA:
343 return DetectEndOfSpeech(event_args);
344 case EVENT_ENGINE_RESULT:
345 return ProcessIntermediateResult(event_args);
346 case EVENT_ENGINE_ERROR:
347 return Abort(event_args);
348 case EVENT_AUDIO_ERROR:
349 return Abort(event_args);
350 }
351 break;
352 case STATE_WAITING_FINAL_RESULT:
353 switch (event) {
354 case EVENT_ABORT:
355 return Abort(event_args);
356 case EVENT_START:
357 NOT_FEASIBLE();
358 case EVENT_STOP_CAPTURE:
359 return DoNothing(event_args);
360 case EVENT_AUDIO_DATA:
361 return DoNothing(event_args);
362 case EVENT_ENGINE_RESULT:
363 return ProcessFinalResult(event_args);
364 case EVENT_ENGINE_ERROR:
365 return Abort(event_args);
366 case EVENT_AUDIO_ERROR:
367 return Abort(event_args);
368 }
369 break;
370 }
371 NOT_FEASIBLE();
372 }
373
374 // ----------- Contract for all the FSM evolution functions below -------------
375 // - Are guaranteed to be executed in the IO thread;
376 // - Are guaranteed to be not reentrant (themselves and each other);
377 // - event_args members are guaranteed to be stable during the call;
378 // - The class won't be freed in the meanwhile due to callbacks;
379 // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.
380
381 // TODO(primiano) the audio pipeline is currently serial. However, the
382 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
383 // We should profile the execution to see if it would be worth or not.
384 void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
385 const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
386 state_ <= STATE_RECOGNIZING;
387 const bool route_to_sr_engine = route_to_endpointer;
388 const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
389 state_ <= STATE_RECOGNIZING;
390 const bool clip_detected = DetectClipping(raw_audio);
391 float rms = 0;
392
393 num_samples_recorded_ += raw_audio.NumSamples();
394
395 if (route_to_endpointer) {
bulach 2012/04/04 15:38:17 nit: we normally avoid {} on single line if blocks
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
396 endpointer_.ProcessAudio(raw_audio, &rms);
397 }
398 if (route_to_vumeter) {
399 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|.
400 UpdateSignalAndNoiseLevels(rms, clip_detected);
401 }
402 if (route_to_sr_engine) {
403 DCHECK(recognition_engine_.get());
404 recognition_engine_->TakeAudioChunk(raw_audio);
405 }
406 }
407
408 SpeechRecognizerImpl::FSMState
409 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
410 DCHECK(recognition_engine_.get());
411 DCHECK(!IsCapturingAudio());
412 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
413 testing_audio_manager_ :
414 BrowserMainLoop::GetAudioManager();
415 DCHECK(audio_manager != NULL);
416
417 VLOG(1) << "SpeechRecognizerImpl starting audio capture.";
bulach 2012/04/04 15:38:17 nit: DVLOG?
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
418 num_samples_recorded_ = 0;
419 audio_level_ = 0;
420 listener_->OnRecognitionStart(caller_id_);
421
422 if (!audio_manager->HasAudioInputDevices()) {
423 return AbortWithError(SpeechRecognitionError(
424 content::SPEECH_RECOGNITION_ERROR_AUDIO,
425 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
426 }
427
428 if (audio_manager->IsRecordingInProcess()) {
429 return AbortWithError(SpeechRecognitionError(
430 content::SPEECH_RECOGNITION_ERROR_AUDIO,
431 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE));
432 }
433
434 const int samples_per_packet = (kAudioSampleRate *
435 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;
436 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
437 kAudioSampleRate, kNumBitsPerAudioSample,
438 samples_per_packet);
439 audio_controller_ = AudioInputController::Create(audio_manager, this, params);
440
441 if (audio_controller_.get() == NULL) {
bulach 2012/04/04 15:38:17 nit: if (!audio_controller_.get()) {
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Hmm is it strict? I feel to violate my moral and e
442 return AbortWithError(
443 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
444 }
445
446 // The endpointer needs to estimate the environment/background noise before
447 // starting to treat the audio as user input. We wait in the state
448 // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching
449 // to user input mode.
450 endpointer_.SetEnvironmentEstimationMode();
451 audio_controller_->Record();
452 return STATE_STARTING;
453 }
454
455 SpeechRecognizerImpl::FSMState
456 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
457 // This is the first audio packet captured, so the recognition engine is
458 // started and the delegate notified about the event.
459 DCHECK(recognition_engine_.get());
460 recognition_engine_->StartRecognition();
461 listener_->OnAudioStart(caller_id_);
462
463 // This is a little hack, since TakeAudioChunk() is already called by
464 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
465 // the first audio chunk captured after opening the audio device.
466 recognition_engine_->TakeAudioChunk(*(event_args.audio_data));
467 return STATE_ESTIMATING_ENVIRONMENT;
468 }
469
470 SpeechRecognizerImpl::FSMState
471 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
472 DCHECK(endpointer_.IsEstimatingEnvironment());
473 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
474 endpointer_.SetUserInputMode();
475 listener_->OnEnvironmentEstimationComplete(caller_id_);
476 return STATE_WAITING_FOR_SPEECH;
477 } else {
bulach 2012/04/04 15:38:17 nit: here, 491 and 500, remove the final "else" bl
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
478 return STATE_ESTIMATING_ENVIRONMENT;
479 }
480 }
481
482 SpeechRecognizerImpl::FSMState
483 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
484 if (endpointer_.DidStartReceivingSpeech()) {
485 listener_->OnSoundStart(caller_id_);
486 return STATE_RECOGNIZING;
487 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
488 return AbortWithError(
489 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));
490 } else {
491 return STATE_WAITING_FOR_SPEECH;
492 }
493 }
494
495 SpeechRecognizerImpl::FSMState
496 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
497 if (endpointer_.speech_input_complete()) {
498 return StopCaptureAndWaitForResult(event_args);
499 } else {
500 return STATE_RECOGNIZING;
501 }
502 }
503
504 SpeechRecognizerImpl::FSMState
505 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
506 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);
507
508 VLOG(1) << "Concluding recognition";
bulach 2012/04/04 15:38:17 nit: DVLOG?
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
509 CloseAudioControllerAsynchronously();
510 recognition_engine_->AudioChunksEnded();
511
512 if (state_ > STATE_WAITING_FOR_SPEECH)
513 listener_->OnSoundEnd(caller_id_);
514
515 listener_->OnAudioEnd(caller_id_);
516 return STATE_WAITING_FINAL_RESULT;
517 }
518
519 SpeechRecognizerImpl::FSMState
520 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {
521 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of
522 // other specific error sources (so that it was an explicit abort request).
523 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers
bulach 2012/04/04 15:38:17 which UI layers? I think it's about the renderers,
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
524 // and currently would cause an exception. JS will probably need it in future.
525 if (event_args.event == EVENT_AUDIO_ERROR) {
526 return AbortWithError(
527 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
528 } else if (event_args.event == EVENT_ENGINE_ERROR) {
529 return AbortWithError(event_args.engine_error);
530 }
531 return AbortWithError(NULL);
532 }
533
534 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
535 const SpeechRecognitionError& error) {
bulach 2012/04/04 15:38:17 can we avoid this overload?
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Hmm I guess it would make more verbose statements
536 return AbortWithError(&error);
537 }
538
539 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
540 const SpeechRecognitionError* error) {
541 if (IsCapturingAudio())
542 CloseAudioControllerAsynchronously();
543
544 VLOG(1) << "SpeechRecognizerImpl canceling recognition. ";
545
546 // The recognition engine is initialized only after STATE_STARTING.
547 if (state_ > STATE_STARTING) {
548 DCHECK(recognition_engine_.get());
549 recognition_engine_->EndRecognition();
550 }
551
552 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
553 listener_->OnSoundEnd(caller_id_);
554
555 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
556 listener_->OnAudioEnd(caller_id_);
557
558 if (error != NULL)
559 listener_->OnRecognitionError(caller_id_, *error);
560
561 listener_->OnRecognitionEnd(caller_id_);
562
563 return STATE_IDLE;
564 }
565
566 SpeechRecognizerImpl::FSMState
567 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {
568 // This is in preparation for future speech recognition functions.
bulach 2012/04/04 15:38:17 nit: indent
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
569 NOTREACHED();
570 return state_;
571 }
572
573 SpeechRecognizerImpl::FSMState
574 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
575 const SpeechRecognitionResult& result = event_args.engine_result;
576 VLOG(1) << "Got valid result";
bulach 2012/04/04 15:38:17 nit: DVLOG
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 Done.
577 recognition_engine_->EndRecognition();
310 listener_->OnRecognitionResult(caller_id_, result); 578 listener_->OnRecognitionResult(caller_id_, result);
311 listener_->OnRecognitionEnd(caller_id_); 579 listener_->OnRecognitionEnd(caller_id_);
312 } 580 return STATE_IDLE;
313 581 }
314 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( 582
315 const content::SpeechRecognitionError& error) { 583 SpeechRecognizerImpl::FSMState
316 InformErrorAndAbortRecognition(error.code); 584 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
317 } 585 return state_; // Just keep the current state.
318
319 void SpeechRecognizerImpl::InformErrorAndAbortRecognition(
320 content::SpeechRecognitionErrorCode error) {
321 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);
322 AbortRecognition();
323
324 // Guard against the listener freeing us until we finish our job.
325 scoped_refptr<SpeechRecognizerImpl> me(this);
326 listener_->OnRecognitionError(caller_id_, error);
327 } 586 }
328 587
329 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { 588 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
330 VLOG(1) << "SpeechRecognizer stopping record."; 589 DCHECK(IsCapturingAudio());
590 VLOG(1) << "SpeechRecognizerImpl stopping audio capture.";
331 // Issues a Close on the audio controller, passing an empty callback. The only 591 // Issues a Close on the audio controller, passing an empty callback. The only
332 // purpose of such callback is to keep the audio controller refcounted until 592 // purpose of such callback is to keep the audio controller refcounted until
333 // Close has completed (in the audio thread) and automatically destroy it 593 // Close has completed (in the audio thread) and automatically destroy it
334 // afterwards (upon return from OnAudioClosed). 594 // afterwards (upon return from OnAudioClosed).
335 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, 595 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
336 this, audio_controller_)); 596 this, audio_controller_));
337 audio_controller_ = NULL; // The controller is still refcounted by Bind. 597 audio_controller_ = NULL; // The controller is still refcounted by Bind.
338 } 598 }
339 599
340 bool SpeechRecognizerImpl::IsActive() const { 600 int SpeechRecognizerImpl::GetElapsedTimeMs() const {
341 return (recognition_engine_.get() != NULL); 601 return (num_samples_recorded_ * 1000) / kAudioSampleRate;
342 } 602 }
343 603
344 bool SpeechRecognizerImpl::IsCapturingAudio() const { 604 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
345 return (audio_controller_.get() != NULL); 605 bool clip_detected) {
606 // Calculate the input volume to display in the UI, smoothing towards the
607 // new level.
608 // TODO(primiano) Do we really need all this floating point arith here?
609 // Perhaps it might be quite expensive on mobile.
610 float level = (rms - kAudioMeterMinDb) /
611 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
612 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
613 if (level > audio_level_) {
614 audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
bulach 2012/04/04 15:38:17 nit: you can probably simplify this with: const s
Primiano Tucci (use gerrit) 2012/04/11 10:05:41 It was code "inherited" from the original class, b
615 } else {
616 audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
617 }
618
619 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
620 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
621 noise_level = std::min(std::max(0.0f, noise_level),
622 kAudioMeterRangeMaxUnclipped);
623
624 listener_->OnAudioLevelsChange(
625 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level);
346 } 626 }
347 627
348 const SpeechRecognitionEngine& 628 const SpeechRecognitionEngine&
349 SpeechRecognizerImpl::recognition_engine() const { 629 SpeechRecognizerImpl::recognition_engine() const {
350 return *(recognition_engine_.get()); 630 return *(recognition_engine_.get());
351 } 631 }
352 632
353 void SpeechRecognizerImpl::SetAudioManagerForTesting( 633 void SpeechRecognizerImpl::SetAudioManagerForTesting(
354 AudioManager* audio_manager) { 634 AudioManager* audio_manager) {
355 testing_audio_manager_ = audio_manager; 635 testing_audio_manager_ = audio_manager;
356 } 636 }
357 637
638 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
639 : event(event_value),
640 audio_error_code(0),
641 audio_data(NULL),
642 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {
643 }
644
645 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
646 }
358 647
359 } // namespace speech 648 } // namespace speech
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698