Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(565)

Side by Side Diff: chrome/browser/speech/speech_recognizer.cc

Issue 3341020: Speech input: Do environment estimation and detect the no-speech case. (Closed)
Patch Set: . Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/speech/speech_recognizer.h" 5 #include "chrome/browser/speech/speech_recognizer.h"
6 6
7 #include "base/ref_counted.h" 7 #include "base/ref_counted.h"
8 #include "base/scoped_ptr.h" 8 #include "base/scoped_ptr.h"
9 #include "base/time.h" 9 #include "base/time.h"
10 #include "chrome/browser/chrome_thread.h" 10 #include "chrome/browser/chrome_thread.h"
11 #include "chrome/browser/profile.h" 11 #include "chrome/browser/profile.h"
12 #include "chrome/common/net/url_request_context_getter.h" 12 #include "chrome/common/net/url_request_context_getter.h"
13 #include "third_party/speex/include/speex/speex.h" 13 #include "third_party/speex/include/speex/speex.h"
14 14
15 using media::AudioInputController; 15 using media::AudioInputController;
16 using std::list; 16 using std::list;
17 using std::string; 17 using std::string;
18 18
19 namespace { 19 namespace {
20 const char* const kDefaultSpeechRecognitionUrl = 20 const char* const kDefaultSpeechRecognitionUrl =
21 "http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium"; 21 "http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";
22 const char* const kContentTypeSpeex = 22 const char* const kContentTypeSpeex =
23 "audio/x-speex-with-header-byte; rate=16000"; 23 "audio/x-speex-with-header-byte; rate=16000";
24 const int kAudioSampleRate = 16000;
25 const int kSpeexEncodingQuality = 8; 24 const int kSpeexEncodingQuality = 8;
26 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). 25 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
27 26
28 // Since the frame length gets written out as a byte in the encoded packet, 27 // Since the frame length gets written out as a byte in the encoded packet,
29 // make sure it is within the byte range. 28 // make sure it is within the byte range.
30 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); 29 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
31 30
32 const int kAudioPacketIntervalMs = 100; // Record 100ms long audio packets. 31 const int kEndpointerEstimationTimeMs = 300;
33 const int kNumAudioChannels = 1; // Speech is recorded as mono.
34 const int kNumBitsPerAudioSample = 16;
35 } // namespace 32 } // namespace
36 33
37 namespace speech_input { 34 namespace speech_input {
38 35
36 const int SpeechRecognizer::kAudioSampleRate = 16000;
37 const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
38 const int SpeechRecognizer::kNumAudioChannels = 1;
39 const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
40 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
41
39 // Provides a simple interface to encode raw audio using the Speex codec. 42 // Provides a simple interface to encode raw audio using the Speex codec.
40 class SpeexEncoder { 43 class SpeexEncoder {
41 public: 44 public:
42 SpeexEncoder(); 45 SpeexEncoder();
43 ~SpeexEncoder(); 46 ~SpeexEncoder();
44 47
45 int samples_per_frame() const { return samples_per_frame_; } 48 int samples_per_frame() const { return samples_per_frame_; }
46 49
47 // Encodes each frame of raw audio in |samples| and adds the 50 // Encodes each frame of raw audio in |samples| and adds the
48 // encoded frames as a set of strings to the |encoded_frames| list. 51 // encoded frames as a set of strings to the |encoded_frames| list.
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
115 DCHECK(!request_.get() || !request_->HasPendingRequest()); 118 DCHECK(!request_.get() || !request_->HasPendingRequest());
116 DCHECK(audio_buffers_.empty()); 119 DCHECK(audio_buffers_.empty());
117 endpointer_.EndSession(); 120 endpointer_.EndSession();
118 } 121 }
119 122
120 bool SpeechRecognizer::StartRecording() { 123 bool SpeechRecognizer::StartRecording() {
121 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); 124 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));
122 DCHECK(!audio_controller_.get()); 125 DCHECK(!audio_controller_.get());
123 DCHECK(!request_.get() || !request_->HasPendingRequest()); 126 DCHECK(!request_.get() || !request_->HasPendingRequest());
124 127
125 // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to 128 // The endpointer needs to estimate the environment/background noise before
126 // estimate the environment/background noise before starting to treat the 129 // starting to treat the audio as user input. In |HandleOnData| we wait until
127 // audio as user input. Once we have implemented a popup UI to notify the user 130 // such time has passed before switching to user input mode.
128 // that recording has started, we should perhaps have a short interval where
129 // we record background audio and then show the popup UI so that the user can
130 // start speaking after that. For now we just do these together so there isn't
131 // any background noise for the end pointer (still works ok).
132 endpointer_.SetEnvironmentEstimationMode(); 131 endpointer_.SetEnvironmentEstimationMode();
133 endpointer_.SetUserInputMode();
134 132
135 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; 133 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
136 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); 134 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
137 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, 135 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
138 kAudioSampleRate, kNumBitsPerAudioSample); 136 kAudioSampleRate, kNumBitsPerAudioSample);
139 audio_controller_ = 137 audio_controller_ =
140 AudioInputController::Create(this, params, samples_per_packet); 138 AudioInputController::Create(this, params, samples_per_packet);
141 DCHECK(audio_controller_.get()); 139 DCHECK(audio_controller_.get());
142 LOG(INFO) << "SpeechRecognizer starting record."; 140 LOG(INFO) << "SpeechRecognizer starting record.";
141 num_samples_recorded_ = 0;
143 audio_controller_->Record(); 142 audio_controller_->Record();
144 143
145 return true; 144 return true;
146 } 145 }
147 146
148 void SpeechRecognizer::CancelRecognition() { 147 void SpeechRecognizer::CancelRecognition() {
149 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); 148 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));
150 DCHECK(audio_controller_.get() || request_.get()); 149 DCHECK(audio_controller_.get() || request_.get());
151 150
152 // Stop recording if required. 151 // Stop recording if required.
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
224 223
225 void SpeechRecognizer::HandleOnError(int error_code) { 224 void SpeechRecognizer::HandleOnError(int error_code) {
226 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; 225 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
227 226
228 // Check if we are still recording before canceling recognition, as 227 // Check if we are still recording before canceling recognition, as
229 // recording might have been stopped after this error was posted to the queue 228 // recording might have been stopped after this error was posted to the queue
230 // by |OnError|. 229 // by |OnError|.
231 if (!audio_controller_.get()) 230 if (!audio_controller_.get())
232 return; 231 return;
233 232
234 delegate_->OnRecognizerError(caller_id_); 233 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
235 CancelRecognition();
236 delegate_->DidCompleteRecording(caller_id_);
237 delegate_->DidCompleteRecognition(caller_id_);
238 } 234 }
239 235
240 void SpeechRecognizer::OnData(AudioInputController* controller, 236 void SpeechRecognizer::OnData(AudioInputController* controller,
241 const uint8* data, uint32 size) { 237 const uint8* data, uint32 size) {
242 if (size == 0) // This could happen when recording stops and is normal. 238 if (size == 0) // This could happen when recording stops and is normal.
243 return; 239 return;
244 240
245 string* str_data = new string(reinterpret_cast<const char*>(data), size); 241 string* str_data = new string(reinterpret_cast<const char*>(data), size);
246 ChromeThread::PostTask(ChromeThread::IO, FROM_HERE, 242 ChromeThread::PostTask(ChromeThread::IO, FROM_HERE,
247 NewRunnableMethod(this, 243 NewRunnableMethod(this,
(...skipping 10 matching lines...) Expand all
258 return; 254 return;
259 } 255 }
260 256
261 const short* samples = reinterpret_cast<const short*>(data->data()); 257 const short* samples = reinterpret_cast<const short*>(data->data());
262 DCHECK((data->length() % sizeof(short)) == 0); 258 DCHECK((data->length() % sizeof(short)) == 0);
263 int num_samples = data->length() / sizeof(short); 259 int num_samples = data->length() / sizeof(short);
264 260
265 encoder_->Encode(samples, num_samples, &audio_buffers_); 261 encoder_->Encode(samples, num_samples, &audio_buffers_);
266 endpointer_.ProcessAudio(samples, num_samples); 262 endpointer_.ProcessAudio(samples, num_samples);
267 delete data; 263 delete data;
264 num_samples_recorded_ += num_samples;
265
266 // Check if we have gathered enough audio for the endpointer to do environment
267 // estimation and should move on to detect speech/end of speech.
268 if (endpointer_.IsEstimatingEnvironment() &&
269 num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
270 kAudioSampleRate) / 1000) {
271 endpointer_.SetUserInputMode();
272 delegate_->DidCompleteEnvironmentEstimation(caller_id_);
273 return;
274 }
275
276 // Check if we have waited too long without hearing any speech.
277 if (!endpointer_.DidStartReceivingSpeech() &&
278 num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
279 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
280 return;
281 }
268 282
269 if (endpointer_.speech_input_complete()) { 283 if (endpointer_.speech_input_complete()) {
270 StopRecording(); 284 StopRecording();
271 } 285 }
272 286
273 // TODO(satish): Once we have streaming POST, start sending the data received 287 // TODO(satish): Once we have streaming POST, start sending the data received
274 // here as POST chunks. 288 // here as POST chunks.
275 } 289 }
276 290
277 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { 291 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {
292 if (value.empty()) {
293 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);
294 return;
295 }
296
278 delegate_->SetRecognitionResult(caller_id_, error, value); 297 delegate_->SetRecognitionResult(caller_id_, error, value);
279 298
280 // Guard against the delegate freeing us until we finish our job. 299 // Guard against the delegate freeing us until we finish our job.
281 scoped_refptr<SpeechRecognizer> me(this); 300 scoped_refptr<SpeechRecognizer> me(this);
282 delegate_->DidCompleteRecognition(caller_id_); 301 delegate_->DidCompleteRecognition(caller_id_);
283 } 302 }
284 303
304 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
305 CancelRecognition();
306
307 // Guard against the delegate freeing us until we finish our job.
308 scoped_refptr<SpeechRecognizer> me(this);
309 delegate_->OnRecognizerError(caller_id_, error);
310 delegate_->DidCompleteRecording(caller_id_);
311 delegate_->DidCompleteRecognition(caller_id_);
312 }
313
285 } // namespace speech_input 314 } // namespace speech_input
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698