| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
| 6 | 6 |
| 7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
| 8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
| 9 #include "base/time.h" | 9 #include "base/time.h" |
| 10 #include "chrome/browser/chrome_thread.h" | 10 #include "chrome/browser/chrome_thread.h" |
| 11 #include "chrome/browser/profile.h" | 11 #include "chrome/browser/profile.h" |
| 12 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
| 13 #include "third_party/speex/include/speex/speex.h" | 13 #include "third_party/speex/include/speex/speex.h" |
| 14 | 14 |
| 15 using media::AudioInputController; | 15 using media::AudioInputController; |
| 16 using std::list; | 16 using std::list; |
| 17 using std::string; | 17 using std::string; |
| 18 | 18 |
| 19 namespace { | 19 namespace { |
| 20 const char* const kDefaultSpeechRecognitionUrl = | 20 const char* const kDefaultSpeechRecognitionUrl = |
| 21 "http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium"; | 21 "http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium"; |
| 22 const char* const kContentTypeSpeex = | 22 const char* const kContentTypeSpeex = |
| 23 "audio/x-speex-with-header-byte; rate=16000"; | 23 "audio/x-speex-with-header-byte; rate=16000"; |
| 24 const int kAudioSampleRate = 16000; | |
| 25 const int kSpeexEncodingQuality = 8; | 24 const int kSpeexEncodingQuality = 8; |
| 26 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). | 25 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). |
| 27 | 26 |
| 28 // Since the frame length gets written out as a byte in the encoded packet, | 27 // Since the frame length gets written out as a byte in the encoded packet, |
| 29 // make sure it is within the byte range. | 28 // make sure it is within the byte range. |
| 30 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); | 29 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); |
| 31 | 30 |
| 32 const int kAudioPacketIntervalMs = 100; // Record 100ms long audio packets. | 31 const int kEndpointerEstimationTimeMs = 300; |
| 33 const int kNumAudioChannels = 1; // Speech is recorded as mono. | |
| 34 const int kNumBitsPerAudioSample = 16; | |
| 35 } // namespace | 32 } // namespace |
| 36 | 33 |
| 37 namespace speech_input { | 34 namespace speech_input { |
| 38 | 35 |
| 36 const int SpeechRecognizer::kAudioSampleRate = 16000; |
| 37 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; |
| 38 const int SpeechRecognizer::kNumAudioChannels = 1; |
| 39 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; |
| 40 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; |
| 41 |
| 39 // Provides a simple interface to encode raw audio using the Speex codec. | 42 // Provides a simple interface to encode raw audio using the Speex codec. |
| 40 class SpeexEncoder { | 43 class SpeexEncoder { |
| 41 public: | 44 public: |
| 42 SpeexEncoder(); | 45 SpeexEncoder(); |
| 43 ~SpeexEncoder(); | 46 ~SpeexEncoder(); |
| 44 | 47 |
| 45 int samples_per_frame() const { return samples_per_frame_; } | 48 int samples_per_frame() const { return samples_per_frame_; } |
| 46 | 49 |
| 47 // Encodes each frame of raw audio in |samples| and adds the | 50 // Encodes each frame of raw audio in |samples| and adds the |
| 48 // encoded frames as a set of strings to the |encoded_frames| list. | 51 // encoded frames as a set of strings to the |encoded_frames| list. |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 115 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 118 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 116 DCHECK(audio_buffers_.empty()); | 119 DCHECK(audio_buffers_.empty()); |
| 117 endpointer_.EndSession(); | 120 endpointer_.EndSession(); |
| 118 } | 121 } |
| 119 | 122 |
| 120 bool SpeechRecognizer::StartRecording() { | 123 bool SpeechRecognizer::StartRecording() { |
| 121 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); | 124 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); |
| 122 DCHECK(!audio_controller_.get()); | 125 DCHECK(!audio_controller_.get()); |
| 123 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 126 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 124 | 127 |
| 125 // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to | 128 // The endpointer needs to estimate the environment/background noise before |
| 126 // estimate the environment/background noise before starting to treat the | 129 // starting to treat the audio as user input. In |HandleOnData| we wait until |
| 127 // audio as user input. Once we have implemented a popup UI to notify the user | 130 // such time has passed before switching to user input mode. |
| 128 // that recording has started, we should perhaps have a short interval where | |
| 129 // we record background audio and then show the popup UI so that the user can | |
| 130 // start speaking after that. For now we just do these together so there isn't | |
| 131 // any background noise for the end pointer (still works ok). | |
| 132 endpointer_.SetEnvironmentEstimationMode(); | 131 endpointer_.SetEnvironmentEstimationMode(); |
| 133 endpointer_.SetUserInputMode(); | |
| 134 | 132 |
| 135 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 133 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
| 136 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | 134 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); |
| 137 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, | 135 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, |
| 138 kAudioSampleRate, kNumBitsPerAudioSample); | 136 kAudioSampleRate, kNumBitsPerAudioSample); |
| 139 audio_controller_ = | 137 audio_controller_ = |
| 140 AudioInputController::Create(this, params, samples_per_packet); | 138 AudioInputController::Create(this, params, samples_per_packet); |
| 141 DCHECK(audio_controller_.get()); | 139 DCHECK(audio_controller_.get()); |
| 142 LOG(INFO) << "SpeechRecognizer starting record."; | 140 LOG(INFO) << "SpeechRecognizer starting record."; |
| 141 num_samples_recorded_ = 0; |
| 143 audio_controller_->Record(); | 142 audio_controller_->Record(); |
| 144 | 143 |
| 145 return true; | 144 return true; |
| 146 } | 145 } |
| 147 | 146 |
| 148 void SpeechRecognizer::CancelRecognition() { | 147 void SpeechRecognizer::CancelRecognition() { |
| 149 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); | 148 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); |
| 150 DCHECK(audio_controller_.get() || request_.get()); | 149 DCHECK(audio_controller_.get() || request_.get()); |
| 151 | 150 |
| 152 // Stop recording if required. | 151 // Stop recording if required. |
| (...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 224 | 223 |
| 225 void SpeechRecognizer::HandleOnError(int error_code) { | 224 void SpeechRecognizer::HandleOnError(int error_code) { |
| 226 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; | 225 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; |
| 227 | 226 |
| 228 // Check if we are still recording before canceling recognition, as | 227 // Check if we are still recording before canceling recognition, as |
| 229 // recording might have been stopped after this error was posted to the queue | 228 // recording might have been stopped after this error was posted to the queue |
| 230 // by |OnError|. | 229 // by |OnError|. |
| 231 if (!audio_controller_.get()) | 230 if (!audio_controller_.get()) |
| 232 return; | 231 return; |
| 233 | 232 |
| 234 delegate_->OnRecognizerError(caller_id_); | 233 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE); |
| 235 CancelRecognition(); | |
| 236 delegate_->DidCompleteRecording(caller_id_); | |
| 237 delegate_->DidCompleteRecognition(caller_id_); | |
| 238 } | 234 } |
| 239 | 235 |
| 240 void SpeechRecognizer::OnData(AudioInputController* controller, | 236 void SpeechRecognizer::OnData(AudioInputController* controller, |
| 241 const uint8* data, uint32 size) { | 237 const uint8* data, uint32 size) { |
| 242 if (size == 0) // This could happen when recording stops and is normal. | 238 if (size == 0) // This could happen when recording stops and is normal. |
| 243 return; | 239 return; |
| 244 | 240 |
| 245 string* str_data = new string(reinterpret_cast<const char*>(data), size); | 241 string* str_data = new string(reinterpret_cast<const char*>(data), size); |
| 246 ChromeThread::PostTask(ChromeThread::IO, FROM_HERE, | 242 ChromeThread::PostTask(ChromeThread::IO, FROM_HERE, |
| 247 NewRunnableMethod(this, | 243 NewRunnableMethod(this, |
| (...skipping 10 matching lines...) Expand all Loading... |
| 258 return; | 254 return; |
| 259 } | 255 } |
| 260 | 256 |
| 261 const short* samples = reinterpret_cast<const short*>(data->data()); | 257 const short* samples = reinterpret_cast<const short*>(data->data()); |
| 262 DCHECK((data->length() % sizeof(short)) == 0); | 258 DCHECK((data->length() % sizeof(short)) == 0); |
| 263 int num_samples = data->length() / sizeof(short); | 259 int num_samples = data->length() / sizeof(short); |
| 264 | 260 |
| 265 encoder_->Encode(samples, num_samples, &audio_buffers_); | 261 encoder_->Encode(samples, num_samples, &audio_buffers_); |
| 266 endpointer_.ProcessAudio(samples, num_samples); | 262 endpointer_.ProcessAudio(samples, num_samples); |
| 267 delete data; | 263 delete data; |
| 264 num_samples_recorded_ += num_samples; |
| 265 |
| 266 // Check if we have gathered enough audio for the endpointer to do environment |
| 267 // estimation and should move on to detect speech/end of speech. |
| 268 if (endpointer_.IsEstimatingEnvironment() && |
| 269 num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
| 270 kAudioSampleRate) / 1000) { |
| 271 endpointer_.SetUserInputMode(); |
| 272 delegate_->DidCompleteEnvironmentEstimation(caller_id_); |
| 273 return; |
| 274 } |
| 275 |
| 276 // Check if we have waited too long without hearing any speech. |
| 277 if (!endpointer_.DidStartReceivingSpeech() && |
| 278 num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) { |
| 279 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH); |
| 280 return; |
| 281 } |
| 268 | 282 |
| 269 if (endpointer_.speech_input_complete()) { | 283 if (endpointer_.speech_input_complete()) { |
| 270 StopRecording(); | 284 StopRecording(); |
| 271 } | 285 } |
| 272 | 286 |
| 273 // TODO(satish): Once we have streaming POST, start sending the data received | 287 // TODO(satish): Once we have streaming POST, start sending the data received |
| 274 // here as POST chunks. | 288 // here as POST chunks. |
| 275 } | 289 } |
| 276 | 290 |
| 277 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { | 291 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { |
| 292 if (value.empty()) { |
| 293 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS); |
| 294 return; |
| 295 } |
| 296 |
| 278 delegate_->SetRecognitionResult(caller_id_, error, value); | 297 delegate_->SetRecognitionResult(caller_id_, error, value); |
| 279 | 298 |
| 280 // Guard against the delegate freeing us until we finish our job. | 299 // Guard against the delegate freeing us until we finish our job. |
| 281 scoped_refptr<SpeechRecognizer> me(this); | 300 scoped_refptr<SpeechRecognizer> me(this); |
| 282 delegate_->DidCompleteRecognition(caller_id_); | 301 delegate_->DidCompleteRecognition(caller_id_); |
| 283 } | 302 } |
| 284 | 303 |
| 304 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { |
| 305 CancelRecognition(); |
| 306 |
| 307 // Guard against the delegate freeing us until we finish our job. |
| 308 scoped_refptr<SpeechRecognizer> me(this); |
| 309 delegate_->OnRecognizerError(caller_id_, error); |
| 310 delegate_->DidCompleteRecording(caller_id_); |
| 311 delegate_->DidCompleteRecognition(caller_id_); |
| 312 } |
| 313 |
| 285 } // namespace speech_input | 314 } // namespace speech_input |
| OLD | NEW |