| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
| 6 | 6 |
| 7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
| 8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
| 9 #include "base/time.h" |
| 9 #include "chrome/browser/chrome_thread.h" | 10 #include "chrome/browser/chrome_thread.h" |
| 10 #include "chrome/browser/profile.h" | 11 #include "chrome/browser/profile.h" |
| 11 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
| 12 #include "third_party/speex/include/speex/speex.h" | 13 #include "third_party/speex/include/speex/speex.h" |
| 13 | 14 |
| 14 using media::AudioInputController; | 15 using media::AudioInputController; |
| 15 using std::list; | 16 using std::list; |
| 16 using std::string; | 17 using std::string; |
| 17 | 18 |
| 18 namespace { | 19 namespace { |
| (...skipping 17 matching lines...) Expand all Loading... |
| 36 namespace speech_input { | 37 namespace speech_input { |
| 37 | 38 |
| 38 // Provides a simple interface to encode raw audio using the Speex codec. | 39 // Provides a simple interface to encode raw audio using the Speex codec. |
| 39 class SpeexEncoder { | 40 class SpeexEncoder { |
| 40 public: | 41 public: |
| 41 SpeexEncoder(); | 42 SpeexEncoder(); |
| 42 ~SpeexEncoder(); | 43 ~SpeexEncoder(); |
| 43 | 44 |
| 44 int samples_per_frame() const { return samples_per_frame_; } | 45 int samples_per_frame() const { return samples_per_frame_; } |
| 45 | 46 |
| 46 // Encodes each frame of raw audio in |raw_samples| and adds the | 47 // Encodes each frame of raw audio in |samples| and adds the |
| 47 // encoded frames as a set of strings to the |encoded_frames| list. | 48 // encoded frames as a set of strings to the |encoded_frames| list. |
| 48 // Ownership of the newly added strings is transferred to the caller. | 49 // Ownership of the newly added strings is transferred to the caller. |
| 49 void Encode(const string& raw_samples, | 50 void Encode(const short* samples, |
| 51 int num_samples, |
| 50 std::list<std::string*>* encoded_frames); | 52 std::list<std::string*>* encoded_frames); |
| 51 | 53 |
| 52 private: | 54 private: |
| 53 SpeexBits bits_; | 55 SpeexBits bits_; |
| 54 void* encoder_state_; | 56 void* encoder_state_; |
| 55 int samples_per_frame_; | 57 int samples_per_frame_; |
| 56 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. | 58 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. |
| 57 }; | 59 }; |
| 58 | 60 |
| 59 SpeexEncoder::SpeexEncoder() { | 61 SpeexEncoder::SpeexEncoder() { |
| 60 speex_bits_init(&bits_); | 62 speex_bits_init(&bits_); |
| 61 encoder_state_ = speex_encoder_init(&speex_wb_mode); | 63 encoder_state_ = speex_encoder_init(&speex_wb_mode); |
| 62 DCHECK(encoder_state_); | 64 DCHECK(encoder_state_); |
| 63 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); | 65 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); |
| 64 DCHECK(samples_per_frame_ > 0); | 66 DCHECK(samples_per_frame_ > 0); |
| 65 int quality = kSpeexEncodingQuality; | 67 int quality = kSpeexEncodingQuality; |
| 66 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); | 68 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); |
| 67 int vbr = 1; | 69 int vbr = 1; |
| 68 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); | 70 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); |
| 69 } | 71 } |
| 70 | 72 |
| 71 SpeexEncoder::~SpeexEncoder() { | 73 SpeexEncoder::~SpeexEncoder() { |
| 72 speex_bits_destroy(&bits_); | 74 speex_bits_destroy(&bits_); |
| 73 speex_encoder_destroy(encoder_state_); | 75 speex_encoder_destroy(encoder_state_); |
| 74 } | 76 } |
| 75 | 77 |
| 76 void SpeexEncoder::Encode(const string& raw_samples, | 78 void SpeexEncoder::Encode(const short* samples, |
| 79 int num_samples, |
| 77 std::list<std::string*>* encoded_frames) { | 80 std::list<std::string*>* encoded_frames) { |
| 78 const short* samples = reinterpret_cast<const short*>(raw_samples.data()); | |
| 79 DCHECK((raw_samples.length() % sizeof(short)) == 0); | |
| 80 int num_samples = raw_samples.length() / sizeof(short); | |
| 81 | |
| 82 // Drop incomplete frames, typically those which come in when recording stops. | 81 // Drop incomplete frames, typically those which come in when recording stops. |
| 83 num_samples -= (num_samples % samples_per_frame_); | 82 num_samples -= (num_samples % samples_per_frame_); |
| 84 for (int i = 0; i < num_samples; i += samples_per_frame_) { | 83 for (int i = 0; i < num_samples; i += samples_per_frame_) { |
| 85 speex_bits_reset(&bits_); | 84 speex_bits_reset(&bits_); |
| 86 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), | 85 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), |
| 87 &bits_); | 86 &bits_); |
| 88 | 87 |
| 89 // Encode the frame and place the size of the frame as the first byte. This | 88 // Encode the frame and place the size of the frame as the first byte. This |
| 90 // is the packet format for MIME type x-speex-with-header-byte. | 89 // is the packet format for MIME type x-speex-with-header-byte. |
| 91 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, | 90 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, |
| 92 kMaxSpeexFrameLength); | 91 kMaxSpeexFrameLength); |
| 93 encoded_frame_data_[0] = static_cast<char>(frame_length); | 92 encoded_frame_data_[0] = static_cast<char>(frame_length); |
| 94 encoded_frames->push_back(new string(encoded_frame_data_, | 93 encoded_frames->push_back(new string(encoded_frame_data_, |
| 95 frame_length + 1)); | 94 frame_length + 1)); |
| 96 } | 95 } |
| 97 } | 96 } |
| 98 | 97 |
| 99 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, | 98 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
| 100 const SpeechInputCallerId& caller_id) | 99 const SpeechInputCallerId& caller_id) |
| 101 : delegate_(delegate), | 100 : delegate_(delegate), |
| 102 caller_id_(caller_id), | 101 caller_id_(caller_id), |
| 103 encoder_(new SpeexEncoder()) { | 102 encoder_(new SpeexEncoder()), |
| 103 endpointer_(kAudioSampleRate) { |
| 104 endpointer_.set_speech_input_complete_silence_length( |
| 105 base::Time::kMicrosecondsPerSecond / 2); |
| 106 endpointer_.set_long_speech_input_complete_silence_length( |
| 107 base::Time::kMicrosecondsPerSecond); |
| 108 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| 109 endpointer_.StartSession(); |
| 104 } | 110 } |
| 105 | 111 |
| 106 SpeechRecognizer::~SpeechRecognizer() { | 112 SpeechRecognizer::~SpeechRecognizer() { |
| 107 // Recording should have stopped earlier due to the endpointer or | 113 // Recording should have stopped earlier due to the endpointer or |
| 108 // |StopRecording| being called. | 114 // |StopRecording| being called. |
| 109 DCHECK(!audio_controller_.get()); | 115 DCHECK(!audio_controller_.get()); |
| 110 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 116 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 111 DCHECK(audio_buffers_.empty()); | 117 DCHECK(audio_buffers_.empty()); |
| 118 endpointer_.EndSession(); |
| 112 } | 119 } |
| 113 | 120 |
| 114 bool SpeechRecognizer::StartRecording() { | 121 bool SpeechRecognizer::StartRecording() { |
| 115 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); | 122 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); |
| 116 DCHECK(!audio_controller_.get()); | 123 DCHECK(!audio_controller_.get()); |
| 117 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 124 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 118 | 125 |
| 126 // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to |
| 127 // estimate the environment/background noise before starting to treat the |
| 128 // audio as user input. Once we have implemented a popup UI to notify the user |
| 129 // that recording has started, we should perhaps have a short interval where |
| 130 // we record background audio and then show the popup UI so that the user can |
| 131 // start speaking after that. For now we just do these together so there isn't |
| 132 // any background noise for the endpointer (still works ok). |
| 133 endpointer_.SetEnvironmentEstimationMode(); |
| 134 endpointer_.SetUserInputMode(); |
| 135 |
| 119 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 136 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
| 120 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | 137 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); |
| 121 audio_controller_ = AudioInputController::Create(this, | 138 audio_controller_ = AudioInputController::Create(this, |
| 122 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels, | 139 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels, |
| 123 kAudioSampleRate, kNumBitsPerAudioSample, | 140 kAudioSampleRate, kNumBitsPerAudioSample, |
| 124 samples_per_packet); | 141 samples_per_packet); |
| 125 DCHECK(audio_controller_.get()); | 142 DCHECK(audio_controller_.get()); |
| 126 LOG(INFO) << "SpeechRecognizer starting record."; | 143 LOG(INFO) << "SpeechRecognizer starting record."; |
| 127 audio_controller_->Record(); | 144 audio_controller_->Record(); |
| 128 | 145 |
| (...skipping 20 matching lines...) Expand all Loading... |
| 149 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); | 166 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); |
| 150 | 167 |
| 151 // If audio recording has already stopped and we are in recognition phase, | 168 // If audio recording has already stopped and we are in recognition phase, |
| 152 // silently ignore any more calls to stop recording. | 169 // silently ignore any more calls to stop recording. |
| 153 if (!audio_controller_.get()) | 170 if (!audio_controller_.get()) |
| 154 return; | 171 return; |
| 155 | 172 |
| 156 LOG(INFO) << "SpeechRecognizer stopping record."; | 173 LOG(INFO) << "SpeechRecognizer stopping record."; |
| 157 audio_controller_->Close(); | 174 audio_controller_->Close(); |
| 158 audio_controller_ = NULL; // Releases the ref ptr. | 175 audio_controller_ = NULL; // Releases the ref ptr. |
| 176 |
| 159 delegate_->DidCompleteRecording(caller_id_); | 177 delegate_->DidCompleteRecording(caller_id_); |
| 160 | 178 |
| 161 // If we haven't got any audio yet end the recognition sequence here. | 179 // If we haven't got any audio yet end the recognition sequence here. |
| 162 if (audio_buffers_.empty()) { | 180 if (audio_buffers_.empty()) { |
| 163 // Guard against the delegate freeing us until we finish our job. | 181 // Guard against the delegate freeing us until we finish our job. |
| 164 scoped_refptr<SpeechRecognizer> me(this); | 182 scoped_refptr<SpeechRecognizer> me(this); |
| 165 delegate_->DidCompleteRecognition(caller_id_); | 183 delegate_->DidCompleteRecognition(caller_id_); |
| 166 return; | 184 return; |
| 167 } | 185 } |
| 168 | 186 |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 233 | 251 |
| 234 void SpeechRecognizer::HandleOnData(string* data) { | 252 void SpeechRecognizer::HandleOnData(string* data) { |
| 235 // Check if we are still recording and if not discard this buffer, as | 253 // Check if we are still recording and if not discard this buffer, as |
| 236 // recording might have been stopped after this buffer was posted to the queue | 254 // recording might have been stopped after this buffer was posted to the queue |
| 237 // by |OnData|. | 255 // by |OnData|. |
| 238 if (!audio_controller_.get()) { | 256 if (!audio_controller_.get()) { |
| 239 delete data; | 257 delete data; |
| 240 return; | 258 return; |
| 241 } | 259 } |
| 242 | 260 |
| 243 encoder_->Encode(*data, &audio_buffers_); | 261 const short* samples = reinterpret_cast<const short*>(data->data()); |
| 262 DCHECK((data->length() % sizeof(short)) == 0); |
| 263 int num_samples = data->length() / sizeof(short); |
| 264 |
| 265 encoder_->Encode(samples, num_samples, &audio_buffers_); |
| 266 endpointer_.ProcessAudio(samples, num_samples); |
| 244 delete data; | 267 delete data; |
| 245 | 268 |
| 269 if (endpointer_.speech_input_complete()) { |
| 270 StopRecording(); |
| 271 } |
| 272 |
| 246 // TODO(satish): Once we have streaming POST, start sending the data received | 273 // TODO(satish): Once we have streaming POST, start sending the data received |
| 247 // here as POST chunks. | 274 // here as POST chunks. |
| 248 } | 275 } |
| 249 | 276 |
| 250 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { | 277 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { |
| 251 delegate_->SetRecognitionResult(caller_id_, error, value); | 278 delegate_->SetRecognitionResult(caller_id_, error, value); |
| 252 | 279 |
| 253 // Guard against the delegate freeing us until we finish our job. | 280 // Guard against the delegate freeing us until we finish our job. |
| 254 scoped_refptr<SpeechRecognizer> me(this); | 281 scoped_refptr<SpeechRecognizer> me(this); |
| 255 delegate_->DidCompleteRecognition(caller_id_); | 282 delegate_->DidCompleteRecognition(caller_id_); |
| 256 } | 283 } |
| 257 | 284 |
| 258 } // namespace speech_input | 285 } // namespace speech_input |
| OLD | NEW |