| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
| 6 | 6 |
| 7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
| 8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
| 9 #include "base/time.h" | 9 #include "base/time.h" |
| 10 #include "chrome/browser/browser_thread.h" | 10 #include "chrome/browser/browser_thread.h" |
| 11 #include "chrome/browser/profiles/profile.h" | 11 #include "chrome/browser/profiles/profile.h" |
| 12 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
| 13 #include "third_party/speex/speex.h" | |
| 14 | 13 |
| 15 using media::AudioInputController; | 14 using media::AudioInputController; |
| 16 using std::list; | |
| 17 using std::string; | 15 using std::string; |
| 18 | 16 |
| 19 namespace { | 17 namespace { |
| 20 const char* const kContentTypeSpeex = | |
| 21 "audio/x-speex-with-header-byte; rate=16000"; | |
| 22 const int kSpeexEncodingQuality = 8; | |
| 23 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). | |
| 24 | |
| 25 // Since the frame length gets written out as a byte in the encoded packet, | |
| 26 // make sure it is within the byte range. | |
| 27 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); | |
| 28 | 18 |
| 29 // The following constants are related to the volume level indicator shown in | 19 // The following constants are related to the volume level indicator shown in |
| 30 // the UI for recorded audio. | 20 // the UI for recorded audio. |
| 31 // Multiplier used when new volume is greater than previous level. | 21 // Multiplier used when new volume is greater than previous level. |
| 32 const float kUpSmoothingFactor = 0.9f; | 22 const float kUpSmoothingFactor = 0.9f; |
| 33 // Multiplier used when new volume is lesser than previous level. | 23 // Multiplier used when new volume is lesser than previous level. |
| 34 const float kDownSmoothingFactor = 0.4f; | 24 const float kDownSmoothingFactor = 0.4f; |
| 35 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. | 25 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. |
| 36 const float kAudioMeterDbRange = 25.0f; | 26 const float kAudioMeterDbRange = 25.0f; |
| 37 } // namespace | 27 } // namespace |
| 38 | 28 |
| 39 namespace speech_input { | 29 namespace speech_input { |
| 40 | 30 |
| 41 const int SpeechRecognizer::kAudioSampleRate = 16000; | 31 const int SpeechRecognizer::kAudioSampleRate = 16000; |
| 42 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; | 32 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; |
| 43 const int SpeechRecognizer::kNumAudioChannels = 1; | 33 const int SpeechRecognizer::kNumAudioChannels = 1; |
| 44 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; | 34 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; |
| 45 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; | 35 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; |
| 46 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; | 36 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; |
| 47 | 37 |
| 48 // Provides a simple interface to encode raw audio using the Speex codec. | |
| 49 class SpeexEncoder { | |
| 50 public: | |
| 51 SpeexEncoder(); | |
| 52 ~SpeexEncoder(); | |
| 53 | |
| 54 int samples_per_frame() const { return samples_per_frame_; } | |
| 55 | |
| 56 // Encodes each frame of raw audio in |samples| and adds the | |
| 57 // encoded frames as a set of strings to the |encoded_frames| list. | |
| 58 // Ownership of the newly added strings is transferred to the caller. | |
| 59 void Encode(const short* samples, | |
| 60 int num_samples, | |
| 61 std::list<std::string*>* encoded_frames); | |
| 62 | |
| 63 private: | |
| 64 SpeexBits bits_; | |
| 65 void* encoder_state_; | |
| 66 int samples_per_frame_; | |
| 67 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. | |
| 68 }; | |
| 69 | |
| 70 SpeexEncoder::SpeexEncoder() { | |
| 71 // speex_bits_init() does not initialize all of the |bits_| struct. | |
| 72 memset(&bits_, 0, sizeof(bits_)); | |
| 73 speex_bits_init(&bits_); | |
| 74 encoder_state_ = speex_encoder_init(&speex_wb_mode); | |
| 75 DCHECK(encoder_state_); | |
| 76 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); | |
| 77 DCHECK(samples_per_frame_ > 0); | |
| 78 int quality = kSpeexEncodingQuality; | |
| 79 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); | |
| 80 int vbr = 1; | |
| 81 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); | |
| 82 memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); | |
| 83 } | |
| 84 | |
| 85 SpeexEncoder::~SpeexEncoder() { | |
| 86 speex_bits_destroy(&bits_); | |
| 87 speex_encoder_destroy(encoder_state_); | |
| 88 } | |
| 89 | |
| 90 void SpeexEncoder::Encode(const short* samples, | |
| 91 int num_samples, | |
| 92 std::list<std::string*>* encoded_frames) { | |
| 93 // Drop incomplete frames, typically those which come in when recording stops. | |
| 94 num_samples -= (num_samples % samples_per_frame_); | |
| 95 for (int i = 0; i < num_samples; i += samples_per_frame_) { | |
| 96 speex_bits_reset(&bits_); | |
| 97 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), | |
| 98 &bits_); | |
| 99 | |
| 100 // Encode the frame and place the size of the frame as the first byte. This | |
| 101 // is the packet format for MIME type x-speex-with-header-byte. | |
| 102 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, | |
| 103 kMaxSpeexFrameLength); | |
| 104 encoded_frame_data_[0] = static_cast<char>(frame_length); | |
| 105 encoded_frames->push_back(new string(encoded_frame_data_, | |
| 106 frame_length + 1)); | |
| 107 } | |
| 108 } | |
| 109 | |
| 110 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, | 38 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
| 111 int caller_id, | 39 int caller_id, |
| 112 const std::string& language, | 40 const std::string& language, |
| 113 const std::string& grammar, | 41 const std::string& grammar, |
| 114 const std::string& hardware_info) | 42 const std::string& hardware_info) |
| 115 : delegate_(delegate), | 43 : delegate_(delegate), |
| 116 caller_id_(caller_id), | 44 caller_id_(caller_id), |
| 117 language_(language), | 45 language_(language), |
| 118 grammar_(grammar), | 46 grammar_(grammar), |
| 119 hardware_info_(hardware_info), | 47 hardware_info_(hardware_info), |
| 120 encoder_(new SpeexEncoder()), | 48 codec_(AudioEncoder::CODEC_SPEEX), |
| 49 encoder_(NULL), |
| 121 endpointer_(kAudioSampleRate), | 50 endpointer_(kAudioSampleRate), |
| 122 num_samples_recorded_(0), | 51 num_samples_recorded_(0), |
| 123 audio_level_(0.0f) { | 52 audio_level_(0.0f) { |
| 124 endpointer_.set_speech_input_complete_silence_length( | 53 endpointer_.set_speech_input_complete_silence_length( |
| 125 base::Time::kMicrosecondsPerSecond / 2); | 54 base::Time::kMicrosecondsPerSecond / 2); |
| 126 endpointer_.set_long_speech_input_complete_silence_length( | 55 endpointer_.set_long_speech_input_complete_silence_length( |
| 127 base::Time::kMicrosecondsPerSecond); | 56 base::Time::kMicrosecondsPerSecond); |
| 128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 57 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| 129 endpointer_.StartSession(); | 58 endpointer_.StartSession(); |
| 130 } | 59 } |
| 131 | 60 |
| 132 SpeechRecognizer::~SpeechRecognizer() { | 61 SpeechRecognizer::~SpeechRecognizer() { |
| 133 // Recording should have stopped earlier due to the endpointer or | 62 // Recording should have stopped earlier due to the endpointer or |
| 134 // |StopRecording| being called. | 63 // |StopRecording| being called. |
| 135 DCHECK(!audio_controller_.get()); | 64 DCHECK(!audio_controller_.get()); |
| 136 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 65 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 137 DCHECK(audio_buffers_.empty()); | 66 DCHECK(!encoder_.get()); |
| 138 endpointer_.EndSession(); | 67 endpointer_.EndSession(); |
| 139 } | 68 } |
| 140 | 69 |
| 141 bool SpeechRecognizer::StartRecording() { | 70 bool SpeechRecognizer::StartRecording() { |
| 142 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 71 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 143 DCHECK(!audio_controller_.get()); | 72 DCHECK(!audio_controller_.get()); |
| 144 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 73 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 74 DCHECK(!encoder_.get()); |
| 145 | 75 |
| 146 // The endpointer needs to estimate the environment/background noise before | 76 // The endpointer needs to estimate the environment/background noise before |
| 147 // starting to treat the audio as user input. In |HandleOnData| we wait until | 77 // starting to treat the audio as user input. In |HandleOnData| we wait until |
| 148 // such time has passed before switching to user input mode. | 78 // such time has passed before switching to user input mode. |
| 149 endpointer_.SetEnvironmentEstimationMode(); | 79 endpointer_.SetEnvironmentEstimationMode(); |
| 150 | 80 |
| 81 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, |
| 82 kNumBitsPerAudioSample)); |
| 151 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 83 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
| 152 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | |
| 153 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, | 84 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, |
| 154 kAudioSampleRate, kNumBitsPerAudioSample, | 85 kAudioSampleRate, kNumBitsPerAudioSample, |
| 155 samples_per_packet); | 86 samples_per_packet); |
| 156 audio_controller_ = AudioInputController::Create(this, params); | 87 audio_controller_ = AudioInputController::Create(this, params); |
| 157 DCHECK(audio_controller_.get()); | 88 DCHECK(audio_controller_.get()); |
| 158 VLOG(1) << "SpeechRecognizer starting record."; | 89 VLOG(1) << "SpeechRecognizer starting record."; |
| 159 num_samples_recorded_ = 0; | 90 num_samples_recorded_ = 0; |
| 160 audio_controller_->Record(); | 91 audio_controller_->Record(); |
| 161 | 92 |
| 162 return true; | 93 return true; |
| 163 } | 94 } |
| 164 | 95 |
| 165 void SpeechRecognizer::CancelRecognition() { | 96 void SpeechRecognizer::CancelRecognition() { |
| 166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 97 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 167 DCHECK(audio_controller_.get() || request_.get()); | 98 DCHECK(audio_controller_.get() || request_.get()); |
| 168 | 99 |
| 169 // Stop recording if required. | 100 // Stop recording if required. |
| 170 if (audio_controller_.get()) { | 101 if (audio_controller_.get()) { |
| 171 VLOG(1) << "SpeechRecognizer stopping record."; | 102 VLOG(1) << "SpeechRecognizer stopping record."; |
| 172 audio_controller_->Close(); | 103 audio_controller_->Close(); |
| 173 audio_controller_ = NULL; // Releases the ref ptr. | 104 audio_controller_ = NULL; // Releases the ref ptr. |
| 174 } | 105 } |
| 175 | 106 |
| 176 VLOG(1) << "SpeechRecognizer canceling recognition."; | 107 VLOG(1) << "SpeechRecognizer canceling recognition."; |
| 177 ReleaseAudioBuffers(); | 108 encoder_.reset(); |
| 178 request_.reset(); | 109 request_.reset(); |
| 179 } | 110 } |
| 180 | 111 |
| 181 void SpeechRecognizer::StopRecording() { | 112 void SpeechRecognizer::StopRecording() { |
| 182 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 113 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 183 | 114 |
| 184 // If audio recording has already stopped and we are in recognition phase, | 115 // If audio recording has already stopped and we are in recognition phase, |
| 185 // silently ignore any more calls to stop recording. | 116 // silently ignore any more calls to stop recording. |
| 186 if (!audio_controller_.get()) | 117 if (!audio_controller_.get()) |
| 187 return; | 118 return; |
| 188 | 119 |
| 189 VLOG(1) << "SpeechRecognizer stopping record."; | 120 VLOG(1) << "SpeechRecognizer stopping record."; |
| 190 audio_controller_->Close(); | 121 audio_controller_->Close(); |
| 191 audio_controller_ = NULL; // Releases the ref ptr. | 122 audio_controller_ = NULL; // Releases the ref ptr. |
| 123 encoder_->Flush(); |
| 192 | 124 |
| 193 delegate_->DidCompleteRecording(caller_id_); | 125 delegate_->DidCompleteRecording(caller_id_); |
| 194 | 126 |
| 195 // If we haven't got any audio yet end the recognition sequence here. | 127 // Since the http request takes a single string as POST data, allocate |
| 196 if (audio_buffers_.empty()) { | 128 // one and copy over bytes from the audio buffers to the string. |
| 129 // And If we haven't got any audio yet end the recognition sequence here. |
| 130 string data; |
| 131 if (!encoder_->GetEncodedData(&data)) { |
| 197 // Guard against the delegate freeing us until we finish our job. | 132 // Guard against the delegate freeing us until we finish our job. |
| 198 scoped_refptr<SpeechRecognizer> me(this); | 133 scoped_refptr<SpeechRecognizer> me(this); |
| 199 delegate_->DidCompleteRecognition(caller_id_); | 134 delegate_->DidCompleteRecognition(caller_id_); |
| 200 return; | 135 } else { |
| 136 DCHECK(!request_.get()); |
| 137 request_.reset(new SpeechRecognitionRequest( |
| 138 Profile::GetDefaultRequestContext(), this)); |
| 139 request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(), |
| 140 data); |
| 201 } | 141 } |
| 202 | 142 encoder_.reset(); |
| 203 // We now have recorded audio in our buffers, so start a recognition request. | |
| 204 // Since the http request takes a single string as POST data, allocate | |
| 205 // one and copy over bytes from the audio buffers to the string. | |
| 206 int audio_buffer_length = 0; | |
| 207 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
| 208 it != audio_buffers_.end(); it++) { | |
| 209 audio_buffer_length += (*it)->length(); | |
| 210 } | |
| 211 string data; | |
| 212 data.reserve(audio_buffer_length); | |
| 213 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
| 214 it != audio_buffers_.end(); it++) { | |
| 215 data.append(*(*it)); | |
| 216 } | |
| 217 | |
| 218 DCHECK(!request_.get()); | |
| 219 request_.reset(new SpeechRecognitionRequest( | |
| 220 Profile::GetDefaultRequestContext(), this)); | |
| 221 request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data); | |
| 222 ReleaseAudioBuffers(); // No need to keep the audio anymore. | |
| 223 } | 143 } |
| 224 | 144 |
| 225 void SpeechRecognizer::ReleaseAudioBuffers() { | 145 void SpeechRecognizer::ReleaseAudioBuffers() { |
| 226 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
| 227 it != audio_buffers_.end(); it++) | |
| 228 delete *it; | |
| 229 audio_buffers_.clear(); | |
| 230 } | 146 } |
| 231 | 147 |
| 232 // Invoked in the audio thread. | 148 // Invoked in the audio thread. |
| 233 void SpeechRecognizer::OnError(AudioInputController* controller, | 149 void SpeechRecognizer::OnError(AudioInputController* controller, |
| 234 int error_code) { | 150 int error_code) { |
| 235 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 151 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 236 NewRunnableMethod(this, | 152 NewRunnableMethod(this, |
| 237 &SpeechRecognizer::HandleOnError, | 153 &SpeechRecognizer::HandleOnError, |
| 238 error_code)); | 154 error_code)); |
| 239 } | 155 } |
| (...skipping 28 matching lines...) Expand all Loading... |
| 268 // by |OnData|. | 184 // by |OnData|. |
| 269 if (!audio_controller_.get()) { | 185 if (!audio_controller_.get()) { |
| 270 delete data; | 186 delete data; |
| 271 return; | 187 return; |
| 272 } | 188 } |
| 273 | 189 |
| 274 const short* samples = reinterpret_cast<const short*>(data->data()); | 190 const short* samples = reinterpret_cast<const short*>(data->data()); |
| 275 DCHECK((data->length() % sizeof(short)) == 0); | 191 DCHECK((data->length() % sizeof(short)) == 0); |
| 276 int num_samples = data->length() / sizeof(short); | 192 int num_samples = data->length() / sizeof(short); |
| 277 | 193 |
| 278 encoder_->Encode(samples, num_samples, &audio_buffers_); | 194 encoder_->Encode(samples, num_samples); |
| 279 float rms; | 195 float rms; |
| 280 endpointer_.ProcessAudio(samples, num_samples, &rms); | 196 endpointer_.ProcessAudio(samples, num_samples, &rms); |
| 281 delete data; | 197 delete data; |
| 282 num_samples_recorded_ += num_samples; | 198 num_samples_recorded_ += num_samples; |
| 283 | 199 |
| 284 if (endpointer_.IsEstimatingEnvironment()) { | 200 if (endpointer_.IsEstimatingEnvironment()) { |
| 285 // Check if we have gathered enough audio for the endpointer to do | 201 // Check if we have gathered enough audio for the endpointer to do |
| 286 // environment estimation and should move on to detect speech/end of speech. | 202 // environment estimation and should move on to detect speech/end of speech. |
| 287 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | 203 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
| 288 kAudioSampleRate) / 1000) { | 204 kAudioSampleRate) / 1000) { |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 334 | 250 |
| 335 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { | 251 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { |
| 336 CancelRecognition(); | 252 CancelRecognition(); |
| 337 | 253 |
| 338 // Guard against the delegate freeing us until we finish our job. | 254 // Guard against the delegate freeing us until we finish our job. |
| 339 scoped_refptr<SpeechRecognizer> me(this); | 255 scoped_refptr<SpeechRecognizer> me(this); |
| 340 delegate_->OnRecognizerError(caller_id_, error); | 256 delegate_->OnRecognizerError(caller_id_, error); |
| 341 } | 257 } |
| 342 | 258 |
| 343 } // namespace speech_input | 259 } // namespace speech_input |
| OLD | NEW |