Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
| 6 | 6 |
| 7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
| 8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
| 9 #include "base/time.h" | 9 #include "base/time.h" |
| 10 #include "chrome/browser/browser_thread.h" | 10 #include "chrome/browser/browser_thread.h" |
| 11 #include "chrome/browser/profiles/profile.h" | 11 #include "chrome/browser/profiles/profile.h" |
| 12 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
| 13 #include "third_party/speex/speex.h" | 13 #include "third_party/flac/flac.h" |
| bulach (2011/01/12 16:27:07): no longer needed? | |
| 14 | 14 |
| 15 using media::AudioInputController; | 15 using media::AudioInputController; |
| 16 using std::list; | 16 using std::list; |
| 17 using std::string; | 17 using std::string; |
| 18 | 18 |
| 19 namespace { | 19 namespace { |
| 20 const char* const kContentTypeFLAC = "audio/x-flac; rate=16000"; | |
| 20 const char* const kContentTypeSpeex = | 21 const char* const kContentTypeSpeex = |
| 21 "audio/x-speex-with-header-byte; rate=16000"; | 22 "audio/x-speex-with-header-byte; rate=16000"; |
| bulach (2011/01/12 16:27:07): it'd be nice to move these to the new encoder interface […] | |
| 22 const int kSpeexEncodingQuality = 8; | |
| 23 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). | |
| 24 | |
| 25 // Since the frame length gets written out as a byte in the encoded packet, | |
| 26 // make sure it is within the byte range. | |
| 27 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); | |
| 28 | 23 |
| 29 // The following constants are related to the volume level indicator shown in | 24 // The following constants are related to the volume level indicator shown in |
| 30 // the UI for recorded audio. | 25 // the UI for recorded audio. |
| 31 // Multiplier used when new volume is greater than previous level. | 26 // Multiplier used when new volume is greater than previous level. |
| 32 const float kUpSmoothingFactor = 0.9f; | 27 const float kUpSmoothingFactor = 0.9f; |
| 33 // Multiplier used when new volume is lesser than previous level. | 28 // Multiplier used when new volume is lesser than previous level. |
| 34 const float kDownSmoothingFactor = 0.4f; | 29 const float kDownSmoothingFactor = 0.4f; |
| 35 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. | 30 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. |
| 36 const float kAudioMeterDbRange = 25.0f; | 31 const float kAudioMeterDbRange = 25.0f; |
| 37 } // namespace | 32 } // namespace |
| 38 | 33 |
| 39 namespace speech_input { | 34 namespace speech_input { |
| 40 | 35 |
| 41 const int SpeechRecognizer::kAudioSampleRate = 16000; | 36 const int SpeechRecognizer::kAudioSampleRate = 16000; |
| 42 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; | 37 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; |
| 43 const int SpeechRecognizer::kNumAudioChannels = 1; | 38 const int SpeechRecognizer::kNumAudioChannels = 1; |
| 44 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; | 39 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; |
| 45 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; | 40 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; |
| 46 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; | 41 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; |
| 47 | 42 |
| 48 // Provides a simple interface to encode raw audio using the Speex codec. | |
| 49 class SpeexEncoder { | |
| 50 public: | |
| 51 SpeexEncoder(); | |
| 52 ~SpeexEncoder(); | |
| 53 | |
| 54 int samples_per_frame() const { return samples_per_frame_; } | |
| 55 | |
| 56 // Encodes each frame of raw audio in |samples| and adds the | |
| 57 // encoded frames as a set of strings to the |encoded_frames| list. | |
| 58 // Ownership of the newly added strings is transferred to the caller. | |
| 59 void Encode(const short* samples, | |
| 60 int num_samples, | |
| 61 std::list<std::string*>* encoded_frames); | |
| 62 | |
| 63 private: | |
| 64 SpeexBits bits_; | |
| 65 void* encoder_state_; | |
| 66 int samples_per_frame_; | |
| 67 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. | |
| 68 }; | |
| 69 | |
| 70 SpeexEncoder::SpeexEncoder() { | |
| 71 // speex_bits_init() does not initialize all of the |bits_| struct. | |
| 72 memset(&bits_, 0, sizeof(bits_)); | |
| 73 speex_bits_init(&bits_); | |
| 74 encoder_state_ = speex_encoder_init(&speex_wb_mode); | |
| 75 DCHECK(encoder_state_); | |
| 76 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); | |
| 77 DCHECK(samples_per_frame_ > 0); | |
| 78 int quality = kSpeexEncodingQuality; | |
| 79 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); | |
| 80 int vbr = 1; | |
| 81 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); | |
| 82 memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); | |
| 83 } | |
| 84 | |
| 85 SpeexEncoder::~SpeexEncoder() { | |
| 86 speex_bits_destroy(&bits_); | |
| 87 speex_encoder_destroy(encoder_state_); | |
| 88 } | |
| 89 | |
| 90 void SpeexEncoder::Encode(const short* samples, | |
| 91 int num_samples, | |
| 92 std::list<std::string*>* encoded_frames) { | |
| 93 // Drop incomplete frames, typically those which come in when recording stops. | |
| 94 num_samples -= (num_samples % samples_per_frame_); | |
| 95 for (int i = 0; i < num_samples; i += samples_per_frame_) { | |
| 96 speex_bits_reset(&bits_); | |
| 97 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), | |
| 98 &bits_); | |
| 99 | |
| 100 // Encode the frame and place the size of the frame as the first byte. This | |
| 101 // is the packet format for MIME type x-speex-with-header-byte. | |
| 102 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, | |
| 103 kMaxSpeexFrameLength); | |
| 104 encoded_frame_data_[0] = static_cast<char>(frame_length); | |
| 105 encoded_frames->push_back(new string(encoded_frame_data_, | |
| 106 frame_length + 1)); | |
| 107 } | |
| 108 } | |
| 109 | |
| 110 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, | 43 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
| 111 int caller_id, | 44 int caller_id, |
| 112 const std::string& language, | 45 const std::string& language, |
| 113 const std::string& grammar, | 46 const std::string& grammar, |
| 114 const std::string& hardware_info) | 47 const std::string& hardware_info) |
| 115 : delegate_(delegate), | 48 : delegate_(delegate), |
| 116 caller_id_(caller_id), | 49 caller_id_(caller_id), |
| 117 language_(language), | 50 language_(language), |
| 118 grammar_(grammar), | 51 grammar_(grammar), |
| 119 hardware_info_(hardware_info), | 52 hardware_info_(hardware_info), |
| 120 encoder_(new SpeexEncoder()), | 53 codec_(AudioEncoder::FLAC), |
| 54 encoder_(NULL), | |
| 121 endpointer_(kAudioSampleRate), | 55 endpointer_(kAudioSampleRate), |
| 122 num_samples_recorded_(0), | 56 num_samples_recorded_(0), |
| 123 audio_level_(0.0f) { | 57 audio_level_(0.0f) { |
| 124 endpointer_.set_speech_input_complete_silence_length( | 58 endpointer_.set_speech_input_complete_silence_length( |
| 125 base::Time::kMicrosecondsPerSecond / 2); | 59 base::Time::kMicrosecondsPerSecond / 2); |
| 126 endpointer_.set_long_speech_input_complete_silence_length( | 60 endpointer_.set_long_speech_input_complete_silence_length( |
| 127 base::Time::kMicrosecondsPerSecond); | 61 base::Time::kMicrosecondsPerSecond); |
| 128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 62 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| 129 endpointer_.StartSession(); | 63 endpointer_.StartSession(); |
| 130 } | 64 } |
| 131 | 65 |
| 132 SpeechRecognizer::~SpeechRecognizer() { | 66 SpeechRecognizer::~SpeechRecognizer() { |
| 133 // Recording should have stopped earlier due to the endpointer or | 67 // Recording should have stopped earlier due to the endpointer or |
| 134 // |StopRecording| being called. | 68 // |StopRecording| being called. |
| 135 DCHECK(!audio_controller_.get()); | 69 DCHECK(!audio_controller_.get()); |
| 136 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 70 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 137 DCHECK(audio_buffers_.empty()); | 71 DCHECK(!encoder_.get()); |
| 138 endpointer_.EndSession(); | 72 endpointer_.EndSession(); |
| 139 } | 73 } |
| 140 | 74 |
| 141 bool SpeechRecognizer::StartRecording() { | 75 bool SpeechRecognizer::StartRecording() { |
| 142 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 76 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 143 DCHECK(!audio_controller_.get()); | 77 DCHECK(!audio_controller_.get()); |
| 144 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 78 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 79 DCHECK(!encoder_.get()); | |
| 145 | 80 |
| 146 // The endpointer needs to estimate the environment/background noise before | 81 // The endpointer needs to estimate the environment/background noise before |
| 147 // starting to treat the audio as user input. In |HandleOnData| we wait until | 82 // starting to treat the audio as user input. In |HandleOnData| we wait until |
| 148 // such time has passed before switching to user input mode. | 83 // such time has passed before switching to user input mode. |
| 149 endpointer_.SetEnvironmentEstimationMode(); | 84 endpointer_.SetEnvironmentEstimationMode(); |
| 150 | 85 |
| 86 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, | |
| 87 kNumBitsPerAudioSample)); | |
| 151 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 88 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
| 152 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | |
| 153 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, | 89 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, |
| 154 kAudioSampleRate, kNumBitsPerAudioSample, | 90 kAudioSampleRate, kNumBitsPerAudioSample, |
| 155 samples_per_packet); | 91 samples_per_packet); |
| 156 audio_controller_ = AudioInputController::Create(this, params); | 92 audio_controller_ = AudioInputController::Create(this, params); |
| 157 DCHECK(audio_controller_.get()); | 93 DCHECK(audio_controller_.get()); |
| 158 VLOG(1) << "SpeechRecognizer starting record."; | 94 VLOG(1) << "SpeechRecognizer starting record."; |
| 159 num_samples_recorded_ = 0; | 95 num_samples_recorded_ = 0; |
| 160 audio_controller_->Record(); | 96 audio_controller_->Record(); |
| 161 | 97 |
| 162 return true; | 98 return true; |
| 163 } | 99 } |
| 164 | 100 |
| 165 void SpeechRecognizer::CancelRecognition() { | 101 void SpeechRecognizer::CancelRecognition() { |
| 166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 102 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 167 DCHECK(audio_controller_.get() || request_.get()); | 103 DCHECK(audio_controller_.get() || request_.get()); |
| 168 | 104 |
| 169 // Stop recording if required. | 105 // Stop recording if required. |
| 170 if (audio_controller_.get()) { | 106 if (audio_controller_.get()) { |
| 171 VLOG(1) << "SpeechRecognizer stopping record."; | 107 VLOG(1) << "SpeechRecognizer stopping record."; |
| 172 audio_controller_->Close(); | 108 audio_controller_->Close(); |
| 173 audio_controller_ = NULL; // Releases the ref ptr. | 109 audio_controller_ = NULL; // Releases the ref ptr. |
| 174 } | 110 } |
| 175 | 111 |
| 176 VLOG(1) << "SpeechRecognizer canceling recognition."; | 112 VLOG(1) << "SpeechRecognizer canceling recognition."; |
| 177 ReleaseAudioBuffers(); | 113 encoder_.reset(); |
| 178 request_.reset(); | 114 request_.reset(); |
| 179 } | 115 } |
| 180 | 116 |
| 181 void SpeechRecognizer::StopRecording() { | 117 void SpeechRecognizer::StopRecording() { |
| 182 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 118 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 183 | 119 |
| 184 // If audio recording has already stopped and we are in recognition phase, | 120 // If audio recording has already stopped and we are in recognition phase, |
| 185 // silently ignore any more calls to stop recording. | 121 // silently ignore any more calls to stop recording. |
| 186 if (!audio_controller_.get()) | 122 if (!audio_controller_.get()) |
| 187 return; | 123 return; |
| 188 | 124 |
| 189 VLOG(1) << "SpeechRecognizer stopping record."; | 125 VLOG(1) << "SpeechRecognizer stopping record."; |
| 190 audio_controller_->Close(); | 126 audio_controller_->Close(); |
| 191 audio_controller_ = NULL; // Releases the ref ptr. | 127 audio_controller_ = NULL; // Releases the ref ptr. |
| 128 encoder_->Flush(); | |
| 192 | 129 |
| 193 delegate_->DidCompleteRecording(caller_id_); | 130 delegate_->DidCompleteRecording(caller_id_); |
| 194 | 131 |
| 195 // If we haven't got any audio yet end the recognition sequence here. | 132 // Since the http request takes a single string as POST data, allocate |
| 196 if (audio_buffers_.empty()) { | 133 // one and copy over bytes from the audio buffers to the string. |
| 134 // And If we haven't got any audio yet end the recognition sequence here. | |
| 135 string data; | |
| 136 if (!encoder_->GetEncodedData(&data)) { | |
| 197 // Guard against the delegate freeing us until we finish our job. | 137 // Guard against the delegate freeing us until we finish our job. |
| 198 scoped_refptr<SpeechRecognizer> me(this); | 138 scoped_refptr<SpeechRecognizer> me(this); |
| 199 delegate_->DidCompleteRecognition(caller_id_); | 139 delegate_->DidCompleteRecognition(caller_id_); |
| 200 return; | 140 return; |
| 201 } | 141 } |
| 202 | 142 |
| 203 // We now have recorded audio in our buffers, so start a recognition request. | |
| 204 // Since the http request takes a single string as POST data, allocate | |
| 205 // one and copy over bytes from the audio buffers to the string. | |
| 206 int audio_buffer_length = 0; | |
| 207 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
| 208 it != audio_buffers_.end(); it++) { | |
| 209 audio_buffer_length += (*it)->length(); | |
| 210 } | |
| 211 string data; | |
| 212 data.reserve(audio_buffer_length); | |
| 213 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
| 214 it != audio_buffers_.end(); it++) { | |
| 215 data.append(*(*it)); | |
| 216 } | |
| 217 | |
| 218 DCHECK(!request_.get()); | 143 DCHECK(!request_.get()); |
| 219 request_.reset(new SpeechRecognitionRequest( | 144 request_.reset(new SpeechRecognitionRequest( |
| 220 Profile::GetDefaultRequestContext(), this)); | 145 Profile::GetDefaultRequestContext(), this)); |
| 221 request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data); | 146 request_->Send(language_, grammar_, hardware_info_, |
| 222 ReleaseAudioBuffers(); // No need to keep the audio anymore. | 147 (codec_ == AudioEncoder::FLAC) ? kContentTypeFLAC : kContentTypeSpeex, |
| 148 data); | |
| 149 encoder_.reset(); | |
| 223 } | 150 } |
| 224 | 151 |
| 225 void SpeechRecognizer::ReleaseAudioBuffers() { | 152 void SpeechRecognizer::ReleaseAudioBuffers() { |
| 226 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
| 227 it != audio_buffers_.end(); it++) | |
| 228 delete *it; | |
| 229 audio_buffers_.clear(); | |
| 230 } | 153 } |
| 231 | 154 |
| 232 // Invoked in the audio thread. | 155 // Invoked in the audio thread. |
| 233 void SpeechRecognizer::OnError(AudioInputController* controller, | 156 void SpeechRecognizer::OnError(AudioInputController* controller, |
| 234 int error_code) { | 157 int error_code) { |
| 235 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 158 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 236 NewRunnableMethod(this, | 159 NewRunnableMethod(this, |
| 237 &SpeechRecognizer::HandleOnError, | 160 &SpeechRecognizer::HandleOnError, |
| 238 error_code)); | 161 error_code)); |
| 239 } | 162 } |
| (...skipping 28 matching lines...) | |
| 268 // by |OnData|. | 191 // by |OnData|. |
| 269 if (!audio_controller_.get()) { | 192 if (!audio_controller_.get()) { |
| 270 delete data; | 193 delete data; |
| 271 return; | 194 return; |
| 272 } | 195 } |
| 273 | 196 |
| 274 const short* samples = reinterpret_cast<const short*>(data->data()); | 197 const short* samples = reinterpret_cast<const short*>(data->data()); |
| 275 DCHECK((data->length() % sizeof(short)) == 0); | 198 DCHECK((data->length() % sizeof(short)) == 0); |
| 276 int num_samples = data->length() / sizeof(short); | 199 int num_samples = data->length() / sizeof(short); |
| 277 | 200 |
| 278 encoder_->Encode(samples, num_samples, &audio_buffers_); | 201 encoder_->Encode(samples, num_samples); |
| 279 float rms; | 202 float rms; |
| 280 endpointer_.ProcessAudio(samples, num_samples, &rms); | 203 endpointer_.ProcessAudio(samples, num_samples, &rms); |
| 281 delete data; | 204 delete data; |
| 282 num_samples_recorded_ += num_samples; | 205 num_samples_recorded_ += num_samples; |
| 283 | 206 |
| 284 if (endpointer_.IsEstimatingEnvironment()) { | 207 if (endpointer_.IsEstimatingEnvironment()) { |
| 285 // Check if we have gathered enough audio for the endpointer to do | 208 // Check if we have gathered enough audio for the endpointer to do |
| 286 // environment estimation and should move on to detect speech/end of speech. | 209 // environment estimation and should move on to detect speech/end of speech. |
| 287 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | 210 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
| 288 kAudioSampleRate) / 1000) { | 211 kAudioSampleRate) / 1000) { |
| (...skipping 45 matching lines...) | |
| 334 | 257 |
| 335 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { | 258 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { |
| 336 CancelRecognition(); | 259 CancelRecognition(); |
| 337 | 260 |
| 338 // Guard against the delegate freeing us until we finish our job. | 261 // Guard against the delegate freeing us until we finish our job. |
| 339 scoped_refptr<SpeechRecognizer> me(this); | 262 scoped_refptr<SpeechRecognizer> me(this); |
| 340 delegate_->OnRecognizerError(caller_id_, error); | 263 delegate_->OnRecognizerError(caller_id_, error); |
| 341 } | 264 } |
| 342 | 265 |
| 343 } // namespace speech_input | 266 } // namespace speech_input |
| OLD | NEW |