OLD | NEW |
---|---|
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
6 | 6 |
7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
9 #include "base/time.h" | 9 #include "base/time.h" |
10 #include "chrome/browser/browser_thread.h" | 10 #include "chrome/browser/browser_thread.h" |
11 #include "chrome/browser/profiles/profile.h" | 11 #include "chrome/browser/profiles/profile.h" |
12 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
13 #include "third_party/speex/speex.h" | 13 #include "third_party/flac/flac.h" |
bulach
2011/01/12 16:27:07
no longer needed?
| |
14 | 14 |
15 using media::AudioInputController; | 15 using media::AudioInputController; |
16 using std::list; | 16 using std::list; |
17 using std::string; | 17 using std::string; |
18 | 18 |
19 namespace { | 19 namespace { |
20 const char* const kContentTypeFLAC = "audio/x-flac; rate=16000"; | |
20 const char* const kContentTypeSpeex = | 21 const char* const kContentTypeSpeex = |
21 "audio/x-speex-with-header-byte; rate=16000"; | 22 "audio/x-speex-with-header-byte; rate=16000"; |
bulach
2011/01/12 16:27:07
it'd be nice to move these to the new encoder inte[rface?]… (comment truncated in the review export)
| |
22 const int kSpeexEncodingQuality = 8; | |
23 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). | |
24 | |
25 // Since the frame length gets written out as a byte in the encoded packet, | |
26 // make sure it is within the byte range. | |
27 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); | |
28 | 23 |
29 // The following constants are related to the volume level indicator shown in | 24 // The following constants are related to the volume level indicator shown in |
30 // the UI for recorded audio. | 25 // the UI for recorded audio. |
31 // Multiplier used when new volume is greater than previous level. | 26 // Multiplier used when new volume is greater than previous level. |
32 const float kUpSmoothingFactor = 0.9f; | 27 const float kUpSmoothingFactor = 0.9f; |
33 // Multiplier used when new volume is lesser than previous level. | 28 // Multiplier used when new volume is lesser than previous level. |
34 const float kDownSmoothingFactor = 0.4f; | 29 const float kDownSmoothingFactor = 0.4f; |
35 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. | 30 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. |
36 const float kAudioMeterDbRange = 25.0f; | 31 const float kAudioMeterDbRange = 25.0f; |
37 } // namespace | 32 } // namespace |
38 | 33 |
39 namespace speech_input { | 34 namespace speech_input { |
40 | 35 |
41 const int SpeechRecognizer::kAudioSampleRate = 16000; | 36 const int SpeechRecognizer::kAudioSampleRate = 16000; |
42 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; | 37 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; |
43 const int SpeechRecognizer::kNumAudioChannels = 1; | 38 const int SpeechRecognizer::kNumAudioChannels = 1; |
44 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; | 39 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; |
45 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; | 40 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; |
46 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; | 41 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; |
47 | 42 |
48 // Provides a simple interface to encode raw audio using the Speex codec. | |
49 class SpeexEncoder { | |
50 public: | |
51 SpeexEncoder(); | |
52 ~SpeexEncoder(); | |
53 | |
54 int samples_per_frame() const { return samples_per_frame_; } | |
55 | |
56 // Encodes each frame of raw audio in |samples| and adds the | |
57 // encoded frames as a set of strings to the |encoded_frames| list. | |
58 // Ownership of the newly added strings is transferred to the caller. | |
59 void Encode(const short* samples, | |
60 int num_samples, | |
61 std::list<std::string*>* encoded_frames); | |
62 | |
63 private: | |
64 SpeexBits bits_; | |
65 void* encoder_state_; | |
66 int samples_per_frame_; | |
67 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. | |
68 }; | |
69 | |
70 SpeexEncoder::SpeexEncoder() { | |
71 // speex_bits_init() does not initialize all of the |bits_| struct. | |
72 memset(&bits_, 0, sizeof(bits_)); | |
73 speex_bits_init(&bits_); | |
74 encoder_state_ = speex_encoder_init(&speex_wb_mode); | |
75 DCHECK(encoder_state_); | |
76 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); | |
77 DCHECK(samples_per_frame_ > 0); | |
78 int quality = kSpeexEncodingQuality; | |
79 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); | |
80 int vbr = 1; | |
81 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); | |
82 memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); | |
83 } | |
84 | |
85 SpeexEncoder::~SpeexEncoder() { | |
86 speex_bits_destroy(&bits_); | |
87 speex_encoder_destroy(encoder_state_); | |
88 } | |
89 | |
90 void SpeexEncoder::Encode(const short* samples, | |
91 int num_samples, | |
92 std::list<std::string*>* encoded_frames) { | |
93 // Drop incomplete frames, typically those which come in when recording stops. | |
94 num_samples -= (num_samples % samples_per_frame_); | |
95 for (int i = 0; i < num_samples; i += samples_per_frame_) { | |
96 speex_bits_reset(&bits_); | |
97 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), | |
98 &bits_); | |
99 | |
100 // Encode the frame and place the size of the frame as the first byte. This | |
101 // is the packet format for MIME type x-speex-with-header-byte. | |
102 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, | |
103 kMaxSpeexFrameLength); | |
104 encoded_frame_data_[0] = static_cast<char>(frame_length); | |
105 encoded_frames->push_back(new string(encoded_frame_data_, | |
106 frame_length + 1)); | |
107 } | |
108 } | |
109 | |
110 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, | 43 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
111 int caller_id, | 44 int caller_id, |
112 const std::string& language, | 45 const std::string& language, |
113 const std::string& grammar, | 46 const std::string& grammar, |
114 const std::string& hardware_info) | 47 const std::string& hardware_info) |
115 : delegate_(delegate), | 48 : delegate_(delegate), |
116 caller_id_(caller_id), | 49 caller_id_(caller_id), |
117 language_(language), | 50 language_(language), |
118 grammar_(grammar), | 51 grammar_(grammar), |
119 hardware_info_(hardware_info), | 52 hardware_info_(hardware_info), |
120 encoder_(new SpeexEncoder()), | 53 codec_(AudioEncoder::FLAC), |
54 encoder_(NULL), | |
121 endpointer_(kAudioSampleRate), | 55 endpointer_(kAudioSampleRate), |
122 num_samples_recorded_(0), | 56 num_samples_recorded_(0), |
123 audio_level_(0.0f) { | 57 audio_level_(0.0f) { |
124 endpointer_.set_speech_input_complete_silence_length( | 58 endpointer_.set_speech_input_complete_silence_length( |
125 base::Time::kMicrosecondsPerSecond / 2); | 59 base::Time::kMicrosecondsPerSecond / 2); |
126 endpointer_.set_long_speech_input_complete_silence_length( | 60 endpointer_.set_long_speech_input_complete_silence_length( |
127 base::Time::kMicrosecondsPerSecond); | 61 base::Time::kMicrosecondsPerSecond); |
128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 62 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
129 endpointer_.StartSession(); | 63 endpointer_.StartSession(); |
130 } | 64 } |
131 | 65 |
132 SpeechRecognizer::~SpeechRecognizer() { | 66 SpeechRecognizer::~SpeechRecognizer() { |
133 // Recording should have stopped earlier due to the endpointer or | 67 // Recording should have stopped earlier due to the endpointer or |
134 // |StopRecording| being called. | 68 // |StopRecording| being called. |
135 DCHECK(!audio_controller_.get()); | 69 DCHECK(!audio_controller_.get()); |
136 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 70 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
137 DCHECK(audio_buffers_.empty()); | 71 DCHECK(!encoder_.get()); |
138 endpointer_.EndSession(); | 72 endpointer_.EndSession(); |
139 } | 73 } |
140 | 74 |
141 bool SpeechRecognizer::StartRecording() { | 75 bool SpeechRecognizer::StartRecording() { |
142 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 76 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
143 DCHECK(!audio_controller_.get()); | 77 DCHECK(!audio_controller_.get()); |
144 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 78 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
79 DCHECK(!encoder_.get()); | |
145 | 80 |
146 // The endpointer needs to estimate the environment/background noise before | 81 // The endpointer needs to estimate the environment/background noise before |
147 // starting to treat the audio as user input. In |HandleOnData| we wait until | 82 // starting to treat the audio as user input. In |HandleOnData| we wait until |
148 // such time has passed before switching to user input mode. | 83 // such time has passed before switching to user input mode. |
149 endpointer_.SetEnvironmentEstimationMode(); | 84 endpointer_.SetEnvironmentEstimationMode(); |
150 | 85 |
86 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, | |
87 kNumBitsPerAudioSample)); | |
151 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 88 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
152 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | |
153 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, | 89 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, |
154 kAudioSampleRate, kNumBitsPerAudioSample, | 90 kAudioSampleRate, kNumBitsPerAudioSample, |
155 samples_per_packet); | 91 samples_per_packet); |
156 audio_controller_ = AudioInputController::Create(this, params); | 92 audio_controller_ = AudioInputController::Create(this, params); |
157 DCHECK(audio_controller_.get()); | 93 DCHECK(audio_controller_.get()); |
158 VLOG(1) << "SpeechRecognizer starting record."; | 94 VLOG(1) << "SpeechRecognizer starting record."; |
159 num_samples_recorded_ = 0; | 95 num_samples_recorded_ = 0; |
160 audio_controller_->Record(); | 96 audio_controller_->Record(); |
161 | 97 |
162 return true; | 98 return true; |
163 } | 99 } |
164 | 100 |
165 void SpeechRecognizer::CancelRecognition() { | 101 void SpeechRecognizer::CancelRecognition() { |
166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 102 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
167 DCHECK(audio_controller_.get() || request_.get()); | 103 DCHECK(audio_controller_.get() || request_.get()); |
168 | 104 |
169 // Stop recording if required. | 105 // Stop recording if required. |
170 if (audio_controller_.get()) { | 106 if (audio_controller_.get()) { |
171 VLOG(1) << "SpeechRecognizer stopping record."; | 107 VLOG(1) << "SpeechRecognizer stopping record."; |
172 audio_controller_->Close(); | 108 audio_controller_->Close(); |
173 audio_controller_ = NULL; // Releases the ref ptr. | 109 audio_controller_ = NULL; // Releases the ref ptr. |
174 } | 110 } |
175 | 111 |
176 VLOG(1) << "SpeechRecognizer canceling recognition."; | 112 VLOG(1) << "SpeechRecognizer canceling recognition."; |
177 ReleaseAudioBuffers(); | 113 encoder_.reset(); |
178 request_.reset(); | 114 request_.reset(); |
179 } | 115 } |
180 | 116 |
181 void SpeechRecognizer::StopRecording() { | 117 void SpeechRecognizer::StopRecording() { |
182 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 118 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
183 | 119 |
184 // If audio recording has already stopped and we are in recognition phase, | 120 // If audio recording has already stopped and we are in recognition phase, |
185 // silently ignore any more calls to stop recording. | 121 // silently ignore any more calls to stop recording. |
186 if (!audio_controller_.get()) | 122 if (!audio_controller_.get()) |
187 return; | 123 return; |
188 | 124 |
189 VLOG(1) << "SpeechRecognizer stopping record."; | 125 VLOG(1) << "SpeechRecognizer stopping record."; |
190 audio_controller_->Close(); | 126 audio_controller_->Close(); |
191 audio_controller_ = NULL; // Releases the ref ptr. | 127 audio_controller_ = NULL; // Releases the ref ptr. |
128 encoder_->Flush(); | |
192 | 129 |
193 delegate_->DidCompleteRecording(caller_id_); | 130 delegate_->DidCompleteRecording(caller_id_); |
194 | 131 |
195 // If we haven't got any audio yet end the recognition sequence here. | 132 // Since the http request takes a single string as POST data, allocate |
196 if (audio_buffers_.empty()) { | 133 // one and copy over bytes from the audio buffers to the string. |
134 // If we haven't got any audio yet, end the recognition sequence here. | |
135 string data; | |
136 if (!encoder_->GetEncodedData(&data)) { | |
197 // Guard against the delegate freeing us until we finish our job. | 137 // Guard against the delegate freeing us until we finish our job. |
198 scoped_refptr<SpeechRecognizer> me(this); | 138 scoped_refptr<SpeechRecognizer> me(this); |
199 delegate_->DidCompleteRecognition(caller_id_); | 139 delegate_->DidCompleteRecognition(caller_id_); |
200 return; | 140 return; |
201 } | 141 } |
202 | 142 |
203 // We now have recorded audio in our buffers, so start a recognition request. | |
204 // Since the http request takes a single string as POST data, allocate | |
205 // one and copy over bytes from the audio buffers to the string. | |
206 int audio_buffer_length = 0; | |
207 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
208 it != audio_buffers_.end(); it++) { | |
209 audio_buffer_length += (*it)->length(); | |
210 } | |
211 string data; | |
212 data.reserve(audio_buffer_length); | |
213 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
214 it != audio_buffers_.end(); it++) { | |
215 data.append(*(*it)); | |
216 } | |
217 | |
218 DCHECK(!request_.get()); | 143 DCHECK(!request_.get()); |
219 request_.reset(new SpeechRecognitionRequest( | 144 request_.reset(new SpeechRecognitionRequest( |
220 Profile::GetDefaultRequestContext(), this)); | 145 Profile::GetDefaultRequestContext(), this)); |
221 request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data); | 146 request_->Send(language_, grammar_, hardware_info_, |
222 ReleaseAudioBuffers(); // No need to keep the audio anymore. | 147 (codec_ == AudioEncoder::FLAC) ? kContentTypeFLAC : kContentTypeSpeex, |
148 data); | |
149 encoder_.reset(); | |
223 } | 150 } |
224 | 151 |
225 void SpeechRecognizer::ReleaseAudioBuffers() { | 152 void SpeechRecognizer::ReleaseAudioBuffers() { |
226 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
227 it != audio_buffers_.end(); it++) | |
228 delete *it; | |
229 audio_buffers_.clear(); | |
230 } | 153 } |
231 | 154 |
232 // Invoked in the audio thread. | 155 // Invoked in the audio thread. |
233 void SpeechRecognizer::OnError(AudioInputController* controller, | 156 void SpeechRecognizer::OnError(AudioInputController* controller, |
234 int error_code) { | 157 int error_code) { |
235 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 158 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
236 NewRunnableMethod(this, | 159 NewRunnableMethod(this, |
237 &SpeechRecognizer::HandleOnError, | 160 &SpeechRecognizer::HandleOnError, |
238 error_code)); | 161 error_code)); |
239 } | 162 } |
(...skipping 28 matching lines...) | |
268 // by |OnData|. | 191 // by |OnData|. |
269 if (!audio_controller_.get()) { | 192 if (!audio_controller_.get()) { |
270 delete data; | 193 delete data; |
271 return; | 194 return; |
272 } | 195 } |
273 | 196 |
274 const short* samples = reinterpret_cast<const short*>(data->data()); | 197 const short* samples = reinterpret_cast<const short*>(data->data()); |
275 DCHECK((data->length() % sizeof(short)) == 0); | 198 DCHECK((data->length() % sizeof(short)) == 0); |
276 int num_samples = data->length() / sizeof(short); | 199 int num_samples = data->length() / sizeof(short); |
277 | 200 |
278 encoder_->Encode(samples, num_samples, &audio_buffers_); | 201 encoder_->Encode(samples, num_samples); |
279 float rms; | 202 float rms; |
280 endpointer_.ProcessAudio(samples, num_samples, &rms); | 203 endpointer_.ProcessAudio(samples, num_samples, &rms); |
281 delete data; | 204 delete data; |
282 num_samples_recorded_ += num_samples; | 205 num_samples_recorded_ += num_samples; |
283 | 206 |
284 if (endpointer_.IsEstimatingEnvironment()) { | 207 if (endpointer_.IsEstimatingEnvironment()) { |
285 // Check if we have gathered enough audio for the endpointer to do | 208 // Check if we have gathered enough audio for the endpointer to do |
286 // environment estimation and should move on to detect speech/end of speech. | 209 // environment estimation and should move on to detect speech/end of speech. |
287 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | 210 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
288 kAudioSampleRate) / 1000) { | 211 kAudioSampleRate) / 1000) { |
(...skipping 45 matching lines...) | |
334 | 257 |
335 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { | 258 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { |
336 CancelRecognition(); | 259 CancelRecognition(); |
337 | 260 |
338 // Guard against the delegate freeing us until we finish our job. | 261 // Guard against the delegate freeing us until we finish our job. |
339 scoped_refptr<SpeechRecognizer> me(this); | 262 scoped_refptr<SpeechRecognizer> me(this); |
340 delegate_->OnRecognizerError(caller_id_, error); | 263 delegate_->OnRecognizerError(caller_id_, error); |
341 } | 264 } |
342 | 265 |
343 } // namespace speech_input | 266 } // namespace speech_input |
OLD | NEW |