OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
6 | 6 |
7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
9 #include "base/time.h" | 9 #include "base/time.h" |
10 #include "chrome/browser/browser_thread.h" | 10 #include "chrome/browser/browser_thread.h" |
11 #include "chrome/browser/profiles/profile.h" | 11 #include "chrome/browser/profiles/profile.h" |
12 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
13 #include "third_party/speex/speex.h" | |
14 | 13 |
15 using media::AudioInputController; | 14 using media::AudioInputController; |
16 using std::list; | |
17 using std::string; | 15 using std::string; |
18 | 16 |
19 namespace { | 17 namespace { |
20 const char* const kContentTypeSpeex = | |
21 "audio/x-speex-with-header-byte; rate=16000"; | |
22 const int kSpeexEncodingQuality = 8; | |
23 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). | |
24 | |
25 // Since the frame length gets written out as a byte in the encoded packet, | |
26 // make sure it is within the byte range. | |
27 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); | |
28 | 18 |
29 // The following constants are related to the volume level indicator shown in | 19 // The following constants are related to the volume level indicator shown in |
30 // the UI for recorded audio. | 20 // the UI for recorded audio. |
31 // Multiplier used when new volume is greater than previous level. | 21 // Multiplier used when new volume is greater than previous level. |
32 const float kUpSmoothingFactor = 0.9f; | 22 const float kUpSmoothingFactor = 0.9f; |
33 // Multiplier used when new volume is lesser than previous level. | 23 // Multiplier used when new volume is lesser than previous level. |
34 const float kDownSmoothingFactor = 0.4f; | 24 const float kDownSmoothingFactor = 0.4f; |
35 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. | 25 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. |
36 const float kAudioMeterDbRange = 25.0f; | 26 const float kAudioMeterDbRange = 25.0f; |
37 } // namespace | 27 } // namespace |
38 | 28 |
39 namespace speech_input { | 29 namespace speech_input { |
40 | 30 |
41 const int SpeechRecognizer::kAudioSampleRate = 16000; | 31 const int SpeechRecognizer::kAudioSampleRate = 16000; |
42 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; | 32 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; |
43 const int SpeechRecognizer::kNumAudioChannels = 1; | 33 const int SpeechRecognizer::kNumAudioChannels = 1; |
44 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; | 34 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; |
45 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; | 35 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; |
46 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; | 36 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; |
47 | 37 |
48 // Provides a simple interface to encode raw audio using the Speex codec. | |
49 class SpeexEncoder { | |
50 public: | |
51 SpeexEncoder(); | |
52 ~SpeexEncoder(); | |
53 | |
54 int samples_per_frame() const { return samples_per_frame_; } | |
55 | |
56 // Encodes each frame of raw audio in |samples| and adds the | |
57 // encoded frames as a set of strings to the |encoded_frames| list. | |
58 // Ownership of the newly added strings is transferred to the caller. | |
59 void Encode(const short* samples, | |
60 int num_samples, | |
61 std::list<std::string*>* encoded_frames); | |
62 | |
63 private: | |
64 SpeexBits bits_; | |
65 void* encoder_state_; | |
66 int samples_per_frame_; | |
67 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. | |
68 }; | |
69 | |
70 SpeexEncoder::SpeexEncoder() { | |
71 // speex_bits_init() does not initialize all of the |bits_| struct. | |
72 memset(&bits_, 0, sizeof(bits_)); | |
73 speex_bits_init(&bits_); | |
74 encoder_state_ = speex_encoder_init(&speex_wb_mode); | |
75 DCHECK(encoder_state_); | |
76 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); | |
77 DCHECK(samples_per_frame_ > 0); | |
78 int quality = kSpeexEncodingQuality; | |
79 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); | |
80 int vbr = 1; | |
81 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); | |
82 memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); | |
83 } | |
84 | |
85 SpeexEncoder::~SpeexEncoder() { | |
86 speex_bits_destroy(&bits_); | |
87 speex_encoder_destroy(encoder_state_); | |
88 } | |
89 | |
90 void SpeexEncoder::Encode(const short* samples, | |
91 int num_samples, | |
92 std::list<std::string*>* encoded_frames) { | |
93 // Drop incomplete frames, typically those which come in when recording stops. | |
94 num_samples -= (num_samples % samples_per_frame_); | |
95 for (int i = 0; i < num_samples; i += samples_per_frame_) { | |
96 speex_bits_reset(&bits_); | |
97 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), | |
98 &bits_); | |
99 | |
100 // Encode the frame and place the size of the frame as the first byte. This | |
101 // is the packet format for MIME type x-speex-with-header-byte. | |
102 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, | |
103 kMaxSpeexFrameLength); | |
104 encoded_frame_data_[0] = static_cast<char>(frame_length); | |
105 encoded_frames->push_back(new string(encoded_frame_data_, | |
106 frame_length + 1)); | |
107 } | |
108 } | |
109 | |
110 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, | 38 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
111 int caller_id, | 39 int caller_id, |
112 const std::string& language, | 40 const std::string& language, |
113 const std::string& grammar, | 41 const std::string& grammar, |
114 const std::string& hardware_info) | 42 const std::string& hardware_info) |
115 : delegate_(delegate), | 43 : delegate_(delegate), |
116 caller_id_(caller_id), | 44 caller_id_(caller_id), |
117 language_(language), | 45 language_(language), |
118 grammar_(grammar), | 46 grammar_(grammar), |
119 hardware_info_(hardware_info), | 47 hardware_info_(hardware_info), |
120 encoder_(new SpeexEncoder()), | 48 codec_(AudioEncoder::CODEC_SPEEX), |
| 49 encoder_(NULL), |
121 endpointer_(kAudioSampleRate), | 50 endpointer_(kAudioSampleRate), |
122 num_samples_recorded_(0), | 51 num_samples_recorded_(0), |
123 audio_level_(0.0f) { | 52 audio_level_(0.0f) { |
124 endpointer_.set_speech_input_complete_silence_length( | 53 endpointer_.set_speech_input_complete_silence_length( |
125 base::Time::kMicrosecondsPerSecond / 2); | 54 base::Time::kMicrosecondsPerSecond / 2); |
126 endpointer_.set_long_speech_input_complete_silence_length( | 55 endpointer_.set_long_speech_input_complete_silence_length( |
127 base::Time::kMicrosecondsPerSecond); | 56 base::Time::kMicrosecondsPerSecond); |
128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 57 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
129 endpointer_.StartSession(); | 58 endpointer_.StartSession(); |
130 } | 59 } |
131 | 60 |
132 SpeechRecognizer::~SpeechRecognizer() { | 61 SpeechRecognizer::~SpeechRecognizer() { |
133 // Recording should have stopped earlier due to the endpointer or | 62 // Recording should have stopped earlier due to the endpointer or |
134 // |StopRecording| being called. | 63 // |StopRecording| being called. |
135 DCHECK(!audio_controller_.get()); | 64 DCHECK(!audio_controller_.get()); |
136 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 65 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
137 DCHECK(audio_buffers_.empty()); | 66 DCHECK(!encoder_.get()); |
138 endpointer_.EndSession(); | 67 endpointer_.EndSession(); |
139 } | 68 } |
140 | 69 |
141 bool SpeechRecognizer::StartRecording() { | 70 bool SpeechRecognizer::StartRecording() { |
142 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 71 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
143 DCHECK(!audio_controller_.get()); | 72 DCHECK(!audio_controller_.get()); |
144 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 73 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 74 DCHECK(!encoder_.get()); |
145 | 75 |
146 // The endpointer needs to estimate the environment/background noise before | 76 // The endpointer needs to estimate the environment/background noise before |
147 // starting to treat the audio as user input. In |HandleOnData| we wait until | 77 // starting to treat the audio as user input. In |HandleOnData| we wait until |
148 // such time has passed before switching to user input mode. | 78 // such time has passed before switching to user input mode. |
149 endpointer_.SetEnvironmentEstimationMode(); | 79 endpointer_.SetEnvironmentEstimationMode(); |
150 | 80 |
| 81 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, |
| 82 kNumBitsPerAudioSample)); |
151 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 83 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
152 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | |
153 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, | 84 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, |
154 kAudioSampleRate, kNumBitsPerAudioSample, | 85 kAudioSampleRate, kNumBitsPerAudioSample, |
155 samples_per_packet); | 86 samples_per_packet); |
156 audio_controller_ = AudioInputController::Create(this, params); | 87 audio_controller_ = AudioInputController::Create(this, params); |
157 DCHECK(audio_controller_.get()); | 88 DCHECK(audio_controller_.get()); |
158 VLOG(1) << "SpeechRecognizer starting record."; | 89 VLOG(1) << "SpeechRecognizer starting record."; |
159 num_samples_recorded_ = 0; | 90 num_samples_recorded_ = 0; |
160 audio_controller_->Record(); | 91 audio_controller_->Record(); |
161 | 92 |
162 return true; | 93 return true; |
163 } | 94 } |
164 | 95 |
165 void SpeechRecognizer::CancelRecognition() { | 96 void SpeechRecognizer::CancelRecognition() { |
166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 97 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
167 DCHECK(audio_controller_.get() || request_.get()); | 98 DCHECK(audio_controller_.get() || request_.get()); |
168 | 99 |
169 // Stop recording if required. | 100 // Stop recording if required. |
170 if (audio_controller_.get()) { | 101 if (audio_controller_.get()) { |
171 VLOG(1) << "SpeechRecognizer stopping record."; | 102 VLOG(1) << "SpeechRecognizer stopping record."; |
172 audio_controller_->Close(); | 103 audio_controller_->Close(); |
173 audio_controller_ = NULL; // Releases the ref ptr. | 104 audio_controller_ = NULL; // Releases the ref ptr. |
174 } | 105 } |
175 | 106 |
176 VLOG(1) << "SpeechRecognizer canceling recognition."; | 107 VLOG(1) << "SpeechRecognizer canceling recognition."; |
177 ReleaseAudioBuffers(); | 108 encoder_.reset(); |
178 request_.reset(); | 109 request_.reset(); |
179 } | 110 } |
180 | 111 |
181 void SpeechRecognizer::StopRecording() { | 112 void SpeechRecognizer::StopRecording() { |
182 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 113 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
183 | 114 |
184 // If audio recording has already stopped and we are in recognition phase, | 115 // If audio recording has already stopped and we are in recognition phase, |
185 // silently ignore any more calls to stop recording. | 116 // silently ignore any more calls to stop recording. |
186 if (!audio_controller_.get()) | 117 if (!audio_controller_.get()) |
187 return; | 118 return; |
188 | 119 |
189 VLOG(1) << "SpeechRecognizer stopping record."; | 120 VLOG(1) << "SpeechRecognizer stopping record."; |
190 audio_controller_->Close(); | 121 audio_controller_->Close(); |
191 audio_controller_ = NULL; // Releases the ref ptr. | 122 audio_controller_ = NULL; // Releases the ref ptr. |
| 123 encoder_->Flush(); |
192 | 124 |
193 delegate_->DidCompleteRecording(caller_id_); | 125 delegate_->DidCompleteRecording(caller_id_); |
194 | 126 |
195 // If we haven't got any audio yet end the recognition sequence here. | 127 // Since the http request takes a single string as POST data, allocate |
196 if (audio_buffers_.empty()) { | 128 // one and copy over bytes from the audio buffers to the string. |
| 129 // If we haven't got any audio yet, end the recognition sequence here. |
| 130 string data; |
| 131 if (!encoder_->GetEncodedData(&data)) { |
197 // Guard against the delegate freeing us until we finish our job. | 132 // Guard against the delegate freeing us until we finish our job. |
198 scoped_refptr<SpeechRecognizer> me(this); | 133 scoped_refptr<SpeechRecognizer> me(this); |
199 delegate_->DidCompleteRecognition(caller_id_); | 134 delegate_->DidCompleteRecognition(caller_id_); |
200 return; | 135 } else { |
| 136 DCHECK(!request_.get()); |
| 137 request_.reset(new SpeechRecognitionRequest( |
| 138 Profile::GetDefaultRequestContext(), this)); |
| 139 request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(), |
| 140 data); |
201 } | 141 } |
202 | 142 encoder_.reset(); |
203 // We now have recorded audio in our buffers, so start a recognition request. | |
204 // Since the http request takes a single string as POST data, allocate | |
205 // one and copy over bytes from the audio buffers to the string. | |
206 int audio_buffer_length = 0; | |
207 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
208 it != audio_buffers_.end(); it++) { | |
209 audio_buffer_length += (*it)->length(); | |
210 } | |
211 string data; | |
212 data.reserve(audio_buffer_length); | |
213 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
214 it != audio_buffers_.end(); it++) { | |
215 data.append(*(*it)); | |
216 } | |
217 | |
218 DCHECK(!request_.get()); | |
219 request_.reset(new SpeechRecognitionRequest( | |
220 Profile::GetDefaultRequestContext(), this)); | |
221 request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data); | |
222 ReleaseAudioBuffers(); // No need to keep the audio anymore. | |
223 } | 143 } |
224 | 144 |
225 void SpeechRecognizer::ReleaseAudioBuffers() { | 145 void SpeechRecognizer::ReleaseAudioBuffers() { |
226 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
227 it != audio_buffers_.end(); it++) | |
228 delete *it; | |
229 audio_buffers_.clear(); | |
230 } | 146 } |
231 | 147 |
232 // Invoked in the audio thread. | 148 // Invoked in the audio thread. |
233 void SpeechRecognizer::OnError(AudioInputController* controller, | 149 void SpeechRecognizer::OnError(AudioInputController* controller, |
234 int error_code) { | 150 int error_code) { |
235 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 151 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
236 NewRunnableMethod(this, | 152 NewRunnableMethod(this, |
237 &SpeechRecognizer::HandleOnError, | 153 &SpeechRecognizer::HandleOnError, |
238 error_code)); | 154 error_code)); |
239 } | 155 } |
(...skipping 28 matching lines...) Expand all Loading... |
268 // by |OnData|. | 184 // by |OnData|. |
269 if (!audio_controller_.get()) { | 185 if (!audio_controller_.get()) { |
270 delete data; | 186 delete data; |
271 return; | 187 return; |
272 } | 188 } |
273 | 189 |
274 const short* samples = reinterpret_cast<const short*>(data->data()); | 190 const short* samples = reinterpret_cast<const short*>(data->data()); |
275 DCHECK((data->length() % sizeof(short)) == 0); | 191 DCHECK((data->length() % sizeof(short)) == 0); |
276 int num_samples = data->length() / sizeof(short); | 192 int num_samples = data->length() / sizeof(short); |
277 | 193 |
278 encoder_->Encode(samples, num_samples, &audio_buffers_); | 194 encoder_->Encode(samples, num_samples); |
279 float rms; | 195 float rms; |
280 endpointer_.ProcessAudio(samples, num_samples, &rms); | 196 endpointer_.ProcessAudio(samples, num_samples, &rms); |
281 delete data; | 197 delete data; |
282 num_samples_recorded_ += num_samples; | 198 num_samples_recorded_ += num_samples; |
283 | 199 |
284 if (endpointer_.IsEstimatingEnvironment()) { | 200 if (endpointer_.IsEstimatingEnvironment()) { |
285 // Check if we have gathered enough audio for the endpointer to do | 201 // Check if we have gathered enough audio for the endpointer to do |
286 // environment estimation and should move on to detect speech/end of speech. | 202 // environment estimation and should move on to detect speech/end of speech. |
287 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | 203 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
288 kAudioSampleRate) / 1000) { | 204 kAudioSampleRate) / 1000) { |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
334 | 250 |
335 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { | 251 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { |
336 CancelRecognition(); | 252 CancelRecognition(); |
337 | 253 |
338 // Guard against the delegate freeing us until we finish our job. | 254 // Guard against the delegate freeing us until we finish our job. |
339 scoped_refptr<SpeechRecognizer> me(this); | 255 scoped_refptr<SpeechRecognizer> me(this); |
340 delegate_->OnRecognizerError(caller_id_, error); | 256 delegate_->OnRecognizerError(caller_id_, error); |
341 } | 257 } |
342 | 258 |
343 } // namespace speech_input | 259 } // namespace speech_input |
OLD | NEW |