Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(567)

Side by Side Diff: chrome/browser/speech/speech_recognizer.cc

Issue 6111009: Add the option of compressing speech input audio using FLAC. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: . Created 9 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/speech/speech_recognizer.h" 5 #include "chrome/browser/speech/speech_recognizer.h"
6 6
7 #include "base/ref_counted.h" 7 #include "base/ref_counted.h"
8 #include "base/scoped_ptr.h" 8 #include "base/scoped_ptr.h"
9 #include "base/time.h" 9 #include "base/time.h"
10 #include "chrome/browser/browser_thread.h" 10 #include "chrome/browser/browser_thread.h"
11 #include "chrome/browser/profiles/profile.h" 11 #include "chrome/browser/profiles/profile.h"
12 #include "chrome/common/net/url_request_context_getter.h" 12 #include "chrome/common/net/url_request_context_getter.h"
13 #include "third_party/speex/speex.h"
14 13
15 using media::AudioInputController; 14 using media::AudioInputController;
16 using std::list;
17 using std::string; 15 using std::string;
18 16
19 namespace { 17 namespace {
20 const char* const kContentTypeSpeex =
21 "audio/x-speex-with-header-byte; rate=16000";
22 const int kSpeexEncodingQuality = 8;
23 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
24
25 // Since the frame length gets written out as a byte in the encoded packet,
26 // make sure it is within the byte range.
27 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
28 18
29 // The following constants are related to the volume level indicator shown in 19 // The following constants are related to the volume level indicator shown in
30 // the UI for recorded audio. 20 // the UI for recorded audio.
31 // Multiplier used when new volume is greater than previous level. 21 // Multiplier used when new volume is greater than previous level.
32 const float kUpSmoothingFactor = 0.9f; 22 const float kUpSmoothingFactor = 0.9f;
33 // Multiplier used when new volume is less than previous level. 23 // Multiplier used when new volume is less than previous level.
34 const float kDownSmoothingFactor = 0.4f; 24 const float kDownSmoothingFactor = 0.4f;
35 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. 25 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter.
36 const float kAudioMeterDbRange = 25.0f; 26 const float kAudioMeterDbRange = 25.0f;
37 } // namespace 27 } // namespace
38 28
39 namespace speech_input { 29 namespace speech_input {
40 30
41 const int SpeechRecognizer::kAudioSampleRate = 16000; 31 const int SpeechRecognizer::kAudioSampleRate = 16000;
42 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; 32 const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
43 const int SpeechRecognizer::kNumAudioChannels = 1; 33 const int SpeechRecognizer::kNumAudioChannels = 1;
44 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; 34 const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
45 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; 35 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
46 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; 36 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;
47 37
48 // Provides a simple interface to encode raw audio using the Speex codec.
49 class SpeexEncoder {
50 public:
51 SpeexEncoder();
52 ~SpeexEncoder();
53
54 int samples_per_frame() const { return samples_per_frame_; }
55
56 // Encodes each frame of raw audio in |samples| and adds the
57 // encoded frames as a set of strings to the |encoded_frames| list.
58 // Ownership of the newly added strings is transferred to the caller.
59 void Encode(const short* samples,
60 int num_samples,
61 std::list<std::string*>* encoded_frames);
62
63 private:
64 SpeexBits bits_;
65 void* encoder_state_;
66 int samples_per_frame_;
67 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.
68 };
69
70 SpeexEncoder::SpeexEncoder() {
71 // speex_bits_init() does not initialize all of the |bits_| struct.
72 memset(&bits_, 0, sizeof(bits_));
73 speex_bits_init(&bits_);
74 encoder_state_ = speex_encoder_init(&speex_wb_mode);
75 DCHECK(encoder_state_);
76 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
77 DCHECK(samples_per_frame_ > 0);
78 int quality = kSpeexEncodingQuality;
79 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
80 int vbr = 1;
81 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
82 memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
83 }
84
85 SpeexEncoder::~SpeexEncoder() {
86 speex_bits_destroy(&bits_);
87 speex_encoder_destroy(encoder_state_);
88 }
89
90 void SpeexEncoder::Encode(const short* samples,
91 int num_samples,
92 std::list<std::string*>* encoded_frames) {
93 // Drop incomplete frames, typically those which come in when recording stops.
94 num_samples -= (num_samples % samples_per_frame_);
95 for (int i = 0; i < num_samples; i += samples_per_frame_) {
96 speex_bits_reset(&bits_);
97 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
98 &bits_);
99
100 // Encode the frame and place the size of the frame as the first byte. This
101 // is the packet format for MIME type x-speex-with-header-byte.
102 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
103 kMaxSpeexFrameLength);
104 encoded_frame_data_[0] = static_cast<char>(frame_length);
105 encoded_frames->push_back(new string(encoded_frame_data_,
106 frame_length + 1));
107 }
108 }
109
110 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, 38 SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
111 int caller_id, 39 int caller_id,
112 const std::string& language, 40 const std::string& language,
113 const std::string& grammar, 41 const std::string& grammar,
114 const std::string& hardware_info) 42 const std::string& hardware_info)
115 : delegate_(delegate), 43 : delegate_(delegate),
116 caller_id_(caller_id), 44 caller_id_(caller_id),
117 language_(language), 45 language_(language),
118 grammar_(grammar), 46 grammar_(grammar),
119 hardware_info_(hardware_info), 47 hardware_info_(hardware_info),
120 encoder_(new SpeexEncoder()), 48 codec_(AudioEncoder::CODEC_SPEEX),
49 encoder_(NULL),
121 endpointer_(kAudioSampleRate), 50 endpointer_(kAudioSampleRate),
122 num_samples_recorded_(0), 51 num_samples_recorded_(0),
123 audio_level_(0.0f) { 52 audio_level_(0.0f) {
124 endpointer_.set_speech_input_complete_silence_length( 53 endpointer_.set_speech_input_complete_silence_length(
125 base::Time::kMicrosecondsPerSecond / 2); 54 base::Time::kMicrosecondsPerSecond / 2);
126 endpointer_.set_long_speech_input_complete_silence_length( 55 endpointer_.set_long_speech_input_complete_silence_length(
127 base::Time::kMicrosecondsPerSecond); 56 base::Time::kMicrosecondsPerSecond);
128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); 57 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
129 endpointer_.StartSession(); 58 endpointer_.StartSession();
130 } 59 }
131 60
132 SpeechRecognizer::~SpeechRecognizer() { 61 SpeechRecognizer::~SpeechRecognizer() {
133 // Recording should have stopped earlier due to the endpointer or 62 // Recording should have stopped earlier due to the endpointer or
134 // |StopRecording| being called. 63 // |StopRecording| being called.
135 DCHECK(!audio_controller_.get()); 64 DCHECK(!audio_controller_.get());
136 DCHECK(!request_.get() || !request_->HasPendingRequest()); 65 DCHECK(!request_.get() || !request_->HasPendingRequest());
137 DCHECK(audio_buffers_.empty()); 66 DCHECK(!encoder_.get());
138 endpointer_.EndSession(); 67 endpointer_.EndSession();
139 } 68 }
140 69
141 bool SpeechRecognizer::StartRecording() { 70 bool SpeechRecognizer::StartRecording() {
142 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 71 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
143 DCHECK(!audio_controller_.get()); 72 DCHECK(!audio_controller_.get());
144 DCHECK(!request_.get() || !request_->HasPendingRequest()); 73 DCHECK(!request_.get() || !request_->HasPendingRequest());
74 DCHECK(!encoder_.get());
145 75
146 // The endpointer needs to estimate the environment/background noise before 76 // The endpointer needs to estimate the environment/background noise before
147 // starting to treat the audio as user input. In |HandleOnData| we wait until 77 // starting to treat the audio as user input. In |HandleOnData| we wait until
148 // such time has passed before switching to user input mode. 78 // such time has passed before switching to user input mode.
149 endpointer_.SetEnvironmentEstimationMode(); 79 endpointer_.SetEnvironmentEstimationMode();
150 80
81 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
82 kNumBitsPerAudioSample));
151 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; 83 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
152 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
153 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, 84 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
154 kAudioSampleRate, kNumBitsPerAudioSample, 85 kAudioSampleRate, kNumBitsPerAudioSample,
155 samples_per_packet); 86 samples_per_packet);
156 audio_controller_ = AudioInputController::Create(this, params); 87 audio_controller_ = AudioInputController::Create(this, params);
157 DCHECK(audio_controller_.get()); 88 DCHECK(audio_controller_.get());
158 VLOG(1) << "SpeechRecognizer starting record."; 89 VLOG(1) << "SpeechRecognizer starting record.";
159 num_samples_recorded_ = 0; 90 num_samples_recorded_ = 0;
160 audio_controller_->Record(); 91 audio_controller_->Record();
161 92
162 return true; 93 return true;
163 } 94 }
164 95
165 void SpeechRecognizer::CancelRecognition() { 96 void SpeechRecognizer::CancelRecognition() {
166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 97 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
167 DCHECK(audio_controller_.get() || request_.get()); 98 DCHECK(audio_controller_.get() || request_.get());
168 99
169 // Stop recording if required. 100 // Stop recording if required.
170 if (audio_controller_.get()) { 101 if (audio_controller_.get()) {
171 VLOG(1) << "SpeechRecognizer stopping record."; 102 VLOG(1) << "SpeechRecognizer stopping record.";
172 audio_controller_->Close(); 103 audio_controller_->Close();
173 audio_controller_ = NULL; // Releases the ref ptr. 104 audio_controller_ = NULL; // Releases the ref ptr.
174 } 105 }
175 106
176 VLOG(1) << "SpeechRecognizer canceling recognition."; 107 VLOG(1) << "SpeechRecognizer canceling recognition.";
177 ReleaseAudioBuffers(); 108 encoder_.reset();
178 request_.reset(); 109 request_.reset();
179 } 110 }
180 111
181 void SpeechRecognizer::StopRecording() { 112 void SpeechRecognizer::StopRecording() {
182 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); 113 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
183 114
184 // If audio recording has already stopped and we are in recognition phase, 115 // If audio recording has already stopped and we are in recognition phase,
185 // silently ignore any more calls to stop recording. 116 // silently ignore any more calls to stop recording.
186 if (!audio_controller_.get()) 117 if (!audio_controller_.get())
187 return; 118 return;
188 119
189 VLOG(1) << "SpeechRecognizer stopping record."; 120 VLOG(1) << "SpeechRecognizer stopping record.";
190 audio_controller_->Close(); 121 audio_controller_->Close();
191 audio_controller_ = NULL; // Releases the ref ptr. 122 audio_controller_ = NULL; // Releases the ref ptr.
123 encoder_->Flush();
192 124
193 delegate_->DidCompleteRecording(caller_id_); 125 delegate_->DidCompleteRecording(caller_id_);
194 126
195 // If we haven't got any audio yet end the recognition sequence here. 127 // Since the http request takes a single string as POST data, allocate
196 if (audio_buffers_.empty()) { 128 // one and copy over bytes from the audio buffers to the string.
129 // And if we haven't got any audio yet, end the recognition sequence here.
130 string data;
131 if (!encoder_->GetEncodedData(&data)) {
197 // Guard against the delegate freeing us until we finish our job. 132 // Guard against the delegate freeing us until we finish our job.
198 scoped_refptr<SpeechRecognizer> me(this); 133 scoped_refptr<SpeechRecognizer> me(this);
199 delegate_->DidCompleteRecognition(caller_id_); 134 delegate_->DidCompleteRecognition(caller_id_);
200 return; 135 } else {
136 DCHECK(!request_.get());
137 request_.reset(new SpeechRecognitionRequest(
138 Profile::GetDefaultRequestContext(), this));
139 request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(),
140 data);
201 } 141 }
202 142 encoder_.reset();
203 // We now have recorded audio in our buffers, so start a recognition request.
204 // Since the http request takes a single string as POST data, allocate
205 // one and copy over bytes from the audio buffers to the string.
206 int audio_buffer_length = 0;
207 for (AudioBufferQueue::iterator it = audio_buffers_.begin();
208 it != audio_buffers_.end(); it++) {
209 audio_buffer_length += (*it)->length();
210 }
211 string data;
212 data.reserve(audio_buffer_length);
213 for (AudioBufferQueue::iterator it = audio_buffers_.begin();
214 it != audio_buffers_.end(); it++) {
215 data.append(*(*it));
216 }
217
218 DCHECK(!request_.get());
219 request_.reset(new SpeechRecognitionRequest(
220 Profile::GetDefaultRequestContext(), this));
221 request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data);
222 ReleaseAudioBuffers(); // No need to keep the audio anymore.
223 } 143 }
224 144
225 void SpeechRecognizer::ReleaseAudioBuffers() { 145 void SpeechRecognizer::ReleaseAudioBuffers() {
226 for (AudioBufferQueue::iterator it = audio_buffers_.begin();
227 it != audio_buffers_.end(); it++)
228 delete *it;
229 audio_buffers_.clear();
230 } 146 }
231 147
232 // Invoked in the audio thread. 148 // Invoked in the audio thread.
233 void SpeechRecognizer::OnError(AudioInputController* controller, 149 void SpeechRecognizer::OnError(AudioInputController* controller,
234 int error_code) { 150 int error_code) {
235 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 151 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
236 NewRunnableMethod(this, 152 NewRunnableMethod(this,
237 &SpeechRecognizer::HandleOnError, 153 &SpeechRecognizer::HandleOnError,
238 error_code)); 154 error_code));
239 } 155 }
(...skipping 28 matching lines...) Expand all
268 // by |OnData|. 184 // by |OnData|.
269 if (!audio_controller_.get()) { 185 if (!audio_controller_.get()) {
270 delete data; 186 delete data;
271 return; 187 return;
272 } 188 }
273 189
274 const short* samples = reinterpret_cast<const short*>(data->data()); 190 const short* samples = reinterpret_cast<const short*>(data->data());
275 DCHECK((data->length() % sizeof(short)) == 0); 191 DCHECK((data->length() % sizeof(short)) == 0);
276 int num_samples = data->length() / sizeof(short); 192 int num_samples = data->length() / sizeof(short);
277 193
278 encoder_->Encode(samples, num_samples, &audio_buffers_); 194 encoder_->Encode(samples, num_samples);
279 float rms; 195 float rms;
280 endpointer_.ProcessAudio(samples, num_samples, &rms); 196 endpointer_.ProcessAudio(samples, num_samples, &rms);
281 delete data; 197 delete data;
282 num_samples_recorded_ += num_samples; 198 num_samples_recorded_ += num_samples;
283 199
284 if (endpointer_.IsEstimatingEnvironment()) { 200 if (endpointer_.IsEstimatingEnvironment()) {
285 // Check if we have gathered enough audio for the endpointer to do 201 // Check if we have gathered enough audio for the endpointer to do
286 // environment estimation and should move on to detect speech/end of speech. 202 // environment estimation and should move on to detect speech/end of speech.
287 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * 203 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
288 kAudioSampleRate) / 1000) { 204 kAudioSampleRate) / 1000) {
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
334 250
335 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { 251 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
336 CancelRecognition(); 252 CancelRecognition();
337 253
338 // Guard against the delegate freeing us until we finish our job. 254 // Guard against the delegate freeing us until we finish our job.
339 scoped_refptr<SpeechRecognizer> me(this); 255 scoped_refptr<SpeechRecognizer> me(this);
340 delegate_->OnRecognizerError(caller_id_, error); 256 delegate_->OnRecognizerError(caller_id_, error);
341 } 257 }
342 258
343 } // namespace speech_input 259 } // namespace speech_input
OLDNEW
« no previous file with comments | « chrome/browser/speech/speech_recognizer.h ('k') | chrome/browser/speech/speech_recognizer_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698