Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(42)

Side by Side Diff: chrome/browser/speech/speech_recognizer.cc

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)
Patch Set: Merged with latest. Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/speech/speech_recognizer.h" 5 #include "chrome/browser/speech/speech_recognizer.h"
6 6
7 #include "base/ref_counted.h" 7 #include "base/ref_counted.h"
8 #include "base/scoped_ptr.h" 8 #include "base/scoped_ptr.h"
9 #include "base/time.h"
9 #include "chrome/browser/chrome_thread.h" 10 #include "chrome/browser/chrome_thread.h"
10 #include "chrome/browser/profile.h" 11 #include "chrome/browser/profile.h"
11 #include "chrome/common/net/url_request_context_getter.h" 12 #include "chrome/common/net/url_request_context_getter.h"
12 #include "third_party/speex/include/speex/speex.h" 13 #include "third_party/speex/include/speex/speex.h"
13 14
14 using media::AudioInputController; 15 using media::AudioInputController;
15 using std::list; 16 using std::list;
16 using std::string; 17 using std::string;
17 18
18 namespace { 19 namespace {
(...skipping 17 matching lines...) Expand all
36 namespace speech_input { 37 namespace speech_input {
37 38
38 // Provides a simple interface to encode raw audio using the Speex codec. 39 // Provides a simple interface to encode raw audio using the Speex codec.
39 class SpeexEncoder { 40 class SpeexEncoder {
40 public: 41 public:
41 SpeexEncoder(); 42 SpeexEncoder();
42 ~SpeexEncoder(); 43 ~SpeexEncoder();
43 44
44 int samples_per_frame() const { return samples_per_frame_; } 45 int samples_per_frame() const { return samples_per_frame_; }
45 46
46 // Encodes each frame of raw audio in |raw_samples| and adds the 47 // Encodes each frame of raw audio in |samples| and adds the
47 // encoded frames as a set of strings to the |encoded_frames| list. 48 // encoded frames as a set of strings to the |encoded_frames| list.
48 // Ownership of the newly added strings is transferred to the caller. 49 // Ownership of the newly added strings is transferred to the caller.
49 void Encode(const string& raw_samples, 50 void Encode(const short* samples,
51 int num_samples,
50 std::list<std::string*>* encoded_frames); 52 std::list<std::string*>* encoded_frames);
51 53
52 private: 54 private:
53 SpeexBits bits_; 55 SpeexBits bits_;
54 void* encoder_state_; 56 void* encoder_state_;
55 int samples_per_frame_; 57 int samples_per_frame_;
56 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. 58 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.
57 }; 59 };
58 60
59 SpeexEncoder::SpeexEncoder() { 61 SpeexEncoder::SpeexEncoder() {
60 speex_bits_init(&bits_); 62 speex_bits_init(&bits_);
61 encoder_state_ = speex_encoder_init(&speex_wb_mode); 63 encoder_state_ = speex_encoder_init(&speex_wb_mode);
62 DCHECK(encoder_state_); 64 DCHECK(encoder_state_);
63 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); 65 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
64 DCHECK(samples_per_frame_ > 0); 66 DCHECK(samples_per_frame_ > 0);
65 int quality = kSpeexEncodingQuality; 67 int quality = kSpeexEncodingQuality;
66 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); 68 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
67 int vbr = 1; 69 int vbr = 1;
68 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); 70 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
69 } 71 }
70 72
71 SpeexEncoder::~SpeexEncoder() { 73 SpeexEncoder::~SpeexEncoder() {
72 speex_bits_destroy(&bits_); 74 speex_bits_destroy(&bits_);
73 speex_encoder_destroy(encoder_state_); 75 speex_encoder_destroy(encoder_state_);
74 } 76 }
75 77
76 void SpeexEncoder::Encode(const string& raw_samples, 78 void SpeexEncoder::Encode(const short* samples,
79 int num_samples,
77 std::list<std::string*>* encoded_frames) { 80 std::list<std::string*>* encoded_frames) {
78 const short* samples = reinterpret_cast<const short*>(raw_samples.data());
79 DCHECK((raw_samples.length() % sizeof(short)) == 0);
80 int num_samples = raw_samples.length() / sizeof(short);
81
82 // Drop incomplete frames, typically those which come in when recording stops. 81 // Drop incomplete frames, typically those which come in when recording stops.
83 num_samples -= (num_samples % samples_per_frame_); 82 num_samples -= (num_samples % samples_per_frame_);
84 for (int i = 0; i < num_samples; i += samples_per_frame_) { 83 for (int i = 0; i < num_samples; i += samples_per_frame_) {
85 speex_bits_reset(&bits_); 84 speex_bits_reset(&bits_);
86 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), 85 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
87 &bits_); 86 &bits_);
88 87
89 // Encode the frame and place the size of the frame as the first byte. This 88 // Encode the frame and place the size of the frame as the first byte. This
90 // is the packet format for MIME type x-speex-with-header-byte. 89 // is the packet format for MIME type x-speex-with-header-byte.
91 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, 90 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
92 kMaxSpeexFrameLength); 91 kMaxSpeexFrameLength);
93 encoded_frame_data_[0] = static_cast<char>(frame_length); 92 encoded_frame_data_[0] = static_cast<char>(frame_length);
94 encoded_frames->push_back(new string(encoded_frame_data_, 93 encoded_frames->push_back(new string(encoded_frame_data_,
95 frame_length + 1)); 94 frame_length + 1));
96 } 95 }
97 } 96 }
98 97
99 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, 98 SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
100 const SpeechInputCallerId& caller_id) 99 const SpeechInputCallerId& caller_id)
101 : delegate_(delegate), 100 : delegate_(delegate),
102 caller_id_(caller_id), 101 caller_id_(caller_id),
103 encoder_(new SpeexEncoder()) { 102 encoder_(new SpeexEncoder()),
103 endpointer_(kAudioSampleRate) {
104 endpointer_.set_speech_input_complete_silence_length(
105 base::Time::kMicrosecondsPerSecond / 2);
106 endpointer_.set_long_speech_input_complete_silence_length(
107 base::Time::kMicrosecondsPerSecond);
108 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
109 endpointer_.StartSession();
104 } 110 }
105 111
106 SpeechRecognizer::~SpeechRecognizer() { 112 SpeechRecognizer::~SpeechRecognizer() {
107 // Recording should have stopped earlier due to the endpointer or 113 // Recording should have stopped earlier due to the endpointer or
108 // |StopRecording| being called. 114 // |StopRecording| being called.
109 DCHECK(!audio_controller_.get()); 115 DCHECK(!audio_controller_.get());
110 DCHECK(!request_.get() || !request_->HasPendingRequest()); 116 DCHECK(!request_.get() || !request_->HasPendingRequest());
111 DCHECK(audio_buffers_.empty()); 117 DCHECK(audio_buffers_.empty());
118 endpointer_.EndSession();
112 } 119 }
113 120
114 bool SpeechRecognizer::StartRecording() { 121 bool SpeechRecognizer::StartRecording() {
115 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); 122 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));
116 DCHECK(!audio_controller_.get()); 123 DCHECK(!audio_controller_.get());
117 DCHECK(!request_.get() || !request_->HasPendingRequest()); 124 DCHECK(!request_.get() || !request_->HasPendingRequest());
118 125
126 // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to
127 // estimate the environment/background noise before starting to treat the
128 // audio as user input. Once we have implemented a popup UI to notify the user
129 // that recording has started, we should perhaps have a short interval where
130 // we record background audio and then show the popup UI so that the user can
131 // start speaking after that. For now we just do both together, so the
132 // endpointer gets no background noise to estimate from (still works OK).
133 endpointer_.SetEnvironmentEstimationMode();
134 endpointer_.SetUserInputMode();
135
119 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; 136 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
120 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); 137 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
121 audio_controller_ = AudioInputController::Create(this, 138 audio_controller_ = AudioInputController::Create(this,
122 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels, 139 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels,
123 kAudioSampleRate, kNumBitsPerAudioSample, 140 kAudioSampleRate, kNumBitsPerAudioSample,
124 samples_per_packet); 141 samples_per_packet);
125 DCHECK(audio_controller_.get()); 142 DCHECK(audio_controller_.get());
126 LOG(INFO) << "SpeechRecognizer starting record."; 143 LOG(INFO) << "SpeechRecognizer starting record.";
127 audio_controller_->Record(); 144 audio_controller_->Record();
128 145
(...skipping 20 matching lines...) Expand all
149 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); 166 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));
150 167
151 // If audio recording has already stopped and we are in recognition phase, 168 // If audio recording has already stopped and we are in recognition phase,
152 // silently ignore any more calls to stop recording. 169 // silently ignore any more calls to stop recording.
153 if (!audio_controller_.get()) 170 if (!audio_controller_.get())
154 return; 171 return;
155 172
156 LOG(INFO) << "SpeechRecognizer stopping record."; 173 LOG(INFO) << "SpeechRecognizer stopping record.";
157 audio_controller_->Close(); 174 audio_controller_->Close();
158 audio_controller_ = NULL; // Releases the ref ptr. 175 audio_controller_ = NULL; // Releases the ref ptr.
176
159 delegate_->DidCompleteRecording(caller_id_); 177 delegate_->DidCompleteRecording(caller_id_);
160 178
161 // If we haven't got any audio yet, end the recognition sequence here. 179 // If we haven't got any audio yet, end the recognition sequence here.
162 if (audio_buffers_.empty()) { 180 if (audio_buffers_.empty()) {
163 // Guard against the delegate freeing us until we finish our job. 181 // Guard against the delegate freeing us until we finish our job.
164 scoped_refptr<SpeechRecognizer> me(this); 182 scoped_refptr<SpeechRecognizer> me(this);
165 delegate_->DidCompleteRecognition(caller_id_); 183 delegate_->DidCompleteRecognition(caller_id_);
166 return; 184 return;
167 } 185 }
168 186
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
233 251
234 void SpeechRecognizer::HandleOnData(string* data) { 252 void SpeechRecognizer::HandleOnData(string* data) {
235 // Check if we are still recording and if not discard this buffer, as 253 // Check if we are still recording and if not discard this buffer, as
236 // recording might have been stopped after this buffer was posted to the queue 254 // recording might have been stopped after this buffer was posted to the queue
237 // by |OnData|. 255 // by |OnData|.
238 if (!audio_controller_.get()) { 256 if (!audio_controller_.get()) {
239 delete data; 257 delete data;
240 return; 258 return;
241 } 259 }
242 260
243 encoder_->Encode(*data, &audio_buffers_); 261 const short* samples = reinterpret_cast<const short*>(data->data());
262 DCHECK((data->length() % sizeof(short)) == 0);
263 int num_samples = data->length() / sizeof(short);
264
265 encoder_->Encode(samples, num_samples, &audio_buffers_);
266 endpointer_.ProcessAudio(samples, num_samples);
244 delete data; 267 delete data;
245 268
269 if (endpointer_.speech_input_complete()) {
270 StopRecording();
271 }
272
246 // TODO(satish): Once we have streaming POST, start sending the data received 273 // TODO(satish): Once we have streaming POST, start sending the data received
247 // here as POST chunks. 274 // here as POST chunks.
248 } 275 }
249 276
250 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { 277 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {
251 delegate_->SetRecognitionResult(caller_id_, error, value); 278 delegate_->SetRecognitionResult(caller_id_, error, value);
252 279
253 // Guard against the delegate freeing us until we finish our job. 280 // Guard against the delegate freeing us until we finish our job.
254 scoped_refptr<SpeechRecognizer> me(this); 281 scoped_refptr<SpeechRecognizer> me(this);
255 delegate_->DidCompleteRecognition(caller_id_); 282 delegate_->DidCompleteRecognition(caller_id_);
256 } 283 }
257 284
258 } // namespace speech_input 285 } // namespace speech_input
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698