OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
6 | 6 |
7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
| 9 #include "base/time.h" |
9 #include "chrome/browser/chrome_thread.h" | 10 #include "chrome/browser/chrome_thread.h" |
10 #include "chrome/browser/profile.h" | 11 #include "chrome/browser/profile.h" |
11 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
12 #include "third_party/speex/include/speex/speex.h" | 13 #include "third_party/speex/include/speex/speex.h" |
13 | 14 |
14 using media::AudioInputController; | 15 using media::AudioInputController; |
15 using std::list; | 16 using std::list; |
16 using std::string; | 17 using std::string; |
17 | 18 |
18 namespace { | 19 namespace { |
(...skipping 17 matching lines...) Expand all Loading... |
36 namespace speech_input { | 37 namespace speech_input { |
37 | 38 |
38 // Provides a simple interface to encode raw audio using the Speex codec. | 39 // Provides a simple interface to encode raw audio using the Speex codec. |
39 class SpeexEncoder { | 40 class SpeexEncoder { |
40 public: | 41 public: |
41 SpeexEncoder(); | 42 SpeexEncoder(); |
42 ~SpeexEncoder(); | 43 ~SpeexEncoder(); |
43 | 44 |
44 int samples_per_frame() const { return samples_per_frame_; } | 45 int samples_per_frame() const { return samples_per_frame_; } |
45 | 46 |
46 // Encodes each frame of raw audio in |raw_samples| and adds the | 47 // Encodes each frame of raw audio in |samples| and adds the |
47 // encoded frames as a set of strings to the |encoded_frames| list. | 48 // encoded frames as a set of strings to the |encoded_frames| list. |
48 // Ownership of the newly added strings is transferred to the caller. | 49 // Ownership of the newly added strings is transferred to the caller. |
49 void Encode(const string& raw_samples, | 50 void Encode(const short* samples, |
| 51 int num_samples, |
50 std::list<std::string*>* encoded_frames); | 52 std::list<std::string*>* encoded_frames); |
51 | 53 |
52 private: | 54 private: |
53 SpeexBits bits_; | 55 SpeexBits bits_; |
54 void* encoder_state_; | 56 void* encoder_state_; |
55 int samples_per_frame_; | 57 int samples_per_frame_; |
56 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. | 58 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. |
57 }; | 59 }; |
58 | 60 |
59 SpeexEncoder::SpeexEncoder() { | 61 SpeexEncoder::SpeexEncoder() { |
60 speex_bits_init(&bits_); | 62 speex_bits_init(&bits_); |
61 encoder_state_ = speex_encoder_init(&speex_wb_mode); | 63 encoder_state_ = speex_encoder_init(&speex_wb_mode); |
62 DCHECK(encoder_state_); | 64 DCHECK(encoder_state_); |
63 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); | 65 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); |
64 DCHECK(samples_per_frame_ > 0); | 66 DCHECK(samples_per_frame_ > 0); |
65 int quality = kSpeexEncodingQuality; | 67 int quality = kSpeexEncodingQuality; |
66 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); | 68 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); |
67 int vbr = 1; | 69 int vbr = 1; |
68 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); | 70 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); |
69 } | 71 } |
70 | 72 |
71 SpeexEncoder::~SpeexEncoder() { | 73 SpeexEncoder::~SpeexEncoder() { |
72 speex_bits_destroy(&bits_); | 74 speex_bits_destroy(&bits_); |
73 speex_encoder_destroy(encoder_state_); | 75 speex_encoder_destroy(encoder_state_); |
74 } | 76 } |
75 | 77 |
76 void SpeexEncoder::Encode(const string& raw_samples, | 78 void SpeexEncoder::Encode(const short* samples, |
| 79 int num_samples, |
77 std::list<std::string*>* encoded_frames) { | 80 std::list<std::string*>* encoded_frames) { |
78 const short* samples = reinterpret_cast<const short*>(raw_samples.data()); | |
79 DCHECK((raw_samples.length() % sizeof(short)) == 0); | |
80 int num_samples = raw_samples.length() / sizeof(short); | |
81 | |
82 // Drop incomplete frames, typically those which come in when recording stops. | 81 // Drop incomplete frames, typically those which come in when recording stops. |
83 num_samples -= (num_samples % samples_per_frame_); | 82 num_samples -= (num_samples % samples_per_frame_); |
84 for (int i = 0; i < num_samples; i += samples_per_frame_) { | 83 for (int i = 0; i < num_samples; i += samples_per_frame_) { |
85 speex_bits_reset(&bits_); | 84 speex_bits_reset(&bits_); |
86 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), | 85 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), |
87 &bits_); | 86 &bits_); |
88 | 87 |
89 // Encode the frame and place the size of the frame as the first byte. This | 88 // Encode the frame and place the size of the frame as the first byte. This |
90 // is the packet format for MIME type x-speex-with-header-byte. | 89 // is the packet format for MIME type x-speex-with-header-byte. |
91 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, | 90 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, |
92 kMaxSpeexFrameLength); | 91 kMaxSpeexFrameLength); |
93 encoded_frame_data_[0] = static_cast<char>(frame_length); | 92 encoded_frame_data_[0] = static_cast<char>(frame_length); |
94 encoded_frames->push_back(new string(encoded_frame_data_, | 93 encoded_frames->push_back(new string(encoded_frame_data_, |
95 frame_length + 1)); | 94 frame_length + 1)); |
96 } | 95 } |
97 } | 96 } |
98 | 97 |
99 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, | 98 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
100 const SpeechInputCallerId& caller_id) | 99 const SpeechInputCallerId& caller_id) |
101 : delegate_(delegate), | 100 : delegate_(delegate), |
102 caller_id_(caller_id), | 101 caller_id_(caller_id), |
103 encoder_(new SpeexEncoder()) { | 102 encoder_(new SpeexEncoder()), |
| 103 endpointer_(kAudioSampleRate) { |
| 104 endpointer_.set_speech_input_complete_silence_length( |
| 105 base::Time::kMicrosecondsPerSecond / 2); |
| 106 endpointer_.set_long_speech_input_complete_silence_length( |
| 107 base::Time::kMicrosecondsPerSecond); |
| 108 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| 109 endpointer_.StartSession(); |
104 } | 110 } |
105 | 111 |
106 SpeechRecognizer::~SpeechRecognizer() { | 112 SpeechRecognizer::~SpeechRecognizer() { |
107 // Recording should have stopped earlier due to the endpointer or | 113 // Recording should have stopped earlier due to the endpointer or |
108 // |StopRecording| being called. | 114 // |StopRecording| being called. |
109 DCHECK(!audio_controller_.get()); | 115 DCHECK(!audio_controller_.get()); |
110 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 116 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
111 DCHECK(audio_buffers_.empty()); | 117 DCHECK(audio_buffers_.empty()); |
| 118 endpointer_.EndSession(); |
112 } | 119 } |
113 | 120 |
114 bool SpeechRecognizer::StartRecording() { | 121 bool SpeechRecognizer::StartRecording() { |
115 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); | 122 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); |
116 DCHECK(!audio_controller_.get()); | 123 DCHECK(!audio_controller_.get()); |
117 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 124 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
118 | 125 |
| 126 // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to |
| 127 // estimate the environment/background noise before starting to treat the |
| 128 // audio as user input. Once we have implemented a popup UI to notify the user |
| 129 // that recording has started, we should perhaps have a short interval where |
| 130 // we record background audio and then show the popup UI so that the user can |
| 131 // start speaking after that. For now we just do these together so there isn't |
| 132 // any background noise for the end pointer (still works ok). |
| 133 endpointer_.SetEnvironmentEstimationMode(); |
| 134 endpointer_.SetUserInputMode(); |
| 135 |
119 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 136 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
120 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | 137 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); |
121 audio_controller_ = AudioInputController::Create(this, | 138 audio_controller_ = AudioInputController::Create(this, |
122 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels, | 139 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels, |
123 kAudioSampleRate, kNumBitsPerAudioSample, | 140 kAudioSampleRate, kNumBitsPerAudioSample, |
124 samples_per_packet); | 141 samples_per_packet); |
125 DCHECK(audio_controller_.get()); | 142 DCHECK(audio_controller_.get()); |
126 LOG(INFO) << "SpeechRecognizer starting record."; | 143 LOG(INFO) << "SpeechRecognizer starting record."; |
127 audio_controller_->Record(); | 144 audio_controller_->Record(); |
128 | 145 |
(...skipping 20 matching lines...) Expand all Loading... |
149 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); | 166 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO)); |
150 | 167 |
151 // If audio recording has already stopped and we are in recognition phase, | 168 // If audio recording has already stopped and we are in recognition phase, |
152 // silently ignore any more calls to stop recording. | 169 // silently ignore any more calls to stop recording. |
153 if (!audio_controller_.get()) | 170 if (!audio_controller_.get()) |
154 return; | 171 return; |
155 | 172 |
156 LOG(INFO) << "SpeechRecognizer stopping record."; | 173 LOG(INFO) << "SpeechRecognizer stopping record."; |
157 audio_controller_->Close(); | 174 audio_controller_->Close(); |
158 audio_controller_ = NULL; // Releases the ref ptr. | 175 audio_controller_ = NULL; // Releases the ref ptr. |
| 176 |
159 delegate_->DidCompleteRecording(caller_id_); | 177 delegate_->DidCompleteRecording(caller_id_); |
160 | 178 |
161 // If we haven't got any audio yet end the recognition sequence here. | 179 // If we haven't got any audio yet end the recognition sequence here. |
162 if (audio_buffers_.empty()) { | 180 if (audio_buffers_.empty()) { |
163 // Guard against the delegate freeing us until we finish our job. | 181 // Guard against the delegate freeing us until we finish our job. |
164 scoped_refptr<SpeechRecognizer> me(this); | 182 scoped_refptr<SpeechRecognizer> me(this); |
165 delegate_->DidCompleteRecognition(caller_id_); | 183 delegate_->DidCompleteRecognition(caller_id_); |
166 return; | 184 return; |
167 } | 185 } |
168 | 186 |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
233 | 251 |
234 void SpeechRecognizer::HandleOnData(string* data) { | 252 void SpeechRecognizer::HandleOnData(string* data) { |
235 // Check if we are still recording and if not discard this buffer, as | 253 // Check if we are still recording and if not discard this buffer, as |
236 // recording might have been stopped after this buffer was posted to the queue | 254 // recording might have been stopped after this buffer was posted to the queue |
237 // by |OnData|. | 255 // by |OnData|. |
238 if (!audio_controller_.get()) { | 256 if (!audio_controller_.get()) { |
239 delete data; | 257 delete data; |
240 return; | 258 return; |
241 } | 259 } |
242 | 260 |
243 encoder_->Encode(*data, &audio_buffers_); | 261 const short* samples = reinterpret_cast<const short*>(data->data()); |
| 262 DCHECK((data->length() % sizeof(short)) == 0); |
| 263 int num_samples = data->length() / sizeof(short); |
| 264 |
| 265 encoder_->Encode(samples, num_samples, &audio_buffers_); |
| 266 endpointer_.ProcessAudio(samples, num_samples); |
244 delete data; | 267 delete data; |
245 | 268 |
| 269 if (endpointer_.speech_input_complete()) { |
| 270 StopRecording(); |
| 271 } |
| 272 |
246 // TODO(satish): Once we have streaming POST, start sending the data received | 273 // TODO(satish): Once we have streaming POST, start sending the data received |
247 // here as POST chunks. | 274 // here as POST chunks. |
248 } | 275 } |
249 | 276 |
250 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { | 277 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) { |
251 delegate_->SetRecognitionResult(caller_id_, error, value); | 278 delegate_->SetRecognitionResult(caller_id_, error, value); |
252 | 279 |
253 // Guard against the delegate freeing us until we finish our job. | 280 // Guard against the delegate freeing us until we finish our job. |
254 scoped_refptr<SpeechRecognizer> me(this); | 281 scoped_refptr<SpeechRecognizer> me(this); |
255 delegate_->DidCompleteRecognition(caller_id_); | 282 delegate_->DidCompleteRecognition(caller_id_); |
256 } | 283 } |
257 | 284 |
258 } // namespace speech_input | 285 } // namespace speech_input |
OLD | NEW |