Index: chrome/browser/speech/speech_recognizer.cc
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 2e8f353b0b348b4cf6cb242b63d3972a696650ad..0df671c67140b2b9a7b019328390ebd2952d741e 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -6,6 +6,7 @@
#include "base/ref_counted.h"
#include "base/scoped_ptr.h"
+#include "base/time.h"
#include "chrome/browser/chrome_thread.h"
#include "chrome/browser/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
@@ -43,10 +44,11 @@ class SpeexEncoder {
int samples_per_frame() const { return samples_per_frame_; }
- // Encodes each frame of raw audio in |raw_samples| and adds the
+ // Encodes each frame of raw audio in |samples| and adds the
// encoded frames as a set of strings to the |encoded_frames| list.
// Ownership of the newly added strings is transferred to the caller.
- void Encode(const string& raw_samples,
+ void Encode(const short* samples,
+ int num_samples,
std::list<std::string*>* encoded_frames);
private:
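The comment above notes that ownership of the encoded frame strings passes to
the caller. A minimal sketch of a caller of the new signature, using only the
names visible in this diff (the setup around it is assumed):

    std::list<std::string*> encoded_frames;
    encoder->Encode(samples, num_samples, &encoded_frames);
    // ... use or upload the frames ...
    // The caller owns the strings and must delete them when done.
    for (std::list<std::string*>::iterator it = encoded_frames.begin();
         it != encoded_frames.end(); ++it) {
      delete *it;
    }
    encoded_frames.clear();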
@@ -73,12 +75,9 @@ SpeexEncoder::~SpeexEncoder() {
speex_encoder_destroy(encoder_state_);
}
-void SpeexEncoder::Encode(const string& raw_samples,
+void SpeexEncoder::Encode(const short* samples,
+ int num_samples,
std::list<std::string*>* encoded_frames) {
- const short* samples = reinterpret_cast<const short*>(raw_samples.data());
- DCHECK((raw_samples.length() % sizeof(short)) == 0);
- int num_samples = raw_samples.length() / sizeof(short);
-
// Drop incomplete frames, typically those which come in when recording stops.
num_samples -= (num_samples % samples_per_frame_);
for (int i = 0; i < num_samples; i += samples_per_frame_) {
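The remainder of the loop body is unchanged and not shown in this hunk. For
orientation, one iteration of frame encoding with the Speex API plausibly
looks like the sketch below; encoder_state_ is the member destroyed in the
destructor above, while bits_ and the buffer size constant are assumptions:

    speex_bits_reset(&bits_);  // bits_: assumed SpeexBits member.
    speex_encode_int(encoder_state_,
                     const_cast<spx_int16_t*>(samples + i), &bits_);
    char buffer[kMaxSpeexFrameBytes];  // kMaxSpeexFrameBytes is hypothetical.
    int frame_length = speex_bits_write(&bits_, buffer, sizeof(buffer));
    encoded_frames->push_back(new std::string(buffer, frame_length));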
@@ -100,7 +99,14 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
const SpeechInputCallerId& caller_id)
: delegate_(delegate),
caller_id_(caller_id),
- encoder_(new SpeexEncoder()) {
+ encoder_(new SpeexEncoder()),
+ endpointer_(kAudioSampleRate) {
+ endpointer_.set_speech_input_complete_silence_length(
+ base::Time::kMicrosecondsPerSecond / 2);
+ endpointer_.set_long_speech_input_complete_silence_length(
+ base::Time::kMicrosecondsPerSecond);
+ endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
+ endpointer_.StartSession();
}
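With base::Time::kMicrosecondsPerSecond equal to 1,000,000, the thresholds
configured above work out to: stop after 0.5 s (500,000 us) of silence
following a short utterance, stop after 1 s (1,000,000 us) of silence
following a long utterance, and treat an utterance as "long" once it contains
3 s (3,000,000 us) of speech.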
SpeechRecognizer::~SpeechRecognizer() {
@@ -109,6 +115,7 @@ SpeechRecognizer::~SpeechRecognizer() {
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
DCHECK(audio_buffers_.empty());
+ endpointer_.EndSession();
}
bool SpeechRecognizer::StartRecording() {
@@ -116,6 +123,16 @@ bool SpeechRecognizer::StartRecording() {
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
+ // TODO(satish): The endpointer normally needs a short interval (even 0.5s)
+ // to estimate the environment/background noise before it starts treating
+ // the audio as user input. Once we have a popup UI to notify the user that
+ // recording has started, we could record background audio for a short
+ // interval first and show the popup only after that, so the user starts
+ // speaking once estimation is done. For now we switch the two modes back to
+ // back, so the endpointer gets no background audio to estimate from (it
+ // still works acceptably).
+ endpointer_.SetEnvironmentEstimationMode();
+ endpointer_.SetUserInputMode();
+
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
audio_controller_ = AudioInputController::Create(this,
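As a concrete check of the DCHECK above: assuming kAudioSampleRate is 8000 and
kAudioPacketIntervalMs is 100 (both constants are defined earlier in this file
and not shown in this diff), samples_per_packet = (8000 * 100) / 1000 = 800,
and with the Speex narrowband frame size of 160 samples, 800 % 160 == 0 as
required.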
@@ -156,6 +173,7 @@ void SpeechRecognizer::StopRecording() {
LOG(INFO) << "SpeechRecognizer stopping record.";
audio_controller_->Close();
audio_controller_ = NULL; // Releases the ref ptr.
+
delegate_->DidCompleteRecording(caller_id_);
// If we haven't got any audio yet end the recognition sequence here.
@@ -240,9 +258,18 @@ void SpeechRecognizer::HandleOnData(string* data) {
return;
}
- encoder_->Encode(*data, &audio_buffers_);
+ const short* samples = reinterpret_cast<const short*>(data->data());
+ DCHECK((data->length() % sizeof(short)) == 0);
+ int num_samples = data->length() / sizeof(short);
+
+ encoder_->Encode(samples, num_samples, &audio_buffers_);
+ endpointer_.ProcessAudio(samples, num_samples);
delete data;
+ if (endpointer_.speech_input_complete()) {
+ StopRecording();
+ }
+
// TODO(satish): Once we have streaming POST, start sending the data received
// here as POST chunks.
}
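The reinterpret_cast in this hunk views the incoming byte buffer as
native-endian 16-bit PCM. A self-contained equivalent of that conversion, with
a hypothetical helper name and assert() standing in for the DCHECK used above:

    #include <cassert>
    #include <string>
    #include <vector>

    // Views the raw byte buffer as 16-bit PCM samples and copies them out.
    // Assumes the buffer holds whole samples in the machine's byte order,
    // exactly as the reinterpret_cast in HandleOnData does.
    std::vector<short> BytesToSamples(const std::string& data) {
      assert((data.length() % sizeof(short)) == 0);
      const short* samples = reinterpret_cast<const short*>(data.data());
      return std::vector<short>(samples,
                                samples + data.length() / sizeof(short));
    }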