Index: chrome/browser/speech/speech_recognizer.cc
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 2e8f353b0b348b4cf6cb242b63d3972a696650ad..0df671c67140b2b9a7b019328390ebd2952d741e 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -6,6 +6,7 @@
#include "base/ref_counted.h"
#include "base/scoped_ptr.h"
+#include "base/time.h"
#include "chrome/browser/chrome_thread.h"
#include "chrome/browser/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
@@ -43,10 +44,11 @@ class SpeexEncoder {
int samples_per_frame() const { return samples_per_frame_; }
- // Encodes each frame of raw audio in |raw_samples| and adds the
+ // Encodes each frame of raw audio in |samples| and adds the
// encoded frames as a set of strings to the |encoded_frames| list.
// Ownership of the newly added strings is transferred to the caller.
- void Encode(const string& raw_samples,
+ void Encode(const short* samples,
+ int num_samples,
std::list<std::string*>* encoded_frames);
private:
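The comment above notes that ownership of the encoded frame strings passes to
the caller. A minimal sketch of a caller of the new signature, using only the
names visible in this diff (the setup around it is assumed):

    std::list<std::string*> encoded_frames;
    encoder->Encode(samples, num_samples, &encoded_frames);
    // ... use or upload the frames ...
    // The caller owns the strings and must delete them when done.
    for (std::list<std::string*>::iterator it = encoded_frames.begin();
         it != encoded_frames.end(); ++it) {
      delete *it;
    }
    encoded_frames.clear();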
@@ -73,12 +75,9 @@ SpeexEncoder::~SpeexEncoder() {
speex_encoder_destroy(encoder_state_);
}
-void SpeexEncoder::Encode(const string& raw_samples,
+void SpeexEncoder::Encode(const short* samples,
+ int num_samples,
std::list<std::string*>* encoded_frames) {
- const short* samples = reinterpret_cast<const short*>(raw_samples.data());
- DCHECK((raw_samples.length() % sizeof(short)) == 0);
- int num_samples = raw_samples.length() / sizeof(short);
-
// Drop incomplete frames, typically those which come in when recording stops.
num_samples -= (num_samples % samples_per_frame_);
for (int i = 0; i < num_samples; i += samples_per_frame_) {
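The remainder of the loop body is unchanged and not shown in this hunk. For
orientation, one iteration of frame encoding with the Speex API plausibly
looks like the sketch below; encoder_state_ is the member destroyed in the
destructor above, while bits_ and the buffer size constant are assumptions:

    speex_bits_reset(&bits_);  // bits_: assumed SpeexBits member.
    speex_encode_int(encoder_state_,
                     const_cast<spx_int16_t*>(samples + i), &bits_);
    char buffer[kMaxSpeexFrameBytes];  // kMaxSpeexFrameBytes is hypothetical.
    int frame_length = speex_bits_write(&bits_, buffer, sizeof(buffer));
    encoded_frames->push_back(new std::string(buffer, frame_length));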
@@ -100,7 +99,14 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
const SpeechInputCallerId& caller_id)
: delegate_(delegate),
caller_id_(caller_id),
- encoder_(new SpeexEncoder()) {
+ encoder_(new SpeexEncoder()),
+ endpointer_(kAudioSampleRate) {
+ endpointer_.set_speech_input_complete_silence_length(
+ base::Time::kMicrosecondsPerSecond / 2);
+ endpointer_.set_long_speech_input_complete_silence_length(
+ base::Time::kMicrosecondsPerSecond);
+ endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
+ endpointer_.StartSession();
}
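With base::Time::kMicrosecondsPerSecond equal to 1,000,000, the thresholds
configured above work out to: stop after 0.5 s (500,000 us) of silence
following a short utterance, stop after 1 s (1,000,000 us) of silence
following a long utterance, and treat an utterance as "long" once it contains
3 s (3,000,000 us) of speech.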
SpeechRecognizer::~SpeechRecognizer() {
@@ -109,6 +115,7 @@ SpeechRecognizer::~SpeechRecognizer() {
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
DCHECK(audio_buffers_.empty());
+ endpointer_.EndSession();
}
bool SpeechRecognizer::StartRecording() {
@@ -116,6 +123,16 @@ bool SpeechRecognizer::StartRecording() {
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
+ // TODO(satish): The endpointer normally needs a short interval (even 0.5s)
+ // to estimate the environment/background noise before it starts treating
+ // the audio as user input. Once we have a popup UI to notify the user that
+ // recording has started, we could record background audio for a short
+ // interval first and show the popup only after that, so the user starts
+ // speaking once estimation is done. For now we switch the two modes back to
+ // back, so the endpointer gets no background audio to estimate from (it
+ // still works acceptably).
+ endpointer_.SetEnvironmentEstimationMode();
+ endpointer_.SetUserInputMode();
+
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
audio_controller_ = AudioInputController::Create(this,
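As a concrete check of the DCHECK above: assuming kAudioSampleRate is 8000 and
kAudioPacketIntervalMs is 100 (both constants are defined earlier in this file
and not shown in this diff), samples_per_packet = (8000 * 100) / 1000 = 800,
and with the Speex narrowband frame size of 160 samples, 800 % 160 == 0 as
required.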
@@ -156,6 +173,7 @@ void SpeechRecognizer::StopRecording() {
LOG(INFO) << "SpeechRecognizer stopping record.";
audio_controller_->Close();
audio_controller_ = NULL; // Releases the ref ptr.
+
delegate_->DidCompleteRecording(caller_id_);
// If we haven't got any audio yet end the recognition sequence here.
@@ -240,9 +258,18 @@ void SpeechRecognizer::HandleOnData(string* data) {
return;
}
- encoder_->Encode(*data, &audio_buffers_);
+ const short* samples = reinterpret_cast<const short*>(data->data());
+ DCHECK((data->length() % sizeof(short)) == 0);
+ int num_samples = data->length() / sizeof(short);
+
+ encoder_->Encode(samples, num_samples, &audio_buffers_);
+ endpointer_.ProcessAudio(samples, num_samples);
delete data;
+ if (endpointer_.speech_input_complete()) {
+ StopRecording();
+ }
+
// TODO(satish): Once we have streaming POST, start sending the data received
// here as POST chunks.
}
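The reinterpret_cast in this hunk views the incoming byte buffer as
native-endian 16-bit PCM. A self-contained equivalent of that conversion, with
a hypothetical helper name and assert() standing in for the DCHECK used above:

    #include <cassert>
    #include <string>
    #include <vector>

    // Views the raw byte buffer as 16-bit PCM samples and copies them out.
    // Assumes the buffer holds whole samples in the machine's byte order,
    // exactly as the reinterpret_cast in HandleOnData does.
    std::vector<short> BytesToSamples(const std::string& data) {
      assert((data.length() % sizeof(short)) == 0);
      const short* samples = reinterpret_cast<const short*>(data.data());
      return std::vector<short>(samples,
                                samples + data.length() / sizeof(short));
    }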