chrome/browser/speech/speech_recognizer.cc - Issue 3341020: Speech input: Do environment estimation and detect the no-speech case.

Unified Diff: chrome/browser/speech/speech_recognizer.cc

Issue 3341020: Speech input: Do environment estimation and detect the no-speech case. (Closed)

Patch Set: . Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« chrome/browser/speech/speech_input_manager.cc ('K') | « chrome/browser/speech/speech_recognizer.h ('k') | chrome/browser/speech/speech_recognizer_unittest.cc » ('j') | chrome/browser/speech/speech_recognizer_unittest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/browser/speech/speech_recognizer.cc

diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc

index df41bf54d1c4b968ab33cdff9054deec08d58c42..b6c4c7dd09a47a8fa318e39c1016f5a6105c4feb 100644

--- a/chrome/browser/speech/speech_recognizer.cc

+++ b/chrome/browser/speech/speech_recognizer.cc

@@ -21,7 +21,6 @@ const char* const kDefaultSpeechRecognitionUrl =

"http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";

const char* const kContentTypeSpeex =

"audio/x-speex-with-header-byte; rate=16000";

-const int kAudioSampleRate = 16000;

const int kSpeexEncodingQuality = 8;

const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).

@@ -29,13 +28,17 @@ const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).

// make sure it is within the byte range.

COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);

-const int kAudioPacketIntervalMs = 100; // Record 100ms long audio packets.

-const int kNumAudioChannels = 1; // Speech is recorded as mono.

-const int kNumBitsPerAudioSample = 16;

+const int kEndpointerEstimationTimeMs = 300;

} // namespace

namespace speech_input {

+const int SpeechRecognizer::kAudioSampleRate = 16000;

+const int SpeechRecognizer::kAudioPacketIntervalMs = 100;

+const int SpeechRecognizer::kNumAudioChannels = 1;

+const int SpeechRecognizer::kNumBitsPerAudioSample = 16;

+const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;

// Provides a simple interface to encode raw audio using the Speex codec.

class SpeexEncoder {

public:

@@ -122,15 +125,10 @@ bool SpeechRecognizer::StartRecording() {

DCHECK(!audio_controller_.get());

DCHECK(!request_.get() || !request_->HasPendingRequest());

- // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to

- // estimate the environment/background noise before starting to treat the

- // audio as user input. Once we have implemented a popup UI to notify the user

- // that recording has started, we should perhaps have a short interval where

- // we record background audio and then show the popup UI so that the user can

- // start speaking after that. For now we just do these together so there isn't

- // any background noise for the end pointer (still works ok).

+ // The endpointer needs to estimate the environment/background noise before

+ // starting to treat the audio as user input. In |HandleOnData| we wait until

+ // such time has passed before switching to user input mode.

endpointer_.SetEnvironmentEstimationMode();

- endpointer_.SetUserInputMode();

int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;

DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);

@@ -140,6 +138,7 @@ bool SpeechRecognizer::StartRecording() {

AudioInputController::Create(this, params, samples_per_packet);

DCHECK(audio_controller_.get());

LOG(INFO) << "SpeechRecognizer starting record.";

+ num_samples_recorded_ = 0;

audio_controller_->Record();

return true;

@@ -231,10 +230,7 @@ void SpeechRecognizer::HandleOnError(int error_code) {

if (!audio_controller_.get())

return;

- delegate_->OnRecognizerError(caller_id_);

- CancelRecognition();

- delegate_->DidCompleteRecording(caller_id_);

- delegate_->DidCompleteRecognition(caller_id_);

+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);

}

void SpeechRecognizer::OnData(AudioInputController* controller,

@@ -265,6 +261,24 @@ void SpeechRecognizer::HandleOnData(string* data) {

encoder_->Encode(samples, num_samples, &audio_buffers_);

endpointer_.ProcessAudio(samples, num_samples);

delete data;

+ num_samples_recorded_ += num_samples;

+ // Check if we have gathered enough audio for the endpointer to do environment

+ // estimation and should move on to detect speech/end of speech.

+ if (endpointer_.IsEstimatingEnvironment() &&

+ num_samples_recorded_ >= (kEndpointerEstimationTimeMs *

+ kAudioSampleRate) / 1000) {

+ endpointer_.SetUserInputMode();

+ delegate_->DidCompleteEnvironmentEstimation(caller_id_);

+ return;

+ }

+ // Check if we have waited too long without hearing any speech.

+ if (!endpointer_.DidStartReceivingSpeech() &&

+ num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {

+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);

+ return;

+ }

if (endpointer_.speech_input_complete()) {

StopRecording();

@@ -275,6 +289,11 @@ void SpeechRecognizer::HandleOnData(string* data) {

}

void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {

+ if (value.empty()) {

+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);

+ return;

+ }

delegate_->SetRecognitionResult(caller_id_, error, value);

// Guard against the delegate freeing us until we finish our job.

@@ -282,4 +301,14 @@ void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {

delegate_->DidCompleteRecognition(caller_id_);

}

+void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {

+ CancelRecognition();

+ // Guard against the delegate freeing us until we finish our job.

+ scoped_refptr<SpeechRecognizer> me(this);

+ delegate_->OnRecognizerError(caller_id_, error);

+ delegate_->DidCompleteRecording(caller_id_);

+ delegate_->DidCompleteRecognition(caller_id_);

+}

} // namespace speech_input