Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(194)

Unified Diff: chrome/browser/speech/speech_recognizer.cc

Issue 3341020: Speech input: Do environment estimation and detect the no-speech case. (Closed)
Patch Set: . Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/speech/speech_recognizer.cc
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index df41bf54d1c4b968ab33cdff9054deec08d58c42..b6c4c7dd09a47a8fa318e39c1016f5a6105c4feb 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -21,7 +21,6 @@ const char* const kDefaultSpeechRecognitionUrl =
"http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";
const char* const kContentTypeSpeex =
"audio/x-speex-with-header-byte; rate=16000";
-const int kAudioSampleRate = 16000;
const int kSpeexEncodingQuality = 8;
const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
@@ -29,13 +28,17 @@ const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
// make sure it is within the byte range.
COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
-const int kAudioPacketIntervalMs = 100; // Record 100ms long audio packets.
-const int kNumAudioChannels = 1; // Speech is recorded as mono.
-const int kNumBitsPerAudioSample = 16;
+const int kEndpointerEstimationTimeMs = 300;
} // namespace
namespace speech_input {
+const int SpeechRecognizer::kAudioSampleRate = 16000;
+const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
+const int SpeechRecognizer::kNumAudioChannels = 1;
+const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
+const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
+
// Provides a simple interface to encode raw audio using the Speex codec.
class SpeexEncoder {
public:
@@ -122,15 +125,10 @@ bool SpeechRecognizer::StartRecording() {
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
- // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to
- // estimate the environment/background noise before starting to treat the
- // audio as user input. Once we have implemented a popup UI to notify the user
- // that recording has started, we should perhaps have a short interval where
- // we record background audio and then show the popup UI so that the user can
- // start speaking after that. For now we just do these together so there isn't
- // any background noise for the end pointer (still works ok).
+ // The endpointer needs to estimate the environment/background noise before
+ // treating the audio as user input. |HandleOnData| switches to user input
+ // mode once kEndpointerEstimationTimeMs of audio has been recorded.
endpointer_.SetEnvironmentEstimationMode();
- endpointer_.SetUserInputMode();
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
@@ -140,6 +138,7 @@ bool SpeechRecognizer::StartRecording() {
AudioInputController::Create(this, params, samples_per_packet);
DCHECK(audio_controller_.get());
LOG(INFO) << "SpeechRecognizer starting record.";
+ num_samples_recorded_ = 0;
audio_controller_->Record();
return true;
@@ -231,10 +230,7 @@ void SpeechRecognizer::HandleOnError(int error_code) {
if (!audio_controller_.get())
return;
- delegate_->OnRecognizerError(caller_id_);
- CancelRecognition();
- delegate_->DidCompleteRecording(caller_id_);
- delegate_->DidCompleteRecognition(caller_id_);
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
}
void SpeechRecognizer::OnData(AudioInputController* controller,
@@ -265,6 +261,24 @@ void SpeechRecognizer::HandleOnData(string* data) {
encoder_->Encode(samples, num_samples, &audio_buffers_);
endpointer_.ProcessAudio(samples, num_samples);
delete data;
+ num_samples_recorded_ += num_samples;
+
+ // Check if we have gathered enough audio for the endpointer to do environment
+ // estimation and should move on to detect speech/end of speech.
+ if (endpointer_.IsEstimatingEnvironment() &&
+ num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
+ kAudioSampleRate) / 1000) {
+ endpointer_.SetUserInputMode();
+ delegate_->DidCompleteEnvironmentEstimation(caller_id_);
+ return;
+ }
+
+ // Check if we have waited too long without hearing any speech.
+ if (!endpointer_.DidStartReceivingSpeech() &&
+ num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
+ return;
+ }
if (endpointer_.speech_input_complete()) {
StopRecording();
@@ -275,6 +289,11 @@ void SpeechRecognizer::HandleOnData(string* data) {
}
void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {
+ if (value.empty()) {
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);
+ return;
+ }
+
delegate_->SetRecognitionResult(caller_id_, error, value);
// Guard against the delegate freeing us until we finish our job.
@@ -282,4 +301,14 @@ void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {
delegate_->DidCompleteRecognition(caller_id_);
}
+void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
+ CancelRecognition();
+
+ // Keep a self-reference so a delegate callback below cannot destroy us.
+ scoped_refptr<SpeechRecognizer> me(this);
+ delegate_->OnRecognizerError(caller_id_, error);
+ delegate_->DidCompleteRecording(caller_id_);
+ delegate_->DidCompleteRecognition(caller_id_);
+}
+
} // namespace speech_input

Powered by Google App Engine
This is Rietveld 408576698