content/browser/speech/speech_recognizer_impl.cc - Issue 9835049: Speech refactoring: Reimplemented speech_recognizer as a FSM. (CL1.5)

Unified Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 9835049: Speech refactoring: Reimplemented speech_recognizer as a FSM. (CL1.5) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Rebased from master. Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« content/browser/speech/speech_recognizer_impl.h ('K') | « content/browser/speech/speech_recognizer_impl.h ('k') | content/browser/speech/speech_recognizer_impl_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: content/browser/speech/speech_recognizer_impl.cc

diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc

index 007f3ee45a9941007714b1c5e4b25bf571fdcd1a..de2b72b503537320c4a7313814d2eecfeef297ba 100644

--- a/content/browser/speech/speech_recognizer_impl.cc

+++ b/content/browser/speech/speech_recognizer_impl.cc

@@ -16,6 +16,8 @@

#include "content/public/common/speech_recognition_result.h"

#include "net/url_request/url_request_context_getter.h"

+#define UNREACHABLE_CONDITION() do { NOTREACHED(); return state_; } while(0)

Satish 2012/03/27 09:47:42 can this be changed to a method InvalidInput() alo

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 I used a macro since, in case of bugs/failing DCHE

using content::BrowserMainLoop;

using content::BrowserThread;

using content::SpeechRecognitionError;

@@ -24,8 +26,12 @@ using content::SpeechRecognitionResult;

using content::SpeechRecognizer;

using media::AudioInputController;

+// TODO(primiano) what about a watchdog here to avoid getting stuck if the

+// SpeechRecognitionEngine does not deliver a result (in reasonable time)?

Satish 2012/03/27 09:47:42 for remote engines, the network connection should

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

namespace {

+// Enables spontaneous transition from WaitingForSpeech to RecognizingSpeech,

Satish 2012/03/27 09:47:42 add newline above

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+// which is required for the mock recognition engine which sends fake results.

+const bool skipSilenceDetectionForTesting = false;

Satish 2012/03/27 09:47:42 This doesn't seem to be set to true anywhere else

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

// The following constants are related to the volume level indicator shown in

// the UI for recorded audio.

// Multiplier used when new volume is greater than previous level.

@@ -48,6 +54,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) {

const int16* samples = chunk.SamplesData16();

const int kThreshold = num_samples / 20;

int clipping_samples = 0;

for (int i = 0; i < num_samples; ++i) {

if (samples[i] <= -32767 || samples[i] >= 32767) {

if (++clipping_samples > kThreshold)

@@ -68,18 +75,27 @@ SpeechRecognizer* SpeechRecognizer::Create(

bool filter_profanities,

const std::string& hardware_info,

const std::string& origin_url) {

+ speech::GoogleOneShotRemoteEngineConfig google_sr_config;

+ google_sr_config.language = language;

+ google_sr_config.grammar = grammar;

+ google_sr_config.audio_sample_rate =

+ speech::SpeechRecognizerImpl::kAudioSampleRate;

+ google_sr_config.audio_num_bits_per_sample =

+ speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;

+ google_sr_config.filter_profanities = filter_profanities;

+ google_sr_config.hardware_info = hardware_info;

+ google_sr_config.origin_url = origin_url;

+ speech::GoogleOneShotRemoteEngine* google_sr_engine =

+ new speech::GoogleOneShotRemoteEngine(context_getter);

+ google_sr_engine->SetConfig(google_sr_config);

Satish 2012/03/27 09:47:42 Is this config ever changed after creating the eng

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 It can be changed, so that the recognition engine

return new speech::SpeechRecognizerImpl(listener,

caller_id,

- language,

- grammar,

- context_getter,

- filter_profanities,

- hardware_info,

- origin_url);

+ google_sr_engine);

}

namespace speech {

const int SpeechRecognizerImpl::kAudioSampleRate = 16000;

const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;

const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;

@@ -89,242 +105,458 @@ const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;

SpeechRecognizerImpl::SpeechRecognizerImpl(

SpeechRecognitionEventListener* listener,

int caller_id,

- const std::string& language,

- const std::string& grammar,

- net::URLRequestContextGetter* context_getter,

- bool filter_profanities,

- const std::string& hardware_info,

- const std::string& origin_url)

+ SpeechRecognitionEngine* engine)

: listener_(listener),

testing_audio_manager_(NULL),

+ recognition_engine_(engine),

endpointer_(kAudioSampleRate),

- context_getter_(context_getter),

caller_id_(caller_id),

Satish 2012/03/27 09:47:42 this initializer list should be in the same order

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Isn't it? (except for non pod fields and pod field

- language_(language),

- grammar_(grammar),

- filter_profanities_(filter_profanities),

- hardware_info_(hardware_info),

- origin_url_(origin_url),

- num_samples_recorded_(0),

- audio_level_(0.0f) {

+ event_dispatch_nesting_level_(0),

+ state_(kIdle),

+ event_args_(NULL) {

DCHECK(listener_ != NULL);

+ DCHECK(recognition_engine_ != NULL);

endpointer_.set_speech_input_complete_silence_length(

base::Time::kMicrosecondsPerSecond / 2);

endpointer_.set_long_speech_input_complete_silence_length(

base::Time::kMicrosecondsPerSecond);

endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);

endpointer_.StartSession();

+ recognition_engine_->set_delegate(this);

}

SpeechRecognizerImpl::~SpeechRecognizerImpl() {

Satish 2012/03/27 09:47:42 add a DCHECK to verify you are in a valid (idle?)

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Hmm, the browser could be closed while a recogniti

- // Recording should have stopped earlier due to the endpointer or

- // |StopRecording| being called.

- DCHECK(!audio_controller_.get());

- DCHECK(!recognition_engine_.get() ||

- !recognition_engine_->IsRecognitionPending());

endpointer_.EndSession();

}

+// ------- Methods that trigger Finite State Machine (FSM) events ------------

+// NOTE: all the external events and requests should be enqueued (PostTask), even

+// if they come from the same (IO) thread, in order to preserve the relationship

+// of causality between events.

+// Imagine what would happen if a Start has been enqueued from another thread

Satish 2012/03/27 09:47:42 137-145 looks like a scare tactic :) and could be

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+// (but not yet processed) and we suddenly issue a Stop from the IO thread.

+// Furthermore, even if you are sure to not interleave start and stop requests,

+// asynchronous event processing mixed with synchronous callbacks can cause

+// very confusing side effects.

+// For instance, if someone called Abort synchronously (instead of posting

+// the event on the queue), they would receive interleaved callbacks (e.g. an

+// error or the audio-end event) before the Abort call has actually returned.

+// Is your (caller) code ready for this?

void SpeechRecognizerImpl::StartRecognition() {

+ FSMEventArgs args;

+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

+ base::Bind(&SpeechRecognizerImpl::DispatchEvent,

+ this, kStartRequest, args));

Satish 2012/03/27 09:47:42 could make it simple by replacing 'args' with 'FSM

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+void SpeechRecognizerImpl::AbortRecognition() {

+ FSMEventArgs args;

+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

+ base::Bind(&SpeechRecognizerImpl::DispatchEvent,

+ this, kAbortRequest, args));

+void SpeechRecognizerImpl::StopAudioCapture() {

+ FSMEventArgs args;

+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

+ base::Bind(&SpeechRecognizerImpl::DispatchEvent,

+ this, kStopCaptureRequest, args));

+bool SpeechRecognizerImpl::IsActive() const {

+ // Checking the FSM state from another thread (thus, while the FSM is

+ // potentially concurrently evolving) is meaningless.

+ // If you're doing it, probably you have some design issues.

DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

- DCHECK(!audio_controller_.get());

- DCHECK(!recognition_engine_.get() ||

- !recognition_engine_->IsRecognitionPending());

+ return state_ != kIdle;

- // The endpointer needs to estimate the environment/background noise before

- // starting to treat the audio as user input. In |HandleOnData| we wait until

- // such time has passed before switching to user input mode.

- endpointer_.SetEnvironmentEstimationMode();

+bool SpeechRecognizerImpl::IsCapturingAudio() const {

+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive().

+ return state_ >= kStartingRecognition && state_ <= kRecognizingSpeech;

Satish 2012/03/27 09:47:42 Would checking for audio_controller_ != NULL be mo

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 IMHO, all the decisions related to the evolution o

+// Invoked in the audio thread.

+void SpeechRecognizerImpl::OnError(AudioInputController* controller,

+ int error_code) {

+ FSMEventArgs args;

+ args.audio_error_code = error_code;

+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

+ base::Bind(&SpeechRecognizerImpl::DispatchEvent,

+ this, kAudioError, args));

+void SpeechRecognizerImpl::OnData(AudioInputController* controller,

+ const uint8* data, uint32 size) {

+ if (size == 0) // This could happen when audio capture stops and is normal.

+ return;

+ FSMEventArgs args;

+ args.audio_data = new AudioChunk(data, static_cast<size_t>(size),

Satish 2012/03/27 09:47:42 add a comment here that the event handler takes ow

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ kNumBitsPerAudioSample / 8);

Satish 2012/03/27 09:47:42 since we are assuming kNumBitsPerAudioSample as a

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

+ base::Bind(&SpeechRecognizerImpl::DispatchEvent,

+ this, kAudioData, args));

+void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(

+ const content::SpeechRecognitionResult& result) {

+ FSMEvent event = kRecognitionResult;

Satish 2012/03/27 09:47:42 can this value be passed directly to the base::Bin

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 It must! honestly don't know why I did pass throug

+ FSMEventArgs args;

+ args.speech_result = result;

+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

+ base::Bind(&SpeechRecognizerImpl::DispatchEvent,

+ this, event, args));

+void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(

+ const content::SpeechRecognitionError& error) {

+ FSMEvent event = kRecognitionError;

Satish 2012/03/27 09:47:42 ditto

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ FSMEventArgs args;

+ args.error = error;

+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

+ base::Bind(&SpeechRecognizerImpl::DispatchEvent,

+ this, event, args));

+// ----------------------- Core FSM implementation ---------------------------

+void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) {

+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

+ DCHECK_LE(event, kMaxEvent);

+ DCHECK_LE(state_, kMaxState);

+ // Event dispatching must be sequential, otherwise it will break all the rules

Satish 2012/03/27 09:47:42 add newline above full length comments such as the

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ // and the assumptions of the finite state automata model.

+ DCHECK_EQ(event_dispatch_nesting_level_, 0);

Satish 2012/03/27 09:47:42 could be clearer if this variable was a bool such

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Right.

+ ++event_dispatch_nesting_level_;

+ // Guard against the delegate freeing us until we finish processing the event.

Satish 2012/03/27 09:47:42 ditto

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ scoped_refptr<SpeechRecognizerImpl> me(this);

+ event_ = event;

Satish 2012/03/27 09:47:42 These look a bit dangerous as they are invalid aft

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Mmm what do you mean? They are used only by (priva

+ event_args_ = &args;

+ if (event == kAudioData)

+ ProcessAudioPipeline();

+ // The audio pipeline must be processed before the ProcessEvent, otherwise it

Satish 2012/03/27 09:47:42 add newline above

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ // would take actions according to the future state and not the current one.

+ state_ = ProcessEvent(event);

+ // Cleanup event args.

+ if (args.audio_data)

Satish 2012/03/27 09:47:42 this cleanup should be part of the FSMEventArgs de

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 AudioChunk is now refcounted and should be destroy

+ delete args.audio_data;

+ event_args_ = NULL;

+ --event_dispatch_nesting_level_;

+// ----------- Contract for all the FSM evolution functions below -------------

+// - Are guaranteed to be executed in the IO thread;

+// - Are guaranteed not to be reentrant (neither with themselves nor with each other);

+// - event_args_ is guaranteed to be non NULL;

+// - event_args_ members are guaranteed to be stable during the call;

+// - The class won't be freed in the meanwhile due to callbacks;

+// TODO(primiano) the audio pipeline is currently serial. However, the

+// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.

+// We should profile the execution to see whether it would be worth it.

+void SpeechRecognizerImpl::ProcessAudioPipeline() {

+ const bool always = true;

Satish 2012/03/27 09:47:42 remove this as its used only in the next line

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ const bool route_audio_to_clipper = always;

Satish 2012/03/27 09:47:42 only use 1 space on either side of = and && operat

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ const bool route_audio_to_endpointer = state_ >= kEstimatingEnvironment &&

+ state_ <= kRecognizingSpeech;

+ const bool route_audio_to_sr_engine = route_audio_to_endpointer;

+ const bool route_audio_to_vumeter = state_ >= kWaitingForSpeech &&

+ state_ <= kRecognizingSpeech;

+ AudioChunk& recorded_audio_data = *(event_args_->audio_data);

Satish 2012/03/27 09:47:42 use "const AudioChunk&"

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ num_samples_recorded_ += recorded_audio_data.NumSamples();

+ if (route_audio_to_clipper) {

+ clipper_detected_clip_ = DetectClipping(recorded_audio_data);

Satish 2012/03/27 09:47:42 clipper_detected_clip_ is set here and used in Upd

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ }

+ if (route_audio_to_endpointer) {

+ endpointer_.ProcessAudio(recorded_audio_data, &rms_);

Satish 2012/03/27 09:47:42 ditto for 'rms_'

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ }

+ if (route_audio_to_vumeter) {

+ DCHECK(route_audio_to_endpointer); // Depends on endpointer due to |rms_|.

+ UpdateSignalAndNoiseLevels(rms_);

Satish 2012/03/27 09:47:42 since this is the only method making use of clippi

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ }

+ if (route_audio_to_sr_engine) {

+ DCHECK(recognition_engine_.get());

+ recognition_engine_->TakeAudioChunk(recorded_audio_data);

+ }

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessEvent(

Satish 2012/03/27 09:47:42 DispatchEvent and ProcessEvent are too similar, pl

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done. ExecuteTransitionAndGetNextState Done.

+ FSMEvent event) {

+ switch (state_) {

+ case kIdle:

+ switch (event) {

+ // TODO(primiano) restore UNREACHABLE_CONDITION above when speech

+ // input extensions are fixed.

+ case kAbortRequest: return DoNothing(); //UNREACHABLE_CONDITION();

+ case kStartRequest: return InitializeAndStartRecording();

Satish 2012/03/27 09:47:42 since this is the only valid event in this state,

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Hmm IMHO it might introduce bugs if a new event is

+ case kStopCaptureRequest: return DoNothing(); //UNREACHABLE_CONDITION();

+ case kAudioData: return DoNothing(); // Corner cases related to

+ case kRecognitionResult: return DoNothing(); // queued messages being

+ case kRecognitionError: return DoNothing(); // lately dispatched.

+ case kAudioError: return DoNothing();

+ }

+ break;

+ case kStartingRecognition:

+ switch (event) {

+ case kAbortRequest: return Abort();

Satish 2012/03/27 09:47:42 would be simpler to collapse multiple similar hand

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 IMHO it would become more difficult to read (since

+ case kStartRequest: UNREACHABLE_CONDITION();

+ case kStopCaptureRequest: return Abort();

+ case kAudioData: return StartSpeechRecognition();

+ case kRecognitionResult: UNREACHABLE_CONDITION();

+ case kRecognitionError: return Abort();

+ case kAudioError: return Abort();

+ }

+ break;

+ case kEstimatingEnvironment:

+ switch (event) {

+ case kAbortRequest: return Abort();

Satish 2012/03/27 09:47:42 hmm, since kAbortRequest, kRecognitionError and kA

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 They are not exactly equivalent since they trigger

+ case kStartRequest: UNREACHABLE_CONDITION();

+ case kStopCaptureRequest: return StopCaptureAndWaitForResult();

+ case kAudioData: return EnvironmentEstimation();

+ case kRecognitionResult: return ProcessIntermediateRecognitionResult();

+ case kRecognitionError: return Abort();

+ case kAudioError: return Abort();

+ }

+ break;

+ case kWaitingForSpeech:

+ switch (event) {

+ case kAbortRequest: return Abort();

+ case kStartRequest: UNREACHABLE_CONDITION();

+ case kStopCaptureRequest: return StopCaptureAndWaitForResult();

+ case kAudioData: return DetectUserSpeechOrTimeout();

+ case kRecognitionResult: return ProcessIntermediateRecognitionResult();

+ case kRecognitionError: return Abort();

+ case kAudioError: return Abort();

+ }

+ break;

+ case kRecognizingSpeech:

+ switch (event) {

+ case kAbortRequest: return Abort();

+ case kStartRequest: UNREACHABLE_CONDITION();

+ case kStopCaptureRequest: return StopCaptureAndWaitForResult();

+ case kAudioData: return DetectEndOfSpeech();

+ case kRecognitionResult: return ProcessIntermediateRecognitionResult();

+ case kRecognitionError: return Abort();

+ case kAudioError: return Abort();

+ }

+ break;

+ case kWaitingFinalResult:

+ switch (event) {

+ case kAbortRequest: return Abort();

+ case kStartRequest: UNREACHABLE_CONDITION();

+ case kStopCaptureRequest: return DoNothing();

+ case kAudioData: return DoNothing();

+ case kRecognitionResult: return ProcessFinalRecognitionResult();

+ case kRecognitionError: return Abort();

+ case kAudioError: return Abort();

+ }

+ break;

+ }

+ UNREACHABLE_CONDITION();

+SpeechRecognizerImpl::FSMState

+SpeechRecognizerImpl::InitializeAndStartRecording() {

+ DCHECK(recognition_engine_.get());

+ DCHECK(audio_controller_.get() == NULL);

AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?

testing_audio_manager_ :

BrowserMainLoop::GetAudioManager();

+ DCHECK(audio_manager != NULL);

+ VLOG(1) << "SpeechRecognizerImpl starting audio capture.";

+ num_samples_recorded_ = 0;

+ rms_ = 0;

+ audio_level_ = 0;

+ clipper_detected_clip_ = false;

+ listener_->OnRecognitionStart(caller_id_);

+ if (!audio_manager->HasAudioInputDevices()) {

+ return Abort(SpeechRecognitionError(

+ content::SPEECH_RECOGNITION_ERROR_AUDIO,

+ content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));

+ }

+ if (audio_manager->IsRecordingInProcess()) {

+ return Abort(SpeechRecognitionError(

+ content::SPEECH_RECOGNITION_ERROR_AUDIO,

+ content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE));

+ }

const int samples_per_packet = kAudioSampleRate *

Satish 2012/03/27 09:47:42 add parentheses around (kAudioSampleRate * ..) / 1

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

- GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000;

+ recognition_engine_->GetDesiredAudioChunkDurationMs() / 1000;

AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,

kAudioSampleRate, kNumBitsPerAudioSample,

samples_per_packet);

audio_controller_ = AudioInputController::Create(audio_manager, this, params);

- DCHECK(audio_controller_.get());

- VLOG(1) << "SpeechRecognizer starting record.";

- num_samples_recorded_ = 0;

- audio_controller_->Record();

-void SpeechRecognizerImpl::AbortRecognition() {

- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

- DCHECK(audio_controller_.get() || recognition_engine_.get());

- // Stop recording if required.

- if (audio_controller_.get()) {

- CloseAudioControllerSynchronously();

+ if (audio_controller_.get() == NULL) {

+ return Abort(

+ SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));

}

- VLOG(1) << "SpeechRecognizer canceling recognition.";

- recognition_engine_.reset();

+ // The endpointer needs to estimate the environment/background noise before

+ // starting to treat the audio as user input. We wait in the state

+ // kEstimatingEnvironment until such interval has elapsed before switching

+ // to user input mode.

+ endpointer_.SetEnvironmentEstimationMode();

+ audio_controller_->Record();

+ return kStartingRecognition;

}

-void SpeechRecognizerImpl::StopAudioCapture() {

- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartSpeechRecognition() {

+ // This was the first audio packet recorded, so start a request to the

Satish 2012/03/27 09:47:42 update comment to say that the first audio packet

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ // engine to send the data and inform the delegate.

+ DCHECK(recognition_engine_.get());

+ recognition_engine_->StartRecognition();

+ listener_->OnAudioStart(caller_id_);

+ // TODO(primiano) this is a little hack, since TakeAudioChunk() is already

Satish 2012/03/27 09:47:42 add newline above

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ // called by ProcessAudioPipeline(). I hate it since it weakens the

+ // architectural beauty of this class. But it is the best tradeoff, unless we

Satish 2012/03/27 09:47:42 could remove reference to 'architectural beauty' :

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ // allow the drop the first audio chunk captured after opening the audio dev.

+ recognition_engine_->TakeAudioChunk(*(event_args_->audio_data));

+ return kEstimatingEnvironment;

- // If audio recording has already stopped and we are in recognition phase,

- // silently ignore any more calls to stop recording.

- if (!audio_controller_.get())

- return;

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::EnvironmentEstimation() {

Satish 2012/03/27 09:47:42 this method's name doesn't indicate what it actual

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done. WaitEnvironmentEstimationCompletion

+ DCHECK(endpointer_.IsEstimatingEnvironment());

+ if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {

+ endpointer_.SetUserInputMode();

+ listener_->OnEnvironmentEstimationComplete(caller_id_);

+ return kWaitingForSpeech;

+ } else {

+ return kEstimatingEnvironment;

+ }

- CloseAudioControllerSynchronously();

- listener_->OnSoundEnd(caller_id_);

- listener_->OnAudioEnd(caller_id_);

+SpeechRecognizerImpl::FSMState

+SpeechRecognizerImpl::DetectUserSpeechOrTimeout() {

+ if (skipSilenceDetectionForTesting)

+ return kRecognizingSpeech;

- // If we haven't got any audio yet end the recognition sequence here.

- if (recognition_engine_ == NULL) {

- // Guard against the listener freeing us until we finish our job.

- scoped_refptr<SpeechRecognizerImpl> me(this);

- listener_->OnRecognitionEnd(caller_id_);

+ if (endpointer_.DidStartReceivingSpeech()) {

+ listener_->OnSoundStart(caller_id_);

+ return kRecognizingSpeech;

+ } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {

+ return Abort(

+ SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));

} else {

- recognition_engine_->AudioChunksEnded();

+ return kWaitingForSpeech;

}

-// Invoked in the audio thread.

-void SpeechRecognizerImpl::OnError(AudioInputController* controller,

- int error_code) {

- BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

- base::Bind(&SpeechRecognizerImpl::HandleOnError,

- this, error_code));

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech() {  // FSM handler that ProcessEvent runs for kAudioData while in kRecognizingSpeech; returns the next state.

+ if (endpointer_.speech_input_complete()) {  // The endpointer reports that the speech input is complete.

+ return StopCaptureAndWaitForResult();  // Closes audio capture; that handler returns kWaitingFinalResult.

+ } else {

+ return kRecognizingSpeech;  // Speech still in progress: stay in the current state.

+ }

}

-void SpeechRecognizerImpl::HandleOnError(int error_code) {

- LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;

+SpeechRecognizerImpl::FSMState

+SpeechRecognizerImpl::StopCaptureAndWaitForResult() {

+ DCHECK(state_ >= kEstimatingEnvironment && state_ <= kRecognizingSpeech);

- // Check if we are still recording before canceling recognition, as

- // recording might have been stopped after this error was posted to the queue

- // by |OnError|.

- if (!audio_controller_.get())

- return;

+ VLOG(1) << "Concluding recognition";

+ CloseAudioControllerSynchronously();

+ recognition_engine_->AudioChunksEnded();

+ if (state_ > kWaitingForSpeech)

+ listener_->OnSoundEnd(caller_id_);

- InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);

+ listener_->OnAudioEnd(caller_id_);

+ return kWaitingFinalResult;

}

-void SpeechRecognizerImpl::OnData(AudioInputController* controller,

- const uint8* data, uint32 size) {

- if (size == 0) // This could happen when recording stops and is normal.

- return;

- AudioChunk* raw_audio = new AudioChunk(data, static_cast<size_t>(size),

- kNumBitsPerAudioSample / 8);

- BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

- base::Bind(&SpeechRecognizerImpl::HandleOnData,

- this, raw_audio));

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort() {  // No-argument abort handler: derives the error (if any) from the event being dispatched (event_) and delegates to Abort(has_error, error).

+ // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED when there is no

+ // other specific error source (i.e. when this is an explicit abort request).

+ // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers

+ // and currently would cause an exception. JS will probably need it in future.

+ SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NONE);  // Default: no error to report.

+ bool has_error = false;

+ if (event_ == kAudioError) {  // Abort triggered by an audio error event.

+ has_error = true;

+ error.code = content::SPEECH_RECOGNITION_ERROR_AUDIO;

+ } else if (event_ == kRecognitionError) {  // Abort triggered by a recognition error event.

+ has_error = true;

+ error = event_args_->error;  // Forward the error carried by the event arguments.

+ }

+ return Abort(has_error, error);  // Any other event (e.g. an abort request) reaches here with has_error == false.

}

-void SpeechRecognizerImpl::HandleOnData(AudioChunk* raw_audio) {

- scoped_ptr<AudioChunk> free_raw_audio_on_return(raw_audio);

- // Check if we are still recording and if not discard this buffer, as

- // recording might have been stopped after this buffer was posted to the queue

- // by |OnData|.

- if (!audio_controller_.get())

- return;

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(

+ const SpeechRecognitionError& error) {

+ return Abort(true, error);

- bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();

- float rms;

- endpointer_.ProcessAudio(*raw_audio, &rms);

- bool did_clip = DetectClipping(*raw_audio);

- num_samples_recorded_ += raw_audio->NumSamples();

- if (recognition_engine_ == NULL) {

- // This was the first audio packet recorded, so start a request to the

- // server to send the data and inform the listener.

- listener_->OnAudioStart(caller_id_);

- GoogleOneShotRemoteEngineConfig google_sr_config;

- google_sr_config.language = language_;

- google_sr_config.grammar = grammar_;

- google_sr_config.audio_sample_rate = kAudioSampleRate;

- google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample;

- google_sr_config.filter_profanities = filter_profanities_;

- google_sr_config.hardware_info = hardware_info_;

- google_sr_config.origin_url = origin_url_;

- GoogleOneShotRemoteEngine* google_sr_engine =

- new GoogleOneShotRemoteEngine(context_getter_.get());

- google_sr_engine->SetConfig(google_sr_config);

- recognition_engine_.reset(google_sr_engine);

- recognition_engine_->set_delegate(this);

- recognition_engine_->StartRecognition();

- }

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(

+ bool has_error, const SpeechRecognitionError& error) {

Satish 2012/03/27 09:47:42 can we change 'error' to be a pointer and remove '

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ if (audio_controller_)

+ CloseAudioControllerSynchronously();

- recognition_engine_->TakeAudioChunk(*raw_audio);

+ VLOG(1) << "SpeechRecognizerImpl canceling recognition. " <<

+ error.code << " " << error.details;

- if (endpointer_.IsEstimatingEnvironment()) {

- // Check if we have gathered enough audio for the endpointer to do

- // environment estimation and should move on to detect speech/end of speech.

- if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *

- kAudioSampleRate) / 1000) {

- endpointer_.SetUserInputMode();

- listener_->OnEnvironmentEstimationComplete(caller_id_);

- }

- return; // No more processing since we are still estimating environment.

+ // The recognition engine is initialized only after kStartingRecognition.

+ if (state_ > kStartingRecognition) {

+ DCHECK(recognition_engine_.get());

+ recognition_engine_->EndRecognition();

+ //TODO(primiano) reset the engine? Why, after all?

Satish 2012/03/27 09:47:42 This comment is unclear, please reword if required

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ //recognition_engine_.reset();

}

- // Check if we have waited too long without hearing any speech.

- bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();

- if (!speech_was_heard_after_packet &&

- num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) {

- InformErrorAndAbortRecognition(

- content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);

- return;

- }

+ if (state_ > kWaitingForSpeech && state_ < kWaitingFinalResult)

Satish 2012/03/27 09:47:42 would be useful for the unittest to verify that al

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+ listener_->OnSoundEnd(caller_id_);

- if (!speech_was_heard_before_packet && speech_was_heard_after_packet)

- listener_->OnSoundStart(caller_id_);

+ if (state_ > kStartingRecognition && state_ < kWaitingFinalResult)

+ listener_->OnAudioEnd(caller_id_);

- // Calculate the input volume to display in the UI, smoothing towards the

- // new level.

- float level = (rms - kAudioMeterMinDb) /

- (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

- level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);

- if (level > audio_level_) {

- audio_level_ += (level - audio_level_) * kUpSmoothingFactor;

- } else {

- audio_level_ += (level - audio_level_) * kDownSmoothingFactor;

- }

+ if (has_error)

+ listener_->OnRecognitionError(caller_id_, error);

- float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /

- (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

- noise_level = std::min(std::max(0.0f, noise_level),

- kAudioMeterRangeMaxUnclipped);

+ listener_->OnRecognitionEnd(caller_id_);

- listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,

- noise_level);

+ return kIdle;

- if (endpointer_.speech_input_complete())

- StopAudioCapture();

+SpeechRecognizerImpl::FSMState

+SpeechRecognizerImpl::ProcessIntermediateRecognitionResult() {  // Handler that ProcessEvent selects for kRecognitionResult in the states before kWaitingFinalResult.

+// Placeholder for future speech recognition functions: the commented-out lines below sketch the intended handling of intermediate results.

Satish 2012/03/27 09:47:42 remove these commented lines

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

+// DCHECK(continuous_mode_);

+// const SpeechRecognitionResult& result = event_args_->speech_result;

+// VLOG(1) << "Got intermediate result";

+// listener_->OnRecognitionResult(caller_id_, result);

+ NOTREACHED();  // For now, reaching this handler is flagged as unexpected.

+ return state_;  // Keep the current state.

}

-void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(

- const content::SpeechRecognitionResult& result) {

- // Guard against the listener freeing us until we finish our job.

- scoped_refptr<SpeechRecognizerImpl> me(this);

+SpeechRecognizerImpl::FSMState

+SpeechRecognizerImpl::ProcessFinalRecognitionResult() {  // Handler that ProcessEvent runs for kRecognitionResult in kWaitingFinalResult: delivers the result and ends the session.

+ const SpeechRecognitionResult& result = event_args_->speech_result;  // Result carried by the event being dispatched.

+ VLOG(1) << "Got valid result";

+ recognition_engine_->EndRecognition();  // The engine is told to end before the listener is notified.

listener_->OnRecognitionResult(caller_id_, result);

listener_->OnRecognitionEnd(caller_id_);

+ return kIdle;  // Next state: idle.

}

-void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(

- const content::SpeechRecognitionError& error) {

- InformErrorAndAbortRecognition(error.code);

-void SpeechRecognizerImpl::InformErrorAndAbortRecognition(

- content::SpeechRecognitionErrorCode error) {

- DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);

- AbortRecognition();

- // Guard against the listener freeing us until we finish our job.

- scoped_refptr<SpeechRecognizerImpl> me(this);

- listener_->OnRecognitionError(caller_id_, error);

+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DoNothing() const {  // No-op FSM handler that ProcessEvent uses for events ignored in the current state.

+ return state_; // Just keep the current state, so that DispatchEvent reassigns the same value to state_.

Satish 2012/03/27 09:47:42 2 spaces before //, here and other places in this

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

}

void SpeechRecognizerImpl::CloseAudioControllerSynchronously() {

- VLOG(1) << "SpeechRecognizer stopping record.";

+ DCHECK(audio_controller_);

+ VLOG(1) << "SpeechRecognizerImpl stopping audio capture.";

// TODO(satish): investigate the possibility to utilize the closure

// and switch to async. version of this method. Compare with how

@@ -336,12 +568,31 @@ void SpeechRecognizerImpl::CloseAudioControllerSynchronously() {

audio_controller_ = NULL; // Releases the ref ptr.

}

-bool SpeechRecognizerImpl::IsActive() const {

- return (recognition_engine_.get() != NULL);

+int SpeechRecognizerImpl::GetElapsedTimeMs() const {  // Returns the amount of audio recorded so far, in milliseconds, computed from the sample count and kAudioSampleRate.

+ return num_samples_recorded_ * 1000 / kAudioSampleRate;  // Evaluated left to right: multiply first, then integer-divide. NOTE(review): if num_samples_recorded_ is a 32-bit int, the product overflows after roughly 134 s of audio at 16000 samples/s; confirm the member type.

Satish 2012/03/27 09:47:42 use parenthesis around (num_samples_recorded_ * 10

Primiano Tucci (use gerrit) 2012/03/28 13:24:44 Done.

}

-bool SpeechRecognizerImpl::IsCapturingAudio() const {

- return (audio_controller_.get() != NULL);

+void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms) {  // Turns |rms| and the endpointer noise level into clamped meter values and reports both to the listener.

+ // Calculate the input volume to display in the UI, smoothing towards the

+ // new level.

+ // TODO(primiano) Do we really need all this floating point arith here?

+ // Perhaps it might be quite expensive on mobile.

+ float level = (rms - kAudioMeterMinDb) /

+ (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);  // Offset |rms| by kAudioMeterMinDb and rescale it onto the meter range.

+ level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);  // Clamp to [0, kAudioMeterRangeMaxUnclipped].

+ if (level > audio_level_) {

+ audio_level_ += (level - audio_level_) * kUpSmoothingFactor;  // Level rose: move towards it using kUpSmoothingFactor.

+ } else {

+ audio_level_ += (level - audio_level_) * kDownSmoothingFactor;  // Level fell or is unchanged: use kDownSmoothingFactor.

+ }

+ float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /

+ (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);  // Same rescaling, applied to the noise level reported by the endpointer.

+ noise_level = std::min(std::max(0.0f, noise_level),

+ kAudioMeterRangeMaxUnclipped);

+ listener_->OnAudioLevelsChange(

+ caller_id_, clipper_detected_clip_ ? 1.0f : audio_level_, noise_level);  // If clipping was detected, report 1.0f instead of the smoothed level.

}

const SpeechRecognitionEngine&

@@ -354,5 +605,10 @@ void SpeechRecognizerImpl::SetAudioManagerForTesting(

testing_audio_manager_ = audio_manager;

}

+SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs()

+ : audio_error_code(0),

+ audio_data(NULL),

+ error(content::SPEECH_RECOGNITION_ERROR_NONE) {

} // namespace speech