Chromium Code Reviews| Index: content/browser/speech/speech_recognizer_impl.cc |
| diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc |
| index 05f81e7bbba3908bf2f64e67222132aa91db694c..923bf4dec48643939b646a90b18d2705a1e28496 100644 |
| --- a/content/browser/speech/speech_recognizer_impl.cc |
| +++ b/content/browser/speech/speech_recognizer_impl.cc |
| @@ -4,6 +4,7 @@ |
| #include "content/browser/speech/speech_recognizer_impl.h" |
| +#include "base/basictypes.h" |
| #include "base/bind.h" |
| #include "base/time.h" |
| #include "content/browser/browser_main_loop.h" |
| @@ -16,6 +17,8 @@ |
| #include "content/public/common/speech_recognition_result.h" |
| #include "net/url_request/url_request_context_getter.h" |
| +#define BIND(x) base::Bind(&SpeechRecognizerImpl::x, this) |
|
hans
2012/04/02 16:05:59
Hmm, not super happy about this macro and the use of the table-based FSM.
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
Reverted to switch-style FSM as agreed.
|
| + |
| using content::BrowserMainLoop; |
| using content::BrowserThread; |
| using content::SpeechRecognitionError; |
| @@ -48,6 +51,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) { |
| const int16* samples = chunk.SamplesData16(); |
| const int kThreshold = num_samples / 20; |
| int clipping_samples = 0; |
| + |
| for (int i = 0; i < num_samples; ++i) { |
| if (samples[i] <= -32767 || samples[i] >= 32767) { |
| if (++clipping_samples > kThreshold) |
| @@ -68,14 +72,24 @@ SpeechRecognizer* SpeechRecognizer::Create( |
| bool filter_profanities, |
| const std::string& hardware_info, |
| const std::string& origin_url) { |
| + speech::GoogleOneShotRemoteEngineConfig google_sr_config; |
| + google_sr_config.language = language; |
| + google_sr_config.grammar = grammar; |
| + google_sr_config.audio_sample_rate = |
| + speech::SpeechRecognizerImpl::kAudioSampleRate; |
| + google_sr_config.audio_num_bits_per_sample = |
| + speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; |
| + google_sr_config.filter_profanities = filter_profanities; |
| + google_sr_config.hardware_info = hardware_info; |
| + google_sr_config.origin_url = origin_url; |
| + |
| + speech::GoogleOneShotRemoteEngine* google_sr_engine = |
| + new speech::GoogleOneShotRemoteEngine(context_getter); |
| + google_sr_engine->SetConfig(google_sr_config); |
| + |
| return new speech::SpeechRecognizerImpl(listener, |
| caller_id, |
| - language, |
| - grammar, |
| - context_getter, |
| - filter_profanities, |
| - hardware_info, |
| - origin_url); |
| + google_sr_engine); |
| } |
| namespace speech { |
| @@ -86,248 +100,440 @@ const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
| const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
| const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
| +COMPILE_ASSERT((SpeechRecognizerImpl::kNumBitsPerAudioSample & 0x7) == 0, |
|
hans
2012/04/02 16:05:59
I think using the % operator instead of & would make the intent clearer.
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
Done.
|
| + kNumBitsPerAudioSample_must_be_a_multiple_of_8); |
| + |
| SpeechRecognizerImpl::SpeechRecognizerImpl( |
| SpeechRecognitionEventListener* listener, |
| int caller_id, |
| - const std::string& language, |
| - const std::string& grammar, |
| - net::URLRequestContextGetter* context_getter, |
| - bool filter_profanities, |
| - const std::string& hardware_info, |
| - const std::string& origin_url) |
| + SpeechRecognitionEngine* engine) |
| : listener_(listener), |
| testing_audio_manager_(NULL), |
| + recognition_engine_(engine), |
| endpointer_(kAudioSampleRate), |
| - context_getter_(context_getter), |
| caller_id_(caller_id), |
| - language_(language), |
| - grammar_(grammar), |
| - filter_profanities_(filter_profanities), |
| - hardware_info_(hardware_info), |
| - origin_url_(origin_url), |
| - num_samples_recorded_(0), |
| - audio_level_(0.0f) { |
| + in_event_dispatching_(false), |
| + state_(STATE_IDLE) { |
| DCHECK(listener_ != NULL); |
| + DCHECK(recognition_engine_ != NULL); |
| + InitializeFSM(); |
| endpointer_.set_speech_input_complete_silence_length( |
| base::Time::kMicrosecondsPerSecond / 2); |
| endpointer_.set_long_speech_input_complete_silence_length( |
| base::Time::kMicrosecondsPerSecond); |
| endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| endpointer_.StartSession(); |
| + recognition_engine_->set_delegate(this); |
| } |
| SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
| - // Recording should have stopped earlier due to the endpointer or |
| - // |StopRecording| being called. |
| - DCHECK(!audio_controller_.get()); |
| - DCHECK(!recognition_engine_.get() || |
| - !recognition_engine_->IsRecognitionPending()); |
| endpointer_.EndSession(); |
| } |
| -void SpeechRecognizerImpl::StartRecognition() { |
| - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| - DCHECK(!audio_controller_.get()); |
| - DCHECK(!recognition_engine_.get() || |
| - !recognition_engine_->IsRecognitionPending()); |
| +// ------- Methods that trigger Finite State Machine (FSM) events ------------ |
| - // The endpointer needs to estimate the environment/background noise before |
| - // starting to treat the audio as user input. In |HandleOnData| we wait until |
| - // such time has passed before switching to user input mode. |
| - endpointer_.SetEnvironmentEstimationMode(); |
| +// NOTE: all the external events and requests should be enqueued (PostTask), |
|
hans
2012/04/02 16:05:59
s/request/requests/ ?
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
Done.
|
| +// if they come from the same (IO) thread, in order to preserve the relationship |
| +// of causality between events and avoid interleaved event processing due to |
| +// synchronous callbacks. |
| - AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
| - testing_audio_manager_ : |
| - BrowserMainLoop::GetAudioManager(); |
| - const int samples_per_packet = kAudioSampleRate * |
| - GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; |
| - AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
| - kAudioSampleRate, kNumBitsPerAudioSample, |
| - samples_per_packet); |
| - audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
| - DCHECK(audio_controller_.get()); |
| - VLOG(1) << "SpeechRecognizer starting record."; |
| - num_samples_recorded_ = 0; |
| - audio_controller_->Record(); |
| +void SpeechRecognizerImpl::StartRecognition() { |
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| + this, EVENT_START, FSMEventArgs())); |
| } |
| void SpeechRecognizerImpl::AbortRecognition() { |
| - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| - DCHECK(audio_controller_.get() || recognition_engine_.get()); |
| - |
| - // Stop recording if required. |
| - if (audio_controller_.get()) { |
| - CloseAudioControllerAsynchronously(); |
| - } |
| - |
| - VLOG(1) << "SpeechRecognizer canceling recognition."; |
| - recognition_engine_.reset(); |
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| + this, EVENT_ABORT, FSMEventArgs())); |
| } |
| void SpeechRecognizerImpl::StopAudioCapture() { |
| - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| - |
| - // If audio recording has already stopped and we are in recognition phase, |
| - // silently ignore any more calls to stop recording. |
| - if (!audio_controller_.get()) |
| - return; |
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| + this, EVENT_STOP_CAPTURE, |
| + FSMEventArgs())); |
| +} |
| - CloseAudioControllerAsynchronously(); |
| - listener_->OnSoundEnd(caller_id_); |
| - listener_->OnAudioEnd(caller_id_); |
| +bool SpeechRecognizerImpl::IsActive() const { |
| + // Checking the FSM state from another thread (thus, while the FSM is |
| + // potentially concurrently evolving) is meaningless. |
| + // If you're doing it, probably you have some design issues. |
|
hans
2012/04/02 16:05:59
i'm not sure this comment adds much.. i think the
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
Agree, removed the last line.
|
| + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| + return state_ != STATE_IDLE; |
| +} |
| - // If we haven't got any audio yet end the recognition sequence here. |
| - if (recognition_engine_ == NULL) { |
| - // Guard against the listener freeing us until we finish our job. |
| - scoped_refptr<SpeechRecognizerImpl> me(this); |
| - listener_->OnRecognitionEnd(caller_id_); |
| - } else { |
| - recognition_engine_->AudioChunksEnded(); |
| - } |
| +bool SpeechRecognizerImpl::IsCapturingAudio() const { |
| + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
| + const bool is_capturing_audio = state_ >= STATE_STARTING && |
| + state_ <= STATE_RECOGNIZING; |
| + DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || |
| + (!is_capturing_audio && audio_controller_.get() == NULL)); |
| + return is_capturing_audio; |
| } |
| // Invoked in the audio thread. |
| void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
| int error_code) { |
| + FSMEventArgs args; |
| + args.audio_error_code = error_code; |
| BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| - base::Bind(&SpeechRecognizerImpl::HandleOnError, |
| - this, error_code)); |
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| + this, EVENT_AUDIO_ERROR, args)); |
| } |
| -void SpeechRecognizerImpl::HandleOnError(int error_code) { |
| - LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; |
| - |
| - // Check if we are still recording before canceling recognition, as |
| - // recording might have been stopped after this error was posted to the queue |
| - // by |OnError|. |
| - if (!audio_controller_.get()) |
| +void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
| + const uint8* data, uint32 size) { |
| + if (size == 0) // This could happen when audio capture stops and is normal. |
| return; |
| - InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); |
| + FSMEventArgs args; |
| + args.audio_data = new AudioChunk(data, static_cast<size_t>(size), |
| + kNumBitsPerAudioSample / 8); |
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| + this, EVENT_AUDIO_DATA, args)); |
| } |
| -void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
| - const uint8* data, uint32 size) { |
| - if (size == 0) // This could happen when recording stops and is normal. |
| - return; |
| - scoped_refptr<AudioChunk> raw_audio( |
| - new AudioChunk(data, |
| - static_cast<size_t>(size), |
| - kNumBitsPerAudioSample / 8)); |
| +void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
| + |
| +void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( |
| + const content::SpeechRecognitionResult& result) { |
| + FSMEventArgs args; |
| + args.engine_result = result; |
| BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| - base::Bind(&SpeechRecognizerImpl::HandleOnData, |
| - this, raw_audio)); |
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| + this, EVENT_ENGINE_RESULT, args)); |
| } |
| -void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) { |
| - // Check if we are still recording and if not discard this buffer, as |
| - // recording might have been stopped after this buffer was posted to the queue |
| - // by |OnData|. |
| - if (!audio_controller_.get()) |
| - return; |
| +void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( |
| + const content::SpeechRecognitionError& error) { |
| + FSMEventArgs args; |
| + args.engine_error = error; |
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| + this, EVENT_ENGINE_ERROR, args)); |
| +} |
| - bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); |
| - |
| - float rms; |
| - endpointer_.ProcessAudio(*raw_audio, &rms); |
| - bool did_clip = DetectClipping(*raw_audio); |
| - num_samples_recorded_ += raw_audio->NumSamples(); |
| - |
| - if (recognition_engine_ == NULL) { |
| - // This was the first audio packet recorded, so start a request to the |
| - // server to send the data and inform the listener. |
| - listener_->OnAudioStart(caller_id_); |
| - GoogleOneShotRemoteEngineConfig google_sr_config; |
| - google_sr_config.language = language_; |
| - google_sr_config.grammar = grammar_; |
| - google_sr_config.audio_sample_rate = kAudioSampleRate; |
| - google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample; |
| - google_sr_config.filter_profanities = filter_profanities_; |
| - google_sr_config.hardware_info = hardware_info_; |
| - google_sr_config.origin_url = origin_url_; |
| - GoogleOneShotRemoteEngine* google_sr_engine = |
| - new GoogleOneShotRemoteEngine(context_getter_.get()); |
| - google_sr_engine->SetConfig(google_sr_config); |
| - recognition_engine_.reset(google_sr_engine); |
| - recognition_engine_->set_delegate(this); |
| - recognition_engine_->StartRecognition(); |
| +// ----------------------- Core FSM implementation --------------------------- |
| +// TODO(primiano) After the changes in the media package (r129173), this class |
| +// slightly violates the SpeechRecognitionEventListener interface contract. In |
| +// particular, it is not true anymore that this class can be freed after the |
| +// OnRecognitionEnd event, since the audio_controller_.Close() asynchronous |
| +// call can be still in progress after the end event. Currently, it does not |
| +// represent a problem for the browser itself, since refcounting protects |
|
hans
2012/04/02 16:05:59
s/since since/since/
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
Done.
|
| +// us against such race conditions. However, we should fix this in the next CLs. |
| +// For instance, tests are currently working just because the |
| +// TestAudioInputController is not closing asynchronously as the real controller |
| +// does, but they will become flaky if TestAudioInputController will be fixed. |
| + |
| +void SpeechRecognizerImpl::InitializeFSM() { |
| + fsm[STATE_IDLE][EVENT_ABORT] = BIND(DoNothing); |
| + fsm[STATE_IDLE][EVENT_START] = BIND(StartRecording); |
| + fsm[STATE_IDLE][EVENT_STOP_CAPTURE] = BIND(DoNothing); |
| + fsm[STATE_IDLE][EVENT_AUDIO_DATA] = BIND(DoNothing); |
| + fsm[STATE_IDLE][EVENT_ENGINE_RESULT] = BIND(DoNothing); |
| + fsm[STATE_IDLE][EVENT_ENGINE_ERROR] = BIND(DoNothing); |
| + fsm[STATE_IDLE][EVENT_AUDIO_ERROR] = BIND(DoNothing); |
| + |
| + fsm[STATE_STARTING][EVENT_ABORT] = BIND(Abort); |
| + fsm[STATE_STARTING][EVENT_START] = kUnfeasibleTransition; |
| + fsm[STATE_STARTING][EVENT_STOP_CAPTURE] = BIND(Abort); |
| + fsm[STATE_STARTING][EVENT_AUDIO_DATA] = BIND(StartRecognitionEngine); |
| + fsm[STATE_STARTING][EVENT_ENGINE_RESULT] = kUnfeasibleTransition; |
| + fsm[STATE_STARTING][EVENT_ENGINE_ERROR] = BIND(Abort); |
| + fsm[STATE_STARTING][EVENT_AUDIO_ERROR] = BIND(Abort); |
| + |
| + fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ABORT] = BIND(Abort); |
| + fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_START] = kUnfeasibleTransition; |
| + fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_STOP_CAPTURE] = |
| + BIND(StopCaptureAndWaitResult); |
| + fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_AUDIO_DATA] = |
| + BIND(WaitEnvironmentEstimationCompletion); |
| + fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ENGINE_RESULT] = |
| + BIND(ProcessIntermediateResult); |
| + fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_ENGINE_ERROR] = BIND(Abort); |
| + fsm[STATE_ESTIMATING_ENVIRONMENT][EVENT_AUDIO_ERROR] = BIND(Abort); |
| + |
| + fsm[STATE_WAITING_FOR_SPEECH][EVENT_ABORT] = BIND(Abort); |
| + fsm[STATE_WAITING_FOR_SPEECH][EVENT_START] = kUnfeasibleTransition; |
| + fsm[STATE_WAITING_FOR_SPEECH][EVENT_STOP_CAPTURE] = |
| + BIND(StopCaptureAndWaitResult); |
| + fsm[STATE_WAITING_FOR_SPEECH][EVENT_AUDIO_DATA] = |
| + BIND(DetectUserSpeechOrTimeout); |
| + fsm[STATE_WAITING_FOR_SPEECH][EVENT_ENGINE_RESULT] = |
| + BIND(ProcessIntermediateResult); |
| + fsm[STATE_WAITING_FOR_SPEECH][EVENT_ENGINE_ERROR] = BIND(Abort); |
| + fsm[STATE_WAITING_FOR_SPEECH][EVENT_AUDIO_ERROR] = BIND(Abort); |
| + |
| + fsm[STATE_RECOGNIZING][EVENT_ABORT] = BIND(Abort); |
| + fsm[STATE_RECOGNIZING][EVENT_START] = kUnfeasibleTransition; |
| + fsm[STATE_RECOGNIZING][EVENT_STOP_CAPTURE] = BIND(StopCaptureAndWaitResult); |
| + fsm[STATE_RECOGNIZING][EVENT_AUDIO_DATA] = BIND(DetectEndOfSpeech); |
| + fsm[STATE_RECOGNIZING][EVENT_ENGINE_RESULT] = BIND(ProcessIntermediateResult); |
| + fsm[STATE_RECOGNIZING][EVENT_ENGINE_ERROR] = BIND(Abort); |
| + fsm[STATE_RECOGNIZING][EVENT_AUDIO_ERROR] = BIND(Abort); |
| + |
| + fsm[STATE_WAITING_FINAL_RESULT][EVENT_ABORT] = BIND(Abort); |
| + fsm[STATE_WAITING_FINAL_RESULT][EVENT_START] = kUnfeasibleTransition; |
| + fsm[STATE_WAITING_FINAL_RESULT][EVENT_STOP_CAPTURE] = BIND(DoNothing); |
| + fsm[STATE_WAITING_FINAL_RESULT][EVENT_AUDIO_DATA] = BIND(DoNothing); |
| + fsm[STATE_WAITING_FINAL_RESULT][EVENT_ENGINE_RESULT] = |
| + BIND(ProcessFinalResult); |
| + fsm[STATE_WAITING_FINAL_RESULT][EVENT_ENGINE_ERROR] = BIND(Abort); |
| + fsm[STATE_WAITING_FINAL_RESULT][EVENT_AUDIO_ERROR] = BIND(Abort); |
| +} |
| + |
| +void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) { |
| + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| + DCHECK_LE(event, EVENT_MAX); |
| + DCHECK_LE(state_, STATE_MAX); |
| + |
| + // Event dispatching must be sequential, otherwise it will break all the rules |
| + // and the assumptions of the finite state automata model. |
| + DCHECK(!in_event_dispatching_); |
| + in_event_dispatching_ = true; |
| + |
| + // Guard against the delegate freeing us until we finish processing the event. |
| + scoped_refptr<SpeechRecognizerImpl> me(this); |
| + |
| + args.event = event; |
| + |
| + if (event == EVENT_AUDIO_DATA) { |
| + DCHECK(args.audio_data.get() != NULL); |
| + ProcessAudioPipeline(*(args.audio_data.get())); |
|
hans
2012/04/02 16:05:59
I think you can just do ProcessAudioPipeline(*args.audio_data);
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
Done.
|
| } |
| - recognition_engine_->TakeAudioChunk(*raw_audio); |
| + // The audio pipeline must be processed before the event dispatch, otherwise |
| + // it would take actions according to the future state instead of the current. |
| + const TransitionFunction& transition = fsm[state_][event]; |
|
hans
2012/04/02 16:05:59
i liked the switch-case better
Satish
2012/04/02 21:57:09
I was thinking earlier that a table would be appea
|
| + if(transition.Equals(kUnfeasibleTransition)) { |
| + NOTREACHED() << "Unfeasible event " << event << " in state " << state_; |
| + } else { |
| + state_ = transition.Run(args); |
| + } |
| - if (endpointer_.IsEstimatingEnvironment()) { |
| - // Check if we have gathered enough audio for the endpointer to do |
| - // environment estimation and should move on to detect speech/end of speech. |
| - if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
| - kAudioSampleRate) / 1000) { |
| - endpointer_.SetUserInputMode(); |
| - listener_->OnEnvironmentEstimationComplete(caller_id_); |
| - } |
| - return; // No more processing since we are still estimating environment. |
| + in_event_dispatching_ = false; |
| +} |
| + |
| +// ----------- Contract for all the FSM evolution functions below ------------- |
| +// - Are guaranteed to be executed in the IO thread; |
| +// - Are guaranteed to be not reentrant (themselves and each other); |
| +// - event_args members are guaranteed to be stable during the call; |
| +// - The class won't be freed in the meanwhile due to callbacks; |
| +// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. |
| + |
| +// TODO(primiano) the audio pipeline is currently serial. However, the |
| +// clipper->endpointer->vumeter chain and the sr_engine could be parallelized. |
| +// We should profile the execution to see if it would be worth or not. |
| +void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { |
| + const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && |
| + state_ <= STATE_RECOGNIZING; |
| + const bool route_to_sr_engine = route_to_endpointer; |
| + const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && |
| + state_ <= STATE_RECOGNIZING; |
| + const bool clip_detected = DetectClipping(raw_audio); |
| + float rms = 0; |
| + |
| + num_samples_recorded_ += raw_audio.NumSamples(); |
| + |
| + if (route_to_endpointer) { |
| + endpointer_.ProcessAudio(raw_audio, &rms); |
| } |
| + if (route_to_vumeter) { |
| + DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. |
| + UpdateSignalAndNoiseLevels(rms, clip_detected); |
| + } |
| + if (route_to_sr_engine) { |
| + DCHECK(recognition_engine_.get()); |
| + recognition_engine_->TakeAudioChunk(raw_audio); |
| + } |
| +} |
| - // Check if we have waited too long without hearing any speech. |
| - bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); |
| - if (!speech_was_heard_after_packet && |
| - num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) { |
| - InformErrorAndAbortRecognition( |
| - content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); |
| - return; |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
| + DCHECK(recognition_engine_.get()); |
| + DCHECK(!IsCapturingAudio()); |
| + AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
| + testing_audio_manager_ : |
| + BrowserMainLoop::GetAudioManager(); |
| + DCHECK(audio_manager != NULL); |
| + |
| + VLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
| + num_samples_recorded_ = 0; |
| + audio_level_ = 0; |
| + listener_->OnRecognitionStart(caller_id_); |
| + |
| + if (!audio_manager->HasAudioInputDevices()) { |
| + return AbortWithError(SpeechRecognitionError( |
| + content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| + content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
| } |
| - if (!speech_was_heard_before_packet && speech_was_heard_after_packet) |
| + if (audio_manager->IsRecordingInProcess()) { |
| + return AbortWithError(SpeechRecognitionError( |
| + content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| + content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); |
| + } |
| + |
| + const int samples_per_packet = (kAudioSampleRate * |
| + recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; |
| + AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
| + kAudioSampleRate, kNumBitsPerAudioSample, |
| + samples_per_packet); |
| + audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
| + |
| + if (audio_controller_.get() == NULL) { |
| + return AbortWithError( |
| + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
| + } |
| + |
| + // The endpointer needs to estimate the environment/background noise before |
| + // starting to treat the audio as user input. We wait in the state |
| + // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching |
| + // to user input mode. |
| + endpointer_.SetEnvironmentEstimationMode(); |
| + audio_controller_->Record(); |
| + return STATE_STARTING; |
| +} |
| + |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { |
| + // This is the first audio packet captured, so the recognition engine is |
| + // started and the delegate notified about the event. |
|
hans
2012/04/02 16:05:59
s/notifies/notified/
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
Done.
|
| + DCHECK(recognition_engine_.get()); |
| + recognition_engine_->StartRecognition(); |
| + listener_->OnAudioStart(caller_id_); |
| + |
| + // This is a little hack, since TakeAudioChunk() is already called by |
| + // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping |
| + // the first audio chunk captured after opening the audio device. |
| + recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); |
| + return STATE_ESTIMATING_ENVIRONMENT; |
| +} |
| + |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { |
| + DCHECK(endpointer_.IsEstimatingEnvironment()); |
| + if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { |
| + endpointer_.SetUserInputMode(); |
| + listener_->OnEnvironmentEstimationComplete(caller_id_); |
| + return STATE_WAITING_FOR_SPEECH; |
| + } else { |
| + return STATE_ESTIMATING_ENVIRONMENT; |
| + } |
| +} |
| + |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { |
| + if (endpointer_.DidStartReceivingSpeech()) { |
| listener_->OnSoundStart(caller_id_); |
| + return STATE_RECOGNIZING; |
| + } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { |
| + return AbortWithError( |
| + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); |
| + } else { |
| + return STATE_WAITING_FOR_SPEECH; |
| + } |
| +} |
| - // Calculate the input volume to display in the UI, smoothing towards the |
| - // new level. |
| - float level = (rms - kAudioMeterMinDb) / |
| - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| - level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
| - if (level > audio_level_) { |
| - audio_level_ += (level - audio_level_) * kUpSmoothingFactor; |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { |
| + if (endpointer_.speech_input_complete()) { |
| + return StopCaptureAndWaitResult(event_args); |
| } else { |
| - audio_level_ += (level - audio_level_) * kDownSmoothingFactor; |
| + return STATE_RECOGNIZING; |
| } |
| +} |
| - float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| - noise_level = std::min(std::max(0.0f, noise_level), |
| - kAudioMeterRangeMaxUnclipped); |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::StopCaptureAndWaitResult(const FSMEventArgs&) { |
| + DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); |
| + |
| + VLOG(1) << "Concluding recognition"; |
| + CloseAudioControllerAsynchronously(); |
| + recognition_engine_->AudioChunksEnded(); |
| - listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, |
| - noise_level); |
| + if (state_ > STATE_WAITING_FOR_SPEECH) |
| + listener_->OnSoundEnd(caller_id_); |
| - if (endpointer_.speech_input_complete()) |
| - StopAudioCapture(); |
| + listener_->OnAudioEnd(caller_id_); |
| + return STATE_WAITING_FINAL_RESULT; |
| } |
| -void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) { |
| + // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of |
| + // other specific error sources (so that it was an explicit abort request). |
| + // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers |
| + // and currently would cause an exception. JS will probably need it in future. |
| + if (event_args.event == EVENT_AUDIO_ERROR) { |
| + return AbortWithError( |
| + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
| + } else if (event_args.event == EVENT_ENGINE_ERROR) { |
| + return AbortWithError(event_args.engine_error); |
| + } |
| + return AbortWithError(NULL); |
| +} |
| + |
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( |
| + const SpeechRecognitionError& error) { |
| + return AbortWithError(&error); |
| +} |
| + |
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( |
| + const SpeechRecognitionError* error) { |
| + if (IsCapturingAudio()) |
| + CloseAudioControllerAsynchronously(); |
| + |
| + VLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
| + |
| + // The recognition engine is initialized only after STATE_STARTING. |
| + if (state_ > STATE_STARTING) { |
| + DCHECK(recognition_engine_.get()); |
| + recognition_engine_->EndRecognition(); |
| + } |
| + |
| + if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
| + listener_->OnSoundEnd(caller_id_); |
| + |
| + if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
| + listener_->OnAudioEnd(caller_id_); |
| + |
| + if (error != NULL) |
| + listener_->OnRecognitionError(caller_id_, *error); |
|
hans
2012/04/02 16:05:59
just a thought (maybe for the future).. i wonder w
Primiano Tucci (use gerrit)
2012/04/03 10:16:39
We should think on the implications that it might
|
| -void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( |
| - const content::SpeechRecognitionResult& result) { |
| - // Guard against the listener freeing us until we finish our job. |
| - scoped_refptr<SpeechRecognizerImpl> me(this); |
| - listener_->OnRecognitionResult(caller_id_, result); |
| listener_->OnRecognitionEnd(caller_id_); |
| + |
| + return STATE_IDLE; |
| } |
| -void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( |
| - const content::SpeechRecognitionError& error) { |
| - InformErrorAndAbortRecognition(error.code); |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { |
| +// This is in preparation for future speech recognition functions. |
| + NOTREACHED(); |
| + return state_; |
| } |
| -void SpeechRecognizerImpl::InformErrorAndAbortRecognition( |
| - content::SpeechRecognitionErrorCode error) { |
| - DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); |
| - AbortRecognition(); |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { |
| + const SpeechRecognitionResult& result = event_args.engine_result; |
| + VLOG(1) << "Got valid result"; |
| + recognition_engine_->EndRecognition(); |
| + listener_->OnRecognitionResult(caller_id_, result); |
| + listener_->OnRecognitionEnd(caller_id_); |
| + return STATE_IDLE; |
| +} |
| - // Guard against the listener freeing us until we finish our job. |
| - scoped_refptr<SpeechRecognizerImpl> me(this); |
| - listener_->OnRecognitionError(caller_id_, error); |
| +SpeechRecognizerImpl::FSMState |
| +SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { |
| + return state_; // Just keep the current state. |
| } |
| void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { |
| - VLOG(1) << "SpeechRecognizer stopping record."; |
| + DCHECK(IsCapturingAudio()); |
| + VLOG(1) << "SpeechRecognizerImpl stopping audio capture."; |
| // Issues a Close on the audio controller, passing an empty callback. The only |
| // purpose of such callback is to keep the audio controller refcounted until |
| // Close has completed (in the audio thread) and automatically destroy it |
| @@ -337,12 +543,32 @@ void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { |
| audio_controller_ = NULL; // The controller is still refcounted by Bind. |
| } |
| -bool SpeechRecognizerImpl::IsActive() const { |
| - return (recognition_engine_.get() != NULL); |
| +int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
| + return (num_samples_recorded_ * 1000) / kAudioSampleRate; |
| } |
| -bool SpeechRecognizerImpl::IsCapturingAudio() const { |
| - return (audio_controller_.get() != NULL); |
| +void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, |
| + bool clip_detected) { |
| + // Calculate the input volume to display in the UI, smoothing towards the |
| + // new level. |
| + // TODO(primiano) Do we really need all this floating point arith here? |
| + // Perhaps it might be quite expensive on mobile. |
| + float level = (rms - kAudioMeterMinDb) / |
| + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| + level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
| + if (level > audio_level_) { |
| + audio_level_ += (level - audio_level_) * kUpSmoothingFactor; |
| + } else { |
| + audio_level_ += (level - audio_level_) * kDownSmoothingFactor; |
| + } |
| + |
| + float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| + noise_level = std::min(std::max(0.0f, noise_level), |
| + kAudioMeterRangeMaxUnclipped); |
| + |
| + listener_->OnAudioLevelsChange( |
| + caller_id_, clip_detected ? 1.0f : audio_level_, noise_level); |
| } |
| const SpeechRecognitionEngine& |
| @@ -355,5 +581,13 @@ void SpeechRecognizerImpl::SetAudioManagerForTesting( |
| testing_audio_manager_ = audio_manager; |
| } |
| +SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs() |
| + : audio_error_code(0), |
| + audio_data(NULL), |
| + engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) { |
| +} |
| + |
| +SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| +} |
| } // namespace speech |