| Index: content/browser/speech/speech_recognizer_impl.cc
|
| diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc
|
| index 84f46a5e69f593735d128a27b7c288ccaa381cff..a8abc2daa54ca3e7d5ffcdcdf1592485bf69d47a 100644
|
| --- a/content/browser/speech/speech_recognizer_impl.cc
|
| +++ b/content/browser/speech/speech_recognizer_impl.cc
|
| @@ -8,20 +8,29 @@
|
| #include "base/time.h"
|
| #include "content/browser/browser_main_loop.h"
|
| #include "content/browser/speech/audio_buffer.h"
|
| -#include "content/public/browser/speech_recognition_event_listener.h"
|
| +#include "content/browser/speech/google_one_shot_remote_engine.h"
|
| #include "content/public/browser/browser_thread.h"
|
| +#include "content/public/browser/speech_recognition_event_listener.h"
|
| +#include "content/public/browser/speech_recognizer.h"
|
| #include "content/public/common/speech_recognition_result.h"
|
| #include "net/url_request/url_request_context_getter.h"
|
|
|
| +#define UNREACHABLE_CONDITION() do { NOTREACHED(); return state_; } while (0)
|
| +
|
| using content::BrowserMainLoop;
|
| using content::BrowserThread;
|
| +using content::SpeechRecognitionError;
|
| using content::SpeechRecognitionEventListener;
|
| +using content::SpeechRecognitionResult;
|
| using content::SpeechRecognizer;
|
| using media::AudioInputController;
|
| -using std::string;
|
|
|
| +// TODO(primiano) What about a watchdog here to avoid getting stuck if the
|
| +// SpeechRecognitionEngine does not deliver a result within a reasonable time?
|
| namespace {
|
| -
|
| +// Enables a spontaneous transition from kWaitingForSpeech to
|
| +// kRecognizingSpeech, which is required by the mock recognition engine,
|
| +// since it sends fake results without any real speech input.
|
| +const bool kSkipSilenceDetectionForTesting = false;
|
| // The following constants are related to the volume level indicator shown in
|
| // the UI for recorded audio.
|
| // Multiplier used when new volume is greater than previous level.
|
| @@ -44,6 +53,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) {
|
| const int16* samples = chunk.SamplesData16();
|
| const int kThreshold = num_samples / 20;
|
| int clipping_samples = 0;
|
| +
|
| for (int i = 0; i < num_samples; ++i) {
|
| if (samples[i] <= -32767 || samples[i] >= 32767) {
|
| if (++clipping_samples > kThreshold)
|
| @@ -55,6 +65,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) {
|
|
|
| } // namespace
|
|
|
| +// TODO(primiano) transitional, see description in speech_recognizer.h.
|
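| +// For illustration, a hypothetical caller of this factory (assuming the
|
| +// recognizer is held in a scoped_refptr, since the class is ref-counted):
|
| +//   scoped_refptr<SpeechRecognizer> recognizer(SpeechRecognizer::Create(
|
| +//       listener, caller_id, language, grammar, context_getter,
|
| +//       false /* filter_profanities */, hardware_info, origin_url));
|
| +//   recognizer->StartRecognition();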
| SpeechRecognizer* SpeechRecognizer::Create(
|
| SpeechRecognitionEventListener* listener,
|
| int caller_id,
|
| @@ -64,269 +75,486 @@ SpeechRecognizer* SpeechRecognizer::Create(
|
| bool filter_profanities,
|
| const std::string& hardware_info,
|
| const std::string& origin_url) {
|
| - return new speech::SpeechRecognizerImpl(
|
| - listener, caller_id, language, grammar, context_getter,
|
| - filter_profanities, hardware_info, origin_url);
|
| + speech::GoogleOneShotRemoteEngineConfig google_sr_config;
|
| + google_sr_config.language = language;
|
| + google_sr_config.grammar = grammar;
|
| + google_sr_config.audio_sample_rate =
|
| + speech::SpeechRecognizerImpl::kAudioSampleRate;
|
| + google_sr_config.audio_num_bits_per_sample =
|
| + speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;
|
| + google_sr_config.filter_profanities = filter_profanities;
|
| + google_sr_config.hardware_info = hardware_info;
|
| + google_sr_config.origin_url = origin_url;
|
| +
|
| + speech::GoogleOneShotRemoteEngine* google_sr_engine =
|
| + new speech::GoogleOneShotRemoteEngine(context_getter);
|
| + google_sr_engine->SetConfiguration(google_sr_config);
|
| +
|
| + return new speech::SpeechRecognizerImpl(listener,
|
| + caller_id,
|
| + google_sr_engine);
|
| }
|
|
|
| namespace speech {
|
| -
|
| const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
|
| -const int SpeechRecognizerImpl::kAudioPacketIntervalMs = 100;
|
| const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;
|
| const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
|
| -const int SpeechRecognizerImpl::kNoSpeechTimeoutSec = 8;
|
| +const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
|
| const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
|
|
|
| SpeechRecognizerImpl::SpeechRecognizerImpl(
|
| SpeechRecognitionEventListener* listener,
|
| int caller_id,
|
| - const std::string& language,
|
| - const std::string& grammar,
|
| - net::URLRequestContextGetter* context_getter,
|
| - bool filter_profanities,
|
| - const std::string& hardware_info,
|
| - const std::string& origin_url)
|
| + SpeechRecognitionEngine* engine)
|
| : listener_(listener),
|
| - caller_id_(caller_id),
|
| - language_(language),
|
| - grammar_(grammar),
|
| - filter_profanities_(filter_profanities),
|
| - hardware_info_(hardware_info),
|
| - origin_url_(origin_url),
|
| - context_getter_(context_getter),
|
| - codec_(AudioEncoder::CODEC_FLAC),
|
| - encoder_(NULL),
|
| + testing_audio_manager_(NULL),
|
| + recognition_engine_(engine),
|
| endpointer_(kAudioSampleRate),
|
| - num_samples_recorded_(0),
|
| - audio_level_(0.0f),
|
| - audio_manager_(NULL) {
|
| + caller_id_(caller_id),
|
| + event_dispatch_nesting_level_(0),
|
| + state_(kIdle),
|
| + event_args_(NULL) {
|
| + DCHECK(listener_ != NULL);
|
| + DCHECK(recognition_engine_ != NULL);
|
| endpointer_.set_speech_input_complete_silence_length(
|
| base::Time::kMicrosecondsPerSecond / 2);
|
| endpointer_.set_long_speech_input_complete_silence_length(
|
| base::Time::kMicrosecondsPerSecond);
|
| endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
|
| endpointer_.StartSession();
|
| + recognition_engine_->set_delegate(this);
|
| }
|
|
|
| SpeechRecognizerImpl::~SpeechRecognizerImpl() {
|
| - // Recording should have stopped earlier due to the endpointer or
|
| - // |StopRecording| being called.
|
| - DCHECK(!audio_controller_.get());
|
| - DCHECK(!request_.get() || !request_->HasPendingRequest());
|
| - DCHECK(!encoder_.get());
|
| endpointer_.EndSession();
|
| }
|
|
|
| -bool SpeechRecognizerImpl::StartRecognition() {
|
| - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
|
| - DCHECK(!audio_controller_.get());
|
| - DCHECK(!request_.get() || !request_->HasPendingRequest());
|
| - DCHECK(!encoder_.get());
|
| -
|
| - // The endpointer needs to estimate the environment/background noise before
|
| - // starting to treat the audio as user input. In |HandleOnData| we wait until
|
| - // such time has passed before switching to user input mode.
|
| - endpointer_.SetEnvironmentEstimationMode();
|
| -
|
| - encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
|
| - kNumBitsPerAudioSample));
|
| - int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
|
| - AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
|
| - kAudioSampleRate, kNumBitsPerAudioSample,
|
| - samples_per_packet);
|
| - audio_controller_ = AudioInputController::Create(
|
| - audio_manager_ ? audio_manager_ : BrowserMainLoop::GetAudioManager(),
|
| - this, params);
|
| - DCHECK(audio_controller_.get());
|
| - VLOG(1) << "SpeechRecognizer starting record.";
|
| - num_samples_recorded_ = 0;
|
| - audio_controller_->Record();
|
| -
|
| - return true;
|
| +// ------- Methods that trigger Finite State Machine (FSM) events ------------
|
| +
|
| +// NOTE: all external events and requests should be enqueued (via PostTask),
|
| +// even when they come from the same (IO) thread, in order to preserve the
|
| +// causal ordering of events.
|
| +// Imagine what would happen if a Start had been enqueued from another thread
|
| +// (but not yet processed) and we suddenly issued a Stop from the IO thread.
|
| +// Furthermore, even if start and stop requests are never interleaved, mixing
|
| +// asynchronous event processing with synchronous callbacks can cause
|
| +// surprising side effects.
|
| +// For instance, if a caller could invoke Abort synchronously (instead of
|
| +// posting the event to the queue), it would receive interleaved callbacks
|
| +// (e.g. an error or the audio-end event) before the Abort call itself
|
| +// returned. Is the caller's code ready for that?
|
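| +//
|
| +// To illustrate (hypothetical caller code, not part of this change): even
|
| +// from the IO thread,
|
| +//   recognizer->StartRecognition();  // Enqueues kStartRequest.
|
| +//   recognizer->AbortRecognition();  // Enqueues kAbortRequest.
|
| +// is guaranteed to be processed in that order, whereas a synchronous
|
| +// DispatchEvent(kAbortRequest, ...) could overtake a still-queued
|
| +// kStartRequest and invert the causal order.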
| +
|
| +void SpeechRecognizerImpl::StartRecognition() {
|
| + FSMEventArgs args;
|
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
|
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent,
|
| + this, kStartRequest, args));
|
| }
|
|
|
| void SpeechRecognizerImpl::AbortRecognition() {
|
| - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
|
| - DCHECK(audio_controller_.get() || request_.get());
|
| -
|
| - // Stop recording if required.
|
| - if (audio_controller_.get()) {
|
| - CloseAudioControllerSynchronously();
|
| - }
|
| -
|
| - VLOG(1) << "SpeechRecognizer canceling recognition.";
|
| - encoder_.reset();
|
| - request_.reset();
|
| + FSMEventArgs args;
|
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
|
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent,
|
| + this, kAbortRequest, args));
|
| }
|
|
|
| void SpeechRecognizerImpl::StopAudioCapture() {
|
| - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
|
| -
|
| - // If audio recording has already stopped and we are in recognition phase,
|
| - // silently ignore any more calls to stop recording.
|
| - if (!audio_controller_.get())
|
| - return;
|
| -
|
| - CloseAudioControllerSynchronously();
|
| + FSMEventArgs args;
|
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
|
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent,
|
| + this, kStopCaptureRequest, args));
|
| +}
|
|
|
| - listener_->OnSoundEnd(caller_id_);
|
| - listener_->OnAudioEnd(caller_id_);
|
| +bool SpeechRecognizerImpl::IsActive() const {
|
| + // Checking the FSM state from another thread (i.e. while the FSM may be
|
| + // evolving concurrently) is meaningless.
|
| + // If you are doing it, you probably have a design issue.
|
| + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
|
| + return state_ != kIdle;
|
| +}
|
|
|
| - // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
|
| - // of silence in case encoder had no data already.
|
| - std::vector<short> samples((kAudioSampleRate * kAudioPacketIntervalMs) /
|
| - 1000);
|
| - AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),
|
| - samples.size() * sizeof(short),
|
| - encoder_->bits_per_sample() / 8);
|
| - encoder_->Encode(dummy_chunk);
|
| - encoder_->Flush();
|
| - scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
|
| - DCHECK(!encoded_data->IsEmpty());
|
| - encoder_.reset();
|
| -
|
| - // If we haven't got any audio yet end the recognition sequence here.
|
| - if (request_ == NULL) {
|
| - // Guard against the listener freeing us until we finish our job.
|
| - scoped_refptr<SpeechRecognizerImpl> me(this);
|
| - listener_->OnRecognitionEnd(caller_id_);
|
| - } else {
|
| - request_->UploadAudioChunk(*encoded_data, true /* is_last_chunk */);
|
| - }
|
| +bool SpeechRecognizerImpl::IsCapturingAudio() const {
|
| + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive().
|
| + return state_ >= kStartingRecognition && state_ <= kRecognizingSpeech;
|
| }
|
|
|
| // Invoked in the audio thread.
|
| void SpeechRecognizerImpl::OnError(AudioInputController* controller,
|
| int error_code) {
|
| + FSMEventArgs args;
|
| + args.audio_error_code = error_code;
|
| BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
|
| - base::Bind(&SpeechRecognizerImpl::HandleOnError,
|
| - this, error_code));
|
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent,
|
| + this, kAudioError, args));
|
| }
|
|
|
| -void SpeechRecognizerImpl::HandleOnError(int error_code) {
|
| - LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
|
| -
|
| - // Check if we are still recording before canceling recognition, as
|
| - // recording might have been stopped after this error was posted to the queue
|
| - // by |OnError|.
|
| - if (!audio_controller_.get())
|
| +void SpeechRecognizerImpl::OnData(AudioInputController* controller,
|
| + const uint8* data, uint32 size) {
|
| + if (size == 0) // This can happen when audio capture stops; it is normal.
|
| return;
|
|
|
| - InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);
|
| + FSMEventArgs args;
|
| + args.audio_data = new AudioChunk(data, static_cast<size_t>(size),
|
| + kNumBitsPerAudioSample / 8);
|
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
|
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent,
|
| + this, kAudioData, args));
|
| }
|
|
|
| -void SpeechRecognizerImpl::OnData(AudioInputController* controller,
|
| - const uint8* data, uint32 size) {
|
| - if (size == 0) // This could happen when recording stops and is normal.
|
| - return;
|
| - AudioChunk* raw_audio = new AudioChunk(data, static_cast<size_t>(size),
|
| - kNumBitsPerAudioSample / 8);
|
| +void SpeechRecognizerImpl::OnSpeechEngineResult(
|
| + const content::SpeechRecognitionResult& result) {
|
| + FSMEvent event = kRecognitionResult;
|
| + FSMEventArgs args;
|
| + args.speech_result = result;
|
| BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
|
| - base::Bind(&SpeechRecognizerImpl::HandleOnData,
|
| - this, raw_audio));
|
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent,
|
| + this, event, args));
|
| }
|
|
|
| -void SpeechRecognizerImpl::HandleOnData(AudioChunk* raw_audio) {
|
| - scoped_ptr<AudioChunk> free_raw_audio_on_return(raw_audio);
|
| - // Check if we are still recording and if not discard this buffer, as
|
| - // recording might have been stopped after this buffer was posted to the queue
|
| - // by |OnData|.
|
| - if (!audio_controller_.get())
|
| - return;
|
| +void SpeechRecognizerImpl::OnSpeechEngineError(
|
| + const content::SpeechRecognitionError& error) {
|
| + FSMEvent event = kRecognitionError;
|
| + FSMEventArgs args;
|
| + args.error = error;
|
| + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
|
| + base::Bind(&SpeechRecognizerImpl::DispatchEvent,
|
| + this, event, args));
|
| +}
|
| +
|
| +// ----------------------- Core FSM implementation ---------------------------
|
| +
|
| +void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) {
|
| + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
|
| + DCHECK_LE(event, kMaxEvent);
|
| + DCHECK_LE(state_, kMaxState);
|
| + // Event dispatching must be sequential; otherwise it would break the rules
|
| + // and assumptions of the finite state machine model.
|
| + DCHECK_EQ(event_dispatch_nesting_level_, 0);
|
| + ++event_dispatch_nesting_level_;
|
| + // Guard against the delegate freeing us until we finish processing the event.
|
| + scoped_refptr<SpeechRecognizerImpl> me(this);
|
|
|
| - bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();
|
| -
|
| - encoder_->Encode(*raw_audio);
|
| - float rms;
|
| - endpointer_.ProcessAudio(*raw_audio, &rms);
|
| - bool did_clip = DetectClipping(*raw_audio);
|
| - num_samples_recorded_ += raw_audio->NumSamples();
|
| -
|
| - if (request_ == NULL) {
|
| - // This was the first audio packet recorded, so start a request to the
|
| - // server to send the data and inform the listener.
|
| - listener_->OnAudioStart(caller_id_);
|
| - request_.reset(new SpeechRecognitionRequest(context_getter_.get(), this));
|
| - request_->Start(language_, grammar_, filter_profanities_,
|
| - hardware_info_, origin_url_, encoder_->mime_type());
|
| + event_ = event;
|
| + event_args_ = &args;
|
| +
|
| + if (event == kAudioData)
|
| + ProcessAudioPipeline();
|
| + // The audio pipeline must be processed before ProcessEvent, otherwise the
|
| + // pipeline would route audio according to the future state and not the
|
| + // current one.
|
| + state_ = ProcessEvent(event);
|
| +
|
| + // Clean up the event args.
|
| + if (args.audio_data)
|
| + delete args.audio_data;
|
| + event_args_ = NULL;
|
| + --event_dispatch_nesting_level_;
|
| +}
|
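| +// Note: FSMEventArgs is bound by value in the PostTask callbacks above; the
|
| +// copy carries the raw AudioChunk pointer, which DispatchEvent deletes once
|
| +// the event has been processed, so each posted kAudioData owns its chunk.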
| +
|
| +// ----------- Contract for all the FSM evolution functions below -------------
|
| +// - They are guaranteed to be executed on the IO thread;
|
| +// - They are guaranteed not to be reentrant (with themselves or each other);
|
| +// - event_args_ is guaranteed to be non-NULL;
|
| +// - event_args_ members are guaranteed to be stable during the call;
|
| +// - The class won't be freed in the meantime by callbacks.
|
| +
|
| +// TODO(primiano) the audio pipeline is currently serial. However, the
|
| +// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
|
| +// We should profile the execution to see whether it would be worthwhile.
|
| +void SpeechRecognizerImpl::ProcessAudioPipeline() {
|
| + const bool always = true;
|
| + const bool route_audio_to_clipper = always;
|
| + const bool route_audio_to_endpointer = state_ >= kEstimatingEnvironment &&
|
| + state_ <= kRecognizingSpeech;
|
| + const bool route_audio_to_sr_engine = route_audio_to_endpointer;
|
| + const bool route_audio_to_vumeter = state_ >= kWaitingForSpeech &&
|
| + state_ <= kRecognizingSpeech;
|
| +
|
| + AudioChunk& recorded_audio_data = *(event_args_->audio_data);
|
| +
|
| + num_samples_recorded_ += recorded_audio_data.NumSamples();
|
| +
|
| + if (route_audio_to_clipper) {
|
| + clipper_detected_clip_ = DetectClipping(recorded_audio_data);
|
| + }
|
| + if (route_audio_to_endpointer) {
|
| + endpointer_.ProcessAudio(recorded_audio_data, &rms_);
|
| }
|
| + if (route_audio_to_vumeter) {
|
| + DCHECK(route_audio_to_endpointer); // Depends on endpointer due to |rms_|.
|
| + UpdateSignalAndNoiseLevels(rms_);
|
| + }
|
| + if (route_audio_to_sr_engine) {
|
| + DCHECK(recognition_engine_.get());
|
| + recognition_engine_->TakeAudioChunk(recorded_audio_data);
|
| + }
|
| +}
|
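| +// Note: the range checks above (and in IsCapturingAudio()) assume that
|
| +// FSMState declares the states in chronological order: kIdle <
|
| +// kStartingRecognition < kEstimatingEnvironment < kWaitingForSpeech <
|
| +// kRecognizingSpeech < kWaitingFinalResult.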
|
|
| - scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
|
| - DCHECK(!encoded_data->IsEmpty());
|
| - request_->UploadAudioChunk(*encoded_data, false /* is_last_chunk */);
|
| -
|
| - if (endpointer_.IsEstimatingEnvironment()) {
|
| - // Check if we have gathered enough audio for the endpointer to do
|
| - // environment estimation and should move on to detect speech/end of speech.
|
| - if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
|
| - kAudioSampleRate) / 1000) {
|
| - endpointer_.SetUserInputMode();
|
| - listener_->OnEnvironmentEstimationComplete(caller_id_);
|
| - }
|
| - return; // No more processing since we are still estimating environment.
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessEvent(
|
| + FSMEvent event) {
|
| + switch (state_) {
|
| + case kIdle:
|
| + switch (event) {
|
| + // TODO(primiano) restore UNREACHABLE_CONDITION above when speech
|
| + // input extensions are fixed.
|
| + case kAbortRequest: return DoNothing(); // UNREACHABLE_CONDITION();
|
| + case kStartRequest: return InitializeAndStartRecording();
|
| + case kStopCaptureRequest: return DoNothing(); // UNREACHABLE_CONDITION();
|
| + case kAudioData: return DoNothing(); // Corner cases related to
|
| + case kRecognitionResult: return DoNothing(); // queued messages being
|
| + case kRecognitionError: return DoNothing(); // dispatched late.
|
| + case kAudioError: return DoNothing();
|
| + }
|
| + break;
|
| + case kStartingRecognition:
|
| + switch (event) {
|
| + case kAbortRequest: return Abort();
|
| + case kStartRequest: UNREACHABLE_CONDITION();
|
| + case kStopCaptureRequest: return Abort();
|
| + case kAudioData: return StartSpeechRecognition();
|
| + case kRecognitionResult: UNREACHABLE_CONDITION();
|
| + case kRecognitionError: return Abort();
|
| + case kAudioError: return Abort();
|
| + }
|
| + break;
|
| + case kEstimatingEnvironment:
|
| + switch (event) {
|
| + case kAbortRequest: return Abort();
|
| + case kStartRequest: UNREACHABLE_CONDITION();
|
| + case kStopCaptureRequest: return StopCaptureAndWaitForResult();
|
| + case kAudioData: return EnvironmentEstimation();
|
| + case kRecognitionResult: return ProcessIntermediateRecognitionResult();
|
| + case kRecognitionError: return Abort();
|
| + case kAudioError: return Abort();
|
| + }
|
| + break;
|
| + case kWaitingForSpeech:
|
| + switch (event) {
|
| + case kAbortRequest: return Abort();
|
| + case kStartRequest: UNREACHABLE_CONDITION();
|
| + case kStopCaptureRequest: return StopCaptureAndWaitForResult();
|
| + case kAudioData: return DetectUserSpeechOrTimeout();
|
| + case kRecognitionResult: return ProcessIntermediateRecognitionResult();
|
| + case kRecognitionError: return Abort();
|
| + case kAudioError: return Abort();
|
| + }
|
| + break;
|
| + case kRecognizingSpeech:
|
| + switch (event) {
|
| + case kAbortRequest: return Abort();
|
| + case kStartRequest: UNREACHABLE_CONDITION();
|
| + case kStopCaptureRequest: return StopCaptureAndWaitForResult();
|
| + case kAudioData: return DetectEndOfSpeech();
|
| + case kRecognitionResult: return ProcessIntermediateRecognitionResult();
|
| + case kRecognitionError: return Abort();
|
| + case kAudioError: return Abort();
|
| + }
|
| + break;
|
| + case kWaitingFinalResult:
|
| + switch (event) {
|
| + case kAbortRequest: return Abort();
|
| + case kStartRequest: UNREACHABLE_CONDITION();
|
| + case kStopCaptureRequest: return DoNothing();
|
| + case kAudioData: return DoNothing();
|
| + case kRecognitionResult: return ProcessFinalRecognitionResult();
|
| + case kRecognitionError: return Abort();
|
| + case kAudioError: return Abort();
|
| + }
|
| + break;
|
| }
|
| + UNREACHABLE_CONDITION();
|
| +}
|
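| +// For reference, a sketch of the transitions encoded above (happy path):
|
| +//   kIdle                  --kStartRequest-->       kStartingRecognition
|
| +//   kStartingRecognition   --first kAudioData-->    kEstimatingEnvironment
|
| +//   kEstimatingEnvironment --300 ms of audio-->     kWaitingForSpeech
|
| +//   kWaitingForSpeech      --speech detected-->     kRecognizingSpeech
|
| +//   kRecognizingSpeech     --end of speech-->       kWaitingFinalResult
|
| +//   kWaitingFinalResult    --kRecognitionResult-->  kIdle
|
| +// A kStopCaptureRequest while capturing also leads to kWaitingFinalResult,
|
| +// while aborts, errors and the no-speech timeout lead back to kIdle.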
|
|
| - // Check if we have waited too long without hearing any speech.
|
| - bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();
|
| - if (!speech_was_heard_after_packet &&
|
| - num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
|
| - InformErrorAndAbortRecognition(
|
| - content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
|
| - return;
|
| +SpeechRecognizerImpl::FSMState
|
| +SpeechRecognizerImpl::InitializeAndStartRecording() {
|
| + DCHECK(recognition_engine_.get());
|
| + DCHECK(audio_controller_.get() == NULL);
|
| + AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
|
| + testing_audio_manager_ :
|
| + BrowserMainLoop::GetAudioManager();
|
| + DCHECK(audio_manager != NULL);
|
| +
|
| + VLOG(1) << "SpeechRecognizerImpl starting audio capture.";
|
| + num_samples_recorded_ = 0;
|
| + rms_ = 0;
|
| + audio_level_ = 0;
|
| + clipper_detected_clip_ = false;
|
| + listener_->OnRecognitionStart(caller_id_);
|
| +
|
| + if (!audio_manager->HasAudioInputDevices()) {
|
| + return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO,
|
| + content::AUDIO_ERROR_NO_MIC));
|
| + }
|
| +
|
| + if (audio_manager->IsRecordingInProcess()) {
|
| + return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO,
|
| + content::AUDIO_ERROR_MIC_IN_USE));
|
| + }
|
| +
|
| + const int samples_per_packet = kAudioSampleRate *
|
| + recognition_engine_->GetDesiredAudioChunkDurationMs() / 1000;
|
| + AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
|
| + kAudioSampleRate, kNumBitsPerAudioSample,
|
| + samples_per_packet);
|
| + audio_controller_ = AudioInputController::Create(audio_manager, this, params);
|
| +
|
| + if (audio_controller_.get() == NULL) {
|
| + return Abort(
|
| + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
|
| + }
|
| +
|
| + // The endpointer needs to estimate the environment/background noise before
|
| + // starting to treat the audio as user input. We wait in the
|
| + // kEstimatingEnvironment state until that estimation interval
|
| + // (kEndpointerEstimationTimeMs) has elapsed, then switch to user input mode.
|
| + endpointer_.SetEnvironmentEstimationMode();
|
| + audio_controller_->Record();
|
| + return kStartingRecognition;
|
| +}
|
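| +// For instance, if the engine asks for the previous 100 ms chunk duration
|
| +// (an assumption; see GetDesiredAudioChunkDurationMs()), samples_per_packet
|
| +// above is 16000 * 100 / 1000 = 1600 samples, i.e. 3200 bytes per packet at
|
| +// 16 bits per sample.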
| +
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartSpeechRecognition() {
|
| + // This was the first audio packet recorded, so start forwarding the data
|
| + // to the engine and inform the listener that audio capture has started.
|
| + DCHECK(recognition_engine_.get());
|
| + recognition_engine_->Initialize();
|
| + listener_->OnAudioStart(caller_id_);
|
| + // TODO(primiano) this is a little hack, since TakeAudioChunk() is already
|
| + // called by ProcessAudioPipeline(). I hate it since it weakens the
|
| + // architectural beauty of this class. But it is the best tradeoff, unless
|
| + // we allow dropping the first audio chunk captured after opening the audio
|
| + // device.
|
| + recognition_engine_->TakeAudioChunk(*(event_args_->audio_data));
|
| + return kEstimatingEnvironment;
|
| +}
|
| +
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::EnvironmentEstimation() {
|
| + DCHECK(endpointer_.IsEstimatingEnvironment());
|
| + if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
|
| + endpointer_.SetUserInputMode();
|
| + listener_->OnEnvironmentEstimationComplete(caller_id_);
|
| + return kWaitingForSpeech;
|
| + } else {
|
| + return kEstimatingEnvironment;
|
| }
|
| +}
|
|
|
| - if (!speech_was_heard_before_packet && speech_was_heard_after_packet)
|
| +SpeechRecognizerImpl::FSMState
|
| +SpeechRecognizerImpl::DetectUserSpeechOrTimeout() {
|
| + if (kSkipSilenceDetectionForTesting)
|
| + return kRecognizingSpeech;
|
| +
|
| + if (endpointer_.DidStartReceivingSpeech()) {
|
| listener_->OnSoundStart(caller_id_);
|
| + return kRecognizingSpeech;
|
| + } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
|
| + return Abort(
|
| + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));
|
| + } else {
|
| + return kWaitingForSpeech;
|
| + }
|
| +}
|
|
|
| - // Calculate the input volume to display in the UI, smoothing towards the
|
| - // new level.
|
| - float level = (rms - kAudioMeterMinDb) /
|
| - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
|
| - level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
|
| - if (level > audio_level_) {
|
| - audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech() {
|
| + if (endpointer_.speech_input_complete()) {
|
| + return StopCaptureAndWaitForResult();
|
| } else {
|
| - audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
|
| + return kRecognizingSpeech;
|
| }
|
| +}
|
|
|
| - float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
|
| - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
|
| - noise_level = std::min(std::max(0.0f, noise_level),
|
| - kAudioMeterRangeMaxUnclipped);
|
| +SpeechRecognizerImpl::FSMState
|
| +SpeechRecognizerImpl::StopCaptureAndWaitForResult() {
|
| + DCHECK(state_ >= kEstimatingEnvironment && state_ <= kRecognizingSpeech);
|
| +
|
| + VLOG(1) << "Concluding recognition";
|
| + CloseAudioControllerSynchronously();
|
| + recognition_engine_->AudioChunksEnded();
|
|
|
| - listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,
|
| - noise_level);
|
| + if (state_ > kWaitingForSpeech)
|
| + listener_->OnSoundEnd(caller_id_);
|
|
|
| - if (endpointer_.speech_input_complete())
|
| - StopAudioCapture();
|
| + listener_->OnAudioEnd(caller_id_);
|
| + return kWaitingFinalResult;
|
| }
|
|
|
| -void SpeechRecognizerImpl::SetRecognitionResult(
|
| - const content::SpeechRecognitionResult& result) {
|
| - if (result.error != content::SPEECH_RECOGNITION_ERROR_NONE) {
|
| - InformErrorAndAbortRecognition(result.error);
|
| - return;
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort() {
|
| + // TODO(primiano) We should raise SPEECH_RECOGNITION_ERROR_ABORTED in the
|
| + // absence of other specific error sources (i.e. when this was an explicit
|
| + // abort request). However, SPEECH_RECOGNITION_ERROR_ABORTED is not handled
|
| + // by the UI layers and currently would cause an exception. JS will probably
|
| + // need it in the future.
|
| + SpeechRecognitionError error;
|
| + bool has_error = false;
|
| + if (event_ == kAudioError) {
|
| + has_error = true;
|
| + error.code = content::SPEECH_RECOGNITION_ERROR_AUDIO;
|
| + } else if (event_ == kRecognitionError) {
|
| + has_error = true;
|
| + error = event_args_->error;
|
| }
|
| + return Abort(has_error, error);
|
| +}
|
| +
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
|
| + const SpeechRecognitionError& error) {
|
| + return Abort(true, error);
|
| +}
|
| +
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
|
| + bool has_error, const SpeechRecognitionError& error) {
|
| + if (audio_controller_)
|
| + CloseAudioControllerSynchronously();
|
| +
|
| + VLOG(1) << "SpeechRecognizerImpl canceling recognition. " <<
|
| + error.code << " " << error.details;
|
| +
|
| + // The recognition engine is initialized only after kStartingRecognition.
|
| + if (state_ > kStartingRecognition) {
|
| + DCHECK(recognition_engine_.get());
|
| + recognition_engine_->Cleanup();
|
| + // TODO(primiano) Reset the engine? Why, after all?
|
| + // recognition_engine_.reset();
|
| + }
|
| +
|
| + if (state_ > kWaitingForSpeech && state_ < kWaitingFinalResult)
|
| + listener_->OnSoundEnd(caller_id_);
|
| +
|
| + if (state_ > kStartingRecognition && state_ < kWaitingFinalResult)
|
| + listener_->OnAudioEnd(caller_id_);
|
| +
|
| + if (has_error)
|
| + listener_->OnRecognitionError(caller_id_, error);
|
|
|
| - // Guard against the listener freeing us until we finish our job.
|
| - scoped_refptr<SpeechRecognizerImpl> me(this);
|
| - listener_->OnRecognitionResult(caller_id_, result);
|
| listener_->OnRecognitionEnd(caller_id_);
|
| +
|
| + return kIdle;
|
| }
|
|
|
| -void SpeechRecognizerImpl::InformErrorAndAbortRecognition(
|
| - content::SpeechRecognitionErrorCode error) {
|
| - DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);
|
| - AbortRecognition();
|
| +SpeechRecognizerImpl::FSMState
|
| +SpeechRecognizerImpl::ProcessIntermediateRecognitionResult() {
|
| +// This is in preparation for future speech recognition features.
|
| +// DCHECK(continuous_mode_);
|
| +// const SpeechRecognitionResult& result = event_args_->speech_result;
|
| +// VLOG(1) << "Got intermediate result";
|
| +// listener_->OnRecognitionResult(caller_id_, result);
|
| + NOTREACHED();
|
| + return state_;
|
| +}
|
|
|
| - // Guard against the listener freeing us until we finish our job.
|
| - scoped_refptr<SpeechRecognizerImpl> me(this);
|
| - listener_->OnRecognitionError(caller_id_, error);
|
| +SpeechRecognizerImpl::FSMState
|
| +SpeechRecognizerImpl::ProcessFinalRecognitionResult() {
|
| + const SpeechRecognitionResult& result = event_args_->speech_result;
|
| + VLOG(1) << "Got valid result";
|
| + recognition_engine_->Cleanup();
|
| + listener_->OnRecognitionResult(caller_id_, result);
|
| + listener_->OnRecognitionEnd(caller_id_);
|
| + return kIdle;
|
| +}
|
| +
|
| +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DoNothing() const {
|
| + return state_; // Just keep the current state.
|
| }
|
|
|
| void SpeechRecognizerImpl::CloseAudioControllerSynchronously() {
|
| - VLOG(1) << "SpeechRecognizer stopping record.";
|
| + DCHECK(audio_controller_);
|
| + VLOG(1) << "SpeechRecognizerImpl stopping audio capture.";
|
|
|
| // TODO(satish): investigate the possibility to utilize the closure
|
| // and switch to async. version of this method. Compare with how
|
| @@ -338,17 +566,45 @@ void SpeechRecognizerImpl::CloseAudioControllerSynchronously() {
|
| audio_controller_ = NULL; // Releases the ref ptr.
|
| }
|
|
|
| -void SpeechRecognizerImpl::SetAudioManagerForTesting(
|
| - AudioManager* audio_manager) {
|
| - audio_manager_ = audio_manager;
|
| +int SpeechRecognizerImpl::GetElapsedTimeMs() const {
|
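| + // Derived from the number of samples captured so far: e.g. 4800 samples
|
| + // at kAudioSampleRate (16000 Hz) map to 4800 * 1000 / 16000 = 300 ms,
|
| + // which equals the kEndpointerEstimationTimeMs window.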
| + return num_samples_recorded_ * 1000 / kAudioSampleRate;
|
| }
|
|
|
| -bool SpeechRecognizerImpl::IsActive() const {
|
| - return (request_.get() != NULL);
|
| +void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms) {
|
| + // Calculate the input volume to display in the UI, smoothing towards the
|
| + // new level.
|
| + // TODO(primiano) Do we really need all this floating point arith here?
|
| + // Perhaps it might be quite expensive on mobile.
|
| + float level = (rms - kAudioMeterMinDb) /
|
| + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
|
| + level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
|
| + if (level > audio_level_) {
|
| + audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
|
| + } else {
|
| + audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
|
| + }
|
| +
|
| + float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
|
| + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
|
| + noise_level = std::min(std::max(0.0f, noise_level),
|
| + kAudioMeterRangeMaxUnclipped);
|
| +
|
| + listener_->OnAudioLevelsChange(
|
| + caller_id_, clipper_detected_clip_ ? 1.0f : audio_level_, noise_level);
|
| }
|
|
|
| -bool SpeechRecognizerImpl::IsCapturingAudio() const {
|
| - return (audio_controller_.get() != NULL);
|
| +const SpeechRecognitionEngine&
|
| + SpeechRecognizerImpl::recognition_engine() const {
|
| + return *(recognition_engine_.get());
|
| +}
|
| +
|
| +void SpeechRecognizerImpl::SetAudioManagerForTesting(
|
| + AudioManager* audio_manager) {
|
| + testing_audio_manager_ = audio_manager;
|
| +}
|
| +
|
| +SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs()
|
| + : audio_error_code(0), audio_data(NULL) {
|
| }
|
|
|
| } // namespace speech
|
|
|