Index: content/browser/speech/speech_recognizer_impl.cc
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc
index 07bd75e1be7bea5f54be4ed4948ecd4a7fb6a74e..df4c74089af0ce7a63ae1b576d35a774dbfdc719 100644
--- a/content/browser/speech/speech_recognizer_impl.cc
+++ b/content/browser/speech/speech_recognizer_impl.cc
@@ -4,6 +4,7 @@
 #include "content/browser/speech/speech_recognizer_impl.h"
+#include "base/basictypes.h"
 #include "base/bind.h"
 #include "base/time.h"
 #include "content/browser/browser_main_loop.h"
@@ -16,6 +17,8 @@
 #include "content/public/common/speech_recognition_result.h"
 #include "net/url_request/url_request_context_getter.h"
+#define NOT_FEASIBLE() do { NOTREACHED(); return state_; } while(0)
bulach
2012/04/04 15:38:17
nit: we avoid using macros as much as possible.. t
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Ok. Turned into a regular function like the others
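For context, NOT_FEASIBLE() expands to NOTREACHED(); return state_;, so the regular function it was turned into presumably looks like this sketch (the NotFeasible name and message are assumptions, not the committed code):

    SpeechRecognizerImpl::FSMState
    SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
      // Same effect as the macro: flag the impossible event/state combination,
      // then keep the current state.
      NOTREACHED() << "Unfeasible event " << event_args.event
                   << " in state " << state_;
      return state_;
    }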
+
 using content::BrowserMainLoop;
 using content::BrowserThread;
 using content::SpeechRecognitionError;
@@ -24,6 +27,7 @@ using content::SpeechRecognitionResult;
 using content::SpeechRecognizer;
 using media::AudioInputController;
 using media::AudioManager;
+using media::AudioParameters;
 namespace {
@@ -49,6 +53,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) {
   const int16* samples = chunk.SamplesData16();
   const int kThreshold = num_samples / 20;
   int clipping_samples = 0;
+
   for (int i = 0; i < num_samples; ++i) {
     if (samples[i] <= -32767 || samples[i] >= 32767) {
       if (++clipping_samples > kThreshold)
@@ -69,14 +74,24 @@ SpeechRecognizer* SpeechRecognizer::Create(
     bool filter_profanities,
     const std::string& hardware_info,
     const std::string& origin_url) {
+  speech::GoogleOneShotRemoteEngineConfig google_sr_config;
bulach
2012/04/04 15:38:17
nit: prefer to call "remote_engine_config"
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+  google_sr_config.language = language;
+  google_sr_config.grammar = grammar;
+  google_sr_config.audio_sample_rate =
+      speech::SpeechRecognizerImpl::kAudioSampleRate;
+  google_sr_config.audio_num_bits_per_sample =
+      speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;
+  google_sr_config.filter_profanities = filter_profanities;
+  google_sr_config.hardware_info = hardware_info;
+  google_sr_config.origin_url = origin_url;
+
+  speech::GoogleOneShotRemoteEngine* google_sr_engine =
bulach
2012/04/04 15:38:17
nit: remote_engine.
also, just to clarify could a
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+      new speech::GoogleOneShotRemoteEngine(context_getter);
+  google_sr_engine->SetConfig(google_sr_config);
+
   return new speech::SpeechRecognizerImpl(listener,
                                           caller_id,
-                                          language,
-                                          grammar,
-                                          context_getter,
-                                          filter_profanities,
-                                          hardware_info,
-                                          origin_url);
+                                          google_sr_engine);
 }
 namespace speech {
@@ -87,247 +102,492 @@ const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
+COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
+               kNumBitsPerAudioSample_must_be_a_multiple_of_8);
+
 SpeechRecognizerImpl::SpeechRecognizerImpl(
     SpeechRecognitionEventListener* listener,
     int caller_id,
-    const std::string& language,
-    const std::string& grammar,
-    net::URLRequestContextGetter* context_getter,
-    bool filter_profanities,
-    const std::string& hardware_info,
-    const std::string& origin_url)
+    SpeechRecognitionEngine* engine)
     : listener_(listener),
       testing_audio_manager_(NULL),
+      recognition_engine_(engine),
       endpointer_(kAudioSampleRate),
-      context_getter_(context_getter),
       caller_id_(caller_id),
-      language_(language),
-      grammar_(grammar),
-      filter_profanities_(filter_profanities),
-      hardware_info_(hardware_info),
-      origin_url_(origin_url),
-      num_samples_recorded_(0),
-      audio_level_(0.0f) {
+      in_event_dispatching_(false),
+      state_(STATE_IDLE) {
   DCHECK(listener_ != NULL);
+  DCHECK(recognition_engine_ != NULL);
   endpointer_.set_speech_input_complete_silence_length(
       base::Time::kMicrosecondsPerSecond / 2);
   endpointer_.set_long_speech_input_complete_silence_length(
       base::Time::kMicrosecondsPerSecond);
   endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
   endpointer_.StartSession();
+  recognition_engine_->set_delegate(this);
 }
 SpeechRecognizerImpl::~SpeechRecognizerImpl() {
-  // Recording should have stopped earlier due to the endpointer or
-  // |StopRecording| being called.
-  DCHECK(!audio_controller_.get());
-  DCHECK(!recognition_engine_.get() ||
-         !recognition_engine_->IsRecognitionPending());
   endpointer_.EndSession();
 }
-void SpeechRecognizerImpl::StartRecognition() {
-  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
-  DCHECK(!audio_controller_.get());
-  DCHECK(!recognition_engine_.get() ||
-         !recognition_engine_->IsRecognitionPending());
+// ------- Methods that trigger Finite State Machine (FSM) events ------------
-  // The endpointer needs to estimate the environment/background noise before
-  // starting to treat the audio as user input. In |HandleOnData| we wait until
-  // such time has passed before switching to user input mode.
-  endpointer_.SetEnvironmentEstimationMode();
+// NOTE: all the external events and requests should be enqueued (PostTask),
+// even if they come from the same (IO) thread, in order to preserve the
+// causal ordering of events and avoid interleaved event processing due to
+// synchronous callbacks.
-  AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
-      testing_audio_manager_ : BrowserMainLoop::GetAudioManager();
-  const int samples_per_packet = kAudioSampleRate *
-      GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000;
-  media::AudioParameters params(
-      media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
-      kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet);
-  audio_controller_ = AudioInputController::Create(audio_manager, this, params);
-  DCHECK(audio_controller_.get());
-  VLOG(1) << "SpeechRecognizer starting record.";
-  num_samples_recorded_ = 0;
-  audio_controller_->Record();
+void SpeechRecognizerImpl::StartRecognition() {
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, FSMEventArgs(EVENT_START)));
 }
 void SpeechRecognizerImpl::AbortRecognition() {
-  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
-  DCHECK(audio_controller_.get() || recognition_engine_.get());
-
-  // Stop recording if required.
-  if (audio_controller_.get()) {
-    CloseAudioControllerAsynchronously();
-  }
-
-  VLOG(1) << "SpeechRecognizer canceling recognition.";
-  recognition_engine_.reset();
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, FSMEventArgs(EVENT_ABORT)));
 }
 void SpeechRecognizerImpl::StopAudioCapture() {
-  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
-
-  // If audio recording has already stopped and we are in recognition phase,
-  // silently ignore any more calls to stop recording.
-  if (!audio_controller_.get())
-    return;
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, FSMEventArgs(EVENT_STOP_CAPTURE)));
+}
-  CloseAudioControllerAsynchronously();
-  listener_->OnSoundEnd(caller_id_);
-  listener_->OnAudioEnd(caller_id_);
+bool SpeechRecognizerImpl::IsActive() const {
+  // Checking the FSM state from another thread (thus, while the FSM is
+  // potentially concurrently evolving) is meaningless.
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  return state_ != STATE_IDLE;
+}
-  // If we haven't got any audio yet end the recognition sequence here.
-  if (recognition_engine_ == NULL) {
-    // Guard against the listener freeing us until we finish our job.
-    scoped_refptr<SpeechRecognizerImpl> me(this);
-    listener_->OnRecognitionEnd(caller_id_);
-  } else {
-    recognition_engine_->AudioChunksEnded();
-  }
+bool SpeechRecognizerImpl::IsCapturingAudio() const {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));  // See IsActive().
+  const bool is_capturing_audio = state_ >= STATE_STARTING &&
+                                  state_ <= STATE_RECOGNIZING;
+  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
+         (!is_capturing_audio && audio_controller_.get() == NULL));
+  return is_capturing_audio;
 }
 // Invoked in the audio thread.
 void SpeechRecognizerImpl::OnError(AudioInputController* controller,
                                    int error_code) {
+  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
+  event_args.audio_error_code = error_code;
   BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
-                          base::Bind(&SpeechRecognizerImpl::HandleOnError,
-                                     this, error_code));
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
 }
-void SpeechRecognizerImpl::HandleOnError(int error_code) {
-  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
-
-  // Check if we are still recording before canceling recognition, as
-  // recording might have been stopped after this error was posted to the queue
-  // by |OnError|.
-  if (!audio_controller_.get())
+void SpeechRecognizerImpl::OnData(AudioInputController* controller,
+                                  const uint8* data, uint32 size) {
+  if (size == 0)  // This could happen when audio capture stops and is normal.
     return;
-  InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);
+  FSMEventArgs event_args(EVENT_AUDIO_DATA);
+  event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size),
+                                         kNumBitsPerAudioSample / 8);
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
 }
-void SpeechRecognizerImpl::OnData(AudioInputController* controller,
-                                  const uint8* data, uint32 size) {
-  if (size == 0)  // This could happen when recording stops and is normal.
-    return;
-  scoped_refptr<AudioChunk> raw_audio(
-      new AudioChunk(data,
-                     static_cast<size_t>(size),
-                     kNumBitsPerAudioSample / 8));
+void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
+
+void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
+    const content::SpeechRecognitionResult& result) {
+  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
+  event_args.engine_result = result;
   BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
-                          base::Bind(&SpeechRecognizerImpl::HandleOnData,
-                                     this, raw_audio));
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
 }
-void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) {
-  // Check if we are still recording and if not discard this buffer, as
-  // recording might have been stopped after this buffer was posted to the queue
-  // by |OnData|.
-  if (!audio_controller_.get())
-    return;
+void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
+    const content::SpeechRecognitionError& error) {
+  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
+  event_args.engine_error = error;
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
+}
+
+// ----------------------- Core FSM implementation ---------------------------
+// TODO(primiano) After the changes in the media package (r129173), this class
+// slightly violates the SpeechRecognitionEventListener interface contract. In
+// particular, it is no longer true that this class can be freed after the
+// OnRecognitionEnd event, since the asynchronous audio_controller_.Close()
+// call can still be in progress after the end event. Currently this is not a
+// problem for the browser itself, since refcounting protects us against such
+// race conditions. However, we should fix this in the next CLs. For instance,
+// tests are currently working just because the TestAudioInputController does
+// not close asynchronously as the real controller does, but they will become
+// flaky once TestAudioInputController is fixed.
+
+void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK_LE(event_args.event, EVENT_MAX);
+  DCHECK_LE(state_, STATE_MAX);
+
+  // Event dispatching must be sequential, otherwise it would break all the
+  // rules and assumptions of the finite state automata model.
+  DCHECK(!in_event_dispatching_);
+  in_event_dispatching_ = true;
-  bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();
-
-  float rms;
-  endpointer_.ProcessAudio(*raw_audio, &rms);
-  bool did_clip = DetectClipping(*raw_audio);
-  num_samples_recorded_ += raw_audio->NumSamples();
-
-  if (recognition_engine_ == NULL) {
-    // This was the first audio packet recorded, so start a request to the
-    // server to send the data and inform the listener.
-    listener_->OnAudioStart(caller_id_);
-    GoogleOneShotRemoteEngineConfig google_sr_config;
-    google_sr_config.language = language_;
-    google_sr_config.grammar = grammar_;
-    google_sr_config.audio_sample_rate = kAudioSampleRate;
-    google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample;
-    google_sr_config.filter_profanities = filter_profanities_;
-    google_sr_config.hardware_info = hardware_info_;
-    google_sr_config.origin_url = origin_url_;
-    GoogleOneShotRemoteEngine* google_sr_engine =
-        new GoogleOneShotRemoteEngine(context_getter_.get());
-    google_sr_engine->SetConfig(google_sr_config);
-    recognition_engine_.reset(google_sr_engine);
-    recognition_engine_->set_delegate(this);
-    recognition_engine_->StartRecognition();
+  // Guard against the delegate freeing us until we finish processing the
+  // event.
+  scoped_refptr<SpeechRecognizerImpl> me(this);
+
+  if (event_args.event == EVENT_AUDIO_DATA) {
+    DCHECK(event_args.audio_data.get() != NULL);
+    ProcessAudioPipeline(*event_args.audio_data);
   }
-  recognition_engine_->TakeAudioChunk(*raw_audio);
+  // The audio pipeline must be processed before the event dispatch, otherwise
+  // it would take actions according to the future state instead of the
+  // current one.
+  state_ = ExecuteTransitionAndGetNextState(event_args);
-  if (endpointer_.IsEstimatingEnvironment()) {
-    // Check if we have gathered enough audio for the endpointer to do
-    // environment estimation and should move on to detect speech/end of speech.
-    if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
-                                  kAudioSampleRate) / 1000) {
-      endpointer_.SetUserInputMode();
-      listener_->OnEnvironmentEstimationComplete(caller_id_);
-    }
-    return;  // No more processing since we are still estimating environment.
+  in_event_dispatching_ = false;
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
+    const FSMEventArgs& event_args) {
+  const FSMEvent event = event_args.event;
+  switch (state_) {
+    case STATE_IDLE:
+      switch (event) {
+        // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and
+        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
+        case EVENT_ABORT:
+          return DoNothing(event_args);
+        case EVENT_START:
+          return StartRecording(event_args);
+        case EVENT_STOP_CAPTURE:
+          return DoNothing(event_args);
+        // The following are corner cases related to queued messages being
+        // dispatched late.
+        case EVENT_AUDIO_DATA:
+          return DoNothing(event_args);
+        case EVENT_ENGINE_RESULT:
+          return DoNothing(event_args);
+        case EVENT_ENGINE_ERROR:
+          return DoNothing(event_args);
+        case EVENT_AUDIO_ERROR:
+          return DoNothing(event_args);
bulach
2012/04/04 15:38:17
I find this is a bit hard to follow..
would it be
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Hmm, the point is that is not obvious whether the
+      }
+      break;
+    case STATE_STARTING:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          NOT_FEASIBLE();
+        case EVENT_STOP_CAPTURE:
+          return Abort(event_args);
+        case EVENT_AUDIO_DATA:
+          return StartRecognitionEngine(event_args);
+        case EVENT_ENGINE_RESULT:
+          NOT_FEASIBLE();
+        case EVENT_ENGINE_ERROR:
+          return Abort(event_args);
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
bulach
2012/04/04 15:38:17
ditto here...
maybe something like:
case EVENT_AUD
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Grouped adjacent cases ending with the same action
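With adjacent cases grouped as described, the STATE_STARTING block above would collapse to something like this sketch (the exact grouping chosen in the next patch set is an assumption, and NotFeasible is the assumed replacement for the NOT_FEASIBLE macro):

    case STATE_STARTING:
      switch (event) {
        case EVENT_ABORT:
        case EVENT_STOP_CAPTURE:
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return Abort(event_args);  // All abort/error paths share one action.
        case EVENT_AUDIO_DATA:
          return StartRecognitionEngine(event_args);
        case EVENT_START:
        case EVENT_ENGINE_RESULT:
          return NotFeasible(event_args);  // Cannot occur in this state.
      }
      break;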
+      }
+      break;
+    case STATE_ESTIMATING_ENVIRONMENT:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          NOT_FEASIBLE();
+        case EVENT_STOP_CAPTURE:
+          return StopCaptureAndWaitForResult(event_args);
+        case EVENT_AUDIO_DATA:
+          return WaitEnvironmentEstimationCompletion(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessIntermediateResult(event_args);
+        case EVENT_ENGINE_ERROR:
+          return Abort(event_args);
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
+    case STATE_WAITING_FOR_SPEECH:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          NOT_FEASIBLE();
+        case EVENT_STOP_CAPTURE:
+          return StopCaptureAndWaitForResult(event_args);
+        case EVENT_AUDIO_DATA:
+          return DetectUserSpeechOrTimeout(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessIntermediateResult(event_args);
+        case EVENT_ENGINE_ERROR:
+          return Abort(event_args);
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
+    case STATE_RECOGNIZING:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          NOT_FEASIBLE();
+        case EVENT_STOP_CAPTURE:
+          return StopCaptureAndWaitForResult(event_args);
+        case EVENT_AUDIO_DATA:
+          return DetectEndOfSpeech(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessIntermediateResult(event_args);
+        case EVENT_ENGINE_ERROR:
+          return Abort(event_args);
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
+    case STATE_WAITING_FINAL_RESULT:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          NOT_FEASIBLE();
+        case EVENT_STOP_CAPTURE:
+          return DoNothing(event_args);
+        case EVENT_AUDIO_DATA:
+          return DoNothing(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessFinalResult(event_args);
+        case EVENT_ENGINE_ERROR:
+          return Abort(event_args);
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
   }
+  NOT_FEASIBLE();
+}
-  // Check if we have waited too long without hearing any speech.
-  bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();
-  if (!speech_was_heard_after_packet &&
-      num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) {
-    InformErrorAndAbortRecognition(
-        content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
-    return;
+// ----------- Contract for all the FSM evolution functions below -------------
+// - They are guaranteed to be executed on the IO thread;
+// - They are guaranteed not to be reentrant (with themselves and each other);
+// - event_args members are guaranteed to be stable during the call;
+// - The class won't be freed in the meantime due to callbacks;
+// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.
+
+// TODO(primiano) The audio pipeline is currently serial. However, the
+// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
+// We should profile the execution to see whether it would be worth it.
+void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
+  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
+                                   state_ <= STATE_RECOGNIZING;
+  const bool route_to_sr_engine = route_to_endpointer;
+  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
+                                state_ <= STATE_RECOGNIZING;
+  const bool clip_detected = DetectClipping(raw_audio);
+  float rms = 0;
+
+  num_samples_recorded_ += raw_audio.NumSamples();
+
+  if (route_to_endpointer) {
bulach
2012/04/04 15:38:17
nit: we normally avoid {} on single line if blocks
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+    endpointer_.ProcessAudio(raw_audio, &rms);
+  }
+  if (route_to_vumeter) {
+    DCHECK(route_to_endpointer);  // Depends on endpointer due to |rms|.
+    UpdateSignalAndNoiseLevels(rms, clip_detected);
+  }
+  if (route_to_sr_engine) {
+    DCHECK(recognition_engine_.get());
+    recognition_engine_->TakeAudioChunk(raw_audio);
+  }
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
+  DCHECK(recognition_engine_.get());
+  DCHECK(!IsCapturingAudio());
+  AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
+                                testing_audio_manager_ :
+                                BrowserMainLoop::GetAudioManager();
+  DCHECK(audio_manager != NULL);
+
+  VLOG(1) << "SpeechRecognizerImpl starting audio capture.";
bulach
2012/04/04 15:38:17
nit: DVLOG?
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+  num_samples_recorded_ = 0;
+  audio_level_ = 0;
+  listener_->OnRecognitionStart(caller_id_);
+
+  if (!audio_manager->HasAudioInputDevices()) {
+    return AbortWithError(SpeechRecognitionError(
+        content::SPEECH_RECOGNITION_ERROR_AUDIO,
+        content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
+  }
+
+  if (audio_manager->IsRecordingInProcess()) {
+    return AbortWithError(SpeechRecognitionError(
+        content::SPEECH_RECOGNITION_ERROR_AUDIO,
+        content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE));
+  }
+
+  const int samples_per_packet = (kAudioSampleRate *
+      recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;
+  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
+                         kAudioSampleRate, kNumBitsPerAudioSample,
+                         samples_per_packet);
+  audio_controller_ = AudioInputController::Create(audio_manager, this, params);
+
+  if (audio_controller_.get() == NULL) {
bulach
2012/04/04 15:38:17
nit: if (!audio_controller_.get()) {
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Hmm is it strict? I feel to violate my moral and e
+    return AbortWithError(
+        SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
   }
-  if (!speech_was_heard_before_packet && speech_was_heard_after_packet)
+  // The endpointer needs to estimate the environment/background noise before
+  // starting to treat the audio as user input. We wait in the
+  // ESTIMATING_ENVIRONMENT state until that interval has elapsed before
+  // switching to user input mode.
+  endpointer_.SetEnvironmentEstimationMode();
+  audio_controller_->Record();
+  return STATE_STARTING;
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
+  // This is the first audio packet captured, so the recognition engine is
+  // started and the listener is notified about the event.
+  DCHECK(recognition_engine_.get());
+  recognition_engine_->StartRecognition();
+  listener_->OnAudioStart(caller_id_);
+
+  // This is a little hack, since TakeAudioChunk() is already called by
+  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
+  // the first audio chunk captured after opening the audio device.
+  recognition_engine_->TakeAudioChunk(*(event_args.audio_data));
+  return STATE_ESTIMATING_ENVIRONMENT;
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
+  DCHECK(endpointer_.IsEstimatingEnvironment());
+  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
+    endpointer_.SetUserInputMode();
+    listener_->OnEnvironmentEstimationComplete(caller_id_);
+    return STATE_WAITING_FOR_SPEECH;
+  } else {
bulach
2012/04/04 15:38:17
nit: here, 491 and 500, remove the final "else" bl
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
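Applying the nit to WaitEnvironmentEstimationCompletion() gives the usual early-return form, roughly:

    if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
      endpointer_.SetUserInputMode();
      listener_->OnEnvironmentEstimationComplete(caller_id_);
      return STATE_WAITING_FOR_SPEECH;
    }
    // No else needed: reaching this point means estimation is still running.
    return STATE_ESTIMATING_ENVIRONMENT;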
+    return STATE_ESTIMATING_ENVIRONMENT;
+  }
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
+  if (endpointer_.DidStartReceivingSpeech()) {
     listener_->OnSoundStart(caller_id_);
+    return STATE_RECOGNIZING;
+  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
+    return AbortWithError(
+        SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));
+  } else {
+    return STATE_WAITING_FOR_SPEECH;
+  }
+}
-  // Calculate the input volume to display in the UI, smoothing towards the
-  // new level.
-  float level = (rms - kAudioMeterMinDb) /
-      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
-  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
-  if (level > audio_level_) {
-    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
+  if (endpointer_.speech_input_complete()) {
+    return StopCaptureAndWaitForResult(event_args);
   } else {
-    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
+    return STATE_RECOGNIZING;
   }
+}
-  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
-      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
-  noise_level = std::min(std::max(0.0f, noise_level),
-                         kAudioMeterRangeMaxUnclipped);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
+  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);
-  listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,
-                                 noise_level);
+  VLOG(1) << "Concluding recognition";
bulach
2012/04/04 15:38:17
nit: DVLOG?
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+  CloseAudioControllerAsynchronously();
+  recognition_engine_->AudioChunksEnded();
+
+  if (state_ > STATE_WAITING_FOR_SPEECH)
+    listener_->OnSoundEnd(caller_id_);
-  if (endpointer_.speech_input_complete())
-    StopAudioCapture();
+  listener_->OnAudioEnd(caller_id_);
+  return STATE_WAITING_FINAL_RESULT;
 }
-void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {
+  // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the
+  // absence of other specific error sources (i.e., when this was an explicit
+  // abort request). However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI
bulach
2012/04/04 15:38:17
which UI layers? I think it's about the renderers,
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+  // layers and currently would cause an exception. JS will probably need it
+  // in the future.
+  if (event_args.event == EVENT_AUDIO_ERROR) {
+    return AbortWithError(
+        SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
+  } else if (event_args.event == EVENT_ENGINE_ERROR) {
+    return AbortWithError(event_args.engine_error);
+  }
+  return AbortWithError(NULL);
+}
+
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
+    const SpeechRecognitionError& error) {
bulach
2012/04/04 15:38:17
can we avoid this overload?
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Hmm I guess it would make more verbose statements
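The point of keeping the const-reference overload is that call sites can pass a temporary directly; without it, each error path would presumably need a named local before taking its address, for example:

    // Hypothetical call site without the overload:
    SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
    return AbortWithError(&error);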
+  return AbortWithError(&error);
+}
+
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
+    const SpeechRecognitionError* error) {
+  if (IsCapturingAudio())
+    CloseAudioControllerAsynchronously();
+
+  VLOG(1) << "SpeechRecognizerImpl canceling recognition. ";
+
+  // The recognition engine is initialized only after STATE_STARTING.
+  if (state_ > STATE_STARTING) {
+    DCHECK(recognition_engine_.get());
+    recognition_engine_->EndRecognition();
+  }
+
+  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
+    listener_->OnSoundEnd(caller_id_);
+
+  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
+    listener_->OnAudioEnd(caller_id_);
+
+  if (error != NULL)
+    listener_->OnRecognitionError(caller_id_, *error);
-void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
-    const content::SpeechRecognitionResult& result) {
-  // Guard against the listener freeing us until we finish our job.
-  scoped_refptr<SpeechRecognizerImpl> me(this);
-  listener_->OnRecognitionResult(caller_id_, result);
   listener_->OnRecognitionEnd(caller_id_);
+
+  return STATE_IDLE;
 }
-void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
-    const content::SpeechRecognitionError& error) {
-  InformErrorAndAbortRecognition(error.code);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {
+// This is in preparation for future speech recognition functions.
bulach
2012/04/04 15:38:17
nit: indent
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+  NOTREACHED();
+  return state_;
 }
-void SpeechRecognizerImpl::InformErrorAndAbortRecognition(
-    content::SpeechRecognitionErrorCode error) {
-  DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);
-  AbortRecognition();
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
+  const SpeechRecognitionResult& result = event_args.engine_result;
+  VLOG(1) << "Got valid result";
bulach
2012/04/04 15:38:17
nit: DVLOG
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
Done.
+  recognition_engine_->EndRecognition();
+  listener_->OnRecognitionResult(caller_id_, result);
+  listener_->OnRecognitionEnd(caller_id_);
+  return STATE_IDLE;
+}
-  // Guard against the listener freeing us until we finish our job.
-  scoped_refptr<SpeechRecognizerImpl> me(this);
-  listener_->OnRecognitionError(caller_id_, error);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
+  return state_;  // Just keep the current state.
 }
 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
-  VLOG(1) << "SpeechRecognizer stopping record.";
+  DCHECK(IsCapturingAudio());
+  VLOG(1) << "SpeechRecognizerImpl stopping audio capture.";
   // Issues a Close on the audio controller, passing an empty callback. The only
   // purpose of such callback is to keep the audio controller refcounted until
   // Close has completed (in the audio thread) and automatically destroy it
@@ -337,12 +597,32 @@ void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
   audio_controller_ = NULL;  // The controller is still refcounted by Bind.
 }
-bool SpeechRecognizerImpl::IsActive() const {
-  return (recognition_engine_.get() != NULL);
+int SpeechRecognizerImpl::GetElapsedTimeMs() const {
+  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
 }
-bool SpeechRecognizerImpl::IsCapturingAudio() const {
-  return (audio_controller_.get() != NULL);
+void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
+                                                      bool clip_detected) {
+  // Calculate the input volume to display in the UI, smoothing towards the
+  // new level.
+  // TODO(primiano) Do we really need all this floating-point arithmetic here?
+  // It might be quite expensive on mobile.
+  float level = (rms - kAudioMeterMinDb) /
+      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
+  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
+  if (level > audio_level_) {
+    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
bulach
2012/04/04 15:38:17
nit: you can probably simplify this with:
const s
Primiano Tucci (use gerrit)
2012/04/11 10:05:41
It was code "inherited" from the original class, b
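The truncated suggestion presumably folds the branch into a single smoothing-factor selection, along these lines (a sketch of the proposal, not the committed code, since the reply indicates the inherited logic was kept):

    const float smoothing_factor = (level > audio_level_) ?
        kUpSmoothingFactor : kDownSmoothingFactor;
    audio_level_ += (level - audio_level_) * smoothing_factor;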
+  } else {
+    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
+  }
+
+  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
+      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
+  noise_level = std::min(std::max(0.0f, noise_level),
+                         kAudioMeterRangeMaxUnclipped);
+
+  listener_->OnAudioLevelsChange(
+      caller_id_, clip_detected ? 1.0f : audio_level_, noise_level);
 }
 const SpeechRecognitionEngine&
@@ -355,5 +635,14 @@ void SpeechRecognizerImpl::SetAudioManagerForTesting(
   testing_audio_manager_ = audio_manager;
 }
+SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
+    : event(event_value),
+      audio_error_code(0),
+      audio_data(NULL),
+      engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {
+}
+
+SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
+}
 }  // namespace speech