Index: content/browser/speech/speech_recognizer_impl.cc |
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc |
index 007f3ee45a9941007714b1c5e4b25bf571fdcd1a..de2b72b503537320c4a7313814d2eecfeef297ba 100644 |
--- a/content/browser/speech/speech_recognizer_impl.cc |
+++ b/content/browser/speech/speech_recognizer_impl.cc |
@@ -16,6 +16,8 @@ |
#include "content/public/common/speech_recognition_result.h" |
#include "net/url_request/url_request_context_getter.h" |
+#define UNREACHABLE_CONDITION() do { NOTREACHED(); return state_; } while(0) |
Satish
2012/03/27 09:47:42
can this be changed to a method InvalidInput() alo
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
I used a macro since, in case of bugs/failing DCHE
|
+ |
using content::BrowserMainLoop; |
using content::BrowserThread; |
using content::SpeechRecognitionError; |
@@ -24,8 +26,12 @@ using content::SpeechRecognitionResult; |
using content::SpeechRecognizer; |
using media::AudioInputController; |
+// TODO(primiano) consider adding a watchdog here to avoid getting stuck if the |
+// SpeechRecognitionEngine does not deliver a result in a reasonable time. |
Satish
2012/03/27 09:47:42
for remote engines, the network connection should
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
namespace { |
- |
+// Enables spontaneous transition from WaitingForSpeech to RecognizingSpeech, |
Satish
2012/03/27 09:47:42
add newline above
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+// which is required for the mock recognition engine which sends fake results. |
+const bool skipSilenceDetectionForTesting = false; |
Satish
2012/03/27 09:47:42
This doesn't seem to be set to true anywhere else
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
// The following constants are related to the volume level indicator shown in |
// the UI for recorded audio. |
// Multiplier used when new volume is greater than previous level. |
@@ -48,6 +54,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) { |
const int16* samples = chunk.SamplesData16(); |
const int kThreshold = num_samples / 20; |
int clipping_samples = 0; |
+ |
for (int i = 0; i < num_samples; ++i) { |
if (samples[i] <= -32767 || samples[i] >= 32767) { |
if (++clipping_samples > kThreshold) |
@@ -68,18 +75,27 @@ SpeechRecognizer* SpeechRecognizer::Create( |
bool filter_profanities, |
const std::string& hardware_info, |
const std::string& origin_url) { |
+ speech::GoogleOneShotRemoteEngineConfig google_sr_config; |
+ google_sr_config.language = language; |
+ google_sr_config.grammar = grammar; |
+ google_sr_config.audio_sample_rate = |
+ speech::SpeechRecognizerImpl::kAudioSampleRate; |
+ google_sr_config.audio_num_bits_per_sample = |
+ speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; |
+ google_sr_config.filter_profanities = filter_profanities; |
+ google_sr_config.hardware_info = hardware_info; |
+ google_sr_config.origin_url = origin_url; |
+ |
+ speech::GoogleOneShotRemoteEngine* google_sr_engine = |
+ new speech::GoogleOneShotRemoteEngine(context_getter); |
+ google_sr_engine->SetConfig(google_sr_config); |
Satish
2012/03/27 09:47:42
Is this config ever changed after creating the eng
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
It can be changed, so that the recognition engine
|
+ |
return new speech::SpeechRecognizerImpl(listener, |
caller_id, |
- language, |
- grammar, |
- context_getter, |
- filter_profanities, |
- hardware_info, |
- origin_url); |
+ google_sr_engine); |
} |
namespace speech { |
- |
const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; |
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
@@ -89,242 +105,458 @@ const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
SpeechRecognizerImpl::SpeechRecognizerImpl( |
SpeechRecognitionEventListener* listener, |
int caller_id, |
- const std::string& language, |
- const std::string& grammar, |
- net::URLRequestContextGetter* context_getter, |
- bool filter_profanities, |
- const std::string& hardware_info, |
- const std::string& origin_url) |
+ SpeechRecognitionEngine* engine) |
: listener_(listener), |
testing_audio_manager_(NULL), |
+ recognition_engine_(engine), |
endpointer_(kAudioSampleRate), |
- context_getter_(context_getter), |
caller_id_(caller_id), |
Satish
2012/03/27 09:47:42
this initializer list should be in the same order
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Isn't it? (except for non pod fields and pod field
|
- language_(language), |
- grammar_(grammar), |
- filter_profanities_(filter_profanities), |
- hardware_info_(hardware_info), |
- origin_url_(origin_url), |
- num_samples_recorded_(0), |
- audio_level_(0.0f) { |
+ event_dispatch_nesting_level_(0), |
+ state_(kIdle), |
+ event_args_(NULL) { |
DCHECK(listener_ != NULL); |
+ DCHECK(recognition_engine_ != NULL); |
endpointer_.set_speech_input_complete_silence_length( |
base::Time::kMicrosecondsPerSecond / 2); |
endpointer_.set_long_speech_input_complete_silence_length( |
base::Time::kMicrosecondsPerSecond); |
endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
endpointer_.StartSession(); |
+ recognition_engine_->set_delegate(this); |
} |
SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
Satish
2012/03/27 09:47:42
add a DCHECK to verify you are in a valid (idle?)
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Hmm, the browser could be closed while a recogniti
|
- // Recording should have stopped earlier due to the endpointer or |
- // |StopRecording| being called. |
- DCHECK(!audio_controller_.get()); |
- DCHECK(!recognition_engine_.get() || |
- !recognition_engine_->IsRecognitionPending()); |
endpointer_.EndSession(); |
} |
+// ------- Methods that trigger Finite State Machine (FSM) events ------------ |
+ |
+// NOTE: all external events and requests should be enqueued (PostTask), even |
+// if they come from the same (IO) thread, in order to preserve the causal |
+// ordering between events. |
+// Imagine what would happen if a Start has been enqueued from another thread |
Satish
2012/03/27 09:47:42
137-145 looks like a scare tactic :) and could be
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+// (but not yet processed) and we suddenly issue a Stop from the IO thread. |
+// Furthermore, even if you are sure not to interleave start and stop requests, |
+// mixing asynchronous event processing with synchronous callbacks can cause |
+// very confusing side effects. |
+// For instance, if a caller could invoke Abort synchronously (instead of |
+// posting the event on the queue), it would receive interleaved callbacks (e.g. |
+// an error or the audio-end event) before the Abort call has returned. |
+// Is your (caller) code ready for this? |
+ |
void SpeechRecognizerImpl::StartRecognition() { |
+ FSMEventArgs args; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kStartRequest, args)); |
Satish
2012/03/27 09:47:42
could make it simple by replacing 'args' with 'FSM
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+} |
+ |
+void SpeechRecognizerImpl::AbortRecognition() { |
+ FSMEventArgs args; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kAbortRequest, args)); |
+} |
+ |
+void SpeechRecognizerImpl::StopAudioCapture() { |
+ FSMEventArgs args; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kStopCaptureRequest, args)); |
+} |
+ |
+bool SpeechRecognizerImpl::IsActive() const { |
+ // Checking the FSM state from another thread (i.e., while the FSM is |
+ // potentially evolving concurrently) is meaningless. |
+ // If you are doing so, you probably have a design issue. |
DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
- DCHECK(!audio_controller_.get()); |
- DCHECK(!recognition_engine_.get() || |
- !recognition_engine_->IsRecognitionPending()); |
+ return state_ != kIdle; |
+} |
- // The endpointer needs to estimate the environment/background noise before |
- // starting to treat the audio as user input. In |HandleOnData| we wait until |
- // such time has passed before switching to user input mode. |
- endpointer_.SetEnvironmentEstimationMode(); |
+bool SpeechRecognizerImpl::IsCapturingAudio() const { |
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
+ return state_ >= kStartingRecognition && state_ <= kRecognizingSpeech; |
Satish
2012/03/27 09:47:42
Would checking for audio_controller_ != NULL be mo
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
IMHO, all the decisions related to the evolution o
|
+} |
+ |
+// Invoked in the audio thread. |
+void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
+ int error_code) { |
+ FSMEventArgs args; |
+ args.audio_error_code = error_code; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kAudioError, args)); |
+} |
+ |
+void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
+ const uint8* data, uint32 size) { |
+ if (size == 0) // This could happen when audio capture stops and is normal. |
+ return; |
+ |
+ FSMEventArgs args; |
+ args.audio_data = new AudioChunk(data, static_cast<size_t>(size), |
Satish
2012/03/27 09:47:42
add a comment here that the event handler takes ow
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ kNumBitsPerAudioSample / 8); |
Satish
2012/03/27 09:47:42
since we are assuming kNumBitsPerAudioSample as a
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kAudioData, args)); |
+} |
+ |
+void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( |
+ const content::SpeechRecognitionResult& result) { |
+ FSMEvent event = kRecognitionResult; |
Satish
2012/03/27 09:47:42
can this value be passed directly to the base::Bin
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
It must! honestly don't know why I did pass throug
|
+ FSMEventArgs args; |
+ args.speech_result = result; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, event, args)); |
+} |
+ |
+void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( |
+ const content::SpeechRecognitionError& error) { |
+ FSMEvent event = kRecognitionError; |
Satish
2012/03/27 09:47:42
ditto
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ FSMEventArgs args; |
+ args.error = error; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, event, args)); |
+} |
+ |
+// ----------------------- Core FSM implementation --------------------------- |
+ |
+void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) { |
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
+ DCHECK_LE(event, kMaxEvent); |
+ DCHECK_LE(state_, kMaxState); |
+ // Event dispatching must be sequential, otherwise it will break all the rules |
Satish
2012/03/27 09:47:42
add newline above full length comments such as the
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ // and the assumptions of the finite-state automaton model. |
+ DCHECK_EQ(event_dispatch_nesting_level_, 0); |
Satish
2012/03/27 09:47:42
could be clearer if this variable was a bool such
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Right.
|
+ ++event_dispatch_nesting_level_; |
+ // Guard against the delegate freeing us until we finish processing the event. |
Satish
2012/03/27 09:47:42
ditto
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ scoped_refptr<SpeechRecognizerImpl> me(this); |
+ |
+ event_ = event; |
Satish
2012/03/27 09:47:42
These look a bit dangerous as they are invalid aft
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Mmm what do you mean? They are used only by (priva
|
+ event_args_ = &args; |
+ |
+ if (event == kAudioData) |
+ ProcessAudioPipeline(); |
+ // The audio pipeline must be processed before the ProcessEvent, otherwise it |
Satish
2012/03/27 09:47:42
add newline above
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ // would take actions according to the future state and not the current one. |
+ state_ = ProcessEvent(event); |
+ |
+ // Cleanup event args. |
+ if (args.audio_data) |
Satish
2012/03/27 09:47:42
this cleanup should be part of the FSMEventArgs de
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
AudioChunk is now refcounted and should be destroy
|
+ delete args.audio_data; |
+ event_args_ = NULL; |
+ --event_dispatch_nesting_level_; |
+} |
+ |
+// ----------- Contract for all the FSM evolution functions below ------------- |
+// - Are guaranteed to be executed in the IO thread; |
+// - Are guaranteed to be not reentrant (themselves and each other); |
+// - event_args_ is guaranteed to be non NULL; |
+// - event_args_ members are guaranteed to be stable during the call; |
+// - The class won't be freed in the meanwhile due to callbacks; |
+ |
+// TODO(primiano) the audio pipeline is currently serial. However, the |
+// clipper->endpointer->vumeter chain and the sr_engine could be parallelized. |
+// We should profile the execution to see whether it would be worthwhile. |
+void SpeechRecognizerImpl::ProcessAudioPipeline() { |
+ const bool always = true; |
Satish
2012/03/27 09:47:42
remove this as its used only in the next line
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ const bool route_audio_to_clipper = always; |
Satish
2012/03/27 09:47:42
only use 1 space on either side of = and && operat
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ const bool route_audio_to_endpointer = state_ >= kEstimatingEnvironment && |
+ state_ <= kRecognizingSpeech; |
+ const bool route_audio_to_sr_engine = route_audio_to_endpointer; |
+ const bool route_audio_to_vumeter = state_ >= kWaitingForSpeech && |
+ state_ <= kRecognizingSpeech; |
+ |
+ AudioChunk& recorded_audio_data = *(event_args_->audio_data); |
Satish
2012/03/27 09:47:42
use "const AudioChunk&"
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ |
+ num_samples_recorded_ += recorded_audio_data.NumSamples(); |
+ |
+ if (route_audio_to_clipper) { |
+ clipper_detected_clip_ = DetectClipping(recorded_audio_data); |
Satish
2012/03/27 09:47:42
clipper_detected_clip_ is set here and used in Upd
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ } |
+ if (route_audio_to_endpointer) { |
+ endpointer_.ProcessAudio(recorded_audio_data, &rms_); |
Satish
2012/03/27 09:47:42
ditto for 'rms_'
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ } |
+ if (route_audio_to_vumeter) { |
+ DCHECK(route_audio_to_endpointer); // Depends on endpointer due to |rms_|. |
+ UpdateSignalAndNoiseLevels(rms_); |
Satish
2012/03/27 09:47:42
since this is the only method making use of clippi
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ } |
+ if (route_audio_to_sr_engine) { |
+ DCHECK(recognition_engine_.get()); |
+ recognition_engine_->TakeAudioChunk(recorded_audio_data); |
+ } |
+} |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessEvent( |
Satish
2012/03/27 09:47:42
DispatchEvent and ProcessEvent are too similar, pl
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done. ExecuteTransitionAndGetNextState
Done.
|
+ FSMEvent event) { |
+ switch (state_) { |
+ case kIdle: |
+ switch (event) { |
+ // TODO(primiano) restore UNREACHABLE_CONDITION above when speech |
+ // input extensions are fixed. |
+ case kAbortRequest: return DoNothing(); //UNREACHABLE_CONDITION(); |
+ case kStartRequest: return InitializeAndStartRecording(); |
Satish
2012/03/27 09:47:42
since this is the only valid event in this state,
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Hmm IMHO it might introduce bugs if a new event is
|
+ case kStopCaptureRequest: return DoNothing(); //UNREACHABLE_CONDITION(); |
+ case kAudioData: return DoNothing(); // Corner cases related to |
+ case kRecognitionResult: return DoNothing(); // queued messages being |
+ case kRecognitionError: return DoNothing(); // lately dispatched. |
+ case kAudioError: return DoNothing(); |
+ } |
+ break; |
+ case kStartingRecognition: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
Satish
2012/03/27 09:47:42
would be simpler to collapse multiple similar hand
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
IMHO it would become more difficult to read (since
|
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return Abort(); |
+ case kAudioData: return StartSpeechRecognition(); |
+ case kRecognitionResult: UNREACHABLE_CONDITION(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kEstimatingEnvironment: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
Satish
2012/03/27 09:47:42
hmm, since kAbortRequest, kRecognitionError and kA
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
They are not exactly equivalent since they trigger
|
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
+ case kAudioData: return EnvironmentEstimation(); |
+ case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kWaitingForSpeech: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
+ case kAudioData: return DetectUserSpeechOrTimeout(); |
+ case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kRecognizingSpeech: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
+ case kAudioData: return DetectEndOfSpeech(); |
+ case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kWaitingFinalResult: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return DoNothing(); |
+ case kAudioData: return DoNothing(); |
+ case kRecognitionResult: return ProcessFinalRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ } |
+ UNREACHABLE_CONDITION(); |
+} |
+ |
+SpeechRecognizerImpl::FSMState |
+SpeechRecognizerImpl::InitializeAndStartRecording() { |
+ DCHECK(recognition_engine_.get()); |
+ DCHECK(audio_controller_.get() == NULL); |
AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
testing_audio_manager_ : |
BrowserMainLoop::GetAudioManager(); |
+ DCHECK(audio_manager != NULL); |
+ |
+ VLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
+ num_samples_recorded_ = 0; |
+ rms_ = 0; |
+ audio_level_ = 0; |
+ clipper_detected_clip_ = false; |
+ listener_->OnRecognitionStart(caller_id_); |
+ |
+ if (!audio_manager->HasAudioInputDevices()) { |
+ return Abort(SpeechRecognitionError( |
+ content::SPEECH_RECOGNITION_ERROR_AUDIO, |
+ content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
+ } |
+ |
+ if (audio_manager->IsRecordingInProcess()) { |
+ return Abort(SpeechRecognitionError( |
+ content::SPEECH_RECOGNITION_ERROR_AUDIO, |
+ content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); |
+ } |
+ |
const int samples_per_packet = kAudioSampleRate * |
Satish
2012/03/27 09:47:42
add parentheses around (kAudioSampleRate * ..) / 1
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
- GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; |
+ recognition_engine_->GetDesiredAudioChunkDurationMs() / 1000; |
AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
kAudioSampleRate, kNumBitsPerAudioSample, |
samples_per_packet); |
audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
- DCHECK(audio_controller_.get()); |
- VLOG(1) << "SpeechRecognizer starting record."; |
- num_samples_recorded_ = 0; |
- audio_controller_->Record(); |
-} |
- |
-void SpeechRecognizerImpl::AbortRecognition() { |
- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
- DCHECK(audio_controller_.get() || recognition_engine_.get()); |
- // Stop recording if required. |
- if (audio_controller_.get()) { |
- CloseAudioControllerSynchronously(); |
+ if (audio_controller_.get() == NULL) { |
+ return Abort( |
+ SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
} |
- VLOG(1) << "SpeechRecognizer canceling recognition."; |
- recognition_engine_.reset(); |
+ // The endpointer needs to estimate the environment/background noise before |
+ // starting to treat the audio as user input. We wait in the state |
+ // kEstimatingEnvironment until such interval has elapsed before switching |
+ // to user input mode. |
+ endpointer_.SetEnvironmentEstimationMode(); |
+ audio_controller_->Record(); |
+ return kStartingRecognition; |
} |
-void SpeechRecognizerImpl::StopAudioCapture() { |
- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartSpeechRecognition() { |
+ // This was the first audio packet recorded, so start a request to the |
Satish
2012/03/27 09:47:42
update comment to say that the first audio packet
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ // engine to send the data and inform the delegate. |
+ DCHECK(recognition_engine_.get()); |
+ recognition_engine_->StartRecognition(); |
+ listener_->OnAudioStart(caller_id_); |
+ // TODO(primiano) this is a little hack, since TakeAudioChunk() is already |
Satish
2012/03/27 09:47:42
add newline above
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ // called by ProcessAudioPipeline(). I hate it since it weakens the |
+ // architectural beauty of this class. But it is the best tradeoff, unless we |
Satish
2012/03/27 09:47:42
could remove reference to 'architectural beauty' :
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ // allow dropping the first audio chunk captured after opening the device. |
+ recognition_engine_->TakeAudioChunk(*(event_args_->audio_data)); |
+ return kEstimatingEnvironment; |
+} |
- // If audio recording has already stopped and we are in recognition phase, |
- // silently ignore any more calls to stop recording. |
- if (!audio_controller_.get()) |
- return; |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::EnvironmentEstimation() { |
Satish
2012/03/27 09:47:42
this method's name doesn't indicate what it actual
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done. WaitEnvironmentEstimationCompletion
|
+ DCHECK(endpointer_.IsEstimatingEnvironment()); |
+ if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { |
+ endpointer_.SetUserInputMode(); |
+ listener_->OnEnvironmentEstimationComplete(caller_id_); |
+ return kWaitingForSpeech; |
+ } else { |
+ return kEstimatingEnvironment; |
+ } |
+} |
- CloseAudioControllerSynchronously(); |
- listener_->OnSoundEnd(caller_id_); |
- listener_->OnAudioEnd(caller_id_); |
+SpeechRecognizerImpl::FSMState |
+SpeechRecognizerImpl::DetectUserSpeechOrTimeout() { |
+ if (skipSilenceDetectionForTesting) |
+ return kRecognizingSpeech; |
- // If we haven't got any audio yet end the recognition sequence here. |
- if (recognition_engine_ == NULL) { |
- // Guard against the listener freeing us until we finish our job. |
- scoped_refptr<SpeechRecognizerImpl> me(this); |
- listener_->OnRecognitionEnd(caller_id_); |
+ if (endpointer_.DidStartReceivingSpeech()) { |
+ listener_->OnSoundStart(caller_id_); |
+ return kRecognizingSpeech; |
+ } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { |
+ return Abort( |
+ SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); |
} else { |
- recognition_engine_->AudioChunksEnded(); |
+ return kWaitingForSpeech; |
} |
} |
-// Invoked in the audio thread. |
-void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
- int error_code) { |
- BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
- base::Bind(&SpeechRecognizerImpl::HandleOnError, |
- this, error_code)); |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech() { |
+ if (endpointer_.speech_input_complete()) { |
+ return StopCaptureAndWaitForResult(); |
+ } else { |
+ return kRecognizingSpeech; |
+ } |
} |
-void SpeechRecognizerImpl::HandleOnError(int error_code) { |
- LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; |
+SpeechRecognizerImpl::FSMState |
+SpeechRecognizerImpl::StopCaptureAndWaitForResult() { |
+ DCHECK(state_ >= kEstimatingEnvironment && state_ <= kRecognizingSpeech); |
- // Check if we are still recording before canceling recognition, as |
- // recording might have been stopped after this error was posted to the queue |
- // by |OnError|. |
- if (!audio_controller_.get()) |
- return; |
+ VLOG(1) << "Concluding recognition"; |
+ CloseAudioControllerSynchronously(); |
+ recognition_engine_->AudioChunksEnded(); |
+ |
+ if (state_ > kWaitingForSpeech) |
+ listener_->OnSoundEnd(caller_id_); |
- InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); |
+ listener_->OnAudioEnd(caller_id_); |
+ return kWaitingFinalResult; |
} |
-void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
- const uint8* data, uint32 size) { |
- if (size == 0) // This could happen when recording stops and is normal. |
- return; |
- AudioChunk* raw_audio = new AudioChunk(data, static_cast<size_t>(size), |
- kNumBitsPerAudioSample / 8); |
- BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
- base::Bind(&SpeechRecognizerImpl::HandleOnData, |
- this, raw_audio)); |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort() { |
+ // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the absence |
+ // of other specific error sources (i.e., for an explicit abort request). |
+ // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers and |
+ // would currently cause an exception. JS will probably need it in the future. |
+ SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NONE); |
+ bool has_error = false; |
+ if (event_ == kAudioError) { |
+ has_error = true; |
+ error.code = content::SPEECH_RECOGNITION_ERROR_AUDIO; |
+ } else if (event_ == kRecognitionError) { |
+ has_error = true; |
+ error = event_args_->error; |
+ } |
+ return Abort(has_error, error); |
} |
-void SpeechRecognizerImpl::HandleOnData(AudioChunk* raw_audio) { |
- scoped_ptr<AudioChunk> free_raw_audio_on_return(raw_audio); |
- // Check if we are still recording and if not discard this buffer, as |
- // recording might have been stopped after this buffer was posted to the queue |
- // by |OnData|. |
- if (!audio_controller_.get()) |
- return; |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
+ const SpeechRecognitionError& error) { |
+ return Abort(true, error); |
+} |
- bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); |
- |
- float rms; |
- endpointer_.ProcessAudio(*raw_audio, &rms); |
- bool did_clip = DetectClipping(*raw_audio); |
- num_samples_recorded_ += raw_audio->NumSamples(); |
- |
- if (recognition_engine_ == NULL) { |
- // This was the first audio packet recorded, so start a request to the |
- // server to send the data and inform the listener. |
- listener_->OnAudioStart(caller_id_); |
- GoogleOneShotRemoteEngineConfig google_sr_config; |
- google_sr_config.language = language_; |
- google_sr_config.grammar = grammar_; |
- google_sr_config.audio_sample_rate = kAudioSampleRate; |
- google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample; |
- google_sr_config.filter_profanities = filter_profanities_; |
- google_sr_config.hardware_info = hardware_info_; |
- google_sr_config.origin_url = origin_url_; |
- GoogleOneShotRemoteEngine* google_sr_engine = |
- new GoogleOneShotRemoteEngine(context_getter_.get()); |
- google_sr_engine->SetConfig(google_sr_config); |
- recognition_engine_.reset(google_sr_engine); |
- recognition_engine_->set_delegate(this); |
- recognition_engine_->StartRecognition(); |
- } |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
+ bool has_error, const SpeechRecognitionError& error) { |
Satish
2012/03/27 09:47:42
can we change 'error' to be a pointer and remove '
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ if (audio_controller_) |
+ CloseAudioControllerSynchronously(); |
- recognition_engine_->TakeAudioChunk(*raw_audio); |
+ VLOG(1) << "SpeechRecognizerImpl canceling recognition. " << |
+ error.code << " " << error.details; |
- if (endpointer_.IsEstimatingEnvironment()) { |
- // Check if we have gathered enough audio for the endpointer to do |
- // environment estimation and should move on to detect speech/end of speech. |
- if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
- kAudioSampleRate) / 1000) { |
- endpointer_.SetUserInputMode(); |
- listener_->OnEnvironmentEstimationComplete(caller_id_); |
- } |
- return; // No more processing since we are still estimating environment. |
+ // The recognition engine is initialized only after kStartingRecognition. |
+ if (state_ > kStartingRecognition) { |
+ DCHECK(recognition_engine_.get()); |
+ recognition_engine_->EndRecognition(); |
+ //TODO(primiano) reset the engine? Why, after all? |
Satish
2012/03/27 09:47:42
This comment is unclear, please reword if required
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ //recognition_engine_.reset(); |
} |
- // Check if we have waited too long without hearing any speech. |
- bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); |
- if (!speech_was_heard_after_packet && |
- num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) { |
- InformErrorAndAbortRecognition( |
- content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); |
- return; |
- } |
+ if (state_ > kWaitingForSpeech && state_ < kWaitingFinalResult) |
Satish
2012/03/27 09:47:42
would be useful for the unittest to verify that al
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+ listener_->OnSoundEnd(caller_id_); |
- if (!speech_was_heard_before_packet && speech_was_heard_after_packet) |
- listener_->OnSoundStart(caller_id_); |
+ if (state_ > kStartingRecognition && state_ < kWaitingFinalResult) |
+ listener_->OnAudioEnd(caller_id_); |
- // Calculate the input volume to display in the UI, smoothing towards the |
- // new level. |
- float level = (rms - kAudioMeterMinDb) / |
- (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
- level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
- if (level > audio_level_) { |
- audio_level_ += (level - audio_level_) * kUpSmoothingFactor; |
- } else { |
- audio_level_ += (level - audio_level_) * kDownSmoothingFactor; |
- } |
+ if (has_error) |
+ listener_->OnRecognitionError(caller_id_, error); |
- float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
- (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
- noise_level = std::min(std::max(0.0f, noise_level), |
- kAudioMeterRangeMaxUnclipped); |
+ listener_->OnRecognitionEnd(caller_id_); |
- listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, |
- noise_level); |
+ return kIdle; |
+} |
- if (endpointer_.speech_input_complete()) |
- StopAudioCapture(); |
+SpeechRecognizerImpl::FSMState |
+SpeechRecognizerImpl::ProcessIntermediateRecognitionResult() { |
+// This is in preparation for future speech recognition functions. |
Satish
2012/03/27 09:47:42
remove these commented lines
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
+// DCHECK(continuous_mode_); |
+// const SpeechRecognitionResult& result = event_args_->speech_result; |
+// VLOG(1) << "Got intermediate result"; |
+// listener_->OnRecognitionResult(caller_id_, result); |
+ NOTREACHED(); |
+ return state_; |
} |
-void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( |
- const content::SpeechRecognitionResult& result) { |
- // Guard against the listener freeing us until we finish our job. |
- scoped_refptr<SpeechRecognizerImpl> me(this); |
+SpeechRecognizerImpl::FSMState |
+SpeechRecognizerImpl::ProcessFinalRecognitionResult() { |
+ const SpeechRecognitionResult& result = event_args_->speech_result; |
+ VLOG(1) << "Got valid result"; |
+ recognition_engine_->EndRecognition(); |
listener_->OnRecognitionResult(caller_id_, result); |
listener_->OnRecognitionEnd(caller_id_); |
+ return kIdle; |
} |
-void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( |
- const content::SpeechRecognitionError& error) { |
- InformErrorAndAbortRecognition(error.code); |
-} |
- |
-void SpeechRecognizerImpl::InformErrorAndAbortRecognition( |
- content::SpeechRecognitionErrorCode error) { |
- DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); |
- AbortRecognition(); |
- |
- // Guard against the listener freeing us until we finish our job. |
- scoped_refptr<SpeechRecognizerImpl> me(this); |
- listener_->OnRecognitionError(caller_id_, error); |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DoNothing() const { |
+ return state_; // Just keep the current state. |
Satish
2012/03/27 09:47:42
2 spaces before //, here and other places in this
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
} |
void SpeechRecognizerImpl::CloseAudioControllerSynchronously() { |
- VLOG(1) << "SpeechRecognizer stopping record."; |
+ DCHECK(audio_controller_); |
+ VLOG(1) << "SpeechRecognizerImpl stopping audio capture."; |
// TODO(satish): investigate the possibility to utilize the closure |
// and switch to async. version of this method. Compare with how |
@@ -336,12 +568,31 @@ void SpeechRecognizerImpl::CloseAudioControllerSynchronously() { |
audio_controller_ = NULL; // Releases the ref ptr. |
} |
-bool SpeechRecognizerImpl::IsActive() const { |
- return (recognition_engine_.get() != NULL); |
+int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
+ return num_samples_recorded_ * 1000 / kAudioSampleRate; |
Satish
2012/03/27 09:47:42
use parenthesis around (num_samples_recorded_ * 10
Primiano Tucci (use gerrit)
2012/03/28 13:24:44
Done.
|
} |
-bool SpeechRecognizerImpl::IsCapturingAudio() const { |
- return (audio_controller_.get() != NULL); |
+void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms) { |
+ // Calculate the input volume to display in the UI, smoothing towards the |
+ // new level. |
+ // TODO(primiano) Do we really need all this floating-point arithmetic here? |
+ // It might be quite expensive on mobile. |
+ float level = (rms - kAudioMeterMinDb) / |
+ (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
+ level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
+ if (level > audio_level_) { |
+ audio_level_ += (level - audio_level_) * kUpSmoothingFactor; |
+ } else { |
+ audio_level_ += (level - audio_level_) * kDownSmoothingFactor; |
+ } |
+ |
+ float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
+ (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
+ noise_level = std::min(std::max(0.0f, noise_level), |
+ kAudioMeterRangeMaxUnclipped); |
+ |
+ listener_->OnAudioLevelsChange( |
+ caller_id_, clipper_detected_clip_ ? 1.0f : audio_level_, noise_level); |
} |
const SpeechRecognitionEngine& |
@@ -354,5 +605,10 @@ void SpeechRecognizerImpl::SetAudioManagerForTesting( |
testing_audio_manager_ = audio_manager; |
} |
+SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs() |
+ : audio_error_code(0), |
+ audio_data(NULL), |
+ error(content::SPEECH_RECOGNITION_ERROR_NONE) { |
+} |
} // namespace speech |