Index: content/browser/speech/speech_recognizer_impl.cc |
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc |
index 84f46a5e69f593735d128a27b7c288ccaa381cff..36b9b263593d3d999f64e98404a6b809542957c2 100644 |
--- a/content/browser/speech/speech_recognizer_impl.cc |
+++ b/content/browser/speech/speech_recognizer_impl.cc |
@@ -8,20 +8,29 @@ |
#include "base/time.h" |
#include "content/browser/browser_main_loop.h" |
#include "content/browser/speech/audio_buffer.h" |
-#include "content/public/browser/speech_recognition_event_listener.h" |
+#include "content/browser/speech/google_ssfe_remote_engine.h" |
#include "content/public/browser/browser_thread.h" |
+#include "content/public/browser/speech_recognition_event_listener.h" |
+#include "content/public/browser/speech_recognizer.h" |
#include "content/public/common/speech_recognition_result.h" |
#include "net/url_request/url_request_context_getter.h" |
+#define UNREACHABLE_CONDITION() do{ NOTREACHED(); return state_; } while(0) |
hans
2012/03/16 11:12:56
ultra nit: there should be a space between the "do
Primiano Tucci (use gerrit)
2012/03/16 15:03:42
Done.
|
+ |
using content::BrowserMainLoop; |
using content::BrowserThread; |
+using content::SpeechRecognitionError; |
using content::SpeechRecognitionEventListener; |
+using content::SpeechRecognitionResult; |
using content::SpeechRecognizer; |
using media::AudioInputController; |
-using std::string; |
+// TODO(primiano) what about a watchdog here to avoid getting stuck if the |
+// SpeechRecognitionEngine does not deliver a result (in reasonable time)? |
namespace { |
- |
+// Enables spontaneous transition from WaitingForSpeech to RecognizingSpeech, |
+// which is required for the mock recognition engine, which sends fake results. |
+const bool skipSilenceDetectionForTesting = false; |
// The following constants are related to the volume level indicator shown in |
// the UI for recorded audio. |
// Multiplier used when new volume is greater than previous level. |
@@ -44,6 +53,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) { |
const int16* samples = chunk.SamplesData16(); |
const int kThreshold = num_samples / 20; |
int clipping_samples = 0; |
+ |
for (int i = 0; i < num_samples; ++i) { |
if (samples[i] <= -32767 || samples[i] >= 32767) { |
if (++clipping_samples > kThreshold) |
@@ -55,6 +65,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) { |
} // namespace |
+// TODO(primiano) transitional, see description in speech_recognizer.h. |
SpeechRecognizer* SpeechRecognizer::Create( |
SpeechRecognitionEventListener* listener, |
int caller_id, |
@@ -64,269 +75,484 @@ SpeechRecognizer* SpeechRecognizer::Create( |
bool filter_profanities, |
const std::string& hardware_info, |
const std::string& origin_url) { |
- return new speech::SpeechRecognizerImpl( |
- listener, caller_id, language, grammar, context_getter, |
- filter_profanities, hardware_info, origin_url); |
+ speech::GoogleSSFERemoteEngineConfig google_sr_config; |
+ google_sr_config.language = language; |
+ google_sr_config.grammar = grammar; |
+ google_sr_config.audio_sample_rate = |
+ speech::SpeechRecognizerImpl::kAudioSampleRate; |
+ google_sr_config.audio_num_bits_per_sample = |
+ speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; |
+ google_sr_config.filter_profanities = filter_profanities; |
+ google_sr_config.hardware_info = hardware_info; |
+ google_sr_config.origin_url = origin_url; |
+ |
+ speech::GoogleSSFERemoteEngine* google_sr_engine = |
+ new speech::GoogleSSFERemoteEngine(context_getter); |
+ google_sr_engine->SetConfiguration(google_sr_config); |
+ |
+ return new speech::SpeechRecognizerImpl(listener, |
+ caller_id, |
+ google_sr_engine); |
} |
namespace speech { |
- |
const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
-const int SpeechRecognizerImpl::kAudioPacketIntervalMs = 100; |
const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; |
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
-const int SpeechRecognizerImpl::kNoSpeechTimeoutSec = 8; |
+const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
SpeechRecognizerImpl::SpeechRecognizerImpl( |
SpeechRecognitionEventListener* listener, |
int caller_id, |
- const std::string& language, |
- const std::string& grammar, |
- net::URLRequestContextGetter* context_getter, |
- bool filter_profanities, |
- const std::string& hardware_info, |
- const std::string& origin_url) |
+ SpeechRecognitionEngine* engine) |
: listener_(listener), |
- caller_id_(caller_id), |
- language_(language), |
- grammar_(grammar), |
- filter_profanities_(filter_profanities), |
- hardware_info_(hardware_info), |
- origin_url_(origin_url), |
- context_getter_(context_getter), |
- codec_(AudioEncoder::CODEC_FLAC), |
- encoder_(NULL), |
+ testing_audio_manager_(NULL), |
+ recognition_engine_(engine), |
endpointer_(kAudioSampleRate), |
- num_samples_recorded_(0), |
- audio_level_(0.0f), |
- audio_manager_(NULL) { |
+ caller_id_(caller_id), |
+ event_dispatch_nesting_level_(0), |
+ state_(kIdle), |
+ event_args_(NULL) { |
+ DCHECK(listener_ != NULL); |
+ DCHECK(recognition_engine_ != NULL); |
endpointer_.set_speech_input_complete_silence_length( |
base::Time::kMicrosecondsPerSecond / 2); |
endpointer_.set_long_speech_input_complete_silence_length( |
base::Time::kMicrosecondsPerSecond); |
endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
endpointer_.StartSession(); |
+ recognition_engine_->set_delegate(this); |
} |
SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
- // Recording should have stopped earlier due to the endpointer or |
- // |StopRecording| being called. |
- DCHECK(!audio_controller_.get()); |
- DCHECK(!request_.get() || !request_->HasPendingRequest()); |
- DCHECK(!encoder_.get()); |
endpointer_.EndSession(); |
} |
-bool SpeechRecognizerImpl::StartRecognition() { |
- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
- DCHECK(!audio_controller_.get()); |
- DCHECK(!request_.get() || !request_->HasPendingRequest()); |
- DCHECK(!encoder_.get()); |
- |
- // The endpointer needs to estimate the environment/background noise before |
- // starting to treat the audio as user input. In |HandleOnData| we wait until |
- // such time has passed before switching to user input mode. |
- endpointer_.SetEnvironmentEstimationMode(); |
- |
- encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, |
- kNumBitsPerAudioSample)); |
- int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
- AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
- kAudioSampleRate, kNumBitsPerAudioSample, |
- samples_per_packet); |
- audio_controller_ = AudioInputController::Create( |
- audio_manager_ ? audio_manager_ : BrowserMainLoop::GetAudioManager(), |
- this, params); |
- DCHECK(audio_controller_.get()); |
- VLOG(1) << "SpeechRecognizer starting record."; |
- num_samples_recorded_ = 0; |
- audio_controller_->Record(); |
- |
- return true; |
+// ------- Methods that trigger Finite State Machine (FSM) events ------------ |
+ |
+// NOTE: all the external events and request should be enqueued (PostTask), even |
+// if they come from the same (IO) thread, in order to preserve the relationship |
+// of causalilty between events. |
hans
2012/03/16 11:12:56
s/causalilty/causality/
Primiano Tucci (use gerrit)
2012/03/16 15:03:42
Done.
|
+// Imagine what would happen if a Start has been enqueued from another thread |
+// (but not yet processed) and we suddenly issue a Stop from the IO thread. |
+// Furthermore, even if you are sure to not interleave start and stop requests, |
+// asynchronous event processing mixed with syncrhonous callback can cause very |
hans
2012/03/16 11:12:56
s/syncrhonous/synchronous/
Primiano Tucci (use gerrit)
2012/03/16 15:03:42
Done.
|
+// mind-breaking side effects. |
+// For instance, if someone could call Abort synchronously (instead of posting |
+// the event on the queue), it will receive interleaved callbacks (e.g. an error |
+// or the audio-end event) before the Abort call is effectively ended. |
+// Is your (caller) code ready for this? |
+ |
+void SpeechRecognizerImpl::StartRecognition() { |
+ FSMEventArgs args; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kStartRequest, args)); |
} |
void SpeechRecognizerImpl::AbortRecognition() { |
- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
- DCHECK(audio_controller_.get() || request_.get()); |
- |
- // Stop recording if required. |
- if (audio_controller_.get()) { |
- CloseAudioControllerSynchronously(); |
- } |
- |
- VLOG(1) << "SpeechRecognizer canceling recognition."; |
- encoder_.reset(); |
- request_.reset(); |
+ FSMEventArgs args; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kAbortRequest, args)); |
} |
void SpeechRecognizerImpl::StopAudioCapture() { |
- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
- |
- // If audio recording has already stopped and we are in recognition phase, |
- // silently ignore any more calls to stop recording. |
- if (!audio_controller_.get()) |
- return; |
- |
- CloseAudioControllerSynchronously(); |
+ FSMEventArgs args; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kStopCaptureRequest, args)); |
+} |
- listener_->OnSoundEnd(caller_id_); |
- listener_->OnAudioEnd(caller_id_); |
+bool SpeechRecognizerImpl::IsActive() const { |
+ // Checking the FSM state from another thread (thus, while the FSM is |
+ // potentially concurrently evolving) is meaningless. |
+ // If you're doing it, probably you have some design issues. |
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
+ return state_ != kIdle; |
+} |
- // UploadAudioChunk requires a non-empty final buffer. So we encode a packet |
- // of silence in case encoder had no data already. |
- std::vector<short> samples((kAudioSampleRate * kAudioPacketIntervalMs) / |
- 1000); |
- AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]), |
- samples.size() * sizeof(short), |
- encoder_->bits_per_sample() / 8); |
- encoder_->Encode(dummy_chunk); |
- encoder_->Flush(); |
- scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); |
- DCHECK(!encoded_data->IsEmpty()); |
- encoder_.reset(); |
- |
- // If we haven't got any audio yet end the recognition sequence here. |
- if (request_ == NULL) { |
- // Guard against the listener freeing us until we finish our job. |
- scoped_refptr<SpeechRecognizerImpl> me(this); |
- listener_->OnRecognitionEnd(caller_id_); |
- } else { |
- request_->UploadAudioChunk(*encoded_data, true /* is_last_chunk */); |
- } |
+bool SpeechRecognizerImpl::IsCapturingAudio() const { |
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
+ return state_ >= kStartingRecognition && state_ <= kRecognizingSpeech; |
} |
// Invoked in the audio thread. |
void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
int error_code) { |
+ FSMEventArgs args; |
+ args.audio_error_code = error_code; |
BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
- base::Bind(&SpeechRecognizerImpl::HandleOnError, |
- this, error_code)); |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kAudioError, args)); |
} |
-void SpeechRecognizerImpl::HandleOnError(int error_code) { |
- LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; |
- |
- // Check if we are still recording before canceling recognition, as |
- // recording might have been stopped after this error was posted to the queue |
- // by |OnError|. |
- if (!audio_controller_.get()) |
+void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
+ const uint8* data, uint32 size) { |
+ if (size == 0) // This could happen when audio capture stops and is normal. |
return; |
- InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); |
+ FSMEventArgs args; |
+ args.audio_data = new AudioChunk(data, static_cast<size_t>(size), |
+ kNumBitsPerAudioSample / 8); |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, kAudioData, args)); |
} |
-void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
- const uint8* data, uint32 size) { |
- if (size == 0) // This could happen when recording stops and is normal. |
- return; |
- AudioChunk* raw_audio = new AudioChunk(data, static_cast<size_t>(size), |
- kNumBitsPerAudioSample / 8); |
+void SpeechRecognizerImpl::OnSpeechEngineResult( |
+ const content::SpeechRecognitionResult& result) { |
+ FSMEvent event = kRecognitionResult; |
+ FSMEventArgs args; |
+ args.speech_result = result; |
BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
- base::Bind(&SpeechRecognizerImpl::HandleOnData, |
- this, raw_audio)); |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, event, args)); |
} |
-void SpeechRecognizerImpl::HandleOnData(AudioChunk* raw_audio) { |
- scoped_ptr<AudioChunk> free_raw_audio_on_return(raw_audio); |
- // Check if we are still recording and if not discard this buffer, as |
- // recording might have been stopped after this buffer was posted to the queue |
- // by |OnData|. |
- if (!audio_controller_.get()) |
- return; |
+void SpeechRecognizerImpl::OnSpeechEngineError( |
+ const content::SpeechRecognitionError& error) { |
+ FSMEvent event = kRecognitionError; |
+ FSMEventArgs args; |
+ args.error = error; |
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
+ base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
+ this, event, args)); |
+} |
+ |
+// ----------------------- Core FSM implementation --------------------------- |
+ |
+void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) { |
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
+ DCHECK_LE(event, kMaxEvent); |
+ DCHECK_LE(state_, kMaxState); |
+ // Event dispatching must be sequential, otherwise it will break all the rules |
+ // and the assumptions of the finite state automata model. |
+ DCHECK_EQ(event_dispatch_nesting_level_, 0); |
+ ++event_dispatch_nesting_level_; |
+ // Guard against the delegate freeing us until we finish processing the event. |
+ scoped_refptr<SpeechRecognizerImpl> me(this); |
- bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); |
- |
- encoder_->Encode(*raw_audio); |
- float rms; |
- endpointer_.ProcessAudio(*raw_audio, &rms); |
- bool did_clip = DetectClipping(*raw_audio); |
- num_samples_recorded_ += raw_audio->NumSamples(); |
- |
- if (request_ == NULL) { |
- // This was the first audio packet recorded, so start a request to the |
- // server to send the data and inform the listener. |
- listener_->OnAudioStart(caller_id_); |
- request_.reset(new SpeechRecognitionRequest(context_getter_.get(), this)); |
- request_->Start(language_, grammar_, filter_profanities_, |
- hardware_info_, origin_url_, encoder_->mime_type()); |
+ event_ = event; |
+ event_args_ = &args; |
+ |
+ if (event == kAudioData) |
+ ProcessAudioPipeline(); |
+ // Ensure the audio pipeline is processed before processing the event, |
+ // otherwise it would take actions according to the next state and not the |
+ // current one. |
hans
2012/03/16 11:12:56
should the comment be moved up a little, or put in
Primiano Tucci (use gerrit)
2012/03/16 15:03:42
Mmm It is not referred to the if () statement itse
|
+ state_ = ProcessEvent(event); |
+ |
+ // Cleanup event args. |
+ if (args.audio_data) |
+ delete args.audio_data; |
+ event_args_ = NULL; |
+ --event_dispatch_nesting_level_; |
+} |
+ |
+// ----------- Contract for all the FSM evolution functions below ------------- |
+// - Are guaranteed to be executed in the IO thread; |
+// - Are guaranteed to be not reentrant (themselves and each other); |
+// - event_args_ is guaranteed to be non NULL; |
+// - event_args_ members are guaranteed to be stable during the call; |
+// - The class won't be freed in the meanwhile due to callbacks; |
+ |
+// TODO(primiano) the audio pipeline is currently serial. However, the |
+// clipper->endpointer->vumeter chain and the sr_engine could be parallelized. |
+// We should profile the execution to see if it would be worth it or not. |
+void SpeechRecognizerImpl::ProcessAudioPipeline() { |
+ const bool always = true; |
+ const bool route_audio_to_clipper = always; |
+ const bool route_audio_to_endpointer = state_ >= kEstimatingEnvironment && |
+ state_ <= kRecognizingSpeech; |
+ const bool route_audio_to_sr_engine = route_audio_to_endpointer; |
+ const bool route_audio_to_vumeter = state_ >= kWaitingForSpeech && |
+ state_ <= kRecognizingSpeech; |
+ |
+ AudioChunk& recorded_audio_data = *(event_args_->audio_data); |
+ |
+ num_samples_recorded_ += recorded_audio_data.NumSamples(); |
+ |
+ if (route_audio_to_clipper) { |
+ clipper_detected_clip_ = DetectClipping(recorded_audio_data); |
+ } |
+ if (route_audio_to_endpointer) { |
+ endpointer_.ProcessAudio(recorded_audio_data, &rms_); |
} |
+ if (route_audio_to_vumeter) { |
+ DCHECK(route_audio_to_endpointer); // Depends on endpointer due to |rms_|. |
+ UpdateSignalAndNoiseLevels(rms_); |
+ } |
+ if (route_audio_to_sr_engine) { |
+ DCHECK(recognition_engine_.get()); |
+ recognition_engine_->PushSpeechAudio(recorded_audio_data); |
+ } |
+} |
- scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); |
- DCHECK(!encoded_data->IsEmpty()); |
- request_->UploadAudioChunk(*encoded_data, false /* is_last_chunk */); |
- |
- if (endpointer_.IsEstimatingEnvironment()) { |
- // Check if we have gathered enough audio for the endpointer to do |
- // environment estimation and should move on to detect speech/end of speech. |
- if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
- kAudioSampleRate) / 1000) { |
- endpointer_.SetUserInputMode(); |
- listener_->OnEnvironmentEstimationComplete(caller_id_); |
- } |
- return; // No more processing since we are still estimating environment. |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessEvent( |
+ FSMEvent event) { |
+ switch (state_) { |
+ case kIdle: |
+ switch (event) { |
+ // TODO(primiano) restore UNREACHABLE_CONDITION above when speech |
+ // input extensions are fixed. |
+ case kAbortRequest: return DoNothing(); //UNREACHABLE_CONDITION(); |
+ case kStartRequest: return InitializeAndStartRecording(); |
+ case kStopCaptureRequest: return DoNothing(); //UNREACHABLE_CONDITION(); |
+ case kAudioData: return DoNothing(); // Corner cases related to |
+ case kRecognitionResult: return DoNothing(); // queued messages being |
+ case kRecognitionError: return DoNothing(); // lately dispatched. |
+ case kAudioError: return DoNothing(); |
+ } |
+ break; |
+ case kStartingRecognition: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return Abort(); |
+ case kAudioData: return StartSpeechRecognition(); |
+ case kRecognitionResult: UNREACHABLE_CONDITION(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kEstimatingEnvironment: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
+ case kAudioData: return EnvironmentEstimation(); |
+ case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kWaitingForSpeech: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
+ case kAudioData: return DetectUserSpeechOrTimeout(); |
+ case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kRecognizingSpeech: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
+ case kAudioData: return DetectEndOfSpeech(); |
+ case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
+ case kWaitingFinalResult: |
+ switch (event) { |
+ case kAbortRequest: return Abort(); |
+ case kStartRequest: UNREACHABLE_CONDITION(); |
+ case kStopCaptureRequest: return DoNothing(); |
+ case kAudioData: return DoNothing(); |
+ case kRecognitionResult: return ProcessFinalRecognitionResult(); |
+ case kRecognitionError: return Abort(); |
+ case kAudioError: return Abort(); |
+ } |
+ break; |
} |
+ UNREACHABLE_CONDITION(); |
+} |
- // Check if we have waited too long without hearing any speech. |
- bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); |
- if (!speech_was_heard_after_packet && |
- num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) { |
- InformErrorAndAbortRecognition( |
- content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); |
- return; |
+SpeechRecognizerImpl::FSMState |
+ SpeechRecognizerImpl::InitializeAndStartRecording() { |
hans
2012/03/16 11:12:56
i'm unsure about the indentation here..
spontaneou
Primiano Tucci (use gerrit)
2012/03/16 15:03:42
Done.
|
+ DCHECK(recognition_engine_.get()); |
+ DCHECK(audio_controller_.get() == NULL); |
+ AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
+ testing_audio_manager_ : |
+ BrowserMainLoop::GetAudioManager(); |
+ DCHECK(audio_manager != NULL); |
+ |
+ VLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
+ num_samples_recorded_ = 0; |
+ rms_ = 0; |
+ audio_level_ = 0; |
+ clipper_detected_clip_ = false; |
+ listener_->OnRecognitionStart(caller_id_); |
+ |
+ if (!audio_manager->HasAudioInputDevices()) |
hans
2012/03/16 11:12:56
i would put { around the body of the if since it's
Primiano Tucci (use gerrit)
2012/03/16 15:03:42
Done.
|
+ return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO, |
+ content::AUDIO_ERROR_NO_MIC)); |
+ |
+ if (audio_manager->IsRecordingInProcess()) |
+ return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO, |
+ content::AUDIO_ERROR_MIC_IN_USE)); |
+ |
+ const int samples_per_packet = kAudioSampleRate * |
+ recognition_engine_->DesiredAudioChunkDurationMs() / 1000; |
+ AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
+ kAudioSampleRate, kNumBitsPerAudioSample, |
+ samples_per_packet); |
+ audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
+ |
+ if (audio_controller_.get() == NULL) |
+ return Abort( |
+ SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
+ |
+ // The endpointer needs to estimate the environment/background noise before |
+ // starting to treat the audio as user input. We wait in the state |
+ // kEstimatingEnvironment until such interval has elapsed before switching |
+ // to user input mode. |
+ endpointer_.SetEnvironmentEstimationMode(); |
+ audio_controller_->Record(); |
+ return kStartingRecognition; |
+} |
+ |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartSpeechRecognition() { |
+ // This was the first audio packet recorded, so start a request to the |
+ // engine to send the data and inform the delegate. |
+ DCHECK(recognition_engine_.get()); |
+ recognition_engine_->SpeechRecognitionBegins(); |
+ listener_->OnAudioStart(caller_id_); |
+ // TODO(primiano) this is a little hack, since PushSpeechAudio() is already |
+ // called by ProcessAudioPipeline(). I hate it since it weakens the |
+ // architectural beauty of this class. But it is the best tradeoff, unless we |
+// allow dropping the first audio chunk captured after opening the audio dev. |
+ recognition_engine_->PushSpeechAudio(*(event_args_->audio_data)); |
+ return kEstimatingEnvironment; |
+} |
+ |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::EnvironmentEstimation() { |
+ DCHECK(endpointer_.IsEstimatingEnvironment()); |
+ if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { |
+ endpointer_.SetUserInputMode(); |
+ listener_->OnEnvironmentEstimationComplete(caller_id_); |
+ return kWaitingForSpeech; |
+ } else { |
+ return kEstimatingEnvironment; |
} |
+} |
+ |
+SpeechRecognizerImpl::FSMState |
+ SpeechRecognizerImpl::DetectUserSpeechOrTimeout() { |
+ if (skipSilenceDetectionForTesting) |
+ return kRecognizingSpeech; |
- if (!speech_was_heard_before_packet && speech_was_heard_after_packet) |
+ if (endpointer_.DidStartReceivingSpeech()) { |
listener_->OnSoundStart(caller_id_); |
+ return kRecognizingSpeech; |
+ } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { |
+ return Abort( |
+ SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); |
+ } else { |
+ return kWaitingForSpeech; |
+ } |
+} |
- // Calculate the input volume to display in the UI, smoothing towards the |
- // new level. |
- float level = (rms - kAudioMeterMinDb) / |
- (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
- level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
- if (level > audio_level_) { |
- audio_level_ += (level - audio_level_) * kUpSmoothingFactor; |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech() { |
+ if (endpointer_.speech_input_complete()) { |
+ return StopCaptureAndWaitForResult(); |
} else { |
- audio_level_ += (level - audio_level_) * kDownSmoothingFactor; |
+ return kRecognizingSpeech; |
} |
+} |
- float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
- (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
- noise_level = std::min(std::max(0.0f, noise_level), |
- kAudioMeterRangeMaxUnclipped); |
+SpeechRecognizerImpl::FSMState |
+ SpeechRecognizerImpl::StopCaptureAndWaitForResult() { |
+ DCHECK(state_ >= kEstimatingEnvironment && state_ <= kRecognizingSpeech); |
- listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, |
- noise_level); |
+ VLOG(1) << "Concluding recognition"; |
+ CloseAudioControllerSynchronously(); |
+ recognition_engine_->SpeechAudioStreamComplete(); |
- if (endpointer_.speech_input_complete()) |
- StopAudioCapture(); |
+ if (state_ > kWaitingForSpeech) |
+ listener_->OnSoundEnd(caller_id_); |
+ |
+ listener_->OnAudioEnd(caller_id_); |
+ return kWaitingFinalResult; |
} |
-void SpeechRecognizerImpl::SetRecognitionResult( |
- const content::SpeechRecognitionResult& result) { |
- if (result.error != content::SPEECH_RECOGNITION_ERROR_NONE) { |
- InformErrorAndAbortRecognition(result.error); |
- return; |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort() { |
+  // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the absence of |
+ // other specific error sources (so that it was an explicit abort request). |
+ // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers |
+ // and currently would cause an exception. JS will probably need it in future. |
+ SpeechRecognitionError error; |
+ bool has_error = false; |
+ if (event_ == kAudioError) { |
+ has_error = true; |
+ error.code = content::SPEECH_RECOGNITION_ERROR_AUDIO; |
+ } else if (event_ == kRecognitionError) { |
+ has_error = true; |
+ error = event_args_->error; |
} |
+ return Abort(has_error, error); |
+} |
+ |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
+ const SpeechRecognitionError& error) { |
+ return Abort(true, error); |
+} |
+ |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
+ bool has_error, const SpeechRecognitionError& error) { |
+ if (audio_controller_) |
+ CloseAudioControllerSynchronously(); |
+ |
+ VLOG(1) << "SpeechRecognizerImpl canceling recognition. " << |
+ error.code << " " << error.details; |
+ |
+ // The recognition engine is initialized only after kStartingRecognition. |
+ if (state_ > kStartingRecognition) { |
+ DCHECK(recognition_engine_.get()); |
+ recognition_engine_->SpeechRecognitionEnds(); |
+ //TODO(primiano) reset the engine? Why, after all? |
+ //recognition_engine_.reset(); |
+ } |
+ |
+ if (state_ > kWaitingForSpeech && state_ < kWaitingFinalResult) |
+ listener_->OnSoundEnd(caller_id_); |
+ |
+ if (state_ > kStartingRecognition && state_ < kWaitingFinalResult) |
+ listener_->OnAudioEnd(caller_id_); |
+ |
+ if (has_error) |
+ listener_->OnRecognitionError(caller_id_, error); |
- // Guard against the listener freeing us until we finish our job. |
- scoped_refptr<SpeechRecognizerImpl> me(this); |
- listener_->OnRecognitionResult(caller_id_, result); |
listener_->OnRecognitionEnd(caller_id_); |
+ |
+ return kIdle; |
} |
-void SpeechRecognizerImpl::InformErrorAndAbortRecognition( |
- content::SpeechRecognitionErrorCode error) { |
- DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); |
- AbortRecognition(); |
+SpeechRecognizerImpl::FSMState |
+ SpeechRecognizerImpl::ProcessIntermediateRecognitionResult() { |
+// This is in preparation for future speech recognition functions. |
+// DCHECK(continuous_mode_); |
+// const SpeechRecognitionResult& result = event_args_->speech_result; |
+// VLOG(1) << "Got intermediate result"; |
+// listener_->OnRecognitionResult(caller_id_, result); |
+ NOTREACHED(); |
+ return state_; |
+} |
- // Guard against the listener freeing us until we finish our job. |
- scoped_refptr<SpeechRecognizerImpl> me(this); |
- listener_->OnRecognitionError(caller_id_, error); |
+SpeechRecognizerImpl::FSMState |
+ SpeechRecognizerImpl::ProcessFinalRecognitionResult() { |
+ const SpeechRecognitionResult& result = event_args_->speech_result; |
+ VLOG(1) << "Got valid result"; |
+ recognition_engine_->SpeechRecognitionEnds(); |
+ listener_->OnRecognitionResult(caller_id_, result); |
+ listener_->OnRecognitionEnd(caller_id_); |
+ return kIdle; |
+} |
+ |
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DoNothing() const { |
+ return state_; // Just keep the current state. |
} |
void SpeechRecognizerImpl::CloseAudioControllerSynchronously() { |
- VLOG(1) << "SpeechRecognizer stopping record."; |
+ DCHECK(audio_controller_); |
+ VLOG(1) << "SpeechRecognizerImpl stopping audio capture."; |
// TODO(satish): investigate the possibility to utilize the closure |
// and switch to async. version of this method. Compare with how |
@@ -338,17 +564,45 @@ void SpeechRecognizerImpl::CloseAudioControllerSynchronously() { |
audio_controller_ = NULL; // Releases the ref ptr. |
} |
-void SpeechRecognizerImpl::SetAudioManagerForTesting( |
- AudioManager* audio_manager) { |
- audio_manager_ = audio_manager; |
+int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
+ return num_samples_recorded_ * 1000 / kAudioSampleRate; |
} |
-bool SpeechRecognizerImpl::IsActive() const { |
- return (request_.get() != NULL); |
+void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms) { |
+ // Calculate the input volume to display in the UI, smoothing towards the |
+ // new level. |
+ // TODO(primiano) Do we really need all this floating point arith here? |
+ // Perhaps it might be quite expensive on mobile. |
+ float level = (rms - kAudioMeterMinDb) / |
+ (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
+ level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
+ if (level > audio_level_) { |
+ audio_level_ += (level - audio_level_) * kUpSmoothingFactor; |
+ } else { |
+ audio_level_ += (level - audio_level_) * kDownSmoothingFactor; |
+ } |
+ |
+ float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
+ (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
+ noise_level = std::min(std::max(0.0f, noise_level), |
+ kAudioMeterRangeMaxUnclipped); |
+ |
+ listener_->OnAudioLevelsChange( |
+ caller_id_, clipper_detected_clip_ ? 1.0f : audio_level_, noise_level); |
} |
-bool SpeechRecognizerImpl::IsCapturingAudio() const { |
- return (audio_controller_.get() != NULL); |
+const SpeechRecognitionEngine& |
+ SpeechRecognizerImpl::recognition_engine() const { |
+ return *(recognition_engine_.get()); |
+} |
+ |
+void SpeechRecognizerImpl::SetAudioManagerForTesting( |
+ AudioManager* audio_manager) { |
+ testing_audio_manager_ = audio_manager; |
+} |
+ |
+SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs() |
+ : audio_error_code(0), audio_data(NULL) { |
} |
-} // namespace speech |
+} // namespace speech |
hans
2012/03/16 11:12:56
ultra nit: two spaces between } and //
Primiano Tucci (use gerrit)
2012/03/16 15:03:42
Done.
|