Chromium Code Reviews| Index: content/browser/speech/speech_recognition_engine.h |
| diff --git a/content/browser/speech/speech_recognition_engine.h b/content/browser/speech/speech_recognition_engine.h |
| index c25fd19c359c6f9f91bb67e15097d8923315bded..bbe244437c1b2df8682db0f0eeddbcbb4b1a43b8 100644 |
| --- a/content/browser/speech/speech_recognition_engine.h |
| +++ b/content/browser/speech/speech_recognition_engine.h |
| @@ -6,37 +6,65 @@ |
| #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ |
| #include <stdint.h> |
| - |
| +#include <memory> |
| #include <string> |
| +#include <vector> |
| +#include "base/macros.h" |
| +#include "base/memory/ref_counted.h" |
| +#include "base/threading/non_thread_safe.h" |
| +#include "content/browser/speech/audio_encoder.h" |
| +#include "content/browser/speech/chunked_byte_buffer.h" |
| #include "content/common/content_export.h" |
| #include "content/public/browser/speech_recognition_session_preamble.h" |
| +#include "content/public/common/speech_recognition_error.h" |
| #include "content/public/common/speech_recognition_grammar.h" |
| #include "content/public/common/speech_recognition_result.h" |
| +#include "net/url_request/url_fetcher_delegate.h" |
| + |
| +namespace net { |
| +class URLRequestContextGetter; |
| +} |
| namespace content { |
| class AudioChunk; |
| struct SpeechRecognitionError; |
| - |
| -// This interface models the basic contract that a speech recognition engine, |
| -// either working locally or relying on a remote web-service, must obey. |
| -// The expected call sequence for exported methods is: |
| +struct SpeechRecognitionResult; |
| + |
| +// A speech recognition engine supporting continuous recognition by means of |
| +// interaction with the Google streaming speech recognition webservice. |
| +// |
| +// This class establishes two HTTPS connections with the webservice for each |
| +// session, herein called "upstream" and "downstream". Audio chunks are sent on |
| +// the upstream by means of a chunked HTTP POST upload. Recognition results are |
| +// retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream) |
| +// on the downstream by means of a chunked HTTP GET request. Pairing between the |
| two streams is handled through a randomly generated key, unique for each |
| +// request, which is passed in the &pair= arg to both stream request URLs. In |
| +// the case of a regular session, the upstream is closed when the audio capture |
| +// ends (notified through a |AudioChunksEnded| call) and the downstream waits |
| +// for a corresponding server closure (eventually some late results can come |
| +// after closing the upstream). Both streams are guaranteed to be closed when |
| +// |EndRecognition| call is issued. |
| +// |
| +// The expected call sequence is: |
| // StartRecognition Mandatory at beginning of SR. |
| // TakeAudioChunk For every audio chunk pushed. |
| // AudioChunksEnded Finalize the audio stream (omitted in case of errors). |
| // EndRecognition Mandatory at end of SR (even on errors). |
| -// No delegate callbacks are allowed before StartRecognition or after |
| +// |
| +// No delegate callbacks are performed before StartRecognition or after |
| // EndRecognition. If a recognition was started, the caller can free the |
| // SpeechRecognitionEngine only after calling EndRecognition. |
| -class SpeechRecognitionEngine { |
| + |
| +class CONTENT_EXPORT SpeechRecognitionEngine |
| + : public net::URLFetcherDelegate, |
| + public NON_EXPORTED_BASE(base::NonThreadSafe) { |
| public: |
| - // Interface for receiving callbacks from this object. |
| class Delegate { |
| public: |
| - // Called whenever a result is retrieved. It might be issued several times, |
| - // (e.g., in the case of continuous speech recognition engine |
| - // implementations). |
| + // Called whenever a result is retrieved. |
| virtual void OnSpeechRecognitionEngineResults( |
| const SpeechRecognitionResults& results) = 0; |
| virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0; |
| @@ -47,7 +75,7 @@ class SpeechRecognitionEngine { |
| virtual ~Delegate() {} |
| }; |
| - // Remote engine configuration. |
| + // Engine configuration. |
| struct CONTENT_EXPORT Config { |
| Config(); |
| ~Config(); |
| @@ -67,51 +95,130 @@ class SpeechRecognitionEngine { |
| scoped_refptr<SpeechRecognitionSessionPreamble> preamble; |
| }; |
| - virtual ~SpeechRecognitionEngine() {} |
| + // set_delegate detached from constructor for lazy dependency injection. |
| + void set_delegate(Delegate* delegate) { delegate_ = delegate; } |
| - // Set/change the recognition engine configuration. It is not allowed to call |
| - // this function while a recognition is ongoing. |
| - virtual void SetConfig(const Config& config) = 0; |
| + // Duration of each audio packet. |
| + static const int kAudioPacketIntervalMs; |
| - // Called when the speech recognition begins, before any TakeAudioChunk call. |
| - virtual void StartRecognition() = 0; |
| + // IDs passed to URLFetcher::Create(). Used for testing. |
| + static const int kUpstreamUrlFetcherIdForTesting; |
| + static const int kDownstreamUrlFetcherIdForTesting; |
| - // End any recognition activity and don't make any further callback. |
| - // Must be always called to close the corresponding StartRecognition call, |
| - // even in case of errors. |
| - // No further TakeAudioChunk/AudioChunksEnded calls are allowed after this. |
| - virtual void EndRecognition() = 0; |
| + explicit SpeechRecognitionEngine(net::URLRequestContextGetter* context); |
| + ~SpeechRecognitionEngine() override; |
| - // Push a chunk of uncompressed audio data, where the chunk length agrees with |
| - // GetDesiredAudioChunkDurationMs(). |
| - virtual void TakeAudioChunk(const AudioChunk& data) = 0; |
| + void SetConfig(const Config& config); |
| + void StartRecognition(); |
| + void EndRecognition(); |
| + void TakeAudioChunk(const AudioChunk& data); |
| + void AudioChunksEnded(); |
| + bool IsRecognitionPending() const; |
| + int GetDesiredAudioChunkDurationMs() const; |
| - // Notifies the engine that audio capture has completed and no more chunks |
| - // will be pushed. The engine, however, can still provide further results |
| - // using the audio chunks collected so far. |
| - virtual void AudioChunksEnded() = 0; |
| + // net::URLFetcherDelegate methods. |
| + void OnURLFetchComplete(const net::URLFetcher* source) override; |
| + void OnURLFetchDownloadProgress(const net::URLFetcher* source, |
| + int64_t current, |
| + int64_t total) override; |
| - // Checks wheter recognition of pushed audio data is pending. |
| - virtual bool IsRecognitionPending() const = 0; |
| + private: |
| + Delegate* delegate_; |
| - // Retrieves the desired duration, in milliseconds, of pushed AudioChunk(s). |
| - virtual int GetDesiredAudioChunkDurationMs() const = 0; |
| + // Response status codes from the speech recognition webservice. |
| + static const int kWebserviceStatusNoError; |
| + static const int kWebserviceStatusErrorNoMatch; |
| - // set_delegate detached from constructor for lazy dependency injection. |
| - void set_delegate(Delegate* delegate) { delegate_ = delegate; } |
| + // Frame type for framed POST data. Do NOT change these. They must match |
| + // values the server expects. |
| + enum FrameType { |
| + FRAME_PREAMBLE_AUDIO = 0, |
| + FRAME_RECOGNITION_AUDIO = 1 |
| + }; |
| - protected: |
| - Delegate* delegate() const { return delegate_; } |
| + // Data types for the internal Finite State Machine (FSM). |
| + enum FSMState { |
| + STATE_IDLE = 0, |
| + STATE_BOTH_STREAMS_CONNECTED, |
| + STATE_WAITING_DOWNSTREAM_RESULTS, |
| + STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS |
| + }; |
| - private: |
| - Delegate* delegate_; |
| -}; |
| + enum FSMEvent { |
| + EVENT_END_RECOGNITION = 0, |
| + EVENT_START_RECOGNITION, |
| + EVENT_AUDIO_CHUNK, |
| + EVENT_AUDIO_CHUNKS_ENDED, |
| + EVENT_UPSTREAM_ERROR, |
| + EVENT_DOWNSTREAM_ERROR, |
| + EVENT_DOWNSTREAM_RESPONSE, |
| + EVENT_DOWNSTREAM_CLOSED, |
| + EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED |
| + }; |
| + |
| + struct FSMEventArgs { |
| + explicit FSMEventArgs(FSMEvent event_value); |
| + ~FSMEventArgs(); |
| + |
| + FSMEvent event; |
| -// These typedefs are to workaround the issue with certain versions of |
| -// Visual Studio where it gets confused between multiple Delegate |
| -// classes and gives a C2500 error. |
| -typedef SpeechRecognitionEngine::Delegate SpeechRecognitionEngineDelegate; |
| -typedef SpeechRecognitionEngine::Config SpeechRecognitionEngineConfig; |
|
hans
2016/04/13 21:23:06
Let's see if current MSVC can do without these..
|
| + // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|. |
| + scoped_refptr<const AudioChunk> audio_data; |
| + |
| + // In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes. |
| + std::unique_ptr<std::vector<uint8_t>> response; |
| + |
| + private: |
| + DISALLOW_COPY_AND_ASSIGN(FSMEventArgs); |
| + }; |
| + |
| + // Invoked by both upstream and downstream URLFetcher callbacks to handle |
| + // new chunk data, connection closed or errors notifications. |
| + void DispatchHTTPResponse(const net::URLFetcher* source, |
| + bool end_of_response); |
| + |
| + // Entry point for pushing any new external event into the recognizer FSM. |
| + void DispatchEvent(const FSMEventArgs& event_args); |
| + |
| + // Defines the behavior of the recognizer FSM, selecting the appropriate |
| + // transition according to the current state and event. |
| + FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args); |
| + |
| + // The methods below handle transitions of the recognizer FSM. |
| + FSMState ConnectBothStreams(const FSMEventArgs& event_args); |
| + FSMState TransmitAudioUpstream(const FSMEventArgs& event_args); |
| + FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args); |
| + FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args); |
| + FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args); |
| + FSMState CloseDownstream(const FSMEventArgs& event_args); |
| + FSMState AbortSilently(const FSMEventArgs& event_args); |
| + FSMState AbortWithError(const FSMEventArgs& event_args); |
| + FSMState Abort(SpeechRecognitionErrorCode error); |
| + FSMState DoNothing(const FSMEventArgs& event_args); |
| + FSMState NotFeasible(const FSMEventArgs& event_args); |
| + |
| + std::string GetAcceptedLanguages() const; |
| + std::string GenerateRequestKey() const; |
| + |
| + // Upload a single chunk of audio data. Handles both unframed and framed |
| + // upload formats, and uses the appropriate one. |
| + void UploadAudioChunk(const std::string& data, FrameType type, bool is_final); |
| + |
| + Config config_; |
| + std::unique_ptr<net::URLFetcher> upstream_fetcher_; |
| + std::unique_ptr<net::URLFetcher> downstream_fetcher_; |
| + scoped_refptr<net::URLRequestContextGetter> url_context_; |
| + std::unique_ptr<AudioEncoder> encoder_; |
| + std::unique_ptr<AudioEncoder> preamble_encoder_; |
| + ChunkedByteBuffer chunked_byte_buffer_; |
| + size_t previous_response_length_; |
| + bool got_last_definitive_result_; |
| + bool is_dispatching_event_; |
| + bool use_framed_post_data_; |
| + FSMState state_; |
| + |
| + DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionEngine); |
| +}; |
| } // namespace content |