Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ | 5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ |
| 6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ | 6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ |
| 7 | 7 |
| 8 #include <stdint.h> | 8 #include <stdint.h> |
| 9 #include <memory> | |
| 10 #include <string> | |
| 11 #include <vector> | |
| 9 | 12 |
| 10 #include <string> | 13 #include "base/macros.h" |
| 11 | 14 #include "base/memory/ref_counted.h" |
| 15 #include "base/threading/non_thread_safe.h" | |
| 16 #include "content/browser/speech/audio_encoder.h" | |
| 17 #include "content/browser/speech/chunked_byte_buffer.h" | |
| 12 #include "content/common/content_export.h" | 18 #include "content/common/content_export.h" |
| 13 #include "content/public/browser/speech_recognition_session_preamble.h" | 19 #include "content/public/browser/speech_recognition_session_preamble.h" |
| 20 #include "content/public/common/speech_recognition_error.h" | |
| 14 #include "content/public/common/speech_recognition_grammar.h" | 21 #include "content/public/common/speech_recognition_grammar.h" |
| 15 #include "content/public/common/speech_recognition_result.h" | 22 #include "content/public/common/speech_recognition_result.h" |
| 23 #include "net/url_request/url_fetcher_delegate.h" | |
| 24 | |
| 25 namespace net { | |
| 26 class URLRequestContextGetter; | |
| 27 } | |
| 16 | 28 |
| 17 namespace content { | 29 namespace content { |
| 18 | 30 |
| 19 class AudioChunk; | 31 class AudioChunk; |
| 20 struct SpeechRecognitionError; | 32 struct SpeechRecognitionError; |
| 33 struct SpeechRecognitionResult; | |
| 21 | 34 |
| 22 // This interface models the basic contract that a speech recognition engine, | 35 // A speech recognition engine supporting continuous recognition by means of |
| 23 // either working locally or relying on a remote web-service, must obey. | 36 // interaction with the Google streaming speech recognition webservice. |
| 24 // The expected call sequence for exported methods is: | 37 // |
| 38 // This class establishes two HTTPS connections with the webservice for each | |
| 39 // session, herein called "upstream" and "downstream". Audio chunks are sent on | |
| 40 // the upstream by means of a chunked HTTP POST upload. Recognition results are | |
| 41 // retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream) | |
| 42 // on the downstream by means of a chunked HTTP GET request. Pairing between the | |
| 43 // two streams is handled through a randomly generated key, unique for each | |
| 44 // request, which is passed in the &pair= arg to both stream request URLs. In | |
| 45 // the case of a regular session, the upstream is closed when the audio capture | |
| 46 // ends (notified through a |AudioChunksEnded| call) and the downstream waits | |
| 47 // for a corresponding server closure (some late results may still arrive | |
| 48 // after closing the upstream). Both streams are guaranteed to be closed when | |
| 49 // |EndRecognition| call is issued. | |
| 50 // | |
| 51 // The expected call sequence is: | |
| 25 // StartRecognition Mandatory at beginning of SR. | 52 // StartRecognition Mandatory at beginning of SR. |
| 26 // TakeAudioChunk For every audio chunk pushed. | 53 // TakeAudioChunk For every audio chunk pushed. |
| 27 // AudioChunksEnded Finalize the audio stream (omitted in case of errors). | 54 // AudioChunksEnded Finalize the audio stream (omitted in case of errors). |
| 28 // EndRecognition Mandatory at end of SR (even on errors). | 55 // EndRecognition Mandatory at end of SR (even on errors). |
| 29 // No delegate callbacks are allowed before StartRecognition or after | 56 // |
| 57 // No delegate callbacks are performed before StartRecognition or after | |
| 30 // EndRecognition. If a recognition was started, the caller can free the | 58 // EndRecognition. If a recognition was started, the caller can free the |
| 31 // SpeechRecognitionEngine only after calling EndRecognition. | 59 // SpeechRecognitionEngine only after calling EndRecognition. |
| 32 class SpeechRecognitionEngine { | 60 |
| 61 class CONTENT_EXPORT SpeechRecognitionEngine | |
| 62 : public net::URLFetcherDelegate, | |
| 63 public NON_EXPORTED_BASE(base::NonThreadSafe) { | |
| 33 public: | 64 public: |
| 34 // Interface for receiving callbacks from this object. | |
| 35 class Delegate { | 65 class Delegate { |
| 36 public: | 66 public: |
| 37 // Called whenever a result is retrieved. It might be issued several times, | 67 // Called whenever a result is retrieved. |
| 38 // (e.g., in the case of continuous speech recognition engine | |
| 39 // implementations). | |
| 40 virtual void OnSpeechRecognitionEngineResults( | 68 virtual void OnSpeechRecognitionEngineResults( |
| 41 const SpeechRecognitionResults& results) = 0; | 69 const SpeechRecognitionResults& results) = 0; |
| 42 virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0; | 70 virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0; |
| 43 virtual void OnSpeechRecognitionEngineError( | 71 virtual void OnSpeechRecognitionEngineError( |
| 44 const SpeechRecognitionError& error) = 0; | 72 const SpeechRecognitionError& error) = 0; |
| 45 | 73 |
| 46 protected: | 74 protected: |
| 47 virtual ~Delegate() {} | 75 virtual ~Delegate() {} |
| 48 }; | 76 }; |
| 49 | 77 |
| 50 // Remote engine configuration. | 78 // Engine configuration. |
| 51 struct CONTENT_EXPORT Config { | 79 struct CONTENT_EXPORT Config { |
| 52 Config(); | 80 Config(); |
| 53 ~Config(); | 81 ~Config(); |
| 54 | 82 |
| 55 std::string language; | 83 std::string language; |
| 56 SpeechRecognitionGrammarArray grammars; | 84 SpeechRecognitionGrammarArray grammars; |
| 57 bool filter_profanities; | 85 bool filter_profanities; |
| 58 bool continuous; | 86 bool continuous; |
| 59 bool interim_results; | 87 bool interim_results; |
| 60 uint32_t max_hypotheses; | 88 uint32_t max_hypotheses; |
| 61 std::string hardware_info; | 89 std::string hardware_info; |
| 62 std::string origin_url; | 90 std::string origin_url; |
| 63 int audio_sample_rate; | 91 int audio_sample_rate; |
| 64 int audio_num_bits_per_sample; | 92 int audio_num_bits_per_sample; |
| 65 std::string auth_token; | 93 std::string auth_token; |
| 66 std::string auth_scope; | 94 std::string auth_scope; |
| 67 scoped_refptr<SpeechRecognitionSessionPreamble> preamble; | 95 scoped_refptr<SpeechRecognitionSessionPreamble> preamble; |
| 68 }; | 96 }; |
| 69 | 97 |
| 70 virtual ~SpeechRecognitionEngine() {} | |
| 71 | |
| 72 // Set/change the recognition engine configuration. It is not allowed to call | |
| 73 // this function while a recognition is ongoing. | |
| 74 virtual void SetConfig(const Config& config) = 0; | |
| 75 | |
| 76 // Called when the speech recognition begins, before any TakeAudioChunk call. | |
| 77 virtual void StartRecognition() = 0; | |
| 78 | |
| 79 // End any recognition activity and don't make any further callback. | |
| 80 // Must be always called to close the corresponding StartRecognition call, | |
| 81 // even in case of errors. | |
| 82 // No further TakeAudioChunk/AudioChunksEnded calls are allowed after this. | |
| 83 virtual void EndRecognition() = 0; | |
| 84 | |
| 85 // Push a chunk of uncompressed audio data, where the chunk length agrees with | |
| 86 // GetDesiredAudioChunkDurationMs(). | |
| 87 virtual void TakeAudioChunk(const AudioChunk& data) = 0; | |
| 88 | |
| 89 // Notifies the engine that audio capture has completed and no more chunks | |
| 90 // will be pushed. The engine, however, can still provide further results | |
| 91 // using the audio chunks collected so far. | |
| 92 virtual void AudioChunksEnded() = 0; | |
| 93 | |
| 94 // Checks whether recognition of pushed audio data is pending. | |
| 95 virtual bool IsRecognitionPending() const = 0; | |
| 96 | |
| 97 // Retrieves the desired duration, in milliseconds, of pushed AudioChunk(s). | |
| 98 virtual int GetDesiredAudioChunkDurationMs() const = 0; | |
| 99 | |
| 100 // set_delegate detached from constructor for lazy dependency injection. | 98 // set_delegate detached from constructor for lazy dependency injection. |
| 101 void set_delegate(Delegate* delegate) { delegate_ = delegate; } | 99 void set_delegate(Delegate* delegate) { delegate_ = delegate; } |
| 102 | 100 |
| 103 protected: | 101 // Duration of each audio packet. |
| 104 Delegate* delegate() const { return delegate_; } | 102 static const int kAudioPacketIntervalMs; |
| 103 | |
| 104 // IDs passed to URLFetcher::Create(). Used for testing. | |
| 105 static const int kUpstreamUrlFetcherIdForTesting; | |
| 106 static const int kDownstreamUrlFetcherIdForTesting; | |
| 107 | |
| 108 explicit SpeechRecognitionEngine(net::URLRequestContextGetter* context); | |
| 109 ~SpeechRecognitionEngine() override; | |
| 110 | |
| 111 void SetConfig(const Config& config); | |
| 112 void StartRecognition(); | |
| 113 void EndRecognition(); | |
| 114 void TakeAudioChunk(const AudioChunk& data); | |
| 115 void AudioChunksEnded(); | |
| 116 bool IsRecognitionPending() const; | |
| 117 int GetDesiredAudioChunkDurationMs() const; | |
| 118 | |
| 119 // net::URLFetcherDelegate methods. | |
| 120 void OnURLFetchComplete(const net::URLFetcher* source) override; | |
| 121 void OnURLFetchDownloadProgress(const net::URLFetcher* source, | |
| 122 int64_t current, | |
| 123 int64_t total) override; | |
| 105 | 124 |
| 106 private: | 125 private: |
| 107 Delegate* delegate_; | 126 Delegate* delegate_; |
| 127 | |
| 128 // Response status codes from the speech recognition webservice. | |
| 129 static const int kWebserviceStatusNoError; | |
| 130 static const int kWebserviceStatusErrorNoMatch; | |
| 131 | |
| 132 // Frame type for framed POST data. Do NOT change these. They must match | |
| 133 // values the server expects. | |
| 134 enum FrameType { | |
| 135 FRAME_PREAMBLE_AUDIO = 0, | |
| 136 FRAME_RECOGNITION_AUDIO = 1 | |
| 137 }; | |
| 138 | |
| 139 // Data types for the internal Finite State Machine (FSM). | |
| 140 enum FSMState { | |
| 141 STATE_IDLE = 0, | |
| 142 STATE_BOTH_STREAMS_CONNECTED, | |
| 143 STATE_WAITING_DOWNSTREAM_RESULTS, | |
| 144 STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS | |
| 145 }; | |
| 146 | |
| 147 enum FSMEvent { | |
| 148 EVENT_END_RECOGNITION = 0, | |
| 149 EVENT_START_RECOGNITION, | |
| 150 EVENT_AUDIO_CHUNK, | |
| 151 EVENT_AUDIO_CHUNKS_ENDED, | |
| 152 EVENT_UPSTREAM_ERROR, | |
| 153 EVENT_DOWNSTREAM_ERROR, | |
| 154 EVENT_DOWNSTREAM_RESPONSE, | |
| 155 EVENT_DOWNSTREAM_CLOSED, | |
| 156 EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED | |
| 157 }; | |
| 158 | |
| 159 struct FSMEventArgs { | |
| 160 explicit FSMEventArgs(FSMEvent event_value); | |
| 161 ~FSMEventArgs(); | |
| 162 | |
| 163 FSMEvent event; | |
| 164 | |
| 165 // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|. | |
| 166 scoped_refptr<const AudioChunk> audio_data; | |
| 167 | |
| 168 // In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes. | |
| 169 std::unique_ptr<std::vector<uint8_t>> response; | |
| 170 | |
| 171 private: | |
| 172 DISALLOW_COPY_AND_ASSIGN(FSMEventArgs); | |
| 173 }; | |
| 174 | |
| 175 // Invoked by both upstream and downstream URLFetcher callbacks to handle | |
| 176 // new chunk data, connection closed or errors notifications. | |
| 177 void DispatchHTTPResponse(const net::URLFetcher* source, | |
| 178 bool end_of_response); | |
| 179 | |
| 180 // Entry point for pushing any new external event into the recognizer FSM. | |
| 181 void DispatchEvent(const FSMEventArgs& event_args); | |
| 182 | |
| 183 // Defines the behavior of the recognizer FSM, selecting the appropriate | |
| 184 // transition according to the current state and event. | |
| 185 FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args); | |
| 186 | |
| 187 // The methods below handle transitions of the recognizer FSM. | |
| 188 FSMState ConnectBothStreams(const FSMEventArgs& event_args); | |
| 189 FSMState TransmitAudioUpstream(const FSMEventArgs& event_args); | |
| 190 FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args); | |
| 191 FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args); | |
| 192 FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args); | |
| 193 FSMState CloseDownstream(const FSMEventArgs& event_args); | |
| 194 FSMState AbortSilently(const FSMEventArgs& event_args); | |
| 195 FSMState AbortWithError(const FSMEventArgs& event_args); | |
| 196 FSMState Abort(SpeechRecognitionErrorCode error); | |
| 197 FSMState DoNothing(const FSMEventArgs& event_args); | |
| 198 FSMState NotFeasible(const FSMEventArgs& event_args); | |
| 199 | |
| 200 std::string GetAcceptedLanguages() const; | |
| 201 std::string GenerateRequestKey() const; | |
| 202 | |
| 203 // Upload a single chunk of audio data. Handles both unframed and framed | |
| 204 // upload formats, and uses the appropriate one. | |
| 205 void UploadAudioChunk(const std::string& data, FrameType type, bool is_final); | |
| 206 | |
| 207 Config config_; | |
| 208 std::unique_ptr<net::URLFetcher> upstream_fetcher_; | |
| 209 std::unique_ptr<net::URLFetcher> downstream_fetcher_; | |
| 210 scoped_refptr<net::URLRequestContextGetter> url_context_; | |
| 211 std::unique_ptr<AudioEncoder> encoder_; | |
| 212 std::unique_ptr<AudioEncoder> preamble_encoder_; | |
| 213 ChunkedByteBuffer chunked_byte_buffer_; | |
| 214 size_t previous_response_length_; | |
| 215 bool got_last_definitive_result_; | |
| 216 bool is_dispatching_event_; | |
| 217 bool use_framed_post_data_; | |
| 218 FSMState state_; | |
| 219 | |
| 220 DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionEngine); | |
| 108 }; | 221 }; |
| 109 | 222 |
| 110 // These typedefs are to workaround the issue with certain versions of | |
| 111 // Visual Studio where it gets confused between multiple Delegate | |
| 112 // classes and gives a C2500 error. | |
| 113 typedef SpeechRecognitionEngine::Delegate SpeechRecognitionEngineDelegate; | |
| 114 typedef SpeechRecognitionEngine::Config SpeechRecognitionEngineConfig; | |
|
hans
2016/04/13 21:23:06
Let's see if current MSVC can do without these..
| |
| 115 | |
| 116 } // namespace content | 223 } // namespace content |
| 117 | 224 |
| 118 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ | 225 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ |
| OLD | NEW |