Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(86)

Side by Side Diff: content/browser/speech/speech_recognition_engine.h

Issue 1891543002: Devirtualize SpeechRecognitionEngine (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@kill_one_shot_engine
Patch Set: drop an include Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ 5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_
6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ 6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_
7 7
8 #include <stdint.h> 8 #include <stdint.h>
9 #include <memory>
10 #include <string>
11 #include <vector>
9 12
10 #include <string> 13 #include "base/macros.h"
11 14 #include "base/memory/ref_counted.h"
15 #include "base/threading/non_thread_safe.h"
16 #include "content/browser/speech/audio_encoder.h"
17 #include "content/browser/speech/chunked_byte_buffer.h"
12 #include "content/common/content_export.h" 18 #include "content/common/content_export.h"
13 #include "content/public/browser/speech_recognition_session_preamble.h" 19 #include "content/public/browser/speech_recognition_session_preamble.h"
20 #include "content/public/common/speech_recognition_error.h"
14 #include "content/public/common/speech_recognition_grammar.h" 21 #include "content/public/common/speech_recognition_grammar.h"
15 #include "content/public/common/speech_recognition_result.h" 22 #include "content/public/common/speech_recognition_result.h"
23 #include "net/url_request/url_fetcher_delegate.h"
24
25 namespace net {
26 class URLRequestContextGetter;
27 }
16 28
17 namespace content { 29 namespace content {
18 30
19 class AudioChunk; 31 class AudioChunk;
20 struct SpeechRecognitionError; 32 struct SpeechRecognitionError;
33 struct SpeechRecognitionResult;
21 34
22 // This interface models the basic contract that a speech recognition engine, 35 // A speech recognition engine supporting continuous recognition by means of
23 // either working locally or relying on a remote web-service, must obey. 36 // interaction with the Google streaming speech recognition webservice.
24 // The expected call sequence for exported methods is: 37 //
38 // This class establishes two HTTPS connections with the webservice for each
39 // session, herein called "upstream" and "downstream". Audio chunks are sent on
40 // the upstream by means of a chunked HTTP POST upload. Recognition results are
41 // retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
42 // on the downstream by means of a chunked HTTP GET request. Pairing between the
43 // two streams is handled through a randomly generated key, unique for each
44 // request, which is passed in the &pair= arg to both stream request URLs. In
45 // the case of a regular session, the upstream is closed when the audio capture
46 // ends (notified through an |AudioChunksEnded| call) and the downstream waits
47 // for a corresponding server closure (eventually some late results can come
48 // after closing the upstream). Both streams are guaranteed to be closed when
49 // |EndRecognition| call is issued.
50 //
51 // The expected call sequence is:
25 // StartRecognition Mandatory at beginning of SR. 52 // StartRecognition Mandatory at beginning of SR.
26 // TakeAudioChunk For every audio chunk pushed. 53 // TakeAudioChunk For every audio chunk pushed.
27 // AudioChunksEnded Finalize the audio stream (omitted in case of errors). 54 // AudioChunksEnded Finalize the audio stream (omitted in case of errors).
28 // EndRecognition Mandatory at end of SR (even on errors). 55 // EndRecognition Mandatory at end of SR (even on errors).
29 // No delegate callbacks are allowed before StartRecognition or after 56 //
57 // No delegate callbacks are performed before StartRecognition or after
30 // EndRecognition. If a recognition was started, the caller can free the 58 // EndRecognition. If a recognition was started, the caller can free the
31 // SpeechRecognitionEngine only after calling EndRecognition. 59 // SpeechRecognitionEngine only after calling EndRecognition.
32 class SpeechRecognitionEngine { 60
61 class CONTENT_EXPORT SpeechRecognitionEngine
62 : public net::URLFetcherDelegate,
63 public NON_EXPORTED_BASE(base::NonThreadSafe) {
33 public: 64 public:
34 // Interface for receiving callbacks from this object.
35 class Delegate { 65 class Delegate {
36 public: 66 public:
37 // Called whenever a result is retrieved. It might be issued several times, 67 // Called whenever a result is retrieved.
38 // (e.g., in the case of continuous speech recognition engine
39 // implementations).
40 virtual void OnSpeechRecognitionEngineResults( 68 virtual void OnSpeechRecognitionEngineResults(
41 const SpeechRecognitionResults& results) = 0; 69 const SpeechRecognitionResults& results) = 0;
42 virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0; 70 virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0;
43 virtual void OnSpeechRecognitionEngineError( 71 virtual void OnSpeechRecognitionEngineError(
44 const SpeechRecognitionError& error) = 0; 72 const SpeechRecognitionError& error) = 0;
45 73
46 protected: 74 protected:
47 virtual ~Delegate() {} 75 virtual ~Delegate() {}
48 }; 76 };
49 77
50 // Remote engine configuration. 78 // Engine configuration.
51 struct CONTENT_EXPORT Config { 79 struct CONTENT_EXPORT Config {
52 Config(); 80 Config();
53 ~Config(); 81 ~Config();
54 82
55 std::string language; 83 std::string language;
56 SpeechRecognitionGrammarArray grammars; 84 SpeechRecognitionGrammarArray grammars;
57 bool filter_profanities; 85 bool filter_profanities;
58 bool continuous; 86 bool continuous;
59 bool interim_results; 87 bool interim_results;
60 uint32_t max_hypotheses; 88 uint32_t max_hypotheses;
61 std::string hardware_info; 89 std::string hardware_info;
62 std::string origin_url; 90 std::string origin_url;
63 int audio_sample_rate; 91 int audio_sample_rate;
64 int audio_num_bits_per_sample; 92 int audio_num_bits_per_sample;
65 std::string auth_token; 93 std::string auth_token;
66 std::string auth_scope; 94 std::string auth_scope;
67 scoped_refptr<SpeechRecognitionSessionPreamble> preamble; 95 scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
68 }; 96 };
69 97
70 virtual ~SpeechRecognitionEngine() {}
71
72 // Set/change the recognition engine configuration. It is not allowed to call
73 // this function while a recognition is ongoing.
74 virtual void SetConfig(const Config& config) = 0;
75
76 // Called when the speech recognition begins, before any TakeAudioChunk call.
77 virtual void StartRecognition() = 0;
78
79 // End any recognition activity and don't make any further callback.
80 // Must be always called to close the corresponding StartRecognition call,
81 // even in case of errors.
82 // No further TakeAudioChunk/AudioChunksEnded calls are allowed after this.
83 virtual void EndRecognition() = 0;
84
85 // Push a chunk of uncompressed audio data, where the chunk length agrees with
86 // GetDesiredAudioChunkDurationMs().
87 virtual void TakeAudioChunk(const AudioChunk& data) = 0;
88
89 // Notifies the engine that audio capture has completed and no more chunks
90 // will be pushed. The engine, however, can still provide further results
91 // using the audio chunks collected so far.
92 virtual void AudioChunksEnded() = 0;
93
94 // Checks whether recognition of pushed audio data is pending.
95 virtual bool IsRecognitionPending() const = 0;
96
97 // Retrieves the desired duration, in milliseconds, of pushed AudioChunk(s).
98 virtual int GetDesiredAudioChunkDurationMs() const = 0;
99
100 // set_delegate detached from constructor for lazy dependency injection. 98 // set_delegate detached from constructor for lazy dependency injection.
101 void set_delegate(Delegate* delegate) { delegate_ = delegate; } 99 void set_delegate(Delegate* delegate) { delegate_ = delegate; }
102 100
103 protected: 101 // Duration of each audio packet.
104 Delegate* delegate() const { return delegate_; } 102 static const int kAudioPacketIntervalMs;
103
104 // IDs passed to URLFetcher::Create(). Used for testing.
105 static const int kUpstreamUrlFetcherIdForTesting;
106 static const int kDownstreamUrlFetcherIdForTesting;
107
108 explicit SpeechRecognitionEngine(net::URLRequestContextGetter* context);
109 ~SpeechRecognitionEngine() override;
110
111 void SetConfig(const Config& config);
112 void StartRecognition();
113 void EndRecognition();
114 void TakeAudioChunk(const AudioChunk& data);
115 void AudioChunksEnded();
116 bool IsRecognitionPending() const;
117 int GetDesiredAudioChunkDurationMs() const;
118
119 // net::URLFetcherDelegate methods.
120 void OnURLFetchComplete(const net::URLFetcher* source) override;
121 void OnURLFetchDownloadProgress(const net::URLFetcher* source,
122 int64_t current,
123 int64_t total) override;
105 124
106 private: 125 private:
107 Delegate* delegate_; 126 Delegate* delegate_;
127
128 // Response status codes from the speech recognition webservice.
129 static const int kWebserviceStatusNoError;
130 static const int kWebserviceStatusErrorNoMatch;
131
132 // Frame type for framed POST data. Do NOT change these. They must match
133 // values the server expects.
134 enum FrameType {
135 FRAME_PREAMBLE_AUDIO = 0,
136 FRAME_RECOGNITION_AUDIO = 1
137 };
138
139 // Data types for the internal Finite State Machine (FSM).
140 enum FSMState {
141 STATE_IDLE = 0,
142 STATE_BOTH_STREAMS_CONNECTED,
143 STATE_WAITING_DOWNSTREAM_RESULTS,
144 STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
145 };
146
147 enum FSMEvent {
148 EVENT_END_RECOGNITION = 0,
149 EVENT_START_RECOGNITION,
150 EVENT_AUDIO_CHUNK,
151 EVENT_AUDIO_CHUNKS_ENDED,
152 EVENT_UPSTREAM_ERROR,
153 EVENT_DOWNSTREAM_ERROR,
154 EVENT_DOWNSTREAM_RESPONSE,
155 EVENT_DOWNSTREAM_CLOSED,
156 EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
157 };
158
159 struct FSMEventArgs {
160 explicit FSMEventArgs(FSMEvent event_value);
161 ~FSMEventArgs();
162
163 FSMEvent event;
164
165 // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
166 scoped_refptr<const AudioChunk> audio_data;
167
168 // In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes.
169 std::unique_ptr<std::vector<uint8_t>> response;
170
171 private:
172 DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
173 };
174
175 // Invoked by both upstream and downstream URLFetcher callbacks to handle
176 // new chunk data, connection closed or errors notifications.
177 void DispatchHTTPResponse(const net::URLFetcher* source,
178 bool end_of_response);
179
180 // Entry point for pushing any new external event into the recognizer FSM.
181 void DispatchEvent(const FSMEventArgs& event_args);
182
183 // Defines the behavior of the recognizer FSM, selecting the appropriate
184 // transition according to the current state and event.
185 FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
186
187 // The methods below handle transitions of the recognizer FSM.
188 FSMState ConnectBothStreams(const FSMEventArgs& event_args);
189 FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
190 FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
191 FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
192 FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
193 FSMState CloseDownstream(const FSMEventArgs& event_args);
194 FSMState AbortSilently(const FSMEventArgs& event_args);
195 FSMState AbortWithError(const FSMEventArgs& event_args);
196 FSMState Abort(SpeechRecognitionErrorCode error);
197 FSMState DoNothing(const FSMEventArgs& event_args);
198 FSMState NotFeasible(const FSMEventArgs& event_args);
199
200 std::string GetAcceptedLanguages() const;
201 std::string GenerateRequestKey() const;
202
203 // Upload a single chunk of audio data. Handles both unframed and framed
204 // upload formats, and uses the appropriate one.
205 void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);
206
207 Config config_;
208 std::unique_ptr<net::URLFetcher> upstream_fetcher_;
209 std::unique_ptr<net::URLFetcher> downstream_fetcher_;
210 scoped_refptr<net::URLRequestContextGetter> url_context_;
211 std::unique_ptr<AudioEncoder> encoder_;
212 std::unique_ptr<AudioEncoder> preamble_encoder_;
213 ChunkedByteBuffer chunked_byte_buffer_;
214 size_t previous_response_length_;
215 bool got_last_definitive_result_;
216 bool is_dispatching_event_;
217 bool use_framed_post_data_;
218 FSMState state_;
219
220 DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionEngine);
108 }; 221 };
109 222
110 // These typedefs are to workaround the issue with certain versions of
111 // Visual Studio where it gets confused between multiple Delegate
112 // classes and gives a C2500 error.
113 typedef SpeechRecognitionEngine::Delegate SpeechRecognitionEngineDelegate;
114 typedef SpeechRecognitionEngine::Config SpeechRecognitionEngineConfig;
hans 2016/04/13 21:23:06 Let's see if current MSVC can do without these..
115
116 } // namespace content 223 } // namespace content
117 224
118 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ 225 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_
OLDNEW
« no previous file with comments | « content/browser/speech/speech_recognition_browsertest.cc ('k') | content/browser/speech/speech_recognition_engine.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698