content/browser/speech/speech_recognition_engine.h - Issue 1891543002: Devirtualize SpeechRecognitionEngine

Side by Side Diff: content/browser/speech/speech_recognition_engine.h

Issue 1891543002: Devirtualize SpeechRecognitionEngine (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@kill_one_shot_engine

Patch Set: drop an include Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_	5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_

6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_	6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_

7	7

8 #include <stdint.h>	8 #include <stdint.h>

	9 #include <memory>

	10 #include <string>

	11 #include <vector>

9	12

10 #include <string>	13 #include "base/macros.h"

11	14 #include "base/memory/ref_counted.h"

	15 #include "base/threading/non_thread_safe.h"

	16 #include "content/browser/speech/audio_encoder.h"

	17 #include "content/browser/speech/chunked_byte_buffer.h"

12 #include "content/common/content_export.h"	18 #include "content/common/content_export.h"

13 #include "content/public/browser/speech_recognition_session_preamble.h"	19 #include "content/public/browser/speech_recognition_session_preamble.h"

	20 #include "content/public/common/speech_recognition_error.h"

14 #include "content/public/common/speech_recognition_grammar.h"	21 #include "content/public/common/speech_recognition_grammar.h"

15 #include "content/public/common/speech_recognition_result.h"	22 #include "content/public/common/speech_recognition_result.h"

	23 #include "net/url_request/url_fetcher_delegate.h"

	24

	25 namespace net {

	26 class URLRequestContextGetter;

	27 }

16	28

17 namespace content {	29 namespace content {

18	30

19 class AudioChunk;	31 class AudioChunk;

20 struct SpeechRecognitionError;	32 struct SpeechRecognitionError;

	33 struct SpeechRecognitionResult;

21	34

22 // This interface models the basic contract that a speech recognition engine,	35 // A speech recognition engine supporting continuous recognition by means of

23 // either working locally or relying on a remote web-service, must obey.	36 // interaction with the Google streaming speech recognition webservice.

24 // The expected call sequence for exported methods is:	37 //

	38 // This class establishes two HTTPS connections with the webservice for each

	39 // session, herein called "upstream" and "downstream". Audio chunks are sent on

	40 // the upstream by means of a chunked HTTP POST upload. Recognition results are

	41 // retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)

	42 // on the downstream by means of a chunked HTTP GET request. Pairing between the

	43 // two streams is handled through a randomly generated key, unique for each

	44 // request, which is passed in the &pair= arg to both stream request URLs. In

	45 // the case of a regular session, the upstream is closed when the audio capture

	46 // ends (notified through a |AudioChunksEnded| call) and the downstream waits

	47 // for a corresponding server closure (eventually some late results can come

	48 // after closing the upstream). Both streams are guaranteed to be closed when

	49 // |EndRecognition| call is issued.

	50 //

	51 // The expected call sequence is:

25 // StartRecognition Mandatory at beginning of SR.	52 // StartRecognition Mandatory at beginning of SR.

26 // TakeAudioChunk For every audio chunk pushed.	53 // TakeAudioChunk For every audio chunk pushed.

27 // AudioChunksEnded Finalize the audio stream (omitted in case of errors).	54 // AudioChunksEnded Finalize the audio stream (omitted in case of errors).

28 // EndRecognition Mandatory at end of SR (even on errors).	55 // EndRecognition Mandatory at end of SR (even on errors).

29 // No delegate callbacks are allowed before StartRecognition or after	56 //

	57 // No delegate callbacks are performed before StartRecognition or after

30 // EndRecognition. If a recognition was started, the caller can free the	58 // EndRecognition. If a recognition was started, the caller can free the

31 // SpeechRecognitionEngine only after calling EndRecognition.	59 // SpeechRecognitionEngine only after calling EndRecognition.

32 class SpeechRecognitionEngine {	60

	61 class CONTENT_EXPORT SpeechRecognitionEngine

	62 : public net::URLFetcherDelegate,

	63 public NON_EXPORTED_BASE(base::NonThreadSafe) {

33 public:	64 public:

34 // Interface for receiving callbacks from this object.

35 class Delegate {	65 class Delegate {

36 public:	66 public:

37 // Called whenever a result is retrieved. It might be issued several times,	67 // Called whenever a result is retrieved.

38 // (e.g., in the case of continuous speech recognition engine

39 // implementations).

40 virtual void OnSpeechRecognitionEngineResults(	68 virtual void OnSpeechRecognitionEngineResults(

41 const SpeechRecognitionResults& results) = 0;	69 const SpeechRecognitionResults& results) = 0;

42 virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0;	70 virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0;

43 virtual void OnSpeechRecognitionEngineError(	71 virtual void OnSpeechRecognitionEngineError(

44 const SpeechRecognitionError& error) = 0;	72 const SpeechRecognitionError& error) = 0;

45	73

46 protected:	74 protected:

47 virtual ~Delegate() {}	75 virtual ~Delegate() {}

48 };	76 };

49	77

50 // Remote engine configuration.	78 // Engine configuration.

51 struct CONTENT_EXPORT Config {	79 struct CONTENT_EXPORT Config {

52 Config();	80 Config();

53 ~Config();	81 ~Config();

54	82

55 std::string language;	83 std::string language;

56 SpeechRecognitionGrammarArray grammars;	84 SpeechRecognitionGrammarArray grammars;

57 bool filter_profanities;	85 bool filter_profanities;

58 bool continuous;	86 bool continuous;

59 bool interim_results;	87 bool interim_results;

60 uint32_t max_hypotheses;	88 uint32_t max_hypotheses;

61 std::string hardware_info;	89 std::string hardware_info;

62 std::string origin_url;	90 std::string origin_url;

63 int audio_sample_rate;	91 int audio_sample_rate;

64 int audio_num_bits_per_sample;	92 int audio_num_bits_per_sample;

65 std::string auth_token;	93 std::string auth_token;

66 std::string auth_scope;	94 std::string auth_scope;

67 scoped_refptr<SpeechRecognitionSessionPreamble> preamble;	95 scoped_refptr<SpeechRecognitionSessionPreamble> preamble;

68 };	96 };

69	97

70 virtual ~SpeechRecognitionEngine() {}

71

72 // Set/change the recognition engine configuration. It is not allowed to call

73 // this function while a recognition is ongoing.

74 virtual void SetConfig(const Config& config) = 0;

75

76 // Called when the speech recognition begins, before any TakeAudioChunk call.

77 virtual void StartRecognition() = 0;

78

79 // End any recognition activity and don't make any further callback.

80 // Must be always called to close the corresponding StartRecognition call,

81 // even in case of errors.

82 // No further TakeAudioChunk/AudioChunksEnded calls are allowed after this.

83 virtual void EndRecognition() = 0;

84

85 // Push a chunk of uncompressed audio data, where the chunk length agrees with

86 // GetDesiredAudioChunkDurationMs().

87 virtual void TakeAudioChunk(const AudioChunk& data) = 0;

88

89 // Notifies the engine that audio capture has completed and no more chunks

90 // will be pushed. The engine, however, can still provide further results

91 // using the audio chunks collected so far.

92 virtual void AudioChunksEnded() = 0;

93

94 // Checks whether recognition of pushed audio data is pending.

95 virtual bool IsRecognitionPending() const = 0;

96

97 // Retrieves the desired duration, in milliseconds, of pushed AudioChunk(s).

98 virtual int GetDesiredAudioChunkDurationMs() const = 0;

99

100 // set_delegate detached from constructor for lazy dependency injection.	98 // set_delegate detached from constructor for lazy dependency injection.

101 void set_delegate(Delegate* delegate) { delegate_ = delegate; }	99 void set_delegate(Delegate* delegate) { delegate_ = delegate; }

102	100

103 protected:	101 // Duration of each audio packet.

104 Delegate* delegate() const { return delegate_; }	102 static const int kAudioPacketIntervalMs;

	103

	104 // IDs passed to URLFetcher::Create(). Used for testing.

	105 static const int kUpstreamUrlFetcherIdForTesting;

	106 static const int kDownstreamUrlFetcherIdForTesting;

	107

	108 explicit SpeechRecognitionEngine(net::URLRequestContextGetter* context);

	109 ~SpeechRecognitionEngine() override;

	110

	111 void SetConfig(const Config& config);

	112 void StartRecognition();

	113 void EndRecognition();

	114 void TakeAudioChunk(const AudioChunk& data);

	115 void AudioChunksEnded();

	116 bool IsRecognitionPending() const;

	117 int GetDesiredAudioChunkDurationMs() const;

	118

	119 // net::URLFetcherDelegate methods.

	120 void OnURLFetchComplete(const net::URLFetcher* source) override;

	121 void OnURLFetchDownloadProgress(const net::URLFetcher* source,

	122 int64_t current,

	123 int64_t total) override;

105	124

106 private:	125 private:

107 Delegate* delegate_;	126 Delegate* delegate_;

	127

	128 // Response status codes from the speech recognition webservice.

	129 static const int kWebserviceStatusNoError;

	130 static const int kWebserviceStatusErrorNoMatch;

	131

	132 // Frame type for framed POST data. Do NOT change these. They must match

	133 // values the server expects.

	134 enum FrameType {

	135 FRAME_PREAMBLE_AUDIO = 0,

	136 FRAME_RECOGNITION_AUDIO = 1

	137 };

	138

	139 // Data types for the internal Finite State Machine (FSM).

	140 enum FSMState {

	141 STATE_IDLE = 0,

	142 STATE_BOTH_STREAMS_CONNECTED,

	143 STATE_WAITING_DOWNSTREAM_RESULTS,

	144 STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS

	145 };

	146

	147 enum FSMEvent {

	148 EVENT_END_RECOGNITION = 0,

	149 EVENT_START_RECOGNITION,

	150 EVENT_AUDIO_CHUNK,

	151 EVENT_AUDIO_CHUNKS_ENDED,

	152 EVENT_UPSTREAM_ERROR,

	153 EVENT_DOWNSTREAM_ERROR,

	154 EVENT_DOWNSTREAM_RESPONSE,

	155 EVENT_DOWNSTREAM_CLOSED,

	156 EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED

	157 };

	158

	159 struct FSMEventArgs {

	160 explicit FSMEventArgs(FSMEvent event_value);

	161 ~FSMEventArgs();

	162

	163 FSMEvent event;

	164

	165 // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.

	166 scoped_refptr<const AudioChunk> audio_data;

	167

	168 // In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes.

	169 std::unique_ptr<std::vector<uint8_t>> response;

	170

	171 private:

	172 DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);

	173 };

	174

	175 // Invoked by both upstream and downstream URLFetcher callbacks to handle

	176 // new chunk data, connection closed or errors notifications.

	177 void DispatchHTTPResponse(const net::URLFetcher* source,

	178 bool end_of_response);

	179

	180 // Entry point for pushing any new external event into the recognizer FSM.

	181 void DispatchEvent(const FSMEventArgs& event_args);

	182

	183 // Defines the behavior of the recognizer FSM, selecting the appropriate

	184 // transition according to the current state and event.

	185 FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);

	186

	187 // The methods below handle transitions of the recognizer FSM.

	188 FSMState ConnectBothStreams(const FSMEventArgs& event_args);

	189 FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);

	190 FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);

	191 FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);

	192 FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);

	193 FSMState CloseDownstream(const FSMEventArgs& event_args);

	194 FSMState AbortSilently(const FSMEventArgs& event_args);

	195 FSMState AbortWithError(const FSMEventArgs& event_args);

	196 FSMState Abort(SpeechRecognitionErrorCode error);

	197 FSMState DoNothing(const FSMEventArgs& event_args);

	198 FSMState NotFeasible(const FSMEventArgs& event_args);

	199

	200 std::string GetAcceptedLanguages() const;

	201 std::string GenerateRequestKey() const;

	202

	203 // Upload a single chunk of audio data. Handles both unframed and framed

	204 // upload formats, and uses the appropriate one.

	205 void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);

	206

	207 Config config_;

	208 std::unique_ptr<net::URLFetcher> upstream_fetcher_;

	209 std::unique_ptr<net::URLFetcher> downstream_fetcher_;

	210 scoped_refptr<net::URLRequestContextGetter> url_context_;

	211 std::unique_ptr<AudioEncoder> encoder_;

	212 std::unique_ptr<AudioEncoder> preamble_encoder_;

	213 ChunkedByteBuffer chunked_byte_buffer_;

	214 size_t previous_response_length_;

	215 bool got_last_definitive_result_;

	216 bool is_dispatching_event_;

	217 bool use_framed_post_data_;

	218 FSMState state_;

	219

	220 DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionEngine);

108 };	221 };

109	222

110 // These typedefs are to workaround the issue with certain versions of

111 // Visual Studio where it gets confused between multiple Delegate

112 // classes and gives a C2500 error.

113 typedef SpeechRecognitionEngine::Delegate SpeechRecognitionEngineDelegate;

114 typedef SpeechRecognitionEngine::Config SpeechRecognitionEngineConfig;
hans 2016/04/13 21:23:06 Let's see if current MSVC can do without these..
115

116 } // namespace content	223 } // namespace content

117	224

118 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_	225 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_

OLD	NEW

« no previous file with comments | « content/browser/speech/speech_recognition_browsertest.cc ('k') | content/browser/speech/speech_recognition_engine.cc » ('j') | no next file with comments »