content/browser/speech/google_one_shot_remote_engine.cc - Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3)

Side by Side Diff: content/browser/speech/google_one_shot_remote_engine.cc

Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Minor nit on speech_recognition_engine comments. Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « content/browser/speech/google_one_shot_remote_engine.h ('k') | content/browser/speech/google_one_shot_remote_engine_unittest.cc » ('j') | content/browser/speech/speech_recognition_engine.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognition_request.h"	5 #include "content/browser/speech/google_one_shot_remote_engine.h"

6	6

7 #include <vector>	7 #include <vector>

8	8

9 #include "base/json/json_reader.h"	9 #include "base/json/json_reader.h"

	10 #include "base/memory/scoped_ptr.h"

10 #include "base/string_number_conversions.h"	11 #include "base/string_number_conversions.h"

11 #include "base/string_util.h"	12 #include "base/string_util.h"

12 #include "base/values.h"	13 #include "base/values.h"

13 #include "content/browser/speech/audio_buffer.h"	14 #include "content/browser/speech/audio_buffer.h"

14 #include "content/common/net/url_fetcher_impl.h"	15 #include "content/common/net/url_fetcher_impl.h"

15 #include "content/public/common/speech_recognition_result.h"	16 #include "content/public/common/speech_recognition_result.h"

16 #include "net/base/escape.h"	17 #include "net/base/escape.h"

17 #include "net/base/load_flags.h"	18 #include "net/base/load_flags.h"

18 #include "net/url_request/url_request_context.h"	19 #include "net/url_request/url_request_context.h"

19 #include "net/url_request/url_request_context_getter.h"	20 #include "net/url_request/url_request_context_getter.h"

20 #include "net/url_request/url_request_status.h"	21 #include "net/url_request/url_request_status.h"

21	22

	23 using content::SpeechRecognitionError;

	24 using content::SpeechRecognitionHypothesis;

	25 using content::SpeechRecognitionResult;

	26

22 namespace {	27 namespace {

23	28

24 const char* const kDefaultSpeechRecognitionUrl =	29 const char* const kDefaultSpeechRecognitionUrl =

25 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";	30 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";

26 const char* const kStatusString = "status";	31 const char* const kStatusString = "status";

27 const char* const kHypothesesString = "hypotheses";	32 const char* const kHypothesesString = "hypotheses";

28 const char* const kUtteranceString = "utterance";	33 const char* const kUtteranceString = "utterance";

29 const char* const kConfidenceString = "confidence";	34 const char* const kConfidenceString = "confidence";

30	35 const int kWebServiceStatusNoError = 0;

	36 const int kWebServiceStatusNoSpeech = 4;

	37 const int kWebServiceStatusNoMatch = 5;

	38 const int kDefaultConfigSampleRate = 8000;

	39 const int kDefaultConfigBitsPerSample = 16;

	40 const speech::AudioEncoder::Codec kDefaultAudioCodec =

	41 speech::AudioEncoder::CODEC_FLAC;

31 // TODO(satish): Remove this hardcoded value once the page is allowed to	42 // TODO(satish): Remove this hardcoded value once the page is allowed to

32 // set this via an attribute.	43 // set this via an attribute.

33 const int kMaxResults = 6;	44 const int kMaxResults = 6;

34	45

35 bool ParseServerResponse(const std::string& response_body,	46 bool ParseServerResponse(const std::string& response_body,

36 content::SpeechRecognitionResult* result) {	47 SpeechRecognitionResult* result,

	48 SpeechRecognitionError* error) {

37 if (response_body.empty()) {	49 if (response_body.empty()) {

38 LOG(WARNING) << "ParseServerResponse: Response was empty.";	50 LOG(WARNING) << "ParseServerResponse: Response was empty.";

39 return false;	51 return false;

40 }	52 }

41 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;	53 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;

42	54

43 // Parse the response, ignoring comments.	55 // Parse the response, ignoring comments.

44 std::string error_msg;	56 std::string error_msg;

45 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(	57 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(

46 response_body, false, NULL, &error_msg));	58 response_body, false, NULL, &error_msg));

(...skipping 13 matching lines...) Expand all Loading...
60 // Get the status.	72 // Get the status.

61 int status;	73 int status;

62 if (!response_object->GetInteger(kStatusString, &status)) {	74 if (!response_object->GetInteger(kStatusString, &status)) {

63 VLOG(1) << "ParseServerResponse: " << kStatusString	75 VLOG(1) << "ParseServerResponse: " << kStatusString

64 << " is not a valid integer value.";	76 << " is not a valid integer value.";

65 return false;	77 return false;

66 }	78 }

67	79

68 // Process the status.	80 // Process the status.

69 switch (status) {	81 switch (status) {

70 case content::SPEECH_RECOGNITION_ERROR_NONE:	82 case kWebServiceStatusNoError:

71 case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:	83 error->code = content::SPEECH_RECOGNITION_ERROR_NONE;
	Primiano Tucci (use gerrit) 2012/03/22 12:39:29 This was mistakenly added after the first steps of the review. This code should not clear the error (which is set to NETWORK_ERROR before the call) because in case of parse failures (thus a return false) we should keep it. In case of success, instead, the error code is not used at all (it is just a local variable in OnURLFetchComplete).
72 case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:	84 break;

73 break;	85 case kWebServiceStatusNoSpeech:

74	86 error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;

75 default:	87 return false;

76 // Other status codes should not be returned by the server.	88 case kWebServiceStatusNoMatch:

77 VLOG(1) << "ParseServerResponse: unexpected status code " << status;	89 error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;

78 return false;	90 return false;

	91 default:

	92 error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;

	93 // Other status codes should not be returned by the server.

	94 VLOG(1) << "ParseServerResponse: unexpected status code " << status;

	95 return false;

79 }	96 }

80	97

81 result->error = static_cast<content::SpeechRecognitionErrorCode>(status);

82

83 // Get the hypotheses.	98 // Get the hypotheses.

84 Value* hypotheses_value = NULL;	99 Value* hypotheses_value = NULL;

85 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {	100 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {

86 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";	101 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";

87 return false;	102 return false;

88 }	103 }

89	104

90 DCHECK(hypotheses_value);	105 DCHECK(hypotheses_value);

91 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {	106 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {

92 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "	107 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "

93 << hypotheses_value->GetType();	108 << hypotheses_value->GetType();

94 return false;	109 return false;

95 }	110 }

96	111

97 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);	112 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);

98	113 // For now we support only single shot recognition, so we are giving only a

	114 // final result, consisting of one fragment (with one or more hypotheses).

99 size_t index = 0;	115 size_t index = 0;

100 for (; index < hypotheses_list->GetSize(); ++index) {	116 for (; index < hypotheses_list->GetSize(); ++index) {

101 Value* hypothesis = NULL;	117 Value* hypothesis = NULL;

102 if (!hypotheses_list->Get(index, &hypothesis)) {	118 if (!hypotheses_list->Get(index, &hypothesis)) {

103 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";	119 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";

104 break;	120 break;

105 }	121 }

106 DCHECK(hypothesis);	122 DCHECK(hypothesis);

107 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {	123 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {

108 LOG(WARNING) << "ParseServerResponse: Unexpected value type "	124 LOG(WARNING) << "ParseServerResponse: Unexpected value type "

109 << hypothesis->GetType();	125 << hypothesis->GetType();

110 break;	126 break;

111 }	127 }

112	128

113 const DictionaryValue* hypothesis_value =	129 const DictionaryValue* hypothesis_value =

114 static_cast<DictionaryValue*>(hypothesis);	130 static_cast<DictionaryValue*>(hypothesis);

115 string16 utterance;	131 string16 utterance;

	132

116 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {	133 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {

117 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";	134 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";

118 break;	135 break;

119 }	136 }

120	137

121 // It is not an error if the 'confidence' field is missing.	138 // It is not an error if the 'confidence' field is missing.

122 double confidence = 0.0;	139 double confidence = 0.0;

123 hypothesis_value->GetDouble(kConfidenceString, &confidence);	140 hypothesis_value->GetDouble(kConfidenceString, &confidence);

124	141 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,

125 result->hypotheses.push_back(content::SpeechRecognitionHypothesis(	142 confidence));

126 utterance, confidence));

127 }	143 }

128	144

129 if (index < hypotheses_list->GetSize()) {	145 if (index < hypotheses_list->GetSize()) {

130 result->hypotheses.clear();	146 result->hypotheses.clear();

131 return false;	147 return false;

132 }	148 }

133

134 return true;	149 return true;

135 }	150 }

136	151

137 } // namespace	152 } // namespace

138	153

139 namespace speech {	154 namespace speech {

140	155

141 int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;	156 const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;

	157 int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;

142	158

143 SpeechRecognitionRequest::SpeechRecognitionRequest(	159 GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig()

144 net::URLRequestContextGetter* context, Delegate* delegate)	160 : filter_profanities(false),

145 : url_context_(context),	161 audio_sample_rate(kDefaultConfigSampleRate),

146 delegate_(delegate) {	162 audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {

147 DCHECK(delegate);

148 }	163 }

149	164

150 SpeechRecognitionRequest::~SpeechRecognitionRequest() {}	165 GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {}

151	166

152 void SpeechRecognitionRequest::Start(const std::string& language,	167 GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(

153 const std::string& grammar,	168 net::URLRequestContextGetter* context)

154 bool filter_profanities,	169 : url_context_(context) {

155 const std::string& hardware_info,	170 }

156 const std::string& origin_url,	171

157 const std::string& content_type) {	172 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}

	173

	174 void GoogleOneShotRemoteEngine::SetConfig(

	175 const GoogleOneShotRemoteEngineConfig& config) {

	176 config_ = config;

	177 }

	178

	179 void GoogleOneShotRemoteEngine::StartRecognition() {

	180 DCHECK(delegate());

158 DCHECK(!url_fetcher_.get());	181 DCHECK(!url_fetcher_.get());

	182 std::string lang_param = config_.language;

159	183

160 std::vector<std::string> parts;

161

162 std::string lang_param = language;

163 if (lang_param.empty() && url_context_) {	184 if (lang_param.empty() && url_context_) {

164 // If no language is provided then we use the first from the accepted	185 // If no language is provided then we use the first from the accepted

165 // language list. If this list is empty then it defaults to "en-US".	186 // language list. If this list is empty then it defaults to "en-US".

166 // Example of the contents of this list: "es,en-GB;q=0.8", ""	187 // Example of the contents of this list: "es,en-GB;q=0.8", ""

167 net::URLRequestContext* request_context =	188 net::URLRequestContext* request_context =

168 url_context_->GetURLRequestContext();	189 url_context_->GetURLRequestContext();

169 DCHECK(request_context);	190 DCHECK(request_context);

170 std::string accepted_language_list = request_context->accept_language();	191 std::string accepted_language_list = request_context->accept_language();

171 size_t separator = accepted_language_list.find_first_of(",;");	192 size_t separator = accepted_language_list.find_first_of(",;");

172 lang_param = accepted_language_list.substr(0, separator);	193 lang_param = accepted_language_list.substr(0, separator);

173 }	194 }

	195

174 if (lang_param.empty())	196 if (lang_param.empty())

175 lang_param = "en-US";	197 lang_param = "en-US";

	198

	199 std::vector<std::string> parts;

176 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));	200 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

177	201

178 if (!grammar.empty())	202 if (!config_.grammar.empty())

179 parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));	203 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));

180 if (!hardware_info.empty())	204

181 parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));	205 if (!config_.hardware_info.empty())

	206 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,

	207 true));

182 parts.push_back("maxresults=" + base::IntToString(kMaxResults));	208 parts.push_back("maxresults=" + base::IntToString(kMaxResults));

183 parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");	209 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

184	210

185 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));	211 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

186	212

	213 encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,

	214 config_.audio_sample_rate,

	215 config_.audio_num_bits_per_sample));

	216 DCHECK(encoder_.get());

187 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,	217 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,

188 url,	218 url,

189 URLFetcherImpl::POST,	219 URLFetcherImpl::POST,

190 this));	220 this));

191 url_fetcher_->SetChunkedUpload(content_type);	221 url_fetcher_->SetChunkedUpload(encoder_->mime_type());

192 url_fetcher_->SetRequestContext(url_context_);	222 url_fetcher_->SetRequestContext(url_context_);

193 url_fetcher_->SetReferrer(origin_url);	223 url_fetcher_->SetReferrer(config_.origin_url);

194	224

195 // The speech recognition API does not require user identification as part	225 // The speech recognition API does not require user identification as part

196 // of requests, so we don't send cookies or auth data for these requests to	226 // of requests, so we don't send cookies or auth data for these requests to

197 // prevent any accidental connection between users who are logged into the	227 // prevent any accidental connection between users who are logged into the

198 // domain for other services (e.g. bookmark sync) with the speech requests.	228 // domain for other services (e.g. bookmark sync) with the speech requests.

199 url_fetcher_->SetLoadFlags(	229 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES \|

200 net::LOAD_DO_NOT_SAVE_COOKIES \| net::LOAD_DO_NOT_SEND_COOKIES \|	230 net::LOAD_DO_NOT_SEND_COOKIES \|

201 net::LOAD_DO_NOT_SEND_AUTH_DATA);	231 net::LOAD_DO_NOT_SEND_AUTH_DATA);

202 url_fetcher_->Start();	232 url_fetcher_->Start();

203 }	233 }

204	234

205 void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,	235 void GoogleOneShotRemoteEngine::EndRecognition() {

206 bool is_last_chunk) {	236 url_fetcher_.reset();

207 DCHECK(url_fetcher_.get());

208 url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);

209 }	237 }

210	238

211 void SpeechRecognitionRequest::OnURLFetchComplete(	239 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {

	240 DCHECK(url_fetcher_.get());

	241 DCHECK(encoder_.get());

	242 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);

	243 encoder_->Encode(data);

	244 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());

	245 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);

	246 }

	247

	248 void GoogleOneShotRemoteEngine::AudioChunksEnded() {

	249 DCHECK(url_fetcher_.get());

	250 DCHECK(encoder_.get());

	251

	252 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet

	253 // of silence in case encoder had no data already.

	254 std::vector<int16> samples(

	255 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);

	256 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),

	257 samples.size() * sizeof(int16),

	258 encoder_->bits_per_sample() / 8);

	259 encoder_->Encode(dummy_chunk);

	260 encoder_->Flush();

	261 scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());

	262 DCHECK(!encoded_dummy_data->IsEmpty());

	263 encoder_.reset();

	264

	265 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);

	266 }

	267

	268 void GoogleOneShotRemoteEngine::OnURLFetchComplete(

212 const content::URLFetcher* source) {	269 const content::URLFetcher* source) {

213 DCHECK_EQ(url_fetcher_.get(), source);	270 DCHECK_EQ(url_fetcher_.get(), source);

	271 SpeechRecognitionResult result;

	272 SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);

	273 std::string data;

214	274

215 content::SpeechRecognitionResult result;	275 // The default error code in case of parse errors is NETWORK_FAILURE, however

216 std::string data;	276 // ParseServerResponse can change the error to a more appropriate one.

217 if (!source->GetStatus().is_success() \|\| source->GetResponseCode() != 200 \|\|	277 bool error_occurred = (!source->GetStatus().is_success() \|\|

218 !source->GetResponseAsString(&data) \|\|	278 source->GetResponseCode() != 200 \|\|

219 !ParseServerResponse(data, &result)) {	279 !source->GetResponseAsString(&data) \|\|

220 result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;	280 !ParseServerResponse(data, &result, &error));

	281 url_fetcher_.reset();

	282 if (error_occurred) {

	283 DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;

	284 delegate()->OnSpeechRecognitionEngineError(error);

	285 } else {

	286 DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";

	287 delegate()->OnSpeechRecognitionEngineResult(result);

221 }	288 }

	289 }

222	290

223 DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";	291 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {

224 url_fetcher_.reset();	292 return url_fetcher_ != NULL;

225 delegate_->SetRecognitionResult(result);	293 }

	294

	295 int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {

	296 return kAudioPacketIntervalMs;

226 }	297 }

227	298

228 } // namespace speech	299 } // namespace speech

OLD	NEW