content/browser/speech/google_one_shot_remote_engine.cc - Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3)

Side by Side Diff: content/browser/speech/google_one_shot_remote_engine.cc

Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Fixed according to last Satish comments. Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « content/browser/speech/google_one_shot_remote_engine.h ('k') | content/browser/speech/google_one_shot_remote_engine_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognition_request.h"	5 #include "content/browser/speech/google_one_shot_remote_engine.h"

6	6

7 #include <vector>	7 #include <vector>

8	8

9 #include "base/json/json_reader.h"	9 #include "base/json/json_reader.h"

	10 #include "base/memory/scoped_ptr.h"

10 #include "base/string_number_conversions.h"	11 #include "base/string_number_conversions.h"

11 #include "base/string_util.h"	12 #include "base/string_util.h"

12 #include "base/values.h"	13 #include "base/values.h"

13 #include "content/browser/speech/audio_buffer.h"	14 #include "content/browser/speech/audio_buffer.h"

14 #include "content/common/net/url_fetcher_impl.h"	15 #include "content/common/net/url_fetcher_impl.h"

15 #include "content/public/common/speech_recognition_result.h"	16 #include "content/public/common/speech_recognition_result.h"

16 #include "net/base/escape.h"	17 #include "net/base/escape.h"

17 #include "net/base/load_flags.h"	18 #include "net/base/load_flags.h"

18 #include "net/url_request/url_request_context.h"	19 #include "net/url_request/url_request_context.h"

19 #include "net/url_request/url_request_context_getter.h"	20 #include "net/url_request/url_request_context_getter.h"

20 #include "net/url_request/url_request_status.h"	21 #include "net/url_request/url_request_status.h"

21	22

	23 using content::SpeechRecognitionError;

	24 using content::SpeechRecognitionHypothesis;

	25 using content::SpeechRecognitionResult;

	26

22 namespace {	27 namespace {

23	28

24 const char* const kDefaultSpeechRecognitionUrl =	29 const char* const kDefaultSpeechRecognitionUrl =

25 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";	30 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";

26 const char* const kStatusString = "status";	31 const char* const kStatusString = "status";

27 const char* const kHypothesesString = "hypotheses";	32 const char* const kHypothesesString = "hypotheses";

28 const char* const kUtteranceString = "utterance";	33 const char* const kUtteranceString = "utterance";

29 const char* const kConfidenceString = "confidence";	34 const char* const kConfidenceString = "confidence";

30	35 const int kWebServiceStatusNoError = 0;

	36 const int kWebServiceStatusNoSpeech = 4;

	37 const int kWebServiceStatusNoMatch = 5;

	38 const int kDefaultConfigSampleRate = 8000;

	39 const int kDefaultConfigBitsPerSample = 16;

	40 const speech::AudioEncoder::Codec kDefaultAudioCodec =

	41 speech::AudioEncoder::CODEC_FLAC;

31 // TODO(satish): Remove this hardcoded value once the page is allowed to	42 // TODO(satish): Remove this hardcoded value once the page is allowed to

32 // set this via an attribute.	43 // set this via an attribute.

33 const int kMaxResults = 6;	44 const int kMaxResults = 6;

34	45

35 bool ParseServerResponse(const std::string& response_body,	46 bool ParseServerResponse(const std::string& response_body,

36 content::SpeechRecognitionResult* result) {	47 SpeechRecognitionResult* result,

	48 SpeechRecognitionError* error) {

37 if (response_body.empty()) {	49 if (response_body.empty()) {

38 LOG(WARNING) << "ParseServerResponse: Response was empty.";	50 LOG(WARNING) << "ParseServerResponse: Response was empty.";

39 return false;	51 return false;

40 }	52 }

41 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;	53 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;

42	54

43 // Parse the response, ignoring comments.	55 // Parse the response, ignoring comments.

44 std::string error_msg;	56 std::string error_msg;

45 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(	57 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(

46 response_body, false, NULL, &error_msg));	58 response_body, false, NULL, &error_msg));

(...skipping 13 matching lines...) Expand all Loading...
60 // Get the status.	72 // Get the status.

61 int status;	73 int status;

62 if (!response_object->GetInteger(kStatusString, &status)) {	74 if (!response_object->GetInteger(kStatusString, &status)) {

63 VLOG(1) << "ParseServerResponse: " << kStatusString	75 VLOG(1) << "ParseServerResponse: " << kStatusString

64 << " is not a valid integer value.";	76 << " is not a valid integer value.";

65 return false;	77 return false;

66 }	78 }

67	79

68 // Process the status.	80 // Process the status.

69 switch (status) {	81 switch (status) {

70 case content::SPEECH_RECOGNITION_ERROR_NONE:	82 case kWebServiceStatusNoError:

71 case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:	83 break;

72 case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:	84 case kWebServiceStatusNoSpeech:

73 break;	85 error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;

74	86 return false;

75 default:	87 case kWebServiceStatusNoMatch:

76 // Other status codes should not be returned by the server.	88 error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;

77 VLOG(1) << "ParseServerResponse: unexpected status code " << status;	89 return false;

78 return false;	90 default:

	91 error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;

	92 // Other status codes should not be returned by the server.

	93 VLOG(1) << "ParseServerResponse: unexpected status code " << status;

	94 return false;

79 }	95 }

80	96

81 result->error = static_cast<content::SpeechRecognitionErrorCode>(status);

82

83 // Get the hypotheses.	97 // Get the hypotheses.

84 Value* hypotheses_value = NULL;	98 Value* hypotheses_value = NULL;

85 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {	99 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {

86 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";	100 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";

87 return false;	101 return false;

88 }	102 }

89	103

90 DCHECK(hypotheses_value);	104 DCHECK(hypotheses_value);

91 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {	105 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {

92 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "	106 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "

93 << hypotheses_value->GetType();	107 << hypotheses_value->GetType();

94 return false;	108 return false;

95 }	109 }

96	110

97 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);	111 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);

98	112 // For now we support only single shot recognition, so we are giving only a
	Satish 2012/03/22 14:37:22: add newline above full length comments like these
	Primiano Tucci (use gerrit) 2012/03/22 17:19:05: On 2012/03/22 14:37:22, Satish wrote: > add newline above full length comments like these. Reply: Done.
	113 // final result, consisting of one fragment (with one or more hypotheses).

99 size_t index = 0;	114 size_t index = 0;

100 for (; index < hypotheses_list->GetSize(); ++index) {	115 for (; index < hypotheses_list->GetSize(); ++index) {

101 Value* hypothesis = NULL;	116 Value* hypothesis = NULL;

102 if (!hypotheses_list->Get(index, &hypothesis)) {	117 if (!hypotheses_list->Get(index, &hypothesis)) {

103 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";	118 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";

104 break;	119 break;

105 }	120 }

106 DCHECK(hypothesis);	121 DCHECK(hypothesis);

107 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {	122 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {

108 LOG(WARNING) << "ParseServerResponse: Unexpected value type "	123 LOG(WARNING) << "ParseServerResponse: Unexpected value type "

109 << hypothesis->GetType();	124 << hypothesis->GetType();

110 break;	125 break;

111 }	126 }

112	127

113 const DictionaryValue* hypothesis_value =	128 const DictionaryValue* hypothesis_value =

114 static_cast<DictionaryValue*>(hypothesis);	129 static_cast<DictionaryValue*>(hypothesis);

115 string16 utterance;	130 string16 utterance;

	131

116 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {	132 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {

117 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";	133 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";

118 break;	134 break;

119 }	135 }

120	136

121 // It is not an error if the 'confidence' field is missing.	137 // It is not an error if the 'confidence' field is missing.

122 double confidence = 0.0;	138 double confidence = 0.0;

123 hypothesis_value->GetDouble(kConfidenceString, &confidence);	139 hypothesis_value->GetDouble(kConfidenceString, &confidence);

124	140 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,

125 result->hypotheses.push_back(content::SpeechRecognitionHypothesis(	141 confidence));

126 utterance, confidence));

127 }	142 }

128	143

129 if (index < hypotheses_list->GetSize()) {	144 if (index < hypotheses_list->GetSize()) {

130 result->hypotheses.clear();	145 result->hypotheses.clear();

131 return false;	146 return false;

132 }	147 }

133

134 return true;	148 return true;

135 }	149 }

136	150

137 } // namespace	151 } // namespace

138	152

139 namespace speech {	153 namespace speech {

140	154

141 int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;	155 const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;

	156 int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;

142	157

143 SpeechRecognitionRequest::SpeechRecognitionRequest(	158 GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig()

144 net::URLRequestContextGetter* context, Delegate* delegate)	159 : filter_profanities(false),

145 : url_context_(context),	160 audio_sample_rate(kDefaultConfigSampleRate),

146 delegate_(delegate) {	161 audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {

147 DCHECK(delegate);

148 }	162 }

149	163

150 SpeechRecognitionRequest::~SpeechRecognitionRequest() {}	164 GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {}

151	165

152 void SpeechRecognitionRequest::Start(const std::string& language,	166 GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(

153 const std::string& grammar,	167 net::URLRequestContextGetter* context)

154 bool filter_profanities,	168 : url_context_(context) {

155 const std::string& hardware_info,	169 }

156 const std::string& origin_url,	170

157 const std::string& content_type) {	171 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}

	172

	173 void GoogleOneShotRemoteEngine::SetConfig(

	174 const GoogleOneShotRemoteEngineConfig& config) {

	175 config_ = config;

	176 }

	177

	178 void GoogleOneShotRemoteEngine::StartRecognition() {

	179 DCHECK(delegate());

158 DCHECK(!url_fetcher_.get());	180 DCHECK(!url_fetcher_.get());

	181 std::string lang_param = config_.language;

159	182

160 std::vector<std::string> parts;

161

162 std::string lang_param = language;

163 if (lang_param.empty() && url_context_) {	183 if (lang_param.empty() && url_context_) {

164 // If no language is provided then we use the first from the accepted	184 // If no language is provided then we use the first from the accepted

165 // language list. If this list is empty then it defaults to "en-US".	185 // language list. If this list is empty then it defaults to "en-US".

166 // Example of the contents of this list: "es,en-GB;q=0.8", ""	186 // Example of the contents of this list: "es,en-GB;q=0.8", ""

167 net::URLRequestContext* request_context =	187 net::URLRequestContext* request_context =

168 url_context_->GetURLRequestContext();	188 url_context_->GetURLRequestContext();

169 DCHECK(request_context);	189 DCHECK(request_context);

170 std::string accepted_language_list = request_context->accept_language();	190 std::string accepted_language_list = request_context->accept_language();

171 size_t separator = accepted_language_list.find_first_of(",;");	191 size_t separator = accepted_language_list.find_first_of(",;");

172 lang_param = accepted_language_list.substr(0, separator);	192 lang_param = accepted_language_list.substr(0, separator);

173 }	193 }

	194

174 if (lang_param.empty())	195 if (lang_param.empty())

175 lang_param = "en-US";	196 lang_param = "en-US";

	197

	198 std::vector<std::string> parts;

176 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));	199 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

177	200

178 if (!grammar.empty())	201 if (!config_.grammar.empty())

179 parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));	202 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));

180 if (!hardware_info.empty())	203

181 parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));	204 if (!config_.hardware_info.empty())

	205 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,

	206 true));

182 parts.push_back("maxresults=" + base::IntToString(kMaxResults));	207 parts.push_back("maxresults=" + base::IntToString(kMaxResults));

183 parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");	208 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

184	209

185 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));	210 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

186	211

	212 encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,

	213 config_.audio_sample_rate,

	214 config_.audio_num_bits_per_sample));

	215 DCHECK(encoder_.get());

187 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,	216 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,

188 url,	217 url,

189 URLFetcherImpl::POST,	218 URLFetcherImpl::POST,

190 this));	219 this));

191 url_fetcher_->SetChunkedUpload(content_type);	220 url_fetcher_->SetChunkedUpload(encoder_->mime_type());

192 url_fetcher_->SetRequestContext(url_context_);	221 url_fetcher_->SetRequestContext(url_context_);

193 url_fetcher_->SetReferrer(origin_url);	222 url_fetcher_->SetReferrer(config_.origin_url);

194	223

195 // The speech recognition API does not require user identification as part	224 // The speech recognition API does not require user identification as part

196 // of requests, so we don't send cookies or auth data for these requests to	225 // of requests, so we don't send cookies or auth data for these requests to

197 // prevent any accidental connection between users who are logged into the	226 // prevent any accidental connection between users who are logged into the

198 // domain for other services (e.g. bookmark sync) with the speech requests.	227 // domain for other services (e.g. bookmark sync) with the speech requests.

199 url_fetcher_->SetLoadFlags(	228 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |

200 net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |	229 net::LOAD_DO_NOT_SEND_COOKIES |

201 net::LOAD_DO_NOT_SEND_AUTH_DATA);	230 net::LOAD_DO_NOT_SEND_AUTH_DATA);

202 url_fetcher_->Start();	231 url_fetcher_->Start();

203 }	232 }

204	233

205 void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,	234 void GoogleOneShotRemoteEngine::EndRecognition() {

206 bool is_last_chunk) {	235 url_fetcher_.reset();

207 DCHECK(url_fetcher_.get());

208 url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);

209 }	236 }

210	237

211 void SpeechRecognitionRequest::OnURLFetchComplete(	238 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {

	239 DCHECK(url_fetcher_.get());

	240 DCHECK(encoder_.get());

	241 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);

	242 encoder_->Encode(data);

	243 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());

	244 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);

	245 }

	246

	247 void GoogleOneShotRemoteEngine::AudioChunksEnded() {

	248 DCHECK(url_fetcher_.get());

	249 DCHECK(encoder_.get());

	250

	251 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet

	252 // of silence in case encoder had no data already.

	253 std::vector<int16> samples(

	254 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);

	255 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),

	256 samples.size() * sizeof(int16),

	257 encoder_->bits_per_sample() / 8);

	258 encoder_->Encode(dummy_chunk);

	259 encoder_->Flush();

	260 scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());

	261 DCHECK(!encoded_dummy_data->IsEmpty());

	262 encoder_.reset();

	263

	264 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);

	265 }

	266

	267 void GoogleOneShotRemoteEngine::OnURLFetchComplete(

212 const content::URLFetcher* source) {	268 const content::URLFetcher* source) {

213 DCHECK_EQ(url_fetcher_.get(), source);	269 DCHECK_EQ(url_fetcher_.get(), source);

	270 SpeechRecognitionResult result;

	271 SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);

	272 std::string data;

214	273

215 content::SpeechRecognitionResult result;	274 // The default error code in case of parse errors is NETWORK_FAILURE, however

216 std::string data;	275 // ParseServerResponse can change the error to a more appropriate one.

217 if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 ||	276 bool error_occurred = (!source->GetStatus().is_success() ||

218 !source->GetResponseAsString(&data) ||	277 source->GetResponseCode() != 200 ||

219 !ParseServerResponse(data, &result)) {	278 !source->GetResponseAsString(&data) ||

220 result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;	279 !ParseServerResponse(data, &result, &error));

	280 url_fetcher_.reset();

	281 if (error_occurred) {

	282 DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;

	283 delegate()->OnSpeechRecognitionEngineError(error);

	284 } else {

	285 DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";

	286 delegate()->OnSpeechRecognitionEngineResult(result);

221 }	287 }

	288 }

222	289

223 DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";	290 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {

224 url_fetcher_.reset();	291 return url_fetcher_ != NULL;

225 delegate_->SetRecognitionResult(result);	292 }

	293

	294 int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {

	295 return kAudioPacketIntervalMs;

226 }	296 }

227	297

228 } // namespace speech	298 } // namespace speech

OLD	NEW