content/browser/speech/google_one_shot_remote_engine.cc - Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3)

Side by Side Diff: content/browser/speech/google_one_shot_remote_engine.cc

Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Fixed compilation issues on windows. Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « content/browser/speech/google_one_shot_remote_engine.h ('k') | content/browser/speech/google_one_shot_remote_engine_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognition_request.h"	5 #include "content/browser/speech/google_one_shot_remote_engine.h"

6	6

7 #include <vector>	7 #include <vector>

8	8

9 #include "base/json/json_reader.h"	9 #include "base/json/json_reader.h"

	10 #include "base/memory/scoped_ptr.h"

10 #include "base/string_number_conversions.h"	11 #include "base/string_number_conversions.h"

11 #include "base/string_util.h"	12 #include "base/string_util.h"

12 #include "base/values.h"	13 #include "base/values.h"

13 #include "content/browser/speech/audio_buffer.h"	14 #include "content/browser/speech/audio_buffer.h"

14 #include "content/common/net/url_fetcher_impl.h"	15 #include "content/common/net/url_fetcher_impl.h"

	16 #include "content/public/common/speech_recognition_error.h"

15 #include "content/public/common/speech_recognition_result.h"	17 #include "content/public/common/speech_recognition_result.h"

16 #include "net/base/escape.h"	18 #include "net/base/escape.h"

17 #include "net/base/load_flags.h"	19 #include "net/base/load_flags.h"

18 #include "net/url_request/url_request_context.h"	20 #include "net/url_request/url_request_context.h"

19 #include "net/url_request/url_request_context_getter.h"	21 #include "net/url_request/url_request_context_getter.h"

20 #include "net/url_request/url_request_status.h"	22 #include "net/url_request/url_request_status.h"

21	23

	24 using content::SpeechRecognitionError;

	25 using content::SpeechRecognitionHypothesis;

	26 using content::SpeechRecognitionResult;

	27

22 namespace {	28 namespace {

23	29

24 const char* const kDefaultSpeechRecognitionUrl =	30 const char* const kDefaultSpeechRecognitionUrl =

25 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";	31 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";

26 const char* const kStatusString = "status";	32 const char* const kStatusString = "status";

27 const char* const kHypothesesString = "hypotheses";	33 const char* const kHypothesesString = "hypotheses";

28 const char* const kUtteranceString = "utterance";	34 const char* const kUtteranceString = "utterance";

29 const char* const kConfidenceString = "confidence";	35 const char* const kConfidenceString = "confidence";

30	36 const int kWebServiceStatusNoError = 0;

	37 const int kWebServiceStatusNoSpeech = 4;

	38 const int kWebServiceStatusNoMatch = 5;

	39 const int kDefaultConfigSampleRate = 8000;

	40 const int kDefaultConfigBitsPerSample = 16;

	41 const speech::AudioEncoder::Codec kDefaultAudioCodec =

	42 speech::AudioEncoder::CODEC_FLAC;

31 // TODO(satish): Remove this hardcoded value once the page is allowed to	43 // TODO(satish): Remove this hardcoded value once the page is allowed to

32 // set this via an attribute.	44 // set this via an attribute.

33 const int kMaxResults = 6;	45 const int kMaxResults = 6;

34	46

35 bool ParseServerResponse(const std::string& response_body,	47 bool ParseServerResponse(const std::string& response_body,

36 content::SpeechRecognitionResult* result) {	48 SpeechRecognitionResult* result,

	49 SpeechRecognitionError* error) {

37 if (response_body.empty()) {	50 if (response_body.empty()) {

38 LOG(WARNING) << "ParseServerResponse: Response was empty.";	51 LOG(WARNING) << "ParseServerResponse: Response was empty.";

39 return false;	52 return false;

40 }	53 }

41 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;	54 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;

42	55

43 // Parse the response, ignoring comments.	56 // Parse the response, ignoring comments.

44 std::string error_msg;	57 std::string error_msg;

45 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(	58 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(

46 response_body, false, NULL, &error_msg));	59 response_body, false, NULL, &error_msg));

(...skipping 13 matching lines...)
60 // Get the status.	73 // Get the status.

61 int status;	74 int status;

62 if (!response_object->GetInteger(kStatusString, &status)) {	75 if (!response_object->GetInteger(kStatusString, &status)) {

63 VLOG(1) << "ParseServerResponse: " << kStatusString	76 VLOG(1) << "ParseServerResponse: " << kStatusString

64 << " is not a valid integer value.";	77 << " is not a valid integer value.";

65 return false;	78 return false;

66 }	79 }

67	80

68 // Process the status.	81 // Process the status.

69 switch (status) {	82 switch (status) {

70 case content::SPEECH_RECOGNITION_ERROR_NONE:	83 case kWebServiceStatusNoError:

71 case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:	84 break;

72 case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:	85 case kWebServiceStatusNoSpeech:

73 break;	86 error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;

74	87 return false;

75 default:	88 case kWebServiceStatusNoMatch:

76 // Other status codes should not be returned by the server.	89 error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;

77 VLOG(1) << "ParseServerResponse: unexpected status code " << status;	90 return false;

78 return false;	91 default:

	92 error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;

	93 // Other status codes should not be returned by the server.

	94 VLOG(1) << "ParseServerResponse: unexpected status code " << status;

	95 return false;

79 }	96 }

80	97

81 result->error = static_cast<content::SpeechRecognitionErrorCode>(status);

82

83 // Get the hypotheses.	98 // Get the hypotheses.

84 Value* hypotheses_value = NULL;	99 Value* hypotheses_value = NULL;

85 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {	100 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {

86 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";	101 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";

87 return false;	102 return false;

88 }	103 }

89	104

90 DCHECK(hypotheses_value);	105 DCHECK(hypotheses_value);

91 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {	106 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {

92 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "	107 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "

93 << hypotheses_value->GetType();	108 << hypotheses_value->GetType();

94 return false;	109 return false;

95 }	110 }

96	111

97 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);	112 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);

98	113

	114 // For now we support only single shot recognition, so we are giving only a

	115 // final result, consisting of one fragment (with one or more hypotheses).

99 size_t index = 0;	116 size_t index = 0;

100 for (; index < hypotheses_list->GetSize(); ++index) {	117 for (; index < hypotheses_list->GetSize(); ++index) {

101 Value* hypothesis = NULL;	118 Value* hypothesis = NULL;

102 if (!hypotheses_list->Get(index, &hypothesis)) {	119 if (!hypotheses_list->Get(index, &hypothesis)) {

103 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";	120 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";

104 break;	121 break;

105 }	122 }

106 DCHECK(hypothesis);	123 DCHECK(hypothesis);

107 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {	124 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {

108 LOG(WARNING) << "ParseServerResponse: Unexpected value type "	125 LOG(WARNING) << "ParseServerResponse: Unexpected value type "

109 << hypothesis->GetType();	126 << hypothesis->GetType();

110 break;	127 break;

111 }	128 }

112	129

113 const DictionaryValue* hypothesis_value =	130 const DictionaryValue* hypothesis_value =

114 static_cast<DictionaryValue*>(hypothesis);	131 static_cast<DictionaryValue*>(hypothesis);

115 string16 utterance;	132 string16 utterance;

	133

116 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {	134 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {

117 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";	135 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";

118 break;	136 break;

119 }	137 }

120	138

121 // It is not an error if the 'confidence' field is missing.	139 // It is not an error if the 'confidence' field is missing.

122 double confidence = 0.0;	140 double confidence = 0.0;

123 hypothesis_value->GetDouble(kConfidenceString, &confidence);	141 hypothesis_value->GetDouble(kConfidenceString, &confidence);

124	142 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,

125 result->hypotheses.push_back(content::SpeechRecognitionHypothesis(	143 confidence));

126 utterance, confidence));

127 }	144 }

128	145

129 if (index < hypotheses_list->GetSize()) {	146 if (index < hypotheses_list->GetSize()) {

130 result->hypotheses.clear();	147 result->hypotheses.clear();

131 return false;	148 return false;

132 }	149 }

133

134 return true;	150 return true;

135 }	151 }

136	152

137 } // namespace	153 } // namespace

138	154

139 namespace speech {	155 namespace speech {

140	156

141 int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;	157 const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;

	158 int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;

142	159

143 SpeechRecognitionRequest::SpeechRecognitionRequest(	160 GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig()

144 net::URLRequestContextGetter* context, Delegate* delegate)	161 : filter_profanities(false),

145 : url_context_(context),	162 audio_sample_rate(kDefaultConfigSampleRate),

146 delegate_(delegate) {	163 audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {

147 DCHECK(delegate);

148 }	164 }

149	165

150 SpeechRecognitionRequest::~SpeechRecognitionRequest() {}	166 GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {}

151	167

152 void SpeechRecognitionRequest::Start(const std::string& language,	168 GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(

153 const std::string& grammar,	169 net::URLRequestContextGetter* context)

154 bool filter_profanities,	170 : url_context_(context) {

155 const std::string& hardware_info,	171 }

156 const std::string& origin_url,	172

157 const std::string& content_type) {	173 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}

	174

	175 void GoogleOneShotRemoteEngine::SetConfig(

	176 const GoogleOneShotRemoteEngineConfig& config) {

	177 config_ = config;

	178 }

	179

	180 void GoogleOneShotRemoteEngine::StartRecognition() {

	181 DCHECK(delegate());

158 DCHECK(!url_fetcher_.get());	182 DCHECK(!url_fetcher_.get());

	183 std::string lang_param = config_.language;

159	184

160 std::vector<std::string> parts;

161

162 std::string lang_param = language;

163 if (lang_param.empty() && url_context_) {	185 if (lang_param.empty() && url_context_) {

164 // If no language is provided then we use the first from the accepted	186 // If no language is provided then we use the first from the accepted

165 // language list. If this list is empty then it defaults to "en-US".	187 // language list. If this list is empty then it defaults to "en-US".

166 // Example of the contents of this list: "es,en-GB;q=0.8", ""	188 // Example of the contents of this list: "es,en-GB;q=0.8", ""

167 net::URLRequestContext* request_context =	189 net::URLRequestContext* request_context =

168 url_context_->GetURLRequestContext();	190 url_context_->GetURLRequestContext();

169 DCHECK(request_context);	191 DCHECK(request_context);

170 std::string accepted_language_list = request_context->accept_language();	192 std::string accepted_language_list = request_context->accept_language();

171 size_t separator = accepted_language_list.find_first_of(",;");	193 size_t separator = accepted_language_list.find_first_of(",;");

172 lang_param = accepted_language_list.substr(0, separator);	194 lang_param = accepted_language_list.substr(0, separator);

173 }	195 }

	196

174 if (lang_param.empty())	197 if (lang_param.empty())

175 lang_param = "en-US";	198 lang_param = "en-US";

	199

	200 std::vector<std::string> parts;

176 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));	201 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

177	202

178 if (!grammar.empty())	203 if (!config_.grammar.empty())

179 parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));	204 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));

180 if (!hardware_info.empty())	205

181 parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));	206 if (!config_.hardware_info.empty())

	207 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,

	208 true));

182 parts.push_back("maxresults=" + base::IntToString(kMaxResults));	209 parts.push_back("maxresults=" + base::IntToString(kMaxResults));

183 parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");	210 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

184	211

185 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));	212 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

186	213

	214 encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,

	215 config_.audio_sample_rate,

	216 config_.audio_num_bits_per_sample));

	217 DCHECK(encoder_.get());

187 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,	218 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,

188 url,	219 url,

189 URLFetcherImpl::POST,	220 URLFetcherImpl::POST,

190 this));	221 this));

191 url_fetcher_->SetChunkedUpload(content_type);	222 url_fetcher_->SetChunkedUpload(encoder_->mime_type());

192 url_fetcher_->SetRequestContext(url_context_);	223 url_fetcher_->SetRequestContext(url_context_);

193 url_fetcher_->SetReferrer(origin_url);	224 url_fetcher_->SetReferrer(config_.origin_url);

194	225

195 // The speech recognition API does not require user identification as part	226 // The speech recognition API does not require user identification as part

196 // of requests, so we don't send cookies or auth data for these requests to	227 // of requests, so we don't send cookies or auth data for these requests to

197 // prevent any accidental connection between users who are logged into the	228 // prevent any accidental connection between users who are logged into the

198 // domain for other services (e.g. bookmark sync) with the speech requests.	229 // domain for other services (e.g. bookmark sync) with the speech requests.

199 url_fetcher_->SetLoadFlags(	230 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |

200 net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |	231 net::LOAD_DO_NOT_SEND_COOKIES |

201 net::LOAD_DO_NOT_SEND_AUTH_DATA);	232 net::LOAD_DO_NOT_SEND_AUTH_DATA);

202 url_fetcher_->Start();	233 url_fetcher_->Start();

203 }	234 }

204	235

205 void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,	236 void GoogleOneShotRemoteEngine::EndRecognition() {

206 bool is_last_chunk) {	237 url_fetcher_.reset();

207 DCHECK(url_fetcher_.get());

208 url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);

209 }	238 }

210	239

211 void SpeechRecognitionRequest::OnURLFetchComplete(	240 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {

	241 DCHECK(url_fetcher_.get());

	242 DCHECK(encoder_.get());

	243 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);

	244 encoder_->Encode(data);

	245 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());

	246 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);

	247 }

	248

	249 void GoogleOneShotRemoteEngine::AudioChunksEnded() {

	250 DCHECK(url_fetcher_.get());

	251 DCHECK(encoder_.get());

	252

	253 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet

	254 // of silence in case encoder had no data already.

	255 std::vector<int16> samples(

	256 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);

	257 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),

	258 samples.size() * sizeof(int16),

	259 encoder_->bits_per_sample() / 8);

	260 encoder_->Encode(dummy_chunk);

	261 encoder_->Flush();

	262 scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());

	263 DCHECK(!encoded_dummy_data->IsEmpty());

	264 encoder_.reset();

	265

	266 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);

	267 }

	268

	269 void GoogleOneShotRemoteEngine::OnURLFetchComplete(

212 const content::URLFetcher* source) {	270 const content::URLFetcher* source) {

213 DCHECK_EQ(url_fetcher_.get(), source);	271 DCHECK_EQ(url_fetcher_.get(), source);

	272 SpeechRecognitionResult result;

	273 SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);

	274 std::string data;

214	275

215 content::SpeechRecognitionResult result;	276 // The default error code in case of parse errors is NETWORK_FAILURE, however

216 std::string data;	277 // ParseServerResponse can change the error to a more appropriate one.

217 if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 ||	278 bool error_occurred = (!source->GetStatus().is_success() ||

218 !source->GetResponseAsString(&data) ||	279 source->GetResponseCode() != 200 ||

219 !ParseServerResponse(data, &result)) {	280 !source->GetResponseAsString(&data) ||

220 result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;	281 !ParseServerResponse(data, &result, &error));

	282 url_fetcher_.reset();

	283 if (error_occurred) {

	284 DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;

	285 delegate()->OnSpeechRecognitionEngineError(error);

	286 } else {

	287 DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";

	288 delegate()->OnSpeechRecognitionEngineResult(result);

221 }	289 }

	290 }

222	291

223 DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";	292 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {

224 url_fetcher_.reset();	293 return url_fetcher_ != NULL;

225 delegate_->SetRecognitionResult(result);	294 }

	295

	296 int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {

	297 return kAudioPacketIntervalMs;

226 }	298 }

227	299

228 } // namespace speech	300 } // namespace speech

OLD	NEW