content/browser/speech/google_ssfe_remote_engine.cc - Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3)

Side by Side Diff: content/browser/speech/google_ssfe_remote_engine.cc

Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Rebased from master. Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« content/browser/speech/google_ssfe_remote_engine.h ('K') | « content/browser/speech/google_ssfe_remote_engine.h ('k') | content/browser/speech/google_ssfe_remote_engine_unittest.cc » ('j') | content/browser/speech/google_ssfe_remote_engine_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognition_request.h"	5 #include "content/browser/speech/google_ssfe_remote_engine.h"

6	6

7 #include <vector>	7 #include <vector>

8	8

9 #include "base/json/json_reader.h"	9 #include "base/json/json_reader.h"

	10 #include "base/memory/scoped_ptr.h"

10 #include "base/string_number_conversions.h"	11 #include "base/string_number_conversions.h"

11 #include "base/string_util.h"	12 #include "base/string_util.h"

12 #include "base/values.h"	13 #include "base/values.h"

13 #include "content/browser/speech/audio_buffer.h"	14 #include "content/browser/speech/audio_buffer.h"

14 #include "content/common/net/url_fetcher_impl.h"	15 #include "content/common/net/url_fetcher_impl.h"

15 #include "content/public/common/speech_recognition_result.h"	16 #include "content/public/common/speech_recognition_result.h"

16 #include "net/base/escape.h"	17 #include "net/base/escape.h"

17 #include "net/base/load_flags.h"	18 #include "net/base/load_flags.h"

18 #include "net/url_request/url_request_context.h"	19 #include "net/url_request/url_request_context.h"

19 #include "net/url_request/url_request_context_getter.h"	20 #include "net/url_request/url_request_context_getter.h"

20 #include "net/url_request/url_request_status.h"	21 #include "net/url_request/url_request_status.h"

21	22

	23 using content::SpeechRecognitionError;

	24 using content::SpeechRecognitionHypothesis;

	25 using content::SpeechRecognitionResult;

	26

22 namespace {	27 namespace {

23	28

24 const char* const kDefaultSpeechRecognitionUrl =	29 const char* const kDefaultSpeechRecognitionUrl =

25 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";	30 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";

26 const char* const kStatusString = "status";	31 const char* const kStatusString = "status";

27 const char* const kHypothesesString = "hypotheses";	32 const char* const kHypothesesString = "hypotheses";

28 const char* const kUtteranceString = "utterance";	33 const char* const kUtteranceString = "utterance";

29 const char* const kConfidenceString = "confidence";	34 const char* const kConfidenceString = "confidence";

30	35

31 // TODO(satish): Remove this hardcoded value once the page is allowed to	36 // TODO(satish): Remove this hardcoded value once the page is allowed to

32 // set this via an attribute.	37 // set this via an attribute.

33 const int kMaxResults = 6;	38 const int kMaxResults = 6;

34	39

	40 const int SPEECH_API_STATUS_NO_ERROR = 0;

	41 const int SPEECH_API_STATUS_NO_SPEECH = 4;

	42 const int SPEECH_API_STATUS_NO_MATCH = 5;

	43

35 bool ParseServerResponse(const std::string& response_body,	44 bool ParseServerResponse(const std::string& response_body,

36 content::SpeechRecognitionResult* result) {	45 SpeechRecognitionResult* result,

	46 SpeechRecognitionError* error) {

37 if (response_body.empty()) {	47 if (response_body.empty()) {

38 LOG(WARNING) << "ParseServerResponse: Response was empty.";	48 LOG(WARNING) << "ParseServerResponse: Response was empty.";

39 return false;	49 return false;

40 }	50 }

41 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;	51 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;

42	52

43 // Parse the response, ignoring comments.	53 // Parse the response, ignoring comments.

44 std::string error_msg;	54 std::string error_msg;

45 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(	55 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(

46 response_body, false, NULL, &error_msg));	56 response_body, false, NULL, &error_msg));

(...skipping 13 matching lines...) Expand all Loading...
60 // Get the status.	70 // Get the status.

61 int status;	71 int status;

62 if (!response_object->GetInteger(kStatusString, &status)) {	72 if (!response_object->GetInteger(kStatusString, &status)) {

63 VLOG(1) << "ParseServerResponse: " << kStatusString	73 VLOG(1) << "ParseServerResponse: " << kStatusString

64 << " is not a valid integer value.";	74 << " is not a valid integer value.";

65 return false;	75 return false;

66 }	76 }

67	77

68 // Process the status.	78 // Process the status.

69 switch (status) {	79 switch (status) {

70 case content::SPEECH_RECOGNITION_ERROR_NONE:	80 case SPEECH_API_STATUS_NO_ERROR:

71 case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:	81 break;

72 case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:	82 case SPEECH_API_STATUS_NO_SPEECH:

73 break;	83 error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;

74	84 return false;

75 default:	85 case SPEECH_API_STATUS_NO_MATCH:

76 // Other status codes should not be returned by the server.	86 error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;

77 VLOG(1) << "ParseServerResponse: unexpected status code " << status;	87 return false;

78 return false;	88 default:

	89 error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;

	90 // Other status codes should not be returned by the server.

	91 VLOG(1) << "ParseServerResponse: unexpected status code " << status;

	92 return false;

79 }	93 }

80	94

81 result->error = static_cast<content::SpeechRecognitionErrorCode>(status);

82

83 // Get the hypotheses.	95 // Get the hypotheses.

84 Value* hypotheses_value = NULL;	96 Value* hypotheses_value = NULL;

85 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {	97 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {

86 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";	98 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";

87 return false;	99 return false;

88 }	100 }

89	101

90 DCHECK(hypotheses_value);	102 DCHECK(hypotheses_value);

91 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {	103 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {

92 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "	104 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "

93 << hypotheses_value->GetType();	105 << hypotheses_value->GetType();

94 return false;	106 return false;

95 }	107 }

96	108

97 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);	109 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);

98	110 // For now we support only single shot recognition, so we are giving only a

	111 // final result, consisting of one fragment (with one or more hypotheses).

99 size_t index = 0;	112 size_t index = 0;

100 for (; index < hypotheses_list->GetSize(); ++index) {	113 for (; index < hypotheses_list->GetSize(); ++index) {

101 Value* hypothesis = NULL;	114 Value* hypothesis = NULL;

102 if (!hypotheses_list->Get(index, &hypothesis)) {	115 if (!hypotheses_list->Get(index, &hypothesis)) {

103 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";	116 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";

104 break;	117 break;

105 }	118 }

106 DCHECK(hypothesis);	119 DCHECK(hypothesis);

107 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {	120 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {

108 LOG(WARNING) << "ParseServerResponse: Unexpected value type "	121 LOG(WARNING) << "ParseServerResponse: Unexpected value type "

109 << hypothesis->GetType();	122 << hypothesis->GetType();

110 break;	123 break;

111 }	124 }

112	125

113 const DictionaryValue* hypothesis_value =	126 const DictionaryValue* hypothesis_value =

114 static_cast<DictionaryValue*>(hypothesis);	127 static_cast<DictionaryValue*>(hypothesis);

115 string16 utterance;	128 string16 utterance;

	129

116 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {	130 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {

117 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";	131 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";

118 break;	132 break;

119 }	133 }

120	134

121 // It is not an error if the 'confidence' field is missing.	135 // It is not an error if the 'confidence' field is missing.

122 double confidence = 0.0;	136 double confidence = 0.0;

123 hypothesis_value->GetDouble(kConfidenceString, &confidence);	137 hypothesis_value->GetDouble(kConfidenceString, &confidence);

124	138 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,

125 result->hypotheses.push_back(content::SpeechRecognitionHypothesis(	139 confidence));

126 utterance, confidence));

127 }	140 }

128	141

129 if (index < hypotheses_list->GetSize()) {	142 if (index < hypotheses_list->GetSize()) {

130 result->hypotheses.clear();	143 result->hypotheses.clear();

131 return false;	144 return false;

132 }	145 }

133

134 return true;	146 return true;

135 }	147 }

136	148

137 } // namespace	149 } // namespace

138	150

	151

139 namespace speech {	152 namespace speech {

140	153

141 int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;	154 const int GoogleSSFERemoteEngine::kAudioPacketIntervalMs = 100;

	155 int GoogleSSFERemoteEngine::url_fetcher_id_for_tests = 0;

142	156

143 SpeechRecognitionRequest::SpeechRecognitionRequest(	157 GoogleSSFERemoteEngineConfig::GoogleSSFERemoteEngineConfig()

144 net::URLRequestContextGetter* context, Delegate* delegate)	158 : filter_profanities(false),

145 : url_context_(context),	159 audio_sample_rate(8000),

146 delegate_(delegate) {	160 audio_num_bits_per_sample(16) {

147 DCHECK(delegate);

148 }	161 }

149	162

150 SpeechRecognitionRequest::~SpeechRecognitionRequest() {}	163 GoogleSSFERemoteEngineConfig::~GoogleSSFERemoteEngineConfig() {}

151	164

152 void SpeechRecognitionRequest::Start(const std::string& language,	165 GoogleSSFERemoteEngine::GoogleSSFERemoteEngine(

153 const std::string& grammar,	166 net::URLRequestContextGetter* context)

154 bool filter_profanities,	167 : url_context_(context),

155 const std::string& hardware_info,	168 codec_(AudioEncoder::CODEC_FLAC),

156 const std::string& origin_url,	169 encoder_(NULL) {

157 const std::string& content_type) {	170 }

	171

	172 GoogleSSFERemoteEngine::~GoogleSSFERemoteEngine() {}

	173

	174 void GoogleSSFERemoteEngine::SetConfiguration(

	175 const GoogleSSFERemoteEngineConfig& config) {

	176 config_ = config;

	177 }

	178

	179 void GoogleSSFERemoteEngine::SpeechRecognitionBegins() {

	180 DCHECK(sr_delegate());

158 DCHECK(!url_fetcher_.get());	181 DCHECK(!url_fetcher_.get());

	182 std::vector<std::string> parts;

	183 encoder_.reset(AudioEncoder::Create(codec_, config_.audio_sample_rate,

	184 config_.audio_num_bits_per_sample));

	185 DCHECK(encoder_.get());

	186 std::string lang_param = config_.language;

159	187

160 std::vector<std::string> parts;

161

162 std::string lang_param = language;

163 if (lang_param.empty() && url_context_) {	188 if (lang_param.empty() && url_context_) {

164 // If no language is provided then we use the first from the accepted	189 // If no language is provided then we use the first from the accepted

165 // language list. If this list is empty then it defaults to "en-US".	190 // language list. If this list is empty then it defaults to "en-US".

166 // Example of the contents of this list: "es,en-GB;q=0.8", ""	191 // Example of the contents of this list: "es,en-GB;q=0.8", ""

167 net::URLRequestContext* request_context =	192 net::URLRequestContext* request_context =

168 url_context_->GetURLRequestContext();	193 url_context_->GetURLRequestContext();

169 DCHECK(request_context);	194 DCHECK(request_context);

170 std::string accepted_language_list = request_context->accept_language();	195 std::string accepted_language_list = request_context->accept_language();

171 size_t separator = accepted_language_list.find_first_of(",;");	196 size_t separator = accepted_language_list.find_first_of(",;");

172 lang_param = accepted_language_list.substr(0, separator);	197 lang_param = accepted_language_list.substr(0, separator);

173 }	198 }

	199

174 if (lang_param.empty())	200 if (lang_param.empty())

175 lang_param = "en-US";	201 lang_param = "en-US";

	202

176 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));	203 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

177	204

178 if (!grammar.empty())	205 if (!config_.grammar.empty())

179 parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));	206 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));

180 if (!hardware_info.empty())	207

181 parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));	208 if (!config_.hardware_info.empty())

	209 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,

	210 true));

182 parts.push_back("maxresults=" + base::IntToString(kMaxResults));	211 parts.push_back("maxresults=" + base::IntToString(kMaxResults));

183 parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");	212 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

184	213

185 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));	214 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

186	215

187 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,	216 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,

188 url,	217 url,

189 URLFetcherImpl::POST,	218 URLFetcherImpl::POST,

190 this));	219 this));

191 url_fetcher_->SetChunkedUpload(content_type);	220 url_fetcher_->SetChunkedUpload(encoder_->mime_type());

192 url_fetcher_->SetRequestContext(url_context_);	221 url_fetcher_->SetRequestContext(url_context_);

193 url_fetcher_->SetReferrer(origin_url);	222 url_fetcher_->SetReferrer(config_.origin_url);

194	223

195 // The speech recognition API does not require user identification as part	224 // The speech recognition API does not require user identification as part

196 // of requests, so we don't send cookies or auth data for these requests to	225 // of requests, so we don't send cookies or auth data for these requests to

197 // prevent any accidental connection between users who are logged into the	226 // prevent any accidental connection between users who are logged into the

198 // domain for other services (e.g. bookmark sync) with the speech requests.	227 // domain for other services (e.g. bookmark sync) with the speech requests.

199 url_fetcher_->SetLoadFlags(	228 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |

200 net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |	229 net::LOAD_DO_NOT_SEND_COOKIES |

201 net::LOAD_DO_NOT_SEND_AUTH_DATA);	230 net::LOAD_DO_NOT_SEND_AUTH_DATA);

202 url_fetcher_->Start();	231 url_fetcher_->Start();

203 }	232 }

204	233

205 void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,	234 // Called only after the results have been retrieved.

206 bool is_last_chunk) {	235 void GoogleSSFERemoteEngine::SpeechRecognitionEnds() {

207 DCHECK(url_fetcher_.get());	236 url_fetcher_.reset();

208 url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);

209 }	237 }

210	238

211 void SpeechRecognitionRequest::OnURLFetchComplete(	239 void GoogleSSFERemoteEngine::PushSpeechAudio(const AudioChunk& data) {

	240 DCHECK(url_fetcher_.get());

	241 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);

	242 std::string encoded_audio_string;

	243 encoder_->Encode(data);

	244 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());

	245 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);

	246 }

	247

	248 void GoogleSSFERemoteEngine::SpeechAudioStreamComplete() {

	249 DCHECK(url_fetcher_.get());

	250 DCHECK(encoder_.get());

	251 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet

	252 // of silence in case encoder had no data already.

	253 std::vector<short> samples(

	254 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);

	255 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),

	256 samples.size() * sizeof(short),

	257 encoder_->bits_per_sample() / 8);

	258 encoder_->Encode(dummy_chunk);

	259 encoder_->Flush();

	260 scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());

	261 DCHECK(!encoded_dummy_data->IsEmpty());

	262 encoder_.reset();

	263

	264 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);

	265 }

	266

	267 void GoogleSSFERemoteEngine::OnURLFetchComplete(

212 const content::URLFetcher* source) {	268 const content::URLFetcher* source) {

213 DCHECK_EQ(url_fetcher_.get(), source);	269 DCHECK_EQ(url_fetcher_.get(), source);

	270 SpeechRecognitionResult result;

	271 SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);

	272 std::string data;

	273 // The default error code in case of parse errors is NETWORK_FAILURE, however

	274 // ParseServerResponse can change the error to a more appropriate one.

	275 if (!source->GetStatus().is_success() ||

	276 source->GetResponseCode() != 200 ||

	277 !source->GetResponseAsString(&data) ||

	278 !ParseServerResponse(data, &result, &error)) {

	279 DVLOG(1) << "GoogleSSFERemoteEngine: Network Error " << error.code;

	280 sr_delegate()->OnSpeechEngineError(error);

	281 } else {

	282 DVLOG(1) << "GoogleSSFERemoteEngine: Invoking delegate with result.";

	283 sr_delegate()->OnSpeechEngineResult(result);

	284 }

	285 url_fetcher_.reset();

	286 }

214	287

215 content::SpeechRecognitionResult result;	288 bool GoogleSSFERemoteEngine::IsRecognitionPending() const {

216 std::string data;	289 return url_fetcher_ != NULL;

217 if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 ||	290 }

218 !source->GetResponseAsString(&data) ||

219 !ParseServerResponse(data, &result)) {

220 result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;

221 }

222	291

223 DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";	292 int GoogleSSFERemoteEngine::DesiredAudioChunkDurationMs() const {

224 url_fetcher_.reset();	293 return kAudioPacketIntervalMs;

225 delegate_->SetRecognitionResult(result);

226 }	294 }

227	295

228 } // namespace speech	296 } // namespace speech

OLD	NEW