Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(90)

Side by Side Diff: content/browser/speech/google_ssfe_remote_engine.cc

Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Fixed according to Hans's review. Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/browser/speech/speech_recognition_request.h" 5 #include "content/browser/speech/google_ssfe_remote_engine.h"
6 6
7 #include <vector> 7 #include <vector>
8 8
9 #include "base/json/json_reader.h" 9 #include "base/json/json_reader.h"
10 #include "base/memory/scoped_ptr.h"
10 #include "base/string_number_conversions.h" 11 #include "base/string_number_conversions.h"
11 #include "base/string_util.h" 12 #include "base/string_util.h"
12 #include "base/values.h" 13 #include "base/values.h"
13 #include "content/browser/speech/audio_buffer.h" 14 #include "content/browser/speech/audio_buffer.h"
14 #include "content/common/net/url_fetcher_impl.h" 15 #include "content/common/net/url_fetcher_impl.h"
15 #include "content/public/common/speech_recognition_result.h" 16 #include "content/public/common/speech_recognition_result.h"
16 #include "net/base/escape.h" 17 #include "net/base/escape.h"
17 #include "net/base/load_flags.h" 18 #include "net/base/load_flags.h"
18 #include "net/url_request/url_request_context.h" 19 #include "net/url_request/url_request_context.h"
19 #include "net/url_request/url_request_context_getter.h" 20 #include "net/url_request/url_request_context_getter.h"
20 #include "net/url_request/url_request_status.h" 21 #include "net/url_request/url_request_status.h"
21 22
23 using content::SpeechRecognitionError;
24 using content::SpeechRecognitionHypothesis;
25 using content::SpeechRecognitionResult;
26
22 namespace { 27 namespace {
23 28
24 const char* const kDefaultSpeechRecognitionUrl = 29 const char* const kDefaultSpeechRecognitionUrl =
25 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&"; 30 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
26 const char* const kStatusString = "status"; 31 const char* const kStatusString = "status";
27 const char* const kHypothesesString = "hypotheses"; 32 const char* const kHypothesesString = "hypotheses";
28 const char* const kUtteranceString = "utterance"; 33 const char* const kUtteranceString = "utterance";
29 const char* const kConfidenceString = "confidence"; 34 const char* const kConfidenceString = "confidence";
30 35
31 // TODO(satish): Remove this hardcoded value once the page is allowed to 36 // TODO(satish): Remove this hardcoded value once the page is allowed to
32 // set this via an attribute. 37 // set this via an attribute.
33 const int kMaxResults = 6; 38 const int kMaxResults = 6;
34 39
40 const int SPEECH_API_STATUS_NO_ERROR = 0;
Satish 2012/03/16 17:00:35 add comment here on what these are (i.e. codes ret
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
41 const int SPEECH_API_STATUS_NO_SPEECH = 4;
42 const int SPEECH_API_STATUS_NO_MATCH = 5;
43
35 bool ParseServerResponse(const std::string& response_body, 44 bool ParseServerResponse(const std::string& response_body,
36 content::SpeechRecognitionResult* result) { 45 SpeechRecognitionResult* result,
46 SpeechRecognitionError* error) {
37 if (response_body.empty()) { 47 if (response_body.empty()) {
38 LOG(WARNING) << "ParseServerResponse: Response was empty."; 48 LOG(WARNING) << "ParseServerResponse: Response was empty.";
39 return false; 49 return false;
40 } 50 }
41 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; 51 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;
42 52
43 // Parse the response, ignoring comments. 53 // Parse the response, ignoring comments.
44 std::string error_msg; 54 std::string error_msg;
45 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( 55 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(
46 response_body, false, NULL, &error_msg)); 56 response_body, false, NULL, &error_msg));
(...skipping 13 matching lines...) Expand all
60 // Get the status. 70 // Get the status.
61 int status; 71 int status;
62 if (!response_object->GetInteger(kStatusString, &status)) { 72 if (!response_object->GetInteger(kStatusString, &status)) {
63 VLOG(1) << "ParseServerResponse: " << kStatusString 73 VLOG(1) << "ParseServerResponse: " << kStatusString
64 << " is not a valid integer value."; 74 << " is not a valid integer value.";
65 return false; 75 return false;
66 } 76 }
67 77
68 // Process the status. 78 // Process the status.
69 switch (status) { 79 switch (status) {
70 case content::SPEECH_RECOGNITION_ERROR_NONE: 80 case SPEECH_API_STATUS_NO_ERROR:
Satish 2012/03/16 17:00:35 should we set the error code to none in this case?
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
71 case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH: 81 break;
72 case content::SPEECH_RECOGNITION_ERROR_NO_MATCH: 82 case SPEECH_API_STATUS_NO_SPEECH:
73 break; 83 error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;
74 84 return false;
75 default: 85 case SPEECH_API_STATUS_NO_MATCH:
76 // Other status codes should not be returned by the server. 86 error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;
77 VLOG(1) << "ParseServerResponse: unexpected status code " << status; 87 return false;
78 return false; 88 default:
89 error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;
90 // Other status codes should not be returned by the server.
91 VLOG(1) << "ParseServerResponse: unexpected status code " << status;
92 return false;
79 } 93 }
80 94
81 result->error = static_cast<content::SpeechRecognitionErrorCode>(status);
82
83 // Get the hypotheses. 95 // Get the hypotheses.
84 Value* hypotheses_value = NULL; 96 Value* hypotheses_value = NULL;
85 if (!response_object->Get(kHypothesesString, &hypotheses_value)) { 97 if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
86 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; 98 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
87 return false; 99 return false;
88 } 100 }
89 101
90 DCHECK(hypotheses_value); 102 DCHECK(hypotheses_value);
91 if (!hypotheses_value->IsType(Value::TYPE_LIST)) { 103 if (!hypotheses_value->IsType(Value::TYPE_LIST)) {
92 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " 104 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
93 << hypotheses_value->GetType(); 105 << hypotheses_value->GetType();
94 return false; 106 return false;
95 } 107 }
96 108
97 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); 109 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);
98 110 // For now we support only single shot recognition, so we are giving only a
111 // final result, consisting of one fragment (with one or more hypotheses).
99 size_t index = 0; 112 size_t index = 0;
100 for (; index < hypotheses_list->GetSize(); ++index) { 113 for (; index < hypotheses_list->GetSize(); ++index) {
101 Value* hypothesis = NULL; 114 Value* hypothesis = NULL;
102 if (!hypotheses_list->Get(index, &hypothesis)) { 115 if (!hypotheses_list->Get(index, &hypothesis)) {
103 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; 116 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
104 break; 117 break;
105 } 118 }
106 DCHECK(hypothesis); 119 DCHECK(hypothesis);
107 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { 120 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {
108 LOG(WARNING) << "ParseServerResponse: Unexpected value type " 121 LOG(WARNING) << "ParseServerResponse: Unexpected value type "
109 << hypothesis->GetType(); 122 << hypothesis->GetType();
110 break; 123 break;
111 } 124 }
112 125
113 const DictionaryValue* hypothesis_value = 126 const DictionaryValue* hypothesis_value =
114 static_cast<DictionaryValue*>(hypothesis); 127 static_cast<DictionaryValue*>(hypothesis);
115 string16 utterance; 128 string16 utterance;
129
116 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { 130 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
117 LOG(WARNING) << "ParseServerResponse: Missing utterance value."; 131 LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
118 break; 132 break;
119 } 133 }
120 134
121 // It is not an error if the 'confidence' field is missing. 135 // It is not an error if the 'confidence' field is missing.
122 double confidence = 0.0; 136 double confidence = 0.0;
123 hypothesis_value->GetDouble(kConfidenceString, &confidence); 137 hypothesis_value->GetDouble(kConfidenceString, &confidence);
124 138 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
125 result->hypotheses.push_back(content::SpeechRecognitionHypothesis( 139 confidence));
126 utterance, confidence));
127 } 140 }
128 141
129 if (index < hypotheses_list->GetSize()) { 142 if (index < hypotheses_list->GetSize()) {
130 result->hypotheses.clear(); 143 result->hypotheses.clear();
131 return false; 144 return false;
132 } 145 }
133
134 return true; 146 return true;
135 } 147 }
136 148
137 } // namespace 149 } // namespace
138 150
151
Satish 2012/03/16 17:00:35 remove extra newline
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
139 namespace speech { 152 namespace speech {
140 153
141 int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0; 154 const int GoogleSSFERemoteEngine::kAudioPacketIntervalMs = 100;
155 int GoogleSSFERemoteEngine::url_fetcher_id_for_tests = 0;
142 156
143 SpeechRecognitionRequest::SpeechRecognitionRequest( 157 GoogleSSFERemoteEngineConfig::GoogleSSFERemoteEngineConfig()
144 net::URLRequestContextGetter* context, Delegate* delegate) 158 : filter_profanities(false),
Satish 2012/03/16 17:00:35 4 spaces before :
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
145 : url_context_(context), 159 audio_sample_rate(8000),
Satish 2012/03/16 17:00:35 these two values should probably be constants at t
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
146 delegate_(delegate) { 160 audio_num_bits_per_sample(16) {
147 DCHECK(delegate);
148 } 161 }
149 162
150 SpeechRecognitionRequest::~SpeechRecognitionRequest() {} 163 GoogleSSFERemoteEngineConfig::~GoogleSSFERemoteEngineConfig() {}
151 164
152 void SpeechRecognitionRequest::Start(const std::string& language, 165 GoogleSSFERemoteEngine::GoogleSSFERemoteEngine(
153 const std::string& grammar, 166 net::URLRequestContextGetter* context)
154 bool filter_profanities, 167 : url_context_(context),
155 const std::string& hardware_info, 168 codec_(AudioEncoder::CODEC_FLAC),
156 const std::string& origin_url, 169 encoder_(NULL) {
Satish 2012/03/16 17:00:35 this should get autoinitialized?
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
157 const std::string& content_type) { 170 }
171
172 GoogleSSFERemoteEngine::~GoogleSSFERemoteEngine() {}
173
174 void GoogleSSFERemoteEngine::SetConfiguration(
175 const GoogleSSFERemoteEngineConfig& config) {
176 config_ = config;
177 }
178
179 void GoogleSSFERemoteEngine::SpeechRecognitionBegins() {
180 DCHECK(delegate());
158 DCHECK(!url_fetcher_.get()); 181 DCHECK(!url_fetcher_.get());
182 std::vector<std::string> parts;
Satish 2012/03/16 17:00:35 move this to line 203 where it is getting used for
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
183 encoder_.reset(AudioEncoder::Create(codec_, config_.audio_sample_rate,
184 config_.audio_num_bits_per_sample));
185 DCHECK(encoder_.get());
186 std::string lang_param = config_.language;
159 187
160 std::vector<std::string> parts;
161
162 std::string lang_param = language;
163 if (lang_param.empty() && url_context_) { 188 if (lang_param.empty() && url_context_) {
164 // If no language is provided then we use the first from the accepted 189 // If no language is provided then we use the first from the accepted
165 // language list. If this list is empty then it defaults to "en-US". 190 // language list. If this list is empty then it defaults to "en-US".
166 // Example of the contents of this list: "es,en-GB;q=0.8", "" 191 // Example of the contents of this list: "es,en-GB;q=0.8", ""
167 net::URLRequestContext* request_context = 192 net::URLRequestContext* request_context =
168 url_context_->GetURLRequestContext(); 193 url_context_->GetURLRequestContext();
169 DCHECK(request_context); 194 DCHECK(request_context);
170 std::string accepted_language_list = request_context->accept_language(); 195 std::string accepted_language_list = request_context->accept_language();
171 size_t separator = accepted_language_list.find_first_of(",;"); 196 size_t separator = accepted_language_list.find_first_of(",;");
172 lang_param = accepted_language_list.substr(0, separator); 197 lang_param = accepted_language_list.substr(0, separator);
173 } 198 }
199
174 if (lang_param.empty()) 200 if (lang_param.empty())
175 lang_param = "en-US"; 201 lang_param = "en-US";
202
176 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); 203 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));
177 204
178 if (!grammar.empty()) 205 if (!config_.grammar.empty())
179 parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true)); 206 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));
180 if (!hardware_info.empty()) 207
181 parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true)); 208 if (!config_.hardware_info.empty())
209 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
210 true));
182 parts.push_back("maxresults=" + base::IntToString(kMaxResults)); 211 parts.push_back("maxresults=" + base::IntToString(kMaxResults));
183 parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0"); 212 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");
184 213
185 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); 214 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));
186 215
187 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests, 216 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,
188 url, 217 url,
189 URLFetcherImpl::POST, 218 URLFetcherImpl::POST,
190 this)); 219 this));
191 url_fetcher_->SetChunkedUpload(content_type); 220 url_fetcher_->SetChunkedUpload(encoder_->mime_type());
192 url_fetcher_->SetRequestContext(url_context_); 221 url_fetcher_->SetRequestContext(url_context_);
193 url_fetcher_->SetReferrer(origin_url); 222 url_fetcher_->SetReferrer(config_.origin_url);
194 223
195 // The speech recognition API does not require user identification as part 224 // The speech recognition API does not require user identification as part
196 // of requests, so we don't send cookies or auth data for these requests to 225 // of requests, so we don't send cookies or auth data for these requests to
197 // prevent any accidental connection between users who are logged into the 226 // prevent any accidental connection between users who are logged into the
198 // domain for other services (e.g. bookmark sync) with the speech requests. 227 // domain for other services (e.g. bookmark sync) with the speech requests.
199 url_fetcher_->SetLoadFlags( 228 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
200 net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | 229 net::LOAD_DO_NOT_SEND_COOKIES |
201 net::LOAD_DO_NOT_SEND_AUTH_DATA); 230 net::LOAD_DO_NOT_SEND_AUTH_DATA);
202 url_fetcher_->Start(); 231 url_fetcher_->Start();
203 } 232 }
204 233
205 void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk, 234 // Called only after the results have been retrieved.
206 bool is_last_chunk) { 235 void GoogleSSFERemoteEngine::SpeechRecognitionEnds() {
207 DCHECK(url_fetcher_.get()); 236 url_fetcher_.reset();
Satish 2012/03/16 17:00:35 do you expect callers to reuse this object across
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 On some occasions it can be reused (e.g., the user
208 url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);
209 } 237 }
210 238
211 void SpeechRecognitionRequest::OnURLFetchComplete( 239 void GoogleSSFERemoteEngine::PushSpeechAudio(const AudioChunk& data) {
240 DCHECK(url_fetcher_.get());
Satish 2012/03/16 17:00:35 also check for encoder_ as done below
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
241 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
242 encoder_->Encode(data);
243 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
244 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
245 }
246
247 void GoogleSSFERemoteEngine::SpeechAudioStreamComplete() {
248 DCHECK(url_fetcher_.get());
249 DCHECK(encoder_.get());
250 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
251 // of silence in case encoder had no data already.
252 std::vector<short> samples(
Satish 2012/03/16 17:00:35 short -> int16 ?
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
253 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
254 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),
255 samples.size() * sizeof(short),
Satish 2012/03/16 17:00:35 ditto
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
256 encoder_->bits_per_sample() / 8);
257 encoder_->Encode(dummy_chunk);
258 encoder_->Flush();
259 scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());
260 DCHECK(!encoded_dummy_data->IsEmpty());
261 encoder_.reset();
262
263 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
264 }
265
266 void GoogleSSFERemoteEngine::OnURLFetchComplete(
212 const content::URLFetcher* source) { 267 const content::URLFetcher* source) {
213 DCHECK_EQ(url_fetcher_.get(), source); 268 DCHECK_EQ(url_fetcher_.get(), source);
269 SpeechRecognitionResult result;
270 SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);
271 std::string data;
272 // The default error code in case of parse errors is NETWORK_FAILURE, however
Satish 2012/03/16 17:00:35 add a newline before full line comments such as th
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
273 // ParseServerResponse can change the error to a more appropriate one.
274 if (!source->GetStatus().is_success() ||
Satish 2012/03/16 17:00:35 no need to align the ||, could just leave 1 space
Primiano Tucci (use gerrit) 2012/03/20 13:14:50 Done.
275 source->GetResponseCode() != 200 ||
276 !source->GetResponseAsString(&data) ||
277 !ParseServerResponse(data, &result, &error)) {
278 DVLOG(1) << "GoogleSSFERemoteEngine: Network Error " << error.code;
279 delegate()->OnSpeechEngineError(error);
280 } else {
281 DVLOG(1) << "GoogleSSFERemoteEngine: Invoking delegate with result.";
282 delegate()->OnSpeechEngineResult(result);
283 }
284 url_fetcher_.reset();
285 }
214 286
215 content::SpeechRecognitionResult result; 287 bool GoogleSSFERemoteEngine::IsRecognitionPending() const {
216 std::string data; 288 return url_fetcher_ != NULL;
217 if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 || 289 }
218 !source->GetResponseAsString(&data) ||
219 !ParseServerResponse(data, &result)) {
220 result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;
221 }
222 290
223 DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result."; 291 int GoogleSSFERemoteEngine::DesiredAudioChunkDurationMs() const {
224 url_fetcher_.reset(); 292 return kAudioPacketIntervalMs;
225 delegate_->SetRecognitionResult(result);
226 } 293 }
227 294
228 } // namespace speech 295 } // namespace speech
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698