OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognition_request.h" | 5 #include "content/browser/speech/google_ssfe_remote_engine.h" |
6 | 6 |
7 #include <vector> | 7 #include <vector> |
8 | 8 |
9 #include "base/json/json_reader.h" | 9 #include "base/json/json_reader.h" |
10 #include "base/memory/scoped_ptr.h" | |
10 #include "base/string_number_conversions.h" | 11 #include "base/string_number_conversions.h" |
11 #include "base/string_util.h" | 12 #include "base/string_util.h" |
12 #include "base/values.h" | 13 #include "base/values.h" |
13 #include "content/browser/speech/audio_buffer.h" | 14 #include "content/browser/speech/audio_buffer.h" |
14 #include "content/common/net/url_fetcher_impl.h" | 15 #include "content/common/net/url_fetcher_impl.h" |
15 #include "content/public/common/speech_recognition_result.h" | 16 #include "content/public/common/speech_recognition_result.h" |
16 #include "net/base/escape.h" | 17 #include "net/base/escape.h" |
17 #include "net/base/load_flags.h" | 18 #include "net/base/load_flags.h" |
18 #include "net/url_request/url_request_context.h" | 19 #include "net/url_request/url_request_context.h" |
19 #include "net/url_request/url_request_context_getter.h" | 20 #include "net/url_request/url_request_context_getter.h" |
20 #include "net/url_request/url_request_status.h" | 21 #include "net/url_request/url_request_status.h" |
21 | 22 |
23 using content::SpeechRecognitionError; | |
24 using content::SpeechRecognitionHypothesis; | |
25 using content::SpeechRecognitionResult; | |
26 | |
22 namespace { | 27 namespace { |
23 | 28 |
24 const char* const kDefaultSpeechRecognitionUrl = | 29 const char* const kDefaultSpeechRecognitionUrl = |
25 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&"; | 30 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&"; |
26 const char* const kStatusString = "status"; | 31 const char* const kStatusString = "status"; |
27 const char* const kHypothesesString = "hypotheses"; | 32 const char* const kHypothesesString = "hypotheses"; |
28 const char* const kUtteranceString = "utterance"; | 33 const char* const kUtteranceString = "utterance"; |
29 const char* const kConfidenceString = "confidence"; | 34 const char* const kConfidenceString = "confidence"; |
30 | 35 |
31 // TODO(satish): Remove this hardcoded value once the page is allowed to | 36 // TODO(satish): Remove this hardcoded value once the page is allowed to |
32 // set this via an attribute. | 37 // set this via an attribute. |
33 const int kMaxResults = 6; | 38 const int kMaxResults = 6; |
34 | 39 |
40 const int SPEECH_API_STATUS_NO_ERROR = 0; | |
Satish
2012/03/16 17:00:35
add comment here on what these are (i.e. codes ret
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
41 const int SPEECH_API_STATUS_NO_SPEECH = 4; | |
42 const int SPEECH_API_STATUS_NO_MATCH = 5; | |
43 | |
35 bool ParseServerResponse(const std::string& response_body, | 44 bool ParseServerResponse(const std::string& response_body, |
36 content::SpeechRecognitionResult* result) { | 45 SpeechRecognitionResult* result, |
46 SpeechRecognitionError* error) { | |
37 if (response_body.empty()) { | 47 if (response_body.empty()) { |
38 LOG(WARNING) << "ParseServerResponse: Response was empty."; | 48 LOG(WARNING) << "ParseServerResponse: Response was empty."; |
39 return false; | 49 return false; |
40 } | 50 } |
41 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; | 51 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; |
42 | 52 |
43 // Parse the response, ignoring comments. | 53 // Parse the response, ignoring comments. |
44 std::string error_msg; | 54 std::string error_msg; |
45 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( | 55 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( |
46 response_body, false, NULL, &error_msg)); | 56 response_body, false, NULL, &error_msg)); |
(...skipping 13 matching lines...) Expand all Loading... | |
60 // Get the status. | 70 // Get the status. |
61 int status; | 71 int status; |
62 if (!response_object->GetInteger(kStatusString, &status)) { | 72 if (!response_object->GetInteger(kStatusString, &status)) { |
63 VLOG(1) << "ParseServerResponse: " << kStatusString | 73 VLOG(1) << "ParseServerResponse: " << kStatusString |
64 << " is not a valid integer value."; | 74 << " is not a valid integer value."; |
65 return false; | 75 return false; |
66 } | 76 } |
67 | 77 |
68 // Process the status. | 78 // Process the status. |
69 switch (status) { | 79 switch (status) { |
70 case content::SPEECH_RECOGNITION_ERROR_NONE: | 80 case SPEECH_API_STATUS_NO_ERROR: |
Satish
2012/03/16 17:00:35
should we set the error code to none in this case?
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
71 case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH: | 81 break; |
72 case content::SPEECH_RECOGNITION_ERROR_NO_MATCH: | 82 case SPEECH_API_STATUS_NO_SPEECH: |
73 break; | 83 error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH; |
74 | 84 return false; |
75 default: | 85 case SPEECH_API_STATUS_NO_MATCH: |
76 // Other status codes should not be returned by the server. | 86 error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH; |
77 VLOG(1) << "ParseServerResponse: unexpected status code " << status; | 87 return false; |
78 return false; | 88 default: |
89 error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK; | |
90 // Other status codes should not be returned by the server. | |
91 VLOG(1) << "ParseServerResponse: unexpected status code " << status; | |
92 return false; | |
79 } | 93 } |
80 | 94 |
81 result->error = static_cast<content::SpeechRecognitionErrorCode>(status); | |
82 | |
83 // Get the hypotheses. | 95 // Get the hypotheses. |
84 Value* hypotheses_value = NULL; | 96 Value* hypotheses_value = NULL; |
85 if (!response_object->Get(kHypothesesString, &hypotheses_value)) { | 97 if (!response_object->Get(kHypothesesString, &hypotheses_value)) { |
86 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; | 98 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; |
87 return false; | 99 return false; |
88 } | 100 } |
89 | 101 |
90 DCHECK(hypotheses_value); | 102 DCHECK(hypotheses_value); |
91 if (!hypotheses_value->IsType(Value::TYPE_LIST)) { | 103 if (!hypotheses_value->IsType(Value::TYPE_LIST)) { |
92 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " | 104 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " |
93 << hypotheses_value->GetType(); | 105 << hypotheses_value->GetType(); |
94 return false; | 106 return false; |
95 } | 107 } |
96 | 108 |
97 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); | 109 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); |
98 | 110 // For now we support only single shot recognition, so we are giving only a |
111 // final result, consisting of one fragment (with one or more hypotheses). | |
99 size_t index = 0; | 112 size_t index = 0; |
100 for (; index < hypotheses_list->GetSize(); ++index) { | 113 for (; index < hypotheses_list->GetSize(); ++index) { |
101 Value* hypothesis = NULL; | 114 Value* hypothesis = NULL; |
102 if (!hypotheses_list->Get(index, &hypothesis)) { | 115 if (!hypotheses_list->Get(index, &hypothesis)) { |
103 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; | 116 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; |
104 break; | 117 break; |
105 } | 118 } |
106 DCHECK(hypothesis); | 119 DCHECK(hypothesis); |
107 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { | 120 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { |
108 LOG(WARNING) << "ParseServerResponse: Unexpected value type " | 121 LOG(WARNING) << "ParseServerResponse: Unexpected value type " |
109 << hypothesis->GetType(); | 122 << hypothesis->GetType(); |
110 break; | 123 break; |
111 } | 124 } |
112 | 125 |
113 const DictionaryValue* hypothesis_value = | 126 const DictionaryValue* hypothesis_value = |
114 static_cast<DictionaryValue*>(hypothesis); | 127 static_cast<DictionaryValue*>(hypothesis); |
115 string16 utterance; | 128 string16 utterance; |
129 | |
116 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { | 130 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { |
117 LOG(WARNING) << "ParseServerResponse: Missing utterance value."; | 131 LOG(WARNING) << "ParseServerResponse: Missing utterance value."; |
118 break; | 132 break; |
119 } | 133 } |
120 | 134 |
121 // It is not an error if the 'confidence' field is missing. | 135 // It is not an error if the 'confidence' field is missing. |
122 double confidence = 0.0; | 136 double confidence = 0.0; |
123 hypothesis_value->GetDouble(kConfidenceString, &confidence); | 137 hypothesis_value->GetDouble(kConfidenceString, &confidence); |
124 | 138 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance, |
125 result->hypotheses.push_back(content::SpeechRecognitionHypothesis( | 139 confidence)); |
126 utterance, confidence)); | |
127 } | 140 } |
128 | 141 |
129 if (index < hypotheses_list->GetSize()) { | 142 if (index < hypotheses_list->GetSize()) { |
130 result->hypotheses.clear(); | 143 result->hypotheses.clear(); |
131 return false; | 144 return false; |
132 } | 145 } |
133 | |
134 return true; | 146 return true; |
135 } | 147 } |
136 | 148 |
137 } // namespace | 149 } // namespace |
138 | 150 |
151 | |
Satish
2012/03/16 17:00:35
remove extra newline
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
139 namespace speech { | 152 namespace speech { |
140 | 153 |
141 int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0; | 154 const int GoogleSSFERemoteEngine::kAudioPacketIntervalMs = 100; |
155 int GoogleSSFERemoteEngine::url_fetcher_id_for_tests = 0; | |
142 | 156 |
143 SpeechRecognitionRequest::SpeechRecognitionRequest( | 157 GoogleSSFERemoteEngineConfig::GoogleSSFERemoteEngineConfig() |
144 net::URLRequestContextGetter* context, Delegate* delegate) | 158 : filter_profanities(false), |
Satish
2012/03/16 17:00:35
4 spaces before :
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
145 : url_context_(context), | 159 audio_sample_rate(8000), |
Satish
2012/03/16 17:00:35
these two values should probably be constants at t
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
146 delegate_(delegate) { | 160 audio_num_bits_per_sample(16) { |
147 DCHECK(delegate); | |
148 } | 161 } |
149 | 162 |
150 SpeechRecognitionRequest::~SpeechRecognitionRequest() {} | 163 GoogleSSFERemoteEngineConfig::~GoogleSSFERemoteEngineConfig() {} |
151 | 164 |
152 void SpeechRecognitionRequest::Start(const std::string& language, | 165 GoogleSSFERemoteEngine::GoogleSSFERemoteEngine( |
153 const std::string& grammar, | 166 net::URLRequestContextGetter* context) |
154 bool filter_profanities, | 167 : url_context_(context), |
155 const std::string& hardware_info, | 168 codec_(AudioEncoder::CODEC_FLAC), |
156 const std::string& origin_url, | 169 encoder_(NULL) { |
Satish
2012/03/16 17:00:35
this should get autoinitialized?
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
157 const std::string& content_type) { | 170 } |
171 | |
172 GoogleSSFERemoteEngine::~GoogleSSFERemoteEngine() {} | |
173 | |
174 void GoogleSSFERemoteEngine::SetConfiguration( | |
175 const GoogleSSFERemoteEngineConfig& config) { | |
176 config_ = config; | |
177 } | |
178 | |
179 void GoogleSSFERemoteEngine::SpeechRecognitionBegins() { | |
180 DCHECK(delegate()); | |
158 DCHECK(!url_fetcher_.get()); | 181 DCHECK(!url_fetcher_.get()); |
182 std::vector<std::string> parts; | |
Satish
2012/03/16 17:00:35
move this to line 203 where it is getting used for
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
183 encoder_.reset(AudioEncoder::Create(codec_, config_.audio_sample_rate, | |
184 config_.audio_num_bits_per_sample)); | |
185 DCHECK(encoder_.get()); | |
186 std::string lang_param = config_.language; | |
159 | 187 |
160 std::vector<std::string> parts; | |
161 | |
162 std::string lang_param = language; | |
163 if (lang_param.empty() && url_context_) { | 188 if (lang_param.empty() && url_context_) { |
164 // If no language is provided then we use the first from the accepted | 189 // If no language is provided then we use the first from the accepted |
165 // language list. If this list is empty then it defaults to "en-US". | 190 // language list. If this list is empty then it defaults to "en-US". |
166 // Example of the contents of this list: "es,en-GB;q=0.8", "" | 191 // Example of the contents of this list: "es,en-GB;q=0.8", "" |
167 net::URLRequestContext* request_context = | 192 net::URLRequestContext* request_context = |
168 url_context_->GetURLRequestContext(); | 193 url_context_->GetURLRequestContext(); |
169 DCHECK(request_context); | 194 DCHECK(request_context); |
170 std::string accepted_language_list = request_context->accept_language(); | 195 std::string accepted_language_list = request_context->accept_language(); |
171 size_t separator = accepted_language_list.find_first_of(",;"); | 196 size_t separator = accepted_language_list.find_first_of(",;"); |
172 lang_param = accepted_language_list.substr(0, separator); | 197 lang_param = accepted_language_list.substr(0, separator); |
173 } | 198 } |
199 | |
174 if (lang_param.empty()) | 200 if (lang_param.empty()) |
175 lang_param = "en-US"; | 201 lang_param = "en-US"; |
202 | |
176 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); | 203 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); |
177 | 204 |
178 if (!grammar.empty()) | 205 if (!config_.grammar.empty()) |
179 parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true)); | 206 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true)); |
180 if (!hardware_info.empty()) | 207 |
181 parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true)); | 208 if (!config_.hardware_info.empty()) |
209 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info, | |
210 true)); | |
182 parts.push_back("maxresults=" + base::IntToString(kMaxResults)); | 211 parts.push_back("maxresults=" + base::IntToString(kMaxResults)); |
183 parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0"); | 212 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0"); |
184 | 213 |
185 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); | 214 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); |
186 | 215 |
187 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests, | 216 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests, |
188 url, | 217 url, |
189 URLFetcherImpl::POST, | 218 URLFetcherImpl::POST, |
190 this)); | 219 this)); |
191 url_fetcher_->SetChunkedUpload(content_type); | 220 url_fetcher_->SetChunkedUpload(encoder_->mime_type()); |
192 url_fetcher_->SetRequestContext(url_context_); | 221 url_fetcher_->SetRequestContext(url_context_); |
193 url_fetcher_->SetReferrer(origin_url); | 222 url_fetcher_->SetReferrer(config_.origin_url); |
194 | 223 |
195 // The speech recognition API does not require user identification as part | 224 // The speech recognition API does not require user identification as part |
196 // of requests, so we don't send cookies or auth data for these requests to | 225 // of requests, so we don't send cookies or auth data for these requests to |
197 // prevent any accidental connection between users who are logged into the | 226 // prevent any accidental connection between users who are logged into the |
198 // domain for other services (e.g. bookmark sync) with the speech requests. | 227 // domain for other services (e.g. bookmark sync) with the speech requests. |
199 url_fetcher_->SetLoadFlags( | 228 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
200 net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | | 229 net::LOAD_DO_NOT_SEND_COOKIES | |
201 net::LOAD_DO_NOT_SEND_AUTH_DATA); | 230 net::LOAD_DO_NOT_SEND_AUTH_DATA); |
202 url_fetcher_->Start(); | 231 url_fetcher_->Start(); |
203 } | 232 } |
204 | 233 |
205 void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk, | 234 // Called only after the results have been retrieved. |
206 bool is_last_chunk) { | 235 void GoogleSSFERemoteEngine::SpeechRecognitionEnds() { |
207 DCHECK(url_fetcher_.get()); | 236 url_fetcher_.reset(); |
Satish
2012/03/16 17:00:35
do you expect callers to reuse this object across
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
On some occasions it can be reused (e.g., the user
| |
208 url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk); | |
209 } | 237 } |
210 | 238 |
211 void SpeechRecognitionRequest::OnURLFetchComplete( | 239 void GoogleSSFERemoteEngine::PushSpeechAudio(const AudioChunk& data) { |
240 DCHECK(url_fetcher_.get()); | |
Satish
2012/03/16 17:00:35
also check for encoder_ as done below
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
241 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); | |
242 encoder_->Encode(data); | |
243 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); | |
244 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false); | |
245 } | |
246 | |
247 void GoogleSSFERemoteEngine::SpeechAudioStreamComplete() { | |
248 DCHECK(url_fetcher_.get()); | |
249 DCHECK(encoder_.get()); | |
250 // AppendChunkToUpload requires a non-empty final buffer, so we encode a | |
251 // packet of silence in case the encoder has no data buffered already. | |
252 std::vector<short> samples( | |
Satish
2012/03/16 17:00:35
short -> int16 ?
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
253 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000); | |
254 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]), | |
255 samples.size() * sizeof(short), | |
Satish
2012/03/16 17:00:35
ditto
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
256 encoder_->bits_per_sample() / 8); | |
257 encoder_->Encode(dummy_chunk); | |
258 encoder_->Flush(); | |
259 scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear()); | |
260 DCHECK(!encoded_dummy_data->IsEmpty()); | |
261 encoder_.reset(); | |
262 | |
263 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true); | |
264 } | |
265 | |
266 void GoogleSSFERemoteEngine::OnURLFetchComplete( | |
212 const content::URLFetcher* source) { | 267 const content::URLFetcher* source) { |
213 DCHECK_EQ(url_fetcher_.get(), source); | 268 DCHECK_EQ(url_fetcher_.get(), source); |
269 SpeechRecognitionResult result; | |
270 SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK); | |
271 std::string data; | |
272 // The default error code in case of parse errors is SPEECH_RECOGNITION_ERROR_NETWORK; however, | |
Satish
2012/03/16 17:00:35
add a newline before full line comments such as th
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
273 // ParseServerResponse can change the error to a more appropriate one. | |
274 if (!source->GetStatus().is_success() || | |
Satish
2012/03/16 17:00:35
no need to align the ||, could just leave 1 space
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
| |
275 source->GetResponseCode() != 200 || | |
276 !source->GetResponseAsString(&data) || | |
277 !ParseServerResponse(data, &result, &error)) { | |
278 DVLOG(1) << "GoogleSSFERemoteEngine: Network Error " << error.code; | |
279 delegate()->OnSpeechEngineError(error); | |
280 } else { | |
281 DVLOG(1) << "GoogleSSFERemoteEngine: Invoking delegate with result."; | |
282 delegate()->OnSpeechEngineResult(result); | |
283 } | |
284 url_fetcher_.reset(); | |
285 } | |
214 | 286 |
215 content::SpeechRecognitionResult result; | 287 bool GoogleSSFERemoteEngine::IsRecognitionPending() const { |
216 std::string data; | 288 return url_fetcher_ != NULL; |
217 if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 || | 289 } |
218 !source->GetResponseAsString(&data) || | |
219 !ParseServerResponse(data, &result)) { | |
220 result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK; | |
221 } | |
222 | 290 |
223 DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result."; | 291 int GoogleSSFERemoteEngine::DesiredAudioChunkDurationMs() const { |
224 url_fetcher_.reset(); | 292 return kAudioPacketIntervalMs; |
225 delegate_->SetRecognitionResult(result); | |
226 } | 293 } |
227 | 294 |
228 } // namespace speech | 295 } // namespace speech |
OLD | NEW |