| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "content/browser/speech/google_one_shot_remote_engine.h" | |
| 6 | |
| 7 #include <stddef.h> | |
| 8 #include <stdint.h> | |
| 9 | |
| 10 #include <vector> | |
| 11 | |
| 12 #include "base/json/json_reader.h" | |
| 13 #include "base/strings/string_number_conversions.h" | |
| 14 #include "base/strings/string_util.h" | |
| 15 #include "base/values.h" | |
| 16 #include "content/browser/speech/audio_buffer.h" | |
| 17 #include "content/public/common/speech_recognition_error.h" | |
| 18 #include "content/public/common/speech_recognition_result.h" | |
| 19 #include "google_apis/google_api_keys.h" | |
| 20 #include "net/base/escape.h" | |
| 21 #include "net/base/load_flags.h" | |
| 22 #include "net/url_request/http_user_agent_settings.h" | |
| 23 #include "net/url_request/url_fetcher.h" | |
| 24 #include "net/url_request/url_request_context.h" | |
| 25 #include "net/url_request/url_request_context_getter.h" | |
| 26 #include "net/url_request/url_request_status.h" | |
| 27 | |
| 28 namespace content { | |
| 29 namespace { | |
| 30 | |
// Endpoint of the legacy Google one-shot speech recognition web service.
// Query parameters (lang, lm, xhw, maxresults, pfilter, key) are appended
// in StartRecognition().
const char kDefaultSpeechRecognitionUrl[] =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";

// JSON attribute names in the web service response.
const char kStatusString[] = "status";
const char kHypothesesString[] = "hypotheses";
const char kUtteranceString[] = "utterance";
const char kConfidenceString[] = "confidence";

// "status" codes returned by the web service.
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;
| 40 | |
| 41 bool ParseServerResponse(const std::string& response_body, | |
| 42 SpeechRecognitionResult* result, | |
| 43 SpeechRecognitionError* error) { | |
| 44 if (response_body.empty()) { | |
| 45 LOG(WARNING) << "ParseServerResponse: Response was empty."; | |
| 46 return false; | |
| 47 } | |
| 48 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; | |
| 49 | |
| 50 // Parse the response, ignoring comments. | |
| 51 std::string error_msg; | |
| 52 std::unique_ptr<base::Value> response_value = | |
| 53 base::JSONReader::ReadAndReturnError(response_body, base::JSON_PARSE_RFC, | |
| 54 NULL, &error_msg); | |
| 55 if (response_value == NULL) { | |
| 56 LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg; | |
| 57 return false; | |
| 58 } | |
| 59 | |
| 60 if (!response_value->IsType(base::Value::TYPE_DICTIONARY)) { | |
| 61 DVLOG(1) << "ParseServerResponse: Unexpected response type " | |
| 62 << response_value->GetType(); | |
| 63 return false; | |
| 64 } | |
| 65 const base::DictionaryValue* response_object = | |
| 66 static_cast<const base::DictionaryValue*>(response_value.get()); | |
| 67 | |
| 68 // Get the status. | |
| 69 int status; | |
| 70 if (!response_object->GetInteger(kStatusString, &status)) { | |
| 71 DVLOG(1) << "ParseServerResponse: " << kStatusString | |
| 72 << " is not a valid integer value."; | |
| 73 return false; | |
| 74 } | |
| 75 | |
| 76 // Process the status. | |
| 77 switch (status) { | |
| 78 case kWebServiceStatusNoError: | |
| 79 break; | |
| 80 case kWebServiceStatusNoSpeech: | |
| 81 error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH; | |
| 82 return false; | |
| 83 case kWebServiceStatusNoMatch: | |
| 84 error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH; | |
| 85 return false; | |
| 86 default: | |
| 87 error->code = SPEECH_RECOGNITION_ERROR_NETWORK; | |
| 88 // Other status codes should not be returned by the server. | |
| 89 DVLOG(1) << "ParseServerResponse: unexpected status code " << status; | |
| 90 return false; | |
| 91 } | |
| 92 | |
| 93 // Get the hypotheses. | |
| 94 const base::Value* hypotheses_value = NULL; | |
| 95 if (!response_object->Get(kHypothesesString, &hypotheses_value)) { | |
| 96 DVLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; | |
| 97 return false; | |
| 98 } | |
| 99 | |
| 100 DCHECK(hypotheses_value); | |
| 101 if (!hypotheses_value->IsType(base::Value::TYPE_LIST)) { | |
| 102 DVLOG(1) << "ParseServerResponse: Unexpected hypotheses type " | |
| 103 << hypotheses_value->GetType(); | |
| 104 return false; | |
| 105 } | |
| 106 | |
| 107 const base::ListValue* hypotheses_list = | |
| 108 static_cast<const base::ListValue*>(hypotheses_value); | |
| 109 | |
| 110 // For now we support only single shot recognition, so we are giving only a | |
| 111 // final result, consisting of one fragment (with one or more hypotheses). | |
| 112 size_t index = 0; | |
| 113 for (; index < hypotheses_list->GetSize(); ++index) { | |
| 114 const base::Value* hypothesis = NULL; | |
| 115 if (!hypotheses_list->Get(index, &hypothesis)) { | |
| 116 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; | |
| 117 break; | |
| 118 } | |
| 119 DCHECK(hypothesis); | |
| 120 if (!hypothesis->IsType(base::Value::TYPE_DICTIONARY)) { | |
| 121 LOG(WARNING) << "ParseServerResponse: Unexpected value type " | |
| 122 << hypothesis->GetType(); | |
| 123 break; | |
| 124 } | |
| 125 | |
| 126 const base::DictionaryValue* hypothesis_value = | |
| 127 static_cast<const base::DictionaryValue*>(hypothesis); | |
| 128 base::string16 utterance; | |
| 129 | |
| 130 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { | |
| 131 LOG(WARNING) << "ParseServerResponse: Missing utterance value."; | |
| 132 break; | |
| 133 } | |
| 134 | |
| 135 // It is not an error if the 'confidence' field is missing. | |
| 136 double confidence = 0.0; | |
| 137 hypothesis_value->GetDouble(kConfidenceString, &confidence); | |
| 138 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance, | |
| 139 confidence)); | |
| 140 } | |
| 141 | |
| 142 if (index < hypotheses_list->GetSize()) { | |
| 143 result->hypotheses.clear(); | |
| 144 return false; | |
| 145 } | |
| 146 return true; | |
| 147 } | |
| 148 | |
| 149 } // namespace | |
| 150 | |
// Duration of each audio packet callers should deliver; also used to size the
// trailing silence packet in AudioChunksEnded().
const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
// URLFetcher id assigned to requests, overridable so tests can identify them.
int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
| 153 | |
// |context| provides the URLRequestContext used for the recognition request
// and, when language is unset, the accept-language list (see
// StartRecognition()); it is only dereferenced when non-null.
GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context) {
}
| 158 | |
| 159 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {} | |
| 160 | |
// Stores the recognition configuration; takes effect on the next
// StartRecognition() call.
void GoogleOneShotRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}
| 165 | |
| 166 void GoogleOneShotRemoteEngine::StartRecognition() { | |
| 167 DCHECK(delegate()); | |
| 168 DCHECK(!url_fetcher_.get()); | |
| 169 std::string lang_param = config_.language; | |
| 170 | |
| 171 if (lang_param.empty() && url_context_.get()) { | |
| 172 // If no language is provided then we use the first from the accepted | |
| 173 // language list. If this list is empty then it defaults to "en-US". | |
| 174 // Example of the contents of this list: "es,en-GB;q=0.8", "" | |
| 175 net::URLRequestContext* request_context = | |
| 176 url_context_->GetURLRequestContext(); | |
| 177 DCHECK(request_context); | |
| 178 // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with | |
| 179 // a reference to the HttpUserAgentSettings rather than accessing the | |
| 180 // accept language through the URLRequestContext. | |
| 181 if (request_context->http_user_agent_settings()) { | |
| 182 std::string accepted_language_list = | |
| 183 request_context->http_user_agent_settings()->GetAcceptLanguage(); | |
| 184 size_t separator = accepted_language_list.find_first_of(",;"); | |
| 185 lang_param = accepted_language_list.substr(0, separator); | |
| 186 } | |
| 187 } | |
| 188 | |
| 189 if (lang_param.empty()) | |
| 190 lang_param = "en-US"; | |
| 191 | |
| 192 std::vector<std::string> parts; | |
| 193 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); | |
| 194 | |
| 195 if (!config_.grammars.empty()) { | |
| 196 DCHECK_EQ(config_.grammars.size(), 1U); | |
| 197 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url, | |
| 198 true)); | |
| 199 } | |
| 200 | |
| 201 if (!config_.hardware_info.empty()) | |
| 202 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info, | |
| 203 true)); | |
| 204 parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses)); | |
| 205 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0"); | |
| 206 | |
| 207 std::string api_key = google_apis::GetAPIKey(); | |
| 208 parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true)); | |
| 209 | |
| 210 GURL url(std::string(kDefaultSpeechRecognitionUrl) + | |
| 211 base::JoinString(parts, "&")); | |
| 212 | |
| 213 encoder_.reset(new AudioEncoder(config_.audio_sample_rate, | |
| 214 config_.audio_num_bits_per_sample)); | |
| 215 DCHECK(encoder_.get()); | |
| 216 url_fetcher_ = net::URLFetcher::Create(url_fetcher_id_for_tests, url, | |
| 217 net::URLFetcher::POST, this); | |
| 218 url_fetcher_->SetChunkedUpload(encoder_->GetMimeType()); | |
| 219 url_fetcher_->SetRequestContext(url_context_.get()); | |
| 220 url_fetcher_->SetReferrer(config_.origin_url); | |
| 221 | |
| 222 // The speech recognition API does not require user identification as part | |
| 223 // of requests, so we don't send cookies or auth data for these requests to | |
| 224 // prevent any accidental connection between users who are logged into the | |
| 225 // domain for other services (e.g. bookmark sync) with the speech requests. | |
| 226 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | | |
| 227 net::LOAD_DO_NOT_SEND_COOKIES | | |
| 228 net::LOAD_DO_NOT_SEND_AUTH_DATA); | |
| 229 url_fetcher_->Start(); | |
| 230 } | |
| 231 | |
// Aborts any in-flight request; destroying the URLFetcher cancels the
// transfer, so no completion callback will fire afterwards.
void GoogleOneShotRemoteEngine::EndRecognition() {
  url_fetcher_.reset();
}
| 235 | |
| 236 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) { | |
| 237 DCHECK(url_fetcher_.get()); | |
| 238 DCHECK(encoder_.get()); | |
| 239 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); | |
| 240 encoder_->Encode(data); | |
| 241 scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); | |
| 242 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false); | |
| 243 } | |
| 244 | |
| 245 void GoogleOneShotRemoteEngine::AudioChunksEnded() { | |
| 246 DCHECK(url_fetcher_.get()); | |
| 247 DCHECK(encoder_.get()); | |
| 248 | |
| 249 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet | |
| 250 // of silence in case encoder had no data already. | |
| 251 size_t sample_count = | |
| 252 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000; | |
| 253 scoped_refptr<AudioChunk> dummy_chunk(new AudioChunk( | |
| 254 sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8)); | |
| 255 encoder_->Encode(*dummy_chunk.get()); | |
| 256 encoder_->Flush(); | |
| 257 scoped_refptr<AudioChunk> encoded_dummy_data( | |
| 258 encoder_->GetEncodedDataAndClear()); | |
| 259 DCHECK(!encoded_dummy_data->IsEmpty()); | |
| 260 encoder_.reset(); | |
| 261 | |
| 262 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true); | |
| 263 } | |
| 264 | |
| 265 void GoogleOneShotRemoteEngine::OnURLFetchComplete( | |
| 266 const net::URLFetcher* source) { | |
| 267 DCHECK_EQ(url_fetcher_.get(), source); | |
| 268 SpeechRecognitionResults results; | |
| 269 results.push_back(SpeechRecognitionResult()); | |
| 270 SpeechRecognitionResult& result = results.back(); | |
| 271 SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK); | |
| 272 std::string data; | |
| 273 | |
| 274 // The default error code in case of parse errors is NETWORK_FAILURE, however | |
| 275 // ParseServerResponse can change the error to a more appropriate one. | |
| 276 bool error_occurred = (!source->GetStatus().is_success() || | |
| 277 source->GetResponseCode() != 200 || | |
| 278 !source->GetResponseAsString(&data) || | |
| 279 !ParseServerResponse(data, &result, &error)); | |
| 280 url_fetcher_.reset(); | |
| 281 if (error_occurred) { | |
| 282 DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code; | |
| 283 delegate()->OnSpeechRecognitionEngineError(error); | |
| 284 } else { | |
| 285 DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result."; | |
| 286 delegate()->OnSpeechRecognitionEngineResults(results); | |
| 287 } | |
| 288 } | |
| 289 | |
| 290 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const { | |
| 291 return url_fetcher_ != NULL; | |
| 292 } | |
| 293 | |
// Callers should feed TakeAudioChunk() with packets of this duration.
int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}
| 297 | |
| 298 } // namespace content | |
| OLD | NEW |