| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/speech_recognition_request.h" | 5 #include "content/browser/speech/google_one_shot_remote_engine.h" |
| 6 | 6 |
| 7 #include <vector> | 7 #include <vector> |
| 8 | 8 |
| 9 #include "base/json/json_reader.h" | 9 #include "base/json/json_reader.h" |
| 10 #include "base/memory/scoped_ptr.h" |
| 10 #include "base/string_number_conversions.h" | 11 #include "base/string_number_conversions.h" |
| 11 #include "base/string_util.h" | 12 #include "base/string_util.h" |
| 12 #include "base/values.h" | 13 #include "base/values.h" |
| 13 #include "content/browser/speech/audio_buffer.h" | 14 #include "content/browser/speech/audio_buffer.h" |
| 14 #include "content/common/net/url_fetcher_impl.h" | 15 #include "content/common/net/url_fetcher_impl.h" |
| 16 #include "content/public/common/speech_recognition_error.h" |
| 15 #include "content/public/common/speech_recognition_result.h" | 17 #include "content/public/common/speech_recognition_result.h" |
| 16 #include "net/base/escape.h" | 18 #include "net/base/escape.h" |
| 17 #include "net/base/load_flags.h" | 19 #include "net/base/load_flags.h" |
| 18 #include "net/url_request/url_request_context.h" | 20 #include "net/url_request/url_request_context.h" |
| 19 #include "net/url_request/url_request_context_getter.h" | 21 #include "net/url_request/url_request_context_getter.h" |
| 20 #include "net/url_request/url_request_status.h" | 22 #include "net/url_request/url_request_status.h" |
| 21 | 23 |
| 24 using content::SpeechRecognitionError; |
| 25 using content::SpeechRecognitionHypothesis; |
| 26 using content::SpeechRecognitionResult; |
| 27 |
| 22 namespace { | 28 namespace { |
| 23 | 29 |
| 24 const char* const kDefaultSpeechRecognitionUrl = | 30 const char* const kDefaultSpeechRecognitionUrl = |
| 25 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&"; | 31 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&"; |
| 26 const char* const kStatusString = "status"; | 32 const char* const kStatusString = "status"; |
| 27 const char* const kHypothesesString = "hypotheses"; | 33 const char* const kHypothesesString = "hypotheses"; |
| 28 const char* const kUtteranceString = "utterance"; | 34 const char* const kUtteranceString = "utterance"; |
| 29 const char* const kConfidenceString = "confidence"; | 35 const char* const kConfidenceString = "confidence"; |
| 30 | 36 const int kWebServiceStatusNoError = 0; |
| 37 const int kWebServiceStatusNoSpeech = 4; |
| 38 const int kWebServiceStatusNoMatch = 5; |
| 39 const int kDefaultConfigSampleRate = 8000; |
| 40 const int kDefaultConfigBitsPerSample = 16; |
| 41 const speech::AudioEncoder::Codec kDefaultAudioCodec = |
| 42 speech::AudioEncoder::CODEC_FLAC; |
| 31 // TODO(satish): Remove this hardcoded value once the page is allowed to | 43 // TODO(satish): Remove this hardcoded value once the page is allowed to |
| 32 // set this via an attribute. | 44 // set this via an attribute. |
| 33 const int kMaxResults = 6; | 45 const int kMaxResults = 6; |
| 34 | 46 |
| 35 bool ParseServerResponse(const std::string& response_body, | 47 bool ParseServerResponse(const std::string& response_body, |
| 36 content::SpeechRecognitionResult* result) { | 48 SpeechRecognitionResult* result, |
| 49 SpeechRecognitionError* error) { |
| 37 if (response_body.empty()) { | 50 if (response_body.empty()) { |
| 38 LOG(WARNING) << "ParseServerResponse: Response was empty."; | 51 LOG(WARNING) << "ParseServerResponse: Response was empty."; |
| 39 return false; | 52 return false; |
| 40 } | 53 } |
| 41 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; | 54 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; |
| 42 | 55 |
| 43 // Parse the response, ignoring comments. | 56 // Parse the response, ignoring comments. |
| 44 std::string error_msg; | 57 std::string error_msg; |
| 45 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( | 58 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( |
| 46 response_body, false, NULL, &error_msg)); | 59 response_body, false, NULL, &error_msg)); |
| (...skipping 13 matching lines...) Expand all Loading... |
| 60 // Get the status. | 73 // Get the status. |
| 61 int status; | 74 int status; |
| 62 if (!response_object->GetInteger(kStatusString, &status)) { | 75 if (!response_object->GetInteger(kStatusString, &status)) { |
| 63 VLOG(1) << "ParseServerResponse: " << kStatusString | 76 VLOG(1) << "ParseServerResponse: " << kStatusString |
| 64 << " is not a valid integer value."; | 77 << " is not a valid integer value."; |
| 65 return false; | 78 return false; |
| 66 } | 79 } |
| 67 | 80 |
| 68 // Process the status. | 81 // Process the status. |
| 69 switch (status) { | 82 switch (status) { |
| 70 case content::SPEECH_RECOGNITION_ERROR_NONE: | 83 case kWebServiceStatusNoError: |
| 71 case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH: | 84 break; |
| 72 case content::SPEECH_RECOGNITION_ERROR_NO_MATCH: | 85 case kWebServiceStatusNoSpeech: |
| 73 break; | 86 error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH; |
| 74 | 87 return false; |
| 75 default: | 88 case kWebServiceStatusNoMatch: |
| 76 // Other status codes should not be returned by the server. | 89 error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH; |
| 77 VLOG(1) << "ParseServerResponse: unexpected status code " << status; | 90 return false; |
| 78 return false; | 91 default: |
| 92 error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK; |
| 93 // Other status codes should not be returned by the server. |
| 94 VLOG(1) << "ParseServerResponse: unexpected status code " << status; |
| 95 return false; |
| 79 } | 96 } |
| 80 | 97 |
| 81 result->error = static_cast<content::SpeechRecognitionErrorCode>(status); | |
| 82 | |
| 83 // Get the hypotheses. | 98 // Get the hypotheses. |
| 84 Value* hypotheses_value = NULL; | 99 Value* hypotheses_value = NULL; |
| 85 if (!response_object->Get(kHypothesesString, &hypotheses_value)) { | 100 if (!response_object->Get(kHypothesesString, &hypotheses_value)) { |
| 86 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; | 101 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; |
| 87 return false; | 102 return false; |
| 88 } | 103 } |
| 89 | 104 |
| 90 DCHECK(hypotheses_value); | 105 DCHECK(hypotheses_value); |
| 91 if (!hypotheses_value->IsType(Value::TYPE_LIST)) { | 106 if (!hypotheses_value->IsType(Value::TYPE_LIST)) { |
| 92 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " | 107 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " |
| 93 << hypotheses_value->GetType(); | 108 << hypotheses_value->GetType(); |
| 94 return false; | 109 return false; |
| 95 } | 110 } |
| 96 | 111 |
| 97 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); | 112 const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); |
| 98 | 113 |
| 114 // For now we support only single shot recognition, so we are giving only a |
| 115 // final result, consisting of one fragment (with one or more hypotheses). |
| 99 size_t index = 0; | 116 size_t index = 0; |
| 100 for (; index < hypotheses_list->GetSize(); ++index) { | 117 for (; index < hypotheses_list->GetSize(); ++index) { |
| 101 Value* hypothesis = NULL; | 118 Value* hypothesis = NULL; |
| 102 if (!hypotheses_list->Get(index, &hypothesis)) { | 119 if (!hypotheses_list->Get(index, &hypothesis)) { |
| 103 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; | 120 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; |
| 104 break; | 121 break; |
| 105 } | 122 } |
| 106 DCHECK(hypothesis); | 123 DCHECK(hypothesis); |
| 107 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { | 124 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { |
| 108 LOG(WARNING) << "ParseServerResponse: Unexpected value type " | 125 LOG(WARNING) << "ParseServerResponse: Unexpected value type " |
| 109 << hypothesis->GetType(); | 126 << hypothesis->GetType(); |
| 110 break; | 127 break; |
| 111 } | 128 } |
| 112 | 129 |
| 113 const DictionaryValue* hypothesis_value = | 130 const DictionaryValue* hypothesis_value = |
| 114 static_cast<DictionaryValue*>(hypothesis); | 131 static_cast<DictionaryValue*>(hypothesis); |
| 115 string16 utterance; | 132 string16 utterance; |
| 133 |
| 116 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { | 134 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { |
| 117 LOG(WARNING) << "ParseServerResponse: Missing utterance value."; | 135 LOG(WARNING) << "ParseServerResponse: Missing utterance value."; |
| 118 break; | 136 break; |
| 119 } | 137 } |
| 120 | 138 |
| 121 // It is not an error if the 'confidence' field is missing. | 139 // It is not an error if the 'confidence' field is missing. |
| 122 double confidence = 0.0; | 140 double confidence = 0.0; |
| 123 hypothesis_value->GetDouble(kConfidenceString, &confidence); | 141 hypothesis_value->GetDouble(kConfidenceString, &confidence); |
| 124 | 142 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance, |
| 125 result->hypotheses.push_back(content::SpeechRecognitionHypothesis( | 143 confidence)); |
| 126 utterance, confidence)); | |
| 127 } | 144 } |
| 128 | 145 |
| 129 if (index < hypotheses_list->GetSize()) { | 146 if (index < hypotheses_list->GetSize()) { |
| 130 result->hypotheses.clear(); | 147 result->hypotheses.clear(); |
| 131 return false; | 148 return false; |
| 132 } | 149 } |
| 133 | |
| 134 return true; | 150 return true; |
| 135 } | 151 } |
| 136 | 152 |
| 137 } // namespace | 153 } // namespace |
| 138 | 154 |
| 139 namespace speech { | 155 namespace speech { |
| 140 | 156 |
| 141 int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0; | 157 const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100; |
| 158 int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0; |
| 142 | 159 |
| 143 SpeechRecognitionRequest::SpeechRecognitionRequest( | 160 GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig() |
| 144 net::URLRequestContextGetter* context, Delegate* delegate) | 161 : filter_profanities(false), |
| 145 : url_context_(context), | 162 audio_sample_rate(kDefaultConfigSampleRate), |
| 146 delegate_(delegate) { | 163 audio_num_bits_per_sample(kDefaultConfigBitsPerSample) { |
| 147 DCHECK(delegate); | |
| 148 } | 164 } |
| 149 | 165 |
| 150 SpeechRecognitionRequest::~SpeechRecognitionRequest() {} | 166 GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {} |
| 151 | 167 |
| 152 void SpeechRecognitionRequest::Start(const std::string& language, | 168 GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine( |
| 153 const std::string& grammar, | 169 net::URLRequestContextGetter* context) |
| 154 bool filter_profanities, | 170 : url_context_(context) { |
| 155 const std::string& hardware_info, | 171 } |
| 156 const std::string& origin_url, | 172 |
| 157 const std::string& content_type) { | 173 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {} |
| 174 |
| 175 void GoogleOneShotRemoteEngine::SetConfig( |
| 176 const GoogleOneShotRemoteEngineConfig& config) { |
| 177 config_ = config; |
| 178 } |
| 179 |
| 180 void GoogleOneShotRemoteEngine::StartRecognition() { |
| 181 DCHECK(delegate()); |
| 158 DCHECK(!url_fetcher_.get()); | 182 DCHECK(!url_fetcher_.get()); |
| 183 std::string lang_param = config_.language; |
| 159 | 184 |
| 160 std::vector<std::string> parts; | |
| 161 | |
| 162 std::string lang_param = language; | |
| 163 if (lang_param.empty() && url_context_) { | 185 if (lang_param.empty() && url_context_) { |
| 164 // If no language is provided then we use the first from the accepted | 186 // If no language is provided then we use the first from the accepted |
| 165 // language list. If this list is empty then it defaults to "en-US". | 187 // language list. If this list is empty then it defaults to "en-US". |
| 166 // Example of the contents of this list: "es,en-GB;q=0.8", "" | 188 // Example of the contents of this list: "es,en-GB;q=0.8", "" |
| 167 net::URLRequestContext* request_context = | 189 net::URLRequestContext* request_context = |
| 168 url_context_->GetURLRequestContext(); | 190 url_context_->GetURLRequestContext(); |
| 169 DCHECK(request_context); | 191 DCHECK(request_context); |
| 170 std::string accepted_language_list = request_context->accept_language(); | 192 std::string accepted_language_list = request_context->accept_language(); |
| 171 size_t separator = accepted_language_list.find_first_of(",;"); | 193 size_t separator = accepted_language_list.find_first_of(",;"); |
| 172 lang_param = accepted_language_list.substr(0, separator); | 194 lang_param = accepted_language_list.substr(0, separator); |
| 173 } | 195 } |
| 196 |
| 174 if (lang_param.empty()) | 197 if (lang_param.empty()) |
| 175 lang_param = "en-US"; | 198 lang_param = "en-US"; |
| 199 |
| 200 std::vector<std::string> parts; |
| 176 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); | 201 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); |
| 177 | 202 |
| 178 if (!grammar.empty()) | 203 if (!config_.grammar.empty()) |
| 179 parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true)); | 204 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true)); |
| 180 if (!hardware_info.empty()) | 205 |
| 181 parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true)); | 206 if (!config_.hardware_info.empty()) |
| 207 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info, |
| 208 true)); |
| 182 parts.push_back("maxresults=" + base::IntToString(kMaxResults)); | 209 parts.push_back("maxresults=" + base::IntToString(kMaxResults)); |
| 183 parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0"); | 210 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0"); |
| 184 | 211 |
| 185 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); | 212 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); |
| 186 | 213 |
| 214 encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec, |
| 215 config_.audio_sample_rate, |
| 216 config_.audio_num_bits_per_sample)); |
| 217 DCHECK(encoder_.get()); |
| 187 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests, | 218 url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests, |
| 188 url, | 219 url, |
| 189 URLFetcherImpl::POST, | 220 URLFetcherImpl::POST, |
| 190 this)); | 221 this)); |
| 191 url_fetcher_->SetChunkedUpload(content_type); | 222 url_fetcher_->SetChunkedUpload(encoder_->mime_type()); |
| 192 url_fetcher_->SetRequestContext(url_context_); | 223 url_fetcher_->SetRequestContext(url_context_); |
| 193 url_fetcher_->SetReferrer(origin_url); | 224 url_fetcher_->SetReferrer(config_.origin_url); |
| 194 | 225 |
| 195 // The speech recognition API does not require user identification as part | 226 // The speech recognition API does not require user identification as part |
| 196 // of requests, so we don't send cookies or auth data for these requests to | 227 // of requests, so we don't send cookies or auth data for these requests to |
| 197 // prevent any accidental connection between users who are logged into the | 228 // prevent any accidental connection between users who are logged into the |
| 198 // domain for other services (e.g. bookmark sync) with the speech requests. | 229 // domain for other services (e.g. bookmark sync) with the speech requests. |
| 199 url_fetcher_->SetLoadFlags( | 230 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
| 200 net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | | 231 net::LOAD_DO_NOT_SEND_COOKIES | |
| 201 net::LOAD_DO_NOT_SEND_AUTH_DATA); | 232 net::LOAD_DO_NOT_SEND_AUTH_DATA); |
| 202 url_fetcher_->Start(); | 233 url_fetcher_->Start(); |
| 203 } | 234 } |
| 204 | 235 |
| 205 void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk, | 236 void GoogleOneShotRemoteEngine::EndRecognition() { |
| 206 bool is_last_chunk) { | 237 url_fetcher_.reset(); |
| 207 DCHECK(url_fetcher_.get()); | |
| 208 url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk); | |
| 209 } | 238 } |
| 210 | 239 |
| 211 void SpeechRecognitionRequest::OnURLFetchComplete( | 240 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) { |
| 241 DCHECK(url_fetcher_.get()); |
| 242 DCHECK(encoder_.get()); |
| 243 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); |
| 244 encoder_->Encode(data); |
| 245 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); |
| 246 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false); |
| 247 } |
| 248 |
| 249 void GoogleOneShotRemoteEngine::AudioChunksEnded() { |
| 250 DCHECK(url_fetcher_.get()); |
| 251 DCHECK(encoder_.get()); |
| 252 |
| 253 // The final chunk passed to AppendChunkToUpload must be non-empty, so we |
| 254 // encode a packet of silence in case the encoder had no data already. |
| 255 std::vector<int16> samples( |
| 256 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000); |
| 257 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]), |
| 258 samples.size() * sizeof(int16), |
| 259 encoder_->bits_per_sample() / 8); |
| 260 encoder_->Encode(dummy_chunk); |
| 261 encoder_->Flush(); |
| 262 scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear()); |
| 263 DCHECK(!encoded_dummy_data->IsEmpty()); |
| 264 encoder_.reset(); |
| 265 |
| 266 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true); |
| 267 } |
| 268 |
| 269 void GoogleOneShotRemoteEngine::OnURLFetchComplete( |
| 212 const content::URLFetcher* source) { | 270 const content::URLFetcher* source) { |
| 213 DCHECK_EQ(url_fetcher_.get(), source); | 271 DCHECK_EQ(url_fetcher_.get(), source); |
| 272 SpeechRecognitionResult result; |
| 273 SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK); |
| 274 std::string data; |
| 214 | 275 |
| 215 content::SpeechRecognitionResult result; | 276 // Parse errors default to SPEECH_RECOGNITION_ERROR_NETWORK; however, |
| 216 std::string data; | 277 // ParseServerResponse can change the error to a more appropriate one. |
| 217 if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 || | 278 bool error_occurred = (!source->GetStatus().is_success() || |
| 218 !source->GetResponseAsString(&data) || | 279 source->GetResponseCode() != 200 || |
| 219 !ParseServerResponse(data, &result)) { | 280 !source->GetResponseAsString(&data) || |
| 220 result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK; | 281 !ParseServerResponse(data, &result, &error)); |
| 282 url_fetcher_.reset(); |
| 283 if (error_occurred) { |
| 284 DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code; |
| 285 delegate()->OnSpeechRecognitionEngineError(error); |
| 286 } else { |
| 287 DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result."; |
| 288 delegate()->OnSpeechRecognitionEngineResult(result); |
| 221 } | 289 } |
| 290 } |
| 222 | 291 |
| 223 DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result."; | 292 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const { |
| 224 url_fetcher_.reset(); | 293 return url_fetcher_ != NULL; |
| 225 delegate_->SetRecognitionResult(result); | 294 } |
| 295 |
| 296 int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const { |
| 297 return kAudioPacketIntervalMs; |
| 226 } | 298 } |
| 227 | 299 |
| 228 } // namespace speech | 300 } // namespace speech |
| OLD | NEW |