| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "content/browser/speech/google_streaming_remote_engine.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 #include <vector> | |
| 9 | |
| 10 #include "base/big_endian.h" | |
| 11 #include "base/bind.h" | |
| 12 #include "base/rand_util.h" | |
| 13 #include "base/strings/string_number_conversions.h" | |
| 14 #include "base/strings/string_util.h" | |
| 15 #include "base/strings/utf_string_conversions.h" | |
| 16 #include "base/time/time.h" | |
| 17 #include "content/browser/speech/audio_buffer.h" | |
| 18 #include "content/browser/speech/proto/google_streaming_api.pb.h" | |
| 19 #include "content/public/common/speech_recognition_error.h" | |
| 20 #include "content/public/common/speech_recognition_result.h" | |
| 21 #include "google_apis/google_api_keys.h" | |
| 22 #include "net/base/escape.h" | |
| 23 #include "net/base/load_flags.h" | |
| 24 #include "net/url_request/http_user_agent_settings.h" | |
| 25 #include "net/url_request/url_fetcher.h" | |
| 26 #include "net/url_request/url_request_context.h" | |
| 27 #include "net/url_request/url_request_context_getter.h" | |
| 28 #include "net/url_request/url_request_status.h" | |
| 29 | |
| 30 using net::URLFetcher; | |
| 31 | |
| 32 namespace content { | |
| 33 namespace { | |
| 34 | |
// Base URL of the Google full-duplex speech recognition webservice; the
// downstream (results) and upstream (audio) endpoints below are appended to
// it, together with the query-string arguments built in ConnectBothStreams().
const char kWebServiceBaseUrl[] =
    "https://www.google.com/speech-api/full-duplex/v1";
const char kDownstreamUrl[] = "/down?";
const char kUpstreamUrl[] = "/up?";

// This matches the maximum maxAlternatives value supported by the server.
const uint32_t kMaxMaxAlternatives = 30;
| 42 | |
// TODO(hans): Remove this and other logging when we don't need it anymore.
// Debug-only helper: parses |response| as a proto::SpeechRecognitionEvent and
// logs its status, endpoint and per-result fields (finality, stability, and
// each alternative's confidence/transcript) at verbose level 1. Has no effect
// on the recognition flow; a parse failure is only logged.
void DumpResponse(const std::string& response) {
  DVLOG(1) << "------------";
  proto::SpeechRecognitionEvent event;
  if (!event.ParseFromString(response)) {
    DVLOG(1) << "Parse failed!";
    return;
  }
  if (event.has_status())
    DVLOG(1) << "STATUS\t" << event.status();
  if (event.has_endpoint())
    DVLOG(1) << "ENDPOINT\t" << event.endpoint();
  for (int i = 0; i < event.result_size(); ++i) {
    DVLOG(1) << "RESULT #" << i << ":";
    const proto::SpeechRecognitionResult& res = event.result(i);
    if (res.has_final())
      DVLOG(1) << "  final:\t" << res.final();
    if (res.has_stability())
      DVLOG(1) << "  STABILITY:\t" << res.stability();
    for (int j = 0; j < res.alternative_size(); ++j) {
      const proto::SpeechRecognitionAlternative& alt =
          res.alternative(j);
      if (alt.has_confidence())
        DVLOG(1) << "    CONFIDENCE:\t" << alt.confidence();
      if (alt.has_transcript())
        DVLOG(1) << "    TRANSCRIPT:\t" << alt.transcript();
    }
  }
}
| 72 | |
| 73 } // namespace | |
| 74 | |
// Duration of each audio packet fed to the engine; also used to size the
// trailing silence packet in CloseUpstreamAndWaitForResults().
const int GoogleStreamingRemoteEngine::kAudioPacketIntervalMs = 100;
// URLFetcher ids, letting tests tell the two connections apart.
const int GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting = 0;
const int GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting = 1;
// Status values of the webservice protocol — presumably mirroring the proto
// SpeechRecognitionEvent status codes; referenced from tests (TODO confirm).
const int GoogleStreamingRemoteEngine::kWebserviceStatusNoError = 0;
const int GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch = 5;
| 80 | |
// |context| is retained and later used both to create the two URL fetchers
// and to read the accept-language list in GetAcceptedLanguages(). The FSM
// starts idle; all other members are reset when a recognition starts.
GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context),
      previous_response_length_(0),
      got_last_definitive_result_(false),
      is_dispatching_event_(false),
      use_framed_post_data_(false),
      state_(STATE_IDLE) {}
| 89 | |
| 90 GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {} | |
| 91 | |
// Stores the configuration that the next StartRecognition() call will use
// (language, sample rate, grammars, auth data, preamble, ...).
void GoogleStreamingRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}
| 96 | |
| 97 void GoogleStreamingRemoteEngine::StartRecognition() { | |
| 98 FSMEventArgs event_args(EVENT_START_RECOGNITION); | |
| 99 DispatchEvent(event_args); | |
| 100 } | |
| 101 | |
| 102 void GoogleStreamingRemoteEngine::EndRecognition() { | |
| 103 FSMEventArgs event_args(EVENT_END_RECOGNITION); | |
| 104 DispatchEvent(event_args); | |
| 105 } | |
| 106 | |
| 107 void GoogleStreamingRemoteEngine::TakeAudioChunk(const AudioChunk& data) { | |
| 108 FSMEventArgs event_args(EVENT_AUDIO_CHUNK); | |
| 109 event_args.audio_data = &data; | |
| 110 DispatchEvent(event_args); | |
| 111 } | |
| 112 | |
| 113 void GoogleStreamingRemoteEngine::AudioChunksEnded() { | |
| 114 FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED); | |
| 115 DispatchEvent(event_args); | |
| 116 } | |
| 117 | |
| 118 void GoogleStreamingRemoteEngine::OnURLFetchComplete(const URLFetcher* source) { | |
| 119 const bool kResponseComplete = true; | |
| 120 DispatchHTTPResponse(source, kResponseComplete); | |
| 121 } | |
| 122 | |
| 123 void GoogleStreamingRemoteEngine::OnURLFetchDownloadProgress( | |
| 124 const URLFetcher* source, | |
| 125 int64_t current, | |
| 126 int64_t total) { | |
| 127 const bool kPartialResponse = false; | |
| 128 DispatchHTTPResponse(source, kPartialResponse); | |
| 129 } | |
| 130 | |
| 131 void GoogleStreamingRemoteEngine::DispatchHTTPResponse(const URLFetcher* source, | |
| 132 bool end_of_response) { | |
| 133 DCHECK(CalledOnValidThread()); | |
| 134 DCHECK(source); | |
| 135 const bool response_is_good = source->GetStatus().is_success() && | |
| 136 source->GetResponseCode() == 200; | |
| 137 std::string response; | |
| 138 if (response_is_good) | |
| 139 source->GetResponseAsString(&response); | |
| 140 const size_t current_response_length = response.size(); | |
| 141 | |
| 142 DVLOG(1) << (source == downstream_fetcher_.get() ? "Downstream" : "Upstream") | |
| 143 << "HTTP, code: " << source->GetResponseCode() | |
| 144 << " length: " << current_response_length | |
| 145 << " eor: " << end_of_response; | |
| 146 | |
| 147 // URLFetcher provides always the entire response buffer, but we are only | |
| 148 // interested in the fresh data introduced by the last chunk. Therefore, we | |
| 149 // drop the previous content we have already processed. | |
| 150 if (current_response_length != 0) { | |
| 151 DCHECK_GE(current_response_length, previous_response_length_); | |
| 152 response.erase(0, previous_response_length_); | |
| 153 previous_response_length_ = current_response_length; | |
| 154 } | |
| 155 | |
| 156 if (!response_is_good && source == downstream_fetcher_.get()) { | |
| 157 DVLOG(1) << "Downstream error " << source->GetResponseCode(); | |
| 158 FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR); | |
| 159 DispatchEvent(event_args); | |
| 160 return; | |
| 161 } | |
| 162 if (!response_is_good && source == upstream_fetcher_.get()) { | |
| 163 DVLOG(1) << "Upstream error " << source->GetResponseCode() | |
| 164 << " EOR " << end_of_response; | |
| 165 FSMEventArgs event_args(EVENT_UPSTREAM_ERROR); | |
| 166 DispatchEvent(event_args); | |
| 167 return; | |
| 168 } | |
| 169 | |
| 170 // Ignore incoming data on the upstream connection. | |
| 171 if (source == upstream_fetcher_.get()) | |
| 172 return; | |
| 173 | |
| 174 DCHECK(response_is_good && source == downstream_fetcher_.get()); | |
| 175 | |
| 176 // The downstream response is organized in chunks, whose size is determined | |
| 177 // by a 4 bytes prefix, transparently handled by the ChunkedByteBuffer class. | |
| 178 // Such chunks are sent by the speech recognition webservice over the HTTP | |
| 179 // downstream channel using HTTP chunked transfer (unrelated to our chunks). | |
| 180 // This function is called every time an HTTP chunk is received by the | |
| 181 // url fetcher. However there isn't any particular matching beween our | |
| 182 // protocol chunks and HTTP chunks, in the sense that a single HTTP chunk can | |
| 183 // contain a portion of one chunk or even more chunks together. | |
| 184 chunked_byte_buffer_.Append(response); | |
| 185 | |
| 186 // A single HTTP chunk can contain more than one data chunk, thus the while. | |
| 187 while (chunked_byte_buffer_.HasChunks()) { | |
| 188 FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE); | |
| 189 event_args.response = chunked_byte_buffer_.PopChunk(); | |
| 190 DCHECK(event_args.response.get()); | |
| 191 DumpResponse(std::string(event_args.response->begin(), | |
| 192 event_args.response->end())); | |
| 193 DispatchEvent(event_args); | |
| 194 } | |
| 195 if (end_of_response) { | |
| 196 FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED); | |
| 197 DispatchEvent(event_args); | |
| 198 } | |
| 199 } | |
| 200 | |
// A recognition is considered pending whenever the FSM has left STATE_IDLE,
// i.e. between ConnectBothStreams() and the final Abort()/CloseDownstream().
bool GoogleStreamingRemoteEngine::IsRecognitionPending() const {
  DCHECK(CalledOnValidThread());
  return state_ != STATE_IDLE;
}
| 205 | |
// Callers should deliver audio via TakeAudioChunk() in packets of this
// duration (100 ms, see kAudioPacketIntervalMs).
int GoogleStreamingRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}
| 209 | |
| 210 // ----------------------- Core FSM implementation --------------------------- | |
| 211 | |
// Single entry point of the FSM: executes the transition for |event_args| and
// stores the resulting state. The |is_dispatching_event_| flag guards against
// re-entrant dispatch, which would violate the FSM model.
void GoogleStreamingRemoteEngine::DispatchEvent(
    const FSMEventArgs& event_args) {
  DCHECK(CalledOnValidThread());
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential, otherwise it will break all the rules
  // and the assumptions of the finite state automata model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  state_ = ExecuteTransitionAndGetNextState(event_args);

  is_dispatching_event_ = false;
}
| 227 | |
// The FSM transition table: maps (current state, incoming event) onto the
// action to execute; each action returns the next state. Combinations that
// should never occur land in NotFeasible(), which NOTREACHED()s.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        case EVENT_START_RECOGNITION:
          return ConnectBothStreams(event_args);
        case EVENT_END_RECOGNITION:
        // Note AUDIO_CHUNK and AUDIO_END events can remain enqueued in case of
        // abort, so we just silently drop them here.
        case EVENT_AUDIO_CHUNK:
        case EVENT_AUDIO_CHUNKS_ENDED:
        // DOWNSTREAM_CLOSED can be received if we end up here due to an error.
        case EVENT_DOWNSTREAM_CLOSED:
          return DoNothing(event_args);
        case EVENT_UPSTREAM_ERROR:
        case EVENT_DOWNSTREAM_ERROR:
        case EVENT_DOWNSTREAM_RESPONSE:
          return NotFeasible(event_args);
      }
      break;
    case STATE_BOTH_STREAMS_CONNECTED:
      switch (event) {
        case EVENT_AUDIO_CHUNK:
          return TransmitAudioUpstream(event_args);
        case EVENT_DOWNSTREAM_RESPONSE:
          return ProcessDownstreamResponse(event_args);
        case EVENT_AUDIO_CHUNKS_ENDED:
          return CloseUpstreamAndWaitForResults(event_args);
        case EVENT_END_RECOGNITION:
          return AbortSilently(event_args);
        case EVENT_UPSTREAM_ERROR:
        case EVENT_DOWNSTREAM_ERROR:
        case EVENT_DOWNSTREAM_CLOSED:
          return AbortWithError(event_args);
        case EVENT_START_RECOGNITION:
          return NotFeasible(event_args);
      }
      break;
    case STATE_WAITING_DOWNSTREAM_RESULTS:
      switch (event) {
        case EVENT_DOWNSTREAM_RESPONSE:
          return ProcessDownstreamResponse(event_args);
        case EVENT_DOWNSTREAM_CLOSED:
          return RaiseNoMatchErrorIfGotNoResults(event_args);
        case EVENT_END_RECOGNITION:
          return AbortSilently(event_args);
        case EVENT_UPSTREAM_ERROR:
        case EVENT_DOWNSTREAM_ERROR:
          return AbortWithError(event_args);
        case EVENT_START_RECOGNITION:
        case EVENT_AUDIO_CHUNK:
        case EVENT_AUDIO_CHUNKS_ENDED:
          return NotFeasible(event_args);
      }
      break;
  }
  // Fallback for states/events not covered above (should not happen).
  return NotFeasible(event_args);
}
| 289 | |
| 290 // ----------- Contract for all the FSM evolution functions below ------------- | |
| 291 // - Are guaranteed to be executed in the same thread (IO, except for tests); | |
| 292 // - Are guaranteed to be not reentrant (themselves and each other); | |
| 293 // - event_args members are guaranteed to be stable during the call; | |
| 294 | |
// Opens the full-duplex session: starts the downstream (GET, receives results)
// and upstream (POST, chunked audio upload) fetchers, pairing them through a
// randomly generated request key. If a preamble is configured it is encoded
// and uploaded immediately using the framed post-data format.
// Returns STATE_BOTH_STREAMS_CONNECTED.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
  DCHECK(!upstream_fetcher_.get());
  DCHECK(!downstream_fetcher_.get());

  encoder_.reset(new AudioEncoder(config_.audio_sample_rate,
                                  config_.audio_num_bits_per_sample));
  DCHECK(encoder_.get());
  const std::string request_key = GenerateRequestKey();

  // Only use the framed post data format when a preamble needs to be logged.
  use_framed_post_data_ = (config_.preamble &&
                           !config_.preamble->sample_data.empty() &&
                           !config_.auth_token.empty() &&
                           !config_.auth_scope.empty());
  if (use_framed_post_data_) {
    // sample_depth is in bytes; the encoder expects bits per sample.
    preamble_encoder_.reset(new AudioEncoder(
        config_.preamble->sample_rate,
        config_.preamble->sample_depth * 8));
  }

  // Setup downstream fetcher.
  std::vector<std::string> downstream_args;
  downstream_args.push_back(
      "key=" + net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
  downstream_args.push_back("pair=" + request_key);
  downstream_args.push_back("output=pb");
  GURL downstream_url(std::string(kWebServiceBaseUrl) +
                      std::string(kDownstreamUrl) +
                      base::JoinString(downstream_args, "&"));

  downstream_fetcher_ = URLFetcher::Create(
      kDownstreamUrlFetcherIdForTesting, downstream_url, URLFetcher::GET, this);
  downstream_fetcher_->SetRequestContext(url_context_.get());
  // Speech requests must never carry cookies or auth data.
  downstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                                    net::LOAD_DO_NOT_SEND_COOKIES |
                                    net::LOAD_DO_NOT_SEND_AUTH_DATA);
  downstream_fetcher_->Start();

  // Setup upstream fetcher.
  // TODO(hans): Support for user-selected grammars.
  std::vector<std::string> upstream_args;
  upstream_args.push_back("key=" +
      net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
  upstream_args.push_back("pair=" + request_key);
  upstream_args.push_back("output=pb");
  upstream_args.push_back(
      "lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true));
  upstream_args.push_back(
      config_.filter_profanities ? "pFilter=2" : "pFilter=0");
  if (config_.max_hypotheses > 0U) {
    // Clamp to the server-side maximum (kMaxMaxAlternatives).
    uint32_t max_alternatives =
        std::min(kMaxMaxAlternatives, config_.max_hypotheses);
    upstream_args.push_back("maxAlternatives=" +
                            base::UintToString(max_alternatives));
  }
  upstream_args.push_back("app=chromium");
  if (!config_.hardware_info.empty()) {
    upstream_args.push_back(
        "xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true));
  }
  for (const SpeechRecognitionGrammar& grammar : config_.grammars) {
    std::string grammar_value(
        base::DoubleToString(grammar.weight) + ":" + grammar.url);
    upstream_args.push_back(
        "grammar=" + net::EscapeQueryParamValue(grammar_value, true));
  }
  if (config_.continuous)
    upstream_args.push_back("continuous");
  else
    upstream_args.push_back("endpoint=1");
  if (config_.interim_results)
    upstream_args.push_back("interim");
  if (!config_.auth_token.empty() && !config_.auth_scope.empty()) {
    upstream_args.push_back(
        "authScope=" + net::EscapeQueryParamValue(config_.auth_scope, true));
    upstream_args.push_back(
        "authToken=" + net::EscapeQueryParamValue(config_.auth_token, true));
  }
  if (use_framed_post_data_) {
    // Advertise the mime types of both payload streams (preamble + audio).
    std::string audio_format;
    if (preamble_encoder_)
      audio_format = preamble_encoder_->GetMimeType() + ",";
    audio_format += encoder_->GetMimeType();
    upstream_args.push_back(
        "audioFormat=" + net::EscapeQueryParamValue(audio_format, true));
  }
  GURL upstream_url(std::string(kWebServiceBaseUrl) +
                    std::string(kUpstreamUrl) +
                    base::JoinString(upstream_args, "&"));

  upstream_fetcher_ = URLFetcher::Create(kUpstreamUrlFetcherIdForTesting,
                                         upstream_url, URLFetcher::POST, this);
  if (use_framed_post_data_)
    upstream_fetcher_->SetChunkedUpload("application/octet-stream");
  else
    upstream_fetcher_->SetChunkedUpload(encoder_->GetMimeType());
  upstream_fetcher_->SetRequestContext(url_context_.get());
  upstream_fetcher_->SetReferrer(config_.origin_url);
  upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                                  net::LOAD_DO_NOT_SEND_COOKIES |
                                  net::LOAD_DO_NOT_SEND_AUTH_DATA);
  upstream_fetcher_->Start();
  previous_response_length_ = 0;

  if (preamble_encoder_) {
    // Encode and send preamble right away.
    scoped_refptr<AudioChunk> chunk = new AudioChunk(
        reinterpret_cast<const uint8_t*>(config_.preamble->sample_data.data()),
        config_.preamble->sample_data.size(), config_.preamble->sample_depth);
    preamble_encoder_->Encode(*chunk);
    preamble_encoder_->Flush();
    scoped_refptr<AudioChunk> encoded_data(
        preamble_encoder_->GetEncodedDataAndClear());
    UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false);
  }
  return STATE_BOTH_STREAMS_CONNECTED;
}
| 413 | |
| 414 GoogleStreamingRemoteEngine::FSMState | |
| 415 GoogleStreamingRemoteEngine::TransmitAudioUpstream( | |
| 416 const FSMEventArgs& event_args) { | |
| 417 DCHECK(upstream_fetcher_.get()); | |
| 418 DCHECK(event_args.audio_data.get()); | |
| 419 const AudioChunk& audio = *(event_args.audio_data.get()); | |
| 420 | |
| 421 DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); | |
| 422 encoder_->Encode(audio); | |
| 423 scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); | |
| 424 UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false); | |
| 425 return state_; | |
| 426 } | |
| 427 | |
| 428 GoogleStreamingRemoteEngine::FSMState | |
| 429 GoogleStreamingRemoteEngine::ProcessDownstreamResponse( | |
| 430 const FSMEventArgs& event_args) { | |
| 431 DCHECK(event_args.response.get()); | |
| 432 | |
| 433 proto::SpeechRecognitionEvent ws_event; | |
| 434 if (!ws_event.ParseFromString(std::string(event_args.response->begin(), | |
| 435 event_args.response->end()))) | |
| 436 return AbortWithError(event_args); | |
| 437 | |
| 438 if (ws_event.has_status()) { | |
| 439 switch (ws_event.status()) { | |
| 440 case proto::SpeechRecognitionEvent::STATUS_SUCCESS: | |
| 441 break; | |
| 442 case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH: | |
| 443 return Abort(SPEECH_RECOGNITION_ERROR_NO_SPEECH); | |
| 444 case proto::SpeechRecognitionEvent::STATUS_ABORTED: | |
| 445 return Abort(SPEECH_RECOGNITION_ERROR_ABORTED); | |
| 446 case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE: | |
| 447 return Abort(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE); | |
| 448 case proto::SpeechRecognitionEvent::STATUS_NETWORK: | |
| 449 return Abort(SPEECH_RECOGNITION_ERROR_NETWORK); | |
| 450 case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED: | |
| 451 return Abort(SPEECH_RECOGNITION_ERROR_NOT_ALLOWED); | |
| 452 case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED: | |
| 453 return Abort(SPEECH_RECOGNITION_ERROR_SERVICE_NOT_ALLOWED); | |
| 454 case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR: | |
| 455 return Abort(SPEECH_RECOGNITION_ERROR_BAD_GRAMMAR); | |
| 456 case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED: | |
| 457 return Abort(SPEECH_RECOGNITION_ERROR_LANGUAGE_NOT_SUPPORTED); | |
| 458 } | |
| 459 } | |
| 460 | |
| 461 if (!config_.continuous && ws_event.has_endpoint() && | |
| 462 ws_event.endpoint() == proto::SpeechRecognitionEvent::END_OF_UTTERANCE) { | |
| 463 delegate()->OnSpeechRecognitionEngineEndOfUtterance(); | |
| 464 } | |
| 465 | |
| 466 SpeechRecognitionResults results; | |
| 467 for (int i = 0; i < ws_event.result_size(); ++i) { | |
| 468 const proto::SpeechRecognitionResult& ws_result = ws_event.result(i); | |
| 469 results.push_back(SpeechRecognitionResult()); | |
| 470 SpeechRecognitionResult& result = results.back(); | |
| 471 result.is_provisional = !(ws_result.has_final() && ws_result.final()); | |
| 472 | |
| 473 if (!result.is_provisional) | |
| 474 got_last_definitive_result_ = true; | |
| 475 | |
| 476 for (int j = 0; j < ws_result.alternative_size(); ++j) { | |
| 477 const proto::SpeechRecognitionAlternative& ws_alternative = | |
| 478 ws_result.alternative(j); | |
| 479 SpeechRecognitionHypothesis hypothesis; | |
| 480 if (ws_alternative.has_confidence()) | |
| 481 hypothesis.confidence = ws_alternative.confidence(); | |
| 482 else if (ws_result.has_stability()) | |
| 483 hypothesis.confidence = ws_result.stability(); | |
| 484 DCHECK(ws_alternative.has_transcript()); | |
| 485 // TODO(hans): Perhaps the transcript should be required in the proto? | |
| 486 if (ws_alternative.has_transcript()) | |
| 487 hypothesis.utterance = base::UTF8ToUTF16(ws_alternative.transcript()); | |
| 488 | |
| 489 result.hypotheses.push_back(hypothesis); | |
| 490 } | |
| 491 } | |
| 492 if (results.size()) { | |
| 493 delegate()->OnSpeechRecognitionEngineResults(results); | |
| 494 } | |
| 495 | |
| 496 return state_; | |
| 497 } | |
| 498 | |
// Invoked when the downstream closes cleanly while waiting for results: if no
// definitive result was ever delivered, an empty result set tells the client
// the recognition ended without matches (as opposed to ending with an error).
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::RaiseNoMatchErrorIfGotNoResults(
    const FSMEventArgs& event_args) {
  if (!got_last_definitive_result_) {
    // Provide an empty result to notify that recognition is ended with no
    // errors, yet neither any further results.
    delegate()->OnSpeechRecognitionEngineResults(SpeechRecognitionResults());
  }
  return AbortSilently(event_args);
}
| 509 | |
// Finalizes the audio upload: feeds one packet of silence (the encoder needs
// a non-empty final buffer), flushes the encoder, uploads the resulting bytes
// with is_final=true, and moves the FSM to wait for the remaining downstream
// results.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults(
    const FSMEventArgs&) {
  DCHECK(upstream_fetcher_.get());
  DCHECK(encoder_.get());

  DVLOG(1) << "Closing upstream.";

  // The encoder requires a non-empty final buffer. So we encode a packet
  // of silence in case encoder had no data already.
  size_t sample_count =
      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000;
  scoped_refptr<AudioChunk> dummy_chunk = new AudioChunk(
      sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8);
  encoder_->Encode(*dummy_chunk.get());
  encoder_->Flush();
  scoped_refptr<AudioChunk> encoded_dummy_data =
      encoder_->GetEncodedDataAndClear();
  DCHECK(!encoded_dummy_data->IsEmpty());
  // No further audio will be accepted; the encoder can go away now.
  encoder_.reset();

  // is_final=true marks the last chunk of the chunked upload.
  UploadAudioChunk(encoded_dummy_data->AsString(),
                   FRAME_RECOGNITION_AUDIO,
                   true);
  // Reset so that only results arriving from here on count as definitive.
  got_last_definitive_result_ = false;
  return STATE_WAITING_DOWNSTREAM_RESULTS;
}
| 537 | |
// Releases the downstream connection and returns the FSM to idle. By this
// point the upstream fetcher must already be gone.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::CloseDownstream(const FSMEventArgs&) {
  DCHECK(!upstream_fetcher_.get());
  DCHECK(downstream_fetcher_.get());

  DVLOG(1) << "Closing downstream.";
  downstream_fetcher_.reset();
  return STATE_IDLE;
}
| 547 | |
// Tears down the session without reporting any error to the delegate.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::AbortSilently(const FSMEventArgs&) {
  return Abort(SPEECH_RECOGNITION_ERROR_NONE);
}
| 552 | |
// Tears down the session reporting a generic network error to the delegate.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::AbortWithError(const FSMEventArgs&) {
  return Abort(SPEECH_RECOGNITION_ERROR_NETWORK);
}
| 557 | |
| 558 GoogleStreamingRemoteEngine::FSMState GoogleStreamingRemoteEngine::Abort( | |
| 559 SpeechRecognitionErrorCode error_code) { | |
| 560 DVLOG(1) << "Aborting with error " << error_code; | |
| 561 | |
| 562 if (error_code != SPEECH_RECOGNITION_ERROR_NONE) { | |
| 563 delegate()->OnSpeechRecognitionEngineError( | |
| 564 SpeechRecognitionError(error_code)); | |
| 565 } | |
| 566 downstream_fetcher_.reset(); | |
| 567 upstream_fetcher_.reset(); | |
| 568 encoder_.reset(); | |
| 569 return STATE_IDLE; | |
| 570 } | |
| 571 | |
// Transition that deliberately leaves the FSM in its current state (used for
// events that are legitimately ignorable, e.g. stale audio after an abort).
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::DoNothing(const FSMEventArgs&) {
  return state_;
}
| 576 | |
// Reached only for (state, event) pairs the FSM should never produce;
// signals a programming error via NOTREACHED and keeps the current state.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event
               << " in state " << state_;
  return state_;
}
| 583 | |
| 584 std::string GoogleStreamingRemoteEngine::GetAcceptedLanguages() const { | |
| 585 std::string langs = config_.language; | |
| 586 if (langs.empty() && url_context_.get()) { | |
| 587 // If no language is provided then we use the first from the accepted | |
| 588 // language list. If this list is empty then it defaults to "en-US". | |
| 589 // Example of the contents of this list: "es,en-GB;q=0.8", "" | |
| 590 net::URLRequestContext* request_context = | |
| 591 url_context_->GetURLRequestContext(); | |
| 592 DCHECK(request_context); | |
| 593 // TODO(pauljensen): GoogleStreamingRemoteEngine should be constructed with | |
| 594 // a reference to the HttpUserAgentSettings rather than accessing the | |
| 595 // accept language through the URLRequestContext. | |
| 596 if (request_context->http_user_agent_settings()) { | |
| 597 std::string accepted_language_list = | |
| 598 request_context->http_user_agent_settings()->GetAcceptLanguage(); | |
| 599 size_t separator = accepted_language_list.find_first_of(",;"); | |
| 600 if (separator != std::string::npos) | |
| 601 langs = accepted_language_list.substr(0, separator); | |
| 602 } | |
| 603 } | |
| 604 if (langs.empty()) | |
| 605 langs = "en-US"; | |
| 606 return langs; | |
| 607 } | |
| 608 | |
| 609 // TODO(primiano): Is there any utility in the codebase that already does this? | |
| 610 std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const { | |
| 611 const int64_t kKeepLowBytes = 0x00000000FFFFFFFFLL; | |
| 612 const int64_t kKeepHighBytes = 0xFFFFFFFF00000000LL; | |
| 613 | |
| 614 // Just keep the least significant bits of timestamp, in order to reduce | |
| 615 // probability of collisions. | |
| 616 int64_t key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) | | |
| 617 (base::RandUint64() & kKeepHighBytes); | |
| 618 return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key)); | |
| 619 } | |
| 620 | |
| 621 void GoogleStreamingRemoteEngine::UploadAudioChunk(const std::string& data, | |
| 622 FrameType type, | |
| 623 bool is_final) { | |
| 624 if (use_framed_post_data_) { | |
| 625 std::string frame(data.size() + 8, 0); | |
| 626 base::WriteBigEndian(&frame[0], static_cast<uint32_t>(data.size())); | |
| 627 base::WriteBigEndian(&frame[4], static_cast<uint32_t>(type)); | |
| 628 frame.replace(8, data.size(), data); | |
| 629 upstream_fetcher_->AppendChunkToUpload(frame, is_final); | |
| 630 } else { | |
| 631 upstream_fetcher_->AppendChunkToUpload(data, is_final); | |
| 632 } | |
| 633 } | |
| 634 | |
// Event arguments carry the event id plus optional payloads (audio chunk or
// downstream response) that callers attach before dispatching.
GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value) {
}
| 638 | |
// Out-of-line destructor — presumably to anchor the ref-counted payload
// members declared in the header (TODO confirm against the .h file).
GoogleStreamingRemoteEngine::FSMEventArgs::~FSMEventArgs() {
}
| 641 | |
| 642 } // namespace content | |
| OLD | NEW |