| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/speech_recognition_engine.h" | 5 #include "content/browser/speech/speech_recognition_engine.h" |
| 6 | 6 |
| 7 #include <algorithm> |
| 8 #include <vector> |
| 9 |
| 10 #include "base/big_endian.h" |
| 11 #include "base/bind.h" |
| 12 #include "base/rand_util.h" |
| 13 #include "base/strings/string_number_conversions.h" |
| 14 #include "base/strings/string_util.h" |
| 15 #include "base/strings/utf_string_conversions.h" |
| 16 #include "base/time/time.h" |
| 17 #include "content/browser/speech/audio_buffer.h" |
| 18 #include "content/browser/speech/proto/google_streaming_api.pb.h" |
| 19 #include "content/public/common/speech_recognition_error.h" |
| 20 #include "content/public/common/speech_recognition_result.h" |
| 21 #include "google_apis/google_api_keys.h" |
| 22 #include "net/base/escape.h" |
| 23 #include "net/base/load_flags.h" |
| 24 #include "net/url_request/http_user_agent_settings.h" |
| 25 #include "net/url_request/url_fetcher.h" |
| 26 #include "net/url_request/url_request_context.h" |
| 27 #include "net/url_request/url_request_context_getter.h" |
| 28 #include "net/url_request/url_request_status.h" |
| 29 |
| 30 using net::URLFetcher; |
| 31 |
| 32 namespace content { |
| 7 namespace { | 33 namespace { |
| 34 |
| 35 const char kWebServiceBaseUrl[] = |
| 36 "https://www.google.com/speech-api/full-duplex/v1"; |
| 37 const char kDownstreamUrl[] = "/down?"; |
| 38 const char kUpstreamUrl[] = "/up?"; |
| 39 |
| 40 // This matches the maximum maxAlternatives value supported by the server. |
| 41 const uint32_t kMaxMaxAlternatives = 30; |
| 42 |
| 43 // TODO(hans): Remove this and other logging when we don't need it anymore. |
| 44 void DumpResponse(const std::string& response) { |
| 45 DVLOG(1) << "------------"; |
| 46 proto::SpeechRecognitionEvent event; |
| 47 if (!event.ParseFromString(response)) { |
| 48 DVLOG(1) << "Parse failed!"; |
| 49 return; |
| 50 } |
| 51 if (event.has_status()) |
| 52 DVLOG(1) << "STATUS\t" << event.status(); |
| 53 if (event.has_endpoint()) |
| 54 DVLOG(1) << "ENDPOINT\t" << event.endpoint(); |
| 55 for (int i = 0; i < event.result_size(); ++i) { |
| 56 DVLOG(1) << "RESULT #" << i << ":"; |
| 57 const proto::SpeechRecognitionResult& res = event.result(i); |
| 58 if (res.has_final()) |
| 59 DVLOG(1) << " final:\t" << res.final(); |
| 60 if (res.has_stability()) |
| 61 DVLOG(1) << " STABILITY:\t" << res.stability(); |
| 62 for (int j = 0; j < res.alternative_size(); ++j) { |
| 63 const proto::SpeechRecognitionAlternative& alt = |
| 64 res.alternative(j); |
| 65 if (alt.has_confidence()) |
| 66 DVLOG(1) << " CONFIDENCE:\t" << alt.confidence(); |
| 67 if (alt.has_transcript()) |
| 68 DVLOG(1) << " TRANSCRIPT:\t" << alt.transcript(); |
| 69 } |
| 70 } |
| 71 } |
| 72 |
| 8 const int kDefaultConfigSampleRate = 8000; | 73 const int kDefaultConfigSampleRate = 8000; |
| 9 const int kDefaultConfigBitsPerSample = 16; | 74 const int kDefaultConfigBitsPerSample = 16; |
| 10 const uint32_t kDefaultMaxHypotheses = 1; | 75 const uint32_t kDefaultMaxHypotheses = 1; |
| 76 |
| 11 } // namespace | 77 } // namespace |
| 12 | 78 |
| 13 namespace content { | |
| 14 | |
| 15 SpeechRecognitionEngine::Config::Config() | 79 SpeechRecognitionEngine::Config::Config() |
| 16 : filter_profanities(false), | 80 : filter_profanities(false), |
| 17 continuous(true), | 81 continuous(true), |
| 18 interim_results(true), | 82 interim_results(true), |
| 19 max_hypotheses(kDefaultMaxHypotheses), | 83 max_hypotheses(kDefaultMaxHypotheses), |
| 20 audio_sample_rate(kDefaultConfigSampleRate), | 84 audio_sample_rate(kDefaultConfigSampleRate), |
| 21 audio_num_bits_per_sample(kDefaultConfigBitsPerSample) { | 85 audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {} |
| 22 } | 86 |
| 23 | 87 SpeechRecognitionEngine::Config::~Config() {} |
| 24 SpeechRecognitionEngine::Config::~Config() { | 88 |
| 89 const int SpeechRecognitionEngine::kAudioPacketIntervalMs = 100; |
| 90 const int SpeechRecognitionEngine::kUpstreamUrlFetcherIdForTesting = 0; |
| 91 const int SpeechRecognitionEngine::kDownstreamUrlFetcherIdForTesting = 1; |
| 92 const int SpeechRecognitionEngine::kWebserviceStatusNoError = 0; |
| 93 const int SpeechRecognitionEngine::kWebserviceStatusErrorNoMatch = 5; |
| 94 |
| 95 SpeechRecognitionEngine::SpeechRecognitionEngine( |
| 96 net::URLRequestContextGetter* context) |
| 97 : url_context_(context), |
| 98 previous_response_length_(0), |
| 99 got_last_definitive_result_(false), |
| 100 is_dispatching_event_(false), |
| 101 use_framed_post_data_(false), |
| 102 state_(STATE_IDLE) {} |
| 103 |
| 104 SpeechRecognitionEngine::~SpeechRecognitionEngine() {} |
| 105 |
| 106 void SpeechRecognitionEngine::SetConfig(const Config& config) { |
| 107 config_ = config; |
| 108 } |
| 109 |
| 110 void SpeechRecognitionEngine::StartRecognition() { |
| 111 FSMEventArgs event_args(EVENT_START_RECOGNITION); |
| 112 DispatchEvent(event_args); |
| 113 } |
| 114 |
| 115 void SpeechRecognitionEngine::EndRecognition() { |
| 116 FSMEventArgs event_args(EVENT_END_RECOGNITION); |
| 117 DispatchEvent(event_args); |
| 118 } |
| 119 |
| 120 void SpeechRecognitionEngine::TakeAudioChunk(const AudioChunk& data) { |
| 121 FSMEventArgs event_args(EVENT_AUDIO_CHUNK); |
| 122 event_args.audio_data = &data; |
| 123 DispatchEvent(event_args); |
| 124 } |
| 125 |
| 126 void SpeechRecognitionEngine::AudioChunksEnded() { |
| 127 FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED); |
| 128 DispatchEvent(event_args); |
| 129 } |
| 130 |
| 131 void SpeechRecognitionEngine::OnURLFetchComplete(const URLFetcher* source) { |
| 132 const bool kResponseComplete = true; |
| 133 DispatchHTTPResponse(source, kResponseComplete); |
| 134 } |
| 135 |
| 136 void SpeechRecognitionEngine::OnURLFetchDownloadProgress( |
| 137 const URLFetcher* source, |
| 138 int64_t current, |
| 139 int64_t total) { |
| 140 const bool kPartialResponse = false; |
| 141 DispatchHTTPResponse(source, kPartialResponse); |
| 142 } |
| 143 |
| 144 void SpeechRecognitionEngine::DispatchHTTPResponse(const URLFetcher* source, |
| 145 bool end_of_response) { |
| 146 DCHECK(CalledOnValidThread()); |
| 147 DCHECK(source); |
| 148 const bool response_is_good = source->GetStatus().is_success() && |
| 149 source->GetResponseCode() == 200; |
| 150 std::string response; |
| 151 if (response_is_good) |
| 152 source->GetResponseAsString(&response); |
| 153 const size_t current_response_length = response.size(); |
| 154 |
| 155 DVLOG(1) << (source == downstream_fetcher_.get() ? "Downstream" : "Upstream") |
| 156 << "HTTP, code: " << source->GetResponseCode() |
| 157 << " length: " << current_response_length |
| 158 << " eor: " << end_of_response; |
| 159 |
| 160 // URLFetcher always provides the entire response buffer, but we are only |
| 161 // interested in the fresh data introduced by the last chunk. Therefore, we |
| 162 // drop the previous content we have already processed. |
| 163 if (current_response_length != 0) { |
| 164 DCHECK_GE(current_response_length, previous_response_length_); |
| 165 response.erase(0, previous_response_length_); |
| 166 previous_response_length_ = current_response_length; |
| 167 } |
| 168 |
| 169 if (!response_is_good && source == downstream_fetcher_.get()) { |
| 170 DVLOG(1) << "Downstream error " << source->GetResponseCode(); |
| 171 FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR); |
| 172 DispatchEvent(event_args); |
| 173 return; |
| 174 } |
| 175 if (!response_is_good && source == upstream_fetcher_.get()) { |
| 176 DVLOG(1) << "Upstream error " << source->GetResponseCode() |
| 177 << " EOR " << end_of_response; |
| 178 FSMEventArgs event_args(EVENT_UPSTREAM_ERROR); |
| 179 DispatchEvent(event_args); |
| 180 return; |
| 181 } |
| 182 |
| 183 // Ignore incoming data on the upstream connection. |
| 184 if (source == upstream_fetcher_.get()) |
| 185 return; |
| 186 |
| 187 DCHECK(response_is_good && source == downstream_fetcher_.get()); |
| 188 |
| 189 // The downstream response is organized in chunks, whose size is determined |
| 190 // by a 4-byte prefix, transparently handled by the ChunkedByteBuffer class. |
| 191 // Such chunks are sent by the speech recognition webservice over the HTTP |
| 192 // downstream channel using HTTP chunked transfer (unrelated to our chunks). |
| 193 // This function is called every time an HTTP chunk is received by the |
| 194 // URL fetcher. However, there is no particular correspondence between our |
| 195 // protocol chunks and HTTP chunks: a single HTTP chunk can contain a |
| 196 // portion of one protocol chunk, or several whole chunks together. |
| 197 chunked_byte_buffer_.Append(response); |
| 198 |
| 199 // A single HTTP chunk can carry more than one protocol chunk, hence the loop. |
| 200 while (chunked_byte_buffer_.HasChunks()) { |
| 201 FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE); |
| 202 event_args.response = chunked_byte_buffer_.PopChunk(); |
| 203 DCHECK(event_args.response.get()); |
| 204 DumpResponse(std::string(event_args.response->begin(), |
| 205 event_args.response->end())); |
| 206 DispatchEvent(event_args); |
| 207 } |
| 208 if (end_of_response) { |
| 209 FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED); |
| 210 DispatchEvent(event_args); |
| 211 } |
| 212 } |
| 213 |
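
For reference, a minimal standalone sketch of the 4-byte length-prefixed framing described in the comment above. PopProtocolChunk is an illustrative helper, not the real ChunkedByteBuffer API, and the big-endian prefix is an assumption here:

    #include <cstdint>
    #include <string>

    // Pops one complete protocol chunk from |buffer| into |payload|, erasing
    // the consumed bytes; returns false if more data is still needed.
    bool PopProtocolChunk(std::string* buffer, std::string* payload) {
      if (buffer->size() < 4)
        return false;  // The 4-byte length prefix is not complete yet.
      const uint8_t* p = reinterpret_cast<const uint8_t*>(buffer->data());
      const uint32_t length = (static_cast<uint32_t>(p[0]) << 24) |
                              (static_cast<uint32_t>(p[1]) << 16) |
                              (static_cast<uint32_t>(p[2]) << 8) |
                              static_cast<uint32_t>(p[3]);
      if (buffer->size() - 4 < length)
        return false;  // The payload spans a future HTTP chunk.
      payload->assign(*buffer, 4, length);
      buffer->erase(0, 4 + length);
      return true;
    }
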
| 214 bool SpeechRecognitionEngine::IsRecognitionPending() const { |
| 215 DCHECK(CalledOnValidThread()); |
| 216 return state_ != STATE_IDLE; |
| 217 } |
| 218 |
| 219 int SpeechRecognitionEngine::GetDesiredAudioChunkDurationMs() const { |
| 220 return kAudioPacketIntervalMs; |
| 221 } |
| 222 |
| 223 // ----------------------- Core FSM implementation --------------------------- |
| 224 |
| 225 void SpeechRecognitionEngine::DispatchEvent( |
| 226 const FSMEventArgs& event_args) { |
| 227 DCHECK(CalledOnValidThread()); |
| 228 DCHECK_LE(event_args.event, EVENT_MAX_VALUE); |
| 229 DCHECK_LE(state_, STATE_MAX_VALUE); |
| 230 |
| 231 // Event dispatching must be sequential; otherwise it would break the rules |
| 232 // and assumptions of the finite-state automaton model. |
| 233 DCHECK(!is_dispatching_event_); |
| 234 is_dispatching_event_ = true; |
| 235 |
| 236 state_ = ExecuteTransitionAndGetNextState(event_args); |
| 237 |
| 238 is_dispatching_event_ = false; |
| 239 } |
| 240 |
| 241 SpeechRecognitionEngine::FSMState |
| 242 SpeechRecognitionEngine::ExecuteTransitionAndGetNextState( |
| 243 const FSMEventArgs& event_args) { |
| 244 const FSMEvent event = event_args.event; |
| 245 switch (state_) { |
| 246 case STATE_IDLE: |
| 247 switch (event) { |
| 248 case EVENT_START_RECOGNITION: |
| 249 return ConnectBothStreams(event_args); |
| 250 case EVENT_END_RECOGNITION: |
| 251 // Note AUDIO_CHUNK and AUDIO_END events can remain enqueued in case of |
| 252 // abort, so we just silently drop them here. |
| 253 case EVENT_AUDIO_CHUNK: |
| 254 case EVENT_AUDIO_CHUNKS_ENDED: |
| 255 // DOWNSTREAM_CLOSED can be received if we end up here due to an error. |
| 256 case EVENT_DOWNSTREAM_CLOSED: |
| 257 return DoNothing(event_args); |
| 258 case EVENT_UPSTREAM_ERROR: |
| 259 case EVENT_DOWNSTREAM_ERROR: |
| 260 case EVENT_DOWNSTREAM_RESPONSE: |
| 261 return NotFeasible(event_args); |
| 262 } |
| 263 break; |
| 264 case STATE_BOTH_STREAMS_CONNECTED: |
| 265 switch (event) { |
| 266 case EVENT_AUDIO_CHUNK: |
| 267 return TransmitAudioUpstream(event_args); |
| 268 case EVENT_DOWNSTREAM_RESPONSE: |
| 269 return ProcessDownstreamResponse(event_args); |
| 270 case EVENT_AUDIO_CHUNKS_ENDED: |
| 271 return CloseUpstreamAndWaitForResults(event_args); |
| 272 case EVENT_END_RECOGNITION: |
| 273 return AbortSilently(event_args); |
| 274 case EVENT_UPSTREAM_ERROR: |
| 275 case EVENT_DOWNSTREAM_ERROR: |
| 276 case EVENT_DOWNSTREAM_CLOSED: |
| 277 return AbortWithError(event_args); |
| 278 case EVENT_START_RECOGNITION: |
| 279 return NotFeasible(event_args); |
| 280 } |
| 281 break; |
| 282 case STATE_WAITING_DOWNSTREAM_RESULTS: |
| 283 switch (event) { |
| 284 case EVENT_DOWNSTREAM_RESPONSE: |
| 285 return ProcessDownstreamResponse(event_args); |
| 286 case EVENT_DOWNSTREAM_CLOSED: |
| 287 return RaiseNoMatchErrorIfGotNoResults(event_args); |
| 288 case EVENT_END_RECOGNITION: |
| 289 return AbortSilently(event_args); |
| 290 case EVENT_UPSTREAM_ERROR: |
| 291 case EVENT_DOWNSTREAM_ERROR: |
| 292 return AbortWithError(event_args); |
| 293 case EVENT_START_RECOGNITION: |
| 294 case EVENT_AUDIO_CHUNK: |
| 295 case EVENT_AUDIO_CHUNKS_ENDED: |
| 296 return NotFeasible(event_args); |
| 297 } |
| 298 break; |
| 299 } |
| 300 return NotFeasible(event_args); |
| 301 } |
| 302 |
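
For readability, the transition table implemented by the switch above, in compact form (columns are states, rows are events):

    Event \ State        IDLE                BOTH_STREAMS_CONNECTED          WAITING_DOWNSTREAM_RESULTS
    START_RECOGNITION    ConnectBothStreams  NotFeasible                     NotFeasible
    AUDIO_CHUNK          DoNothing           TransmitAudioUpstream           NotFeasible
    AUDIO_CHUNKS_ENDED   DoNothing           CloseUpstreamAndWaitForResults  NotFeasible
    END_RECOGNITION      DoNothing           AbortSilently                   AbortSilently
    UPSTREAM_ERROR       NotFeasible         AbortWithError                  AbortWithError
    DOWNSTREAM_ERROR     NotFeasible         AbortWithError                  AbortWithError
    DOWNSTREAM_RESPONSE  NotFeasible         ProcessDownstreamResponse       ProcessDownstreamResponse
    DOWNSTREAM_CLOSED    DoNothing           AbortWithError                  RaiseNoMatchErrorIfGotNoResults
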
| 303 // ----------- Contract for all the FSM evolution functions below ------------- |
| 304 // - They are guaranteed to be executed on the same thread (IO, except in tests); |
| 305 // - They are guaranteed not to be reentrant (with themselves or each other); |
| 306 // - event_args members are guaranteed to be stable for the duration of the call. |
| 307 |
| 308 SpeechRecognitionEngine::FSMState |
| 309 SpeechRecognitionEngine::ConnectBothStreams(const FSMEventArgs&) { |
| 310 DCHECK(!upstream_fetcher_.get()); |
| 311 DCHECK(!downstream_fetcher_.get()); |
| 312 |
| 313 encoder_.reset(new AudioEncoder(config_.audio_sample_rate, |
| 314 config_.audio_num_bits_per_sample)); |
| 315 DCHECK(encoder_.get()); |
| 316 const std::string request_key = GenerateRequestKey(); |
| 317 |
| 318 // Only use the framed post data format when a preamble needs to be logged. |
| 319 use_framed_post_data_ = (config_.preamble && |
| 320 !config_.preamble->sample_data.empty() && |
| 321 !config_.auth_token.empty() && |
| 322 !config_.auth_scope.empty()); |
| 323 if (use_framed_post_data_) { |
| 324 preamble_encoder_.reset(new AudioEncoder( |
| 325 config_.preamble->sample_rate, |
| 326 config_.preamble->sample_depth * 8)); |
| 327 } |
| 328 |
| 329 // Setup downstream fetcher. |
| 330 std::vector<std::string> downstream_args; |
| 331 downstream_args.push_back( |
| 332 "key=" + net::EscapeQueryParamValue(google_apis::GetAPIKey(), true)); |
| 333 downstream_args.push_back("pair=" + request_key); |
| 334 downstream_args.push_back("output=pb"); |
| 335 GURL downstream_url(std::string(kWebServiceBaseUrl) + |
| 336 std::string(kDownstreamUrl) + |
| 337 base::JoinString(downstream_args, "&")); |
| 338 |
| 339 downstream_fetcher_ = URLFetcher::Create( |
| 340 kDownstreamUrlFetcherIdForTesting, downstream_url, URLFetcher::GET, this); |
| 341 downstream_fetcher_->SetRequestContext(url_context_.get()); |
| 342 downstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
| 343 net::LOAD_DO_NOT_SEND_COOKIES | |
| 344 net::LOAD_DO_NOT_SEND_AUTH_DATA); |
| 345 downstream_fetcher_->Start(); |
| 346 |
| 347 // Setup upstream fetcher. |
| 348 // TODO(hans): Support for user-selected grammars. |
| 349 std::vector<std::string> upstream_args; |
| 350 upstream_args.push_back("key=" + |
| 351 net::EscapeQueryParamValue(google_apis::GetAPIKey(), true)); |
| 352 upstream_args.push_back("pair=" + request_key); |
| 353 upstream_args.push_back("output=pb"); |
| 354 upstream_args.push_back( |
| 355 "lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true)); |
| 356 upstream_args.push_back( |
| 357 config_.filter_profanities ? "pFilter=2" : "pFilter=0"); |
| 358 if (config_.max_hypotheses > 0U) { |
| 359 uint32_t max_alternatives = |
| 360 std::min(kMaxMaxAlternatives, config_.max_hypotheses); |
| 361 upstream_args.push_back("maxAlternatives=" + |
| 362 base::UintToString(max_alternatives)); |
| 363 } |
| 364 upstream_args.push_back("app=chromium"); |
| 365 if (!config_.hardware_info.empty()) { |
| 366 upstream_args.push_back( |
| 367 "xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true)); |
| 368 } |
| 369 for (const SpeechRecognitionGrammar& grammar : config_.grammars) { |
| 370 std::string grammar_value( |
| 371 base::DoubleToString(grammar.weight) + ":" + grammar.url); |
| 372 upstream_args.push_back( |
| 373 "grammar=" + net::EscapeQueryParamValue(grammar_value, true)); |
| 374 } |
| 375 if (config_.continuous) |
| 376 upstream_args.push_back("continuous"); |
| 377 else |
| 378 upstream_args.push_back("endpoint=1"); |
| 379 if (config_.interim_results) |
| 380 upstream_args.push_back("interim"); |
| 381 if (!config_.auth_token.empty() && !config_.auth_scope.empty()) { |
| 382 upstream_args.push_back( |
| 383 "authScope=" + net::EscapeQueryParamValue(config_.auth_scope, true)); |
| 384 upstream_args.push_back( |
| 385 "authToken=" + net::EscapeQueryParamValue(config_.auth_token, true)); |
| 386 } |
| 387 if (use_framed_post_data_) { |
| 388 std::string audio_format; |
| 389 if (preamble_encoder_) |
| 390 audio_format = preamble_encoder_->GetMimeType() + ","; |
| 391 audio_format += encoder_->GetMimeType(); |
| 392 upstream_args.push_back( |
| 393 "audioFormat=" + net::EscapeQueryParamValue(audio_format, true)); |
| 394 } |
| 395 GURL upstream_url(std::string(kWebServiceBaseUrl) + |
| 396 std::string(kUpstreamUrl) + |
| 397 base::JoinString(upstream_args, "&")); |
| 398 |
| 399 upstream_fetcher_ = URLFetcher::Create(kUpstreamUrlFetcherIdForTesting, |
| 400 upstream_url, URLFetcher::POST, this); |
| 401 if (use_framed_post_data_) |
| 402 upstream_fetcher_->SetChunkedUpload("application/octet-stream"); |
| 403 else |
| 404 upstream_fetcher_->SetChunkedUpload(encoder_->GetMimeType()); |
| 405 upstream_fetcher_->SetRequestContext(url_context_.get()); |
| 406 upstream_fetcher_->SetReferrer(config_.origin_url); |
| 407 upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
| 408 net::LOAD_DO_NOT_SEND_COOKIES | |
| 409 net::LOAD_DO_NOT_SEND_AUTH_DATA); |
| 410 upstream_fetcher_->Start(); |
| 411 previous_response_length_ = 0; |
| 412 |
| 413 if (preamble_encoder_) { |
| 414 // Encode and send the preamble right away. |
| 415 scoped_refptr<AudioChunk> chunk = new AudioChunk( |
| 416 reinterpret_cast<const uint8_t*>(config_.preamble->sample_data.data()), |
| 417 config_.preamble->sample_data.size(), config_.preamble->sample_depth); |
| 418 preamble_encoder_->Encode(*chunk); |
| 419 preamble_encoder_->Flush(); |
| 420 scoped_refptr<AudioChunk> encoded_data( |
| 421 preamble_encoder_->GetEncodedDataAndClear()); |
| 422 UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false); |
| 423 } |
| 424 return STATE_BOTH_STREAMS_CONNECTED; |
| 425 } |
| 426 |
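
For illustration, with the default Config (no grammars, hardware info, or auth credentials) and assuming GetAcceptedLanguages() falls back to "en-US", the upstream URL assembled above would look roughly like this (key and pair values are placeholders):

    https://www.google.com/speech-api/full-duplex/v1/up?key=<api-key>&pair=<hex-key>&output=pb&lang=en-US&pFilter=0&maxAlternatives=1&app=chromium&continuous&interim
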
| 427 SpeechRecognitionEngine::FSMState |
| 428 SpeechRecognitionEngine::TransmitAudioUpstream( |
| 429 const FSMEventArgs& event_args) { |
| 430 DCHECK(upstream_fetcher_.get()); |
| 431 DCHECK(event_args.audio_data.get()); |
| 432 const AudioChunk& audio = *(event_args.audio_data.get()); |
| 433 |
| 434 DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); |
| 435 encoder_->Encode(audio); |
| 436 scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); |
| 437 UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false); |
| 438 return state_; |
| 439 } |
| 440 |
| 441 SpeechRecognitionEngine::FSMState |
| 442 SpeechRecognitionEngine::ProcessDownstreamResponse( |
| 443 const FSMEventArgs& event_args) { |
| 444 DCHECK(event_args.response.get()); |
| 445 |
| 446 proto::SpeechRecognitionEvent ws_event; |
| 447 if (!ws_event.ParseFromString(std::string(event_args.response->begin(), |
| 448 event_args.response->end()))) |
| 449 return AbortWithError(event_args); |
| 450 |
| 451 if (ws_event.has_status()) { |
| 452 switch (ws_event.status()) { |
| 453 case proto::SpeechRecognitionEvent::STATUS_SUCCESS: |
| 454 break; |
| 455 case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH: |
| 456 return Abort(SPEECH_RECOGNITION_ERROR_NO_SPEECH); |
| 457 case proto::SpeechRecognitionEvent::STATUS_ABORTED: |
| 458 return Abort(SPEECH_RECOGNITION_ERROR_ABORTED); |
| 459 case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE: |
| 460 return Abort(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE); |
| 461 case proto::SpeechRecognitionEvent::STATUS_NETWORK: |
| 462 return Abort(SPEECH_RECOGNITION_ERROR_NETWORK); |
| 463 case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED: |
| 464 return Abort(SPEECH_RECOGNITION_ERROR_NOT_ALLOWED); |
| 465 case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED: |
| 466 return Abort(SPEECH_RECOGNITION_ERROR_SERVICE_NOT_ALLOWED); |
| 467 case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR: |
| 468 return Abort(SPEECH_RECOGNITION_ERROR_BAD_GRAMMAR); |
| 469 case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED: |
| 470 return Abort(SPEECH_RECOGNITION_ERROR_LANGUAGE_NOT_SUPPORTED); |
| 471 } |
| 472 } |
| 473 |
| 474 if (!config_.continuous && ws_event.has_endpoint() && |
| 475 ws_event.endpoint() == proto::SpeechRecognitionEvent::END_OF_UTTERANCE) { |
| 476 delegate_->OnSpeechRecognitionEngineEndOfUtterance(); |
| 477 } |
| 478 |
| 479 SpeechRecognitionResults results; |
| 480 for (int i = 0; i < ws_event.result_size(); ++i) { |
| 481 const proto::SpeechRecognitionResult& ws_result = ws_event.result(i); |
| 482 results.push_back(SpeechRecognitionResult()); |
| 483 SpeechRecognitionResult& result = results.back(); |
| 484 result.is_provisional = !(ws_result.has_final() && ws_result.final()); |
| 485 |
| 486 if (!result.is_provisional) |
| 487 got_last_definitive_result_ = true; |
| 488 |
| 489 for (int j = 0; j < ws_result.alternative_size(); ++j) { |
| 490 const proto::SpeechRecognitionAlternative& ws_alternative = |
| 491 ws_result.alternative(j); |
| 492 SpeechRecognitionHypothesis hypothesis; |
| 493 if (ws_alternative.has_confidence()) |
| 494 hypothesis.confidence = ws_alternative.confidence(); |
| 495 else if (ws_result.has_stability()) |
| 496 hypothesis.confidence = ws_result.stability(); |
| 497 DCHECK(ws_alternative.has_transcript()); |
| 498 // TODO(hans): Perhaps the transcript should be required in the proto? |
| 499 if (ws_alternative.has_transcript()) |
| 500 hypothesis.utterance = base::UTF8ToUTF16(ws_alternative.transcript()); |
| 501 |
| 502 result.hypotheses.push_back(hypothesis); |
| 503 } |
| 504 } |
| 505 if (results.size()) { |
| 506 delegate_->OnSpeechRecognitionEngineResults(results); |
| 507 } |
| 508 |
| 509 return state_; |
| 510 } |
| 511 |
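
For context, a sketch of the kind of event this function consumes, built with the generated protobuf setters (it assumes the content namespace, like the code above; all values are made up):

    #include <string>

    #include "content/browser/speech/proto/google_streaming_api.pb.h"

    // Builds a minimal definitive-result event, serialized the way the
    // downstream protocol chunks carry it.
    std::string BuildSampleEvent() {
      proto::SpeechRecognitionEvent event;
      event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
      proto::SpeechRecognitionResult* result = event.add_result();
      result->set_final(true);
      proto::SpeechRecognitionAlternative* alternative =
          result->add_alternative();
      alternative->set_confidence(0.87f);
      alternative->set_transcript("hello world");
      std::string serialized;
      event.SerializeToString(&serialized);
      return serialized;
    }
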
| 512 SpeechRecognitionEngine::FSMState |
| 513 SpeechRecognitionEngine::RaiseNoMatchErrorIfGotNoResults( |
| 514 const FSMEventArgs& event_args) { |
| 515 if (!got_last_definitive_result_) { |
| 516 // Provide an empty result to notify that recognition ended without |
| 517 // errors, but also without producing any further results. |
| 518 delegate_->OnSpeechRecognitionEngineResults(SpeechRecognitionResults()); |
| 519 } |
| 520 return AbortSilently(event_args); |
| 521 } |
| 522 |
| 523 SpeechRecognitionEngine::FSMState |
| 524 SpeechRecognitionEngine::CloseUpstreamAndWaitForResults( |
| 525 const FSMEventArgs&) { |
| 526 DCHECK(upstream_fetcher_.get()); |
| 527 DCHECK(encoder_.get()); |
| 528 |
| 529 DVLOG(1) << "Closing upstream."; |
| 530 |
| 531 // The encoder requires a non-empty final buffer, so we encode a packet |
| 532 // of silence in case the encoder had no data already. |
| 533 size_t sample_count = |
| 534 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000; |
| 535 scoped_refptr<AudioChunk> dummy_chunk = new AudioChunk( |
| 536 sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8); |
| 537 encoder_->Encode(*dummy_chunk.get()); |
| 538 encoder_->Flush(); |
| 539 scoped_refptr<AudioChunk> encoded_dummy_data = |
| 540 encoder_->GetEncodedDataAndClear(); |
| 541 DCHECK(!encoded_dummy_data->IsEmpty()); |
| 542 encoder_.reset(); |
| 543 |
| 544 UploadAudioChunk(encoded_dummy_data->AsString(), |
| 545 FRAME_RECOGNITION_AUDIO, |
| 546 true); |
| 547 got_last_definitive_result_ = false; |
| 548 return STATE_WAITING_DOWNSTREAM_RESULTS; |
| 549 } |
| 550 |
| 551 SpeechRecognitionEngine::FSMState |
| 552 SpeechRecognitionEngine::CloseDownstream(const FSMEventArgs&) { |
| 553 DCHECK(!upstream_fetcher_.get()); |
| 554 DCHECK(downstream_fetcher_.get()); |
| 555 |
| 556 DVLOG(1) << "Closing downstream."; |
| 557 downstream_fetcher_.reset(); |
| 558 return STATE_IDLE; |
| 559 } |
| 560 |
| 561 SpeechRecognitionEngine::FSMState |
| 562 SpeechRecognitionEngine::AbortSilently(const FSMEventArgs&) { |
| 563 return Abort(SPEECH_RECOGNITION_ERROR_NONE); |
| 564 } |
| 565 |
| 566 SpeechRecognitionEngine::FSMState |
| 567 SpeechRecognitionEngine::AbortWithError(const FSMEventArgs&) { |
| 568 return Abort(SPEECH_RECOGNITION_ERROR_NETWORK); |
| 569 } |
| 570 |
| 571 SpeechRecognitionEngine::FSMState SpeechRecognitionEngine::Abort( |
| 572 SpeechRecognitionErrorCode error_code) { |
| 573 DVLOG(1) << "Aborting with error " << error_code; |
| 574 |
| 575 if (error_code != SPEECH_RECOGNITION_ERROR_NONE) { |
| 576 delegate_->OnSpeechRecognitionEngineError( |
| 577 SpeechRecognitionError(error_code)); |
| 578 } |
| 579 downstream_fetcher_.reset(); |
| 580 upstream_fetcher_.reset(); |
| 581 encoder_.reset(); |
| 582 return STATE_IDLE; |
| 583 } |
| 584 |
| 585 SpeechRecognitionEngine::FSMState |
| 586 SpeechRecognitionEngine::DoNothing(const FSMEventArgs&) { |
| 587 return state_; |
| 588 } |
| 589 |
| 590 SpeechRecognitionEngine::FSMState |
| 591 SpeechRecognitionEngine::NotFeasible(const FSMEventArgs& event_args) { |
| 592 NOTREACHED() << "Unfeasible event " << event_args.event |
| 593 << " in state " << state_; |
| 594 return state_; |
| 595 } |
| 596 |
| 597 std::string SpeechRecognitionEngine::GetAcceptedLanguages() const { |
| 598 std::string langs = config_.language; |
| 599 if (langs.empty() && url_context_.get()) { |
| 600 // If no language is provided, we use the first entry of the accept- |
| 601 // language list; if that list is empty, we default to "en-US". |
| 602 // Example contents of this list: "es,en-GB;q=0.8", "". |
| 603 net::URLRequestContext* request_context = |
| 604 url_context_->GetURLRequestContext(); |
| 605 DCHECK(request_context); |
| 606 // TODO(pauljensen): SpeechRecognitionEngine should be constructed with |
| 607 // a reference to the HttpUserAgentSettings rather than accessing the |
| 608 // accept language through the URLRequestContext. |
| 609 if (request_context->http_user_agent_settings()) { |
| 610 std::string accepted_language_list = |
| 611 request_context->http_user_agent_settings()->GetAcceptLanguage(); |
| 612 size_t separator = accepted_language_list.find_first_of(",;"); |
| 613 if (separator != std::string::npos) |
| 614 langs = accepted_language_list.substr(0, separator); |
| 615 } |
| 616 } |
| 617 if (langs.empty()) |
| 618 langs = "en-US"; |
| 619 return langs; |
| 620 } |
| 621 |
| 622 // TODO(primiano): Is there any utility in the codebase that already does this? |
| 623 std::string SpeechRecognitionEngine::GenerateRequestKey() const { |
| 624 const int64_t kKeepLowBytes = 0x00000000FFFFFFFFLL; |
| 625 const int64_t kKeepHighBytes = 0xFFFFFFFF00000000LL; |
| 626 |
| 627 // Keep only the least significant bits of the timestamp, in order to |
| 628 // reduce the probability of collisions. |
| 629 int64_t key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) | |
| 630 (base::RandUint64() & kKeepHighBytes); |
| 631 return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key)); |
| 632 } |
| 633 |
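
A standalone sketch of the key construction above, with made-up inputs (note the real code hex-encodes the in-memory bytes via base::HexEncode, so the printed digit order below matches only on a big-endian machine):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t timestamp = 1336060800000000ULL;  // Hypothetical Time value.
      const uint64_t random = 0x9E3779B97F4A7C15ULL;   // Hypothetical RandUint64().
      // Low 32 bits from the fast-moving timestamp, high 32 bits random.
      const uint64_t key = (timestamp & 0x00000000FFFFFFFFULL) |
                           (random & 0xFFFFFFFF00000000ULL);
      std::printf("pair=%016llX\n", static_cast<unsigned long long>(key));
      return 0;
    }
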
| 634 void SpeechRecognitionEngine::UploadAudioChunk(const std::string& data, |
| 635 FrameType type, |
| 636 bool is_final) { |
| 637 if (use_framed_post_data_) { |
| 638 std::string frame(data.size() + 8, 0); |
| 639 base::WriteBigEndian(&frame[0], static_cast<uint32_t>(data.size())); |
| 640 base::WriteBigEndian(&frame[4], static_cast<uint32_t>(type)); |
| 641 frame.replace(8, data.size(), data); |
| 642 upstream_fetcher_->AppendChunkToUpload(frame, is_final); |
| 643 } else { |
| 644 upstream_fetcher_->AppendChunkToUpload(data, is_final); |
| 645 } |
| 646 } |
| 647 |
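
A worked layout of the framed format produced above, for a hypothetical 5-byte payload (both header fields are written big-endian via base::WriteBigEndian; TT stands for the numeric FrameType value):

    offset 0..3  : 00 00 00 05    payload length = 5
    offset 4..7  : 00 00 00 TT    frame type
    offset 8..12 : payload bytes
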
| 648 SpeechRecognitionEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 649 : event(event_value) { |
| 650 } |
| 651 |
| 652 SpeechRecognitionEngine::FSMEventArgs::~FSMEventArgs() { |
| 25 } | 653 } |
| 26 | 654 |
| 27 } // namespace content | 655 } // namespace content |