| OLD | NEW |
| 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
| 6 | 6 |
| 7 #include <stdint.h> | 7 #include <stdint.h> |
| 8 | 8 |
| 9 #include <algorithm> | 9 #include <algorithm> |
| 10 | 10 |
| 11 #include "base/bind.h" | 11 #include "base/bind.h" |
| 12 #include "base/macros.h" | 12 #include "base/macros.h" |
| 13 #include "base/time/time.h" | 13 #include "base/time/time.h" |
| 14 #include "build/build_config.h" | 14 #include "build/build_config.h" |
| 15 #include "content/browser/browser_main_loop.h" | 15 #include "content/browser/browser_main_loop.h" |
| 16 #include "content/browser/media/media_internals.h" | 16 #include "content/browser/media/media_internals.h" |
| 17 #include "content/browser/speech/audio_buffer.h" | 17 #include "content/browser/speech/audio_buffer.h" |
| 18 #include "content/public/browser/speech_recognition_event_listener.h" | 18 #include "content/public/browser/speech_recognition_event_listener.h" |
| 19 #include "media/audio/audio_file_writer.h" | 19 #include "media/audio/audio_file_writer.h" |
| 20 #include "media/audio/audio_manager.h" |
| 21 #include "media/audio/audio_system.h" |
| 20 #include "media/base/audio_converter.h" | 22 #include "media/base/audio_converter.h" |
| 21 | 23 |
| 22 #if defined(OS_WIN) | 24 #if defined(OS_WIN) |
| 23 #include "media/audio/win/core_audio_util_win.h" | 25 #include "media/audio/win/core_audio_util_win.h" |
| 24 #endif | 26 #endif |
| 25 | 27 |
| 26 using media::AudioBus; | 28 using media::AudioBus; |
| 27 using media::AudioConverter; | 29 using media::AudioConverter; |
| 28 using media::AudioInputController; | 30 using media::AudioInputController; |
| 29 using media::AudioManager; | 31 using media::AudioManager; |
| (...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 105 } | 107 } |
| 106 | 108 |
| 107 } // namespace | 109 } // namespace |
| 108 | 110 |
| 109 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 111 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
| 110 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = | 112 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = |
| 111 media::CHANNEL_LAYOUT_MONO; | 113 media::CHANNEL_LAYOUT_MONO; |
| 112 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 114 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
| 113 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; | 115 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
| 114 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 116 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
| 115 media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL; | 117 media::AudioSystem* SpeechRecognizerImpl::audio_system_for_tests_ = nullptr; |
| 116 | 118 |
| 117 static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, | 119 static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, |
| 118 "kNumBitsPerAudioSample must be a multiple of 8"); | 120 "kNumBitsPerAudioSample must be a multiple of 8"); |
| 119 | 121 |
| 120 // SpeechRecognizerImpl::OnDataConverter implementation | 122 // SpeechRecognizerImpl::OnDataConverter implementation |
| 121 | 123 |
| 122 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( | 124 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
| 123 const AudioParameters& input_params, | 125 const AudioParameters& input_params, |
| 124 const AudioParameters& output_params) | 126 const AudioParameters& output_params) |
| 125 : audio_converter_(input_params, output_params, false), | 127 : audio_converter_(input_params, output_params, false), |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 169 input_bus_->CopyTo(dest); | 171 input_bus_->CopyTo(dest); |
| 170 // Indicate that the recorded audio has in fact been used by the converter. | 172 // Indicate that the recorded audio has in fact been used by the converter. |
| 171 data_was_converted_ = true; | 173 data_was_converted_ = true; |
| 172 return 1; | 174 return 1; |
| 173 } | 175 } |
| 174 | 176 |
| 175 // SpeechRecognizerImpl implementation | 177 // SpeechRecognizerImpl implementation |
| 176 | 178 |
| 177 SpeechRecognizerImpl::SpeechRecognizerImpl( | 179 SpeechRecognizerImpl::SpeechRecognizerImpl( |
| 178 SpeechRecognitionEventListener* listener, | 180 SpeechRecognitionEventListener* listener, |
| 181 media::AudioSystem* audio_system, |
| 179 int session_id, | 182 int session_id, |
| 180 bool continuous, | 183 bool continuous, |
| 181 bool provisional_results, | 184 bool provisional_results, |
| 182 SpeechRecognitionEngine* engine) | 185 SpeechRecognitionEngine* engine) |
| 183 : SpeechRecognizer(listener, session_id), | 186 : SpeechRecognizer(listener, session_id), |
| 187 audio_system_(audio_system), |
| 184 recognition_engine_(engine), | 188 recognition_engine_(engine), |
| 185 endpointer_(kAudioSampleRate), | 189 endpointer_(kAudioSampleRate), |
| 186 audio_log_(MediaInternals::GetInstance()->CreateAudioLog( | 190 audio_log_(MediaInternals::GetInstance()->CreateAudioLog( |
| 187 media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)), | 191 media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)), |
| 188 is_dispatching_event_(false), | 192 is_dispatching_event_(false), |
| 189 provisional_results_(provisional_results), | 193 provisional_results_(provisional_results), |
| 190 end_of_utterance_(false), | 194 end_of_utterance_(false), |
| 191 state_(STATE_IDLE) { | 195 state_(STATE_IDLE), |
| 192 DCHECK(recognition_engine_ != NULL); | 196 weak_ptr_factory_(this) { |
| 197 DCHECK(recognition_engine_ != nullptr); |
| 198 DCHECK(audio_system_ != nullptr); |
| 193 if (!continuous) { | 199 if (!continuous) { |
| 194 // In single shot (non-continuous) recognition, | 200 // In single shot (non-continuous) recognition, |
| 195 // the session is automatically ended after: | 201 // the session is automatically ended after: |
| 196 // - 0.5 seconds of silence if time < 3 seconds | 202 // - 0.5 seconds of silence if time < 3 seconds |
| 197 // - 1 second of silence if time >= 3 seconds | 203 // - 1 second of silence if time >= 3 seconds |
| 198 endpointer_.set_speech_input_complete_silence_length( | 204 endpointer_.set_speech_input_complete_silence_length( |
| 199 base::Time::kMicrosecondsPerSecond / 2); | 205 base::Time::kMicrosecondsPerSecond / 2); |
| 200 endpointer_.set_long_speech_input_complete_silence_length( | 206 endpointer_.set_long_speech_input_complete_silence_length( |
| 201 base::Time::kMicrosecondsPerSecond); | 207 base::Time::kMicrosecondsPerSecond); |
| 202 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 208 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| (...skipping 13 matching lines...) Expand all Loading... |
| 216 // NOTE:all the external events and requests should be enqueued (PostTask), even | 222 // NOTE:all the external events and requests should be enqueued (PostTask), even |
| 217 // if they come from the same (IO) thread, in order to preserve the relationship | 223 // if they come from the same (IO) thread, in order to preserve the relationship |
| 218 // of causality between events and avoid interleaved event processing due to | 224 // of causality between events and avoid interleaved event processing due to |
| 219 // synchronous callbacks. | 225 // synchronous callbacks. |
| 220 | 226 |
| 221 void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { | 227 void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { |
| 222 DCHECK(!device_id.empty()); | 228 DCHECK(!device_id.empty()); |
| 223 device_id_ = device_id; | 229 device_id_ = device_id; |
| 224 | 230 |
| 225 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 231 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 226 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 232 base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, |
| 227 this, FSMEventArgs(EVENT_START))); | 233 FSMEventArgs(EVENT_PREPARE))); |
| 228 } | 234 } |
| 229 | 235 |
| 230 void SpeechRecognizerImpl::AbortRecognition() { | 236 void SpeechRecognizerImpl::AbortRecognition() { |
| 231 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 237 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 232 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 238 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 233 this, FSMEventArgs(EVENT_ABORT))); | 239 this, FSMEventArgs(EVENT_ABORT))); |
| 234 } | 240 } |
| 235 | 241 |
| 236 void SpeechRecognizerImpl::StopAudioCapture() { | 242 void SpeechRecognizerImpl::StopAudioCapture() { |
| 237 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 243 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| (...skipping 131 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 369 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( | 375 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( |
| 370 const FSMEventArgs& event_args) { | 376 const FSMEventArgs& event_args) { |
| 371 const FSMEvent event = event_args.event; | 377 const FSMEvent event = event_args.event; |
| 372 switch (state_) { | 378 switch (state_) { |
| 373 case STATE_IDLE: | 379 case STATE_IDLE: |
| 374 switch (event) { | 380 switch (event) { |
| 375 // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and | 381 // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and |
| 376 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. | 382 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. |
| 377 case EVENT_ABORT: | 383 case EVENT_ABORT: |
| 378 return AbortSilently(event_args); | 384 return AbortSilently(event_args); |
| 385 case EVENT_PREPARE: |
| 386 return PrepareRecognition(event_args); |
| 387 case EVENT_START: |
| 388 return NotFeasible(event_args); |
| 389 case EVENT_STOP_CAPTURE: |
| 390 return AbortSilently(event_args); |
| 391 case EVENT_AUDIO_DATA: // Corner cases related to queued messages |
| 392 case EVENT_ENGINE_RESULT: // being dispatched late. |
| 393 case EVENT_ENGINE_ERROR: |
| 394 case EVENT_AUDIO_ERROR: |
| 395 return DoNothing(event_args); |
| 396 } |
| 397 break; |
| 398 case STATE_PREPARING: |
| 399 switch (event) { |
| 400 case EVENT_ABORT: |
| 401 return AbortSilently(event_args); |
| 402 case EVENT_PREPARE: |
| 403 return NotFeasible(event_args); |
| 379 case EVENT_START: | 404 case EVENT_START: |
| 380 return StartRecording(event_args); | 405 return StartRecording(event_args); |
| 381 case EVENT_STOP_CAPTURE: | 406 case EVENT_STOP_CAPTURE: |
| 382 return AbortSilently(event_args); | 407 return AbortSilently(event_args); |
| 383 case EVENT_AUDIO_DATA: // Corner cases related to queued messages | 408 case EVENT_AUDIO_DATA: // Corner cases related to queued messages |
| 384 case EVENT_ENGINE_RESULT: // being dispatched late. | 409 case EVENT_ENGINE_RESULT: // being dispatched late. |
| 385 case EVENT_ENGINE_ERROR: | 410 case EVENT_ENGINE_ERROR: |
| 386 case EVENT_AUDIO_ERROR: | 411 case EVENT_AUDIO_ERROR: |
| 387 return DoNothing(event_args); | 412 return DoNothing(event_args); |
| 388 } | 413 } |
| 389 break; | 414 break; |
| 390 case STATE_STARTING: | 415 case STATE_STARTING: |
| 391 switch (event) { | 416 switch (event) { |
| 392 case EVENT_ABORT: | 417 case EVENT_ABORT: |
| 393 return AbortWithError(event_args); | 418 return AbortWithError(event_args); |
| 419 case EVENT_PREPARE: |
| 420 return NotFeasible(event_args); |
| 394 case EVENT_START: | 421 case EVENT_START: |
| 395 return NotFeasible(event_args); | 422 return NotFeasible(event_args); |
| 396 case EVENT_STOP_CAPTURE: | 423 case EVENT_STOP_CAPTURE: |
| 397 return AbortSilently(event_args); | 424 return AbortSilently(event_args); |
| 398 case EVENT_AUDIO_DATA: | 425 case EVENT_AUDIO_DATA: |
| 399 return StartRecognitionEngine(event_args); | 426 return StartRecognitionEngine(event_args); |
| 400 case EVENT_ENGINE_RESULT: | 427 case EVENT_ENGINE_RESULT: |
| 401 return NotFeasible(event_args); | 428 return NotFeasible(event_args); |
| 402 case EVENT_ENGINE_ERROR: | 429 case EVENT_ENGINE_ERROR: |
| 403 case EVENT_AUDIO_ERROR: | 430 case EVENT_AUDIO_ERROR: |
| 404 return AbortWithError(event_args); | 431 return AbortWithError(event_args); |
| 405 } | 432 } |
| 406 break; | 433 break; |
| 407 case STATE_ESTIMATING_ENVIRONMENT: | 434 case STATE_ESTIMATING_ENVIRONMENT: |
| 408 switch (event) { | 435 switch (event) { |
| 409 case EVENT_ABORT: | 436 case EVENT_ABORT: |
| 410 return AbortWithError(event_args); | 437 return AbortWithError(event_args); |
| 438 case EVENT_PREPARE: |
| 439 return NotFeasible(event_args); |
| 411 case EVENT_START: | 440 case EVENT_START: |
| 412 return NotFeasible(event_args); | 441 return NotFeasible(event_args); |
| 413 case EVENT_STOP_CAPTURE: | 442 case EVENT_STOP_CAPTURE: |
| 414 return StopCaptureAndWaitForResult(event_args); | 443 return StopCaptureAndWaitForResult(event_args); |
| 415 case EVENT_AUDIO_DATA: | 444 case EVENT_AUDIO_DATA: |
| 416 return WaitEnvironmentEstimationCompletion(event_args); | 445 return WaitEnvironmentEstimationCompletion(event_args); |
| 417 case EVENT_ENGINE_RESULT: | 446 case EVENT_ENGINE_RESULT: |
| 418 return ProcessIntermediateResult(event_args); | 447 return ProcessIntermediateResult(event_args); |
| 419 case EVENT_ENGINE_ERROR: | 448 case EVENT_ENGINE_ERROR: |
| 420 case EVENT_AUDIO_ERROR: | 449 case EVENT_AUDIO_ERROR: |
| 421 return AbortWithError(event_args); | 450 return AbortWithError(event_args); |
| 422 } | 451 } |
| 423 break; | 452 break; |
| 424 case STATE_WAITING_FOR_SPEECH: | 453 case STATE_WAITING_FOR_SPEECH: |
| 425 switch (event) { | 454 switch (event) { |
| 426 case EVENT_ABORT: | 455 case EVENT_ABORT: |
| 427 return AbortWithError(event_args); | 456 return AbortWithError(event_args); |
| 457 case EVENT_PREPARE: |
| 458 return NotFeasible(event_args); |
| 428 case EVENT_START: | 459 case EVENT_START: |
| 429 return NotFeasible(event_args); | 460 return NotFeasible(event_args); |
| 430 case EVENT_STOP_CAPTURE: | 461 case EVENT_STOP_CAPTURE: |
| 431 return StopCaptureAndWaitForResult(event_args); | 462 return StopCaptureAndWaitForResult(event_args); |
| 432 case EVENT_AUDIO_DATA: | 463 case EVENT_AUDIO_DATA: |
| 433 return DetectUserSpeechOrTimeout(event_args); | 464 return DetectUserSpeechOrTimeout(event_args); |
| 434 case EVENT_ENGINE_RESULT: | 465 case EVENT_ENGINE_RESULT: |
| 435 return ProcessIntermediateResult(event_args); | 466 return ProcessIntermediateResult(event_args); |
| 436 case EVENT_ENGINE_ERROR: | 467 case EVENT_ENGINE_ERROR: |
| 437 case EVENT_AUDIO_ERROR: | 468 case EVENT_AUDIO_ERROR: |
| 438 return AbortWithError(event_args); | 469 return AbortWithError(event_args); |
| 439 } | 470 } |
| 440 break; | 471 break; |
| 441 case STATE_RECOGNIZING: | 472 case STATE_RECOGNIZING: |
| 442 switch (event) { | 473 switch (event) { |
| 443 case EVENT_ABORT: | 474 case EVENT_ABORT: |
| 444 return AbortWithError(event_args); | 475 return AbortWithError(event_args); |
| 476 case EVENT_PREPARE: |
| 477 return NotFeasible(event_args); |
| 445 case EVENT_START: | 478 case EVENT_START: |
| 446 return NotFeasible(event_args); | 479 return NotFeasible(event_args); |
| 447 case EVENT_STOP_CAPTURE: | 480 case EVENT_STOP_CAPTURE: |
| 448 return StopCaptureAndWaitForResult(event_args); | 481 return StopCaptureAndWaitForResult(event_args); |
| 449 case EVENT_AUDIO_DATA: | 482 case EVENT_AUDIO_DATA: |
| 450 return DetectEndOfSpeech(event_args); | 483 return DetectEndOfSpeech(event_args); |
| 451 case EVENT_ENGINE_RESULT: | 484 case EVENT_ENGINE_RESULT: |
| 452 return ProcessIntermediateResult(event_args); | 485 return ProcessIntermediateResult(event_args); |
| 453 case EVENT_ENGINE_ERROR: | 486 case EVENT_ENGINE_ERROR: |
| 454 case EVENT_AUDIO_ERROR: | 487 case EVENT_AUDIO_ERROR: |
| 455 return AbortWithError(event_args); | 488 return AbortWithError(event_args); |
| 456 } | 489 } |
| 457 break; | 490 break; |
| 458 case STATE_WAITING_FINAL_RESULT: | 491 case STATE_WAITING_FINAL_RESULT: |
| 459 switch (event) { | 492 switch (event) { |
| 460 case EVENT_ABORT: | 493 case EVENT_ABORT: |
| 461 return AbortWithError(event_args); | 494 return AbortWithError(event_args); |
| 495 case EVENT_PREPARE: |
| 496 return NotFeasible(event_args); |
| 462 case EVENT_START: | 497 case EVENT_START: |
| 463 return NotFeasible(event_args); | 498 return NotFeasible(event_args); |
| 464 case EVENT_STOP_CAPTURE: | 499 case EVENT_STOP_CAPTURE: |
| 465 case EVENT_AUDIO_DATA: | 500 case EVENT_AUDIO_DATA: |
| 466 return DoNothing(event_args); | 501 return DoNothing(event_args); |
| 467 case EVENT_ENGINE_RESULT: | 502 case EVENT_ENGINE_RESULT: |
| 468 return ProcessFinalResult(event_args); | 503 return ProcessFinalResult(event_args); |
| 469 case EVENT_ENGINE_ERROR: | 504 case EVENT_ENGINE_ERROR: |
| 470 case EVENT_AUDIO_ERROR: | 505 case EVENT_AUDIO_ERROR: |
| 471 return AbortWithError(event_args); | 506 return AbortWithError(event_args); |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 508 if (route_to_vumeter) { | 543 if (route_to_vumeter) { |
| 509 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. | 544 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. |
| 510 UpdateSignalAndNoiseLevels(rms, clip_detected); | 545 UpdateSignalAndNoiseLevels(rms, clip_detected); |
| 511 } | 546 } |
| 512 if (route_to_sr_engine) { | 547 if (route_to_sr_engine) { |
| 513 DCHECK(recognition_engine_.get() != NULL); | 548 DCHECK(recognition_engine_.get() != NULL); |
| 514 recognition_engine_->TakeAudioChunk(raw_audio); | 549 recognition_engine_->TakeAudioChunk(raw_audio); |
| 515 } | 550 } |
| 516 } | 551 } |
| 517 | 552 |
| 553 void SpeechRecognizerImpl::OnDeviceInfo(const media::AudioParameters& params) { |
| 554 DCHECK_CURRENTLY_ON(BrowserThread::IO); |
| 555 device_params_ = params; |
| 556 DVLOG(1) << "Device parameters: " << device_params_.AsHumanReadableString(); |
| 557 DispatchEvent(FSMEventArgs(EVENT_START)); |
| 558 } |
| 559 |
| 560 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::PrepareRecognition( |
| 561 const FSMEventArgs&) { |
| 562 DCHECK(state_ == STATE_IDLE); |
| 563 DCHECK(recognition_engine_.get() != NULL); |
| 564 DCHECK(!IsCapturingAudio()); |
| 565 GetAudioSystem()->GetInputStreamParameters( |
| 566 device_id_, base::Bind(&SpeechRecognizerImpl::OnDeviceInfo, |
| 567 weak_ptr_factory_.GetWeakPtr())); |
| 568 |
| 569 listener()->OnRecognitionStart(session_id()); |
| 570 return STATE_PREPARING; |
| 571 } |
| 572 |
| 518 SpeechRecognizerImpl::FSMState | 573 SpeechRecognizerImpl::FSMState |
| 519 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { | 574 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
| 520 DCHECK(state_ == STATE_IDLE); | 575 DCHECK(state_ == STATE_PREPARING); |
| 521 DCHECK(recognition_engine_.get() != NULL); | 576 DCHECK(recognition_engine_.get() != NULL); |
| 522 DCHECK(!IsCapturingAudio()); | 577 DCHECK(!IsCapturingAudio()); |
| 523 const bool unit_test_is_active = (audio_manager_for_tests_ != NULL); | |
| 524 AudioManager* audio_manager = unit_test_is_active ? | |
| 525 audio_manager_for_tests_ : | |
| 526 AudioManager::Get(); | |
| 527 DCHECK(audio_manager != NULL); | |
| 528 | 578 |
| 529 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; | 579 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
| 530 num_samples_recorded_ = 0; | 580 num_samples_recorded_ = 0; |
| 531 audio_level_ = 0; | 581 audio_level_ = 0; |
| 532 end_of_utterance_ = false; | 582 end_of_utterance_ = false; |
| 533 listener()->OnRecognitionStart(session_id()); | |
| 534 | 583 |
| 535 // TODO(xians): Check if the OS has the device with |device_id_|, return | 584 int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); |
| 536 // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist. | 585 |
| 537 if (!audio_manager->HasAudioInputDevices()) { | 586 if (!device_params_.IsValid()) { |
| 587 DLOG(ERROR) << "Audio input device not found"; |
| 538 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE, | 588 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE, |
| 539 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); | 589 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
| 540 } | 590 } |
| 541 | 591 |
| 542 int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); | |
| 543 | |
| 544 AudioParameters in_params = audio_manager->GetInputStreamParameters( | |
| 545 device_id_); | |
| 546 if (!in_params.IsValid() && !unit_test_is_active) { | |
| 547 DLOG(ERROR) << "Invalid native audio input parameters"; | |
| 548 return Abort( | |
| 549 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | |
| 550 } | |
| 551 | |
| 552 // Audio converter shall provide audio based on these parameters as output. | 592 // Audio converter shall provide audio based on these parameters as output. |
| 553 // Hard coded, WebSpeech specific parameters are utilized here. | 593 // Hard coded, WebSpeech specific parameters are utilized here. |
| 554 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; | 594 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
| 555 AudioParameters output_parameters = AudioParameters( | 595 AudioParameters output_parameters = AudioParameters( |
| 556 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, | 596 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
| 557 kNumBitsPerAudioSample, frames_per_buffer); | 597 kNumBitsPerAudioSample, frames_per_buffer); |
| 558 DVLOG(1) << "SRI::output_parameters: " | 598 DVLOG(1) << "SRI::output_parameters: " |
| 559 << output_parameters.AsHumanReadableString(); | 599 << output_parameters.AsHumanReadableString(); |
| 560 | 600 |
| 561 // Audio converter will receive audio based on these parameters as input. | 601 // Audio converter will receive audio based on these parameters as input. |
| 562 // On Windows we start by verifying that Core Audio is supported. If not, | 602 // On Windows we start by verifying that Core Audio is supported. If not, |
| 563 // the WaveIn API is used and we might as well avoid all audio conversions | 603 // the WaveIn API is used and we might as well avoid all audio conversions |
| 564 // since WaveIn does the conversion for us. | 604 // since WaveIn does the conversion for us. |
| 565 // TODO(henrika): this code should be moved to platform dependent audio | 605 // TODO(henrika): this code should be moved to platform dependent audio |
| 566 // managers. | 606 // managers. |
| 567 bool use_native_audio_params = true; | 607 bool use_native_audio_params = true; |
| 568 #if defined(OS_WIN) | 608 #if defined(OS_WIN) |
| 569 use_native_audio_params = media::CoreAudioUtil::IsSupported(); | 609 use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
| 570 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; | 610 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
| 571 #endif | 611 #endif |
| 572 | 612 |
| 573 AudioParameters input_parameters = output_parameters; | 613 AudioParameters input_parameters = output_parameters; |
| 574 if (use_native_audio_params && !unit_test_is_active) { | 614 |
| 615 // AUDIO_FAKE means we are running a test. |
| 616 if (use_native_audio_params && |
| 617 device_params_.format() != media::AudioParameters::AUDIO_FAKE) { |
| 575 // Use native audio parameters but avoid opening up at the native buffer | 618 // Use native audio parameters but avoid opening up at the native buffer |
| 576 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. | 619 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
| 577 // We rely on internal buffers in the audio back-end to fulfill this request | 620 // We rely on internal buffers in the audio back-end to fulfill this request |
| 578 // and the idea is to simplify the audio conversion since each Convert() | 621 // and the idea is to simplify the audio conversion since each Convert() |
| 579 // call will then render exactly one ProvideInput() call. | 622 // call will then render exactly one ProvideInput() call. |
| 580 // in_params.sample_rate() | 623 input_parameters = device_params_; |
| 581 input_parameters = in_params; | |
| 582 frames_per_buffer = | 624 frames_per_buffer = |
| 583 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; | 625 ((input_parameters.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
| 584 input_parameters.set_frames_per_buffer(frames_per_buffer); | 626 input_parameters.set_frames_per_buffer(frames_per_buffer); |
| 585 DVLOG(1) << "SRI::input_parameters: " | 627 DVLOG(1) << "SRI::input_parameters: " |
| 586 << input_parameters.AsHumanReadableString(); | 628 << input_parameters.AsHumanReadableString(); |
| 587 } | 629 } |
| 588 | 630 |
| 589 // Create an audio converter which converts data between native input format | 631 // Create an audio converter which converts data between native input format |
| 590 // and WebSpeech specific output format. | 632 // and WebSpeech specific output format. |
| 591 audio_converter_.reset( | 633 audio_converter_.reset( |
| 592 new OnDataConverter(input_parameters, output_parameters)); | 634 new OnDataConverter(input_parameters, output_parameters)); |
| 593 | 635 |
| 594 audio_controller_ = AudioInputController::Create( | 636 audio_controller_ = AudioInputController::Create( |
| 595 audio_manager, this, this, nullptr, nullptr, input_parameters, device_id_, | 637 GetAudioSystem()->GetAudioManager(), this, this, nullptr, nullptr, |
| 638 input_parameters, device_id_, |
| 596 /*agc_is_enabled*/ false); | 639 /*agc_is_enabled*/ false); |
| 597 | 640 |
| 598 if (!audio_controller_.get()) { | 641 if (!audio_controller_.get()) { |
| 599 return Abort( | 642 return Abort( |
| 600 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 643 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
| 601 } | 644 } |
| 602 | 645 |
| 603 audio_log_->OnCreated(0, input_parameters, device_id_); | 646 audio_log_->OnCreated(0, input_parameters, device_id_); |
| 604 | 647 |
| 605 // The endpointer needs to estimate the environment/background noise before | 648 // The endpointer needs to estimate the environment/background noise before |
| (...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 685 return Abort( | 728 return Abort( |
| 686 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 729 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
| 687 } else if (event_args.event == EVENT_ENGINE_ERROR) { | 730 } else if (event_args.event == EVENT_ENGINE_ERROR) { |
| 688 return Abort(event_args.engine_error); | 731 return Abort(event_args.engine_error); |
| 689 } | 732 } |
| 690 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); | 733 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); |
| 691 } | 734 } |
| 692 | 735 |
| 693 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( | 736 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
| 694 const SpeechRecognitionError& error) { | 737 const SpeechRecognitionError& error) { |
| 738 DCHECK_CURRENTLY_ON(BrowserThread::IO); |
| 739 |
| 695 if (IsCapturingAudio()) | 740 if (IsCapturingAudio()) |
| 696 CloseAudioControllerAsynchronously(); | 741 CloseAudioControllerAsynchronously(); |
| 697 | 742 |
| 698 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; | 743 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
| 699 | 744 |
| 745 if (state_ == STATE_PREPARING) { |
| 746 // Cancel an outstanding reply from AudioSystem. |
| 747 weak_ptr_factory_.InvalidateWeakPtrs(); |
| 748 } |
| 749 |
| 700 // The recognition engine is initialized only after STATE_STARTING. | 750 // The recognition engine is initialized only after STATE_STARTING. |
| 701 if (state_ > STATE_STARTING) { | 751 if (state_ > STATE_STARTING) { |
| 702 DCHECK(recognition_engine_.get() != NULL); | 752 DCHECK(recognition_engine_.get() != NULL); |
| 703 recognition_engine_->EndRecognition(); | 753 recognition_engine_->EndRecognition(); |
| 704 } | 754 } |
| 705 | 755 |
| 706 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) | 756 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
| 707 listener()->OnSoundEnd(session_id()); | 757 listener()->OnSoundEnd(session_id()); |
| 708 | 758 |
| 709 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) | 759 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
| (...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 826 | 876 |
| 827 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | 877 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| 828 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | 878 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 829 noise_level = std::min(std::max(0.0f, noise_level), | 879 noise_level = std::min(std::max(0.0f, noise_level), |
| 830 kAudioMeterRangeMaxUnclipped); | 880 kAudioMeterRangeMaxUnclipped); |
| 831 | 881 |
| 832 listener()->OnAudioLevelsChange( | 882 listener()->OnAudioLevelsChange( |
| 833 session_id(), clip_detected ? 1.0f : audio_level_, noise_level); | 883 session_id(), clip_detected ? 1.0f : audio_level_, noise_level); |
| 834 } | 884 } |
| 835 | 885 |
| 836 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 886 void SpeechRecognizerImpl::SetAudioSystemForTesting( |
| 837 AudioManager* audio_manager) { | 887 media::AudioSystem* audio_system) { |
| 838 audio_manager_for_tests_ = audio_manager; | 888 audio_system_for_tests_ = audio_system; |
| 889 } |
| 890 |
| 891 media::AudioSystem* SpeechRecognizerImpl::GetAudioSystem() { |
| 892 return audio_system_for_tests_ ? audio_system_for_tests_ : audio_system_; |
| 839 } | 893 } |
| 840 | 894 |
| 841 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | 895 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 842 : event(event_value), | 896 : event(event_value), |
| 843 audio_data(NULL), | 897 audio_data(NULL), |
| 844 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { | 898 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
| 845 } | 899 } |
| 846 | 900 |
| 847 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(const FSMEventArgs& other) = | 901 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(const FSMEventArgs& other) = |
| 848 default; | 902 default; |
| 849 | 903 |
| 850 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | 904 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| 851 } | 905 } |
| 852 | 906 |
| 853 } // namespace content | 907 } // namespace content |
| OLD | NEW |