Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
| 6 | 6 |
| 7 #include <stdint.h> | 7 #include <stdint.h> |
| 8 | 8 |
| 9 #include "base/bind.h" | 9 #include "base/bind.h" |
| 10 #include "base/macros.h" | 10 #include "base/macros.h" |
| 11 #include "base/time/time.h" | 11 #include "base/time/time.h" |
| 12 #include "build/build_config.h" | 12 #include "build/build_config.h" |
| 13 #include "content/browser/browser_main_loop.h" | 13 #include "content/browser/browser_main_loop.h" |
| 14 #include "content/browser/media/media_internals.h" | 14 #include "content/browser/media/media_internals.h" |
| 15 #include "content/browser/speech/audio_buffer.h" | 15 #include "content/browser/speech/audio_buffer.h" |
| 16 #include "content/public/browser/speech_recognition_event_listener.h" | 16 #include "content/public/browser/speech_recognition_event_listener.h" |
| 17 #include "media/audio/audio_manager.h" | |
| 18 #include "media/audio/audio_system.h" | |
| 17 #include "media/base/audio_converter.h" | 19 #include "media/base/audio_converter.h" |
| 18 | 20 |
| 19 #if defined(OS_WIN) | 21 #if defined(OS_WIN) |
| 20 #include "media/audio/win/core_audio_util_win.h" | 22 #include "media/audio/win/core_audio_util_win.h" |
| 21 #endif | 23 #endif |
| 22 | 24 |
| 23 using media::AudioBus; | 25 using media::AudioBus; |
| 24 using media::AudioConverter; | 26 using media::AudioConverter; |
| 25 using media::AudioInputController; | 27 using media::AudioInputController; |
| 26 using media::AudioManager; | 28 using media::AudioManager; |
| (...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 102 } | 104 } |
| 103 | 105 |
| 104 } // namespace | 106 } // namespace |
| 105 | 107 |
| 106 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 108 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
| 107 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = | 109 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = |
| 108 media::CHANNEL_LAYOUT_MONO; | 110 media::CHANNEL_LAYOUT_MONO; |
| 109 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 111 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
| 110 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; | 112 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
| 111 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 113 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
| 112 media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL; | 114 media::AudioSystem* SpeechRecognizerImpl::audio_system_for_tests_ = nullptr; |
| 113 | 115 |
| 114 static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, | 116 static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, |
| 115 "kNumBitsPerAudioSample must be a multiple of 8"); | 117 "kNumBitsPerAudioSample must be a multiple of 8"); |
| 116 | 118 |
| 117 // SpeechRecognizerImpl::OnDataConverter implementation | 119 // SpeechRecognizerImpl::OnDataConverter implementation |
| 118 | 120 |
| 119 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( | 121 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
| 120 const AudioParameters& input_params, | 122 const AudioParameters& input_params, |
| 121 const AudioParameters& output_params) | 123 const AudioParameters& output_params) |
| 122 : audio_converter_(input_params, output_params, false), | 124 : audio_converter_(input_params, output_params, false), |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 166 input_bus_->CopyTo(dest); | 168 input_bus_->CopyTo(dest); |
| 167 // Indicate that the recorded audio has in fact been used by the converter. | 169 // Indicate that the recorded audio has in fact been used by the converter. |
| 168 data_was_converted_ = true; | 170 data_was_converted_ = true; |
| 169 return 1; | 171 return 1; |
| 170 } | 172 } |
| 171 | 173 |
| 172 // SpeechRecognizerImpl implementation | 174 // SpeechRecognizerImpl implementation |
| 173 | 175 |
| 174 SpeechRecognizerImpl::SpeechRecognizerImpl( | 176 SpeechRecognizerImpl::SpeechRecognizerImpl( |
| 175 SpeechRecognitionEventListener* listener, | 177 SpeechRecognitionEventListener* listener, |
| 178 media::AudioSystem* audio_system, | |
| 176 int session_id, | 179 int session_id, |
| 177 bool continuous, | 180 bool continuous, |
| 178 bool provisional_results, | 181 bool provisional_results, |
| 179 SpeechRecognitionEngine* engine) | 182 SpeechRecognitionEngine* engine) |
| 180 : SpeechRecognizer(listener, session_id), | 183 : SpeechRecognizer(listener, session_id), |
| 184 audio_system_(audio_system), | |
| 181 recognition_engine_(engine), | 185 recognition_engine_(engine), |
| 182 endpointer_(kAudioSampleRate), | 186 endpointer_(kAudioSampleRate), |
| 183 audio_log_(MediaInternals::GetInstance()->CreateAudioLog( | 187 audio_log_(MediaInternals::GetInstance()->CreateAudioLog( |
| 184 media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)), | 188 media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)), |
| 185 is_dispatching_event_(false), | 189 is_dispatching_event_(false), |
| 186 provisional_results_(provisional_results), | 190 provisional_results_(provisional_results), |
| 187 end_of_utterance_(false), | 191 end_of_utterance_(false), |
| 188 state_(STATE_IDLE) { | 192 state_(STATE_IDLE), |
| 189 DCHECK(recognition_engine_ != NULL); | 193 weak_ptr_factory_(this) { |
| 194 DCHECK(recognition_engine_ != nullptr); | |
| 195 DCHECK(audio_system_ != nullptr); | |
| 190 if (!continuous) { | 196 if (!continuous) { |
| 191 // In single shot (non-continuous) recognition, | 197 // In single shot (non-continuous) recognition, |
| 192 // the session is automatically ended after: | 198 // the session is automatically ended after: |
| 193 // - 0.5 seconds of silence if time < 3 seconds | 199 // - 0.5 seconds of silence if time < 3 seconds |
| 194 // - 1 second of silence if time >= 3 seconds | 200 // - 1 second of silence if time >= 3 seconds |
| 195 endpointer_.set_speech_input_complete_silence_length( | 201 endpointer_.set_speech_input_complete_silence_length( |
| 196 base::Time::kMicrosecondsPerSecond / 2); | 202 base::Time::kMicrosecondsPerSecond / 2); |
| 197 endpointer_.set_long_speech_input_complete_silence_length( | 203 endpointer_.set_long_speech_input_complete_silence_length( |
| 198 base::Time::kMicrosecondsPerSecond); | 204 base::Time::kMicrosecondsPerSecond); |
| 199 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 205 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 213 // NOTE: all the external events and requests should be enqueued (PostTask), even | 219 // NOTE: all the external events and requests should be enqueued (PostTask), even |
| 214 // if they come from the same (IO) thread, in order to preserve the relationship | 220 // if they come from the same (IO) thread, in order to preserve the relationship |
| 215 // of causality between events and avoid interleaved event processing due to | 221 // of causality between events and avoid interleaved event processing due to |
| 216 // synchronous callbacks. | 222 // synchronous callbacks. |
| 217 | 223 |
| 218 void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { | 224 void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { |
| 219 DCHECK(!device_id.empty()); | 225 DCHECK(!device_id.empty()); |
| 220 device_id_ = device_id; | 226 device_id_ = device_id; |
| 221 | 227 |
| 222 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 228 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 223 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 229 base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, |
| 224 this, FSMEventArgs(EVENT_START))); | 230 FSMEventArgs(EVENT_PREPARE))); |
| 225 } | 231 } |
| 226 | 232 |
| 227 void SpeechRecognizerImpl::AbortRecognition() { | 233 void SpeechRecognizerImpl::AbortRecognition() { |
| 228 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 234 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 229 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 235 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 230 this, FSMEventArgs(EVENT_ABORT))); | 236 this, FSMEventArgs(EVENT_ABORT))); |
| 231 } | 237 } |
| 232 | 238 |
| 233 void SpeechRecognizerImpl::StopAudioCapture() { | 239 void SpeechRecognizerImpl::StopAudioCapture() { |
| 234 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 240 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| (...skipping 131 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 366 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( | 372 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( |
| 367 const FSMEventArgs& event_args) { | 373 const FSMEventArgs& event_args) { |
| 368 const FSMEvent event = event_args.event; | 374 const FSMEvent event = event_args.event; |
| 369 switch (state_) { | 375 switch (state_) { |
| 370 case STATE_IDLE: | 376 case STATE_IDLE: |
| 371 switch (event) { | 377 switch (event) { |
| 372 // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and | 378 // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and |
| 373 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. | 379 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. |
| 374 case EVENT_ABORT: | 380 case EVENT_ABORT: |
| 375 return AbortSilently(event_args); | 381 return AbortSilently(event_args); |
| 382 case EVENT_PREPARE: | |
| 383 return PrepareRecognition(event_args); | |
| 384 case EVENT_START: | |
| 385 return NotFeasible(event_args); | |
| 386 case EVENT_STOP_CAPTURE: | |
| 387 return AbortSilently(event_args); | |
| 388 case EVENT_AUDIO_DATA: // Corner cases related to queued messages | |
| 389 case EVENT_ENGINE_RESULT: // being lately dispatched. | |
| 390 case EVENT_ENGINE_ERROR: | |
| 391 case EVENT_AUDIO_ERROR: | |
| 392 return DoNothing(event_args); | |
| 393 } | |
| 394 break; | |
| 395 case STATE_PREPARING: | |
| 396 switch (event) { | |
| 397 case EVENT_ABORT: | |
| 398 return AbortSilently(event_args); | |
| 399 case EVENT_PREPARE: | |
| 400 return NotFeasible(event_args); | |
| 376 case EVENT_START: | 401 case EVENT_START: |
| 377 return StartRecording(event_args); | 402 return StartRecording(event_args); |
| 378 case EVENT_STOP_CAPTURE: | 403 case EVENT_STOP_CAPTURE: |
| 379 return AbortSilently(event_args); | 404 return AbortSilently(event_args); |
| 380 case EVENT_AUDIO_DATA: // Corner cases related to queued messages | 405 case EVENT_AUDIO_DATA: // Corner cases related to queued messages |
| 381 case EVENT_ENGINE_RESULT: // being lately dispatched. | 406 case EVENT_ENGINE_RESULT: // being lately dispatched. |
| 382 case EVENT_ENGINE_ERROR: | 407 case EVENT_ENGINE_ERROR: |
| 383 case EVENT_AUDIO_ERROR: | 408 case EVENT_AUDIO_ERROR: |
| 384 return DoNothing(event_args); | 409 return DoNothing(event_args); |
| 385 } | 410 } |
| 386 break; | 411 break; |
| 387 case STATE_STARTING: | 412 case STATE_STARTING: |
| 388 switch (event) { | 413 switch (event) { |
| 389 case EVENT_ABORT: | 414 case EVENT_ABORT: |
| 390 return AbortWithError(event_args); | 415 return AbortWithError(event_args); |
| 416 case EVENT_PREPARE: | |
| 417 return NotFeasible(event_args); | |
| 391 case EVENT_START: | 418 case EVENT_START: |
| 392 return NotFeasible(event_args); | 419 return NotFeasible(event_args); |
| 393 case EVENT_STOP_CAPTURE: | 420 case EVENT_STOP_CAPTURE: |
| 394 return AbortSilently(event_args); | 421 return AbortSilently(event_args); |
| 395 case EVENT_AUDIO_DATA: | 422 case EVENT_AUDIO_DATA: |
| 396 return StartRecognitionEngine(event_args); | 423 return StartRecognitionEngine(event_args); |
| 397 case EVENT_ENGINE_RESULT: | 424 case EVENT_ENGINE_RESULT: |
| 398 return NotFeasible(event_args); | 425 return NotFeasible(event_args); |
| 399 case EVENT_ENGINE_ERROR: | 426 case EVENT_ENGINE_ERROR: |
| 400 case EVENT_AUDIO_ERROR: | 427 case EVENT_AUDIO_ERROR: |
| 401 return AbortWithError(event_args); | 428 return AbortWithError(event_args); |
| 402 } | 429 } |
| 403 break; | 430 break; |
| 404 case STATE_ESTIMATING_ENVIRONMENT: | 431 case STATE_ESTIMATING_ENVIRONMENT: |
| 405 switch (event) { | 432 switch (event) { |
| 406 case EVENT_ABORT: | 433 case EVENT_ABORT: |
| 407 return AbortWithError(event_args); | 434 return AbortWithError(event_args); |
| 435 case EVENT_PREPARE: | |
| 436 return NotFeasible(event_args); | |
| 408 case EVENT_START: | 437 case EVENT_START: |
| 409 return NotFeasible(event_args); | 438 return NotFeasible(event_args); |
| 410 case EVENT_STOP_CAPTURE: | 439 case EVENT_STOP_CAPTURE: |
| 411 return StopCaptureAndWaitForResult(event_args); | 440 return StopCaptureAndWaitForResult(event_args); |
| 412 case EVENT_AUDIO_DATA: | 441 case EVENT_AUDIO_DATA: |
| 413 return WaitEnvironmentEstimationCompletion(event_args); | 442 return WaitEnvironmentEstimationCompletion(event_args); |
| 414 case EVENT_ENGINE_RESULT: | 443 case EVENT_ENGINE_RESULT: |
| 415 return ProcessIntermediateResult(event_args); | 444 return ProcessIntermediateResult(event_args); |
| 416 case EVENT_ENGINE_ERROR: | 445 case EVENT_ENGINE_ERROR: |
| 417 case EVENT_AUDIO_ERROR: | 446 case EVENT_AUDIO_ERROR: |
| 418 return AbortWithError(event_args); | 447 return AbortWithError(event_args); |
| 419 } | 448 } |
| 420 break; | 449 break; |
| 421 case STATE_WAITING_FOR_SPEECH: | 450 case STATE_WAITING_FOR_SPEECH: |
| 422 switch (event) { | 451 switch (event) { |
| 423 case EVENT_ABORT: | 452 case EVENT_ABORT: |
| 424 return AbortWithError(event_args); | 453 return AbortWithError(event_args); |
| 454 case EVENT_PREPARE: | |
| 455 return NotFeasible(event_args); | |
| 425 case EVENT_START: | 456 case EVENT_START: |
| 426 return NotFeasible(event_args); | 457 return NotFeasible(event_args); |
| 427 case EVENT_STOP_CAPTURE: | 458 case EVENT_STOP_CAPTURE: |
| 428 return StopCaptureAndWaitForResult(event_args); | 459 return StopCaptureAndWaitForResult(event_args); |
| 429 case EVENT_AUDIO_DATA: | 460 case EVENT_AUDIO_DATA: |
| 430 return DetectUserSpeechOrTimeout(event_args); | 461 return DetectUserSpeechOrTimeout(event_args); |
| 431 case EVENT_ENGINE_RESULT: | 462 case EVENT_ENGINE_RESULT: |
| 432 return ProcessIntermediateResult(event_args); | 463 return ProcessIntermediateResult(event_args); |
| 433 case EVENT_ENGINE_ERROR: | 464 case EVENT_ENGINE_ERROR: |
| 434 case EVENT_AUDIO_ERROR: | 465 case EVENT_AUDIO_ERROR: |
| 435 return AbortWithError(event_args); | 466 return AbortWithError(event_args); |
| 436 } | 467 } |
| 437 break; | 468 break; |
| 438 case STATE_RECOGNIZING: | 469 case STATE_RECOGNIZING: |
| 439 switch (event) { | 470 switch (event) { |
| 440 case EVENT_ABORT: | 471 case EVENT_ABORT: |
| 441 return AbortWithError(event_args); | 472 return AbortWithError(event_args); |
| 473 case EVENT_PREPARE: | |
| 474 return NotFeasible(event_args); | |
| 442 case EVENT_START: | 475 case EVENT_START: |
| 443 return NotFeasible(event_args); | 476 return NotFeasible(event_args); |
| 444 case EVENT_STOP_CAPTURE: | 477 case EVENT_STOP_CAPTURE: |
| 445 return StopCaptureAndWaitForResult(event_args); | 478 return StopCaptureAndWaitForResult(event_args); |
| 446 case EVENT_AUDIO_DATA: | 479 case EVENT_AUDIO_DATA: |
| 447 return DetectEndOfSpeech(event_args); | 480 return DetectEndOfSpeech(event_args); |
| 448 case EVENT_ENGINE_RESULT: | 481 case EVENT_ENGINE_RESULT: |
| 449 return ProcessIntermediateResult(event_args); | 482 return ProcessIntermediateResult(event_args); |
| 450 case EVENT_ENGINE_ERROR: | 483 case EVENT_ENGINE_ERROR: |
| 451 case EVENT_AUDIO_ERROR: | 484 case EVENT_AUDIO_ERROR: |
| 452 return AbortWithError(event_args); | 485 return AbortWithError(event_args); |
| 453 } | 486 } |
| 454 break; | 487 break; |
| 455 case STATE_WAITING_FINAL_RESULT: | 488 case STATE_WAITING_FINAL_RESULT: |
| 456 switch (event) { | 489 switch (event) { |
| 457 case EVENT_ABORT: | 490 case EVENT_ABORT: |
| 458 return AbortWithError(event_args); | 491 return AbortWithError(event_args); |
| 492 case EVENT_PREPARE: | |
| 493 return NotFeasible(event_args); | |
| 459 case EVENT_START: | 494 case EVENT_START: |
| 460 return NotFeasible(event_args); | 495 return NotFeasible(event_args); |
| 461 case EVENT_STOP_CAPTURE: | 496 case EVENT_STOP_CAPTURE: |
| 462 case EVENT_AUDIO_DATA: | 497 case EVENT_AUDIO_DATA: |
| 463 return DoNothing(event_args); | 498 return DoNothing(event_args); |
| 464 case EVENT_ENGINE_RESULT: | 499 case EVENT_ENGINE_RESULT: |
| 465 return ProcessFinalResult(event_args); | 500 return ProcessFinalResult(event_args); |
| 466 case EVENT_ENGINE_ERROR: | 501 case EVENT_ENGINE_ERROR: |
| 467 case EVENT_AUDIO_ERROR: | 502 case EVENT_AUDIO_ERROR: |
| 468 return AbortWithError(event_args); | 503 return AbortWithError(event_args); |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 505 if (route_to_vumeter) { | 540 if (route_to_vumeter) { |
| 506 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. | 541 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. |
| 507 UpdateSignalAndNoiseLevels(rms, clip_detected); | 542 UpdateSignalAndNoiseLevels(rms, clip_detected); |
| 508 } | 543 } |
| 509 if (route_to_sr_engine) { | 544 if (route_to_sr_engine) { |
| 510 DCHECK(recognition_engine_.get() != NULL); | 545 DCHECK(recognition_engine_.get() != NULL); |
| 511 recognition_engine_->TakeAudioChunk(raw_audio); | 546 recognition_engine_->TakeAudioChunk(raw_audio); |
| 512 } | 547 } |
| 513 } | 548 } |
| 514 | 549 |
| 550 void SpeechRecognizerImpl::OnDeviceInfo(const media::AudioParameters& params) { | |
| 551 DCHECK_CURRENTLY_ON(BrowserThread::IO); | |
| 552 device_params_ = params; | |
| 553 DVLOG(1) << "Device parameters: " << device_params_.AsHumanReadableString(); | |
| 554 DispatchEvent(FSMEventArgs(EVENT_START)); | |
| 555 } | |
| 556 | |
| 557 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::PrepareRecognition( | |
| 558 const FSMEventArgs&) { | |
| 559 DCHECK(state_ == STATE_IDLE); | |
| 560 DCHECK(recognition_engine_.get() != NULL); | |
| 561 DCHECK(!IsCapturingAudio()); | |
| 562 GetAudioSystem()->GetInputStreamParameters( | |
|
tommi (sloooow) - chröme
2017/02/02 16:27:18
Instead of adding AudioSystem etc, could you post
| |
| 563 device_id_, base::Bind(&SpeechRecognizerImpl::OnDeviceInfo, | |
| 564 weak_ptr_factory_.GetWeakPtr())); | |
| 565 | |
| 566 listener()->OnRecognitionStart(session_id()); | |
| 567 return STATE_PREPARING; | |
| 568 } | |
| 569 | |
| 515 SpeechRecognizerImpl::FSMState | 570 SpeechRecognizerImpl::FSMState |
| 516 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { | 571 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
| 517 DCHECK(state_ == STATE_IDLE); | 572 DCHECK(state_ == STATE_PREPARING); |
| 518 DCHECK(recognition_engine_.get() != NULL); | 573 DCHECK(recognition_engine_.get() != NULL); |
| 519 DCHECK(!IsCapturingAudio()); | 574 DCHECK(!IsCapturingAudio()); |
| 520 const bool unit_test_is_active = (audio_manager_for_tests_ != NULL); | |
| 521 AudioManager* audio_manager = unit_test_is_active ? | |
| 522 audio_manager_for_tests_ : | |
| 523 AudioManager::Get(); | |
| 524 DCHECK(audio_manager != NULL); | |
| 525 | 575 |
| 526 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; | 576 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
| 527 num_samples_recorded_ = 0; | 577 num_samples_recorded_ = 0; |
| 528 audio_level_ = 0; | 578 audio_level_ = 0; |
| 529 end_of_utterance_ = false; | 579 end_of_utterance_ = false; |
| 530 listener()->OnRecognitionStart(session_id()); | |
| 531 | 580 |
| 532 // TODO(xians): Check if the OS has the device with |device_id_|, return | 581 int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); |
| 533 // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist. | 582 |
| 534 if (!audio_manager->HasAudioInputDevices()) { | 583 if (!device_params_.IsValid()) { |
| 584 DLOG(ERROR) << "Audio input device not found"; | |
| 535 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE, | 585 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE, |
| 536 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); | 586 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
| 537 } | 587 } |
| 538 | 588 |
| 539 int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); | |
| 540 | |
| 541 AudioParameters in_params = audio_manager->GetInputStreamParameters( | |
| 542 device_id_); | |
| 543 if (!in_params.IsValid() && !unit_test_is_active) { | |
| 544 DLOG(ERROR) << "Invalid native audio input parameters"; | |
| 545 return Abort( | |
| 546 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | |
| 547 } | |
| 548 | |
| 549 // Audio converter shall provide audio based on these parameters as output. | 589 // Audio converter shall provide audio based on these parameters as output. |
| 550 // Hard coded, WebSpeech specific parameters are utilized here. | 590 // Hard coded, WebSpeech specific parameters are utilized here. |
| 551 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; | 591 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
| 552 AudioParameters output_parameters = AudioParameters( | 592 AudioParameters output_parameters = AudioParameters( |
| 553 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, | 593 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
| 554 kNumBitsPerAudioSample, frames_per_buffer); | 594 kNumBitsPerAudioSample, frames_per_buffer); |
| 555 DVLOG(1) << "SRI::output_parameters: " | 595 DVLOG(1) << "SRI::output_parameters: " |
| 556 << output_parameters.AsHumanReadableString(); | 596 << output_parameters.AsHumanReadableString(); |
| 557 | 597 |
| 558 // Audio converter will receive audio based on these parameters as input. | 598 // Audio converter will receive audio based on these parameters as input. |
| 559 // On Windows we start by verifying that Core Audio is supported. If not, | 599 // On Windows we start by verifying that Core Audio is supported. If not, |
| 560 // the WaveIn API is used and we might as well avoid all audio conversions | 600 // the WaveIn API is used and we might as well avoid all audio conversions |
| 561 // since WaveIn does the conversion for us. | 601 // since WaveIn does the conversion for us. |
| 562 // TODO(henrika): this code should be moved to platform dependent audio | 602 // TODO(henrika): this code should be moved to platform dependent audio |
| 563 // managers. | 603 // managers. |
| 564 bool use_native_audio_params = true; | 604 bool use_native_audio_params = true; |
| 565 #if defined(OS_WIN) | 605 #if defined(OS_WIN) |
| 566 use_native_audio_params = media::CoreAudioUtil::IsSupported(); | 606 use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
| 567 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; | 607 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
| 568 #endif | 608 #endif |
| 569 | 609 |
| 570 AudioParameters input_parameters = output_parameters; | 610 AudioParameters input_parameters = output_parameters; |
| 571 if (use_native_audio_params && !unit_test_is_active) { | 611 |
| 612 // AUDIO_FAKE means we are running a test. | |
| 613 if (use_native_audio_params && | |
| 614 device_params_.format() != media::AudioParameters::AUDIO_FAKE) { | |
| 572 // Use native audio parameters but avoid opening up at the native buffer | 615 // Use native audio parameters but avoid opening up at the native buffer |
| 573 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. | 616 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
| 574 // We rely on internal buffers in the audio back-end to fulfill this request | 617 // We rely on internal buffers in the audio back-end to fulfill this request |
| 575 // and the idea is to simplify the audio conversion since each Convert() | 618 // and the idea is to simplify the audio conversion since each Convert() |
| 576 // call will then render exactly one ProvideInput() call. | 619 // call will then render exactly one ProvideInput() call. |
| 577 // in_params.sample_rate() | 620 input_parameters = device_params_; |
| 578 input_parameters = in_params; | |
| 579 frames_per_buffer = | 621 frames_per_buffer = |
| 580 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; | 622 ((input_parameters.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
| 581 input_parameters.set_frames_per_buffer(frames_per_buffer); | 623 input_parameters.set_frames_per_buffer(frames_per_buffer); |
| 582 DVLOG(1) << "SRI::input_parameters: " | 624 DVLOG(1) << "SRI::input_parameters: " |
| 583 << input_parameters.AsHumanReadableString(); | 625 << input_parameters.AsHumanReadableString(); |
| 584 } | 626 } |
| 585 | 627 |
| 586 // Create an audio converter which converts data between native input format | 628 // Create an audio converter which converts data between native input format |
| 587 // and WebSpeech specific output format. | 629 // and WebSpeech specific output format. |
| 588 audio_converter_.reset( | 630 audio_converter_.reset( |
| 589 new OnDataConverter(input_parameters, output_parameters)); | 631 new OnDataConverter(input_parameters, output_parameters)); |
| 590 | 632 |
| 591 audio_controller_ = AudioInputController::Create( | 633 audio_controller_ = |
| 592 audio_manager, this, this, input_parameters, device_id_, NULL); | 634 AudioInputController::Create(GetAudioSystem()->GetAudioManager(), this, |
| 635 this, input_parameters, device_id_, NULL); | |
| 593 | 636 |
| 594 if (!audio_controller_.get()) { | 637 if (!audio_controller_.get()) { |
| 595 return Abort( | 638 return Abort( |
| 596 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 639 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
| 597 } | 640 } |
| 598 | 641 |
| 599 audio_log_->OnCreated(0, input_parameters, device_id_); | 642 audio_log_->OnCreated(0, input_parameters, device_id_); |
| 600 | 643 |
| 601 // The endpointer needs to estimate the environment/background noise before | 644 // The endpointer needs to estimate the environment/background noise before |
| 602 // starting to treat the audio as user input. We wait in the state | 645 // starting to treat the audio as user input. We wait in the state |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 681 return Abort( | 724 return Abort( |
| 682 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 725 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
| 683 } else if (event_args.event == EVENT_ENGINE_ERROR) { | 726 } else if (event_args.event == EVENT_ENGINE_ERROR) { |
| 684 return Abort(event_args.engine_error); | 727 return Abort(event_args.engine_error); |
| 685 } | 728 } |
| 686 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); | 729 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); |
| 687 } | 730 } |
| 688 | 731 |
| 689 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( | 732 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
| 690 const SpeechRecognitionError& error) { | 733 const SpeechRecognitionError& error) { |
| 734 DCHECK_CURRENTLY_ON(BrowserThread::IO); | |
| 735 | |
| 691 if (IsCapturingAudio()) | 736 if (IsCapturingAudio()) |
| 692 CloseAudioControllerAsynchronously(); | 737 CloseAudioControllerAsynchronously(); |
| 693 | 738 |
| 694 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; | 739 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
| 695 | 740 |
| 741 if (state_ == STATE_PREPARING) { | |
| 742 // Cancel an outstanding reply from AudioSystem. | |
| 743 weak_ptr_factory_.InvalidateWeakPtrs(); | |
| 744 } | |
| 745 | |
| 696 // The recognition engine is initialized only after STATE_STARTING. | 746 // The recognition engine is initialized only after STATE_STARTING. |
| 697 if (state_ > STATE_STARTING) { | 747 if (state_ > STATE_STARTING) { |
| 698 DCHECK(recognition_engine_.get() != NULL); | 748 DCHECK(recognition_engine_.get() != NULL); |
| 699 recognition_engine_->EndRecognition(); | 749 recognition_engine_->EndRecognition(); |
| 700 } | 750 } |
| 701 | 751 |
| 702 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) | 752 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
| 703 listener()->OnSoundEnd(session_id()); | 753 listener()->OnSoundEnd(session_id()); |
| 704 | 754 |
| 705 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) | 755 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
| (...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 822 | 872 |
| 823 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | 873 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| 824 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | 874 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 825 noise_level = std::min(std::max(0.0f, noise_level), | 875 noise_level = std::min(std::max(0.0f, noise_level), |
| 826 kAudioMeterRangeMaxUnclipped); | 876 kAudioMeterRangeMaxUnclipped); |
| 827 | 877 |
| 828 listener()->OnAudioLevelsChange( | 878 listener()->OnAudioLevelsChange( |
| 829 session_id(), clip_detected ? 1.0f : audio_level_, noise_level); | 879 session_id(), clip_detected ? 1.0f : audio_level_, noise_level); |
| 830 } | 880 } |
| 831 | 881 |
| 832 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 882 void SpeechRecognizerImpl::SetAudioSystemForTesting( |
| 833 AudioManager* audio_manager) { | 883 media::AudioSystem* audio_system) { |
| 834 audio_manager_for_tests_ = audio_manager; | 884 audio_system_for_tests_ = audio_system; |
| 885 } | |
| 886 | |
| 887 media::AudioSystem* SpeechRecognizerImpl::GetAudioSystem() { | |
| 888 return audio_system_for_tests_ ? audio_system_for_tests_ : audio_system_; | |
| 835 } | 889 } |
| 836 | 890 |
| 837 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | 891 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 838 : event(event_value), | 892 : event(event_value), |
| 839 audio_data(NULL), | 893 audio_data(NULL), |
| 840 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { | 894 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
| 841 } | 895 } |
| 842 | 896 |
| 843 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(const FSMEventArgs& other) = | 897 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(const FSMEventArgs& other) = |
| 844 default; | 898 default; |
| 845 | 899 |
| 846 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | 900 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| 847 } | 901 } |
| 848 | 902 |
| 849 } // namespace content | 903 } // namespace content |
| OLD | NEW |