OLD | NEW |
---|---|
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
6 | 6 |
7 #include <stdint.h> | 7 #include <stdint.h> |
8 | 8 |
9 #include <algorithm> | 9 #include <algorithm> |
10 | 10 |
11 #include "base/bind.h" | 11 #include "base/bind.h" |
12 #include "base/macros.h" | 12 #include "base/macros.h" |
13 #include "base/time/time.h" | 13 #include "base/time/time.h" |
14 #include "build/build_config.h" | 14 #include "build/build_config.h" |
15 #include "content/browser/browser_main_loop.h" | 15 #include "content/browser/browser_main_loop.h" |
16 #include "content/browser/media/media_internals.h" | 16 #include "content/browser/media/media_internals.h" |
17 #include "content/browser/speech/audio_buffer.h" | 17 #include "content/browser/speech/audio_buffer.h" |
18 #include "content/public/browser/speech_recognition_event_listener.h" | 18 #include "content/public/browser/speech_recognition_event_listener.h" |
19 #include "media/audio/audio_file_writer.h" | 19 #include "media/audio/audio_file_writer.h" |
20 #include "media/audio/audio_manager.h" | |
21 #include "media/audio/audio_system.h" | |
20 #include "media/base/audio_converter.h" | 22 #include "media/base/audio_converter.h" |
21 | 23 |
22 #if defined(OS_WIN) | 24 #if defined(OS_WIN) |
23 #include "media/audio/win/core_audio_util_win.h" | 25 #include "media/audio/win/core_audio_util_win.h" |
24 #endif | 26 #endif |
25 | 27 |
26 using media::AudioBus; | 28 using media::AudioBus; |
27 using media::AudioConverter; | 29 using media::AudioConverter; |
28 using media::AudioInputController; | 30 using media::AudioInputController; |
29 using media::AudioManager; | 31 using media::AudioManager; |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
105 } | 107 } |
106 | 108 |
107 } // namespace | 109 } // namespace |
108 | 110 |
109 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 111 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
110 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = | 112 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = |
111 media::CHANNEL_LAYOUT_MONO; | 113 media::CHANNEL_LAYOUT_MONO; |
112 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 114 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
113 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; | 115 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
114 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 116 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
115 media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL; | 117 media::AudioSystem* SpeechRecognizerImpl::audio_system_for_tests_ = nullptr; |
tommi (sloooow) - chröme
2017/02/05 20:14:54
Just checking - Is it still necessary for this to
o1ka
2017/02/06 12:06:08
Yes, it's used in speech_recognition_browsertest.c
| |
116 | 118 |
117 static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, | 119 static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, |
118 "kNumBitsPerAudioSample must be a multiple of 8"); | 120 "kNumBitsPerAudioSample must be a multiple of 8"); |
119 | 121 |
120 // SpeechRecognizerImpl::OnDataConverter implementation | 122 // SpeechRecognizerImpl::OnDataConverter implementation |
121 | 123 |
122 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( | 124 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
123 const AudioParameters& input_params, | 125 const AudioParameters& input_params, |
124 const AudioParameters& output_params) | 126 const AudioParameters& output_params) |
125 : audio_converter_(input_params, output_params, false), | 127 : audio_converter_(input_params, output_params, false), |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
169 input_bus_->CopyTo(dest); | 171 input_bus_->CopyTo(dest); |
170 // Indicate that the recorded audio has in fact been used by the converter. | 172 // Indicate that the recorded audio has in fact been used by the converter. |
171 data_was_converted_ = true; | 173 data_was_converted_ = true; |
172 return 1; | 174 return 1; |
173 } | 175 } |
174 | 176 |
175 // SpeechRecognizerImpl implementation | 177 // SpeechRecognizerImpl implementation |
176 | 178 |
177 SpeechRecognizerImpl::SpeechRecognizerImpl( | 179 SpeechRecognizerImpl::SpeechRecognizerImpl( |
178 SpeechRecognitionEventListener* listener, | 180 SpeechRecognitionEventListener* listener, |
181 media::AudioSystem* audio_system, | |
179 int session_id, | 182 int session_id, |
180 bool continuous, | 183 bool continuous, |
181 bool provisional_results, | 184 bool provisional_results, |
182 SpeechRecognitionEngine* engine) | 185 SpeechRecognitionEngine* engine) |
183 : SpeechRecognizer(listener, session_id), | 186 : SpeechRecognizer(listener, session_id), |
187 audio_system_(audio_system), | |
184 recognition_engine_(engine), | 188 recognition_engine_(engine), |
185 endpointer_(kAudioSampleRate), | 189 endpointer_(kAudioSampleRate), |
186 audio_log_(MediaInternals::GetInstance()->CreateAudioLog( | 190 audio_log_(MediaInternals::GetInstance()->CreateAudioLog( |
187 media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)), | 191 media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)), |
188 is_dispatching_event_(false), | 192 is_dispatching_event_(false), |
189 provisional_results_(provisional_results), | 193 provisional_results_(provisional_results), |
190 end_of_utterance_(false), | 194 end_of_utterance_(false), |
191 state_(STATE_IDLE) { | 195 state_(STATE_IDLE), |
192 DCHECK(recognition_engine_ != NULL); | 196 weak_ptr_factory_(this) { |
197 DCHECK(recognition_engine_ != nullptr); | |
198 DCHECK(audio_system_ != nullptr); | |
193 if (!continuous) { | 199 if (!continuous) { |
194 // In single shot (non-continuous) recognition, | 200 // In single shot (non-continuous) recognition, |
195 // the session is automatically ended after: | 201 // the session is automatically ended after: |
196 // - 0.5 seconds of silence if time < 3 seconds | 202 // - 0.5 seconds of silence if time < 3 seconds |
197 // - 1 second of silence if time >= 3 seconds | 203 // - 1 second of silence if time >= 3 seconds |
198 endpointer_.set_speech_input_complete_silence_length( | 204 endpointer_.set_speech_input_complete_silence_length( |
199 base::Time::kMicrosecondsPerSecond / 2); | 205 base::Time::kMicrosecondsPerSecond / 2); |
200 endpointer_.set_long_speech_input_complete_silence_length( | 206 endpointer_.set_long_speech_input_complete_silence_length( |
201 base::Time::kMicrosecondsPerSecond); | 207 base::Time::kMicrosecondsPerSecond); |
202 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 208 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
(...skipping 13 matching lines...) Expand all Loading... | |
216 // NOTE: all the external events and requests should be enqueued (PostTask), even | 222 // NOTE: all the external events and requests should be enqueued (PostTask), even |
217 // if they come from the same (IO) thread, in order to preserve the relationship | 223 // if they come from the same (IO) thread, in order to preserve the relationship |
218 // of causality between events and avoid interleaved event processing due to | 224 // of causality between events and avoid interleaved event processing due to |
219 // synchronous callbacks. | 225 // synchronous callbacks. |
220 | 226 |
221 void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { | 227 void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { |
222 DCHECK(!device_id.empty()); | 228 DCHECK(!device_id.empty()); |
223 device_id_ = device_id; | 229 device_id_ = device_id; |
224 | 230 |
225 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 231 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
226 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 232 base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, |
227 this, FSMEventArgs(EVENT_START))); | 233 FSMEventArgs(EVENT_PREPARE))); |
228 } | 234 } |
229 | 235 |
230 void SpeechRecognizerImpl::AbortRecognition() { | 236 void SpeechRecognizerImpl::AbortRecognition() { |
231 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 237 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
232 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 238 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
233 this, FSMEventArgs(EVENT_ABORT))); | 239 this, FSMEventArgs(EVENT_ABORT))); |
234 } | 240 } |
235 | 241 |
236 void SpeechRecognizerImpl::StopAudioCapture() { | 242 void SpeechRecognizerImpl::StopAudioCapture() { |
237 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 243 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
(...skipping 131 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
369 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( | 375 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( |
370 const FSMEventArgs& event_args) { | 376 const FSMEventArgs& event_args) { |
371 const FSMEvent event = event_args.event; | 377 const FSMEvent event = event_args.event; |
372 switch (state_) { | 378 switch (state_) { |
373 case STATE_IDLE: | 379 case STATE_IDLE: |
374 switch (event) { | 380 switch (event) { |
375 // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and | 381 // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and |
376 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. | 382 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. |
377 case EVENT_ABORT: | 383 case EVENT_ABORT: |
378 return AbortSilently(event_args); | 384 return AbortSilently(event_args); |
385 case EVENT_PREPARE: | |
386 return PrepareRecognition(event_args); | |
387 case EVENT_START: | |
388 return NotFeasible(event_args); | |
389 case EVENT_STOP_CAPTURE: | |
390 return AbortSilently(event_args); | |
391 case EVENT_AUDIO_DATA: // Corner cases related to queued messages | |
392 case EVENT_ENGINE_RESULT: // being lately dispatched. | |
393 case EVENT_ENGINE_ERROR: | |
394 case EVENT_AUDIO_ERROR: | |
395 return DoNothing(event_args); | |
396 } | |
397 break; | |
398 case STATE_PREPARING: | |
399 switch (event) { | |
400 case EVENT_ABORT: | |
401 return AbortSilently(event_args); | |
402 case EVENT_PREPARE: | |
403 return NotFeasible(event_args); | |
379 case EVENT_START: | 404 case EVENT_START: |
380 return StartRecording(event_args); | 405 return StartRecording(event_args); |
381 case EVENT_STOP_CAPTURE: | 406 case EVENT_STOP_CAPTURE: |
382 return AbortSilently(event_args); | 407 return AbortSilently(event_args); |
383 case EVENT_AUDIO_DATA: // Corner cases related to queued messages | 408 case EVENT_AUDIO_DATA: // Corner cases related to queued messages |
384 case EVENT_ENGINE_RESULT: // being lately dispatched. | 409 case EVENT_ENGINE_RESULT: // being lately dispatched. |
385 case EVENT_ENGINE_ERROR: | 410 case EVENT_ENGINE_ERROR: |
386 case EVENT_AUDIO_ERROR: | 411 case EVENT_AUDIO_ERROR: |
387 return DoNothing(event_args); | 412 return DoNothing(event_args); |
388 } | 413 } |
389 break; | 414 break; |
390 case STATE_STARTING: | 415 case STATE_STARTING: |
391 switch (event) { | 416 switch (event) { |
392 case EVENT_ABORT: | 417 case EVENT_ABORT: |
393 return AbortWithError(event_args); | 418 return AbortWithError(event_args); |
419 case EVENT_PREPARE: | |
420 return NotFeasible(event_args); | |
394 case EVENT_START: | 421 case EVENT_START: |
395 return NotFeasible(event_args); | 422 return NotFeasible(event_args); |
396 case EVENT_STOP_CAPTURE: | 423 case EVENT_STOP_CAPTURE: |
397 return AbortSilently(event_args); | 424 return AbortSilently(event_args); |
398 case EVENT_AUDIO_DATA: | 425 case EVENT_AUDIO_DATA: |
399 return StartRecognitionEngine(event_args); | 426 return StartRecognitionEngine(event_args); |
400 case EVENT_ENGINE_RESULT: | 427 case EVENT_ENGINE_RESULT: |
401 return NotFeasible(event_args); | 428 return NotFeasible(event_args); |
402 case EVENT_ENGINE_ERROR: | 429 case EVENT_ENGINE_ERROR: |
403 case EVENT_AUDIO_ERROR: | 430 case EVENT_AUDIO_ERROR: |
404 return AbortWithError(event_args); | 431 return AbortWithError(event_args); |
405 } | 432 } |
406 break; | 433 break; |
407 case STATE_ESTIMATING_ENVIRONMENT: | 434 case STATE_ESTIMATING_ENVIRONMENT: |
408 switch (event) { | 435 switch (event) { |
409 case EVENT_ABORT: | 436 case EVENT_ABORT: |
410 return AbortWithError(event_args); | 437 return AbortWithError(event_args); |
438 case EVENT_PREPARE: | |
439 return NotFeasible(event_args); | |
411 case EVENT_START: | 440 case EVENT_START: |
412 return NotFeasible(event_args); | 441 return NotFeasible(event_args); |
413 case EVENT_STOP_CAPTURE: | 442 case EVENT_STOP_CAPTURE: |
414 return StopCaptureAndWaitForResult(event_args); | 443 return StopCaptureAndWaitForResult(event_args); |
415 case EVENT_AUDIO_DATA: | 444 case EVENT_AUDIO_DATA: |
416 return WaitEnvironmentEstimationCompletion(event_args); | 445 return WaitEnvironmentEstimationCompletion(event_args); |
417 case EVENT_ENGINE_RESULT: | 446 case EVENT_ENGINE_RESULT: |
418 return ProcessIntermediateResult(event_args); | 447 return ProcessIntermediateResult(event_args); |
419 case EVENT_ENGINE_ERROR: | 448 case EVENT_ENGINE_ERROR: |
420 case EVENT_AUDIO_ERROR: | 449 case EVENT_AUDIO_ERROR: |
421 return AbortWithError(event_args); | 450 return AbortWithError(event_args); |
422 } | 451 } |
423 break; | 452 break; |
424 case STATE_WAITING_FOR_SPEECH: | 453 case STATE_WAITING_FOR_SPEECH: |
425 switch (event) { | 454 switch (event) { |
426 case EVENT_ABORT: | 455 case EVENT_ABORT: |
427 return AbortWithError(event_args); | 456 return AbortWithError(event_args); |
457 case EVENT_PREPARE: | |
458 return NotFeasible(event_args); | |
428 case EVENT_START: | 459 case EVENT_START: |
429 return NotFeasible(event_args); | 460 return NotFeasible(event_args); |
430 case EVENT_STOP_CAPTURE: | 461 case EVENT_STOP_CAPTURE: |
431 return StopCaptureAndWaitForResult(event_args); | 462 return StopCaptureAndWaitForResult(event_args); |
432 case EVENT_AUDIO_DATA: | 463 case EVENT_AUDIO_DATA: |
433 return DetectUserSpeechOrTimeout(event_args); | 464 return DetectUserSpeechOrTimeout(event_args); |
434 case EVENT_ENGINE_RESULT: | 465 case EVENT_ENGINE_RESULT: |
435 return ProcessIntermediateResult(event_args); | 466 return ProcessIntermediateResult(event_args); |
436 case EVENT_ENGINE_ERROR: | 467 case EVENT_ENGINE_ERROR: |
437 case EVENT_AUDIO_ERROR: | 468 case EVENT_AUDIO_ERROR: |
438 return AbortWithError(event_args); | 469 return AbortWithError(event_args); |
439 } | 470 } |
440 break; | 471 break; |
441 case STATE_RECOGNIZING: | 472 case STATE_RECOGNIZING: |
442 switch (event) { | 473 switch (event) { |
443 case EVENT_ABORT: | 474 case EVENT_ABORT: |
444 return AbortWithError(event_args); | 475 return AbortWithError(event_args); |
476 case EVENT_PREPARE: | |
477 return NotFeasible(event_args); | |
445 case EVENT_START: | 478 case EVENT_START: |
446 return NotFeasible(event_args); | 479 return NotFeasible(event_args); |
447 case EVENT_STOP_CAPTURE: | 480 case EVENT_STOP_CAPTURE: |
448 return StopCaptureAndWaitForResult(event_args); | 481 return StopCaptureAndWaitForResult(event_args); |
449 case EVENT_AUDIO_DATA: | 482 case EVENT_AUDIO_DATA: |
450 return DetectEndOfSpeech(event_args); | 483 return DetectEndOfSpeech(event_args); |
451 case EVENT_ENGINE_RESULT: | 484 case EVENT_ENGINE_RESULT: |
452 return ProcessIntermediateResult(event_args); | 485 return ProcessIntermediateResult(event_args); |
453 case EVENT_ENGINE_ERROR: | 486 case EVENT_ENGINE_ERROR: |
454 case EVENT_AUDIO_ERROR: | 487 case EVENT_AUDIO_ERROR: |
455 return AbortWithError(event_args); | 488 return AbortWithError(event_args); |
456 } | 489 } |
457 break; | 490 break; |
458 case STATE_WAITING_FINAL_RESULT: | 491 case STATE_WAITING_FINAL_RESULT: |
459 switch (event) { | 492 switch (event) { |
460 case EVENT_ABORT: | 493 case EVENT_ABORT: |
461 return AbortWithError(event_args); | 494 return AbortWithError(event_args); |
495 case EVENT_PREPARE: | |
496 return NotFeasible(event_args); | |
462 case EVENT_START: | 497 case EVENT_START: |
463 return NotFeasible(event_args); | 498 return NotFeasible(event_args); |
464 case EVENT_STOP_CAPTURE: | 499 case EVENT_STOP_CAPTURE: |
465 case EVENT_AUDIO_DATA: | 500 case EVENT_AUDIO_DATA: |
466 return DoNothing(event_args); | 501 return DoNothing(event_args); |
467 case EVENT_ENGINE_RESULT: | 502 case EVENT_ENGINE_RESULT: |
468 return ProcessFinalResult(event_args); | 503 return ProcessFinalResult(event_args); |
469 case EVENT_ENGINE_ERROR: | 504 case EVENT_ENGINE_ERROR: |
470 case EVENT_AUDIO_ERROR: | 505 case EVENT_AUDIO_ERROR: |
471 return AbortWithError(event_args); | 506 return AbortWithError(event_args); |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
508 if (route_to_vumeter) { | 543 if (route_to_vumeter) { |
509 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. | 544 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. |
510 UpdateSignalAndNoiseLevels(rms, clip_detected); | 545 UpdateSignalAndNoiseLevels(rms, clip_detected); |
511 } | 546 } |
512 if (route_to_sr_engine) { | 547 if (route_to_sr_engine) { |
513 DCHECK(recognition_engine_.get() != NULL); | 548 DCHECK(recognition_engine_.get() != NULL); |
514 recognition_engine_->TakeAudioChunk(raw_audio); | 549 recognition_engine_->TakeAudioChunk(raw_audio); |
515 } | 550 } |
516 } | 551 } |
517 | 552 |
553 void SpeechRecognizerImpl::OnDeviceInfo(const media::AudioParameters& params) { | |
554 DCHECK_CURRENTLY_ON(BrowserThread::IO); | |
555 device_params_ = params; | |
556 DVLOG(1) << "Device parameters: " << device_params_.AsHumanReadableString(); | |
557 DispatchEvent(FSMEventArgs(EVENT_START)); | |
558 } | |
559 | |
560 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::PrepareRecognition( | |
561 const FSMEventArgs&) { | |
562 DCHECK(state_ == STATE_IDLE); | |
563 DCHECK(recognition_engine_.get() != NULL); | |
564 DCHECK(!IsCapturingAudio()); | |
565 GetAudioSystem()->GetInputStreamParameters( | |
566 device_id_, base::Bind(&SpeechRecognizerImpl::OnDeviceInfo, | |
567 weak_ptr_factory_.GetWeakPtr())); | |
568 | |
569 listener()->OnRecognitionStart(session_id()); | |
570 return STATE_PREPARING; | |
571 } | |
572 | |
518 SpeechRecognizerImpl::FSMState | 573 SpeechRecognizerImpl::FSMState |
519 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { | 574 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
520 DCHECK(state_ == STATE_IDLE); | 575 DCHECK(state_ == STATE_PREPARING); |
521 DCHECK(recognition_engine_.get() != NULL); | 576 DCHECK(recognition_engine_.get() != NULL); |
522 DCHECK(!IsCapturingAudio()); | 577 DCHECK(!IsCapturingAudio()); |
523 const bool unit_test_is_active = (audio_manager_for_tests_ != NULL); | |
524 AudioManager* audio_manager = unit_test_is_active ? | |
525 audio_manager_for_tests_ : | |
526 AudioManager::Get(); | |
527 DCHECK(audio_manager != NULL); | |
528 | 578 |
529 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; | 579 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
530 num_samples_recorded_ = 0; | 580 num_samples_recorded_ = 0; |
531 audio_level_ = 0; | 581 audio_level_ = 0; |
532 end_of_utterance_ = false; | 582 end_of_utterance_ = false; |
533 listener()->OnRecognitionStart(session_id()); | |
534 | 583 |
535 // TODO(xians): Check if the OS has the device with |device_id_|, return | 584 int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); |
536 // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist. | 585 |
537 if (!audio_manager->HasAudioInputDevices()) { | 586 if (!device_params_.IsValid()) { |
587 DLOG(ERROR) << "Audio input device not found"; | |
538 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE, | 588 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE, |
539 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); | 589 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
540 } | 590 } |
541 | 591 |
542 int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); | |
543 | |
544 AudioParameters in_params = audio_manager->GetInputStreamParameters( | |
545 device_id_); | |
546 if (!in_params.IsValid() && !unit_test_is_active) { | |
547 DLOG(ERROR) << "Invalid native audio input parameters"; | |
548 return Abort( | |
549 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | |
550 } | |
551 | |
552 // Audio converter shall provide audio based on these parameters as output. | 592 // Audio converter shall provide audio based on these parameters as output. |
553 // Hard coded, WebSpeech specific parameters are utilized here. | 593 // Hard coded, WebSpeech specific parameters are utilized here. |
554 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; | 594 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
555 AudioParameters output_parameters = AudioParameters( | 595 AudioParameters output_parameters = AudioParameters( |
556 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, | 596 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
557 kNumBitsPerAudioSample, frames_per_buffer); | 597 kNumBitsPerAudioSample, frames_per_buffer); |
558 DVLOG(1) << "SRI::output_parameters: " | 598 DVLOG(1) << "SRI::output_parameters: " |
559 << output_parameters.AsHumanReadableString(); | 599 << output_parameters.AsHumanReadableString(); |
560 | 600 |
561 // Audio converter will receive audio based on these parameters as input. | 601 // Audio converter will receive audio based on these parameters as input. |
562 // On Windows we start by verifying that Core Audio is supported. If not, | 602 // On Windows we start by verifying that Core Audio is supported. If not, |
563 // the WaveIn API is used and we might as well avoid all audio conversions | 603 // the WaveIn API is used and we might as well avoid all audio conversions |
564 // since WaveIn does the conversion for us. | 604 // since WaveIn does the conversion for us. |
565 // TODO(henrika): this code should be moved to platform dependent audio | 605 // TODO(henrika): this code should be moved to platform dependent audio |
566 // managers. | 606 // managers. |
567 bool use_native_audio_params = true; | 607 bool use_native_audio_params = true; |
568 #if defined(OS_WIN) | 608 #if defined(OS_WIN) |
569 use_native_audio_params = media::CoreAudioUtil::IsSupported(); | 609 use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
570 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; | 610 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
571 #endif | 611 #endif |
572 | 612 |
573 AudioParameters input_parameters = output_parameters; | 613 AudioParameters input_parameters = output_parameters; |
574 if (use_native_audio_params && !unit_test_is_active) { | 614 |
615 // AUDIO_FAKE means we are running a test. | |
616 if (use_native_audio_params && | |
617 device_params_.format() != media::AudioParameters::AUDIO_FAKE) { | |
575 // Use native audio parameters but avoid opening up at the native buffer | 618 // Use native audio parameters but avoid opening up at the native buffer |
576 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. | 619 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
577 // We rely on internal buffers in the audio back-end to fulfill this request | 620 // We rely on internal buffers in the audio back-end to fulfill this request |
578 // and the idea is to simplify the audio conversion since each Convert() | 621 // and the idea is to simplify the audio conversion since each Convert() |
579 // call will then render exactly one ProvideInput() call. | 622 // call will then render exactly one ProvideInput() call. |
580 // in_params.sample_rate() | 623 input_parameters = device_params_; |
581 input_parameters = in_params; | |
582 frames_per_buffer = | 624 frames_per_buffer = |
583 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; | 625 ((input_parameters.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
584 input_parameters.set_frames_per_buffer(frames_per_buffer); | 626 input_parameters.set_frames_per_buffer(frames_per_buffer); |
585 DVLOG(1) << "SRI::input_parameters: " | 627 DVLOG(1) << "SRI::input_parameters: " |
586 << input_parameters.AsHumanReadableString(); | 628 << input_parameters.AsHumanReadableString(); |
587 } | 629 } |
588 | 630 |
589 // Create an audio converter which converts data between native input format | 631 // Create an audio converter which converts data between native input format |
590 // and WebSpeech specific output format. | 632 // and WebSpeech specific output format. |
591 audio_converter_.reset( | 633 audio_converter_.reset( |
592 new OnDataConverter(input_parameters, output_parameters)); | 634 new OnDataConverter(input_parameters, output_parameters)); |
593 | 635 |
594 audio_controller_ = AudioInputController::Create( | 636 audio_controller_ = AudioInputController::Create( |
595 audio_manager, this, this, nullptr, nullptr, input_parameters, device_id_, | 637 GetAudioSystem()->GetAudioManager(), this, this, nullptr, nullptr, |
638 input_parameters, device_id_, | |
596 /*agc_is_enabled*/ false); | 639 /*agc_is_enabled*/ false); |
597 | 640 |
598 if (!audio_controller_.get()) { | 641 if (!audio_controller_.get()) { |
599 return Abort( | 642 return Abort( |
600 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 643 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
601 } | 644 } |
602 | 645 |
603 audio_log_->OnCreated(0, input_parameters, device_id_); | 646 audio_log_->OnCreated(0, input_parameters, device_id_); |
604 | 647 |
605 // The endpointer needs to estimate the environment/background noise before | 648 // The endpointer needs to estimate the environment/background noise before |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
685 return Abort( | 728 return Abort( |
686 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 729 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
687 } else if (event_args.event == EVENT_ENGINE_ERROR) { | 730 } else if (event_args.event == EVENT_ENGINE_ERROR) { |
688 return Abort(event_args.engine_error); | 731 return Abort(event_args.engine_error); |
689 } | 732 } |
690 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); | 733 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); |
691 } | 734 } |
692 | 735 |
693 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( | 736 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
694 const SpeechRecognitionError& error) { | 737 const SpeechRecognitionError& error) { |
738 DCHECK_CURRENTLY_ON(BrowserThread::IO); | |
739 | |
695 if (IsCapturingAudio()) | 740 if (IsCapturingAudio()) |
696 CloseAudioControllerAsynchronously(); | 741 CloseAudioControllerAsynchronously(); |
697 | 742 |
698 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; | 743 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
699 | 744 |
745 if (state_ == STATE_PREPARING) { | |
746 // Cancel an outstanding reply from AudioSystem. | |
747 weak_ptr_factory_.InvalidateWeakPtrs(); | |
748 } | |
749 | |
700 // The recognition engine is initialized only after STATE_STARTING. | 750 // The recognition engine is initialized only after STATE_STARTING. |
701 if (state_ > STATE_STARTING) { | 751 if (state_ > STATE_STARTING) { |
702 DCHECK(recognition_engine_.get() != NULL); | 752 DCHECK(recognition_engine_.get() != NULL); |
703 recognition_engine_->EndRecognition(); | 753 recognition_engine_->EndRecognition(); |
704 } | 754 } |
705 | 755 |
706 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) | 756 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
707 listener()->OnSoundEnd(session_id()); | 757 listener()->OnSoundEnd(session_id()); |
708 | 758 |
709 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) | 759 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
826 | 876 |
827 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | 877 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
828 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | 878 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
829 noise_level = std::min(std::max(0.0f, noise_level), | 879 noise_level = std::min(std::max(0.0f, noise_level), |
830 kAudioMeterRangeMaxUnclipped); | 880 kAudioMeterRangeMaxUnclipped); |
831 | 881 |
832 listener()->OnAudioLevelsChange( | 882 listener()->OnAudioLevelsChange( |
833 session_id(), clip_detected ? 1.0f : audio_level_, noise_level); | 883 session_id(), clip_detected ? 1.0f : audio_level_, noise_level); |
834 } | 884 } |
835 | 885 |
836 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 886 void SpeechRecognizerImpl::SetAudioSystemForTesting( |
837 AudioManager* audio_manager) { | 887 media::AudioSystem* audio_system) { |
838 audio_manager_for_tests_ = audio_manager; | 888 audio_system_for_tests_ = audio_system; |
889 } | |
890 | |
891 media::AudioSystem* SpeechRecognizerImpl::GetAudioSystem() { | |
892 return audio_system_for_tests_ ? audio_system_for_tests_ : audio_system_; | |
839 } | 893 } |
840 | 894 |
841 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | 895 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
842 : event(event_value), | 896 : event(event_value), |
843 audio_data(NULL), | 897 audio_data(NULL), |
844 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { | 898 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
845 } | 899 } |
846 | 900 |
847 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(const FSMEventArgs& other) = | 901 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(const FSMEventArgs& other) = |
848 default; | 902 default; |
849 | 903 |
850 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | 904 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
851 } | 905 } |
852 | 906 |
853 } // namespace content | 907 } // namespace content |
OLD | NEW |