content/browser/speech/speech_recognizer_impl.cc - Issue 15907012: Implement SpeechRecognizerImplAndroid

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 15907012: Implement SpeechRecognizerImplAndroid (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Convert error codes in Java, refactor *{,JNI} methods into single methods, nits Created 7 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« content/browser/speech/speech_recognizer.h ('K') | « content/browser/speech/speech_recognizer.h ('k') | content/browser/speech/speech_recognizer_impl_android.h » ('j') | content/browser/speech/speech_recognizer_impl_android.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognizer_impl.h"	5 #include "content/browser/speech/speech_recognizer_impl.h"

6	6

7 #include "base/basictypes.h"	7 #include "base/basictypes.h"

8 #include "base/bind.h"	8 #include "base/bind.h"

9 #include "base/time.h"	9 #include "base/time.h"

10 #include "content/browser/browser_main_loop.h"	10 #include "content/browser/browser_main_loop.h"

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
74 SpeechRecognitionEventListener* listener,	74 SpeechRecognitionEventListener* listener,

75 int session_id,	75 int session_id,

76 bool is_single_shot,	76 bool is_single_shot,

77 SpeechRecognitionEngine* engine)	77 SpeechRecognitionEngine* engine)

78 : SpeechRecognizer(listener, session_id),	78 : SpeechRecognizer(listener, session_id),

79 recognition_engine_(engine),	79 recognition_engine_(engine),

80 endpointer_(kAudioSampleRate),	80 endpointer_(kAudioSampleRate),

81 is_dispatching_event_(false),	81 is_dispatching_event_(false),

82 is_single_shot_(is_single_shot),	82 is_single_shot_(is_single_shot),

83 state_(STATE_IDLE) {	83 state_(STATE_IDLE) {

84 DCHECK(listener_ != NULL);	84 DCHECK(this->listener() != NULL);
	bulach 2013/06/11 07:24:48 nit: move the DCHECK to the base class, and there just check for the param rather than the accessor. janx 2013/06/12 14:47:14 On 2013/06/11 07:24:48, bulach wrote: > nit: move the DCHECK to the base class, and there just check for the param > rather than the accessor. Moved DCHECK to base class.
85 DCHECK(recognition_engine_ != NULL);	85 DCHECK(recognition_engine_ != NULL);

86 if (is_single_shot) {	86 if (is_single_shot) {

87 // In single shot recognition, the session is automatically ended after:	87 // In single shot recognition, the session is automatically ended after:

88 // - 0.5 seconds of silence if time < 3 seconds	88 // - 0.5 seconds of silence if time < 3 seconds

89 // - 1 seconds of silence if time >= 3 seconds	89 // - 1 seconds of silence if time >= 3 seconds

90 endpointer_.set_speech_input_complete_silence_length(	90 endpointer_.set_speech_input_complete_silence_length(

91 base::Time::kMicrosecondsPerSecond / 2);	91 base::Time::kMicrosecondsPerSecond / 2);

92 endpointer_.set_long_speech_input_complete_silence_length(	92 endpointer_.set_long_speech_input_complete_silence_length(

93 base::Time::kMicrosecondsPerSecond);	93 base::Time::kMicrosecondsPerSecond);

94 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);	94 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);

(...skipping 294 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
389 DCHECK(recognition_engine_.get() != NULL);	389 DCHECK(recognition_engine_.get() != NULL);

390 DCHECK(!IsCapturingAudio());	390 DCHECK(!IsCapturingAudio());

391 AudioManager* audio_manager = (audio_manager_for_tests_ != NULL) ?	391 AudioManager* audio_manager = (audio_manager_for_tests_ != NULL) ?

392 audio_manager_for_tests_ :	392 audio_manager_for_tests_ :

393 BrowserMainLoop::GetAudioManager();	393 BrowserMainLoop::GetAudioManager();

394 DCHECK(audio_manager != NULL);	394 DCHECK(audio_manager != NULL);

395	395

396 DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";	396 DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";

397 num_samples_recorded_ = 0;	397 num_samples_recorded_ = 0;

398 audio_level_ = 0;	398 audio_level_ = 0;

399 listener_->OnRecognitionStart(session_id_);	399 listener()->OnRecognitionStart(session_id());

400	400

401 if (!audio_manager->HasAudioInputDevices()) {	401 if (!audio_manager->HasAudioInputDevices()) {

402 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,	402 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,

403 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));	403 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));

404 }	404 }

405	405

406 const int samples_per_packet = (kAudioSampleRate *	406 const int samples_per_packet = (kAudioSampleRate *

407 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;	407 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;

408 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,	408 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,

409 kAudioSampleRate, kNumBitsPerAudioSample,	409 kAudioSampleRate, kNumBitsPerAudioSample,

(...skipping 12 matching lines...) Expand all Loading...
422 audio_controller_->Record();	422 audio_controller_->Record();

423 return STATE_STARTING;	423 return STATE_STARTING;

424 }	424 }

425	425

426 SpeechRecognizerImpl::FSMState	426 SpeechRecognizerImpl::FSMState

427 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {	427 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {

428 // This is the first audio packet captured, so the recognition engine is	428 // This is the first audio packet captured, so the recognition engine is

429 // started and the delegate notified about the event.	429 // started and the delegate notified about the event.

430 DCHECK(recognition_engine_.get() != NULL);	430 DCHECK(recognition_engine_.get() != NULL);

431 recognition_engine_->StartRecognition();	431 recognition_engine_->StartRecognition();

432 listener_->OnAudioStart(session_id_);	432 listener()->OnAudioStart(session_id());

433	433

434 // This is a little hack, since TakeAudioChunk() is already called by	434 // This is a little hack, since TakeAudioChunk() is already called by

435 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping	435 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping

436 // the first audio chunk captured after opening the audio device.	436 // the first audio chunk captured after opening the audio device.

437 recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));	437 recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));

438 return STATE_ESTIMATING_ENVIRONMENT;	438 return STATE_ESTIMATING_ENVIRONMENT;

439 }	439 }

440	440

441 SpeechRecognizerImpl::FSMState	441 SpeechRecognizerImpl::FSMState

442 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {	442 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {

443 DCHECK(endpointer_.IsEstimatingEnvironment());	443 DCHECK(endpointer_.IsEstimatingEnvironment());

444 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {	444 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {

445 endpointer_.SetUserInputMode();	445 endpointer_.SetUserInputMode();

446 listener_->OnEnvironmentEstimationComplete(session_id_);	446 listener()->OnEnvironmentEstimationComplete(session_id());

447 return STATE_WAITING_FOR_SPEECH;	447 return STATE_WAITING_FOR_SPEECH;

448 } else {	448 } else {

449 return STATE_ESTIMATING_ENVIRONMENT;	449 return STATE_ESTIMATING_ENVIRONMENT;

450 }	450 }

451 }	451 }

452	452

453 SpeechRecognizerImpl::FSMState	453 SpeechRecognizerImpl::FSMState

454 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {	454 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {

455 if (endpointer_.DidStartReceivingSpeech()) {	455 if (endpointer_.DidStartReceivingSpeech()) {

456 listener_->OnSoundStart(session_id_);	456 listener()->OnSoundStart(session_id());

457 return STATE_RECOGNIZING;	457 return STATE_RECOGNIZING;

458 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {	458 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {

459 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));	459 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));

460 }	460 }

461 return STATE_WAITING_FOR_SPEECH;	461 return STATE_WAITING_FOR_SPEECH;

462 }	462 }

463	463

464 SpeechRecognizerImpl::FSMState	464 SpeechRecognizerImpl::FSMState

465 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {	465 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {

466 if (endpointer_.speech_input_complete())	466 if (endpointer_.speech_input_complete())

467 return StopCaptureAndWaitForResult(event_args);	467 return StopCaptureAndWaitForResult(event_args);

468 return STATE_RECOGNIZING;	468 return STATE_RECOGNIZING;

469 }	469 }

470	470

471 SpeechRecognizerImpl::FSMState	471 SpeechRecognizerImpl::FSMState

472 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {	472 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {

473 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);	473 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

474	474

475 DVLOG(1) << "Concluding recognition";	475 DVLOG(1) << "Concluding recognition";

476 CloseAudioControllerAsynchronously();	476 CloseAudioControllerAsynchronously();

477 recognition_engine_->AudioChunksEnded();	477 recognition_engine_->AudioChunksEnded();

478	478

479 if (state_ > STATE_WAITING_FOR_SPEECH)	479 if (state_ > STATE_WAITING_FOR_SPEECH)

480 listener_->OnSoundEnd(session_id_);	480 listener()->OnSoundEnd(session_id());

481	481

482 listener_->OnAudioEnd(session_id_);	482 listener()->OnAudioEnd(session_id());

483 return STATE_WAITING_FINAL_RESULT;	483 return STATE_WAITING_FINAL_RESULT;

484 }	484 }

485	485

486 SpeechRecognizerImpl::FSMState	486 SpeechRecognizerImpl::FSMState

487 SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {	487 SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {

488 DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);	488 DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);

489 DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);	489 DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);

490 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));	490 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));

491 }	491 }

492	492

(...skipping 14 matching lines...) Expand all Loading...
507	507

508 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";	508 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";

509	509

510 // The recognition engine is initialized only after STATE_STARTING.	510 // The recognition engine is initialized only after STATE_STARTING.

511 if (state_ > STATE_STARTING) {	511 if (state_ > STATE_STARTING) {

512 DCHECK(recognition_engine_.get() != NULL);	512 DCHECK(recognition_engine_.get() != NULL);

513 recognition_engine_->EndRecognition();	513 recognition_engine_->EndRecognition();

514 }	514 }

515	515

516 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)	516 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)

517 listener_->OnSoundEnd(session_id_);	517 listener()->OnSoundEnd(session_id());

518	518

519 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)	519 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)

520 listener_->OnAudioEnd(session_id_);	520 listener()->OnAudioEnd(session_id());

521	521

522 if (error.code != SPEECH_RECOGNITION_ERROR_NONE)	522 if (error.code != SPEECH_RECOGNITION_ERROR_NONE)

523 listener_->OnRecognitionError(session_id_, error);	523 listener()->OnRecognitionError(session_id(), error);

524	524

525 listener_->OnRecognitionEnd(session_id_);	525 listener()->OnRecognitionEnd(session_id());

526	526

527 return STATE_ENDED;	527 return STATE_ENDED;

528 }	528 }

529	529

530 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(	530 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(

531 const FSMEventArgs& event_args) {	531 const FSMEventArgs& event_args) {

532 // Provisional results can occur only during continuous (non one-shot) mode.	532 // Provisional results can occur only during continuous (non one-shot) mode.

533 // If this check is reached it means that a continuous speech recognition	533 // If this check is reached it means that a continuous speech recognition

534 // engine is being used for a one shot recognition.	534 // engine is being used for a one shot recognition.

535 DCHECK_EQ(false, is_single_shot_);	535 DCHECK_EQ(false, is_single_shot_);

536	536

537 // In continuous recognition, intermediate results can occur even when we are	537 // In continuous recognition, intermediate results can occur even when we are

538 // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the	538 // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the

539 // recognition engine is "faster" than our endpointer). In these cases we	539 // recognition engine is "faster" than our endpointer). In these cases we

540 // skip the endpointer and fast-forward to the RECOGNIZING state, with respect	540 // skip the endpointer and fast-forward to the RECOGNIZING state, with respect

541 // of the events triggering order.	541 // of the events triggering order.

542 if (state_ == STATE_ESTIMATING_ENVIRONMENT) {	542 if (state_ == STATE_ESTIMATING_ENVIRONMENT) {

543 DCHECK(endpointer_.IsEstimatingEnvironment());	543 DCHECK(endpointer_.IsEstimatingEnvironment());

544 endpointer_.SetUserInputMode();	544 endpointer_.SetUserInputMode();

545 listener_->OnEnvironmentEstimationComplete(session_id_);	545 listener()->OnEnvironmentEstimationComplete(session_id());

546 } else if (state_ == STATE_WAITING_FOR_SPEECH) {	546 } else if (state_ == STATE_WAITING_FOR_SPEECH) {

547 listener_->OnSoundStart(session_id_);	547 listener()->OnSoundStart(session_id());

548 } else {	548 } else {

549 DCHECK_EQ(STATE_RECOGNIZING, state_);	549 DCHECK_EQ(STATE_RECOGNIZING, state_);

550 }	550 }

551	551

552 listener_->OnRecognitionResults(session_id_, event_args.engine_results);	552 listener()->OnRecognitionResults(session_id(), event_args.engine_results);

553 return STATE_RECOGNIZING;	553 return STATE_RECOGNIZING;

554 }	554 }

555	555

556 SpeechRecognizerImpl::FSMState	556 SpeechRecognizerImpl::FSMState

557 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {	557 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {

558 const SpeechRecognitionResults& results = event_args.engine_results;	558 const SpeechRecognitionResults& results = event_args.engine_results;

559 SpeechRecognitionResults::const_iterator i = results.begin();	559 SpeechRecognitionResults::const_iterator i = results.begin();

560 bool provisional_results_pending = false;	560 bool provisional_results_pending = false;

561 bool results_are_empty = true;	561 bool results_are_empty = true;

562 for (; i != results.end(); ++i) {	562 for (; i != results.end(); ++i) {

563 const SpeechRecognitionResult& result = *i;	563 const SpeechRecognitionResult& result = *i;

564 if (result.is_provisional) {	564 if (result.is_provisional) {

565 provisional_results_pending = true;	565 provisional_results_pending = true;

566 DCHECK(!is_single_shot_);	566 DCHECK(!is_single_shot_);

567 } else if (results_are_empty) {	567 } else if (results_are_empty) {

568 results_are_empty = result.hypotheses.empty();	568 results_are_empty = result.hypotheses.empty();

569 }	569 }

570 }	570 }

571	571

572 if (provisional_results_pending) {	572 if (provisional_results_pending) {

573 listener_->OnRecognitionResults(session_id_, results);	573 listener()->OnRecognitionResults(session_id(), results);

574 // We don't end the recognition if a provisional result is received in	574 // We don't end the recognition if a provisional result is received in

575 // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will	575 // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will

576 // end the recognition.	576 // end the recognition.

577 return state_;	577 return state_;

578 }	578 }

579	579

580 recognition_engine_->EndRecognition();	580 recognition_engine_->EndRecognition();

581	581

582 if (!results_are_empty) {	582 if (!results_are_empty) {

583 // We could receive an empty result (which we won't propagate further)	583 // We could receive an empty result (which we won't propagate further)

584 // in the following (continuous) scenario:	584 // in the following (continuous) scenario:

585 // 1. The caller start pushing audio and receives some results;	585 // 1. The caller start pushing audio and receives some results;

586 // 2. A |StopAudioCapture| is issued later;	586 // 2. A |StopAudioCapture| is issued later;

587 // 3. The final audio frames captured in the interval ]1,2] do not lead to	587 // 3. The final audio frames captured in the interval ]1,2] do not lead to

588 // any result (nor any error);	588 // any result (nor any error);

589 // 4. The speech recognition engine, therefore, emits an empty result to	589 // 4. The speech recognition engine, therefore, emits an empty result to

590 // notify that the recognition is ended with no error, yet neither any	590 // notify that the recognition is ended with no error, yet neither any

591 // further result.	591 // further result.

592 listener_->OnRecognitionResults(session_id_, results);	592 listener()->OnRecognitionResults(session_id(), results);

593 }	593 }

594	594

595 listener_->OnRecognitionEnd(session_id_);	595 listener()->OnRecognitionEnd(session_id());

596 return STATE_ENDED;	596 return STATE_ENDED;

597 }	597 }

598	598

599 SpeechRecognizerImpl::FSMState	599 SpeechRecognizerImpl::FSMState

600 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {	600 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {

601 return state_; // Just keep the current state.	601 return state_; // Just keep the current state.

602 }	602 }

603	603

604 SpeechRecognizerImpl::FSMState	604 SpeechRecognizerImpl::FSMState

605 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {	605 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {

(...skipping 29 matching lines...) Expand all Loading...
635 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);	635 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);

636 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :	636 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :

637 kDownSmoothingFactor;	637 kDownSmoothingFactor;

638 audio_level_ += (level - audio_level_) * smoothing_factor;	638 audio_level_ += (level - audio_level_) * smoothing_factor;

639	639

640 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /	640 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /

641 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);	641 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

642 noise_level = std::min(std::max(0.0f, noise_level),	642 noise_level = std::min(std::max(0.0f, noise_level),

643 kAudioMeterRangeMaxUnclipped);	643 kAudioMeterRangeMaxUnclipped);

644	644

645 listener_->OnAudioLevelsChange(	645 listener()->OnAudioLevelsChange(

646 session_id_, clip_detected ? 1.0f : audio_level_, noise_level);	646 session_id(), clip_detected ? 1.0f : audio_level_, noise_level);

647 }	647 }

648	648

649 void SpeechRecognizerImpl::SetAudioManagerForTests(	649 void SpeechRecognizerImpl::SetAudioManagerForTests(

650 AudioManager* audio_manager) {	650 AudioManager* audio_manager) {

651 audio_manager_for_tests_ = audio_manager;	651 audio_manager_for_tests_ = audio_manager;

652 }	652 }

653	653

654 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)	654 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)

655 : event(event_value),	655 : event(event_value),

656 audio_data(NULL),	656 audio_data(NULL),

657 engine_error(SPEECH_RECOGNITION_ERROR_NONE) {	657 engine_error(SPEECH_RECOGNITION_ERROR_NONE) {

658 }	658 }

659	659

660 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {	660 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {

661 }	661 }

662	662

663 } // namespace content	663 } // namespace content

OLD	NEW