content/browser/speech/speech_recognizer.cc - Issue 11347004: content/browser: Move speech code into content namespace.

Side by Side Diff: content/browser/speech/speech_recognizer.cc

Issue 11347004: content/browser: Move speech code into content namespace. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« content/browser/speech/proto/google_streaming_api.proto ('K') | « content/browser/speech/speech_recognizer.h ('k') | content/browser/speech/speech_recognizer_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognizer.h"	5 #include "content/browser/speech/speech_recognizer.h"

6	6

7 #include "base/basictypes.h"	7 #include "base/basictypes.h"

8 #include "base/bind.h"	8 #include "base/bind.h"

9 #include "base/time.h"	9 #include "base/time.h"

10 #include "content/browser/browser_main_loop.h"	10 #include "content/browser/browser_main_loop.h"

11 #include "content/browser/speech/audio_buffer.h"	11 #include "content/browser/speech/audio_buffer.h"

12 #include "content/browser/speech/google_one_shot_remote_engine.h"	12 #include "content/browser/speech/google_one_shot_remote_engine.h"

13 #include "content/public/browser/browser_thread.h"	13 #include "content/public/browser/browser_thread.h"

14 #include "content/public/browser/speech_recognition_event_listener.h"	14 #include "content/public/browser/speech_recognition_event_listener.h"

15 #include "content/public/common/speech_recognition_error.h"	15 #include "content/public/common/speech_recognition_error.h"

16 #include "content/public/common/speech_recognition_grammar.h"	16 #include "content/public/common/speech_recognition_grammar.h"

17 #include "content/public/common/speech_recognition_result.h"	17 #include "content/public/common/speech_recognition_result.h"

18 #include "net/url_request/url_request_context_getter.h"	18 #include "net/url_request/url_request_context_getter.h"

19	19

20 using content::BrowserMainLoop;

21 using content::BrowserThread;

22 using content::SpeechRecognitionError;

23 using content::SpeechRecognitionEventListener;

24 using content::SpeechRecognitionGrammar;

25 using content::SpeechRecognitionResult;

26 using media::AudioInputController;	20 using media::AudioInputController;

27 using media::AudioManager;	21 using media::AudioManager;

28 using media::AudioParameters;	22 using media::AudioParameters;

29 using media::ChannelLayout;	23 using media::ChannelLayout;

30	24

	25 namespace content {

31 namespace {	26 namespace {

32	27

33 // The following constants are related to the volume level indicator shown in	28 // The following constants are related to the volume level indicator shown in

34 // the UI for recorded audio.	29 // the UI for recorded audio.

35 // Multiplier used when new volume is greater than previous level.	30 // Multiplier used when new volume is greater than previous level.

36 const float kUpSmoothingFactor = 1.0f;	31 const float kUpSmoothingFactor = 1.0f;

37 // Multiplier used when new volume is lesser than previous level.	32 // Multiplier used when new volume is lesser than previous level.

38 const float kDownSmoothingFactor = 0.7f;	33 const float kDownSmoothingFactor = 0.7f;

39 // RMS dB value of a maximum (unclipped) sine wave for int16 samples.	34 // RMS dB value of a maximum (unclipped) sine wave for int16 samples.

40 const float kAudioMeterMaxDb = 90.31f;	35 const float kAudioMeterMaxDb = 90.31f;

41 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.	36 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.

42 // Values lower than this will display as empty level-meter.	37 // Values lower than this will display as empty level-meter.

43 const float kAudioMeterMinDb = 30.0f;	38 const float kAudioMeterMinDb = 30.0f;

44 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;	39 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

45	40

46 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.)	41 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.)

47 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;	42 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

48	43

49 // Returns true if more than 5% of the samples are at min or max value.	44 // Returns true if more than 5% of the samples are at min or max value.

50 bool DetectClipping(const speech::AudioChunk& chunk) {	45 bool DetectClipping(const AudioChunk& chunk) {

51 const int num_samples = chunk.NumSamples();	46 const int num_samples = chunk.NumSamples();

52 const int16* samples = chunk.SamplesData16();	47 const int16* samples = chunk.SamplesData16();

53 const int kThreshold = num_samples / 20;	48 const int kThreshold = num_samples / 20;

54 int clipping_samples = 0;	49 int clipping_samples = 0;

55	50

56 for (int i = 0; i < num_samples; ++i) {	51 for (int i = 0; i < num_samples; ++i) {

57 if (samples[i] <= -32767 || samples[i] >= 32767) {	52 if (samples[i] <= -32767 || samples[i] >= 32767) {

58 if (++clipping_samples > kThreshold)	53 if (++clipping_samples > kThreshold)

59 return true;	54 return true;

60 }	55 }

61 }	56 }

62 return false;	57 return false;

63 }	58 }

64	59

65 void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) {	60 void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) {

66 }	61 }

67	62

68 } // namespace	63 } // namespace

69	64

70 namespace speech {

71

72 const int SpeechRecognizer::kAudioSampleRate = 16000;	65 const int SpeechRecognizer::kAudioSampleRate = 16000;

73 const ChannelLayout SpeechRecognizer::kChannelLayout =	66 const ChannelLayout SpeechRecognizer::kChannelLayout =

74 media::CHANNEL_LAYOUT_MONO;	67 media::CHANNEL_LAYOUT_MONO;

75 const int SpeechRecognizer::kNumBitsPerAudioSample = 16;	68 const int SpeechRecognizer::kNumBitsPerAudioSample = 16;

76 const int SpeechRecognizer::kNoSpeechTimeoutMs = 8000;	69 const int SpeechRecognizer::kNoSpeechTimeoutMs = 8000;

77 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;	70 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;

78 media::AudioManager* SpeechRecognizer::audio_manager_for_tests_ = NULL;	71 media::AudioManager* SpeechRecognizer::audio_manager_for_tests_ = NULL;

79	72

80 COMPILE_ASSERT(SpeechRecognizer::kNumBitsPerAudioSample % 8 == 0,	73 COMPILE_ASSERT(SpeechRecognizer::kNumBitsPerAudioSample % 8 == 0,

81 kNumBitsPerAudioSample_must_be_a_multiple_of_8);	74 kNumBitsPerAudioSample_must_be_a_multiple_of_8);

(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
187 event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size),	180 event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size),

188 kNumBitsPerAudioSample / 8);	181 kNumBitsPerAudioSample / 8);

189 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,	182 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

190 base::Bind(&SpeechRecognizer::DispatchEvent,	183 base::Bind(&SpeechRecognizer::DispatchEvent,

191 this, event_args));	184 this, event_args));

192 }	185 }

193	186

194 void SpeechRecognizer::OnAudioClosed(AudioInputController*) {}	187 void SpeechRecognizer::OnAudioClosed(AudioInputController*) {}

195	188

196 void SpeechRecognizer::OnSpeechRecognitionEngineResult(	189 void SpeechRecognizer::OnSpeechRecognitionEngineResult(

197 const content::SpeechRecognitionResult& result) {	190 const SpeechRecognitionResult& result) {

198 FSMEventArgs event_args(EVENT_ENGINE_RESULT);	191 FSMEventArgs event_args(EVENT_ENGINE_RESULT);

199 event_args.engine_result = result;	192 event_args.engine_result = result;

200 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,	193 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

201 base::Bind(&SpeechRecognizer::DispatchEvent,	194 base::Bind(&SpeechRecognizer::DispatchEvent,

202 this, event_args));	195 this, event_args));

203 }	196 }

204	197

205 void SpeechRecognizer::OnSpeechRecognitionEngineError(	198 void SpeechRecognizer::OnSpeechRecognitionEngineError(

206 const content::SpeechRecognitionError& error) {	199 const SpeechRecognitionError& error) {

207 FSMEventArgs event_args(EVENT_ENGINE_ERROR);	200 FSMEventArgs event_args(EVENT_ENGINE_ERROR);

208 event_args.engine_error = error;	201 event_args.engine_error = error;

209 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,	202 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

210 base::Bind(&SpeechRecognizer::DispatchEvent,	203 base::Bind(&SpeechRecognizer::DispatchEvent,

211 this, event_args));	204 this, event_args));

212 }	205 }

213	206

214 // ----------------------- Core FSM implementation ---------------------------	207 // ----------------------- Core FSM implementation ---------------------------

215 // TODO(primiano): After the changes in the media package (r129173), this class	208 // TODO(primiano): After the changes in the media package (r129173), this class

216 // slightly violates the SpeechRecognitionEventListener interface contract. In	209 // slightly violates the SpeechRecognitionEventListener interface contract. In

(...skipping 182 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
399 audio_manager_for_tests_ :	392 audio_manager_for_tests_ :

400 BrowserMainLoop::GetAudioManager();	393 BrowserMainLoop::GetAudioManager();

401 DCHECK(audio_manager != NULL);	394 DCHECK(audio_manager != NULL);

402	395

403 DVLOG(1) << "SpeechRecognizer starting audio capture.";	396 DVLOG(1) << "SpeechRecognizer starting audio capture.";

404 num_samples_recorded_ = 0;	397 num_samples_recorded_ = 0;

405 audio_level_ = 0;	398 audio_level_ = 0;

406 listener_->OnRecognitionStart(session_id_);	399 listener_->OnRecognitionStart(session_id_);

407	400

408 if (!audio_manager->HasAudioInputDevices()) {	401 if (!audio_manager->HasAudioInputDevices()) {

409 return Abort(SpeechRecognitionError(	402 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,

410 content::SPEECH_RECOGNITION_ERROR_AUDIO,	403 SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));

411 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));

412 }	404 }

413	405

414 if (audio_manager->IsRecordingInProcess()) {	406 if (audio_manager->IsRecordingInProcess()) {

415 return Abort(SpeechRecognitionError(	407 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,

416 content::SPEECH_RECOGNITION_ERROR_AUDIO,	408 SPEECH_AUDIO_ERROR_DETAILS_IN_USE));

417 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE));

418 }	409 }

419	410

420 const int samples_per_packet = (kAudioSampleRate *	411 const int samples_per_packet = (kAudioSampleRate *

421 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;	412 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;

422 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,	413 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,

423 kAudioSampleRate, kNumBitsPerAudioSample,	414 kAudioSampleRate, kNumBitsPerAudioSample,

424 samples_per_packet);	415 samples_per_packet);

425 audio_controller_ = AudioInputController::Create(audio_manager, this, params);	416 audio_controller_ = AudioInputController::Create(audio_manager, this, params);

426	417

427 if (audio_controller_.get() == NULL) {	418 if (audio_controller_.get() == NULL) {

428 return Abort(	419 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));

429 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));

430 }	420 }

431	421

432 // The endpointer needs to estimate the environment/background noise before	422 // The endpointer needs to estimate the environment/background noise before

433 // starting to treat the audio as user input. We wait in the state	423 // starting to treat the audio as user input. We wait in the state

434 // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching	424 // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching

435 // to user input mode.	425 // to user input mode.

436 endpointer_.SetEnvironmentEstimationMode();	426 endpointer_.SetEnvironmentEstimationMode();

437 audio_controller_->Record();	427 audio_controller_->Record();

438 return STATE_STARTING;	428 return STATE_STARTING;

439 }	429 }

(...skipping 24 matching lines...) Expand all Loading...
464 return STATE_ESTIMATING_ENVIRONMENT;	454 return STATE_ESTIMATING_ENVIRONMENT;

465 }	455 }

466 }	456 }

467	457

468 SpeechRecognizer::FSMState	458 SpeechRecognizer::FSMState

469 SpeechRecognizer::DetectUserSpeechOrTimeout(const FSMEventArgs&) {	459 SpeechRecognizer::DetectUserSpeechOrTimeout(const FSMEventArgs&) {

470 if (endpointer_.DidStartReceivingSpeech()) {	460 if (endpointer_.DidStartReceivingSpeech()) {

471 listener_->OnSoundStart(session_id_);	461 listener_->OnSoundStart(session_id_);

472 return STATE_RECOGNIZING;	462 return STATE_RECOGNIZING;

473 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {	463 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {

474 return Abort(	464 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));

475 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));

476 }	465 }

477 return STATE_WAITING_FOR_SPEECH;	466 return STATE_WAITING_FOR_SPEECH;

478 }	467 }

479	468

480 SpeechRecognizer::FSMState	469 SpeechRecognizer::FSMState

481 SpeechRecognizer::DetectEndOfSpeech(const FSMEventArgs& event_args) {	470 SpeechRecognizer::DetectEndOfSpeech(const FSMEventArgs& event_args) {

482 if (endpointer_.speech_input_complete())	471 if (endpointer_.speech_input_complete())

483 return StopCaptureAndWaitForResult(event_args);	472 return StopCaptureAndWaitForResult(event_args);

484 return STATE_RECOGNIZING;	473 return STATE_RECOGNIZING;

485 }	474 }

(...skipping 10 matching lines...) Expand all Loading...
496 listener_->OnSoundEnd(session_id_);	485 listener_->OnSoundEnd(session_id_);

497	486

498 listener_->OnAudioEnd(session_id_);	487 listener_->OnAudioEnd(session_id_);

499 return STATE_WAITING_FINAL_RESULT;	488 return STATE_WAITING_FINAL_RESULT;

500 }	489 }

501	490

502 SpeechRecognizer::FSMState	491 SpeechRecognizer::FSMState

503 SpeechRecognizer::AbortSilently(const FSMEventArgs& event_args) {	492 SpeechRecognizer::AbortSilently(const FSMEventArgs& event_args) {

504 DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);	493 DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);

505 DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);	494 DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);

506 return Abort(	495 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));

507 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NONE));

508 }	496 }

509	497

510 SpeechRecognizer::FSMState	498 SpeechRecognizer::FSMState

511 SpeechRecognizer::AbortWithError(const FSMEventArgs& event_args) {	499 SpeechRecognizer::AbortWithError(const FSMEventArgs& event_args) {

512 if (event_args.event == EVENT_AUDIO_ERROR) {	500 if (event_args.event == EVENT_AUDIO_ERROR) {

513 return Abort(	501 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));

514 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));

515 } else if (event_args.event == EVENT_ENGINE_ERROR) {	502 } else if (event_args.event == EVENT_ENGINE_ERROR) {

516 return Abort(event_args.engine_error);	503 return Abort(event_args.engine_error);

517 }	504 }

518 return Abort(	505 return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED));

519 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_ABORTED));

520 }	506 }

521	507

522 SpeechRecognizer::FSMState SpeechRecognizer::Abort(	508 SpeechRecognizer::FSMState SpeechRecognizer::Abort(

523 const SpeechRecognitionError& error) {	509 const SpeechRecognitionError& error) {

524 if (IsCapturingAudio())	510 if (IsCapturingAudio())

525 CloseAudioControllerAsynchronously();	511 CloseAudioControllerAsynchronously();

526	512

527 DVLOG(1) << "SpeechRecognizer canceling recognition. ";	513 DVLOG(1) << "SpeechRecognizer canceling recognition. ";

528	514

529 // The recognition engine is initialized only after STATE_STARTING.	515 // The recognition engine is initialized only after STATE_STARTING.

530 if (state_ > STATE_STARTING) {	516 if (state_ > STATE_STARTING) {

531 DCHECK(recognition_engine_.get() != NULL);	517 DCHECK(recognition_engine_.get() != NULL);

532 recognition_engine_->EndRecognition();	518 recognition_engine_->EndRecognition();

533 }	519 }

534	520

535 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)	521 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)

536 listener_->OnSoundEnd(session_id_);	522 listener_->OnSoundEnd(session_id_);

537	523

538 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)	524 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)

539 listener_->OnAudioEnd(session_id_);	525 listener_->OnAudioEnd(session_id_);

540	526

541 if (error.code != content::SPEECH_RECOGNITION_ERROR_NONE)	527 if (error.code != SPEECH_RECOGNITION_ERROR_NONE)

542 listener_->OnRecognitionError(session_id_, error);	528 listener_->OnRecognitionError(session_id_, error);

543	529

544 listener_->OnRecognitionEnd(session_id_);	530 listener_->OnRecognitionEnd(session_id_);

545	531

546 return STATE_IDLE;	532 return STATE_IDLE;

547 }	533 }

548	534

549 SpeechRecognizer::FSMState SpeechRecognizer::ProcessIntermediateResult(	535 SpeechRecognizer::FSMState SpeechRecognizer::ProcessIntermediateResult(

550 const FSMEventArgs& event_args) {	536 const FSMEventArgs& event_args) {

551 // Provisional results can occur only during continuous (non one-shot) mode.	537 // Provisional results can occur only during continuous (non one-shot) mode.

(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
653	639

654 void SpeechRecognizer::SetAudioManagerForTests(	640 void SpeechRecognizer::SetAudioManagerForTests(

655 AudioManager* audio_manager) {	641 AudioManager* audio_manager) {

656 audio_manager_for_tests_ = audio_manager;	642 audio_manager_for_tests_ = audio_manager;

657 }	643 }

658	644

659 SpeechRecognizer::FSMEventArgs::FSMEventArgs(FSMEvent event_value)	645 SpeechRecognizer::FSMEventArgs::FSMEventArgs(FSMEvent event_value)

660 : event(event_value),	646 : event(event_value),

661 audio_error_code(0),	647 audio_error_code(0),

662 audio_data(NULL),	648 audio_data(NULL),

663 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {	649 engine_error(SPEECH_RECOGNITION_ERROR_NONE) {

664 }	650 }

665	651

666 SpeechRecognizer::FSMEventArgs::~FSMEventArgs() {	652 SpeechRecognizer::FSMEventArgs::~FSMEventArgs() {

667 }	653 }

668	654

669 } // namespace speech	655 } // namespace content

OLD	NEW