OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
6 | 6 |
7 #include "base/bind.h" | 7 #include "base/bind.h" |
8 #include "base/time.h" | 8 #include "base/time.h" |
9 #include "content/browser/browser_main_loop.h" | 9 #include "content/browser/browser_main_loop.h" |
10 #include "content/browser/speech/audio_buffer.h" | 10 #include "content/browser/speech/audio_buffer.h" |
| 11 #include "content/browser/speech/google_one_shot_remote_engine.h" |
| 12 #include "content/public/browser/browser_thread.h" |
11 #include "content/public/browser/speech_recognition_event_listener.h" | 13 #include "content/public/browser/speech_recognition_event_listener.h" |
12 #include "content/public/browser/browser_thread.h" | 14 #include "content/public/browser/speech_recognizer.h" |
13 #include "content/public/common/speech_recognition_result.h" | 15 #include "content/public/common/speech_recognition_result.h" |
14 #include "net/url_request/url_request_context_getter.h" | 16 #include "net/url_request/url_request_context_getter.h" |
15 | 17 |
| 18 #define UNREACHABLE_CONDITION() do { NOTREACHED(); return state_; } while (0) |
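| // (The do { ... } while (0) wrapper makes the macro expand to a single |
| // statement, so it can be used safely in unbraced if/else branches.) |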
| 19 |
16 using content::BrowserMainLoop; | 20 using content::BrowserMainLoop; |
17 using content::BrowserThread; | 21 using content::BrowserThread; |
| 22 using content::SpeechRecognitionError; |
18 using content::SpeechRecognitionEventListener; | 23 using content::SpeechRecognitionEventListener; |
| 24 using content::SpeechRecognitionResult; |
19 using content::SpeechRecognizer; | 25 using content::SpeechRecognizer; |
20 using media::AudioInputController; | 26 using media::AudioInputController; |
21 using std::string; | |
22 | 27 |
| 28 // TODO(primiano) What about a watchdog here, to avoid getting stuck if the |
| 29 // SpeechRecognitionEngine does not deliver a result within a reasonable time? |
23 namespace { | 30 namespace { |
24 | 31 // Enables a spontaneous transition from WaitingForSpeech to RecognizingSpeech, |
 | 32 // which is required by the mock recognition engine that sends fake results. |
| 33 const bool skipSilenceDetectionForTesting = false; |
25 // The following constants are related to the volume level indicator shown in | 34 // The following constants are related to the volume level indicator shown in |
26 // the UI for recorded audio. | 35 // the UI for recorded audio. |
27 // Multiplier used when new volume is greater than previous level. | 36 // Multiplier used when new volume is greater than previous level. |
28 const float kUpSmoothingFactor = 1.0f; | 37 const float kUpSmoothingFactor = 1.0f; |
29 // Multiplier used when new volume is lesser than previous level. | 38 // Multiplier used when new volume is lesser than previous level. |
30 const float kDownSmoothingFactor = 0.7f; | 39 const float kDownSmoothingFactor = 0.7f; |
31 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. | 40 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. |
32 const float kAudioMeterMaxDb = 90.31f; | 41 const float kAudioMeterMaxDb = 90.31f; |
33 // This value corresponds to RMS dB for int16 with the 6 most significant | 42 // This value corresponds to RMS dB for int16 with the 6 most significant |
34 // bits set to 0. Values lower than this display as an empty level meter. | 43 // bits set to 0. Values lower than this display as an empty level meter. |
35 const float kAudioMeterMinDb = 30.0f; | 44 const float kAudioMeterMinDb = 30.0f; |
36 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; | 45 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; |
37 | 46 |
38 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) | 47 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) |
39 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; | 48 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; |
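| // For example, with these constants an RMS of 60 dB maps to a displayed |
| // level of (60 - 30) / (60.31 / (47/48)) ~= 0.49, about half the meter. |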
40 | 49 |
41 // Returns true if more than 5% of the samples are at min or max value. | 50 // Returns true if more than 5% of the samples are at min or max value. |
42 bool DetectClipping(const speech::AudioChunk& chunk) { | 51 bool DetectClipping(const speech::AudioChunk& chunk) { |
43 const int num_samples = chunk.NumSamples(); | 52 const int num_samples = chunk.NumSamples(); |
44 const int16* samples = chunk.SamplesData16(); | 53 const int16* samples = chunk.SamplesData16(); |
45 const int kThreshold = num_samples / 20; | 54 const int kThreshold = num_samples / 20; |
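| // e.g. with 100 ms chunks at 16 kHz (1600 samples), kThreshold would be 80. |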
46 int clipping_samples = 0; | 55 int clipping_samples = 0; |
| 56 |
47 for (int i = 0; i < num_samples; ++i) { | 57 for (int i = 0; i < num_samples; ++i) { |
48 if (samples[i] <= -32767 || samples[i] >= 32767) { | 58 if (samples[i] <= -32767 || samples[i] >= 32767) { |
49 if (++clipping_samples > kThreshold) | 59 if (++clipping_samples > kThreshold) |
50 return true; | 60 return true; |
51 } | 61 } |
52 } | 62 } |
53 return false; | 63 return false; |
54 } | 64 } |
55 | 65 |
56 } // namespace | 66 } // namespace |
57 | 67 |
| 68 // TODO(primiano) transitional, see description in speech_recognizer.h. |
58 SpeechRecognizer* SpeechRecognizer::Create( | 69 SpeechRecognizer* SpeechRecognizer::Create( |
59 SpeechRecognitionEventListener* listener, | 70 SpeechRecognitionEventListener* listener, |
60 int caller_id, | 71 int caller_id, |
61 const std::string& language, | 72 const std::string& language, |
62 const std::string& grammar, | 73 const std::string& grammar, |
63 net::URLRequestContextGetter* context_getter, | 74 net::URLRequestContextGetter* context_getter, |
64 bool filter_profanities, | 75 bool filter_profanities, |
65 const std::string& hardware_info, | 76 const std::string& hardware_info, |
66 const std::string& origin_url) { | 77 const std::string& origin_url) { |
67 return new speech::SpeechRecognizerImpl( | 78 speech::GoogleOneShotRemoteEngineConfig google_sr_config; |
68 listener, caller_id, language, grammar, context_getter, | 79 google_sr_config.language = language; |
69 filter_profanities, hardware_info, origin_url); | 80 google_sr_config.grammar = grammar; |
| 81 google_sr_config.audio_sample_rate = |
| 82 speech::SpeechRecognizerImpl::kAudioSampleRate; |
| 83 google_sr_config.audio_num_bits_per_sample = |
| 84 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; |
| 85 google_sr_config.filter_profanities = filter_profanities; |
| 86 google_sr_config.hardware_info = hardware_info; |
| 87 google_sr_config.origin_url = origin_url; |
| 88 |
| 89 speech::GoogleOneShotRemoteEngine* google_sr_engine = |
| 90 new speech::GoogleOneShotRemoteEngine(context_getter); |
| 91 google_sr_engine->SetConfiguration(google_sr_config); |
| 92 |
| 93 return new speech::SpeechRecognizerImpl(listener, |
| 94 caller_id, |
| 95 google_sr_engine); |
70 } | 96 } |
71 | 97 |
72 namespace speech { | 98 namespace speech { |
73 | |
74 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 99 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
75 const int SpeechRecognizerImpl::kAudioPacketIntervalMs = 100; | |
76 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; | 100 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; |
77 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 101 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
78 const int SpeechRecognizerImpl::kNoSpeechTimeoutSec = 8; | 102 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
79 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 103 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
80 | 104 |
81 SpeechRecognizerImpl::SpeechRecognizerImpl( | 105 SpeechRecognizerImpl::SpeechRecognizerImpl( |
82 SpeechRecognitionEventListener* listener, | 106 SpeechRecognitionEventListener* listener, |
83 int caller_id, | 107 int caller_id, |
84 const std::string& language, | 108 SpeechRecognitionEngine* engine) |
85 const std::string& grammar, | |
86 net::URLRequestContextGetter* context_getter, | |
87 bool filter_profanities, | |
88 const std::string& hardware_info, | |
89 const std::string& origin_url) | |
90 : listener_(listener), | 109 : listener_(listener), |
| 110 testing_audio_manager_(NULL), |
| 111 recognition_engine_(engine), |
| 112 endpointer_(kAudioSampleRate), |
91 caller_id_(caller_id), | 113 caller_id_(caller_id), |
92 language_(language), | 114 event_dispatch_nesting_level_(0), |
93 grammar_(grammar), | 115 state_(kIdle), |
94 filter_profanities_(filter_profanities), | 116 event_args_(NULL) { |
95 hardware_info_(hardware_info), | 117 DCHECK(listener_ != NULL); |
96 origin_url_(origin_url), | 118 DCHECK(recognition_engine_ != NULL); |
97 context_getter_(context_getter), | |
98 codec_(AudioEncoder::CODEC_FLAC), | |
99 encoder_(NULL), | |
100 endpointer_(kAudioSampleRate), | |
101 num_samples_recorded_(0), | |
102 audio_level_(0.0f), | |
103 audio_manager_(NULL) { | |
104 endpointer_.set_speech_input_complete_silence_length( | 119 endpointer_.set_speech_input_complete_silence_length( |
105 base::Time::kMicrosecondsPerSecond / 2); | 120 base::Time::kMicrosecondsPerSecond / 2); |
106 endpointer_.set_long_speech_input_complete_silence_length( | 121 endpointer_.set_long_speech_input_complete_silence_length( |
107 base::Time::kMicrosecondsPerSecond); | 122 base::Time::kMicrosecondsPerSecond); |
108 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 123 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
109 endpointer_.StartSession(); | 124 endpointer_.StartSession(); |
| 125 recognition_engine_->set_delegate(this); |
110 } | 126 } |
111 | 127 |
112 SpeechRecognizerImpl::~SpeechRecognizerImpl() { | 128 SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
113 // Recording should have stopped earlier due to the endpointer or | |
114 // |StopRecording| being called. | |
115 DCHECK(!audio_controller_.get()); | |
116 DCHECK(!request_.get() || !request_->HasPendingRequest()); | |
117 DCHECK(!encoder_.get()); | |
118 endpointer_.EndSession(); | 129 endpointer_.EndSession(); |
119 } | 130 } |
120 | 131 |
121 bool SpeechRecognizerImpl::StartRecognition() { | 132 // ------- Methods that trigger Finite State Machine (FSM) events ------------ |
| 133 |
| 134 // NOTE: all external events and requests should be enqueued (via PostTask), |
| 135 // even if they come from the same (IO) thread, in order to preserve the |
| 136 // causal ordering of events. |
| 137 // Imagine what would happen if a Start had been enqueued from another thread |
| 138 // (but not yet processed) and we suddenly issued a Stop from the IO thread. |
| 139 // Furthermore, even if start and stop requests are never interleaved, mixing |
| 140 // asynchronous event processing with synchronous callbacks can cause very |
| 141 // surprising side effects. |
| 142 // For instance, if a caller could invoke Abort synchronously (instead of |
| 143 // posting the event on the queue), it would receive interleaved callbacks |
| 144 // (e.g. an error or the audio-end event) before the Abort call itself had |
| 145 // returned. Is your (caller) code ready for this? |
| 146 |
| 147 void SpeechRecognizerImpl::StartRecognition() { |
| 148 FSMEventArgs args; |
| 149 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 150 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 151 this, kStartRequest, args)); |
| 152 } |
| 153 |
| 154 void SpeechRecognizerImpl::AbortRecognition() { |
| 155 FSMEventArgs args; |
| 156 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 157 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 158 this, kAbortRequest, args)); |
| 159 } |
| 160 |
| 161 void SpeechRecognizerImpl::StopAudioCapture() { |
| 162 FSMEventArgs args; |
| 163 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 164 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 165 this, kStopCaptureRequest, args)); |
| 166 } |
| 167 |
| 168 bool SpeechRecognizerImpl::IsActive() const { |
| 169 // Checking the FSM state from another thread (that is, while the FSM is |
| 170 // potentially evolving concurrently) is meaningless. |
| 171 // If you find yourself doing it, you probably have a design issue. |
122 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 172 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
123 DCHECK(!audio_controller_.get()); | 173 return state_ != kIdle; |
124 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 174 } |
125 DCHECK(!encoder_.get()); | 175 |
126 | 176 bool SpeechRecognizerImpl::IsCapturingAudio() const { |
127 // The endpointer needs to estimate the environment/background noise before | 177 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
128 // starting to treat the audio as user input. In |HandleOnData| we wait until | 178 return state_ >= kStartingRecognition && state_ <= kRecognizingSpeech; |
129 // such time has passed before switching to user input mode. | 179 } |
130 endpointer_.SetEnvironmentEstimationMode(); | 180 |
131 | 181 // Invoked in the audio thread. |
132 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, | 182 void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
133 kNumBitsPerAudioSample)); | 183 int error_code) { |
134 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 184 FSMEventArgs args; |
| 185 args.audio_error_code = error_code; |
| 186 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 187 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 188 this, kAudioError, args)); |
| 189 } |
| 190 |
| 191 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
| 192 const uint8* data, uint32 size) { |
| 193 if (size == 0) // This can legitimately happen when audio capture stops. |
| 194 return; |
| 195 |
| 196 FSMEventArgs args; |
| 197 args.audio_data = new AudioChunk(data, static_cast<size_t>(size), |
| 198 kNumBitsPerAudioSample / 8); |
| 199 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 200 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 201 this, kAudioData, args)); |
| 202 } |
| 203 |
| 204 void SpeechRecognizerImpl::OnSpeechEngineResult( |
| 205 const content::SpeechRecognitionResult& result) { |
| 206 FSMEvent event = kRecognitionResult; |
| 207 FSMEventArgs args; |
| 208 args.speech_result = result; |
| 209 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 210 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 211 this, event, args)); |
| 212 } |
| 213 |
| 214 void SpeechRecognizerImpl::OnSpeechEngineError( |
| 215 const content::SpeechRecognitionError& error) { |
| 216 FSMEvent event = kRecognitionError; |
| 217 FSMEventArgs args; |
| 218 args.error = error; |
| 219 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 220 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 221 this, event, args)); |
| 222 } |
| 223 |
| 224 // ----------------------- Core FSM implementation --------------------------- |
| 225 |
| 226 void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) { |
| 227 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 228 DCHECK_LE(event, kMaxEvent); |
| 229 DCHECK_LE(state_, kMaxState); |
| 230 // Event dispatching must be sequential; otherwise it would break the rules |
| 231 // and assumptions of the finite state machine model. |
| 232 DCHECK_EQ(event_dispatch_nesting_level_, 0); |
| 233 ++event_dispatch_nesting_level_; |
| 234 // Guard against the delegate freeing us until we finish processing the event. |
| 235 scoped_refptr<SpeechRecognizerImpl> me(this); |
| 236 |
| 237 event_ = event; |
| 238 event_args_ = &args; |
| 239 |
| 240 if (event == kAudioData) |
| 241 ProcessAudioPipeline(); |
| 242 // The audio pipeline must be processed before ProcessEvent, otherwise it |
| 243 // would act on the future state instead of the current one. |
| 244 state_ = ProcessEvent(event); |
| 245 |
| 246 // Clean up event args. |
| 247 if (args.audio_data) |
| 248 delete args.audio_data; |
| 249 event_args_ = NULL; |
| 250 --event_dispatch_nesting_level_; |
| 251 } |
| 252 |
| 253 // ----------- Contract for all the FSM evolution functions below ------------- |
| 254 // - They are guaranteed to be executed on the IO thread; |
| 255 // - They are guaranteed not to be reentrant (with themselves or each other); |
| 256 // - event_args_ is guaranteed to be non-NULL; |
| 257 // - event_args_ members are guaranteed to be stable during the call; |
| 258 // - The class won't be freed in the meantime due to callbacks. |
| 259 |
| 260 // TODO(primiano) the audio pipeline is currently serial. However, the |
| 261 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. |
| 262 // We should profile the execution to see whether it would be worthwhile. |
| 263 void SpeechRecognizerImpl::ProcessAudioPipeline() { |
| 264 const bool always = true; |
| 265 const bool route_audio_to_clipper = always; |
| 266 const bool route_audio_to_endpointer = state_ >= kEstimatingEnvironment && |
| 267 state_ <= kRecognizingSpeech; |
| 268 const bool route_audio_to_sr_engine = route_audio_to_endpointer; |
| 269 const bool route_audio_to_vumeter = state_ >= kWaitingForSpeech && |
| 270 state_ <= kRecognizingSpeech; |
| 271 |
| 272 AudioChunk& recorded_audio_data = *(event_args_->audio_data); |
| 273 |
| 274 num_samples_recorded_ += recorded_audio_data.NumSamples(); |
| 275 |
| 276 if (route_audio_to_clipper) { |
| 277 clipper_detected_clip_ = DetectClipping(recorded_audio_data); |
| 278 } |
| 279 if (route_audio_to_endpointer) { |
| 280 endpointer_.ProcessAudio(recorded_audio_data, &rms_); |
| 281 } |
| 282 if (route_audio_to_vumeter) { |
| 283 DCHECK(route_audio_to_endpointer); // Depends on endpointer due to |rms_|. |
| 284 UpdateSignalAndNoiseLevels(rms_); |
| 285 } |
| 286 if (route_audio_to_sr_engine) { |
| 287 DCHECK(recognition_engine_.get()); |
| 288 recognition_engine_->TakeAudioChunk(recorded_audio_data); |
| 289 } |
| 290 } |
| 291 |
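| // Typical flow through the FSM for a successful recognition: |
| // kIdle -> kStartingRecognition -> kEstimatingEnvironment -> |
| // kWaitingForSpeech -> kRecognizingSpeech -> kWaitingFinalResult -> kIdle. |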
| 292 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessEvent( |
| 293 FSMEvent event) { |
| 294 switch (state_) { |
| 295 case kIdle: |
| 296 switch (event) { |
| 297 // TODO(primiano) restore UNREACHABLE_CONDITION() here when speech |
| 298 // input extensions are fixed. |
| 299 case kAbortRequest: return DoNothing(); //UNREACHABLE_CONDITION(); |
| 300 case kStartRequest: return InitializeAndStartRecording(); |
| 301 case kStopCaptureRequest: return DoNothing(); //UNREACHABLE_CONDITION(); |
| 302 case kAudioData: return DoNothing(); // Corner cases related to |
| 303 case kRecognitionResult: return DoNothing(); // queued messages being |
| 304 case kRecognitionError: return DoNothing(); // dispatched late. |
| 305 case kAudioError: return DoNothing(); |
| 306 } |
| 307 break; |
| 308 case kStartingRecognition: |
| 309 switch (event) { |
| 310 case kAbortRequest: return Abort(); |
| 311 case kStartRequest: UNREACHABLE_CONDITION(); |
| 312 case kStopCaptureRequest: return Abort(); |
| 313 case kAudioData: return StartSpeechRecognition(); |
| 314 case kRecognitionResult: UNREACHABLE_CONDITION(); |
| 315 case kRecognitionError: return Abort(); |
| 316 case kAudioError: return Abort(); |
| 317 } |
| 318 break; |
| 319 case kEstimatingEnvironment: |
| 320 switch (event) { |
| 321 case kAbortRequest: return Abort(); |
| 322 case kStartRequest: UNREACHABLE_CONDITION(); |
| 323 case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
| 324 case kAudioData: return EnvironmentEstimation(); |
| 325 case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
| 326 case kRecognitionError: return Abort(); |
| 327 case kAudioError: return Abort(); |
| 328 } |
| 329 break; |
| 330 case kWaitingForSpeech: |
| 331 switch (event) { |
| 332 case kAbortRequest: return Abort(); |
| 333 case kStartRequest: UNREACHABLE_CONDITION(); |
| 334 case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
| 335 case kAudioData: return DetectUserSpeechOrTimeout(); |
| 336 case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
| 337 case kRecognitionError: return Abort(); |
| 338 case kAudioError: return Abort(); |
| 339 } |
| 340 break; |
| 341 case kRecognizingSpeech: |
| 342 switch (event) { |
| 343 case kAbortRequest: return Abort(); |
| 344 case kStartRequest: UNREACHABLE_CONDITION(); |
| 345 case kStopCaptureRequest: return StopCaptureAndWaitForResult(); |
| 346 case kAudioData: return DetectEndOfSpeech(); |
| 347 case kRecognitionResult: return ProcessIntermediateRecognitionResult(); |
| 348 case kRecognitionError: return Abort(); |
| 349 case kAudioError: return Abort(); |
| 350 } |
| 351 break; |
| 352 case kWaitingFinalResult: |
| 353 switch (event) { |
| 354 case kAbortRequest: return Abort(); |
| 355 case kStartRequest: UNREACHABLE_CONDITION(); |
| 356 case kStopCaptureRequest: return DoNothing(); |
| 357 case kAudioData: return DoNothing(); |
| 358 case kRecognitionResult: return ProcessFinalRecognitionResult(); |
| 359 case kRecognitionError: return Abort(); |
| 360 case kAudioError: return Abort(); |
| 361 } |
| 362 break; |
| 363 } |
| 364 UNREACHABLE_CONDITION(); |
| 365 } |
| 366 |
| 367 SpeechRecognizerImpl::FSMState |
| 368 SpeechRecognizerImpl::InitializeAndStartRecording() { |
| 369 DCHECK(recognition_engine_.get()); |
| 370 DCHECK(audio_controller_.get() == NULL); |
| 371 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
| 372 testing_audio_manager_ : |
| 373 BrowserMainLoop::GetAudioManager(); |
| 374 DCHECK(audio_manager != NULL); |
| 375 |
| 376 VLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
| 377 num_samples_recorded_ = 0; |
| 378 rms_ = 0; |
| 379 audio_level_ = 0; |
| 380 clipper_detected_clip_ = false; |
| 381 listener_->OnRecognitionStart(caller_id_); |
| 382 |
| 383 if (!audio_manager->HasAudioInputDevices()) { |
| 384 return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| 385 content::AUDIO_ERROR_NO_MIC)); |
| 386 } |
| 387 |
| 388 if (audio_manager->IsRecordingInProcess()) { |
| 389 return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| 390 content::AUDIO_ERROR_MIC_IN_USE)); |
| 391 } |
| 392 |
| 393 const int samples_per_packet = kAudioSampleRate * |
| 394 recognition_engine_->GetDesiredAudioChunkDurationMs() / 1000; |
135 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, | 395 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
136 kAudioSampleRate, kNumBitsPerAudioSample, | 396 kAudioSampleRate, kNumBitsPerAudioSample, |
137 samples_per_packet); | 397 samples_per_packet); |
138 audio_controller_ = AudioInputController::Create( | 398 audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
139 audio_manager_ ? audio_manager_ : BrowserMainLoop::GetAudioManager(), | 399 |
140 this, params); | 400 if (audio_controller_.get() == NULL) { |
141 DCHECK(audio_controller_.get()); | 401 return Abort( |
142 VLOG(1) << "SpeechRecognizer starting record."; | 402 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
143 num_samples_recorded_ = 0; | 403 } |
| 404 |
| 405 // The endpointer needs to estimate the environment/background noise before |
| 406 // starting to treat the audio as user input. We wait in the |
| 407 // kEstimatingEnvironment state until kEndpointerEstimationTimeMs has |
| 408 // elapsed, then switch to user input mode. |
| 409 endpointer_.SetEnvironmentEstimationMode(); |
144 audio_controller_->Record(); | 410 audio_controller_->Record(); |
145 | 411 return kStartingRecognition; |
146 return true; | 412 } |
147 } | 413 |
148 | 414 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartSpeechRecognition() { |
149 void SpeechRecognizerImpl::AbortRecognition() { | 415 // This was the first audio packet recorded, so start a request to the |
150 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 416 // engine to send the data and inform the listener. |
151 DCHECK(audio_controller_.get() || request_.get()); | 417 DCHECK(recognition_engine_.get()); |
152 | 418 recognition_engine_->Initialize(); |
153 // Stop recording if required. | 419 listener_->OnAudioStart(caller_id_); |
154 if (audio_controller_.get()) { | 420 // TODO(primiano) this is a little hack, since TakeAudioChunk() is already |
| 421 // called by ProcessAudioPipeline(). I hate it since it weakens the |
| 422 // architectural beauty of this class. But it is the best tradeoff, unless we |
| 423 // allow dropping the first audio chunk captured after opening the audio device. |
| 424 recognition_engine_->TakeAudioChunk(*(event_args_->audio_data)); |
| 425 return kEstimatingEnvironment; |
| 426 } |
| 427 |
| 428 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::EnvironmentEstimation() { |
| 429 DCHECK(endpointer_.IsEstimatingEnvironment()); |
| 430 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { |
| 431 endpointer_.SetUserInputMode(); |
| 432 listener_->OnEnvironmentEstimationComplete(caller_id_); |
| 433 return kWaitingForSpeech; |
| 434 } else { |
| 435 return kEstimatingEnvironment; |
| 436 } |
| 437 } |
| 438 |
| 439 SpeechRecognizerImpl::FSMState |
| 440 SpeechRecognizerImpl::DetectUserSpeechOrTimeout() { |
| 441 if (skipSilenceDetectionForTesting) |
| 442 return kRecognizingSpeech; |
| 443 |
| 444 if (endpointer_.DidStartReceivingSpeech()) { |
| 445 listener_->OnSoundStart(caller_id_); |
| 446 return kRecognizingSpeech; |
| 447 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { |
| 448 return Abort( |
| 449 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); |
| 450 } else { |
| 451 return kWaitingForSpeech; |
| 452 } |
| 453 } |
| 454 |
| 455 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech() { |
| 456 if (endpointer_.speech_input_complete()) { |
| 457 return StopCaptureAndWaitForResult(); |
| 458 } else { |
| 459 return kRecognizingSpeech; |
| 460 } |
| 461 } |
| 462 |
| 463 SpeechRecognizerImpl::FSMState |
| 464 SpeechRecognizerImpl::StopCaptureAndWaitForResult() { |
| 465 DCHECK(state_ >= kEstimatingEnvironment && state_ <= kRecognizingSpeech); |
| 466 |
| 467 VLOG(1) << "Concluding recognition"; |
| 468 CloseAudioControllerSynchronously(); |
| 469 recognition_engine_->AudioChunksEnded(); |
| 470 |
| 471 if (state_ > kWaitingForSpeech) |
| 472 listener_->OnSoundEnd(caller_id_); |
| 473 |
| 474 listener_->OnAudioEnd(caller_id_); |
| 475 return kWaitingFinalResult; |
| 476 } |
| 477 |
| 478 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort() { |
| 479 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the absence |
| 480 // of other specific error sources (i.e., on an explicit abort request). |
| 481 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in the UI layers |
| 482 // and currently would cause an exception. JS will probably need it in future. |
| 483 SpeechRecognitionError error; |
| 484 bool has_error = false; |
| 485 if (event_ == kAudioError) { |
| 486 has_error = true; |
| 487 error.code = content::SPEECH_RECOGNITION_ERROR_AUDIO; |
| 488 } else if (event_ == kRecognitionError) { |
| 489 has_error = true; |
| 490 error = event_args_->error; |
| 491 } |
| 492 return Abort(has_error, error); |
| 493 } |
| 494 |
| 495 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
| 496 const SpeechRecognitionError& error) { |
| 497 return Abort(true, error); |
| 498 } |
| 499 |
| 500 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
| 501 bool has_error, const SpeechRecognitionError& error) { |
| 502 if (audio_controller_) |
155 CloseAudioControllerSynchronously(); | 503 CloseAudioControllerSynchronously(); |
156 } | 504 |
157 | 505 VLOG(1) << "SpeechRecognizerImpl canceling recognition. " << |
158 VLOG(1) << "SpeechRecognizer canceling recognition."; | 506 error.code << " " << error.details; |
159 encoder_.reset(); | 507 |
160 request_.reset(); | 508 // The recognition engine is initialized only after kStartingRecognition. |
161 } | 509 if (state_ > kStartingRecognition) { |
162 | 510 DCHECK(recognition_engine_.get()); |
163 void SpeechRecognizerImpl::StopAudioCapture() { | 511 recognition_engine_->Cleanup(); |
164 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 512 // TODO(primiano) Reset the engine? Why, after all? |
165 | 513 // recognition_engine_.reset(); |
166 // If audio recording has already stopped and we are in recognition phase, | 514 } |
167 // silently ignore any more calls to stop recording. | 515 |
168 if (!audio_controller_.get()) | 516 if (state_ > kWaitingForSpeech && state_ < kWaitingFinalResult) |
169 return; | 517 listener_->OnSoundEnd(caller_id_); |
170 | 518 |
171 CloseAudioControllerSynchronously(); | 519 if (state_ > kStartingRecognition && state_ < kWaitingFinalResult) |
172 | 520 listener_->OnAudioEnd(caller_id_); |
173 listener_->OnSoundEnd(caller_id_); | 521 |
174 listener_->OnAudioEnd(caller_id_); | 522 if (has_error) |
175 | 523 listener_->OnRecognitionError(caller_id_, error); |
176 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet | 524 |
177 // of silence in case encoder had no data already. | 525 listener_->OnRecognitionEnd(caller_id_); |
178 std::vector<short> samples((kAudioSampleRate * kAudioPacketIntervalMs) / | 526 |
179 1000); | 527 return kIdle; |
180 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]), | 528 } |
181 samples.size() * sizeof(short), | 529 |
182 encoder_->bits_per_sample() / 8); | 530 SpeechRecognizerImpl::FSMState |
183 encoder_->Encode(dummy_chunk); | 531 SpeechRecognizerImpl::ProcessIntermediateRecognitionResult() { |
184 encoder_->Flush(); | 532 // This is in preparation for future speech recognition features. |
185 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); | 533 // DCHECK(continuous_mode_); |
186 DCHECK(!encoded_data->IsEmpty()); | 534 // const SpeechRecognitionResult& result = event_args_->speech_result; |
187 encoder_.reset(); | 535 // VLOG(1) << "Got intermediate result"; |
188 | 536 // listener_->OnRecognitionResult(caller_id_, result); |
189 // If we haven't got any audio yet end the recognition sequence here. | 537 NOTREACHED(); |
190 if (request_ == NULL) { | 538 return state_; |
191 // Guard against the listener freeing us until we finish our job. | 539 } |
192 scoped_refptr<SpeechRecognizerImpl> me(this); | 540 |
193 listener_->OnRecognitionEnd(caller_id_); | 541 SpeechRecognizerImpl::FSMState |
194 } else { | 542 SpeechRecognizerImpl::ProcessFinalRecognitionResult() { |
195 request_->UploadAudioChunk(*encoded_data, true /* is_last_chunk */); | 543 const SpeechRecognitionResult& result = event_args_->speech_result; |
196 } | 544 VLOG(1) << "Got valid result"; |
197 } | 545 recognition_engine_->Cleanup(); |
198 | |
199 // Invoked in the audio thread. | |
200 void SpeechRecognizerImpl::OnError(AudioInputController* controller, | |
201 int error_code) { | |
202 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
203 base::Bind(&SpeechRecognizerImpl::HandleOnError, | |
204 this, error_code)); | |
205 } | |
206 | |
207 void SpeechRecognizerImpl::HandleOnError(int error_code) { | |
208 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; | |
209 | |
210 // Check if we are still recording before canceling recognition, as | |
211 // recording might have been stopped after this error was posted to the queue | |
212 // by |OnError|. | |
213 if (!audio_controller_.get()) | |
214 return; | |
215 | |
216 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); | |
217 } | |
218 | |
219 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | |
220 const uint8* data, uint32 size) { | |
221 if (size == 0) // This could happen when recording stops and is normal. | |
222 return; | |
223 AudioChunk* raw_audio = new AudioChunk(data, static_cast<size_t>(size), | |
224 kNumBitsPerAudioSample / 8); | |
225 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
226 base::Bind(&SpeechRecognizerImpl::HandleOnData, | |
227 this, raw_audio)); | |
228 } | |
229 | |
230 void SpeechRecognizerImpl::HandleOnData(AudioChunk* raw_audio) { | |
231 scoped_ptr<AudioChunk> free_raw_audio_on_return(raw_audio); | |
232 // Check if we are still recording and if not discard this buffer, as | |
233 // recording might have been stopped after this buffer was posted to the queue | |
234 // by |OnData|. | |
235 if (!audio_controller_.get()) | |
236 return; | |
237 | |
238 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); | |
239 | |
240 encoder_->Encode(*raw_audio); | |
241 float rms; | |
242 endpointer_.ProcessAudio(*raw_audio, &rms); | |
243 bool did_clip = DetectClipping(*raw_audio); | |
244 num_samples_recorded_ += raw_audio->NumSamples(); | |
245 | |
246 if (request_ == NULL) { | |
247 // This was the first audio packet recorded, so start a request to the | |
248 // server to send the data and inform the listener. | |
249 listener_->OnAudioStart(caller_id_); | |
250 request_.reset(new SpeechRecognitionRequest(context_getter_.get(), this)); | |
251 request_->Start(language_, grammar_, filter_profanities_, | |
252 hardware_info_, origin_url_, encoder_->mime_type()); | |
253 } | |
254 | |
255 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); | |
256 DCHECK(!encoded_data->IsEmpty()); | |
257 request_->UploadAudioChunk(*encoded_data, false /* is_last_chunk */); | |
258 | |
259 if (endpointer_.IsEstimatingEnvironment()) { | |
260 // Check if we have gathered enough audio for the endpointer to do | |
261 // environment estimation and should move on to detect speech/end of speech. | |
262 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | |
263 kAudioSampleRate) / 1000) { | |
264 endpointer_.SetUserInputMode(); | |
265 listener_->OnEnvironmentEstimationComplete(caller_id_); | |
266 } | |
267 return; // No more processing since we are still estimating environment. | |
268 } | |
269 | |
270 // Check if we have waited too long without hearing any speech. | |
271 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); | |
272 if (!speech_was_heard_after_packet && | |
273 num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) { | |
274 InformErrorAndAbortRecognition( | |
275 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); | |
276 return; | |
277 } | |
278 | |
279 if (!speech_was_heard_before_packet && speech_was_heard_after_packet) | |
280 listener_->OnSoundStart(caller_id_); | |
281 | |
282 // Calculate the input volume to display in the UI, smoothing towards the | |
283 // new level. | |
284 float level = (rms - kAudioMeterMinDb) / | |
285 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
286 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); | |
287 if (level > audio_level_) { | |
288 audio_level_ += (level - audio_level_) * kUpSmoothingFactor; | |
289 } else { | |
290 audio_level_ += (level - audio_level_) * kDownSmoothingFactor; | |
291 } | |
292 | |
293 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | |
294 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
295 noise_level = std::min(std::max(0.0f, noise_level), | |
296 kAudioMeterRangeMaxUnclipped); | |
297 | |
298 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, | |
299 noise_level); | |
300 | |
301 if (endpointer_.speech_input_complete()) | |
302 StopAudioCapture(); | |
303 } | |
304 | |
305 void SpeechRecognizerImpl::SetRecognitionResult( | |
306 const content::SpeechRecognitionResult& result) { | |
307 if (result.error != content::SPEECH_RECOGNITION_ERROR_NONE) { | |
308 InformErrorAndAbortRecognition(result.error); | |
309 return; | |
310 } | |
311 | |
312 // Guard against the listener freeing us until we finish our job. | |
313 scoped_refptr<SpeechRecognizerImpl> me(this); | |
314 listener_->OnRecognitionResult(caller_id_, result); | 546 listener_->OnRecognitionResult(caller_id_, result); |
315 listener_->OnRecognitionEnd(caller_id_); | 547 listener_->OnRecognitionEnd(caller_id_); |
316 } | 548 return kIdle; |
317 | 549 } |
318 void SpeechRecognizerImpl::InformErrorAndAbortRecognition( | 550 |
319 content::SpeechRecognitionErrorCode error) { | 551 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DoNothing() const { |
320 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); | 552 return state_; // Just keep the current state. |
321 AbortRecognition(); | |
322 | |
323 // Guard against the listener freeing us until we finish our job. | |
324 scoped_refptr<SpeechRecognizerImpl> me(this); | |
325 listener_->OnRecognitionError(caller_id_, error); | |
326 } | 553 } |
327 | 554 |
328 void SpeechRecognizerImpl::CloseAudioControllerSynchronously() { | 555 void SpeechRecognizerImpl::CloseAudioControllerSynchronously() { |
329 VLOG(1) << "SpeechRecognizer stopping record."; | 556 DCHECK(audio_controller_); |
| 557 VLOG(1) << "SpeechRecognizerImpl stopping audio capture."; |
330 | 558 |
331 // TODO(satish): investigate the possibility of utilizing the closure | 559 // TODO(satish): investigate the possibility of utilizing the closure |
332 // and switching to the async version of this method. Compare with how | 560 // and switching to the async version of this method. Compare with how |
333 // it's done in e.g. the AudioRendererHost. | 561 // it's done in e.g. the AudioRendererHost. |
334 base::WaitableEvent closed_event(true, false); | 562 base::WaitableEvent closed_event(true, false); |
335 audio_controller_->Close(base::Bind(&base::WaitableEvent::Signal, | 563 audio_controller_->Close(base::Bind(&base::WaitableEvent::Signal, |
336 base::Unretained(&closed_event))); | 564 base::Unretained(&closed_event))); |
337 closed_event.Wait(); | 565 closed_event.Wait(); |
338 audio_controller_ = NULL; // Releases the ref ptr. | 566 audio_controller_ = NULL; // Releases the ref ptr. |
339 } | 567 } |
340 | 568 |
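| // e.g. 4800 recorded samples at 16 kHz -> 4800 * 1000 / 16000 = 300 ms, |
| // which is exactly the kEndpointerEstimationTimeMs window. |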
| 569 int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
| 570 return num_samples_recorded_ * 1000 / kAudioSampleRate; |
| 571 } |
| 572 |
| 573 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms) { |
| 574 // Calculate the input volume to display in the UI, smoothing towards the |
| 575 // new level. |
| 576 // TODO(primiano) Do we really need all this floating-point arithmetic here? |
| 577 // It might be quite expensive on mobile. |
| 578 float level = (rms - kAudioMeterMinDb) / |
| 579 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 580 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
| 581 if (level > audio_level_) { |
| 582 audio_level_ += (level - audio_level_) * kUpSmoothingFactor; |
| 583 } else { |
| 584 audio_level_ += (level - audio_level_) * kDownSmoothingFactor; |
| 585 } |
| 586 |
| 587 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| 588 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 589 noise_level = std::min(std::max(0.0f, noise_level), |
| 590 kAudioMeterRangeMaxUnclipped); |
| 591 |
| 592 listener_->OnAudioLevelsChange( |
| 593 caller_id_, clipper_detected_clip_ ? 1.0f : audio_level_, noise_level); |
| 594 } |
| 595 |
| 596 const SpeechRecognitionEngine& |
| 597 SpeechRecognizerImpl::recognition_engine() const { |
| 598 return *(recognition_engine_.get()); |
| 599 } |
| 600 |
341 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 601 void SpeechRecognizerImpl::SetAudioManagerForTesting( |
342 AudioManager* audio_manager) { | 602 AudioManager* audio_manager) { |
343 audio_manager_ = audio_manager; | 603 testing_audio_manager_ = audio_manager; |
344 } | 604 } |
345 | 605 |
346 bool SpeechRecognizerImpl::IsActive() const { | 606 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs() |
347 return (request_.get() != NULL); | 607 : audio_error_code(0), audio_data(NULL) { |
348 } | |
349 | |
350 bool SpeechRecognizerImpl::IsCapturingAudio() const { | |
351 return (audio_controller_.get() != NULL); | |
352 } | 608 } |
353 | 609 |
354 } // namespace speech | 610 } // namespace speech |