content/browser/speech/speech_recognizer_impl.cc - Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3)

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Rebased from master. Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« content/browser/speech/speech_recognizer_impl.h ('K') | « content/browser/speech/speech_recognizer_impl.h ('k') | content/browser/speech/speech_recognizer_impl_unittest.cc » ('j') | content/browser/speech/speech_recognizer_impl_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognizer_impl.h"	5 #include "content/browser/speech/speech_recognizer_impl.h"

6	6

7 #include "base/bind.h"	7 #include "base/bind.h"

8 #include "base/time.h"	8 #include "base/time.h"

9 #include "content/browser/browser_main_loop.h"	9 #include "content/browser/browser_main_loop.h"

10 #include "content/browser/speech/audio_buffer.h"	10 #include "content/browser/speech/audio_buffer.h"

	11 #include "content/browser/speech/google_ssfe_remote_engine.h"

	12 #include "content/public/browser/browser_thread.h"

11 #include "content/public/browser/speech_recognition_event_listener.h"	13 #include "content/public/browser/speech_recognition_event_listener.h"

12 #include "content/public/browser/browser_thread.h"	14 #include "content/public/browser/speech_recognizer.h"

13 #include "content/public/common/speech_recognition_result.h"	15 #include "content/public/common/speech_recognition_result.h"

14 #include "net/url_request/url_request_context_getter.h"	16 #include "net/url_request/url_request_context_getter.h"

15	17

	18 #define UNREACHABLE_CONDITION() do{ NOTREACHED(); return state_; } while(0)
	hans 2012/03/16 11:12:56: ultra nit: there should be a space between the "do" and { — Primiano Tucci (use gerrit) 2012/03/16 15:03:42: On 2012/03/16 11:12:56, hans wrote: > ultra nit: there should be a space between the "do" and { — Done.
	19

16 using content::BrowserMainLoop;	20 using content::BrowserMainLoop;

17 using content::BrowserThread;	21 using content::BrowserThread;

	22 using content::SpeechRecognitionError;

18 using content::SpeechRecognitionEventListener;	23 using content::SpeechRecognitionEventListener;

	24 using content::SpeechRecognitionResult;

19 using content::SpeechRecognizer;	25 using content::SpeechRecognizer;

20 using media::AudioInputController;	26 using media::AudioInputController;

21 using std::string;

22	27

	28 // TODO(primiano) what about a watchdog here to avoid getting stuck if the

	29 // SpeechRecognitionEngine does not deliver a result (in reasonable time)?

23 namespace {	30 namespace {

24	31 // Enables spontaneous transition from WaitingForSpeech to RecognizingSpeech,

	32 // which is required for the mock recognition engine which sends fake results.

	33 const bool skipSilenceDetectionForTesting = false;

25 // The following constants are related to the volume level indicator shown in	34 // The following constants are related to the volume level indicator shown in

26 // the UI for recorded audio.	35 // the UI for recorded audio.

27 // Multiplier used when new volume is greater than previous level.	36 // Multiplier used when new volume is greater than previous level.

28 const float kUpSmoothingFactor = 1.0f;	37 const float kUpSmoothingFactor = 1.0f;

29 // Multiplier used when new volume is lesser than previous level.	38 // Multiplier used when new volume is lesser than previous level.

30 const float kDownSmoothingFactor = 0.7f;	39 const float kDownSmoothingFactor = 0.7f;

31 // RMS dB value of a maximum (unclipped) sine wave for int16 samples.	40 // RMS dB value of a maximum (unclipped) sine wave for int16 samples.

32 const float kAudioMeterMaxDb = 90.31f;	41 const float kAudioMeterMaxDb = 90.31f;

33 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.	42 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.

34 // Values lower than this will display as empty level-meter.	43 // Values lower than this will display as empty level-meter.

35 const float kAudioMeterMinDb = 30.0f;	44 const float kAudioMeterMinDb = 30.0f;

36 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;	45 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

37	46

38 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.)	47 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.)

39 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;	48 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

40	49

41 // Returns true if more than 5% of the samples are at min or max value.	50 // Returns true if more than 5% of the samples are at min or max value.

42 bool DetectClipping(const speech::AudioChunk& chunk) {	51 bool DetectClipping(const speech::AudioChunk& chunk) {

43 const int num_samples = chunk.NumSamples();	52 const int num_samples = chunk.NumSamples();

44 const int16* samples = chunk.SamplesData16();	53 const int16* samples = chunk.SamplesData16();

45 const int kThreshold = num_samples / 20;	54 const int kThreshold = num_samples / 20;

46 int clipping_samples = 0;	55 int clipping_samples = 0;

	56

47 for (int i = 0; i < num_samples; ++i) {	57 for (int i = 0; i < num_samples; ++i) {

48 if (samples[i] <= -32767 || samples[i] >= 32767) {	58 if (samples[i] <= -32767 || samples[i] >= 32767) {

49 if (++clipping_samples > kThreshold)	59 if (++clipping_samples > kThreshold)

50 return true;	60 return true;

51 }	61 }

52 }	62 }

53 return false;	63 return false;

54 }	64 }

55	65

56 } // namespace	66 } // namespace

57	67

	68 // TODO(primiano) transitional, see description in speech_recognizer.h.

58 SpeechRecognizer* SpeechRecognizer::Create(	69 SpeechRecognizer* SpeechRecognizer::Create(

59 SpeechRecognitionEventListener* listener,	70 SpeechRecognitionEventListener* listener,

60 int caller_id,	71 int caller_id,

61 const std::string& language,	72 const std::string& language,

62 const std::string& grammar,	73 const std::string& grammar,

63 net::URLRequestContextGetter* context_getter,	74 net::URLRequestContextGetter* context_getter,

64 bool filter_profanities,	75 bool filter_profanities,

65 const std::string& hardware_info,	76 const std::string& hardware_info,

66 const std::string& origin_url) {	77 const std::string& origin_url) {

67 return new speech::SpeechRecognizerImpl(	78 speech::GoogleSSFERemoteEngineConfig google_sr_config;

68 listener, caller_id, language, grammar, context_getter,	79 google_sr_config.language = language;

69 filter_profanities, hardware_info, origin_url);	80 google_sr_config.grammar = grammar;

	81 google_sr_config.audio_sample_rate =

	82 speech::SpeechRecognizerImpl::kAudioSampleRate;

	83 google_sr_config.audio_num_bits_per_sample =

	84 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;

	85 google_sr_config.filter_profanities = filter_profanities;

	86 google_sr_config.hardware_info = hardware_info;

	87 google_sr_config.origin_url = origin_url;

	88

	89 speech::GoogleSSFERemoteEngine* google_sr_engine =

	90 new speech::GoogleSSFERemoteEngine(context_getter);

	91 google_sr_engine->SetConfiguration(google_sr_config);

	92

	93 return new speech::SpeechRecognizerImpl(listener,

	94 caller_id,

	95 google_sr_engine);

70 }	96 }

71	97

72 namespace speech {	98 namespace speech {

73

74 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;	99 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;

75 const int SpeechRecognizerImpl::kAudioPacketIntervalMs = 100;

76 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;	100 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;

77 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;	101 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;

78 const int SpeechRecognizerImpl::kNoSpeechTimeoutSec = 8;	102 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;

79 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;	103 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;

80	104

81 SpeechRecognizerImpl::SpeechRecognizerImpl(	105 SpeechRecognizerImpl::SpeechRecognizerImpl(

82 SpeechRecognitionEventListener* listener,	106 SpeechRecognitionEventListener* listener,

83 int caller_id,	107 int caller_id,

84 const std::string& language,	108 SpeechRecognitionEngine* engine)

85 const std::string& grammar,

86 net::URLRequestContextGetter* context_getter,

87 bool filter_profanities,

88 const std::string& hardware_info,

89 const std::string& origin_url)

90 : listener_(listener),	109 : listener_(listener),

	110 testing_audio_manager_(NULL),

	111 recognition_engine_(engine),

	112 endpointer_(kAudioSampleRate),

91 caller_id_(caller_id),	113 caller_id_(caller_id),

92 language_(language),	114 event_dispatch_nesting_level_(0),

93 grammar_(grammar),	115 state_(kIdle),

94 filter_profanities_(filter_profanities),	116 event_args_(NULL) {

95 hardware_info_(hardware_info),	117 DCHECK(listener_ != NULL);

96 origin_url_(origin_url),	118 DCHECK(recognition_engine_ != NULL);

97 context_getter_(context_getter),

98 codec_(AudioEncoder::CODEC_FLAC),

99 encoder_(NULL),

100 endpointer_(kAudioSampleRate),

101 num_samples_recorded_(0),

102 audio_level_(0.0f),

103 audio_manager_(NULL) {

104 endpointer_.set_speech_input_complete_silence_length(	119 endpointer_.set_speech_input_complete_silence_length(

105 base::Time::kMicrosecondsPerSecond / 2);	120 base::Time::kMicrosecondsPerSecond / 2);

106 endpointer_.set_long_speech_input_complete_silence_length(	121 endpointer_.set_long_speech_input_complete_silence_length(

107 base::Time::kMicrosecondsPerSecond);	122 base::Time::kMicrosecondsPerSecond);

108 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);	123 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);

109 endpointer_.StartSession();	124 endpointer_.StartSession();

	125 recognition_engine_->set_delegate(this);

110 }	126 }

111	127

112 SpeechRecognizerImpl::~SpeechRecognizerImpl() {	128 SpeechRecognizerImpl::~SpeechRecognizerImpl() {

113 // Recording should have stopped earlier due to the endpointer or

114 // |StopRecording| being called.

115 DCHECK(!audio_controller_.get());

116 DCHECK(!request_.get() || !request_->HasPendingRequest());

117 DCHECK(!encoder_.get());

118 endpointer_.EndSession();	129 endpointer_.EndSession();

119 }	130 }

120	131

121 bool SpeechRecognizerImpl::StartRecognition() {	132 // ------- Methods that trigger Finite State Machine (FSM) events ------------

	133

	134 // NOTE: all the external events and request should be enqueued (PostTask), even

	135 // if they come from the same (IO) thread, in order to preserve the relationship

	136 // of causalilty between events.
	hans 2012/03/16 11:12:56: s/causalilty/causality/ — Primiano Tucci (use gerrit) 2012/03/16 15:03:42: On 2012/03/16 11:12:56, hans wrote: > s/causalilty/causality/ — Done.
	137 // Imagine what would happen if a Start has been enqueued from another thread

	138 // (but not yet processed) and we suddenly issue a Stop from the IO thread.

	139 // Furthermore, even if you are sure to not interleave start and stop requests,

	140 // asynchronous event processing mixed with syncrhonous callback can cause very
	hans 2012/03/16 11:12:56: s/syncrhonous/synchronous/ — Primiano Tucci (use gerrit) 2012/03/16 15:03:42: On 2012/03/16 11:12:56, hans wrote: > s/syncrhonous/synchronous/ — Done.
	141 // mind-breaking side effects.

	142 // For instance, if someone could call Abort synchronously (instead of posting

	143 // the event on the queue), it will receive interleaved callbacks (e.g. an error

	144 // or the audio-end event) before the Abort call is effectively ended.

	145 // Is your (caller) code ready for this?

	146

	147 void SpeechRecognizerImpl::StartRecognition() {

	148 FSMEventArgs args;

	149 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

	150 base::Bind(&SpeechRecognizerImpl::DispatchEvent,

	151 this, kStartRequest, args));

	152 }

	153

	154 void SpeechRecognizerImpl::AbortRecognition() {

	155 FSMEventArgs args;

	156 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

	157 base::Bind(&SpeechRecognizerImpl::DispatchEvent,

	158 this, kAbortRequest, args));

	159 }

	160

	161 void SpeechRecognizerImpl::StopAudioCapture() {

	162 FSMEventArgs args;

	163 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

	164 base::Bind(&SpeechRecognizerImpl::DispatchEvent,

	165 this, kStopCaptureRequest, args));

	166 }

	167

	168 bool SpeechRecognizerImpl::IsActive() const {

	169 // Checking the FSM state from another thread (thus, while the FSM is

	170 // potentially concurrently evolving) is meaningless.

	171 // If you're doing it, probably you have some design issues.

122 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));	172 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

123 DCHECK(!audio_controller_.get());	173 return state_ != kIdle;

124 DCHECK(!request_.get() || !request_->HasPendingRequest());	174 }

125 DCHECK(!encoder_.get());	175

126	176 bool SpeechRecognizerImpl::IsCapturingAudio() const {

127 // The endpointer needs to estimate the environment/background noise before	177 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive().

128 // starting to treat the audio as user input. In |HandleOnData| we wait until	178 return state_ >= kStartingRecognition && state_ <= kRecognizingSpeech;

129 // such time has passed before switching to user input mode.	179 }

130 endpointer_.SetEnvironmentEstimationMode();	180

131	181 // Invoked in the audio thread.

132 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,	182 void SpeechRecognizerImpl::OnError(AudioInputController* controller,

133 kNumBitsPerAudioSample));	183 int error_code) {

134 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;	184 FSMEventArgs args;

	185 args.audio_error_code = error_code;

	186 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

	187 base::Bind(&SpeechRecognizerImpl::DispatchEvent,

	188 this, kAudioError, args));

	189 }

	190

	191 void SpeechRecognizerImpl::OnData(AudioInputController* controller,

	192 const uint8* data, uint32 size) {

	193 if (size == 0) // This could happen when audio capture stops and is normal.

	194 return;

	195

	196 FSMEventArgs args;

	197 args.audio_data = new AudioChunk(data, static_cast<size_t>(size),

	198 kNumBitsPerAudioSample / 8);

	199 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

	200 base::Bind(&SpeechRecognizerImpl::DispatchEvent,

	201 this, kAudioData, args));

	202 }

	203

	204 void SpeechRecognizerImpl::OnSpeechEngineResult(

	205 const content::SpeechRecognitionResult& result) {

	206 FSMEvent event = kRecognitionResult;

	207 FSMEventArgs args;

	208 args.speech_result = result;

	209 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

	210 base::Bind(&SpeechRecognizerImpl::DispatchEvent,

	211 this, event, args));

	212 }

	213

	214 void SpeechRecognizerImpl::OnSpeechEngineError(

	215 const content::SpeechRecognitionError& error) {

	216 FSMEvent event = kRecognitionError;

	217 FSMEventArgs args;

	218 args.error = error;

	219 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

	220 base::Bind(&SpeechRecognizerImpl::DispatchEvent,

	221 this, event, args));

	222 }

	223

	224 // ----------------------- Core FSM implementation ---------------------------

	225

	226 void SpeechRecognizerImpl::DispatchEvent(FSMEvent event, FSMEventArgs args) {

	227 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

	228 DCHECK_LE(event, kMaxEvent);

	229 DCHECK_LE(state_, kMaxState);

	230 // Event dispatching must be sequential, otherwise it will break all the rules

	231 // and the assumptions of the finite state automata model.

	232 DCHECK_EQ(event_dispatch_nesting_level_, 0);

	233 ++event_dispatch_nesting_level_;

	234 // Guard against the delegate freeing us until we finish processing the event.

	235 scoped_refptr<SpeechRecognizerImpl> me(this);

	236

	237 event_ = event;

	238 event_args_ = &args;

	239

	240 if (event == kAudioData)

	241 ProcessAudioPipeline();

	242 // Ensure the audio pipeline is processed before processing the event,

	243 // otherwise it would take actions according to the next state and not the

	244 // current one.
	hans 2012/03/16 11:12:56: should the comment be moved up a little, or put in the if statement (which should then have braces)? i assume the comment refers to the ProcessAudioPipeline() call above, but it's confusing that the comment comes afterwards, and at a different indent level... — Primiano Tucci (use gerrit) 2012/03/16 15:03:42: On 2012/03/16 11:12:56, hans wrote: > should the comment be moved up a little, or put in the if statement (which > should then have braces)? i assume the comment refers to the > ProcessAudioPipeline() call above, but it's confusing that the comment comes > afterwards, and at a different indent level... — Mmm It is not referred to the if () statement itself. It is just a warning to "don't move process audio pipeline after this point". I modified the comment a bit so it might sound more like a warning rather than a comment.
	245 state_ = ProcessEvent(event);

	246

	247 // Cleanup event args.

	248 if (args.audio_data)

	249 delete args.audio_data;

	250 event_args_ = NULL;

	251 --event_dispatch_nesting_level_;

	252 }

	253

	254 // ----------- Contract for all the FSM evolution functions below -------------

	255 // - Are guaranteed to be executed in the IO thread;

	256 // - Are guaranteed to be not reentrant (themselves and each other);

	257 // - event_args_ is guaranteed to be non NULL;

	258 // - event_args_ members are guaranteed to be stable during the call;

	259 // - The class won't be freed in the meanwhile due to callbacks;

	260

	261 // TODO(primiano) the audio pipeline is currently serial. However, the

	262 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized.

	263 // We should profile the execution to see if it would be worth or not.

	264 void SpeechRecognizerImpl::ProcessAudioPipeline() {

	265 const bool always = true;

	266 const bool route_audio_to_clipper = always;

	267 const bool route_audio_to_endpointer = state_ >= kEstimatingEnvironment &&

	268 state_ <= kRecognizingSpeech;

	269 const bool route_audio_to_sr_engine = route_audio_to_endpointer;

	270 const bool route_audio_to_vumeter = state_ >= kWaitingForSpeech &&

	271 state_ <= kRecognizingSpeech;

	272

	273 AudioChunk& recorded_audio_data = *(event_args_->audio_data);

	274

	275 num_samples_recorded_ += recorded_audio_data.NumSamples();

	276

	277 if (route_audio_to_clipper) {

	278 clipper_detected_clip_ = DetectClipping(recorded_audio_data);

	279 }

	280 if (route_audio_to_endpointer) {

	281 endpointer_.ProcessAudio(recorded_audio_data, &rms_);

	282 }

	283 if (route_audio_to_vumeter) {

	284 DCHECK(route_audio_to_endpointer); // Depends on endpointer due to |rms_|.

	285 UpdateSignalAndNoiseLevels(rms_);

	286 }

	287 if (route_audio_to_sr_engine) {

	288 DCHECK(recognition_engine_.get());

	289 recognition_engine_->PushSpeechAudio(recorded_audio_data);

	290 }

	291 }

	292

	293 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessEvent(

	294 FSMEvent event) {

	295 switch (state_) {

	296 case kIdle:

	297 switch (event) {

	298 // TODO(primiano) restore UNREACHABLE_CONDITION above when speech

	299 // input extensions are fixed.

	300 case kAbortRequest: return DoNothing(); //UNREACHABLE_CONDITION();

	301 case kStartRequest: return InitializeAndStartRecording();

	302 case kStopCaptureRequest: return DoNothing(); //UNREACHABLE_CONDITION();

	303 case kAudioData: return DoNothing(); // Corner cases related to

	304 case kRecognitionResult: return DoNothing(); // queued messages being

	305 case kRecognitionError: return DoNothing(); // lately dispatched.

	306 case kAudioError: return DoNothing();

	307 }

	308 break;

	309 case kStartingRecognition:

	310 switch (event) {

	311 case kAbortRequest: return Abort();

	312 case kStartRequest: UNREACHABLE_CONDITION();

	313 case kStopCaptureRequest: return Abort();

	314 case kAudioData: return StartSpeechRecognition();

	315 case kRecognitionResult: UNREACHABLE_CONDITION();

	316 case kRecognitionError: return Abort();

	317 case kAudioError: return Abort();

	318 }

	319 break;

	320 case kEstimatingEnvironment:

	321 switch (event) {

	322 case kAbortRequest: return Abort();

	323 case kStartRequest: UNREACHABLE_CONDITION();

	324 case kStopCaptureRequest: return StopCaptureAndWaitForResult();

	325 case kAudioData: return EnvironmentEstimation();

	326 case kRecognitionResult: return ProcessIntermediateRecognitionResult();

	327 case kRecognitionError: return Abort();

	328 case kAudioError: return Abort();

	329 }

	330 break;

	331 case kWaitingForSpeech:

	332 switch (event) {

	333 case kAbortRequest: return Abort();

	334 case kStartRequest: UNREACHABLE_CONDITION();

	335 case kStopCaptureRequest: return StopCaptureAndWaitForResult();

	336 case kAudioData: return DetectUserSpeechOrTimeout();

	337 case kRecognitionResult: return ProcessIntermediateRecognitionResult();

	338 case kRecognitionError: return Abort();

	339 case kAudioError: return Abort();

	340 }

	341 break;

	342 case kRecognizingSpeech:

	343 switch (event) {

	344 case kAbortRequest: return Abort();

	345 case kStartRequest: UNREACHABLE_CONDITION();

	346 case kStopCaptureRequest: return StopCaptureAndWaitForResult();

	347 case kAudioData: return DetectEndOfSpeech();

	348 case kRecognitionResult: return ProcessIntermediateRecognitionResult();

	349 case kRecognitionError: return Abort();

	350 case kAudioError: return Abort();

	351 }

	352 break;

	353 case kWaitingFinalResult:

	354 switch (event) {

	355 case kAbortRequest: return Abort();

	356 case kStartRequest: UNREACHABLE_CONDITION();

	357 case kStopCaptureRequest: return DoNothing();

	358 case kAudioData: return DoNothing();

	359 case kRecognitionResult: return ProcessFinalRecognitionResult();

	360 case kRecognitionError: return Abort();

	361 case kAudioError: return Abort();

	362 }

	363 break;

	364 }

	365 UNREACHABLE_CONDITION();

	366 }

	367

	368 SpeechRecognizerImpl::FSMState

	369 SpeechRecognizerImpl::InitializeAndStartRecording() {
	hans 2012/03/16 11:12:56: i'm unsure about the indentation here.. spontaneously i would write SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::InitializeAndStartRecording() { but maybe that's just me :) this would apply for a couple of more functions below — Primiano Tucci (use gerrit) 2012/03/16 15:03:42: On 2012/03/16 11:12:56, hans wrote: > i'm unsure about the indentation here.. > spontaneously i would write > > SpeechRecognizerImpl::FSMState > SpeechRecognizerImpl::InitializeAndStartRecording() { > > but maybe that's just me :) this would apply for a couple of more functions > below — Done.
	370 DCHECK(recognition_engine_.get());

	371 DCHECK(audio_controller_.get() == NULL);

	372 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?

	373 testing_audio_manager_ :

	374 BrowserMainLoop::GetAudioManager();

	375 DCHECK(audio_manager != NULL);

	376

	377 VLOG(1) << "SpeechRecognizerImpl starting audio capture.";

	378 num_samples_recorded_ = 0;

	379 rms_ = 0;

	380 audio_level_ = 0;

	381 clipper_detected_clip_ = false;

	382 listener_->OnRecognitionStart(caller_id_);

	383

	384 if (!audio_manager->HasAudioInputDevices())
	hans 2012/03/16 11:12:56: i would put { around the body of the if since it's covering more than two lines (same for the next ifs below) — Primiano Tucci (use gerrit) 2012/03/16 15:03:42: On 2012/03/16 11:12:56, hans wrote: > i would put { around the body of the if since it's covering more than two lines > (same for the next ifs below) — Done.
	385 return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO,

	386 content::AUDIO_ERROR_NO_MIC));

	387

	388 if (audio_manager->IsRecordingInProcess())

	389 return Abort(SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO,

	390 content::AUDIO_ERROR_MIC_IN_USE));

	391

	392 const int samples_per_packet = kAudioSampleRate *

	393 recognition_engine_->DesiredAudioChunkDurationMs() / 1000;

135 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,	394 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,

136 kAudioSampleRate, kNumBitsPerAudioSample,	395 kAudioSampleRate, kNumBitsPerAudioSample,

137 samples_per_packet);	396 samples_per_packet);

138 audio_controller_ = AudioInputController::Create(	397 audio_controller_ = AudioInputController::Create(audio_manager, this, params);

139 audio_manager_ ? audio_manager_ : BrowserMainLoop::GetAudioManager(),	398

140 this, params);	399 if (audio_controller_.get() == NULL)

141 DCHECK(audio_controller_.get());	400 return Abort(

142 VLOG(1) << "SpeechRecognizer starting record.";	401 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));

143 num_samples_recorded_ = 0;	402

	403 // The endpointer needs to estimate the environment/background noise before

	404 // starting to treat the audio as user input. We wait in the state

	405 // kEstimatingEnvironment until such interval has elapsed before switching

	406 // to user input mode.

	407 endpointer_.SetEnvironmentEstimationMode();

144 audio_controller_->Record();	408 audio_controller_->Record();

145	409 return kStartingRecognition;

146 return true;	410 }

147 }	411

148	412 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartSpeechRecognition() {

149 void SpeechRecognizerImpl::AbortRecognition() {	413 // This was the first audio packet recorded, so start a request to the

150 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));	414 // engine to send the data and inform the delegate.

151 DCHECK(audio_controller_.get() || request_.get());	415 DCHECK(recognition_engine_.get());

152	416 recognition_engine_->SpeechRecognitionBegins();

153 // Stop recording if required.	417 listener_->OnAudioStart(caller_id_);

154 if (audio_controller_.get()) {	418 // TODO(primiano) this is a little hack, since PushSpeechAudio() is already

	419 // called by ProcessAudioPipeline(). I hate it since it weakens the

	420 // architectural beauty of this class. But it is the best tradeoff, unless we

	421 // allow the drop the first audio chunk captured after opening the audio dev.

	422 recognition_engine_->PushSpeechAudio(*(event_args_->audio_data));

	423 return kEstimatingEnvironment;

	424 }

	425

	426 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::EnvironmentEstimation() {

	427 DCHECK(endpointer_.IsEstimatingEnvironment());

	428 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {

	429 endpointer_.SetUserInputMode();

	430 listener_->OnEnvironmentEstimationComplete(caller_id_);

	431 return kWaitingForSpeech;

	432 } else {

	433 return kEstimatingEnvironment;

	434 }

	435 }

	436

	437 SpeechRecognizerImpl::FSMState

	438 SpeechRecognizerImpl::DetectUserSpeechOrTimeout() {

	439 if (skipSilenceDetectionForTesting)

	440 return kRecognizingSpeech;

	441

	442 if (endpointer_.DidStartReceivingSpeech()) {

	443 listener_->OnSoundStart(caller_id_);

	444 return kRecognizingSpeech;

	445 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {

	446 return Abort(

	447 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));

	448 } else {

	449 return kWaitingForSpeech;

	450 }

	451 }

	452

	453 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech() {

	454 if (endpointer_.speech_input_complete()) {

	455 return StopCaptureAndWaitForResult();

	456 } else {

	457 return kRecognizingSpeech;

	458 }

	459 }

	460

	461 SpeechRecognizerImpl::FSMState

	462 SpeechRecognizerImpl::StopCaptureAndWaitForResult() {

	463 DCHECK(state_ >= kEstimatingEnvironment && state_ <= kRecognizingSpeech);

	464

	465 VLOG(1) << "Concluding recognition";

	466 CloseAudioControllerSynchronously();

	467 recognition_engine_->SpeechAudioStreamComplete();

	468

	469 if (state_ > kWaitingForSpeech)

	470 listener_->OnSoundEnd(caller_id_);

	471

	472 listener_->OnAudioEnd(caller_id_);

	473 return kWaitingFinalResult;

	474 }

	475

	476 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort() {

	477 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of

	478 // other specific error sources (so that it was an explicit abort request).

	479 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not caught in UI layers

	480 // and currently would cause an exception. JS will probably need it in future.

	481 SpeechRecognitionError error;

	482 bool has_error = false;

	483 if (event_ == kAudioError) {

	484 has_error = true;

	485 error.code = content::SPEECH_RECOGNITION_ERROR_AUDIO;

	486 } else if (event_ == kRecognitionError) {

	487 has_error = true;

	488 error = event_args_->error;

	489 }

	490 return Abort(has_error, error);

	491 }

	492

	493 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(

	494 const SpeechRecognitionError& error) {

	495 return Abort(true, error);

	496 }

	497

	498 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(

	499 bool has_error, const SpeechRecognitionError& error) {

	500 if (audio_controller_)

155 CloseAudioControllerSynchronously();	501 CloseAudioControllerSynchronously();

156 }	502

157	503 VLOG(1) << "SpeechRecognizerImpl canceling recognition. " <<

158 VLOG(1) << "SpeechRecognizer canceling recognition.";	504 error.code << " " << error.details;

159 encoder_.reset();	505

160 request_.reset();	506 // The recognition engine is initialized only after kStartingRecognition.

161 }	507 if (state_ > kStartingRecognition) {

162	508 DCHECK(recognition_engine_.get());

163 void SpeechRecognizerImpl::StopAudioCapture() {	509 recognition_engine_->SpeechRecognitionEnds();

164 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));	510 //TODO(primiano) reset the engine? Why, after all?

165	511 //recognition_engine_.reset();

166 // If audio recording has already stopped and we are in recognition phase,	512 }

167 // silently ignore any more calls to stop recording.	513

168 if (!audio_controller_.get())	514 if (state_ > kWaitingForSpeech && state_ < kWaitingFinalResult)

169 return;	515 listener_->OnSoundEnd(caller_id_);

170	516

171 CloseAudioControllerSynchronously();	517 if (state_ > kStartingRecognition && state_ < kWaitingFinalResult)

172	518 listener_->OnAudioEnd(caller_id_);

173 listener_->OnSoundEnd(caller_id_);	519

174 listener_->OnAudioEnd(caller_id_);	520 if (has_error)

175	521 listener_->OnRecognitionError(caller_id_, error);

176 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet	522

177 // of silence in case encoder had no data already.	523 listener_->OnRecognitionEnd(caller_id_);

178 std::vector<short> samples((kAudioSampleRate * kAudioPacketIntervalMs) /	524

179 1000);	525 return kIdle;

180 AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),	526 }

181 samples.size() * sizeof(short),	527

182 encoder_->bits_per_sample() / 8);	528 SpeechRecognizerImpl::FSMState

183 encoder_->Encode(dummy_chunk);	529 SpeechRecognizerImpl::ProcessIntermediateRecognitionResult() {

184 encoder_->Flush();	530 // This is in preparation for future speech recognition functions.

185 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());	531 // DCHECK(continuous_mode_);

186 DCHECK(!encoded_data->IsEmpty());	532 // const SpeechRecognitionResult& result = event_args_->speech_result;

187 encoder_.reset();	533 // VLOG(1) << "Got intermediate result";

188	534 // listener_->OnRecognitionResult(caller_id_, result);

189 // If we haven't got any audio yet end the recognition sequence here.	535 NOTREACHED();

190 if (request_ == NULL) {	536 return state_;

191 // Guard against the listener freeing us until we finish our job.	537 }

192 scoped_refptr<SpeechRecognizerImpl> me(this);	538

193 listener_->OnRecognitionEnd(caller_id_);	539 SpeechRecognizerImpl::FSMState

194 } else {	540 SpeechRecognizerImpl::ProcessFinalRecognitionResult() {

195 request_->UploadAudioChunk(encoded_data, true /* is_last_chunk */);	541 const SpeechRecognitionResult& result = event_args_->speech_result;

196 }	542 VLOG(1) << "Got valid result";

197 }	543 recognition_engine_->SpeechRecognitionEnds();

198

199 // Invoked in the audio thread.

200 void SpeechRecognizerImpl::OnError(AudioInputController* controller,

201 int error_code) {

202 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

203 base::Bind(&SpeechRecognizerImpl::HandleOnError,

204 this, error_code));

205 }

206

207 void SpeechRecognizerImpl::HandleOnError(int error_code) {

208 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;

209

210 // Check if we are still recording before canceling recognition, as

211 // recording might have been stopped after this error was posted to the queue

212 // by |OnError|.

213 if (!audio_controller_.get())

214 return;

215

216 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);

217 }

218

219 void SpeechRecognizerImpl::OnData(AudioInputController* controller,

220 const uint8* data, uint32 size) {

221 if (size == 0) // This could happen when recording stops and is normal.

222 return;

223 AudioChunk* raw_audio = new AudioChunk(data, static_cast<size_t>(size),

224 kNumBitsPerAudioSample / 8);

225 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

226 base::Bind(&SpeechRecognizerImpl::HandleOnData,

227 this, raw_audio));

228 }

229

230 void SpeechRecognizerImpl::HandleOnData(AudioChunk* raw_audio) {

231 scoped_ptr<AudioChunk> free_raw_audio_on_return(raw_audio);

232 // Check if we are still recording and if not discard this buffer, as

233 // recording might have been stopped after this buffer was posted to the queue

234 // by |OnData|.

235 if (!audio_controller_.get())

236 return;

237

238 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();

239

240 encoder_->Encode(*raw_audio);

241 float rms;

242 endpointer_.ProcessAudio(*raw_audio, &rms);

243 bool did_clip = DetectClipping(*raw_audio);

244 num_samples_recorded_ += raw_audio->NumSamples();

245

246 if (request_ == NULL) {

247 // This was the first audio packet recorded, so start a request to the

248 // server to send the data and inform the listener.

249 listener_->OnAudioStart(caller_id_);

250 request_.reset(new SpeechRecognitionRequest(context_getter_.get(), this));

251 request_->Start(language_, grammar_, filter_profanities_,

252 hardware_info_, origin_url_, encoder_->mime_type());

253 }

254

255 scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());

256 DCHECK(!encoded_data->IsEmpty());

257 request_->UploadAudioChunk(encoded_data, false /* is_last_chunk */);

258

259 if (endpointer_.IsEstimatingEnvironment()) {

260 // Check if we have gathered enough audio for the endpointer to do

261 // environment estimation and should move on to detect speech/end of speech.

262 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *

263 kAudioSampleRate) / 1000) {

264 endpointer_.SetUserInputMode();

265 listener_->OnEnvironmentEstimationComplete(caller_id_);

266 }

267 return; // No more processing since we are still estimating environment.

268 }

269

270 // Check if we have waited too long without hearing any speech.

271 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();

272 if (!speech_was_heard_after_packet &&

273 num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {

274 InformErrorAndAbortRecognition(

275 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);

276 return;

277 }

278

279 if (!speech_was_heard_before_packet && speech_was_heard_after_packet)

280 listener_->OnSoundStart(caller_id_);

281

282 // Calculate the input volume to display in the UI, smoothing towards the

283 // new level.

284 float level = (rms - kAudioMeterMinDb) /

285 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

286 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);

287 if (level > audio_level_) {

288 audio_level_ += (level - audio_level_) * kUpSmoothingFactor;

289 } else {

290 audio_level_ += (level - audio_level_) * kDownSmoothingFactor;

291 }

292

293 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /

294 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

295 noise_level = std::min(std::max(0.0f, noise_level),

296 kAudioMeterRangeMaxUnclipped);

297

298 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,

299 noise_level);

300

301 if (endpointer_.speech_input_complete())

302 StopAudioCapture();

303 }

304

305 void SpeechRecognizerImpl::SetRecognitionResult(

306 const content::SpeechRecognitionResult& result) {

307 if (result.error != content::SPEECH_RECOGNITION_ERROR_NONE) {

308 InformErrorAndAbortRecognition(result.error);

309 return;

310 }

311

312 // Guard against the listener freeing us until we finish our job.

313 scoped_refptr<SpeechRecognizerImpl> me(this);

314 listener_->OnRecognitionResult(caller_id_, result);	544 listener_->OnRecognitionResult(caller_id_, result);

315 listener_->OnRecognitionEnd(caller_id_);	545 listener_->OnRecognitionEnd(caller_id_);

316 }	546 return kIdle;

317	547 }

318 void SpeechRecognizerImpl::InformErrorAndAbortRecognition(	548

319 content::SpeechRecognitionErrorCode error) {	549 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DoNothing() const {

320 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);	550 return state_; // Just keep the current state.

321 AbortRecognition();

322

323 // Guard against the listener freeing us until we finish our job.

324 scoped_refptr<SpeechRecognizerImpl> me(this);

325 listener_->OnRecognitionError(caller_id_, error);

326 }	551 }

327	552

328 void SpeechRecognizerImpl::CloseAudioControllerSynchronously() {	553 void SpeechRecognizerImpl::CloseAudioControllerSynchronously() {

329 VLOG(1) << "SpeechRecognizer stopping record.";	554 DCHECK(audio_controller_);

	555 VLOG(1) << "SpeechRecognizerImpl stopping audio capture.";

330	556

331 // TODO(satish): investigate the possibility to utilize the closure	557 // TODO(satish): investigate the possibility to utilize the closure

332 // and switch to async. version of this method. Compare with how	558 // and switch to async. version of this method. Compare with how

333 // it's done in e.g. the AudioRendererHost.	559 // it's done in e.g. the AudioRendererHost.

334 base::WaitableEvent closed_event(true, false);	560 base::WaitableEvent closed_event(true, false);

335 audio_controller_->Close(base::Bind(&base::WaitableEvent::Signal,	561 audio_controller_->Close(base::Bind(&base::WaitableEvent::Signal,

336 base::Unretained(&closed_event)));	562 base::Unretained(&closed_event)));

337 closed_event.Wait();	563 closed_event.Wait();

338 audio_controller_ = NULL; // Releases the ref ptr.	564 audio_controller_ = NULL; // Releases the ref ptr.

339 }	565 }

340	566

	567 int SpeechRecognizerImpl::GetElapsedTimeMs() const {

	568 return num_samples_recorded_ * 1000 / kAudioSampleRate;

	569 }

	570

	571 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms) {

	572 // Calculate the input volume to display in the UI, smoothing towards the

	573 // new level.

	574 // TODO(primiano) Do we really need all this floating point arith here?

	575 // Perhaps it might be quite expensive on mobile.

	576 float level = (rms - kAudioMeterMinDb) /

	577 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

	578 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);

	579 if (level > audio_level_) {

	580 audio_level_ += (level - audio_level_) * kUpSmoothingFactor;

	581 } else {

	582 audio_level_ += (level - audio_level_) * kDownSmoothingFactor;

	583 }

	584

	585 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /

	586 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

	587 noise_level = std::min(std::max(0.0f, noise_level),

	588 kAudioMeterRangeMaxUnclipped);

	589

	590 listener_->OnAudioLevelsChange(

	591 caller_id_, clipper_detected_clip_ ? 1.0f : audio_level_, noise_level);

	592 }

	593

	594 const SpeechRecognitionEngine&

	595 SpeechRecognizerImpl::recognition_engine() const {

	596 return *(recognition_engine_.get());

	597 }

	598

341 void SpeechRecognizerImpl::SetAudioManagerForTesting(	599 void SpeechRecognizerImpl::SetAudioManagerForTesting(

342 AudioManager* audio_manager) {	600 AudioManager* audio_manager) {

343 audio_manager_ = audio_manager;	601 testing_audio_manager_ = audio_manager;

344 }	602 }

345	603

346 bool SpeechRecognizerImpl::IsActive() const {	604 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs()

347 return (request_.get() != NULL);	605 : audio_error_code(0), audio_data(NULL) {

348 }	606 }

349	607

350 bool SpeechRecognizerImpl::IsCapturingAudio() const {	608 } // namespace speech
	hans 2012/03/16 11:12:56: ultra nit: two spaces between } and // Primiano Tucci (use gerrit) 2012/03/16 15:03:42: On 2012/03/16 11:12:56, hans wrote: > ultra nit: two spaces between } and // Done.
351 return (audio_controller_.get() != NULL);

352 }

353

354 } // namespace speech

OLD	NEW