| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
| 6 | 6 |
| 7 #include "base/basictypes.h" |
| 7 #include "base/bind.h" | 8 #include "base/bind.h" |
| 8 #include "base/time.h" | 9 #include "base/time.h" |
| 9 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
| 10 #include "content/browser/speech/audio_buffer.h" | 11 #include "content/browser/speech/audio_buffer.h" |
| 11 #include "content/browser/speech/google_one_shot_remote_engine.h" | 12 #include "content/browser/speech/google_one_shot_remote_engine.h" |
| 12 #include "content/public/browser/browser_thread.h" | 13 #include "content/public/browser/browser_thread.h" |
| 13 #include "content/public/browser/speech_recognition_event_listener.h" | 14 #include "content/public/browser/speech_recognition_event_listener.h" |
| 14 #include "content/public/browser/speech_recognizer.h" | 15 #include "content/public/browser/speech_recognizer.h" |
| 15 #include "content/public/common/speech_recognition_error.h" | 16 #include "content/public/common/speech_recognition_error.h" |
| 16 #include "content/public/common/speech_recognition_result.h" | 17 #include "content/public/common/speech_recognition_result.h" |
| 17 #include "net/url_request/url_request_context_getter.h" | 18 #include "net/url_request/url_request_context_getter.h" |
| 18 | 19 |
| 19 using content::BrowserMainLoop; | 20 using content::BrowserMainLoop; |
| 20 using content::BrowserThread; | 21 using content::BrowserThread; |
| 21 using content::SpeechRecognitionError; | 22 using content::SpeechRecognitionError; |
| 22 using content::SpeechRecognitionEventListener; | 23 using content::SpeechRecognitionEventListener; |
| 23 using content::SpeechRecognitionResult; | 24 using content::SpeechRecognitionResult; |
| 24 using content::SpeechRecognizer; | 25 using content::SpeechRecognizer; |
| 25 using media::AudioInputController; | 26 using media::AudioInputController; |
| 26 using media::AudioManager; | 27 using media::AudioManager; |
| 28 using media::AudioParameters; |
| 27 | 29 |
| 28 namespace { | 30 namespace { |
| 29 | 31 |
| 30 // The following constants are related to the volume level indicator shown in | 32 // The following constants are related to the volume level indicator shown in |
| 31 // the UI for recorded audio. | 33 // the UI for recorded audio. |
| 32 // Multiplier used when new volume is greater than previous level. | 34 // Multiplier used when new volume is greater than previous level. |
| 33 const float kUpSmoothingFactor = 1.0f; | 35 const float kUpSmoothingFactor = 1.0f; |
| 34 // Multiplier used when new volume is less than previous level. | 36 // Multiplier used when new volume is less than previous level. |
| 35 const float kDownSmoothingFactor = 0.7f; | 37 const float kDownSmoothingFactor = 0.7f; |
| 36 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. | 38 // RMS dB value of a maximum (unclipped) sine wave for int16 samples. |
| 37 const float kAudioMeterMaxDb = 90.31f; | 39 const float kAudioMeterMaxDb = 90.31f; |
| 38 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0. | 40 // This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0. |
| 39 // Values lower than this will display as an empty level-meter. | 41 // Values lower than this will display as an empty level-meter. |
| 40 const float kAudioMeterMinDb = 30.0f; | 42 const float kAudioMeterMinDb = 30.0f; |
| 41 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; | 43 const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; |
| 42 | 44 |
| 43 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) | 45 // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) |
| 44 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; | 46 const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; |
| 45 | 47 |
| 46 // Returns true if more than 5% of the samples are at min or max value. | 48 // Returns true if more than 5% of the samples are at min or max value. |
| 47 bool DetectClipping(const speech::AudioChunk& chunk) { | 49 bool DetectClipping(const speech::AudioChunk& chunk) { |
| 48 const int num_samples = chunk.NumSamples(); | 50 const int num_samples = chunk.NumSamples(); |
| 49 const int16* samples = chunk.SamplesData16(); | 51 const int16* samples = chunk.SamplesData16(); |
| 50 const int kThreshold = num_samples / 20; | 52 const int kThreshold = num_samples / 20; |
| 51 int clipping_samples = 0; | 53 int clipping_samples = 0; |
| 54 |
| 52 for (int i = 0; i < num_samples; ++i) { | 55 for (int i = 0; i < num_samples; ++i) { |
| 53 if (samples[i] <= -32767 || samples[i] >= 32767) { | 56 if (samples[i] <= -32767 || samples[i] >= 32767) { |
| 54 if (++clipping_samples > kThreshold) | 57 if (++clipping_samples > kThreshold) |
| 55 return true; | 58 return true; |
| 56 } | 59 } |
| 57 } | 60 } |
| 58 return false; | 61 return false; |
| 59 } | 62 } |
| 60 | 63 |
| 61 } // namespace | 64 } // namespace |
| 62 | 65 |
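
Note on DetectClipping() above: it flags a chunk once more than 5% of its
samples sit at the int16 rails, returning early as soon as the count passes
num_samples / 20. A minimal standalone sketch of the same logic on a plain
int16_t buffer (illustrative only; the real code operates on
speech::AudioChunk):

    #include <cstdint>

    // Mirrors DetectClipping(): true if more than 5% of |samples| are at
    // (or beyond) the int16 full-scale values.
    bool IsClipped(const int16_t* samples, int num_samples) {
      const int threshold = num_samples / 20;  // 5% of the chunk.
      int clipped = 0;
      for (int i = 0; i < num_samples; ++i) {
        if (samples[i] <= -32767 || samples[i] >= 32767) {
          if (++clipped > threshold)
            return true;  // Early out once past the 5% budget.
        }
      }
      return false;
    }

For a 1000-sample chunk the threshold is 50, so clipping is reported on the
51st clipped sample.
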
| 63 SpeechRecognizer* SpeechRecognizer::Create( | 66 SpeechRecognizer* SpeechRecognizer::Create( |
| 64 SpeechRecognitionEventListener* listener, | 67 SpeechRecognitionEventListener* listener, |
| 65 int caller_id, | 68 int caller_id, |
| 66 const std::string& language, | 69 const std::string& language, |
| 67 const std::string& grammar, | 70 const std::string& grammar, |
| 68 net::URLRequestContextGetter* context_getter, | 71 net::URLRequestContextGetter* context_getter, |
| 69 bool filter_profanities, | 72 bool filter_profanities, |
| 70 const std::string& hardware_info, | 73 const std::string& hardware_info, |
| 71 const std::string& origin_url) { | 74 const std::string& origin_url) { |
| 75 speech::GoogleOneShotRemoteEngineConfig remote_engine_config; |
| 76 remote_engine_config.language = language; |
| 77 remote_engine_config.grammar = grammar; |
| 78 remote_engine_config.audio_sample_rate = |
| 79 speech::SpeechRecognizerImpl::kAudioSampleRate; |
| 80 remote_engine_config.audio_num_bits_per_sample = |
| 81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; |
| 82 remote_engine_config.filter_profanities = filter_profanities; |
| 83 remote_engine_config.hardware_info = hardware_info; |
| 84 remote_engine_config.origin_url = origin_url; |
| 85 |
| 86 // SpeechRecognizerImpl takes ownership of google_remote_engine. |
| 87 speech::GoogleOneShotRemoteEngine* google_remote_engine = |
| 88 new speech::GoogleOneShotRemoteEngine(context_getter); |
| 89 google_remote_engine->SetConfig(remote_engine_config); |
| 90 |
| 72 return new speech::SpeechRecognizerImpl(listener, | 91 return new speech::SpeechRecognizerImpl(listener, |
| 73 caller_id, | 92 caller_id, |
| 74 language, | 93 google_remote_engine); |
| 75 grammar, | |
| 76 context_getter, | |
| 77 filter_profanities, | |
| 78 hardware_info, | |
| 79 origin_url); | |
| 80 } | 94 } |
| 81 | 95 |
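
Note: with this change Create() assembles the GoogleOneShotRemoteEngineConfig
itself and hands the configured engine to SpeechRecognizerImpl, which takes
ownership. A hedged sketch of a call site, assuming StartRecognition() is
exposed on the public SpeechRecognizer interface as the implementation below
suggests (the argument values are hypothetical; only the signature comes from
this file):

    // |listener| implements content::SpeechRecognitionEventListener and
    // must stay valid while the recognizer is active.
    content::SpeechRecognizer* recognizer =
        content::SpeechRecognizer::Create(listener,
                                          caller_id,       // session id
                                          "en-US",         // language
                                          std::string(),   // grammar
                                          context_getter,  // request context
                                          false,           // filter_profanities
                                          std::string(),   // hardware_info
                                          origin_url);
    recognizer->StartRecognition();  // Posts EVENT_START to the IO thread.
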
| 82 namespace speech { | 96 namespace speech { |
| 83 | 97 |
| 84 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; | 98 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
| 85 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; | 99 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; |
| 86 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; | 100 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
| 87 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; | 101 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
| 88 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; | 102 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
| 89 | 103 |
| 104 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, |
| 105 kNumBitsPerAudioSample_must_be_a_multiple_of_8); |
| 106 |
| 90 SpeechRecognizerImpl::SpeechRecognizerImpl( | 107 SpeechRecognizerImpl::SpeechRecognizerImpl( |
| 91 SpeechRecognitionEventListener* listener, | 108 SpeechRecognitionEventListener* listener, |
| 92 int caller_id, | 109 int caller_id, |
| 93 const std::string& language, | 110 SpeechRecognitionEngine* engine) |
| 94 const std::string& grammar, | |
| 95 net::URLRequestContextGetter* context_getter, | |
| 96 bool filter_profanities, | |
| 97 const std::string& hardware_info, | |
| 98 const std::string& origin_url) | |
| 99 : listener_(listener), | 111 : listener_(listener), |
| 100 testing_audio_manager_(NULL), | 112 testing_audio_manager_(NULL), |
| 113 recognition_engine_(engine), |
| 101 endpointer_(kAudioSampleRate), | 114 endpointer_(kAudioSampleRate), |
| 102 context_getter_(context_getter), | |
| 103 caller_id_(caller_id), | 115 caller_id_(caller_id), |
| 104 language_(language), | 116 is_dispatching_event_(false), |
| 105 grammar_(grammar), | 117 state_(STATE_IDLE) { |
| 106 filter_profanities_(filter_profanities), | |
| 107 hardware_info_(hardware_info), | |
| 108 origin_url_(origin_url), | |
| 109 num_samples_recorded_(0), | |
| 110 audio_level_(0.0f) { | |
| 111 DCHECK(listener_ != NULL); | 118 DCHECK(listener_ != NULL); |
| 119 DCHECK(recognition_engine_ != NULL); |
| 112 endpointer_.set_speech_input_complete_silence_length( | 120 endpointer_.set_speech_input_complete_silence_length( |
| 113 base::Time::kMicrosecondsPerSecond / 2); | 121 base::Time::kMicrosecondsPerSecond / 2); |
| 114 endpointer_.set_long_speech_input_complete_silence_length( | 122 endpointer_.set_long_speech_input_complete_silence_length( |
| 115 base::Time::kMicrosecondsPerSecond); | 123 base::Time::kMicrosecondsPerSecond); |
| 116 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| 117 endpointer_.StartSession(); | 125 endpointer_.StartSession(); |
| 126 recognition_engine_->set_delegate(this); |
| 118 } | 127 } |
| 119 | 128 |
| 120 SpeechRecognizerImpl::~SpeechRecognizerImpl() { | 129 SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
| 121 // Recording should have stopped earlier due to the endpointer or | |
| 122 // |StopRecording| being called. | |
| 123 DCHECK(!audio_controller_.get()); | |
| 124 DCHECK(!recognition_engine_.get() || | |
| 125 !recognition_engine_->IsRecognitionPending()); | |
| 126 endpointer_.EndSession(); | 130 endpointer_.EndSession(); |
| 127 } | 131 } |
| 128 | 132 |
| 133 // ------- Methods that trigger Finite State Machine (FSM) events ------------ |
| 134 |
| 135 // NOTE: all external events and requests should be enqueued (PostTask), even |
| 136 // if they come from the same (IO) thread, in order to preserve the causal |
| 137 // ordering of events and avoid interleaved event processing due to |
| 138 // synchronous callbacks. |
| 139 |
| 129 void SpeechRecognizerImpl::StartRecognition() { | 140 void SpeechRecognizerImpl::StartRecognition() { |
| 141 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 142 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 143 this, FSMEventArgs(EVENT_START))); |
| 144 } |
| 145 |
| 146 void SpeechRecognizerImpl::AbortRecognition() { |
| 147 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 148 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 149 this, FSMEventArgs(EVENT_ABORT))); |
| 150 } |
| 151 |
| 152 void SpeechRecognizerImpl::StopAudioCapture() { |
| 153 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 154 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 155 this, FSMEventArgs(EVENT_STOP_CAPTURE))); |
| 156 } |
| 157 |
| 158 bool SpeechRecognizerImpl::IsActive() const { |
| 159 // Checking the FSM state from another thread (thus, while the FSM is |
| 160 // potentially concurrently evolving) is meaningless. |
| 130 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 161 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 131 DCHECK(!audio_controller_.get()); | 162 return state_ != STATE_IDLE; |
| 132 DCHECK(!recognition_engine_.get() || | 163 } |
| 133 !recognition_engine_->IsRecognitionPending()); | 164 |
| 134 | 165 bool SpeechRecognizerImpl::IsCapturingAudio() const { |
| 135 // The endpointer needs to estimate the environment/background noise before | 166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
| 136 // starting to treat the audio as user input. In |HandleOnData| we wait until | 167 const bool is_capturing_audio = state_ >= STATE_STARTING && |
| 137 // such time has passed before switching to user input mode. | 168 state_ <= STATE_RECOGNIZING; |
| 138 endpointer_.SetEnvironmentEstimationMode(); | 169 DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || |
| 139 | 170 (!is_capturing_audio && audio_controller_.get() == NULL)); |
| 140 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? | 171 return is_capturing_audio; |
| 141 testing_audio_manager_ : BrowserMainLoop::GetAudioManager(); | |
| 142 const int samples_per_packet = kAudioSampleRate * | |
| 143 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; | |
| 144 media::AudioParameters params( | |
| 145 media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, | |
| 146 kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet); | |
| 147 audio_controller_ = AudioInputController::Create(audio_manager, this, params); | |
| 148 DCHECK(audio_controller_.get()); | |
| 149 VLOG(1) << "SpeechRecognizer starting record."; | |
| 150 num_samples_recorded_ = 0; | |
| 151 audio_controller_->Record(); | |
| 152 } | |
| 153 | |
| 154 void SpeechRecognizerImpl::AbortRecognition() { | |
| 155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
| 156 DCHECK(audio_controller_.get() || recognition_engine_.get()); | |
| 157 | |
| 158 // Stop recording if required. | |
| 159 if (audio_controller_.get()) { | |
| 160 CloseAudioControllerAsynchronously(); | |
| 161 } | |
| 162 | |
| 163 VLOG(1) << "SpeechRecognizer canceling recognition."; | |
| 164 recognition_engine_.reset(); | |
| 165 } | |
| 166 | |
| 167 void SpeechRecognizerImpl::StopAudioCapture() { | |
| 168 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | |
| 169 | |
| 170 // If audio recording has already stopped and we are in recognition phase, | |
| 171 // silently ignore any more calls to stop recording. | |
| 172 if (!audio_controller_.get()) | |
| 173 return; | |
| 174 | |
| 175 CloseAudioControllerAsynchronously(); | |
| 176 listener_->OnSoundEnd(caller_id_); | |
| 177 listener_->OnAudioEnd(caller_id_); | |
| 178 | |
| 179 // If we haven't got any audio yet end the recognition sequence here. | |
| 180 if (recognition_engine_ == NULL) { | |
| 181 // Guard against the listener freeing us until we finish our job. | |
| 182 scoped_refptr<SpeechRecognizerImpl> me(this); | |
| 183 listener_->OnRecognitionEnd(caller_id_); | |
| 184 } else { | |
| 185 recognition_engine_->AudioChunksEnded(); | |
| 186 } | |
| 187 } | 172 } |
| 188 | 173 |
| 189 // Invoked in the audio thread. | 174 // Invoked in the audio thread. |
| 190 void SpeechRecognizerImpl::OnError(AudioInputController* controller, | 175 void SpeechRecognizerImpl::OnError(AudioInputController* controller, |
| 191 int error_code) { | 176 int error_code) { |
| 192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 177 FSMEventArgs event_args(EVENT_AUDIO_ERROR); |
| 193 base::Bind(&SpeechRecognizerImpl::HandleOnError, | 178 event_args.audio_error_code = error_code; |
| 194 this, error_code)); | 179 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 195 } | 180 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 196 | 181 this, event_args)); |
| 197 void SpeechRecognizerImpl::HandleOnError(int error_code) { | |
| 198 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; | |
| 199 | |
| 200 // Check if we are still recording before canceling recognition, as | |
| 201 // recording might have been stopped after this error was posted to the queue | |
| 202 // by |OnError|. | |
| 203 if (!audio_controller_.get()) | |
| 204 return; | |
| 205 | |
| 206 InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); | |
| 207 } | 182 } |
| 208 | 183 |
| 209 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | 184 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
| 210 const uint8* data, uint32 size) { | 185 const uint8* data, uint32 size) { |
| 211 if (size == 0) // This could happen when recording stops and is normal. | 186 if (size == 0) // This can happen when audio capture stops; it is normal. |
| 212 return; | 187 return; |
| 213 scoped_refptr<AudioChunk> raw_audio( | 188 |
| 214 new AudioChunk(data, | 189 FSMEventArgs event_args(EVENT_AUDIO_DATA); |
| 215 static_cast<size_t>(size), | 190 event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size), |
| 216 kNumBitsPerAudioSample / 8)); | 191 kNumBitsPerAudioSample / 8); |
| 217 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 192 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 218 base::Bind(&SpeechRecognizerImpl::HandleOnData, | 193 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 219 this, raw_audio)); | 194 this, event_args)); |
| 220 } | |
| 221 | |
| 222 void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) { | |
| 223 // Check if we are still recording and if not discard this buffer, as | |
| 224 // recording might have been stopped after this buffer was posted to the queue | |
| 225 // by |OnData|. | |
| 226 if (!audio_controller_.get()) | |
| 227 return; | |
| 228 | |
| 229 bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); | |
| 230 | |
| 231 float rms; | |
| 232 endpointer_.ProcessAudio(*raw_audio, &rms); | |
| 233 bool did_clip = DetectClipping(*raw_audio); | |
| 234 num_samples_recorded_ += raw_audio->NumSamples(); | |
| 235 | |
| 236 if (recognition_engine_ == NULL) { | |
| 237 // This was the first audio packet recorded, so start a request to the | |
| 238 // server to send the data and inform the listener. | |
| 239 listener_->OnAudioStart(caller_id_); | |
| 240 GoogleOneShotRemoteEngineConfig google_sr_config; | |
| 241 google_sr_config.language = language_; | |
| 242 google_sr_config.grammar = grammar_; | |
| 243 google_sr_config.audio_sample_rate = kAudioSampleRate; | |
| 244 google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample; | |
| 245 google_sr_config.filter_profanities = filter_profanities_; | |
| 246 google_sr_config.hardware_info = hardware_info_; | |
| 247 google_sr_config.origin_url = origin_url_; | |
| 248 GoogleOneShotRemoteEngine* google_sr_engine = | |
| 249 new GoogleOneShotRemoteEngine(context_getter_.get()); | |
| 250 google_sr_engine->SetConfig(google_sr_config); | |
| 251 recognition_engine_.reset(google_sr_engine); | |
| 252 recognition_engine_->set_delegate(this); | |
| 253 recognition_engine_->StartRecognition(); | |
| 254 } | |
| 255 | |
| 256 recognition_engine_->TakeAudioChunk(*raw_audio); | |
| 257 | |
| 258 if (endpointer_.IsEstimatingEnvironment()) { | |
| 259 // Check if we have gathered enough audio for the endpointer to do | |
| 260 // environment estimation and should move on to detect speech/end of speech. | |
| 261 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | |
| 262 kAudioSampleRate) / 1000) { | |
| 263 endpointer_.SetUserInputMode(); | |
| 264 listener_->OnEnvironmentEstimationComplete(caller_id_); | |
| 265 } | |
| 266 return; // No more processing since we are still estimating environment. | |
| 267 } | |
| 268 | |
| 269 // Check if we have waited too long without hearing any speech. | |
| 270 bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); | |
| 271 if (!speech_was_heard_after_packet && | |
| 272 num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) { | |
| 273 InformErrorAndAbortRecognition( | |
| 274 content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); | |
| 275 return; | |
| 276 } | |
| 277 | |
| 278 if (!speech_was_heard_before_packet && speech_was_heard_after_packet) | |
| 279 listener_->OnSoundStart(caller_id_); | |
| 280 | |
| 281 // Calculate the input volume to display in the UI, smoothing towards the | |
| 282 // new level. | |
| 283 float level = (rms - kAudioMeterMinDb) / | |
| 284 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
| 285 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); | |
| 286 if (level > audio_level_) { | |
| 287 audio_level_ += (level - audio_level_) * kUpSmoothingFactor; | |
| 288 } else { | |
| 289 audio_level_ += (level - audio_level_) * kDownSmoothingFactor; | |
| 290 } | |
| 291 | |
| 292 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / | |
| 293 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); | |
| 294 noise_level = std::min(std::max(0.0f, noise_level), | |
| 295 kAudioMeterRangeMaxUnclipped); | |
| 296 | |
| 297 listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_, | |
| 298 noise_level); | |
| 299 | |
| 300 if (endpointer_.speech_input_complete()) | |
| 301 StopAudioCapture(); | |
| 302 } | 195 } |
| 303 | 196 |
| 304 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | 197 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
| 305 | 198 |
| 306 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( | 199 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( |
| 307 const content::SpeechRecognitionResult& result) { | 200 const content::SpeechRecognitionResult& result) { |
| 308 // Guard against the listener freeing us until we finish our job. | 201 FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
| 202 event_args.engine_result = result; |
| 203 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 204 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 205 this, event_args)); |
| 206 } |
| 207 |
| 208 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( |
| 209 const content::SpeechRecognitionError& error) { |
| 210 FSMEventArgs event_args(EVENT_ENGINE_ERROR); |
| 211 event_args.engine_error = error; |
| 212 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 213 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 214 this, event_args)); |
| 215 } |
| 216 |
| 217 // ----------------------- Core FSM implementation --------------------------- |
| 218 // TODO(primiano) After the changes in the media package (r129173), this class |
| 219 // slightly violates the SpeechRecognitionEventListener interface contract. In |
| 220 // particular, it is no longer true that this class can be freed after the |
| 221 // OnRecognitionEnd event, since the asynchronous audio_controller_.Close() |
| 222 // call can still be in progress after the end event. Currently this is not a |
| 223 // problem for the browser itself, since refcounting protects us against such |
| 224 // race conditions. However, we should fix this in the next CLs. For instance, |
| 225 // tests currently pass only because TestAudioInputController does not close |
| 226 // asynchronously as the real controller does, but they will become flaky once |
| 227 // TestAudioInputController is fixed. |
| 228 |
| 229 void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { |
| 230 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 231 DCHECK_LE(event_args.event, EVENT_MAX_VALUE); |
| 232 DCHECK_LE(state_, STATE_MAX_VALUE); |
| 233 |
| 234 // Event dispatching must be sequential, otherwise it would break the rules |
| 235 // and assumptions of the finite-state machine model. |
| 236 DCHECK(!is_dispatching_event_); |
| 237 is_dispatching_event_ = true; |
| 238 |
| 239 // Guard against the delegate freeing us until we finish processing the event. |
| 309 scoped_refptr<SpeechRecognizerImpl> me(this); | 240 scoped_refptr<SpeechRecognizerImpl> me(this); |
| 241 |
| 242 if (event_args.event == EVENT_AUDIO_DATA) { |
| 243 DCHECK(event_args.audio_data.get() != NULL); |
| 244 ProcessAudioPipeline(*event_args.audio_data); |
| 245 } |
| 246 |
| 247 // The audio pipeline must be processed before the event dispatch, otherwise |
| 248 // it would take actions according to the future state instead of the current one. |
| 249 state_ = ExecuteTransitionAndGetNextState(event_args); |
| 250 |
| 251 is_dispatching_event_ = false; |
| 252 } |
| 253 |
| 254 SpeechRecognizerImpl::FSMState |
| 255 SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( |
| 256 const FSMEventArgs& event_args) { |
| 257 const FSMEvent event = event_args.event; |
| 258 switch (state_) { |
| 259 case STATE_IDLE: |
| 260 switch (event) { |
| 261 // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and |
| 262 // EVENT_STOP_CAPTURE below once speech input extensions are fixed. |
| 263 case EVENT_ABORT: |
| 264 return DoNothing(event_args); |
| 265 case EVENT_START: |
| 266 return StartRecording(event_args); |
| 267 case EVENT_STOP_CAPTURE: // Corner cases related to queued messages |
| 268 case EVENT_AUDIO_DATA: // being dispatched late. |
| 269 case EVENT_ENGINE_RESULT: |
| 270 case EVENT_ENGINE_ERROR: |
| 271 case EVENT_AUDIO_ERROR: |
| 272 return DoNothing(event_args); |
| 273 } |
| 274 break; |
| 275 case STATE_STARTING: |
| 276 switch (event) { |
| 277 case EVENT_ABORT: |
| 278 return Abort(event_args); |
| 279 case EVENT_START: |
| 280 return NotFeasible(event_args); |
| 281 case EVENT_STOP_CAPTURE: |
| 282 return Abort(event_args); |
| 283 case EVENT_AUDIO_DATA: |
| 284 return StartRecognitionEngine(event_args); |
| 285 case EVENT_ENGINE_RESULT: |
| 286 return NotFeasible(event_args); |
| 287 case EVENT_ENGINE_ERROR: |
| 288 case EVENT_AUDIO_ERROR: |
| 289 return Abort(event_args); |
| 290 } |
| 291 break; |
| 292 case STATE_ESTIMATING_ENVIRONMENT: |
| 293 switch (event) { |
| 294 case EVENT_ABORT: |
| 295 return Abort(event_args); |
| 296 case EVENT_START: |
| 297 return NotFeasible(event_args); |
| 298 case EVENT_STOP_CAPTURE: |
| 299 return StopCaptureAndWaitForResult(event_args); |
| 300 case EVENT_AUDIO_DATA: |
| 301 return WaitEnvironmentEstimationCompletion(event_args); |
| 302 case EVENT_ENGINE_RESULT: |
| 303 return ProcessIntermediateResult(event_args); |
| 304 case EVENT_ENGINE_ERROR: |
| 305 case EVENT_AUDIO_ERROR: |
| 306 return Abort(event_args); |
| 307 } |
| 308 break; |
| 309 case STATE_WAITING_FOR_SPEECH: |
| 310 switch (event) { |
| 311 case EVENT_ABORT: |
| 312 return Abort(event_args); |
| 313 case EVENT_START: |
| 314 return NotFeasible(event_args); |
| 315 case EVENT_STOP_CAPTURE: |
| 316 return StopCaptureAndWaitForResult(event_args); |
| 317 case EVENT_AUDIO_DATA: |
| 318 return DetectUserSpeechOrTimeout(event_args); |
| 319 case EVENT_ENGINE_RESULT: |
| 320 return ProcessIntermediateResult(event_args); |
| 321 case EVENT_ENGINE_ERROR: |
| 322 case EVENT_AUDIO_ERROR: |
| 323 return Abort(event_args); |
| 324 } |
| 325 break; |
| 326 case STATE_RECOGNIZING: |
| 327 switch (event) { |
| 328 case EVENT_ABORT: |
| 329 return Abort(event_args); |
| 330 case EVENT_START: |
| 331 return NotFeasible(event_args); |
| 332 case EVENT_STOP_CAPTURE: |
| 333 return StopCaptureAndWaitForResult(event_args); |
| 334 case EVENT_AUDIO_DATA: |
| 335 return DetectEndOfSpeech(event_args); |
| 336 case EVENT_ENGINE_RESULT: |
| 337 return ProcessIntermediateResult(event_args); |
| 338 case EVENT_ENGINE_ERROR: |
| 339 case EVENT_AUDIO_ERROR: |
| 340 return Abort(event_args); |
| 341 } |
| 342 break; |
| 343 case STATE_WAITING_FINAL_RESULT: |
| 344 switch (event) { |
| 345 case EVENT_ABORT: |
| 346 return Abort(event_args); |
| 347 case EVENT_START: |
| 348 return NotFeasible(event_args); |
| 349 case EVENT_STOP_CAPTURE: |
| 350 case EVENT_AUDIO_DATA: |
| 351 return DoNothing(event_args); |
| 352 case EVENT_ENGINE_RESULT: |
| 353 return ProcessFinalResult(event_args); |
| 354 case EVENT_ENGINE_ERROR: |
| 355 case EVENT_AUDIO_ERROR: |
| 356 return Abort(event_args); |
| 357 } |
| 358 break; |
| 359 } |
| 360 return NotFeasible(event_args); |
| 361 } |
| 362 |
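
For reference, the switch above condensed into a transition table (rows are
states, columns are events; the last column also covers EVENT_ENGINE_ERROR and
EVENT_AUDIO_ERROR; NotFeasible means NOTREACHED; StopCapture* abbreviates
StopCaptureAndWaitForResult):

    state \ event           START           STOP_CAPTURE  AUDIO_DATA                 ENGINE_RESULT              ABORT/errors
    IDLE                    StartRecording  DoNothing     DoNothing                  DoNothing                  DoNothing
    STARTING                NotFeasible     Abort         StartRecognitionEngine     NotFeasible                Abort
    ESTIMATING_ENVIRONMENT  NotFeasible     StopCapture*  WaitEnvironmentEstimation  ProcessIntermediateResult  Abort
    WAITING_FOR_SPEECH      NotFeasible     StopCapture*  DetectUserSpeechOrTimeout  ProcessIntermediateResult  Abort
    RECOGNIZING             NotFeasible     StopCapture*  DetectEndOfSpeech          ProcessIntermediateResult  Abort
    WAITING_FINAL_RESULT    NotFeasible     DoNothing     DoNothing                  ProcessFinalResult         Abort
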
| 363 // ----------- Contract for all the FSM evolution functions below ------------- |
| 364 // - Are guaranteed to be executed in the IO thread; |
| 365 // - Are guaranteed not to be reentrant (with themselves or with each other); |
| 366 // - event_args members are guaranteed to be stable during the call; |
| 367 // - The class won't be freed in the meanwhile due to callbacks; |
| 368 // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. |
| 369 |
| 370 // TODO(primiano) the audio pipeline is currently serial. However, the |
| 371 // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. |
| 372 // We should profile the execution to see whether it would be worth it. |
| 373 void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { |
| 374 const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && |
| 375 state_ <= STATE_RECOGNIZING; |
| 376 const bool route_to_sr_engine = route_to_endpointer; |
| 377 const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && |
| 378 state_ <= STATE_RECOGNIZING; |
| 379 const bool clip_detected = DetectClipping(raw_audio); |
| 380 float rms = 0.0f; |
| 381 |
| 382 num_samples_recorded_ += raw_audio.NumSamples(); |
| 383 |
| 384 if (route_to_endpointer) |
| 385 endpointer_.ProcessAudio(raw_audio, &rms); |
| 386 |
| 387 if (route_to_vumeter) { |
| 388 DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. |
| 389 UpdateSignalAndNoiseLevels(rms, clip_detected); |
| 390 } |
| 391 if (route_to_sr_engine) { |
| 392 DCHECK(recognition_engine_.get() != NULL); |
| 393 recognition_engine_->TakeAudioChunk(raw_audio); |
| 394 } |
| 395 } |
| 396 |
| 397 SpeechRecognizerImpl::FSMState |
| 398 SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
| 399 DCHECK(recognition_engine_.get() != NULL); |
| 400 DCHECK(!IsCapturingAudio()); |
| 401 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? |
| 402 testing_audio_manager_ : |
| 403 BrowserMainLoop::GetAudioManager(); |
| 404 DCHECK(audio_manager != NULL); |
| 405 |
| 406 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
| 407 num_samples_recorded_ = 0; |
| 408 audio_level_ = 0; |
| 409 listener_->OnRecognitionStart(caller_id_); |
| 410 |
| 411 if (!audio_manager->HasAudioInputDevices()) { |
| 412 return AbortWithError(SpeechRecognitionError( |
| 413 content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| 414 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
| 415 } |
| 416 |
| 417 if (audio_manager->IsRecordingInProcess()) { |
| 418 return AbortWithError(SpeechRecognitionError( |
| 419 content::SPEECH_RECOGNITION_ERROR_AUDIO, |
| 420 content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); |
| 421 } |
| 422 |
| 423 const int samples_per_packet = (kAudioSampleRate * |
| 424 recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; |
| 425 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, |
| 426 kAudioSampleRate, kNumBitsPerAudioSample, |
| 427 samples_per_packet); |
| 428 audio_controller_ = AudioInputController::Create(audio_manager, this, params); |
| 429 |
| 430 if (audio_controller_.get() == NULL) { |
| 431 return AbortWithError( |
| 432 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
| 433 } |
| 434 |
| 435 // The endpointer needs to estimate the environment/background noise before |
| 436 // starting to treat the audio as user input. We wait in the state |
| 437 // ESTIMATING_ENVIRONMENT until that interval has elapsed before switching |
| 438 // to user input mode. |
| 439 endpointer_.SetEnvironmentEstimationMode(); |
| 440 audio_controller_->Record(); |
| 441 return STATE_STARTING; |
| 442 } |
| 443 |
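
Note on samples_per_packet above: it is a plain duration-to-samples
conversion. Assuming, hypothetically, that GetDesiredAudioChunkDurationMs()
returns 100 ms, then samples_per_packet = (16000 * 100) / 1000 = 1600 samples,
i.e. 3200 bytes per packet at the 16 bits per sample used here.
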
| 444 SpeechRecognizerImpl::FSMState |
| 445 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { |
| 446 // This is the first audio packet captured, so the recognition engine is |
| 447 // started and the delegate notified about the event. |
| 448 DCHECK(recognition_engine_.get() != NULL); |
| 449 recognition_engine_->StartRecognition(); |
| 450 listener_->OnAudioStart(caller_id_); |
| 451 |
| 452 // This is a little hack, since TakeAudioChunk() is already called by |
| 453 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping |
| 454 // the first audio chunk captured after opening the audio device. |
| 455 recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); |
| 456 return STATE_ESTIMATING_ENVIRONMENT; |
| 457 } |
| 458 |
| 459 SpeechRecognizerImpl::FSMState |
| 460 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { |
| 461 DCHECK(endpointer_.IsEstimatingEnvironment()); |
| 462 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { |
| 463 endpointer_.SetUserInputMode(); |
| 464 listener_->OnEnvironmentEstimationComplete(caller_id_); |
| 465 return STATE_WAITING_FOR_SPEECH; |
| 466 } else { |
| 467 return STATE_ESTIMATING_ENVIRONMENT; |
| 468 } |
| 469 } |
| 470 |
| 471 SpeechRecognizerImpl::FSMState |
| 472 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { |
| 473 if (endpointer_.DidStartReceivingSpeech()) { |
| 474 listener_->OnSoundStart(caller_id_); |
| 475 return STATE_RECOGNIZING; |
| 476 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { |
| 477 return AbortWithError( |
| 478 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); |
| 479 } |
| 480 return STATE_WAITING_FOR_SPEECH; |
| 481 } |
| 482 |
| 483 SpeechRecognizerImpl::FSMState |
| 484 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { |
| 485 if (endpointer_.speech_input_complete()) { |
| 486 return StopCaptureAndWaitForResult(event_args); |
| 487 } |
| 488 return STATE_RECOGNIZING; |
| 489 } |
| 490 |
| 491 SpeechRecognizerImpl::FSMState |
| 492 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { |
| 493 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); |
| 494 |
| 495 DVLOG(1) << "Concluding recognition"; |
| 496 CloseAudioControllerAsynchronously(); |
| 497 recognition_engine_->AudioChunksEnded(); |
| 498 |
| 499 if (state_ > STATE_WAITING_FOR_SPEECH) |
| 500 listener_->OnSoundEnd(caller_id_); |
| 501 |
| 502 listener_->OnAudioEnd(caller_id_); |
| 503 return STATE_WAITING_FINAL_RESULT; |
| 504 } |
| 505 |
| 506 SpeechRecognizerImpl::FSMState |
| 507 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) { |
| 508 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in the absence |
| 509 // of other specific error sources (i.e. when it was an explicit abort |
| 510 // request). However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught |
| 511 // by ChromeSpeechRecognitionManagerDelegate and would cause an exception. |
| 512 // JS support will probably need it in the future. |
| 513 if (event_args.event == EVENT_AUDIO_ERROR) { |
| 514 return AbortWithError( |
| 515 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); |
| 516 } else if (event_args.event == EVENT_ENGINE_ERROR) { |
| 517 return AbortWithError(event_args.engine_error); |
| 518 } |
| 519 return AbortWithError(NULL); |
| 520 } |
| 521 |
| 522 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( |
| 523 const SpeechRecognitionError& error) { |
| 524 return AbortWithError(&error); |
| 525 } |
| 526 |
| 527 SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( |
| 528 const SpeechRecognitionError* error) { |
| 529 if (IsCapturingAudio()) |
| 530 CloseAudioControllerAsynchronously(); |
| 531 |
| 532 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
| 533 |
| 534 // The recognition engine is initialized only after STATE_STARTING. |
| 535 if (state_ > STATE_STARTING) { |
| 536 DCHECK(recognition_engine_.get() != NULL); |
| 537 recognition_engine_->EndRecognition(); |
| 538 } |
| 539 |
| 540 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
| 541 listener_->OnSoundEnd(caller_id_); |
| 542 |
| 543 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
| 544 listener_->OnAudioEnd(caller_id_); |
| 545 |
| 546 if (error != NULL) |
| 547 listener_->OnRecognitionError(caller_id_, *error); |
| 548 |
| 549 listener_->OnRecognitionEnd(caller_id_); |
| 550 |
| 551 return STATE_IDLE; |
| 552 } |
| 553 |
| 554 SpeechRecognizerImpl::FSMState |
| 555 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { |
| 556 // This is in preparation for future speech recognition functions. |
| 557 NOTREACHED(); |
| 558 return state_; |
| 559 } |
| 560 |
| 561 SpeechRecognizerImpl::FSMState |
| 562 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { |
| 563 const SpeechRecognitionResult& result = event_args.engine_result; |
| 564 DVLOG(1) << "Got valid result"; |
| 565 recognition_engine_->EndRecognition(); |
| 310 listener_->OnRecognitionResult(caller_id_, result); | 566 listener_->OnRecognitionResult(caller_id_, result); |
| 311 listener_->OnRecognitionEnd(caller_id_); | 567 listener_->OnRecognitionEnd(caller_id_); |
| 312 } | 568 return STATE_IDLE; |
| 313 | 569 } |
| 314 void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( | 570 |
| 315 const content::SpeechRecognitionError& error) { | 571 SpeechRecognizerImpl::FSMState |
| 316 InformErrorAndAbortRecognition(error.code); | 572 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { |
| 317 } | 573 return state_; // Just keep the current state. |
| 318 | 574 } |
| 319 void SpeechRecognizerImpl::InformErrorAndAbortRecognition( | 575 |
| 320 content::SpeechRecognitionErrorCode error) { | 576 SpeechRecognizerImpl::FSMState |
| 321 DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); | 577 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { |
| 322 AbortRecognition(); | 578 NOTREACHED() << "Unfeasible event " << event_args.event |
| 323 | 579 << " in state " << state_; |
| 324 // Guard against the listener freeing us until we finish our job. | 580 return state_; |
| 325 scoped_refptr<SpeechRecognizerImpl> me(this); | |
| 326 listener_->OnRecognitionError(caller_id_, error); | |
| 327 } | 581 } |
| 328 | 582 |
| 329 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { | 583 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { |
| 330 VLOG(1) << "SpeechRecognizer stopping record."; | 584 DCHECK(IsCapturingAudio()); |
| 585 DVLOG(1) << "SpeechRecognizerImpl stopping audio capture."; |
| 331 // Issues a Close on the audio controller, passing an empty callback. The only | 586 // Issues a Close on the audio controller, passing an empty callback. The only |
| 332 // purpose of such callback is to keep the audio controller refcounted until | 587 // purpose of such callback is to keep the audio controller refcounted until |
| 333 // Close has completed (in the audio thread) and automatically destroy it | 588 // Close has completed (in the audio thread) and automatically destroy it |
| 334 // afterwards (upon return from OnAudioClosed). | 589 // afterwards (upon return from OnAudioClosed). |
| 335 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, | 590 audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, |
| 336 this, audio_controller_)); | 591 this, audio_controller_)); |
| 337 audio_controller_ = NULL; // The controller is still refcounted by Bind. | 592 audio_controller_ = NULL; // The controller is still refcounted by Bind. |
| 338 } | 593 } |
| 339 | 594 |
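
The Close() call above relies on a general keep-alive idiom: binding a
refcounted pointer into the completion callback keeps the object alive until
the callback has run, even after the owner drops its reference. A standalone
sketch of the same idiom using std::shared_ptr and a lambda (an illustration,
not Chromium code):

    #include <functional>
    #include <memory>

    struct Controller {
      // Stand-in for an asynchronous close; invokes |done| when finished.
      void Close(std::function<void()> done) { done(); }
    };

    void CloseAsynchronously(std::shared_ptr<Controller>* controller) {
      // The lambda captures a shared_ptr copy, so the Controller stays
      // alive until Close() has run the callback, even though the caller's
      // reference is cleared right away.
      std::shared_ptr<Controller> keep_alive = *controller;
      keep_alive->Close([keep_alive] {});
      controller->reset();  // Still refcounted by the lambda capture.
    }
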
| 340 bool SpeechRecognizerImpl::IsActive() const { | 595 int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
| 341 return (recognition_engine_.get() != NULL); | 596 return (num_samples_recorded_ * 1000) / kAudioSampleRate; |
| 342 } | 597 } |
| 343 | 598 |
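
Since GetElapsedTimeMs() derives time purely from the number of captured
samples, the FSM timeouts translate directly into sample counts at the fixed
16 kHz rate: kEndpointerEstimationTimeMs (300 ms) corresponds to
16000 * 300 / 1000 = 4800 samples, and kNoSpeechTimeoutMs (8000 ms) to
128000 samples.
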
| 344 bool SpeechRecognizerImpl::IsCapturingAudio() const { | 599 void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, |
| 345 return (audio_controller_.get() != NULL); | 600 bool clip_detected) { |
| 601 // Calculate the input volume to display in the UI, smoothing towards the |
| 602 // new level. |
| 603 // TODO(primiano) Do we really need all this floating-point arithmetic here? |
| 604 // It might be quite expensive on mobile. |
| 605 float level = (rms - kAudioMeterMinDb) / |
| 606 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 607 level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
| 608 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : |
| 609 kDownSmoothingFactor; |
| 610 audio_level_ += (level - audio_level_) * smoothing_factor; |
| 611 |
| 612 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| 613 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 614 noise_level = std::min(std::max(0.0f, noise_level), |
| 615 kAudioMeterRangeMaxUnclipped); |
| 616 |
| 617 listener_->OnAudioLevelsChange( |
| 618 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level); |
| 346 } | 619 } |
| 347 | 620 |
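
Worked example for UpdateSignalAndNoiseLevels(): with kAudioMeterMinDb = 30.0,
kAudioMeterDbRange = 60.31 and kAudioMeterRangeMaxUnclipped = 47/48 (about
0.979), an rms of 60 dB maps to level = (60 - 30) / (60.31 / 0.979), roughly
0.487, then clamped to [0, 0.979]. Rising levels are tracked instantly
(kUpSmoothingFactor = 1.0), falling levels move 70% of the way toward the new
value on each chunk (kDownSmoothingFactor = 0.7), and a detected clip
overrides the smoothed value with 1.0.
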
| 348 const SpeechRecognitionEngine& | 621 const SpeechRecognitionEngine& |
| 349 SpeechRecognizerImpl::recognition_engine() const { | 622 SpeechRecognizerImpl::recognition_engine() const { |
| 350 return *(recognition_engine_.get()); | 623 return *(recognition_engine_.get()); |
| 351 } | 624 } |
| 352 | 625 |
| 353 void SpeechRecognizerImpl::SetAudioManagerForTesting( | 626 void SpeechRecognizerImpl::SetAudioManagerForTesting( |
| 354 AudioManager* audio_manager) { | 627 AudioManager* audio_manager) { |
| 355 testing_audio_manager_ = audio_manager; | 628 testing_audio_manager_ = audio_manager; |
| 356 } | 629 } |
| 357 | 630 |
| 631 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 632 : event(event_value), |
| 633 audio_error_code(0), |
| 634 audio_data(NULL), |
| 635 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) { |
| 636 } |
| 637 |
| 638 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| 639 } |
| 358 | 640 |
| 359 } // namespace speech | 641 } // namespace speech |