chrome/browser/speech/speech_recognizer.cc - Issue 3341020: Speech input: Do environment estimation and detect the no-speech case.

Side by Side Diff: chrome/browser/speech/speech_recognizer.cc

Issue 3341020: Speech input: Do environment estimation and detect the no-speech case. (Closed)

Patch Set: . Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« chrome/browser/speech/speech_input_manager.cc ('K') | « chrome/browser/speech/speech_recognizer.h ('k') | chrome/browser/speech/speech_recognizer_unittest.cc » ('j') | chrome/browser/speech/speech_recognizer_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/speech/speech_recognizer.h"	5 #include "chrome/browser/speech/speech_recognizer.h"

6	6

7 #include "base/ref_counted.h"	7 #include "base/ref_counted.h"

8 #include "base/scoped_ptr.h"	8 #include "base/scoped_ptr.h"

9 #include "base/time.h"	9 #include "base/time.h"

10 #include "chrome/browser/chrome_thread.h"	10 #include "chrome/browser/chrome_thread.h"

11 #include "chrome/browser/profile.h"	11 #include "chrome/browser/profile.h"

12 #include "chrome/common/net/url_request_context_getter.h"	12 #include "chrome/common/net/url_request_context_getter.h"

13 #include "third_party/speex/include/speex/speex.h"	13 #include "third_party/speex/include/speex/speex.h"

14	14

15 using media::AudioInputController;	15 using media::AudioInputController;

16 using std::list;	16 using std::list;

17 using std::string;	17 using std::string;

18	18

19 namespace {	19 namespace {

20 const char* const kDefaultSpeechRecognitionUrl =	20 const char* const kDefaultSpeechRecognitionUrl =

21 "http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";	21 "http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";

22 const char* const kContentTypeSpeex =	22 const char* const kContentTypeSpeex =

23 "audio/x-speex-with-header-byte; rate=16000";	23 "audio/x-speex-with-header-byte; rate=16000";

24 const int kAudioSampleRate = 16000;

25 const int kSpeexEncodingQuality = 8;	24 const int kSpeexEncodingQuality = 8;

26 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).	25 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).

27	26

28 // Since the frame length gets written out as a byte in the encoded packet,	27 // Since the frame length gets written out as a byte in the encoded packet,

29 // make sure it is within the byte range.	28 // make sure it is within the byte range.

30 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);	29 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);

31	30

32 const int kAudioPacketIntervalMs = 100; // Record 100ms long audio packets.	31 const int kEndpointerEstimationTimeMs = 300;

33 const int kNumAudioChannels = 1; // Speech is recorded as mono.

34 const int kNumBitsPerAudioSample = 16;

35 } // namespace	32 } // namespace

36	33

37 namespace speech_input {	34 namespace speech_input {

38	35

	36 const int SpeechRecognizer::kAudioSampleRate = 16000;

	37 const int SpeechRecognizer::kAudioPacketIntervalMs = 100;

	38 const int SpeechRecognizer::kNumAudioChannels = 1;

	39 const int SpeechRecognizer::kNumBitsPerAudioSample = 16;

	40 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;

	41

39 // Provides a simple interface to encode raw audio using the Speex codec.	42 // Provides a simple interface to encode raw audio using the Speex codec.

40 class SpeexEncoder {	43 class SpeexEncoder {

41 public:	44 public:

42 SpeexEncoder();	45 SpeexEncoder();

43 ~SpeexEncoder();	46 ~SpeexEncoder();

44	47

45 int samples_per_frame() const { return samples_per_frame_; }	48 int samples_per_frame() const { return samples_per_frame_; }

46	49

47 // Encodes each frame of raw audio in |samples| and adds the	50 // Encodes each frame of raw audio in |samples| and adds the

48 // encoded frames as a set of strings to the |encoded_frames| list.	51 // encoded frames as a set of strings to the |encoded_frames| list.

(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
115 DCHECK(!request_.get() || !request_->HasPendingRequest());	118 DCHECK(!request_.get() || !request_->HasPendingRequest());

116 DCHECK(audio_buffers_.empty());	119 DCHECK(audio_buffers_.empty());

117 endpointer_.EndSession();	120 endpointer_.EndSession();

118 }	121 }

119	122

120 bool SpeechRecognizer::StartRecording() {	123 bool SpeechRecognizer::StartRecording() {

121 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));	124 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));

122 DCHECK(!audio_controller_.get());	125 DCHECK(!audio_controller_.get());

123 DCHECK(!request_.get() || !request_->HasPendingRequest());	126 DCHECK(!request_.get() || !request_->HasPendingRequest());

124	127

125 // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to	128 // The endpointer needs to estimate the environment/background noise before

126 // estimate the environment/background noise before starting to treat the	129 // starting to treat the audio as user input. In |HandleOnData| we wait until

127 // audio as user input. Once we have implemented a popup UI to notify the user	130 // such time has passed before switching to user input mode.

128 // that recording has started, we should perhaps have a short interval where

129 // we record background audio and then show the popup UI so that the user can

130 // start speaking after that. For now we just do these together so there isn't

131 // any background noise for the end pointer (still works ok).

132 endpointer_.SetEnvironmentEstimationMode();	131 endpointer_.SetEnvironmentEstimationMode();

133 endpointer_.SetUserInputMode();

134	132

135 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;	133 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;

136 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);	134 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);

137 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,	135 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,

138 kAudioSampleRate, kNumBitsPerAudioSample);	136 kAudioSampleRate, kNumBitsPerAudioSample);

139 audio_controller_ =	137 audio_controller_ =

140 AudioInputController::Create(this, params, samples_per_packet);	138 AudioInputController::Create(this, params, samples_per_packet);

141 DCHECK(audio_controller_.get());	139 DCHECK(audio_controller_.get());

142 LOG(INFO) << "SpeechRecognizer starting record.";	140 LOG(INFO) << "SpeechRecognizer starting record.";

	141 num_samples_recorded_ = 0;

143 audio_controller_->Record();	142 audio_controller_->Record();

144	143

145 return true;	144 return true;

146 }	145 }

147	146

148 void SpeechRecognizer::CancelRecognition() {	147 void SpeechRecognizer::CancelRecognition() {

149 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));	148 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));

150 DCHECK(audio_controller_.get() || request_.get());	149 DCHECK(audio_controller_.get() || request_.get());

151	150

152 // Stop recording if required.	151 // Stop recording if required.

(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
224	223

225 void SpeechRecognizer::HandleOnError(int error_code) {	224 void SpeechRecognizer::HandleOnError(int error_code) {

226 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;	225 LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;

227	226

228 // Check if we are still recording before canceling recognition, as	227 // Check if we are still recording before canceling recognition, as

229 // recording might have been stopped after this error was posted to the queue	228 // recording might have been stopped after this error was posted to the queue

230 // by |OnError|.	229 // by |OnError|.

231 if (!audio_controller_.get())	230 if (!audio_controller_.get())

232 return;	231 return;

233	232

234 delegate_->OnRecognizerError(caller_id_);	233 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);

235 CancelRecognition();

236 delegate_->DidCompleteRecording(caller_id_);

237 delegate_->DidCompleteRecognition(caller_id_);

238 }	234 }

239	235

240 void SpeechRecognizer::OnData(AudioInputController* controller,	236 void SpeechRecognizer::OnData(AudioInputController* controller,

241 const uint8* data, uint32 size) {	237 const uint8* data, uint32 size) {

242 if (size == 0) // This could happen when recording stops and is normal.	238 if (size == 0) // This could happen when recording stops and is normal.

243 return;	239 return;

244	240

245 string* str_data = new string(reinterpret_cast<const char*>(data), size);	241 string* str_data = new string(reinterpret_cast<const char*>(data), size);

246 ChromeThread::PostTask(ChromeThread::IO, FROM_HERE,	242 ChromeThread::PostTask(ChromeThread::IO, FROM_HERE,

247 NewRunnableMethod(this,	243 NewRunnableMethod(this,

(...skipping 10 matching lines...) Expand all Loading...
258 return;	254 return;

259 }	255 }

260	256

261 const short* samples = reinterpret_cast<const short*>(data->data());	257 const short* samples = reinterpret_cast<const short*>(data->data());

262 DCHECK((data->length() % sizeof(short)) == 0);	258 DCHECK((data->length() % sizeof(short)) == 0);

263 int num_samples = data->length() / sizeof(short);	259 int num_samples = data->length() / sizeof(short);

264	260

265 encoder_->Encode(samples, num_samples, &audio_buffers_);	261 encoder_->Encode(samples, num_samples, &audio_buffers_);

266 endpointer_.ProcessAudio(samples, num_samples);	262 endpointer_.ProcessAudio(samples, num_samples);

267 delete data;	263 delete data;

	264 num_samples_recorded_ += num_samples;

	265

	266 // Check if we have gathered enough audio for the endpointer to do environment

	267 // estimation and should move on to detect speech/end of speech.

	268 if (endpointer_.IsEstimatingEnvironment() &&

	269 num_samples_recorded_ >= (kEndpointerEstimationTimeMs *

	270 kAudioSampleRate) / 1000) {

	271 endpointer_.SetUserInputMode();

	272 delegate_->DidCompleteEnvironmentEstimation(caller_id_);

	273 return;

	274 }

	275

	276 // Check if we have waited too long without hearing any speech.

	277 if (!endpointer_.DidStartReceivingSpeech() &&

	278 num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {

	279 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);

	280 return;

	281 }

268	282

269 if (endpointer_.speech_input_complete()) {	283 if (endpointer_.speech_input_complete()) {

270 StopRecording();	284 StopRecording();

271 }	285 }

272	286

273 // TODO(satish): Once we have streaming POST, start sending the data received	287 // TODO(satish): Once we have streaming POST, start sending the data received

274 // here as POST chunks.	288 // here as POST chunks.

275 }	289 }

276	290

277 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {	291 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {

	292 if (value.empty()) {

	293 InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);

	294 return;

	295 }

	296

278 delegate_->SetRecognitionResult(caller_id_, error, value);	297 delegate_->SetRecognitionResult(caller_id_, error, value);

279	298

280 // Guard against the delegate freeing us until we finish our job.	299 // Guard against the delegate freeing us until we finish our job.

281 scoped_refptr<SpeechRecognizer> me(this);	300 scoped_refptr<SpeechRecognizer> me(this);

282 delegate_->DidCompleteRecognition(caller_id_);	301 delegate_->DidCompleteRecognition(caller_id_);

283 }	302 }

284	303

	304 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {

	305 CancelRecognition();

	306

	307 // Guard against the delegate freeing us until we finish our job.

	308 scoped_refptr<SpeechRecognizer> me(this);

	309 delegate_->OnRecognizerError(caller_id_, error);

	310 delegate_->DidCompleteRecording(caller_id_);

	311 delegate_->DidCompleteRecognition(caller_id_);

	312 }

	313

285 } // namespace speech_input	314 } // namespace speech_input

OLD	NEW