chrome/browser/speech/speech_recognizer.cc - Issue 3117026: Add an endpointer for detecting end of speech.

Side by Side Diff: chrome/browser/speech/speech_recognizer.cc

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)

Patch Set: Merged with latest. Created 10 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/speech/speech_recognizer.h"	5 #include "chrome/browser/speech/speech_recognizer.h"

6	6

7 #include "base/ref_counted.h"	7 #include "base/ref_counted.h"

8 #include "base/scoped_ptr.h"	8 #include "base/scoped_ptr.h"

	9 #include "base/time.h"

9 #include "chrome/browser/chrome_thread.h"	10 #include "chrome/browser/chrome_thread.h"

10 #include "chrome/browser/profile.h"	11 #include "chrome/browser/profile.h"

11 #include "chrome/common/net/url_request_context_getter.h"	12 #include "chrome/common/net/url_request_context_getter.h"

12 #include "third_party/speex/include/speex/speex.h"	13 #include "third_party/speex/include/speex/speex.h"

13	14

14 using media::AudioInputController;	15 using media::AudioInputController;

15 using std::list;	16 using std::list;

16 using std::string;	17 using std::string;

17	18

18 namespace {	19 namespace {

(...skipping 17 matching lines...) Expand all Loading...
36 namespace speech_input {	37 namespace speech_input {

37	38

38 // Provides a simple interface to encode raw audio using the Speex codec.	39 // Provides a simple interface to encode raw audio using the Speex codec.

39 class SpeexEncoder {	40 class SpeexEncoder {

40 public:	41 public:

41 SpeexEncoder();	42 SpeexEncoder();

42 ~SpeexEncoder();	43 ~SpeexEncoder();

43	44

44 int samples_per_frame() const { return samples_per_frame_; }	45 int samples_per_frame() const { return samples_per_frame_; }

45	46

46 // Encodes each frame of raw audio in |raw_samples| and adds the	47 // Encodes each frame of raw audio in |samples| and adds the

47 // encoded frames as a set of strings to the |encoded_frames| list.	48 // encoded frames as a set of strings to the |encoded_frames| list.

48 // Ownership of the newly added strings is transferred to the caller.	49 // Ownership of the newly added strings is transferred to the caller.

49 void Encode(const string& raw_samples,	50 void Encode(const short* samples,

	51 int num_samples,

50 std::list<std::string*>* encoded_frames);	52 std::list<std::string*>* encoded_frames);

51	53

52 private:	54 private:

53 SpeexBits bits_;	55 SpeexBits bits_;

54 void* encoder_state_;	56 void* encoder_state_;

55 int samples_per_frame_;	57 int samples_per_frame_;

56 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.	58 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.

57 };	59 };

58	60

59 SpeexEncoder::SpeexEncoder() {	61 SpeexEncoder::SpeexEncoder() {

60 speex_bits_init(&bits_);	62 speex_bits_init(&bits_);

61 encoder_state_ = speex_encoder_init(&speex_wb_mode);	63 encoder_state_ = speex_encoder_init(&speex_wb_mode);

62 DCHECK(encoder_state_);	64 DCHECK(encoder_state_);

63 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);	65 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);

64 DCHECK(samples_per_frame_ > 0);	66 DCHECK(samples_per_frame_ > 0);

65 int quality = kSpeexEncodingQuality;	67 int quality = kSpeexEncodingQuality;

66 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);	68 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);

67 int vbr = 1;	69 int vbr = 1;

68 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);	70 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);

69 }	71 }

70	72

71 SpeexEncoder::~SpeexEncoder() {	73 SpeexEncoder::~SpeexEncoder() {

72 speex_bits_destroy(&bits_);	74 speex_bits_destroy(&bits_);

73 speex_encoder_destroy(encoder_state_);	75 speex_encoder_destroy(encoder_state_);

74 }	76 }

75	77

76 void SpeexEncoder::Encode(const string& raw_samples,	78 void SpeexEncoder::Encode(const short* samples,

	79 int num_samples,

77 std::list<std::string*>* encoded_frames) {	80 std::list<std::string*>* encoded_frames) {

78 const short* samples = reinterpret_cast<const short*>(raw_samples.data());

79 DCHECK((raw_samples.length() % sizeof(short)) == 0);

80 int num_samples = raw_samples.length() / sizeof(short);

81

82 // Drop incomplete frames, typically those which come in when recording stops.	81 // Drop incomplete frames, typically those which come in when recording stops.

83 num_samples -= (num_samples % samples_per_frame_);	82 num_samples -= (num_samples % samples_per_frame_);

84 for (int i = 0; i < num_samples; i += samples_per_frame_) {	83 for (int i = 0; i < num_samples; i += samples_per_frame_) {

85 speex_bits_reset(&bits_);	84 speex_bits_reset(&bits_);

86 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),	85 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),

87 &bits_);	86 &bits_);

88	87

89 // Encode the frame and place the size of the frame as the first byte. This	88 // Encode the frame and place the size of the frame as the first byte. This

90 // is the packet format for MIME type x-speex-with-header-byte.	89 // is the packet format for MIME type x-speex-with-header-byte.

91 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,	90 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,

92 kMaxSpeexFrameLength);	91 kMaxSpeexFrameLength);

93 encoded_frame_data_[0] = static_cast<char>(frame_length);	92 encoded_frame_data_[0] = static_cast<char>(frame_length);

94 encoded_frames->push_back(new string(encoded_frame_data_,	93 encoded_frames->push_back(new string(encoded_frame_data_,

95 frame_length + 1));	94 frame_length + 1));

96 }	95 }

97 }	96 }

98	97

99 SpeechRecognizer::SpeechRecognizer(Delegate* delegate,	98 SpeechRecognizer::SpeechRecognizer(Delegate* delegate,

100 const SpeechInputCallerId& caller_id)	99 const SpeechInputCallerId& caller_id)

101 : delegate_(delegate),	100 : delegate_(delegate),

102 caller_id_(caller_id),	101 caller_id_(caller_id),

103 encoder_(new SpeexEncoder()) {	102 encoder_(new SpeexEncoder()),

	103 endpointer_(kAudioSampleRate) {

	104 endpointer_.set_speech_input_complete_silence_length(

	105 base::Time::kMicrosecondsPerSecond / 2);

	106 endpointer_.set_long_speech_input_complete_silence_length(

	107 base::Time::kMicrosecondsPerSecond);

	108 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);

	109 endpointer_.StartSession();

104 }	110 }

105	111

106 SpeechRecognizer::~SpeechRecognizer() {	112 SpeechRecognizer::~SpeechRecognizer() {

107 // Recording should have stopped earlier due to the endpointer or	113 // Recording should have stopped earlier due to the endpointer or

108 // |StopRecording| being called.	114 // |StopRecording| being called.

109 DCHECK(!audio_controller_.get());	115 DCHECK(!audio_controller_.get());

110 DCHECK(!request_.get() || !request_->HasPendingRequest());	116 DCHECK(!request_.get() || !request_->HasPendingRequest());

111 DCHECK(audio_buffers_.empty());	117 DCHECK(audio_buffers_.empty());

	118 endpointer_.EndSession();

112 }	119 }

113	120

114 bool SpeechRecognizer::StartRecording() {	121 bool SpeechRecognizer::StartRecording() {

115 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));	122 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));

116 DCHECK(!audio_controller_.get());	123 DCHECK(!audio_controller_.get());

117 DCHECK(!request_.get() || !request_->HasPendingRequest());	124 DCHECK(!request_.get() || !request_->HasPendingRequest());

118	125

	126 // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to

	127 // estimate the environment/background noise before starting to treat the

	128 // audio as user input. Once we have implemented a popup UI to notify the user

	129 // that recording has started, we should perhaps have a short interval where

	130 // we record background audio and then show the popup UI so that the user can

	131 // start speaking after that. For now we just do these together so there isn't

	132 // any background noise for the end pointer (still works ok).

	133 endpointer_.SetEnvironmentEstimationMode();

	134 endpointer_.SetUserInputMode();

	135

119 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;	136 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;

120 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);	137 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);

121 audio_controller_ = AudioInputController::Create(this,	138 audio_controller_ = AudioInputController::Create(this,

122 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels,	139 AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels,

123 kAudioSampleRate, kNumBitsPerAudioSample,	140 kAudioSampleRate, kNumBitsPerAudioSample,

124 samples_per_packet);	141 samples_per_packet);

125 DCHECK(audio_controller_.get());	142 DCHECK(audio_controller_.get());

126 LOG(INFO) << "SpeechRecognizer starting record.";	143 LOG(INFO) << "SpeechRecognizer starting record.";

127 audio_controller_->Record();	144 audio_controller_->Record();

128	145

(...skipping 20 matching lines...) Expand all Loading...
149 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));	166 DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));

150	167

151 // If audio recording has already stopped and we are in recognition phase,	168 // If audio recording has already stopped and we are in recognition phase,

152 // silently ignore any more calls to stop recording.	169 // silently ignore any more calls to stop recording.

153 if (!audio_controller_.get())	170 if (!audio_controller_.get())

154 return;	171 return;

155	172

156 LOG(INFO) << "SpeechRecognizer stopping record.";	173 LOG(INFO) << "SpeechRecognizer stopping record.";

157 audio_controller_->Close();	174 audio_controller_->Close();

158 audio_controller_ = NULL; // Releases the ref ptr.	175 audio_controller_ = NULL; // Releases the ref ptr.

	176

159 delegate_->DidCompleteRecording(caller_id_);	177 delegate_->DidCompleteRecording(caller_id_);

160	178

161 // If we haven't got any audio yet end the recognition sequence here.	179 // If we haven't got any audio yet end the recognition sequence here.

162 if (audio_buffers_.empty()) {	180 if (audio_buffers_.empty()) {

163 // Guard against the delegate freeing us until we finish our job.	181 // Guard against the delegate freeing us until we finish our job.

164 scoped_refptr<SpeechRecognizer> me(this);	182 scoped_refptr<SpeechRecognizer> me(this);

165 delegate_->DidCompleteRecognition(caller_id_);	183 delegate_->DidCompleteRecognition(caller_id_);

166 return;	184 return;

167 }	185 }

168	186

(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
233	251

234 void SpeechRecognizer::HandleOnData(string* data) {	252 void SpeechRecognizer::HandleOnData(string* data) {

235 // Check if we are still recording and if not discard this buffer, as	253 // Check if we are still recording and if not discard this buffer, as

236 // recording might have been stopped after this buffer was posted to the queue	254 // recording might have been stopped after this buffer was posted to the queue

237 // by |OnData|.	255 // by |OnData|.

238 if (!audio_controller_.get()) {	256 if (!audio_controller_.get()) {

239 delete data;	257 delete data;

240 return;	258 return;

241 }	259 }

242	260

243 encoder_->Encode(*data, &audio_buffers_);	261 const short* samples = reinterpret_cast<const short*>(data->data());

	262 DCHECK((data->length() % sizeof(short)) == 0);

	263 int num_samples = data->length() / sizeof(short);

	264

	265 encoder_->Encode(samples, num_samples, &audio_buffers_);

	266 endpointer_.ProcessAudio(samples, num_samples);

244 delete data;	267 delete data;

245	268

	269 if (endpointer_.speech_input_complete()) {

	270 StopRecording();

	271 }

	272

246 // TODO(satish): Once we have streaming POST, start sending the data received	273 // TODO(satish): Once we have streaming POST, start sending the data received

247 // here as POST chunks.	274 // here as POST chunks.

248 }	275 }

249	276

250 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {	277 void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {

251 delegate_->SetRecognitionResult(caller_id_, error, value);	278 delegate_->SetRecognitionResult(caller_id_, error, value);

252	279

253 // Guard against the delegate freeing us until we finish our job.	280 // Guard against the delegate freeing us until we finish our job.

254 scoped_refptr<SpeechRecognizer> me(this);	281 scoped_refptr<SpeechRecognizer> me(this);

255 delegate_->DidCompleteRecognition(caller_id_);	282 delegate_->DidCompleteRecognition(caller_id_);

256 }	283 }

257	284

258 } // namespace speech_input	285 } // namespace speech_input

OLD	NEW

« chrome/browser/speech/speech_recognizer.h ('K') | « chrome/browser/speech/speech_recognizer.h ('k') | chrome/chrome_browser.gypi » ('j') | no next file with comments »