chrome/browser/speech/speech_recognizer.cc - Issue 6111009: Add the option of compressing speech input audio using FLAC.

Side by Side Diff: chrome/browser/speech/speech_recognizer.cc

Issue 6111009: Add the option of compressing speech input audio using FLAC. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 9 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/speech/speech_recognizer.h"	5 #include "chrome/browser/speech/speech_recognizer.h"

6	6

7 #include "base/ref_counted.h"	7 #include "base/ref_counted.h"

8 #include "base/scoped_ptr.h"	8 #include "base/scoped_ptr.h"

9 #include "base/time.h"	9 #include "base/time.h"

10 #include "chrome/browser/browser_thread.h"	10 #include "chrome/browser/browser_thread.h"

11 #include "chrome/browser/profiles/profile.h"	11 #include "chrome/browser/profiles/profile.h"

12 #include "chrome/common/net/url_request_context_getter.h"	12 #include "chrome/common/net/url_request_context_getter.h"

13 #include "third_party/speex/speex.h"	13 #include "third_party/flac/flac.h"
	bulach 2011/01/12 16:27:07 no longer needed?
14	14

15 using media::AudioInputController;	15 using media::AudioInputController;

16 using std::list;	16 using std::list;

17 using std::string;	17 using std::string;

18	18

19 namespace {	19 namespace {

	20 const char* const kContentTypeFLAC = "audio/x-flac; rate=16000";

20 const char* const kContentTypeSpeex =	21 const char* const kContentTypeSpeex =

21 "audio/x-speex-with-header-byte; rate=16000";	22 "audio/x-speex-with-header-byte; rate=16000";
	bulach 2011/01/12 16:27:07 it'd be nice to move these to the new encoder interface
22 const int kSpeexEncodingQuality = 8;

23 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).

24

25 // Since the frame length gets written out as a byte in the encoded packet,

26 // make sure it is within the byte range.

27 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);

28	23

29 // The following constants are related to the volume level indicator shown in	24 // The following constants are related to the volume level indicator shown in

30 // the UI for recorded audio.	25 // the UI for recorded audio.

31 // Multiplier used when new volume is greater than previous level.	26 // Multiplier used when new volume is greater than previous level.

32 const float kUpSmoothingFactor = 0.9f;	27 const float kUpSmoothingFactor = 0.9f;

33 // Multiplier used when new volume is lesser than previous level.	28 // Multiplier used when new volume is lesser than previous level.

34 const float kDownSmoothingFactor = 0.4f;	29 const float kDownSmoothingFactor = 0.4f;

35 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter.	30 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter.

36 const float kAudioMeterDbRange = 25.0f;	31 const float kAudioMeterDbRange = 25.0f;

37 } // namespace	32 } // namespace

38	33

39 namespace speech_input {	34 namespace speech_input {

40	35

41 const int SpeechRecognizer::kAudioSampleRate = 16000;	36 const int SpeechRecognizer::kAudioSampleRate = 16000;

42 const int SpeechRecognizer::kAudioPacketIntervalMs = 100;	37 const int SpeechRecognizer::kAudioPacketIntervalMs = 100;

43 const int SpeechRecognizer::kNumAudioChannels = 1;	38 const int SpeechRecognizer::kNumAudioChannels = 1;

44 const int SpeechRecognizer::kNumBitsPerAudioSample = 16;	39 const int SpeechRecognizer::kNumBitsPerAudioSample = 16;

45 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;	40 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;

46 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;	41 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;

47	42

48 // Provides a simple interface to encode raw audio using the Speex codec.

49 class SpeexEncoder {

50 public:

51 SpeexEncoder();

52 ~SpeexEncoder();

53

54 int samples_per_frame() const { return samples_per_frame_; }

55

56 // Encodes each frame of raw audio in \|samples\| and adds the

57 // encoded frames as a set of strings to the \|encoded_frames\| list.

58 // Ownership of the newly added strings is transferred to the caller.

59 void Encode(const short* samples,

60 int num_samples,

61 std::list<std::string> encoded_frames);

62

63 private:

64 SpeexBits bits_;

65 void* encoder_state_;

66 int samples_per_frame_;

67 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.

68 };

69

70 SpeexEncoder::SpeexEncoder() {

71 // speex_bits_init() does not initialize all of the \|bits_\| struct.

72 memset(&bits_, 0, sizeof(bits_));

73 speex_bits_init(&bits_);

74 encoder_state_ = speex_encoder_init(&speex_wb_mode);

75 DCHECK(encoder_state_);

76 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);

77 DCHECK(samples_per_frame_ > 0);

78 int quality = kSpeexEncodingQuality;

79 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);

80 int vbr = 1;

81 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);

82 memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));

83 }

84

85 SpeexEncoder::~SpeexEncoder() {

86 speex_bits_destroy(&bits_);

87 speex_encoder_destroy(encoder_state_);

88 }

89

90 void SpeexEncoder::Encode(const short* samples,

91 int num_samples,

92 std::list<std::string> encoded_frames) {

93 // Drop incomplete frames, typically those which come in when recording stops.

94 num_samples -= (num_samples % samples_per_frame_);

95 for (int i = 0; i < num_samples; i += samples_per_frame_) {

96 speex_bits_reset(&bits_);

97 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),

98 &bits_);

99

100 // Encode the frame and place the size of the frame as the first byte. This

101 // is the packet format for MIME type x-speex-with-header-byte.

102 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,

103 kMaxSpeexFrameLength);

104 encoded_frame_data_[0] = static_cast<char>(frame_length);

105 encoded_frames->push_back(new string(encoded_frame_data_,

106 frame_length + 1));

107 }

108 }

109

110 SpeechRecognizer::SpeechRecognizer(Delegate* delegate,	43 SpeechRecognizer::SpeechRecognizer(Delegate* delegate,

111 int caller_id,	44 int caller_id,

112 const std::string& language,	45 const std::string& language,

113 const std::string& grammar,	46 const std::string& grammar,

114 const std::string& hardware_info)	47 const std::string& hardware_info)

115 : delegate_(delegate),	48 : delegate_(delegate),

116 caller_id_(caller_id),	49 caller_id_(caller_id),

117 language_(language),	50 language_(language),

118 grammar_(grammar),	51 grammar_(grammar),

119 hardware_info_(hardware_info),	52 hardware_info_(hardware_info),

120 encoder_(new SpeexEncoder()),	53 codec_(AudioEncoder::FLAC),

	54 encoder_(NULL),

121 endpointer_(kAudioSampleRate),	55 endpointer_(kAudioSampleRate),

122 num_samples_recorded_(0),	56 num_samples_recorded_(0),

123 audio_level_(0.0f) {	57 audio_level_(0.0f) {

124 endpointer_.set_speech_input_complete_silence_length(	58 endpointer_.set_speech_input_complete_silence_length(

125 base::Time::kMicrosecondsPerSecond / 2);	59 base::Time::kMicrosecondsPerSecond / 2);

126 endpointer_.set_long_speech_input_complete_silence_length(	60 endpointer_.set_long_speech_input_complete_silence_length(

127 base::Time::kMicrosecondsPerSecond);	61 base::Time::kMicrosecondsPerSecond);

128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);	62 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);

129 endpointer_.StartSession();	63 endpointer_.StartSession();

130 }	64 }

131	65

132 SpeechRecognizer::~SpeechRecognizer() {	66 SpeechRecognizer::~SpeechRecognizer() {

133 // Recording should have stopped earlier due to the endpointer or	67 // Recording should have stopped earlier due to the endpointer or

134 // \|StopRecording\| being called.	68 // \|StopRecording\| being called.

135 DCHECK(!audio_controller_.get());	69 DCHECK(!audio_controller_.get());

136 DCHECK(!request_.get() \|\| !request_->HasPendingRequest());	70 DCHECK(!request_.get() \|\| !request_->HasPendingRequest());

137 DCHECK(audio_buffers_.empty());	71 DCHECK(!encoder_.get());

138 endpointer_.EndSession();	72 endpointer_.EndSession();

139 }	73 }

140	74

141 bool SpeechRecognizer::StartRecording() {	75 bool SpeechRecognizer::StartRecording() {

142 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));	76 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

143 DCHECK(!audio_controller_.get());	77 DCHECK(!audio_controller_.get());

144 DCHECK(!request_.get() \|\| !request_->HasPendingRequest());	78 DCHECK(!request_.get() \|\| !request_->HasPendingRequest());

	79 DCHECK(!encoder_.get());

145	80

146 // The endpointer needs to estimate the environment/background noise before	81 // The endpointer needs to estimate the environment/background noise before

147 // starting to treat the audio as user input. In \|HandleOnData\| we wait until	82 // starting to treat the audio as user input. In \|HandleOnData\| we wait until

148 // such time has passed before switching to user input mode.	83 // such time has passed before switching to user input mode.

149 endpointer_.SetEnvironmentEstimationMode();	84 endpointer_.SetEnvironmentEstimationMode();

150	85

	86 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,

	87 kNumBitsPerAudioSample));

151 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;	88 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;

152 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);

153 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,	89 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,

154 kAudioSampleRate, kNumBitsPerAudioSample,	90 kAudioSampleRate, kNumBitsPerAudioSample,

155 samples_per_packet);	91 samples_per_packet);

156 audio_controller_ = AudioInputController::Create(this, params);	92 audio_controller_ = AudioInputController::Create(this, params);

157 DCHECK(audio_controller_.get());	93 DCHECK(audio_controller_.get());

158 VLOG(1) << "SpeechRecognizer starting record.";	94 VLOG(1) << "SpeechRecognizer starting record.";

159 num_samples_recorded_ = 0;	95 num_samples_recorded_ = 0;

160 audio_controller_->Record();	96 audio_controller_->Record();

161	97

162 return true;	98 return true;

163 }	99 }

164	100

165 void SpeechRecognizer::CancelRecognition() {	101 void SpeechRecognizer::CancelRecognition() {

166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));	102 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

167 DCHECK(audio_controller_.get() \|\| request_.get());	103 DCHECK(audio_controller_.get() \|\| request_.get());

168	104

169 // Stop recording if required.	105 // Stop recording if required.

170 if (audio_controller_.get()) {	106 if (audio_controller_.get()) {

171 VLOG(1) << "SpeechRecognizer stopping record.";	107 VLOG(1) << "SpeechRecognizer stopping record.";

172 audio_controller_->Close();	108 audio_controller_->Close();

173 audio_controller_ = NULL; // Releases the ref ptr.	109 audio_controller_ = NULL; // Releases the ref ptr.

174 }	110 }

175	111

176 VLOG(1) << "SpeechRecognizer canceling recognition.";	112 VLOG(1) << "SpeechRecognizer canceling recognition.";

177 ReleaseAudioBuffers();	113 encoder_.reset();

178 request_.reset();	114 request_.reset();

179 }	115 }

180	116

181 void SpeechRecognizer::StopRecording() {	117 void SpeechRecognizer::StopRecording() {

182 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));	118 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

183	119

184 // If audio recording has already stopped and we are in recognition phase,	120 // If audio recording has already stopped and we are in recognition phase,

185 // silently ignore any more calls to stop recording.	121 // silently ignore any more calls to stop recording.

186 if (!audio_controller_.get())	122 if (!audio_controller_.get())

187 return;	123 return;

188	124

189 VLOG(1) << "SpeechRecognizer stopping record.";	125 VLOG(1) << "SpeechRecognizer stopping record.";

190 audio_controller_->Close();	126 audio_controller_->Close();

191 audio_controller_ = NULL; // Releases the ref ptr.	127 audio_controller_ = NULL; // Releases the ref ptr.

	128 encoder_->Flush();

192	129

193 delegate_->DidCompleteRecording(caller_id_);	130 delegate_->DidCompleteRecording(caller_id_);

194	131

195 // If we haven't got any audio yet end the recognition sequence here.	132 // Since the http request takes a single string as POST data, allocate

196 if (audio_buffers_.empty()) {	133 // one and copy over bytes from the audio buffers to the string.

	134 // And If we haven't got any audio yet end the recognition sequence here.

	135 string data;

	136 if (!encoder_->GetEncodedData(&data)) {

197 // Guard against the delegate freeing us until we finish our job.	137 // Guard against the delegate freeing us until we finish our job.

198 scoped_refptr<SpeechRecognizer> me(this);	138 scoped_refptr<SpeechRecognizer> me(this);

199 delegate_->DidCompleteRecognition(caller_id_);	139 delegate_->DidCompleteRecognition(caller_id_);

200 return;	140 return;

201 }	141 }

202	142

203 // We now have recorded audio in our buffers, so start a recognition request.

204 // Since the http request takes a single string as POST data, allocate

205 // one and copy over bytes from the audio buffers to the string.

206 int audio_buffer_length = 0;

207 for (AudioBufferQueue::iterator it = audio_buffers_.begin();

208 it != audio_buffers_.end(); it++) {

209 audio_buffer_length += (*it)->length();

210 }

211 string data;

212 data.reserve(audio_buffer_length);

213 for (AudioBufferQueue::iterator it = audio_buffers_.begin();

214 it != audio_buffers_.end(); it++) {

215 data.append((it));

216 }

217

218 DCHECK(!request_.get());	143 DCHECK(!request_.get());

219 request_.reset(new SpeechRecognitionRequest(	144 request_.reset(new SpeechRecognitionRequest(

220 Profile::GetDefaultRequestContext(), this));	145 Profile::GetDefaultRequestContext(), this));

221 request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data);	146 request_->Send(language_, grammar_, hardware_info_,

222 ReleaseAudioBuffers(); // No need to keep the audio anymore.	147 (codec_ == AudioEncoder::FLAC) ? kContentTypeFLAC : kContentTypeSpeex,

	148 data);

	149 encoder_.reset();

223 }	150 }

224	151

225 void SpeechRecognizer::ReleaseAudioBuffers() {	152 void SpeechRecognizer::ReleaseAudioBuffers() {

226 for (AudioBufferQueue::iterator it = audio_buffers_.begin();

227 it != audio_buffers_.end(); it++)

228 delete *it;

229 audio_buffers_.clear();

230 }	153 }

231	154

232 // Invoked in the audio thread.	155 // Invoked in the audio thread.

233 void SpeechRecognizer::OnError(AudioInputController* controller,	156 void SpeechRecognizer::OnError(AudioInputController* controller,

234 int error_code) {	157 int error_code) {

235 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,	158 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,

236 NewRunnableMethod(this,	159 NewRunnableMethod(this,

237 &SpeechRecognizer::HandleOnError,	160 &SpeechRecognizer::HandleOnError,

238 error_code));	161 error_code));

239 }	162 }

(...skipping 28 matching lines...) Expand all Loading...
268 // by \|OnData\|.	191 // by \|OnData\|.

269 if (!audio_controller_.get()) {	192 if (!audio_controller_.get()) {

270 delete data;	193 delete data;

271 return;	194 return;

272 }	195 }

273	196

274 const short* samples = reinterpret_cast<const short*>(data->data());	197 const short* samples = reinterpret_cast<const short*>(data->data());

275 DCHECK((data->length() % sizeof(short)) == 0);	198 DCHECK((data->length() % sizeof(short)) == 0);

276 int num_samples = data->length() / sizeof(short);	199 int num_samples = data->length() / sizeof(short);

277	200

278 encoder_->Encode(samples, num_samples, &audio_buffers_);	201 encoder_->Encode(samples, num_samples);

279 float rms;	202 float rms;

280 endpointer_.ProcessAudio(samples, num_samples, &rms);	203 endpointer_.ProcessAudio(samples, num_samples, &rms);

281 delete data;	204 delete data;

282 num_samples_recorded_ += num_samples;	205 num_samples_recorded_ += num_samples;

283	206

284 if (endpointer_.IsEstimatingEnvironment()) {	207 if (endpointer_.IsEstimatingEnvironment()) {

285 // Check if we have gathered enough audio for the endpointer to do	208 // Check if we have gathered enough audio for the endpointer to do

286 // environment estimation and should move on to detect speech/end of speech.	209 // environment estimation and should move on to detect speech/end of speech.

287 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *	210 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *

288 kAudioSampleRate) / 1000) {	211 kAudioSampleRate) / 1000) {

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
334	257

335 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {	258 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {

336 CancelRecognition();	259 CancelRecognition();

337	260

338 // Guard against the delegate freeing us until we finish our job.	261 // Guard against the delegate freeing us until we finish our job.

339 scoped_refptr<SpeechRecognizer> me(this);	262 scoped_refptr<SpeechRecognizer> me(this);

340 delegate_->OnRecognizerError(caller_id_, error);	263 delegate_->OnRecognizerError(caller_id_, error);

341 }	264 }

342	265

343 } // namespace speech_input	266 } // namespace speech_input

OLD	NEW

« chrome/browser/speech/audio_encoder.cc ('K') | « chrome/browser/speech/speech_recognizer.h ('k') | chrome/chrome_browser.gypi » ('j') | no next file with comments »