OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/speech/speech_recognizer.h" | 5 #include "chrome/browser/speech/speech_recognizer.h" |
6 | 6 |
7 #include "base/ref_counted.h" | 7 #include "base/ref_counted.h" |
8 #include "base/scoped_ptr.h" | 8 #include "base/scoped_ptr.h" |
9 #include "base/time.h" | 9 #include "base/time.h" |
10 #include "chrome/browser/browser_thread.h" | 10 #include "chrome/browser/browser_thread.h" |
11 #include "chrome/browser/profiles/profile.h" | 11 #include "chrome/browser/profiles/profile.h" |
12 #include "chrome/common/net/url_request_context_getter.h" | 12 #include "chrome/common/net/url_request_context_getter.h" |
13 #include "third_party/speex/speex.h" | |
14 | 13 |
15 using media::AudioInputController; | 14 using media::AudioInputController; |
16 using std::list; | |
17 using std::string; | 15 using std::string; |
18 | 16 |
19 namespace { | 17 namespace { |
20 const char* const kContentTypeSpeex = | |
21 "audio/x-speex-with-header-byte; rate=16000"; | |
22 const int kSpeexEncodingQuality = 8; | |
23 const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). | |
24 | |
25 // Since the frame length gets written out as a byte in the encoded packet, | |
26 // make sure it is within the byte range. | |
27 COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); | |
28 | 18 |
29 // The following constants are related to the volume level indicator shown in | 19 // The following constants are related to the volume level indicator shown in |
30 // the UI for recorded audio. | 20 // the UI for recorded audio. |
31 // Multiplier used when new volume is greater than previous level. | 21 // Multiplier used when new volume is greater than previous level. |
32 const float kUpSmoothingFactor = 0.9f; | 22 const float kUpSmoothingFactor = 0.9f; |
33 // Multiplier used when new volume is lesser than previous level. | 23 // Multiplier used when new volume is lesser than previous level. |
34 const float kDownSmoothingFactor = 0.4f; | 24 const float kDownSmoothingFactor = 0.4f; |
35 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. | 25 const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. |
36 const float kAudioMeterDbRange = 25.0f; | 26 const float kAudioMeterDbRange = 25.0f; |
37 } // namespace | 27 } // namespace |
38 | 28 |
39 namespace speech_input { | 29 namespace speech_input { |
40 | 30 |
41 const int SpeechRecognizer::kAudioSampleRate = 16000; | 31 const int SpeechRecognizer::kAudioSampleRate = 16000; |
42 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; | 32 const int SpeechRecognizer::kAudioPacketIntervalMs = 100; |
43 const int SpeechRecognizer::kNumAudioChannels = 1; | 33 const int SpeechRecognizer::kNumAudioChannels = 1; |
44 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; | 34 const int SpeechRecognizer::kNumBitsPerAudioSample = 16; |
45 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; | 35 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; |
46 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; | 36 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; |
47 | 37 |
48 // Provides a simple interface to encode raw audio using the Speex codec. | |
49 class SpeexEncoder { | |
50 public: | |
51 SpeexEncoder(); | |
52 ~SpeexEncoder(); | |
53 | |
54 int samples_per_frame() const { return samples_per_frame_; } | |
55 | |
56 // Encodes each frame of raw audio in |samples| and adds the | |
57 // encoded frames as a set of strings to the |encoded_frames| list. | |
58 // Ownership of the newly added strings is transferred to the caller. | |
59 void Encode(const short* samples, | |
60 int num_samples, | |
61 std::list<std::string*>* encoded_frames); | |
62 | |
63 private: | |
64 SpeexBits bits_; | |
65 void* encoder_state_; | |
66 int samples_per_frame_; | |
67 char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. | |
68 }; | |
69 | |
70 SpeexEncoder::SpeexEncoder() { | |
71 // speex_bits_init() does not initialize all of the |bits_| struct. | |
72 memset(&bits_, 0, sizeof(bits_)); | |
73 speex_bits_init(&bits_); | |
74 encoder_state_ = speex_encoder_init(&speex_wb_mode); | |
75 DCHECK(encoder_state_); | |
76 speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); | |
77 DCHECK(samples_per_frame_ > 0); | |
78 int quality = kSpeexEncodingQuality; | |
79 speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); | |
80 int vbr = 1; | |
81 speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); | |
82 memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); | |
83 } | |
84 | |
85 SpeexEncoder::~SpeexEncoder() { | |
86 speex_bits_destroy(&bits_); | |
87 speex_encoder_destroy(encoder_state_); | |
88 } | |
89 | |
90 void SpeexEncoder::Encode(const short* samples, | |
91 int num_samples, | |
92 std::list<std::string*>* encoded_frames) { | |
93 // Drop incomplete frames, typically those which come in when recording stops. | |
94 num_samples -= (num_samples % samples_per_frame_); | |
95 for (int i = 0; i < num_samples; i += samples_per_frame_) { | |
96 speex_bits_reset(&bits_); | |
97 speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), | |
98 &bits_); | |
99 | |
100 // Encode the frame and place the size of the frame as the first byte. This | |
101 // is the packet format for MIME type x-speex-with-header-byte. | |
102 int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, | |
103 kMaxSpeexFrameLength); | |
104 encoded_frame_data_[0] = static_cast<char>(frame_length); | |
105 encoded_frames->push_back(new string(encoded_frame_data_, | |
106 frame_length + 1)); | |
107 } | |
108 } | |
109 | |
110 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, | 38 SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
111 int caller_id, | 39 int caller_id, |
112 const std::string& language, | 40 const std::string& language, |
113 const std::string& grammar, | 41 const std::string& grammar, |
114 const std::string& hardware_info) | 42 const std::string& hardware_info) |
115 : delegate_(delegate), | 43 : delegate_(delegate), |
116 caller_id_(caller_id), | 44 caller_id_(caller_id), |
117 language_(language), | 45 language_(language), |
118 grammar_(grammar), | 46 grammar_(grammar), |
119 hardware_info_(hardware_info), | 47 hardware_info_(hardware_info), |
120 encoder_(new SpeexEncoder()), | 48 codec_(AudioEncoder::CODEC_SPEEX), |
| 49 encoder_(NULL), |
121 endpointer_(kAudioSampleRate), | 50 endpointer_(kAudioSampleRate), |
122 num_samples_recorded_(0), | 51 num_samples_recorded_(0), |
123 audio_level_(0.0f) { | 52 audio_level_(0.0f) { |
124 endpointer_.set_speech_input_complete_silence_length( | 53 endpointer_.set_speech_input_complete_silence_length( |
125 base::Time::kMicrosecondsPerSecond / 2); | 54 base::Time::kMicrosecondsPerSecond / 2); |
126 endpointer_.set_long_speech_input_complete_silence_length( | 55 endpointer_.set_long_speech_input_complete_silence_length( |
127 base::Time::kMicrosecondsPerSecond); | 56 base::Time::kMicrosecondsPerSecond); |
128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); | 57 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
129 endpointer_.StartSession(); | 58 endpointer_.StartSession(); |
130 } | 59 } |
131 | 60 |
132 SpeechRecognizer::~SpeechRecognizer() { | 61 SpeechRecognizer::~SpeechRecognizer() { |
133 // Recording should have stopped earlier due to the endpointer or | 62 // Recording should have stopped earlier due to the endpointer or |
134 // |StopRecording| being called. | 63 // |StopRecording| being called. |
135 DCHECK(!audio_controller_.get()); | 64 DCHECK(!audio_controller_.get()); |
136 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 65 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
137 DCHECK(audio_buffers_.empty()); | 66 DCHECK(!encoder_.get()); |
138 endpointer_.EndSession(); | 67 endpointer_.EndSession(); |
139 } | 68 } |
140 | 69 |
141 bool SpeechRecognizer::StartRecording() { | 70 bool SpeechRecognizer::StartRecording() { |
142 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 71 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
143 DCHECK(!audio_controller_.get()); | 72 DCHECK(!audio_controller_.get()); |
144 DCHECK(!request_.get() || !request_->HasPendingRequest()); | 73 DCHECK(!request_.get() || !request_->HasPendingRequest()); |
| 74 DCHECK(!encoder_.get()); |
145 | 75 |
146 // The endpointer needs to estimate the environment/background noise before | 76 // The endpointer needs to estimate the environment/background noise before |
147 // starting to treat the audio as user input. In |HandleOnData| we wait until | 77 // starting to treat the audio as user input. In |HandleOnData| we wait until |
148 // such time has passed before switching to user input mode. | 78 // such time has passed before switching to user input mode. |
149 endpointer_.SetEnvironmentEstimationMode(); | 79 endpointer_.SetEnvironmentEstimationMode(); |
150 | 80 |
| 81 encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, |
| 82 kNumBitsPerAudioSample)); |
151 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; | 83 int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
152 DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); | |
153 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, | 84 AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, |
154 kAudioSampleRate, kNumBitsPerAudioSample, | 85 kAudioSampleRate, kNumBitsPerAudioSample, |
155 samples_per_packet); | 86 samples_per_packet); |
156 audio_controller_ = AudioInputController::Create(this, params); | 87 audio_controller_ = AudioInputController::Create(this, params); |
157 DCHECK(audio_controller_.get()); | 88 DCHECK(audio_controller_.get()); |
158 VLOG(1) << "SpeechRecognizer starting record."; | 89 VLOG(1) << "SpeechRecognizer starting record."; |
159 num_samples_recorded_ = 0; | 90 num_samples_recorded_ = 0; |
160 audio_controller_->Record(); | 91 audio_controller_->Record(); |
161 | 92 |
162 return true; | 93 return true; |
163 } | 94 } |
164 | 95 |
165 void SpeechRecognizer::CancelRecognition() { | 96 void SpeechRecognizer::CancelRecognition() { |
166 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 97 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
167 DCHECK(audio_controller_.get() || request_.get()); | 98 DCHECK(audio_controller_.get() || request_.get()); |
168 | 99 |
169 // Stop recording if required. | 100 // Stop recording if required. |
170 if (audio_controller_.get()) { | 101 if (audio_controller_.get()) { |
171 VLOG(1) << "SpeechRecognizer stopping record."; | 102 VLOG(1) << "SpeechRecognizer stopping record."; |
172 audio_controller_->Close(); | 103 audio_controller_->Close(); |
173 audio_controller_ = NULL; // Releases the ref ptr. | 104 audio_controller_ = NULL; // Releases the ref ptr. |
174 } | 105 } |
175 | 106 |
176 VLOG(1) << "SpeechRecognizer canceling recognition."; | 107 VLOG(1) << "SpeechRecognizer canceling recognition."; |
177 ReleaseAudioBuffers(); | 108 encoder_.reset(); |
178 request_.reset(); | 109 request_.reset(); |
179 } | 110 } |
180 | 111 |
181 void SpeechRecognizer::StopRecording() { | 112 void SpeechRecognizer::StopRecording() { |
182 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); | 113 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
183 | 114 |
184 // If audio recording has already stopped and we are in recognition phase, | 115 // If audio recording has already stopped and we are in recognition phase, |
185 // silently ignore any more calls to stop recording. | 116 // silently ignore any more calls to stop recording. |
186 if (!audio_controller_.get()) | 117 if (!audio_controller_.get()) |
187 return; | 118 return; |
188 | 119 |
189 VLOG(1) << "SpeechRecognizer stopping record."; | 120 VLOG(1) << "SpeechRecognizer stopping record."; |
190 audio_controller_->Close(); | 121 audio_controller_->Close(); |
191 audio_controller_ = NULL; // Releases the ref ptr. | 122 audio_controller_ = NULL; // Releases the ref ptr. |
| 123 encoder_->Flush(); |
192 | 124 |
193 delegate_->DidCompleteRecording(caller_id_); | 125 delegate_->DidCompleteRecording(caller_id_); |
194 | 126 |
195 // If we haven't got any audio yet end the recognition sequence here. | 127 // Since the http request takes a single string as POST data, allocate |
196 if (audio_buffers_.empty()) { | 128 // one and copy over bytes from the audio buffers to the string. |
| 129 // If we haven't got any audio yet, end the recognition sequence here. |
| 130 string data; |
| 131 if (!encoder_->GetEncodedData(&data)) { |
197 // Guard against the delegate freeing us until we finish our job. | 132 // Guard against the delegate freeing us until we finish our job. |
198 scoped_refptr<SpeechRecognizer> me(this); | 133 scoped_refptr<SpeechRecognizer> me(this); |
199 delegate_->DidCompleteRecognition(caller_id_); | 134 delegate_->DidCompleteRecognition(caller_id_); |
200 return; | 135 } else { |
| 136 DCHECK(!request_.get()); |
| 137 request_.reset(new SpeechRecognitionRequest( |
| 138 Profile::GetDefaultRequestContext(), this)); |
| 139 request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(), |
| 140 data); |
201 } | 141 } |
202 | 142 encoder_.reset(); |
203 // We now have recorded audio in our buffers, so start a recognition request. | |
204 // Since the http request takes a single string as POST data, allocate | |
205 // one and copy over bytes from the audio buffers to the string. | |
206 int audio_buffer_length = 0; | |
207 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
208 it != audio_buffers_.end(); it++) { | |
209 audio_buffer_length += (*it)->length(); | |
210 } | |
211 string data; | |
212 data.reserve(audio_buffer_length); | |
213 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
214 it != audio_buffers_.end(); it++) { | |
215 data.append(*(*it)); | |
216 } | |
217 | |
218 DCHECK(!request_.get()); | |
219 request_.reset(new SpeechRecognitionRequest( | |
220 Profile::GetDefaultRequestContext(), this)); | |
221 request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data); | |
222 ReleaseAudioBuffers(); // No need to keep the audio anymore. | |
223 } | 143 } |
224 | 144 |
225 void SpeechRecognizer::ReleaseAudioBuffers() { | 145 void SpeechRecognizer::ReleaseAudioBuffers() { |
226 for (AudioBufferQueue::iterator it = audio_buffers_.begin(); | |
227 it != audio_buffers_.end(); it++) | |
228 delete *it; | |
229 audio_buffers_.clear(); | |
230 } | 146 } |
231 | 147 |
232 // Invoked in the audio thread. | 148 // Invoked in the audio thread. |
233 void SpeechRecognizer::OnError(AudioInputController* controller, | 149 void SpeechRecognizer::OnError(AudioInputController* controller, |
234 int error_code) { | 150 int error_code) { |
235 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 151 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
236 NewRunnableMethod(this, | 152 NewRunnableMethod(this, |
237 &SpeechRecognizer::HandleOnError, | 153 &SpeechRecognizer::HandleOnError, |
238 error_code)); | 154 error_code)); |
239 } | 155 } |
(...skipping 28 matching lines...) Expand all Loading... |
268 // by |OnData|. | 184 // by |OnData|. |
269 if (!audio_controller_.get()) { | 185 if (!audio_controller_.get()) { |
270 delete data; | 186 delete data; |
271 return; | 187 return; |
272 } | 188 } |
273 | 189 |
274 const short* samples = reinterpret_cast<const short*>(data->data()); | 190 const short* samples = reinterpret_cast<const short*>(data->data()); |
275 DCHECK((data->length() % sizeof(short)) == 0); | 191 DCHECK((data->length() % sizeof(short)) == 0); |
276 int num_samples = data->length() / sizeof(short); | 192 int num_samples = data->length() / sizeof(short); |
277 | 193 |
278 encoder_->Encode(samples, num_samples, &audio_buffers_); | 194 encoder_->Encode(samples, num_samples); |
279 float rms; | 195 float rms; |
280 endpointer_.ProcessAudio(samples, num_samples, &rms); | 196 endpointer_.ProcessAudio(samples, num_samples, &rms); |
281 delete data; | 197 delete data; |
282 num_samples_recorded_ += num_samples; | 198 num_samples_recorded_ += num_samples; |
283 | 199 |
284 if (endpointer_.IsEstimatingEnvironment()) { | 200 if (endpointer_.IsEstimatingEnvironment()) { |
285 // Check if we have gathered enough audio for the endpointer to do | 201 // Check if we have gathered enough audio for the endpointer to do |
286 // environment estimation and should move on to detect speech/end of speech. | 202 // environment estimation and should move on to detect speech/end of speech. |
287 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * | 203 if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * |
288 kAudioSampleRate) / 1000) { | 204 kAudioSampleRate) / 1000) { |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
334 | 250 |
335 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { | 251 void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { |
336 CancelRecognition(); | 252 CancelRecognition(); |
337 | 253 |
338 // Guard against the delegate freeing us until we finish our job. | 254 // Guard against the delegate freeing us until we finish our job. |
339 scoped_refptr<SpeechRecognizer> me(this); | 255 scoped_refptr<SpeechRecognizer> me(this); |
340 delegate_->OnRecognizerError(caller_id_, error); | 256 delegate_->OnRecognizerError(caller_id_, error); |
341 } | 257 } |
342 | 258 |
343 } // namespace speech_input | 259 } // namespace speech_input |
OLD | NEW |