Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(42)

Unified Diff: chrome/browser/speech/speech_recognizer.cc

Issue 6111009: Add the option of compressing speech input audio using FLAC. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 9 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/speech/speech_recognizer.cc
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 277393ca08f7e601f9d6832f33324cdef7606978..73d6d3649b00ee4c52233a369b72216b214a6eb8 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -10,21 +10,16 @@
#include "chrome/browser/browser_thread.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
-#include "third_party/speex/speex.h"
+#include "third_party/flac/flac.h"
bulach 2011/01/12 16:27:07 no longer needed?
using media::AudioInputController;
using std::list;
using std::string;
namespace {
+const char* const kContentTypeFLAC = "audio/x-flac; rate=16000";
const char* const kContentTypeSpeex =
"audio/x-speex-with-header-byte; rate=16000";
bulach 2011/01/12 16:27:07 it'd be nice to move these to the new encoder inte
-const int kSpeexEncodingQuality = 8;
-const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
-
-// Since the frame length gets written out as a byte in the encoded packet,
-// make sure it is within the byte range.
-COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
@@ -45,68 +40,6 @@ const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;
-// Provides a simple interface to encode raw audio using the Speex codec.
-class SpeexEncoder {
- public:
- SpeexEncoder();
- ~SpeexEncoder();
-
- int samples_per_frame() const { return samples_per_frame_; }
-
- // Encodes each frame of raw audio in |samples| and adds the
- // encoded frames as a set of strings to the |encoded_frames| list.
- // Ownership of the newly added strings is transferred to the caller.
- void Encode(const short* samples,
- int num_samples,
- std::list<std::string*>* encoded_frames);
-
- private:
- SpeexBits bits_;
- void* encoder_state_;
- int samples_per_frame_;
- char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.
-};
-
-SpeexEncoder::SpeexEncoder() {
- // speex_bits_init() does not initialize all of the |bits_| struct.
- memset(&bits_, 0, sizeof(bits_));
- speex_bits_init(&bits_);
- encoder_state_ = speex_encoder_init(&speex_wb_mode);
- DCHECK(encoder_state_);
- speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
- DCHECK(samples_per_frame_ > 0);
- int quality = kSpeexEncodingQuality;
- speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
- int vbr = 1;
- speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
- memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
-}
-
-SpeexEncoder::~SpeexEncoder() {
- speex_bits_destroy(&bits_);
- speex_encoder_destroy(encoder_state_);
-}
-
-void SpeexEncoder::Encode(const short* samples,
- int num_samples,
- std::list<std::string*>* encoded_frames) {
- // Drop incomplete frames, typically those which come in when recording stops.
- num_samples -= (num_samples % samples_per_frame_);
- for (int i = 0; i < num_samples; i += samples_per_frame_) {
- speex_bits_reset(&bits_);
- speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
- &bits_);
-
- // Encode the frame and place the size of the frame as the first byte. This
- // is the packet format for MIME type x-speex-with-header-byte.
- int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
- kMaxSpeexFrameLength);
- encoded_frame_data_[0] = static_cast<char>(frame_length);
- encoded_frames->push_back(new string(encoded_frame_data_,
- frame_length + 1));
- }
-}
-
SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
int caller_id,
const std::string& language,
@@ -117,7 +50,8 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
language_(language),
grammar_(grammar),
hardware_info_(hardware_info),
- encoder_(new SpeexEncoder()),
+ codec_(AudioEncoder::FLAC),
+ encoder_(NULL),
endpointer_(kAudioSampleRate),
num_samples_recorded_(0),
audio_level_(0.0f) {
@@ -134,7 +68,7 @@ SpeechRecognizer::~SpeechRecognizer() {
// |StopRecording| being called.
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
- DCHECK(audio_buffers_.empty());
+ DCHECK(!encoder_.get());
endpointer_.EndSession();
}
@@ -142,14 +76,16 @@ bool SpeechRecognizer::StartRecording() {
DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
+ DCHECK(!encoder_.get());
// The endpointer needs to estimate the environment/background noise before
// starting to treat the audio as user input. In |HandleOnData| we wait until
// such time has passed before switching to user input mode.
endpointer_.SetEnvironmentEstimationMode();
+ encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
+ kNumBitsPerAudioSample));
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
- DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
kAudioSampleRate, kNumBitsPerAudioSample,
samples_per_packet);
@@ -174,7 +110,7 @@ void SpeechRecognizer::CancelRecognition() {
}
VLOG(1) << "SpeechRecognizer canceling recognition.";
- ReleaseAudioBuffers();
+ encoder_.reset();
request_.reset();
}
@@ -189,44 +125,31 @@ void SpeechRecognizer::StopRecording() {
VLOG(1) << "SpeechRecognizer stopping record.";
audio_controller_->Close();
audio_controller_ = NULL; // Releases the ref ptr.
+ encoder_->Flush();
delegate_->DidCompleteRecording(caller_id_);
- // If we haven't got any audio yet end the recognition sequence here.
- if (audio_buffers_.empty()) {
+ // Since the http request takes a single string as POST data, allocate
+ // one and copy over bytes from the audio buffers to the string.
+ // If we haven't got any audio yet, end the recognition sequence here.
+ string data;
+ if (!encoder_->GetEncodedData(&data)) {
// Guard against the delegate freeing us until we finish our job.
scoped_refptr<SpeechRecognizer> me(this);
delegate_->DidCompleteRecognition(caller_id_);
return;
}
- // We now have recorded audio in our buffers, so start a recognition request.
- // Since the http request takes a single string as POST data, allocate
- // one and copy over bytes from the audio buffers to the string.
- int audio_buffer_length = 0;
- for (AudioBufferQueue::iterator it = audio_buffers_.begin();
- it != audio_buffers_.end(); it++) {
- audio_buffer_length += (*it)->length();
- }
- string data;
- data.reserve(audio_buffer_length);
- for (AudioBufferQueue::iterator it = audio_buffers_.begin();
- it != audio_buffers_.end(); it++) {
- data.append(*(*it));
- }
-
DCHECK(!request_.get());
request_.reset(new SpeechRecognitionRequest(
Profile::GetDefaultRequestContext(), this));
- request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data);
- ReleaseAudioBuffers(); // No need to keep the audio anymore.
+ request_->Send(language_, grammar_, hardware_info_,
+ (codec_ == AudioEncoder::FLAC) ? kContentTypeFLAC : kContentTypeSpeex,
+ data);
+ encoder_.reset();
}
void SpeechRecognizer::ReleaseAudioBuffers() {
- for (AudioBufferQueue::iterator it = audio_buffers_.begin();
- it != audio_buffers_.end(); it++)
- delete *it;
- audio_buffers_.clear();
}
// Invoked in the audio thread.
@@ -275,7 +198,7 @@ void SpeechRecognizer::HandleOnData(string* data) {
DCHECK((data->length() % sizeof(short)) == 0);
int num_samples = data->length() / sizeof(short);
- encoder_->Encode(samples, num_samples, &audio_buffers_);
+ encoder_->Encode(samples, num_samples);
float rms;
endpointer_.ProcessAudio(samples, num_samples, &rms);
delete data;

Powered by Google App Engine
This is Rietveld 408576698