Index: chrome/browser/speech/speech_recognizer.cc |
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc |
index 277393ca08f7e601f9d6832f33324cdef7606978..6d46a72b0776aa0da705c0c237ae12bb9efa31ed 100644 |
--- a/chrome/browser/speech/speech_recognizer.cc |
+++ b/chrome/browser/speech/speech_recognizer.cc |
@@ -10,21 +10,11 @@ |
#include "chrome/browser/browser_thread.h" |
#include "chrome/browser/profiles/profile.h" |
#include "chrome/common/net/url_request_context_getter.h" |
-#include "third_party/speex/speex.h" |
using media::AudioInputController; |
-using std::list; |
using std::string; |
namespace { |
-const char* const kContentTypeSpeex = |
- "audio/x-speex-with-header-byte; rate=16000"; |
-const int kSpeexEncodingQuality = 8; |
-const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). |
- |
-// Since the frame length gets written out as a byte in the encoded packet, |
-// make sure it is within the byte range. |
-COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); |
// The following constants are related to the volume level indicator shown in |
// the UI for recorded audio. |
@@ -45,68 +35,6 @@ const int SpeechRecognizer::kNumBitsPerAudioSample = 16; |
const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; |
const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; |
-// Provides a simple interface to encode raw audio using the Speex codec. |
-class SpeexEncoder { |
- public: |
- SpeexEncoder(); |
- ~SpeexEncoder(); |
- |
- int samples_per_frame() const { return samples_per_frame_; } |
- |
- // Encodes each frame of raw audio in |samples| and adds the |
- // encoded frames as a set of strings to the |encoded_frames| list. |
- // Ownership of the newly added strings is transferred to the caller. |
- void Encode(const short* samples, |
- int num_samples, |
- std::list<std::string*>* encoded_frames); |
- |
- private: |
- SpeexBits bits_; |
- void* encoder_state_; |
- int samples_per_frame_; |
- char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. |
-}; |
- |
-SpeexEncoder::SpeexEncoder() { |
- // speex_bits_init() does not initialize all of the |bits_| struct. |
- memset(&bits_, 0, sizeof(bits_)); |
- speex_bits_init(&bits_); |
- encoder_state_ = speex_encoder_init(&speex_wb_mode); |
- DCHECK(encoder_state_); |
- speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); |
- DCHECK(samples_per_frame_ > 0); |
- int quality = kSpeexEncodingQuality; |
- speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); |
- int vbr = 1; |
- speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); |
- memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); |
-} |
- |
-SpeexEncoder::~SpeexEncoder() { |
- speex_bits_destroy(&bits_); |
- speex_encoder_destroy(encoder_state_); |
-} |
- |
-void SpeexEncoder::Encode(const short* samples, |
- int num_samples, |
- std::list<std::string*>* encoded_frames) { |
- // Drop incomplete frames, typically those which come in when recording stops. |
- num_samples -= (num_samples % samples_per_frame_); |
- for (int i = 0; i < num_samples; i += samples_per_frame_) { |
- speex_bits_reset(&bits_); |
- speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), |
- &bits_); |
- |
- // Encode the frame and place the size of the frame as the first byte. This |
- // is the packet format for MIME type x-speex-with-header-byte. |
- int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, |
- kMaxSpeexFrameLength); |
- encoded_frame_data_[0] = static_cast<char>(frame_length); |
- encoded_frames->push_back(new string(encoded_frame_data_, |
- frame_length + 1)); |
- } |
-} |
- |
SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
int caller_id, |
const std::string& language, |
@@ -117,7 +45,8 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate, |
language_(language), |
grammar_(grammar), |
hardware_info_(hardware_info), |
- encoder_(new SpeexEncoder()), |
+ codec_(AudioEncoder::CODEC_SPEEX), |
+ encoder_(NULL), |
endpointer_(kAudioSampleRate), |
num_samples_recorded_(0), |
audio_level_(0.0f) { |
@@ -134,7 +63,7 @@ SpeechRecognizer::~SpeechRecognizer() { |
// |StopRecording| being called. |
DCHECK(!audio_controller_.get()); |
DCHECK(!request_.get() || !request_->HasPendingRequest()); |
- DCHECK(audio_buffers_.empty()); |
+ DCHECK(!encoder_.get()); |
endpointer_.EndSession(); |
} |
@@ -142,14 +71,16 @@ bool SpeechRecognizer::StartRecording() { |
DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
DCHECK(!audio_controller_.get()); |
DCHECK(!request_.get() || !request_->HasPendingRequest()); |
+ DCHECK(!encoder_.get()); |
// The endpointer needs to estimate the environment/background noise before |
// starting to treat the audio as user input. In |HandleOnData| we wait until |
// such time has passed before switching to user input mode. |
endpointer_.SetEnvironmentEstimationMode(); |
+ encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, |
+ kNumBitsPerAudioSample)); |
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; |
- DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); |
AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, |
kAudioSampleRate, kNumBitsPerAudioSample, |
samples_per_packet); |
@@ -174,7 +105,7 @@ void SpeechRecognizer::CancelRecognition() { |
} |
VLOG(1) << "SpeechRecognizer canceling recognition."; |
- ReleaseAudioBuffers(); |
+ encoder_.reset(); |
request_.reset(); |
} |
@@ -189,44 +120,29 @@ void SpeechRecognizer::StopRecording() { |
VLOG(1) << "SpeechRecognizer stopping record."; |
audio_controller_->Close(); |
audio_controller_ = NULL; // Releases the ref ptr. |
+ encoder_->Flush(); |
delegate_->DidCompleteRecording(caller_id_); |
- // If we haven't got any audio yet end the recognition sequence here. |
- if (audio_buffers_.empty()) { |
- // Guard against the delegate freeing us until we finish our job. |
- scoped_refptr<SpeechRecognizer> me(this); |
- delegate_->DidCompleteRecognition(caller_id_); |
- return; |
- } |
- |
- // We now have recorded audio in our buffers, so start a recognition request. |
// Since the http request takes a single string as POST data, allocate |
// one and copy over bytes from the audio buffers to the string. |
- int audio_buffer_length = 0; |
- for (AudioBufferQueue::iterator it = audio_buffers_.begin(); |
- it != audio_buffers_.end(); it++) { |
- audio_buffer_length += (*it)->length(); |
- } |
+ // If no audio was captured, end the recognition sequence here instead. |
string data; |
- data.reserve(audio_buffer_length); |
- for (AudioBufferQueue::iterator it = audio_buffers_.begin(); |
- it != audio_buffers_.end(); it++) { |
- data.append(*(*it)); |
+ // Keep this object alive past the delegate callback and encoder_.reset(). |
+ scoped_refptr<SpeechRecognizer> me(this); |
+ if (!encoder_->GetEncodedData(&data)) { |
+ delegate_->DidCompleteRecognition(caller_id_); |
+ } else { |
+ DCHECK(!request_.get()); |
+ request_.reset(new SpeechRecognitionRequest( |
+ Profile::GetDefaultRequestContext(), this)); |
+ request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(), |
+ data); |
} |
- |
- DCHECK(!request_.get()); |
- request_.reset(new SpeechRecognitionRequest( |
- Profile::GetDefaultRequestContext(), this)); |
- request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data); |
- ReleaseAudioBuffers(); // No need to keep the audio anymore. |
+ encoder_.reset(); |
} |
void SpeechRecognizer::ReleaseAudioBuffers() { |
- for (AudioBufferQueue::iterator it = audio_buffers_.begin(); |
- it != audio_buffers_.end(); it++) |
- delete *it; |
- audio_buffers_.clear(); |
} |
// Invoked in the audio thread. |
@@ -275,7 +191,7 @@ void SpeechRecognizer::HandleOnData(string* data) { |
DCHECK((data->length() % sizeof(short)) == 0); |
int num_samples = data->length() / sizeof(short); |
- encoder_->Encode(samples, num_samples, &audio_buffers_); |
+ encoder_->Encode(samples, num_samples); |
float rms; |
endpointer_.ProcessAudio(samples, num_samples, &rms); |
delete data; |