Index: content/browser/speech/speech_recognizer_impl.cc
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc
index a08ffe17661af3b5ecaeffa381fa22b8a0c6308f..0f9632f58a5d0b86bff58a1b9137805ef4860658 100644
--- a/content/browser/speech/speech_recognizer_impl.cc
+++ b/content/browser/speech/speech_recognizer_impl.cc
@@ -42,6 +42,8 @@ class SpeechRecognizerImpl::OnDataConverter
   // |output_parameters_|.
   scoped_refptr<AudioChunk> Convert(const AudioBus* data);

+  bool data_was_converted() const { return data_was_converted_; }
+
  private:
   // media::AudioConverter::InputCallback implementation.
   double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override;
@@ -54,7 +56,7 @@ class SpeechRecognizerImpl::OnDataConverter
   scoped_ptr<AudioBus> output_bus_;
   const AudioParameters input_parameters_;
   const AudioParameters output_parameters_;
-  bool waiting_for_input_;
+  bool data_was_converted_;

   DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
 };
@@ -119,8 +121,9 @@ SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
       output_bus_(AudioBus::Create(output_params)),
       input_parameters_(input_params),
       output_parameters_(output_params),
-      waiting_for_input_(false) {
+      data_was_converted_(false) {
   audio_converter_.AddInput(this);
+  audio_converter_.PrimeWithSilence();
 }

 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
@@ -132,12 +135,18 @@ SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
     const AudioBus* data) {
   CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());
-
+  data_was_converted_ = false;
+  // Copy recorded audio to the |input_bus_| for later use in ProvideInput().
   data->CopyTo(input_bus_.get());
-
-  waiting_for_input_ = true;
+  // Convert the audio and place the result in |output_bus_|. This call will
+  // result in a ProvideInput() callback where the actual input is provided.
+  // However, it can happen that the converter contains enough cached data
+  // to return a result without calling ProvideInput(). The caller of this
+  // method should check the state of data_was_converted() and make an
+  // additional call if it returns false.
+  // See http://crbug.com/506051 for details.
   audio_converter_.Convert(output_bus_.get());
-
+  // Create an audio chunk based on the converted result.
   scoped_refptr<AudioChunk> chunk(
       new AudioChunk(output_parameters_.GetBytesPerBuffer(),
                      output_parameters_.bits_per_sample() / 8));
@@ -149,16 +158,10 @@ scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(

 double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
     AudioBus* dest, base::TimeDelta buffer_delay) {
-  // The audio converted should never ask for more than one bus in each call
-  // to Convert(). If so, we have a serious issue in our design since we might
-  // miss recorded chunks of 100 ms audio data.
-  CHECK(waiting_for_input_);
-
   // Read from the input bus to feed the converter.
   input_bus_->CopyTo(dest);
-
-  // |input_bus_| should only be provide once.
-  waiting_for_input_ = false;
+  // Indicate that the recorded audio has in fact been used by the converter.
+  data_was_converted_ = true;
   return 1;
 }

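To see why Convert() can occasionally complete without a ProvideInput() callback, it helps to model the converter's internal FIFO. The following is a minimal, self-contained C++ sketch, not Chromium code: ToyConverter and its frame counts are invented stand-ins for media::AudioConverter, with the input buffer deliberately made slightly larger than the output chunk (as in the pre-patch "+ 2 ms" sizing) so the cached surplus grows until one call is served entirely from cache.

// Toy model of the converter buffering described in the hunks above. This
// is not Chromium code: ToyConverter is a hypothetical stand-in for
// media::AudioConverter, scaled down to small frame counts.
#include <cstddef>
#include <deque>
#include <iostream>

class ToyConverter {
 public:
  ToyConverter(size_t input_frames, size_t output_frames)
      : input_frames_(input_frames), output_frames_(output_frames) {}

  // Rough analogue of AudioConverter::PrimeWithSilence(): pre-fill the
  // FIFO with zeros so the first Convert() has data available.
  void PrimeWithSilence() { fifo_.assign(input_frames_, 0.0f); }

  // One Convert() call: produce |output_frames_| frames, asking the client
  // for a fixed-size input buffer (ProvideInput) only when the FIFO is dry.
  // Returns whether ProvideInput() ran, i.e. data_was_converted_.
  bool Convert() {
    bool provide_input_called = false;
    for (size_t produced = 0; produced < output_frames_; ++produced) {
      if (fifo_.empty()) {
        fifo_.insert(fifo_.end(), input_frames_, 0.0f);  // ProvideInput().
        provide_input_called = true;
      }
      fifo_.pop_front();
    }
    return provide_input_called;
  }

 private:
  const size_t input_frames_;
  const size_t output_frames_;
  std::deque<float> fifo_;
};

int main() {
  // Input buffers slightly larger than output chunks, mimicking the old
  // "+ 2 ms" sizing: the cached surplus grows by 2 frames per input buffer.
  ToyConverter converter(/*input_frames=*/102, /*output_frames=*/100);
  for (int i = 1; i <= 120; ++i) {
    if (!converter.Convert())
      std::cout << "Convert() call " << i << " was served from cache only\n";
  }
}

In this model the surplus reaches a full output chunk on call 51, which is then served without any ProvideInput(). The old CHECK(waiting_for_input_) would have crashed at that point; with the patch, the caller sees data_was_converted() return false and issues one extra Convert() to stay on the 100 ms cadence.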
@@ -273,10 +276,22 @@ void SpeechRecognizerImpl::OnData(AudioInputController* controller,
   // Convert audio from native format to fixed format used by WebSpeech.
   FSMEventArgs event_args(EVENT_AUDIO_DATA);
   event_args.audio_data = audio_converter_->Convert(data);
-
   BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                           base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                      this, event_args));
+  // See http://crbug.com/506051 regarding why one extra convert call can
DaleCurtis (2015/07/08 18:41:47):
Oh, you could just do a while (!data_was_converted

henrika (OOO until Aug 14) (2015/07/09 10:06:09):
Great point, tried that. But what then happens is:
+  // sometimes be required. It should be a rare case.
+  if (!audio_converter_->data_was_converted()) {
+    DCHECK(false);
DaleCurtis (2015/07/08 18:41:47):
Remove dcheck and dlog, I don't think it's unexpec

henrika (OOO until Aug 14) (2015/07/09 10:06:09):
Done.
+    DLOG(WARNING) << "One extra convert call is required";
+    event_args.audio_data = audio_converter_->Convert(data);
+    BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                            base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                       this, event_args));
+  }
+  // Something is seriously wrong here and we are most likely missing some
+  // audio segments.
+  CHECK(audio_converter_->data_was_converted());
 }

 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
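Regarding the truncated exchange above: the suggested alternative would loop until the converter reports that the recorded buffer was consumed, rather than making exactly one extra call. A sketch of that variant, reusing the names from the hunk above; this is not the code that landed, and henrika's reply explaining the problem with it is cut off in the thread.

// Sketch of the loop variant discussed in the review; NOT the code that
// landed. Each pass converts and dispatches one more cached chunk.
while (!audio_converter_->data_was_converted()) {
  event_args.audio_data = audio_converter_->Convert(data);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}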
@@ -523,6 +538,8 @@ SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
   AudioParameters output_parameters = AudioParameters(
       AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
       kNumBitsPerAudioSample, frames_per_buffer);
+  DVLOG(1) << "SRI::output_parameters: "
+           << output_parameters.AsHumanReadableString();

   // Audio converter will receive audio based on these parameters as input.
   // On Windows we start by verifying that Core Audio is supported. If not,
@@ -543,17 +560,17 @@ SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
// We rely on internal buffers in the audio back-end to fulfill this request |
// and the idea is to simplify the audio conversion since each Convert() |
// call will then render exactly one ProvideInput() call. |
- // Due to implementation details in the audio converter, 2 milliseconds |
- // are added to the default frame size (100 ms) to ensure there is enough |
- // data to generate 100 ms of output when resampling. |
+ // in_params.sample_rate() |
frames_per_buffer = |
- ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; |
+ ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
DaleCurtis (2015/07/08 18:39:46):
One last thing, can verify that the chunk size for

henrika (OOO until Aug 14) (2015/07/09 10:06:09):
SRI::output_parameters: format: 1 channels: 1 chan
     input_parameters.Reset(in_params.format(),
                            in_params.channel_layout(),
                            in_params.channels(),
                            in_params.sample_rate(),
                            in_params.bits_per_sample(),
                            frames_per_buffer);
+    DVLOG(1) << "SRI::input_parameters: "
+             << input_parameters.AsHumanReadableString();
   }

   // Create an audio converter which converts data between native input format
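The new buffer-size formula rounds sample_rate * chunk_duration_ms / 1000 to the nearest integer via the add-0.5-then-truncate idiom. A small standalone check of that arithmetic; the sample rates below are illustrative, since the real value is whatever in_params.sample_rate() reports for the capture device.

// Standalone check of the frames_per_buffer arithmetic in the hunk above.
#include <iostream>

int main() {
  const int chunk_duration_ms = 100;  // One WebSpeech audio chunk.
  for (int sample_rate : {16000, 44100, 48000}) {
    // Double arithmetic plus 0.5, truncated by the int conversion,
    // rounds to the nearest whole frame count.
    int frames_per_buffer =
        ((sample_rate * chunk_duration_ms) / 1000.0) + 0.5;
    std::cout << sample_rate << " Hz -> " << frames_per_buffer
              << " frames per 100 ms buffer\n";
  }
}

At 44.1 kHz this yields 4410 frames per buffer, whereas the removed "+ 2 ms" variant would have yielded 4498; the padding is no longer needed now that the converter is primed with silence and the rare extra Convert() call is handled in OnData().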