Index: content/browser/speech/speech_recognizer_impl.cc
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc
index a08ffe17661af3b5ecaeffa381fa22b8a0c6308f..0f9632f58a5d0b86bff58a1b9137805ef4860658 100644
--- a/content/browser/speech/speech_recognizer_impl.cc
+++ b/content/browser/speech/speech_recognizer_impl.cc
@@ -42,6 +42,8 @@ class SpeechRecognizerImpl::OnDataConverter
   // |output_parameters_|.
   scoped_refptr<AudioChunk> Convert(const AudioBus* data);

+  bool data_was_converted() const { return data_was_converted_; }
+
  private:
   // media::AudioConverter::InputCallback implementation.
   double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override;
@@ -54,7 +56,7 @@ class SpeechRecognizerImpl::OnDataConverter
   scoped_ptr<AudioBus> output_bus_;
   const AudioParameters input_parameters_;
   const AudioParameters output_parameters_;
-  bool waiting_for_input_;
+  bool data_was_converted_;

   DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
 };
@@ -119,8 +121,9 @@ SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
       output_bus_(AudioBus::Create(output_params)),
       input_parameters_(input_params),
       output_parameters_(output_params),
-      waiting_for_input_(false) {
+      data_was_converted_(false) {
   audio_converter_.AddInput(this);
+  audio_converter_.PrimeWithSilence();
 }

 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
@@ -132,12 +135,18 @@ SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
     const AudioBus* data) {
   CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());
-
+  data_was_converted_ = false;
+  // Copy recorded audio to the |input_bus_| for later use in ProvideInput().
   data->CopyTo(input_bus_.get());
-
-  waiting_for_input_ = true;
+  // Convert the audio and place the result in |output_bus_|. This call will
+  // result in a ProvideInput() callback where the actual input is provided.
+  // However, it can happen that the converter contains enough cached data
+  // to return a result without calling ProvideInput(). The caller of this
+  // method should check the state of data_was_converted() and make an
+  // additional call if it returns false.
+  // See http://crbug.com/506051 for details.
   audio_converter_.Convert(output_bus_.get());
-
+  // Create an audio chunk based on the converted result.
   scoped_refptr<AudioChunk> chunk(
       new AudioChunk(output_parameters_.GetBytesPerBuffer(),
                      output_parameters_.bits_per_sample() / 8));
@@ -149,16 +158,10 @@ scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(

 double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
     AudioBus* dest, base::TimeDelta buffer_delay) {
-  // The audio converted should never ask for more than one bus in each call
-  // to Convert(). If so, we have a serious issue in our design since we might
-  // miss recorded chunks of 100 ms audio data.
-  CHECK(waiting_for_input_);
-
   // Read from the input bus to feed the converter.
   input_bus_->CopyTo(dest);
-
-  // |input_bus_| should only be provide once.
-  waiting_for_input_ = false;
+  // Indicate that the recorded audio has in fact been used by the converter.
+  data_was_converted_ = true;
   return 1;
 }

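To see why Convert() can occasionally complete without a ProvideInput() callback, it helps to model the converter's internal FIFO. The following is a minimal, self-contained C++ sketch, not Chromium code: ToyConverter and its frame counts are invented stand-ins for media::AudioConverter, with the input buffer deliberately made slightly larger than the output chunk (as in the pre-patch "+ 2 ms" sizing) so the cached surplus grows until one call is served entirely from cache.

// Toy model of the converter buffering described in the hunks above. This
// is not Chromium code: ToyConverter is a hypothetical stand-in for
// media::AudioConverter, scaled down to small frame counts.
#include <cstddef>
#include <deque>
#include <iostream>

class ToyConverter {
 public:
  ToyConverter(size_t input_frames, size_t output_frames)
      : input_frames_(input_frames), output_frames_(output_frames) {}

  // Rough analogue of AudioConverter::PrimeWithSilence(): pre-fill the
  // FIFO with zeros so the first Convert() has data available.
  void PrimeWithSilence() { fifo_.assign(input_frames_, 0.0f); }

  // One Convert() call: produce |output_frames_| frames, asking the client
  // for a fixed-size input buffer (ProvideInput) only when the FIFO is dry.
  // Returns whether ProvideInput() ran, i.e. data_was_converted_.
  bool Convert() {
    bool provide_input_called = false;
    for (size_t produced = 0; produced < output_frames_; ++produced) {
      if (fifo_.empty()) {
        fifo_.insert(fifo_.end(), input_frames_, 0.0f);  // ProvideInput().
        provide_input_called = true;
      }
      fifo_.pop_front();
    }
    return provide_input_called;
  }

 private:
  const size_t input_frames_;
  const size_t output_frames_;
  std::deque<float> fifo_;
};

int main() {
  // Input buffers slightly larger than output chunks, mimicking the old
  // "+ 2 ms" sizing: the cached surplus grows by 2 frames per input buffer.
  ToyConverter converter(/*input_frames=*/102, /*output_frames=*/100);
  for (int i = 1; i <= 120; ++i) {
    if (!converter.Convert())
      std::cout << "Convert() call " << i << " was served from cache only\n";
  }
}

In this model the surplus reaches a full output chunk on call 51, which is then served without any ProvideInput(). The old CHECK(waiting_for_input_) would have crashed at that point; with the patch, the caller sees data_was_converted() return false and issues one extra Convert() to stay on the 100 ms cadence.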
@@ -273,10 +276,22 @@ void SpeechRecognizerImpl::OnData(AudioInputController* controller,
   // Convert audio from native format to fixed format used by WebSpeech.
   FSMEventArgs event_args(EVENT_AUDIO_DATA);
   event_args.audio_data = audio_converter_->Convert(data);
-
   BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                           base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                      this, event_args));
+  // See http://crbug.com/506051 regarding why one extra convert call can
DaleCurtis (2015/07/08 18:41:47):
Oh, you could just do a while (!data_was_converted

henrika (OOO until Aug 14) (2015/07/09 10:06:09):
Great point, tried that. But what then happens is:
+  // sometimes be required. It should be a rare case.
+  if (!audio_converter_->data_was_converted()) {
+    DCHECK(false);
DaleCurtis (2015/07/08 18:41:47):
Remove dcheck and dlog, I don't think it's unexpec

henrika (OOO until Aug 14) (2015/07/09 10:06:09):
Done.
+    DLOG(WARNING) << "One extra convert call is required";
+    event_args.audio_data = audio_converter_->Convert(data);
+    BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                            base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                       this, event_args));
+  }
+  // Something is seriously wrong here and we are most likely missing some
+  // audio segments.
+  CHECK(audio_converter_->data_was_converted());
 }

 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
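Regarding the truncated exchange above: the suggested alternative would loop until the converter reports that the recorded buffer was consumed, rather than making exactly one extra call. A sketch of that variant, reusing the names from the hunk above; this is not the code that landed, and henrika's reply explaining the problem with it is cut off in the thread.

// Sketch of the loop variant discussed in the review; NOT the code that
// landed. Each pass converts and dispatches one more cached chunk.
while (!audio_converter_->data_was_converted()) {
  event_args.audio_data = audio_converter_->Convert(data);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}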
@@ -523,6 +538,8 @@ SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
   AudioParameters output_parameters = AudioParameters(
       AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
       kNumBitsPerAudioSample, frames_per_buffer);
+  DVLOG(1) << "SRI::output_parameters: "
+           << output_parameters.AsHumanReadableString();

   // Audio converter will receive audio based on these parameters as input.
   // On Windows we start by verifying that Core Audio is supported. If not,
@@ -543,17 +560,17 @@ SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
// We rely on internal buffers in the audio back-end to fulfill this request |
// and the idea is to simplify the audio conversion since each Convert() |
// call will then render exactly one ProvideInput() call. |
- // Due to implementation details in the audio converter, 2 milliseconds |
- // are added to the default frame size (100 ms) to ensure there is enough |
- // data to generate 100 ms of output when resampling. |
+ // in_params.sample_rate() |
frames_per_buffer = |
- ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; |
+ ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
DaleCurtis (2015/07/08 18:39:46):
One last thing, can verify that the chunk size for

henrika (OOO until Aug 14) (2015/07/09 10:06:09):
SRI::output_parameters: format: 1 channels: 1 chan
     input_parameters.Reset(in_params.format(),
                            in_params.channel_layout(),
                            in_params.channels(),
                            in_params.sample_rate(),
                            in_params.bits_per_sample(),
                            frames_per_buffer);
+    DVLOG(1) << "SRI::input_parameters: "
+             << input_parameters.AsHumanReadableString();
   }

   // Create an audio converter which converts data between native input format
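The new buffer-size formula rounds sample_rate * chunk_duration_ms / 1000 to the nearest integer via the add-0.5-then-truncate idiom. A small standalone check of that arithmetic; the sample rates below are illustrative, since the real value is whatever in_params.sample_rate() reports for the capture device.

// Standalone check of the frames_per_buffer arithmetic in the hunk above.
#include <iostream>

int main() {
  const int chunk_duration_ms = 100;  // One WebSpeech audio chunk.
  for (int sample_rate : {16000, 44100, 48000}) {
    // Double arithmetic plus 0.5, truncated by the int conversion,
    // rounds to the nearest whole frame count.
    int frames_per_buffer =
        ((sample_rate * chunk_duration_ms) / 1000.0) + 0.5;
    std::cout << sample_rate << " Hz -> " << frames_per_buffer
              << " frames per 100 ms buffer\n";
  }
}

At 44.1 kHz this yields 4410 frames per buffer, whereas the removed "+ 2 ms" variant would have yielded 4498; the padding is no longer needed now that the converter is primed with silence and the rare extra Convert() call is handled in OnData().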