Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
| 6 | 6 |
| 7 #include "base/basictypes.h" | 7 #include "base/basictypes.h" |
| 8 #include "base/bind.h" | 8 #include "base/bind.h" |
| 9 #include "base/time/time.h" | 9 #include "base/time/time.h" |
| 10 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
| (...skipping 24 matching lines...) Expand all Loading... | |
| 35 public: | 35 public: |
| 36 OnDataConverter(const AudioParameters& input_params, | 36 OnDataConverter(const AudioParameters& input_params, |
| 37 const AudioParameters& output_params); | 37 const AudioParameters& output_params); |
| 38 ~OnDataConverter() override; | 38 ~OnDataConverter() override; |
| 39 | 39 |
| 40 // Converts input audio |data| bus into an AudioChunk where the input format | 40 // Converts input audio |data| bus into an AudioChunk where the input format |
| 41 // is given by |input_parameters_| and the output format by | 41 // is given by |input_parameters_| and the output format by |
| 42 // |output_parameters_|. | 42 // |output_parameters_|. |
| 43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); | 43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); |
| 44 | 44 |
| 45 bool data_was_converted() const { return data_was_converted_; } | |
| 46 | |
| 45 private: | 47 private: |
| 46 // media::AudioConverter::InputCallback implementation. | 48 // media::AudioConverter::InputCallback implementation. |
| 47 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; | 49 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; |
| 48 | 50 |
| 49 // Handles resampling, buffering, and channel mixing between input and output | 51 // Handles resampling, buffering, and channel mixing between input and output |
| 50 // parameters. | 52 // parameters. |
| 51 AudioConverter audio_converter_; | 53 AudioConverter audio_converter_; |
| 52 | 54 |
| 53 scoped_ptr<AudioBus> input_bus_; | 55 scoped_ptr<AudioBus> input_bus_; |
| 54 scoped_ptr<AudioBus> output_bus_; | 56 scoped_ptr<AudioBus> output_bus_; |
| 55 const AudioParameters input_parameters_; | 57 const AudioParameters input_parameters_; |
| 56 const AudioParameters output_parameters_; | 58 const AudioParameters output_parameters_; |
| 57 bool waiting_for_input_; | 59 bool data_was_converted_; |
| 58 | 60 |
| 59 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); | 61 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); |
| 60 }; | 62 }; |
| 61 | 63 |
| 62 namespace { | 64 namespace { |
| 63 | 65 |
| 64 // The following constants are related to the volume level indicator shown in | 66 // The following constants are related to the volume level indicator shown in |
| 65 // the UI for recorded audio. | 67 // the UI for recorded audio. |
| 66 // Multiplier used when new volume is greater than previous level. | 68 // Multiplier used when new volume is greater than previous level. |
| 67 const float kUpSmoothingFactor = 1.0f; | 69 const float kUpSmoothingFactor = 1.0f; |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 112 // SpeechRecognizerImpl::OnDataConverter implementation | 114 // SpeechRecognizerImpl::OnDataConverter implementation |
| 113 | 115 |
| 114 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( | 116 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
| 115 const AudioParameters& input_params, | 117 const AudioParameters& input_params, |
| 116 const AudioParameters& output_params) | 118 const AudioParameters& output_params) |
| 117 : audio_converter_(input_params, output_params, false), | 119 : audio_converter_(input_params, output_params, false), |
| 118 input_bus_(AudioBus::Create(input_params)), | 120 input_bus_(AudioBus::Create(input_params)), |
| 119 output_bus_(AudioBus::Create(output_params)), | 121 output_bus_(AudioBus::Create(output_params)), |
| 120 input_parameters_(input_params), | 122 input_parameters_(input_params), |
| 121 output_parameters_(output_params), | 123 output_parameters_(output_params), |
| 122 waiting_for_input_(false) { | 124 data_was_converted_(false) { |
| 123 audio_converter_.AddInput(this); | 125 audio_converter_.AddInput(this); |
| 126 audio_converter_.PrimeWithSilence(); | |
| 124 } | 127 } |
| 125 | 128 |
| 126 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { | 129 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { |
| 127 // It should now be safe to unregister the converter since no more OnData() | 130 // It should now be safe to unregister the converter since no more OnData() |
| 128 // callbacks are outstanding at this point. | 131 // callbacks are outstanding at this point. |
| 129 audio_converter_.RemoveInput(this); | 132 audio_converter_.RemoveInput(this); |
| 130 } | 133 } |
| 131 | 134 |
| 132 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( | 135 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( |
| 133 const AudioBus* data) { | 136 const AudioBus* data) { |
| 134 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); | 137 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); |
| 135 | 138 data_was_converted_ = false; |
| 139 // Copy recorded audio to the |input_bus_| for later use in ProvideInput(). | |
| 136 data->CopyTo(input_bus_.get()); | 140 data->CopyTo(input_bus_.get()); |
| 137 | 141 // Convert the audio and place the result in |output_bus_|. This call will |
| 138 waiting_for_input_ = true; | 142 // result in a ProvideInput() callback where the actual input is provided. |
| 143 // However, it can happen that the converter contains enough cached data | |
| 144 // to return a result without calling ProvideInput(). The caller of this | |
| 145 // method should check the state of data_was_converted() and make an | |
| 146 // additional call if it is set to false at return. | |
| 147 // See http://crbug.com/506051 for details. | |
| 139 audio_converter_.Convert(output_bus_.get()); | 148 audio_converter_.Convert(output_bus_.get()); |
| 140 | 149 // Create an audio chunk based on the converted result. |
| 141 scoped_refptr<AudioChunk> chunk( | 150 scoped_refptr<AudioChunk> chunk( |
| 142 new AudioChunk(output_parameters_.GetBytesPerBuffer(), | 151 new AudioChunk(output_parameters_.GetBytesPerBuffer(), |
| 143 output_parameters_.bits_per_sample() / 8)); | 152 output_parameters_.bits_per_sample() / 8)); |
| 144 output_bus_->ToInterleaved(output_bus_->frames(), | 153 output_bus_->ToInterleaved(output_bus_->frames(), |
| 145 output_parameters_.bits_per_sample() / 8, | 154 output_parameters_.bits_per_sample() / 8, |
| 146 chunk->writable_data()); | 155 chunk->writable_data()); |
| 147 return chunk; | 156 return chunk; |
| 148 } | 157 } |
| 149 | 158 |
| 150 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( | 159 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( |
| 151 AudioBus* dest, base::TimeDelta buffer_delay) { | 160 AudioBus* dest, base::TimeDelta buffer_delay) { |
| 152 // The audio converter should never ask for more than one bus in each call | |
| 153 // to Convert(). If so, we have a serious issue in our design since we might | |
| 154 // miss recorded chunks of 100 ms audio data. | |
| 155 CHECK(waiting_for_input_); | |
| 156 | |
| 157 // Read from the input bus to feed the converter. | 161 // Read from the input bus to feed the converter. |
| 158 input_bus_->CopyTo(dest); | 162 input_bus_->CopyTo(dest); |
| 159 | 163 // Indicate that the recorded audio has in fact been used by the converter. |
| 160 // |input_bus_| should only be provided once. | 164 data_was_converted_ = true; |
| 161 waiting_for_input_ = false; | |
| 162 return 1; | 165 return 1; |
| 163 } | 166 } |
| 164 | 167 |
| 165 // SpeechRecognizerImpl implementation | 168 // SpeechRecognizerImpl implementation |
| 166 | 169 |
| 167 SpeechRecognizerImpl::SpeechRecognizerImpl( | 170 SpeechRecognizerImpl::SpeechRecognizerImpl( |
| 168 SpeechRecognitionEventListener* listener, | 171 SpeechRecognitionEventListener* listener, |
| 169 int session_id, | 172 int session_id, |
| 170 bool continuous, | 173 bool continuous, |
| 171 bool provisional_results, | 174 bool provisional_results, |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 266 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 269 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 267 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 270 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 268 this, event_args)); | 271 this, event_args)); |
| 269 } | 272 } |
| 270 | 273 |
| 271 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | 274 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
| 272 const AudioBus* data) { | 275 const AudioBus* data) { |
| 273 // Convert audio from native format to fixed format used by WebSpeech. | 276 // Convert audio from native format to fixed format used by WebSpeech. |
| 274 FSMEventArgs event_args(EVENT_AUDIO_DATA); | 277 FSMEventArgs event_args(EVENT_AUDIO_DATA); |
| 275 event_args.audio_data = audio_converter_->Convert(data); | 278 event_args.audio_data = audio_converter_->Convert(data); |
| 276 | |
| 277 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 279 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 278 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 280 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 279 this, event_args)); | 281 this, event_args)); |
| 282 // See http://crbug.com/506051 regarding why one extra convert call can | |
|
DaleCurtis
2015/07/08 18:41:47
Oh, you could just do a while (!data_was_converted
henrika (OOO until Aug 14)
2015/07/09 10:06:09
Great point, tried that. But what then happens is:
| |
| 283 // sometimes be required. It should be a rare case. | |
| 284 if (!audio_converter_->data_was_converted()) { | |
| 285 DCHECK(false); | |
|
DaleCurtis
2015/07/08 18:41:47
Remove dcheck and dlog, I don't think it's unexpec
henrika (OOO until Aug 14)
2015/07/09 10:06:09
Done.
| |
| 286 DLOG(WARNING) << "One extra convert call is required"; | |
| 287 event_args.audio_data = audio_converter_->Convert(data); | |
| 288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
| 289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
| 290 this, event_args)); | |
| 291 } | |
| 292 // Something is seriously wrong here and we are most likely missing some | |
| 293 // audio segments. | |
| 294 CHECK(audio_converter_->data_was_converted()); | |
| 280 } | 295 } |
| 281 | 296 |
| 282 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | 297 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
| 283 | 298 |
| 284 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( | 299 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( |
| 285 const SpeechRecognitionResults& results) { | 300 const SpeechRecognitionResults& results) { |
| 286 FSMEventArgs event_args(EVENT_ENGINE_RESULT); | 301 FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
| 287 event_args.engine_results = results; | 302 event_args.engine_results = results; |
| 288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 303 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 304 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| (...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 516 return Abort( | 531 return Abort( |
| 517 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 532 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
| 518 } | 533 } |
| 519 | 534 |
| 520 // Audio converter shall provide audio based on these parameters as output. | 535 // Audio converter shall provide audio based on these parameters as output. |
| 521 // Hard coded, WebSpeech specific parameters are utilized here. | 536 // Hard coded, WebSpeech specific parameters are utilized here. |
| 522 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; | 537 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
| 523 AudioParameters output_parameters = AudioParameters( | 538 AudioParameters output_parameters = AudioParameters( |
| 524 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, | 539 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
| 525 kNumBitsPerAudioSample, frames_per_buffer); | 540 kNumBitsPerAudioSample, frames_per_buffer); |
| 541 DVLOG(1) << "SRI::output_parameters: " | |
| 542 << output_parameters.AsHumanReadableString(); | |
| 526 | 543 |
| 527 // Audio converter will receive audio based on these parameters as input. | 544 // Audio converter will receive audio based on these parameters as input. |
| 528 // On Windows we start by verifying that Core Audio is supported. If not, | 545 // On Windows we start by verifying that Core Audio is supported. If not, |
| 529 // the WaveIn API is used and we might as well avoid all audio conversations | 546 // the WaveIn API is used and we might as well avoid all audio conversations |
| 530 // since WaveIn does the conversion for us. | 547 // since WaveIn does the conversion for us. |
| 531 // TODO(henrika): this code should be moved to platform dependent audio | 548 // TODO(henrika): this code should be moved to platform dependent audio |
| 532 // managers. | 549 // managers. |
| 533 bool use_native_audio_params = true; | 550 bool use_native_audio_params = true; |
| 534 #if defined(OS_WIN) | 551 #if defined(OS_WIN) |
| 535 use_native_audio_params = media::CoreAudioUtil::IsSupported(); | 552 use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
| 536 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; | 553 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
| 537 #endif | 554 #endif |
| 538 | 555 |
| 539 AudioParameters input_parameters = output_parameters; | 556 AudioParameters input_parameters = output_parameters; |
| 540 if (use_native_audio_params && !unit_test_is_active) { | 557 if (use_native_audio_params && !unit_test_is_active) { |
| 541 // Use native audio parameters but avoid opening up at the native buffer | 558 // Use native audio parameters but avoid opening up at the native buffer |
| 542 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. | 559 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
| 543 // We rely on internal buffers in the audio back-end to fulfill this request | 560 // We rely on internal buffers in the audio back-end to fulfill this request |
| 544 // and the idea is to simplify the audio conversion since each Convert() | 561 // and the idea is to simplify the audio conversion since each Convert() |
| 545 // call will then render exactly one ProvideInput() call. | 562 // call will then render exactly one ProvideInput() call. |
| 546 // Due to implementation details in the audio converter, 2 milliseconds | 563 // in_params.sample_rate() |
| 547 // are added to the default frame size (100 ms) to ensure there is enough | |
| 548 // data to generate 100 ms of output when resampling. | |
| 549 frames_per_buffer = | 564 frames_per_buffer = |
| 550 ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; | 565 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
|
DaleCurtis
2015/07/08 18:39:46
One last thing, can verify that the chunk size for
henrika (OOO until Aug 14)
2015/07/09 10:06:09
SRI::output_parameters: format: 1 channels: 1 chan
| |
| 551 input_parameters.Reset(in_params.format(), | 566 input_parameters.Reset(in_params.format(), |
| 552 in_params.channel_layout(), | 567 in_params.channel_layout(), |
| 553 in_params.channels(), | 568 in_params.channels(), |
| 554 in_params.sample_rate(), | 569 in_params.sample_rate(), |
| 555 in_params.bits_per_sample(), | 570 in_params.bits_per_sample(), |
| 556 frames_per_buffer); | 571 frames_per_buffer); |
| 572 DVLOG(1) << "SRI::input_parameters: " | |
| 573 << input_parameters.AsHumanReadableString(); | |
| 557 } | 574 } |
| 558 | 575 |
| 559 // Create an audio converter which converts data between native input format | 576 // Create an audio converter which converts data between native input format |
| 560 // and WebSpeech specific output format. | 577 // and WebSpeech specific output format. |
| 561 audio_converter_.reset( | 578 audio_converter_.reset( |
| 562 new OnDataConverter(input_parameters, output_parameters)); | 579 new OnDataConverter(input_parameters, output_parameters)); |
| 563 | 580 |
| 564 audio_controller_ = AudioInputController::Create( | 581 audio_controller_ = AudioInputController::Create( |
| 565 audio_manager, this, input_parameters, device_id_, NULL); | 582 audio_manager, this, input_parameters, device_id_, NULL); |
| 566 | 583 |
| (...skipping 245 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 812 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | 829 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 813 : event(event_value), | 830 : event(event_value), |
| 814 audio_data(NULL), | 831 audio_data(NULL), |
| 815 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { | 832 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
| 816 } | 833 } |
| 817 | 834 |
| 818 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | 835 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| 819 } | 836 } |
| 820 | 837 |
| 821 } // namespace content | 838 } // namespace content |
| OLD | NEW |