content/browser/speech/speech_recognizer_impl.cc - Issue 1211203006: Fixes issue where Web Speech API drops a frame every 5.1 seconds

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 1211203006: Fixes issue where Web Speech API drops a frame every 5.1 seconds (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognizer_impl.h"	5 #include "content/browser/speech/speech_recognizer_impl.h"

6	6

7 #include "base/basictypes.h"	7 #include "base/basictypes.h"

8 #include "base/bind.h"	8 #include "base/bind.h"

9 #include "base/time/time.h"	9 #include "base/time/time.h"

10 #include "content/browser/browser_main_loop.h"	10 #include "content/browser/browser_main_loop.h"

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
48	48

49 // Handles resampling, buffering, and channel mixing between input and output	49 // Handles resampling, buffering, and channel mixing between input and output

50 // parameters.	50 // parameters.

51 AudioConverter audio_converter_;	51 AudioConverter audio_converter_;

52	52

53 scoped_ptr<AudioBus> input_bus_;	53 scoped_ptr<AudioBus> input_bus_;

54 scoped_ptr<AudioBus> output_bus_;	54 scoped_ptr<AudioBus> output_bus_;

55 const AudioParameters input_parameters_;	55 const AudioParameters input_parameters_;

56 const AudioParameters output_parameters_;	56 const AudioParameters output_parameters_;

57 bool waiting_for_input_;	57 bool waiting_for_input_;

	58 int convert_count_;

58	59

59 DISALLOW_COPY_AND_ASSIGN(OnDataConverter);	60 DISALLOW_COPY_AND_ASSIGN(OnDataConverter);

60 };	61 };

61	62

62 namespace {	63 namespace {

63	64

64 // The following constants are related to the volume level indicator shown in	65 // The following constants are related to the volume level indicator shown in

65 // the UI for recorded audio.	66 // the UI for recorded audio.

66 // Multiplier used when new volume is greater than previous level.	67 // Multiplier used when new volume is greater than previous level.

67 const float kUpSmoothingFactor = 1.0f;	68 const float kUpSmoothingFactor = 1.0f;

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
112 // SpeechRecognizerImpl::OnDataConverter implementation	113 // SpeechRecognizerImpl::OnDataConverter implementation

113	114

114 SpeechRecognizerImpl::OnDataConverter::OnDataConverter(	115 SpeechRecognizerImpl::OnDataConverter::OnDataConverter(

115 const AudioParameters& input_params,	116 const AudioParameters& input_params,

116 const AudioParameters& output_params)	117 const AudioParameters& output_params)

117 : audio_converter_(input_params, output_params, false),	118 : audio_converter_(input_params, output_params, false),

118 input_bus_(AudioBus::Create(input_params)),	119 input_bus_(AudioBus::Create(input_params)),

119 output_bus_(AudioBus::Create(output_params)),	120 output_bus_(AudioBus::Create(output_params)),

120 input_parameters_(input_params),	121 input_parameters_(input_params),

121 output_parameters_(output_params),	122 output_parameters_(output_params),

122 waiting_for_input_(false) {	123 waiting_for_input_(false),

	124 convert_count_(0) {

123 audio_converter_.AddInput(this);	125 audio_converter_.AddInput(this);

	126 DVLOG(1) << "SRI::AudioConverter::ChunkSize " << audio_converter_.ChunkSize();

	127

	128 // Initial priming with zeros...

	129 // waiting_for_input_ = true;

	130 // input_bus_->Zero();

	131 // audio_converter_.Convert(output_bus_.get());

	132

	133 audio_converter_.PrimeWithSilence();

	134 DVLOG(1) << "SRI::AudioConverter::ChunkSize " << audio_converter_.ChunkSize();

124 }	135 }

125	136

126 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {	137 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {

127 // It should now be safe to unregister the converter since no more OnData()	138 // It should now be safe to unregister the converter since no more OnData()

128 // callbacks are outstanding at this point.	139 // callbacks are outstanding at this point.

129 audio_converter_.RemoveInput(this);	140 audio_converter_.RemoveInput(this);

130 }	141 }

131	142

132 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(	143 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(

133 const AudioBus* data) {	144 const AudioBus* data) {

134 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());	145 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());

135	146 DVLOG(1) << "SRI::ODC::Convert...";

	147 // Data should always have been provided by ProvideInput(). If not, the

	148 // previous call to Convert() could produce converted data using cached

	149 // data. But that means that we will miss one large frame of recorded audio

	150 // samples.

	151 CHECK(!waiting_for_input_);

136 data->CopyTo(input_bus_.get());	152 data->CopyTo(input_bus_.get());

137	153

138 waiting_for_input_ = true;	154 waiting_for_input_ = true;

	155 convert_count_++;

139 audio_converter_.Convert(output_bus_.get());	156 audio_converter_.Convert(output_bus_.get());
	DaleCurtis 2015/07/07 16:38:17: As mentioned in the email thread, you should probably just have a "bool provide_input_called_" and if you have data and it isn't, then call convert again and generate another AudioChunk.
	henrika (OOO until Aug 14) 2015/07/07 19:30:05: Smart. But note that I have modified the input size to 100ms (used to be 102) in this patch as well. And it will result in that ProvideInput is always called. Do you want me to restore 102 and use "double convert when needed" or keep 100. I don't remember the exact reason why we selected 102 in the first place.
	DaleCurtis 2015/07/07 20:40:44: On 2015/07/07 19:30:05, henrika wrote:
	> Smart. But note that I have modified the input size to 100ms (used to be 102) in this patch as well. And it will result in that ProvideInput is always called. Do you want me to restore 102 and use "double convert when needed" or keep 100. I don't remember the exact reason why we selected 102 in the first place.
	Well, I'm not confident your modification always ensures a ProvideInput() call; due to rounding over time you may end up without a ProvideInput() call, possibly speech input is short enough that this isn't a problem though. I'm not sure it's possible to make a stricter guarantee than "at most 1 call."
	henrika (OOO until Aug 14) 2015/07/07 21:14:08: Got it. I will test your scheme using 102 where I know it will be triggered once every 5th seconds. Then go back to 100 but keep the scheme just in case.
	DaleCurtis 2015/07/07 21:44:15: On 2015/07/07 21:14:08, henrika wrote:
	> Got it. I will test your scheme using 102 where I know it will be triggered once every 5th seconds. Then go back to 100 but keep the scheme just in case.
	I think it's fine to use 100ms like you do w/ prime with silence, you just need to also handle the extra-convert case too.
	henrika (OOO until Aug 14) 2015/07/08 12:22:59: Done. Hope you are OK with how I handle the extra convert case. I wanted to make it clear that it should be a rare event.
	157 DVLOG(1) << "SRI::ODC::Convert done (" << convert_count_ << ")";
	tommi (sloooow) - chröme 2015/07/07 12:40:21: seems like this will always log a value higher than 0 (up to max int of course) and convert_count_ is otherwise not used (or maybe I'm missing something). It seems like that if we really need this variable, we should get rid of waiting_for_input_.
	henrika (OOO until Aug 14) 2015/07/07 12:56:54: Got it. Plan is to remove this counter. Just wanted to see where (~5.1) we end up in a state where we don't ask for new input data but instead read from the internal storage inside the Converter (that is when we miss 102ms of input data currently).
140	158

141 scoped_refptr<AudioChunk> chunk(	159 scoped_refptr<AudioChunk> chunk(

142 new AudioChunk(output_parameters_.GetBytesPerBuffer(),	160 new AudioChunk(output_parameters_.GetBytesPerBuffer(),

143 output_parameters_.bits_per_sample() / 8));	161 output_parameters_.bits_per_sample() / 8));

144 output_bus_->ToInterleaved(output_bus_->frames(),	162 output_bus_->ToInterleaved(output_bus_->frames(),

145 output_parameters_.bits_per_sample() / 8,	163 output_parameters_.bits_per_sample() / 8,

146 chunk->writable_data());	164 chunk->writable_data());

147 return chunk;	165 return chunk;

148 }	166 }

149	167

150 double SpeechRecognizerImpl::OnDataConverter::ProvideInput(	168 double SpeechRecognizerImpl::OnDataConverter::ProvideInput(

151 AudioBus* dest, base::TimeDelta buffer_delay) {	169 AudioBus* dest, base::TimeDelta buffer_delay) {

	170 DVLOG(1) << "SRI::ODC::ProvideInput";

152 // The audio converted should never ask for more than one bus in each call	171 // The audio converted should never ask for more than one bus in each call

153 // to Convert(). If so, we have a serious issue in our design since we might	172 // to Convert(). If so, we have a serious issue in our design since we might

154 // miss recorded chunks of 100 ms audio data.	173 // miss recorded chunks of 100 ms audio data.

155 CHECK(waiting_for_input_);	174 CHECK(waiting_for_input_);

156	175

157 // Read from the input bus to feed the converter.	176 // Read from the input bus to feed the converter.

158 input_bus_->CopyTo(dest);	177 input_bus_->CopyTo(dest);

159	178

160 // \|input_bus_\| should only be provide once.	179 // \|input_bus_\| should only be provide once.

161 waiting_for_input_ = false;	180 waiting_for_input_ = false;

(...skipping 354 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
516 return Abort(	535 return Abort(

517 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE));	536 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE));

518 }	537 }

519	538

520 // Audio converter shall provide audio based on these parameters as output.	539 // Audio converter shall provide audio based on these parameters as output.

521 // Hard coded, WebSpeech specific parameters are utilized here.	540 // Hard coded, WebSpeech specific parameters are utilized here.

522 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;	541 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;

523 AudioParameters output_parameters = AudioParameters(	542 AudioParameters output_parameters = AudioParameters(

524 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,	543 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,

525 kNumBitsPerAudioSample, frames_per_buffer);	544 kNumBitsPerAudioSample, frames_per_buffer);

	545 DVLOG(1) << "SRI::output_parameters: "

	546 << output_parameters.AsHumanReadableString();

526	547

527 // Audio converter will receive audio based on these parameters as input.	548 // Audio converter will receive audio based on these parameters as input.

528 // On Windows we start by verifying that Core Audio is supported. If not,	549 // On Windows we start by verifying that Core Audio is supported. If not,

529 // the WaveIn API is used and we might as well avoid all audio conversations	550 // the WaveIn API is used and we might as well avoid all audio conversations

530 // since WaveIn does the conversion for us.	551 // since WaveIn does the conversion for us.

531 // TODO(henrika): this code should be moved to platform dependent audio	552 // TODO(henrika): this code should be moved to platform dependent audio

532 // managers.	553 // managers.

533 bool use_native_audio_params = true;	554 bool use_native_audio_params = true;

534 #if defined(OS_WIN)	555 #if defined(OS_WIN)
	DaleCurtis 2015/07/07 20:40:43: I'm surprised we're not using the native params for all platforms, but I guess if it works it works.
	henrika (OOO until Aug 14) 2015/07/07 21:14:08: Hmm, but we are on all but Win XP. All others use native. Clear, or am I missing something?
	DaleCurtis 2015/07/07 21:44:15: On 2015/07/07 21:14:08, henrika wrote:
	> Hmm, but we are on all but Win XP. All others use native. Clear, or am I missing something?
	Ah no, I misread and inverted the check.
535 use_native_audio_params = media::CoreAudioUtil::IsSupported();	556 use_native_audio_params = media::CoreAudioUtil::IsSupported();

536 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";	557 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";

537 #endif	558 #endif

538	559

539 AudioParameters input_parameters = output_parameters;	560 AudioParameters input_parameters = output_parameters;

540 if (use_native_audio_params && !unit_test_is_active) {	561 if (use_native_audio_params && !unit_test_is_active) {

541 // Use native audio parameters but avoid opening up at the native buffer	562 // Use native audio parameters but avoid opening up at the native buffer

542 // size. Instead use same frame size (in milliseconds) as WebSpeech uses.	563 // size. Instead use same frame size (in milliseconds) as WebSpeech uses.

543 // We rely on internal buffers in the audio back-end to fulfill this request	564 // We rely on internal buffers in the audio back-end to fulfill this request

544 // and the idea is to simplify the audio conversion since each Convert()	565 // and the idea is to simplify the audio conversion since each Convert()

545 // call will then render exactly one ProvideInput() call.	566 // call will then render exactly one ProvideInput() call.

546 // Due to implementation details in the audio converter, 2 milliseconds	567 // Due to implementation details in the audio converter, 2 milliseconds

547 // are added to the default frame size (100 ms) to ensure there is enough	568 // are added to the default frame size (100 ms) to ensure there is enough

548 // data to generate 100 ms of output when resampling.	569 // data to generate 100 ms of output when resampling.

	570 // frames_per_buffer =

	571 // ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;

549 frames_per_buffer =	572 frames_per_buffer =

550 ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;	573 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5;

551 input_parameters.Reset(in_params.format(),	574 input_parameters.Reset(in_params.format(),

552 in_params.channel_layout(),	575 in_params.channel_layout(),

553 in_params.channels(),	576 in_params.channels(),

554 in_params.sample_rate(),	577 in_params.sample_rate(),

555 in_params.bits_per_sample(),	578 in_params.bits_per_sample(),

556 frames_per_buffer);	579 frames_per_buffer);

	580 DVLOG(1) << "SRI::input_parameters: "

	581 << input_parameters.AsHumanReadableString();

557 }	582 }

558	583

559 // Create an audio converter which converts data between native input format	584 // Create an audio converter which converts data between native input format

560 // and WebSpeech specific output format.	585 // and WebSpeech specific output format.

561 audio_converter_.reset(	586 audio_converter_.reset(

562 new OnDataConverter(input_parameters, output_parameters));	587 new OnDataConverter(input_parameters, output_parameters));

563	588

564 audio_controller_ = AudioInputController::Create(	589 audio_controller_ = AudioInputController::Create(

565 audio_manager, this, input_parameters, device_id_, NULL);	590 audio_manager, this, input_parameters, device_id_, NULL);

566	591

(...skipping 245 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
812 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)	837 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)

813 : event(event_value),	838 : event(event_value),

814 audio_data(NULL),	839 audio_data(NULL),

815 engine_error(SPEECH_RECOGNITION_ERROR_NONE) {	840 engine_error(SPEECH_RECOGNITION_ERROR_NONE) {

816 }	841 }

817	842

818 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {	843 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {

819 }	844 }

820	845

821 } // namespace content	846 } // namespace content

OLD	NEW

« no previous file with comments | « no previous file | media/audio/audio_input_controller.cc » ('j') | media/base/audio_converter.cc » ('J')