Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(16)

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 1211203006: Fixes issue where Web Speech API drops a frame every 5.1 seconds (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | media/audio/audio_input_controller.cc » ('j') | media/base/audio_converter.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/browser/speech/speech_recognizer_impl.h" 5 #include "content/browser/speech/speech_recognizer_impl.h"
6 6
7 #include "base/basictypes.h" 7 #include "base/basictypes.h"
8 #include "base/bind.h" 8 #include "base/bind.h"
9 #include "base/time/time.h" 9 #include "base/time/time.h"
10 #include "content/browser/browser_main_loop.h" 10 #include "content/browser/browser_main_loop.h"
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
48 48
49 // Handles resampling, buffering, and channel mixing between input and output 49 // Handles resampling, buffering, and channel mixing between input and output
50 // parameters. 50 // parameters.
51 AudioConverter audio_converter_; 51 AudioConverter audio_converter_;
52 52
53 scoped_ptr<AudioBus> input_bus_; 53 scoped_ptr<AudioBus> input_bus_;
54 scoped_ptr<AudioBus> output_bus_; 54 scoped_ptr<AudioBus> output_bus_;
55 const AudioParameters input_parameters_; 55 const AudioParameters input_parameters_;
56 const AudioParameters output_parameters_; 56 const AudioParameters output_parameters_;
57 bool waiting_for_input_; 57 bool waiting_for_input_;
58 int convert_count_;
58 59
59 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); 60 DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
60 }; 61 };
61 62
62 namespace { 63 namespace {
63 64
64 // The following constants are related to the volume level indicator shown in 65 // The following constants are related to the volume level indicator shown in
65 // the UI for recorded audio. 66 // the UI for recorded audio.
66 // Multiplier used when new volume is greater than previous level. 67 // Multiplier used when new volume is greater than previous level.
67 const float kUpSmoothingFactor = 1.0f; 68 const float kUpSmoothingFactor = 1.0f;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
112 // SpeechRecognizerImpl::OnDataConverter implementation 113 // SpeechRecognizerImpl::OnDataConverter implementation
113 114
114 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( 115 SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
115 const AudioParameters& input_params, 116 const AudioParameters& input_params,
116 const AudioParameters& output_params) 117 const AudioParameters& output_params)
117 : audio_converter_(input_params, output_params, false), 118 : audio_converter_(input_params, output_params, false),
118 input_bus_(AudioBus::Create(input_params)), 119 input_bus_(AudioBus::Create(input_params)),
119 output_bus_(AudioBus::Create(output_params)), 120 output_bus_(AudioBus::Create(output_params)),
120 input_parameters_(input_params), 121 input_parameters_(input_params),
121 output_parameters_(output_params), 122 output_parameters_(output_params),
122 waiting_for_input_(false) { 123 waiting_for_input_(false),
124 convert_count_(0) {
123 audio_converter_.AddInput(this); 125 audio_converter_.AddInput(this);
126 DVLOG(1) << "SRI::AudioConverter::ChunkSize " << audio_converter_.ChunkSize();
127
128 // Initial priming with zeros...
129 // waiting_for_input_ = true;
130 // input_bus_->Zero();
131 // audio_converter_.Convert(output_bus_.get());
132
133 audio_converter_.PrimeWithSilence();
134 DVLOG(1) << "SRI::AudioConverter::ChunkSize " << audio_converter_.ChunkSize();
124 } 135 }
125 136
126 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { 137 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
127 // It should now be safe to unregister the converter since no more OnData() 138 // It should now be safe to unregister the converter since no more OnData()
128 // callbacks are outstanding at this point. 139 // callbacks are outstanding at this point.
129 audio_converter_.RemoveInput(this); 140 audio_converter_.RemoveInput(this);
130 } 141 }
131 142
132 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( 143 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
133 const AudioBus* data) { 144 const AudioBus* data) {
134 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); 145 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());
135 146 DVLOG(1) << "SRI::ODC::Convert...";
147 // Data should always have been provided by ProvideInput(). If not, the
148 // previous call to Convert() could produce converted data using cached
149 // data. But that means that we will miss one large frame of recorded audio
150 // samples.
151 CHECK(!waiting_for_input_);
136 data->CopyTo(input_bus_.get()); 152 data->CopyTo(input_bus_.get());
137 153
138 waiting_for_input_ = true; 154 waiting_for_input_ = true;
155 convert_count_++;
139 audio_converter_.Convert(output_bus_.get()); 156 audio_converter_.Convert(output_bus_.get());
DaleCurtis 2015/07/07 16:38:17 As mentioned in the email thread, you should proba
henrika (OOO until Aug 14) 2015/07/07 19:30:05 Smart. But note that I have modified the input siz
DaleCurtis 2015/07/07 20:40:44 Well, I'm not confident your modification always e
henrika (OOO until Aug 14) 2015/07/07 21:14:08 Got it. I will test your scheme using 102 where I
DaleCurtis 2015/07/07 21:44:15 I think it's fine to use 100ms like you do w/ prim
henrika (OOO until Aug 14) 2015/07/08 12:22:59 Done. Hope you are OK with how I handle the extra
157 DVLOG(1) << "SRI::ODC::Convert done (" << convert_count_ << ")";
tommi (sloooow) - chröme 2015/07/07 12:40:21 seems like this will always log a value higher tha
henrika (OOO until Aug 14) 2015/07/07 12:56:54 Got it. Plan is to remove this counter. Just wante
140 158
141 scoped_refptr<AudioChunk> chunk( 159 scoped_refptr<AudioChunk> chunk(
142 new AudioChunk(output_parameters_.GetBytesPerBuffer(), 160 new AudioChunk(output_parameters_.GetBytesPerBuffer(),
143 output_parameters_.bits_per_sample() / 8)); 161 output_parameters_.bits_per_sample() / 8));
144 output_bus_->ToInterleaved(output_bus_->frames(), 162 output_bus_->ToInterleaved(output_bus_->frames(),
145 output_parameters_.bits_per_sample() / 8, 163 output_parameters_.bits_per_sample() / 8,
146 chunk->writable_data()); 164 chunk->writable_data());
147 return chunk; 165 return chunk;
148 } 166 }
149 167
150 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( 168 double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
151 AudioBus* dest, base::TimeDelta buffer_delay) { 169 AudioBus* dest, base::TimeDelta buffer_delay) {
170 DVLOG(1) << "SRI::ODC::ProvideInput";
152 // The audio converter should never ask for more than one bus in each call 171 // The audio converter should never ask for more than one bus in each call
153 // to Convert(). If so, we have a serious issue in our design since we might 172 // to Convert(). If so, we have a serious issue in our design since we might
154 // miss recorded chunks of 100 ms audio data. 173 // miss recorded chunks of 100 ms audio data.
155 CHECK(waiting_for_input_); 174 CHECK(waiting_for_input_);
156 175
157 // Read from the input bus to feed the converter. 176 // Read from the input bus to feed the converter.
158 input_bus_->CopyTo(dest); 177 input_bus_->CopyTo(dest);
159 178
160 // |input_bus_| should only be provided once. 179 // |input_bus_| should only be provided once.
161 waiting_for_input_ = false; 180 waiting_for_input_ = false;
(...skipping 354 matching lines...) Expand 10 before | Expand all | Expand 10 after
516 return Abort( 535 return Abort(
517 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); 536 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE));
518 } 537 }
519 538
520 // Audio converter shall provide audio based on these parameters as output. 539 // Audio converter shall provide audio based on these parameters as output.
521 // Hard coded, WebSpeech specific parameters are utilized here. 540 // Hard coded, WebSpeech specific parameters are utilized here.
522 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; 541 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
523 AudioParameters output_parameters = AudioParameters( 542 AudioParameters output_parameters = AudioParameters(
524 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, 543 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
525 kNumBitsPerAudioSample, frames_per_buffer); 544 kNumBitsPerAudioSample, frames_per_buffer);
545 DVLOG(1) << "SRI::output_parameters: "
546 << output_parameters.AsHumanReadableString();
526 547
527 // Audio converter will receive audio based on these parameters as input. 548 // Audio converter will receive audio based on these parameters as input.
528 // On Windows we start by verifying that Core Audio is supported. If not, 549 // On Windows we start by verifying that Core Audio is supported. If not,
529 // the WaveIn API is used and we might as well avoid all audio conversions 550 // the WaveIn API is used and we might as well avoid all audio conversions
530 // since WaveIn does the conversion for us. 551 // since WaveIn does the conversion for us.
531 // TODO(henrika): this code should be moved to platform dependent audio 552 // TODO(henrika): this code should be moved to platform dependent audio
532 // managers. 553 // managers.
533 bool use_native_audio_params = true; 554 bool use_native_audio_params = true;
534 #if defined(OS_WIN) 555 #if defined(OS_WIN)
DaleCurtis 2015/07/07 20:40:43 I'm surprised we're not using the native params fo
henrika (OOO until Aug 14) 2015/07/07 21:14:08 Hmm, but we are on all but Win XP. All others use
DaleCurtis 2015/07/07 21:44:15 Ah no, I misread and inverted the check.
535 use_native_audio_params = media::CoreAudioUtil::IsSupported(); 556 use_native_audio_params = media::CoreAudioUtil::IsSupported();
536 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; 557 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
537 #endif 558 #endif
538 559
539 AudioParameters input_parameters = output_parameters; 560 AudioParameters input_parameters = output_parameters;
540 if (use_native_audio_params && !unit_test_is_active) { 561 if (use_native_audio_params && !unit_test_is_active) {
541 // Use native audio parameters but avoid opening up at the native buffer 562 // Use native audio parameters but avoid opening up at the native buffer
542 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. 563 // size. Instead use same frame size (in milliseconds) as WebSpeech uses.
543 // We rely on internal buffers in the audio back-end to fulfill this request 564 // We rely on internal buffers in the audio back-end to fulfill this request
544 // and the idea is to simplify the audio conversion since each Convert() 565 // and the idea is to simplify the audio conversion since each Convert()
545 // call will then render exactly one ProvideInput() call. 566 // call will then render exactly one ProvideInput() call.
546 // Due to implementation details in the audio converter, 2 milliseconds 567 // Due to implementation details in the audio converter, 2 milliseconds
547 // are added to the default frame size (100 ms) to ensure there is enough 568 // are added to the default frame size (100 ms) to ensure there is enough
548 // data to generate 100 ms of output when resampling. 569 // data to generate 100 ms of output when resampling.
570 // frames_per_buffer =
571 // ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
549 frames_per_buffer = 572 frames_per_buffer =
550 ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; 573 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5;
551 input_parameters.Reset(in_params.format(), 574 input_parameters.Reset(in_params.format(),
552 in_params.channel_layout(), 575 in_params.channel_layout(),
553 in_params.channels(), 576 in_params.channels(),
554 in_params.sample_rate(), 577 in_params.sample_rate(),
555 in_params.bits_per_sample(), 578 in_params.bits_per_sample(),
556 frames_per_buffer); 579 frames_per_buffer);
580 DVLOG(1) << "SRI::input_parameters: "
581 << input_parameters.AsHumanReadableString();
557 } 582 }
558 583
559 // Create an audio converter which converts data between native input format 584 // Create an audio converter which converts data between native input format
560 // and WebSpeech specific output format. 585 // and WebSpeech specific output format.
561 audio_converter_.reset( 586 audio_converter_.reset(
562 new OnDataConverter(input_parameters, output_parameters)); 587 new OnDataConverter(input_parameters, output_parameters));
563 588
564 audio_controller_ = AudioInputController::Create( 589 audio_controller_ = AudioInputController::Create(
565 audio_manager, this, input_parameters, device_id_, NULL); 590 audio_manager, this, input_parameters, device_id_, NULL);
566 591
(...skipping 245 matching lines...) Expand 10 before | Expand all | Expand 10 after
812 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) 837 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
813 : event(event_value), 838 : event(event_value),
814 audio_data(NULL), 839 audio_data(NULL),
815 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { 840 engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
816 } 841 }
817 842
818 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { 843 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
819 } 844 }
820 845
821 } // namespace content 846 } // namespace content
OLDNEW
« no previous file with comments | « no previous file | media/audio/audio_input_controller.cc » ('j') | media/base/audio_converter.cc » ('J')

Powered by Google App Engine
This is Rietveld 408576698