Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(414)

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 1211203006: Fixes issue where Web Speech API drops a frame every 5.1 seconds (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Feedback from Dale Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | media/base/audio_converter.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/browser/speech/speech_recognizer_impl.h" 5 #include "content/browser/speech/speech_recognizer_impl.h"
6 6
7 #include "base/basictypes.h" 7 #include "base/basictypes.h"
8 #include "base/bind.h" 8 #include "base/bind.h"
9 #include "base/time/time.h" 9 #include "base/time/time.h"
10 #include "content/browser/browser_main_loop.h" 10 #include "content/browser/browser_main_loop.h"
(...skipping 24 matching lines...) Expand all
35 public: 35 public:
36 OnDataConverter(const AudioParameters& input_params, 36 OnDataConverter(const AudioParameters& input_params,
37 const AudioParameters& output_params); 37 const AudioParameters& output_params);
38 ~OnDataConverter() override; 38 ~OnDataConverter() override;
39 39
40 // Converts input audio |data| bus into an AudioChunk where the input format 40 // Converts input audio |data| bus into an AudioChunk where the input format
41 // is given by |input_parameters_| and the output format by 41 // is given by |input_parameters_| and the output format by
42 // |output_parameters_|. 42 // |output_parameters_|.
43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); 43 scoped_refptr<AudioChunk> Convert(const AudioBus* data);
44 44
45 bool data_was_converted() const { return data_was_converted_; }
46
45 private: 47 private:
46 // media::AudioConverter::InputCallback implementation. 48 // media::AudioConverter::InputCallback implementation.
47 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; 49 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override;
48 50
49 // Handles resampling, buffering, and channel mixing between input and output 51 // Handles resampling, buffering, and channel mixing between input and output
50 // parameters. 52 // parameters.
51 AudioConverter audio_converter_; 53 AudioConverter audio_converter_;
52 54
53 scoped_ptr<AudioBus> input_bus_; 55 scoped_ptr<AudioBus> input_bus_;
54 scoped_ptr<AudioBus> output_bus_; 56 scoped_ptr<AudioBus> output_bus_;
55 const AudioParameters input_parameters_; 57 const AudioParameters input_parameters_;
56 const AudioParameters output_parameters_; 58 const AudioParameters output_parameters_;
57 bool waiting_for_input_; 59 bool data_was_converted_;
58 60
59 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); 61 DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
60 }; 62 };
61 63
62 namespace { 64 namespace {
63 65
64 // The following constants are related to the volume level indicator shown in 66 // The following constants are related to the volume level indicator shown in
65 // the UI for recorded audio. 67 // the UI for recorded audio.
66 // Multiplier used when new volume is greater than previous level. 68 // Multiplier used when new volume is greater than previous level.
67 const float kUpSmoothingFactor = 1.0f; 69 const float kUpSmoothingFactor = 1.0f;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
112 // SpeechRecognizerImpl::OnDataConverter implementation 114 // SpeechRecognizerImpl::OnDataConverter implementation
113 115
114 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( 116 SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
115 const AudioParameters& input_params, 117 const AudioParameters& input_params,
116 const AudioParameters& output_params) 118 const AudioParameters& output_params)
117 : audio_converter_(input_params, output_params, false), 119 : audio_converter_(input_params, output_params, false),
118 input_bus_(AudioBus::Create(input_params)), 120 input_bus_(AudioBus::Create(input_params)),
119 output_bus_(AudioBus::Create(output_params)), 121 output_bus_(AudioBus::Create(output_params)),
120 input_parameters_(input_params), 122 input_parameters_(input_params),
121 output_parameters_(output_params), 123 output_parameters_(output_params),
122 waiting_for_input_(false) { 124 data_was_converted_(false) {
123 audio_converter_.AddInput(this); 125 audio_converter_.AddInput(this);
126 audio_converter_.PrimeWithSilence();
124 } 127 }
125 128
126 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { 129 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
127 // It should now be safe to unregister the converter since no more OnData() 130 // It should now be safe to unregister the converter since no more OnData()
128 // callbacks are outstanding at this point. 131 // callbacks are outstanding at this point.
129 audio_converter_.RemoveInput(this); 132 audio_converter_.RemoveInput(this);
130 } 133 }
131 134
132 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( 135 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
133 const AudioBus* data) { 136 const AudioBus* data) {
134 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); 137 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());
135 138 data_was_converted_ = false;
139 // Copy recorded audio to the |input_bus_| for later use in ProvideInput().
136 data->CopyTo(input_bus_.get()); 140 data->CopyTo(input_bus_.get());
137 141 // Convert the audio and place the result in |output_bus_|. This call will
138 waiting_for_input_ = true; 142 // result in a ProvideInput() callback where the actual input is provided.
143 // However, it can happen that the converter contains enough cached data
144 // to return a result without calling ProvideInput(). The caller of this
145 method should check the state of data_was_converted() and make an
146 // additional call if it is set to false at return.
147 // See http://crbug.com/506051 for details.
139 audio_converter_.Convert(output_bus_.get()); 148 audio_converter_.Convert(output_bus_.get());
140 149 // Create an audio chunk based on the converted result.
141 scoped_refptr<AudioChunk> chunk( 150 scoped_refptr<AudioChunk> chunk(
142 new AudioChunk(output_parameters_.GetBytesPerBuffer(), 151 new AudioChunk(output_parameters_.GetBytesPerBuffer(),
143 output_parameters_.bits_per_sample() / 8)); 152 output_parameters_.bits_per_sample() / 8));
144 output_bus_->ToInterleaved(output_bus_->frames(), 153 output_bus_->ToInterleaved(output_bus_->frames(),
145 output_parameters_.bits_per_sample() / 8, 154 output_parameters_.bits_per_sample() / 8,
146 chunk->writable_data()); 155 chunk->writable_data());
147 return chunk; 156 return chunk;
148 } 157 }
149 158
150 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( 159 double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
151 AudioBus* dest, base::TimeDelta buffer_delay) { 160 AudioBus* dest, base::TimeDelta buffer_delay) {
152 // The audio converter should never ask for more than one bus in each call
153 // to Convert(). If so, we have a serious issue in our design since we might
154 // miss recorded chunks of 100 ms audio data.
155 CHECK(waiting_for_input_);
156
157 // Read from the input bus to feed the converter. 161 // Read from the input bus to feed the converter.
158 input_bus_->CopyTo(dest); 162 input_bus_->CopyTo(dest);
159 163 // Indicate that the recorded audio has in fact been used by the converter.
160 // |input_bus_| should only be provided once. 164 data_was_converted_ = true;
161 waiting_for_input_ = false;
162 return 1; 165 return 1;
163 } 166 }
164 167
165 // SpeechRecognizerImpl implementation 168 // SpeechRecognizerImpl implementation
166 169
167 SpeechRecognizerImpl::SpeechRecognizerImpl( 170 SpeechRecognizerImpl::SpeechRecognizerImpl(
168 SpeechRecognitionEventListener* listener, 171 SpeechRecognitionEventListener* listener,
169 int session_id, 172 int session_id,
170 bool continuous, 173 bool continuous,
171 bool provisional_results, 174 bool provisional_results,
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
266 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 269 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
267 base::Bind(&SpeechRecognizerImpl::DispatchEvent, 270 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
268 this, event_args)); 271 this, event_args));
269 } 272 }
270 273
271 void SpeechRecognizerImpl::OnData(AudioInputController* controller, 274 void SpeechRecognizerImpl::OnData(AudioInputController* controller,
272 const AudioBus* data) { 275 const AudioBus* data) {
273 // Convert audio from native format to fixed format used by WebSpeech. 276 // Convert audio from native format to fixed format used by WebSpeech.
274 FSMEventArgs event_args(EVENT_AUDIO_DATA); 277 FSMEventArgs event_args(EVENT_AUDIO_DATA);
275 event_args.audio_data = audio_converter_->Convert(data); 278 event_args.audio_data = audio_converter_->Convert(data);
276
277 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 279 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
278 base::Bind(&SpeechRecognizerImpl::DispatchEvent, 280 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
279 this, event_args)); 281 this, event_args));
282 // See http://crbug.com/506051 regarding why one extra convert call can
DaleCurtis 2015/07/08 18:41:47 Oh, you could just do a while (!data_was_converted
henrika (OOO until Aug 14) 2015/07/09 10:06:09 Great point, tried that. But what then happens is:
283 // sometimes be required. It should be a rare case.
284 if (!audio_converter_->data_was_converted()) {
285 DCHECK(false);
DaleCurtis 2015/07/08 18:41:47 Remove dcheck and dlog, I don't think it's unexpec
henrika (OOO until Aug 14) 2015/07/09 10:06:09 Done.
286 DLOG(WARNING) << "One extra convert call is required";
287 event_args.audio_data = audio_converter_->Convert(data);
288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
289 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
290 this, event_args));
291 }
292 // Something is seriously wrong here and we are most likely missing some
293 // audio segments.
294 CHECK(audio_converter_->data_was_converted());
280 } 295 }
281 296
282 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} 297 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
283 298
284 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( 299 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
285 const SpeechRecognitionResults& results) { 300 const SpeechRecognitionResults& results) {
286 FSMEventArgs event_args(EVENT_ENGINE_RESULT); 301 FSMEventArgs event_args(EVENT_ENGINE_RESULT);
287 event_args.engine_results = results; 302 event_args.engine_results = results;
288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 303 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, 304 base::Bind(&SpeechRecognizerImpl::DispatchEvent,
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after
516 return Abort( 531 return Abort(
517 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); 532 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE));
518 } 533 }
519 534
520 // Audio converter shall provide audio based on these parameters as output. 535 // Audio converter shall provide audio based on these parameters as output.
521 // Hard coded, WebSpeech specific parameters are utilized here. 536 // Hard coded, WebSpeech specific parameters are utilized here.
522 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; 537 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
523 AudioParameters output_parameters = AudioParameters( 538 AudioParameters output_parameters = AudioParameters(
524 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, 539 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
525 kNumBitsPerAudioSample, frames_per_buffer); 540 kNumBitsPerAudioSample, frames_per_buffer);
541 DVLOG(1) << "SRI::output_parameters: "
542 << output_parameters.AsHumanReadableString();
526 543
527 // Audio converter will receive audio based on these parameters as input. 544 // Audio converter will receive audio based on these parameters as input.
528 // On Windows we start by verifying that Core Audio is supported. If not, 545 // On Windows we start by verifying that Core Audio is supported. If not,
529 // the WaveIn API is used and we might as well avoid all audio conversions 546 since WaveIn does the conversion for us.
530 // since WaveIn does the conversion for us. 547 // since WaveIn does the conversion for us.
531 // TODO(henrika): this code should be moved to platform dependent audio 548 // TODO(henrika): this code should be moved to platform dependent audio
532 // managers. 549 // managers.
533 bool use_native_audio_params = true; 550 bool use_native_audio_params = true;
534 #if defined(OS_WIN) 551 #if defined(OS_WIN)
535 use_native_audio_params = media::CoreAudioUtil::IsSupported(); 552 use_native_audio_params = media::CoreAudioUtil::IsSupported();
536 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; 553 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
537 #endif 554 #endif
538 555
539 AudioParameters input_parameters = output_parameters; 556 AudioParameters input_parameters = output_parameters;
540 if (use_native_audio_params && !unit_test_is_active) { 557 if (use_native_audio_params && !unit_test_is_active) {
541 // Use native audio parameters but avoid opening up at the native buffer 558 // Use native audio parameters but avoid opening up at the native buffer
542 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. 559 // size. Instead use same frame size (in milliseconds) as WebSpeech uses.
543 // We rely on internal buffers in the audio back-end to fulfill this request 560 // We rely on internal buffers in the audio back-end to fulfill this request
544 // and the idea is to simplify the audio conversion since each Convert() 561 // and the idea is to simplify the audio conversion since each Convert()
545 // call will then render exactly one ProvideInput() call. 562 // call will then render exactly one ProvideInput() call.
546 // Due to implementation details in the audio converter, 2 milliseconds 563 // in_params.sample_rate()
547 // are added to the default frame size (100 ms) to ensure there is enough
548 // data to generate 100 ms of output when resampling.
549 frames_per_buffer = 564 frames_per_buffer =
550 ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; 565 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5;
DaleCurtis 2015/07/08 18:39:46 One last thing, can verify that the chunk size for
henrika (OOO until Aug 14) 2015/07/09 10:06:09 SRI::output_parameters: format: 1 channels: 1 chan
551 input_parameters.Reset(in_params.format(), 566 input_parameters.Reset(in_params.format(),
552 in_params.channel_layout(), 567 in_params.channel_layout(),
553 in_params.channels(), 568 in_params.channels(),
554 in_params.sample_rate(), 569 in_params.sample_rate(),
555 in_params.bits_per_sample(), 570 in_params.bits_per_sample(),
556 frames_per_buffer); 571 frames_per_buffer);
572 DVLOG(1) << "SRI::input_parameters: "
573 << input_parameters.AsHumanReadableString();
557 } 574 }
558 575
559 // Create an audio converter which converts data between native input format 576 // Create an audio converter which converts data between native input format
560 // and WebSpeech specific output format. 577 // and WebSpeech specific output format.
561 audio_converter_.reset( 578 audio_converter_.reset(
562 new OnDataConverter(input_parameters, output_parameters)); 579 new OnDataConverter(input_parameters, output_parameters));
563 580
564 audio_controller_ = AudioInputController::Create( 581 audio_controller_ = AudioInputController::Create(
565 audio_manager, this, input_parameters, device_id_, NULL); 582 audio_manager, this, input_parameters, device_id_, NULL);
566 583
(...skipping 245 matching lines...) Expand 10 before | Expand all | Expand 10 after
812 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) 829 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
813 : event(event_value), 830 : event(event_value),
814 audio_data(NULL), 831 audio_data(NULL),
815 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { 832 engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
816 } 833 }
817 834
818 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { 835 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
819 } 836 }
820 837
821 } // namespace content 838 } // namespace content
OLDNEW
« no previous file with comments | « no previous file | media/base/audio_converter.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698