OLD | NEW |
---|---|
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
6 | 6 |
7 #include "base/basictypes.h" | 7 #include "base/basictypes.h" |
8 #include "base/bind.h" | 8 #include "base/bind.h" |
9 #include "base/time/time.h" | 9 #include "base/time/time.h" |
10 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
(...skipping 24 matching lines...) Expand all Loading... | |
35 public: | 35 public: |
36 OnDataConverter(const AudioParameters& input_params, | 36 OnDataConverter(const AudioParameters& input_params, |
37 const AudioParameters& output_params); | 37 const AudioParameters& output_params); |
38 ~OnDataConverter() override; | 38 ~OnDataConverter() override; |
39 | 39 |
40 // Converts input audio |data| bus into an AudioChunk where the input format | 40 // Converts input audio |data| bus into an AudioChunk where the input format |
41 // is given by |input_parameters_| and the output format by | 41 // is given by |input_parameters_| and the output format by |
42 // |output_parameters_|. | 42 // |output_parameters_|. |
43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); | 43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); |
44 | 44 |
45 bool data_was_converted() const { return data_was_converted_; } | |
46 | |
45 private: | 47 private: |
46 // media::AudioConverter::InputCallback implementation. | 48 // media::AudioConverter::InputCallback implementation. |
47 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; | 49 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; |
48 | 50 |
49 // Handles resampling, buffering, and channel mixing between input and output | 51 // Handles resampling, buffering, and channel mixing between input and output |
50 // parameters. | 52 // parameters. |
51 AudioConverter audio_converter_; | 53 AudioConverter audio_converter_; |
52 | 54 |
53 scoped_ptr<AudioBus> input_bus_; | 55 scoped_ptr<AudioBus> input_bus_; |
54 scoped_ptr<AudioBus> output_bus_; | 56 scoped_ptr<AudioBus> output_bus_; |
55 const AudioParameters input_parameters_; | 57 const AudioParameters input_parameters_; |
56 const AudioParameters output_parameters_; | 58 const AudioParameters output_parameters_; |
57 bool waiting_for_input_; | 59 bool data_was_converted_; |
58 | 60 |
59 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); | 61 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); |
60 }; | 62 }; |
61 | 63 |
62 namespace { | 64 namespace { |
63 | 65 |
64 // The following constants are related to the volume level indicator shown in | 66 // The following constants are related to the volume level indicator shown in |
65 // the UI for recorded audio. | 67 // the UI for recorded audio. |
66 // Multiplier used when new volume is greater than previous level. | 68 // Multiplier used when new volume is greater than previous level. |
67 const float kUpSmoothingFactor = 1.0f; | 69 const float kUpSmoothingFactor = 1.0f; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
112 // SpeechRecognizerImpl::OnDataConverter implementation | 114 // SpeechRecognizerImpl::OnDataConverter implementation |
113 | 115 |
114 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( | 116 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
115 const AudioParameters& input_params, | 117 const AudioParameters& input_params, |
116 const AudioParameters& output_params) | 118 const AudioParameters& output_params) |
117 : audio_converter_(input_params, output_params, false), | 119 : audio_converter_(input_params, output_params, false), |
118 input_bus_(AudioBus::Create(input_params)), | 120 input_bus_(AudioBus::Create(input_params)), |
119 output_bus_(AudioBus::Create(output_params)), | 121 output_bus_(AudioBus::Create(output_params)), |
120 input_parameters_(input_params), | 122 input_parameters_(input_params), |
121 output_parameters_(output_params), | 123 output_parameters_(output_params), |
122 waiting_for_input_(false) { | 124 data_was_converted_(false) { |
123 audio_converter_.AddInput(this); | 125 audio_converter_.AddInput(this); |
126 audio_converter_.PrimeWithSilence(); | |
124 } | 127 } |
125 | 128 |
126 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { | 129 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { |
127 // It should now be safe to unregister the converter since no more OnData() | 130 // It should now be safe to unregister the converter since no more OnData() |
128 // callbacks are outstanding at this point. | 131 // callbacks are outstanding at this point. |
129 audio_converter_.RemoveInput(this); | 132 audio_converter_.RemoveInput(this); |
130 } | 133 } |
131 | 134 |
132 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( | 135 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( |
133 const AudioBus* data) { | 136 const AudioBus* data) { |
134 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); | 137 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); |
135 | 138 data_was_converted_ = false; |
139 // Copy recorded audio to the |input_bus_| for later use in ProvideInput(). | |
136 data->CopyTo(input_bus_.get()); | 140 data->CopyTo(input_bus_.get()); |
137 | 141 // Convert the audio and place the result in |output_bus_|. This call will |
138 waiting_for_input_ = true; | 142 // result in a ProvideInput() callback where the actual input is provided. |
143 // However, it can happen that the converter contains enough cached data | |
144 // to return a result without calling ProvideInput(). The caller of this | |
145 // method should check the state of data_was_converted() and make an | |
146 // additional call if it is set to false at return. | |
147 // See http://crbug.com/506051 for details. | |
139 audio_converter_.Convert(output_bus_.get()); | 148 audio_converter_.Convert(output_bus_.get()); |
140 | 149 // Create an audio chunk based on the converted result. |
141 scoped_refptr<AudioChunk> chunk( | 150 scoped_refptr<AudioChunk> chunk( |
142 new AudioChunk(output_parameters_.GetBytesPerBuffer(), | 151 new AudioChunk(output_parameters_.GetBytesPerBuffer(), |
143 output_parameters_.bits_per_sample() / 8)); | 152 output_parameters_.bits_per_sample() / 8)); |
144 output_bus_->ToInterleaved(output_bus_->frames(), | 153 output_bus_->ToInterleaved(output_bus_->frames(), |
145 output_parameters_.bits_per_sample() / 8, | 154 output_parameters_.bits_per_sample() / 8, |
146 chunk->writable_data()); | 155 chunk->writable_data()); |
147 return chunk; | 156 return chunk; |
148 } | 157 } |
149 | 158 |
150 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( | 159 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( |
151 AudioBus* dest, base::TimeDelta buffer_delay) { | 160 AudioBus* dest, base::TimeDelta buffer_delay) { |
152 // The audio converter should never ask for more than one bus in each call | |
153 // to Convert(). If so, we have a serious issue in our design since we might | |
154 // miss recorded chunks of 100 ms audio data. | |
155 CHECK(waiting_for_input_); | |
156 | |
157 // Read from the input bus to feed the converter. | 161 // Read from the input bus to feed the converter. |
158 input_bus_->CopyTo(dest); | 162 input_bus_->CopyTo(dest); |
159 | 163 // Indicate that the recorded audio has in fact been used by the converter. |
160 // |input_bus_| should only be provided once. | 164 data_was_converted_ = true; |
161 waiting_for_input_ = false; | |
162 return 1; | 165 return 1; |
163 } | 166 } |
164 | 167 |
165 // SpeechRecognizerImpl implementation | 168 // SpeechRecognizerImpl implementation |
166 | 169 |
167 SpeechRecognizerImpl::SpeechRecognizerImpl( | 170 SpeechRecognizerImpl::SpeechRecognizerImpl( |
168 SpeechRecognitionEventListener* listener, | 171 SpeechRecognitionEventListener* listener, |
169 int session_id, | 172 int session_id, |
170 bool continuous, | 173 bool continuous, |
171 bool provisional_results, | 174 bool provisional_results, |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
266 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 269 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
267 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 270 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
268 this, event_args)); | 271 this, event_args)); |
269 } | 272 } |
270 | 273 |
271 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | 274 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
272 const AudioBus* data) { | 275 const AudioBus* data) { |
273 // Convert audio from native format to fixed format used by WebSpeech. | 276 // Convert audio from native format to fixed format used by WebSpeech. |
274 FSMEventArgs event_args(EVENT_AUDIO_DATA); | 277 FSMEventArgs event_args(EVENT_AUDIO_DATA); |
275 event_args.audio_data = audio_converter_->Convert(data); | 278 event_args.audio_data = audio_converter_->Convert(data); |
276 | |
277 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 279 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
278 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 280 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
279 this, event_args)); | 281 this, event_args)); |
282 // See http://crbug.com/506051 regarding why one extra convert call can | |
DaleCurtis
2015/07/08 18:41:47
Oh, you could just do a while (!data_was_converted
henrika (OOO until Aug 14)
2015/07/09 10:06:09
Great point, tried that. But what then happens is:
| |
283 // sometimes be required. It should be a rare case. | |
284 if (!audio_converter_->data_was_converted()) { | |
285 DCHECK(false); | |
DaleCurtis
2015/07/08 18:41:47
Remove dcheck and dlog, I don't think it's unexpec
henrika (OOO until Aug 14)
2015/07/09 10:06:09
Done.
| |
286 DLOG(WARNING) << "One extra convert call is required"; | |
287 event_args.audio_data = audio_converter_->Convert(data); | |
288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
290 this, event_args)); | |
291 } | |
292 // Something is seriously wrong here and we are most likely missing some | |
293 // audio segments. | |
294 CHECK(audio_converter_->data_was_converted()); | |
280 } | 295 } |
281 | 296 |
282 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | 297 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
283 | 298 |
284 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( | 299 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( |
285 const SpeechRecognitionResults& results) { | 300 const SpeechRecognitionResults& results) { |
286 FSMEventArgs event_args(EVENT_ENGINE_RESULT); | 301 FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
287 event_args.engine_results = results; | 302 event_args.engine_results = results; |
288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 303 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 304 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
516 return Abort( | 531 return Abort( |
517 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 532 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
518 } | 533 } |
519 | 534 |
520 // Audio converter shall provide audio based on these parameters as output. | 535 // Audio converter shall provide audio based on these parameters as output. |
521 // Hard coded, WebSpeech specific parameters are utilized here. | 536 // Hard coded, WebSpeech specific parameters are utilized here. |
522 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; | 537 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
523 AudioParameters output_parameters = AudioParameters( | 538 AudioParameters output_parameters = AudioParameters( |
524 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, | 539 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
525 kNumBitsPerAudioSample, frames_per_buffer); | 540 kNumBitsPerAudioSample, frames_per_buffer); |
541 DVLOG(1) << "SRI::output_parameters: " | |
542 << output_parameters.AsHumanReadableString(); | |
526 | 543 |
527 // Audio converter will receive audio based on these parameters as input. | 544 // Audio converter will receive audio based on these parameters as input. |
528 // On Windows we start by verifying that Core Audio is supported. If not, | 545 // On Windows we start by verifying that Core Audio is supported. If not, |
529 // the WaveIn API is used and we might as well avoid all audio conversions | 546 // the WaveIn API is used and we might as well avoid all audio conversions |
530 // since WaveIn does the conversion for us. | 547 // since WaveIn does the conversion for us. |
531 // TODO(henrika): this code should be moved to platform dependent audio | 548 // TODO(henrika): this code should be moved to platform dependent audio |
532 // managers. | 549 // managers. |
533 bool use_native_audio_params = true; | 550 bool use_native_audio_params = true; |
534 #if defined(OS_WIN) | 551 #if defined(OS_WIN) |
535 use_native_audio_params = media::CoreAudioUtil::IsSupported(); | 552 use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
536 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; | 553 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
537 #endif | 554 #endif |
538 | 555 |
539 AudioParameters input_parameters = output_parameters; | 556 AudioParameters input_parameters = output_parameters; |
540 if (use_native_audio_params && !unit_test_is_active) { | 557 if (use_native_audio_params && !unit_test_is_active) { |
541 // Use native audio parameters but avoid opening up at the native buffer | 558 // Use native audio parameters but avoid opening up at the native buffer |
542 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. | 559 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
543 // We rely on internal buffers in the audio back-end to fulfill this request | 560 // We rely on internal buffers in the audio back-end to fulfill this request |
544 // and the idea is to simplify the audio conversion since each Convert() | 561 // and the idea is to simplify the audio conversion since each Convert() |
545 // call will then render exactly one ProvideInput() call. | 562 // call will then render exactly one ProvideInput() call. |
546 // Due to implementation details in the audio converter, 2 milliseconds | 563 // in_params.sample_rate() |
547 // are added to the default frame size (100 ms) to ensure there is enough | |
548 // data to generate 100 ms of output when resampling. | |
549 frames_per_buffer = | 564 frames_per_buffer = |
550 ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; | 565 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
DaleCurtis
2015/07/08 18:39:46
One last thing, can verify that the chunk size for
henrika (OOO until Aug 14)
2015/07/09 10:06:09
SRI::output_parameters: format: 1 channels: 1 chan
| |
551 input_parameters.Reset(in_params.format(), | 566 input_parameters.Reset(in_params.format(), |
552 in_params.channel_layout(), | 567 in_params.channel_layout(), |
553 in_params.channels(), | 568 in_params.channels(), |
554 in_params.sample_rate(), | 569 in_params.sample_rate(), |
555 in_params.bits_per_sample(), | 570 in_params.bits_per_sample(), |
556 frames_per_buffer); | 571 frames_per_buffer); |
572 DVLOG(1) << "SRI::input_parameters: " | |
573 << input_parameters.AsHumanReadableString(); | |
557 } | 574 } |
558 | 575 |
559 // Create an audio converter which converts data between native input format | 576 // Create an audio converter which converts data between native input format |
560 // and WebSpeech specific output format. | 577 // and WebSpeech specific output format. |
561 audio_converter_.reset( | 578 audio_converter_.reset( |
562 new OnDataConverter(input_parameters, output_parameters)); | 579 new OnDataConverter(input_parameters, output_parameters)); |
563 | 580 |
564 audio_controller_ = AudioInputController::Create( | 581 audio_controller_ = AudioInputController::Create( |
565 audio_manager, this, input_parameters, device_id_, NULL); | 582 audio_manager, this, input_parameters, device_id_, NULL); |
566 | 583 |
(...skipping 245 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
812 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | 829 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
813 : event(event_value), | 830 : event(event_value), |
814 audio_data(NULL), | 831 audio_data(NULL), |
815 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { | 832 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
816 } | 833 } |
817 | 834 |
818 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | 835 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
819 } | 836 } |
820 | 837 |
821 } // namespace content | 838 } // namespace content |
OLD | NEW |