OLD | NEW |
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
6 | 6 |
7 #include "base/basictypes.h" | 7 #include "base/basictypes.h" |
8 #include "base/bind.h" | 8 #include "base/bind.h" |
9 #include "base/time/time.h" | 9 #include "base/time/time.h" |
10 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
(...skipping 24 matching lines...) Expand all Loading... |
35 public: | 35 public: |
36 OnDataConverter(const AudioParameters& input_params, | 36 OnDataConverter(const AudioParameters& input_params, |
37 const AudioParameters& output_params); | 37 const AudioParameters& output_params); |
38 ~OnDataConverter() override; | 38 ~OnDataConverter() override; |
39 | 39 |
40 // Converts input audio |data| bus into an AudioChunk where the input format | 40 // Converts input audio |data| bus into an AudioChunk where the input format |
41 // is given by |input_parameters_| and the output format by | 41 // is given by |input_parameters_| and the output format by |
42 // |output_parameters_|. | 42 // |output_parameters_|. |
43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); | 43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); |
44 | 44 |
| 45 bool data_was_converted() const { return data_was_converted_; } |
| 46 |
45 private: | 47 private: |
46 // media::AudioConverter::InputCallback implementation. | 48 // media::AudioConverter::InputCallback implementation. |
47 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; | 49 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; |
48 | 50 |
49 // Handles resampling, buffering, and channel mixing between input and output | 51 // Handles resampling, buffering, and channel mixing between input and output |
50 // parameters. | 52 // parameters. |
51 AudioConverter audio_converter_; | 53 AudioConverter audio_converter_; |
52 | 54 |
53 scoped_ptr<AudioBus> input_bus_; | 55 scoped_ptr<AudioBus> input_bus_; |
54 scoped_ptr<AudioBus> output_bus_; | 56 scoped_ptr<AudioBus> output_bus_; |
55 const AudioParameters input_parameters_; | 57 const AudioParameters input_parameters_; |
56 const AudioParameters output_parameters_; | 58 const AudioParameters output_parameters_; |
57 bool waiting_for_input_; | 59 bool data_was_converted_; |
58 | 60 |
59 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); | 61 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); |
60 }; | 62 }; |
61 | 63 |
62 namespace { | 64 namespace { |
63 | 65 |
64 // The following constants are related to the volume level indicator shown in | 66 // The following constants are related to the volume level indicator shown in |
65 // the UI for recorded audio. | 67 // the UI for recorded audio. |
66 // Multiplier used when new volume is greater than previous level. | 68 // Multiplier used when new volume is greater than previous level. |
67 const float kUpSmoothingFactor = 1.0f; | 69 const float kUpSmoothingFactor = 1.0f; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
112 // SpeechRecognizerImpl::OnDataConverter implementation | 114 // SpeechRecognizerImpl::OnDataConverter implementation |
113 | 115 |
114 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( | 116 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
115 const AudioParameters& input_params, | 117 const AudioParameters& input_params, |
116 const AudioParameters& output_params) | 118 const AudioParameters& output_params) |
117 : audio_converter_(input_params, output_params, false), | 119 : audio_converter_(input_params, output_params, false), |
118 input_bus_(AudioBus::Create(input_params)), | 120 input_bus_(AudioBus::Create(input_params)), |
119 output_bus_(AudioBus::Create(output_params)), | 121 output_bus_(AudioBus::Create(output_params)), |
120 input_parameters_(input_params), | 122 input_parameters_(input_params), |
121 output_parameters_(output_params), | 123 output_parameters_(output_params), |
122 waiting_for_input_(false) { | 124 data_was_converted_(false) { |
123 audio_converter_.AddInput(this); | 125 audio_converter_.AddInput(this); |
| 126 audio_converter_.PrimeWithSilence(); |
124 } | 127 } |
125 | 128 |
126 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { | 129 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { |
127 // It should now be safe to unregister the converter since no more OnData() | 130 // It should now be safe to unregister the converter since no more OnData() |
128 // callbacks are outstanding at this point. | 131 // callbacks are outstanding at this point. |
129 audio_converter_.RemoveInput(this); | 132 audio_converter_.RemoveInput(this); |
130 } | 133 } |
131 | 134 |
132 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( | 135 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( |
133 const AudioBus* data) { | 136 const AudioBus* data) { |
134 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); | 137 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); |
135 | 138 data_was_converted_ = false; |
| 139 // Copy recorded audio to the |input_bus_| for later use in ProvideInput(). |
136 data->CopyTo(input_bus_.get()); | 140 data->CopyTo(input_bus_.get()); |
137 | 141 // Convert the audio and place the result in |output_bus_|. This call will |
138 waiting_for_input_ = true; | 142 // result in a ProvideInput() callback where the actual input is provided. |
| 143 // However, it can happen that the converter contains enough cached data |
| 144 // to return a result without calling ProvideInput(). The caller of this |
| 145 method should check the state of data_was_converted() and make an |
| 146 // additional call if it is set to false at return. |
| 147 // See http://crbug.com/506051 for details. |
139 audio_converter_.Convert(output_bus_.get()); | 148 audio_converter_.Convert(output_bus_.get()); |
140 | 149 // Create an audio chunk based on the converted result. |
141 scoped_refptr<AudioChunk> chunk( | 150 scoped_refptr<AudioChunk> chunk( |
142 new AudioChunk(output_parameters_.GetBytesPerBuffer(), | 151 new AudioChunk(output_parameters_.GetBytesPerBuffer(), |
143 output_parameters_.bits_per_sample() / 8)); | 152 output_parameters_.bits_per_sample() / 8)); |
144 output_bus_->ToInterleaved(output_bus_->frames(), | 153 output_bus_->ToInterleaved(output_bus_->frames(), |
145 output_parameters_.bits_per_sample() / 8, | 154 output_parameters_.bits_per_sample() / 8, |
146 chunk->writable_data()); | 155 chunk->writable_data()); |
147 return chunk; | 156 return chunk; |
148 } | 157 } |
149 | 158 |
150 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( | 159 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( |
151 AudioBus* dest, base::TimeDelta buffer_delay) { | 160 AudioBus* dest, base::TimeDelta buffer_delay) { |
152 // The audio converted should never ask for more than one bus in each call | |
153 // to Convert(). If so, we have a serious issue in our design since we might | |
154 // miss recorded chunks of 100 ms audio data. | |
155 CHECK(waiting_for_input_); | |
156 | |
157 // Read from the input bus to feed the converter. | 161 // Read from the input bus to feed the converter. |
158 input_bus_->CopyTo(dest); | 162 input_bus_->CopyTo(dest); |
159 | 163 // Indicate that the recorded audio has in fact been used by the converter. |
160 // |input_bus_| should only be provide once. | 164 data_was_converted_ = true; |
161 waiting_for_input_ = false; | |
162 return 1; | 165 return 1; |
163 } | 166 } |
164 | 167 |
165 // SpeechRecognizerImpl implementation | 168 // SpeechRecognizerImpl implementation |
166 | 169 |
167 SpeechRecognizerImpl::SpeechRecognizerImpl( | 170 SpeechRecognizerImpl::SpeechRecognizerImpl( |
168 SpeechRecognitionEventListener* listener, | 171 SpeechRecognitionEventListener* listener, |
169 int session_id, | 172 int session_id, |
170 bool continuous, | 173 bool continuous, |
171 bool provisional_results, | 174 bool provisional_results, |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
266 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 269 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
267 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 270 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
268 this, event_args)); | 271 this, event_args)); |
269 } | 272 } |
270 | 273 |
271 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | 274 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
272 const AudioBus* data) { | 275 const AudioBus* data) { |
273 // Convert audio from native format to fixed format used by WebSpeech. | 276 // Convert audio from native format to fixed format used by WebSpeech. |
274 FSMEventArgs event_args(EVENT_AUDIO_DATA); | 277 FSMEventArgs event_args(EVENT_AUDIO_DATA); |
275 event_args.audio_data = audio_converter_->Convert(data); | 278 event_args.audio_data = audio_converter_->Convert(data); |
276 | |
277 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 279 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
278 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 280 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
279 this, event_args)); | 281 this, event_args)); |
| 282 // See http://crbug.com/506051 regarding why one extra convert call can |
| 283 // sometimes be required. It should be a rare case. |
| 284 if (!audio_converter_->data_was_converted()) { |
| 285 event_args.audio_data = audio_converter_->Convert(data); |
| 286 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 287 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 288 this, event_args)); |
| 289 } |
| 290 // Something is seriously wrong here and we are most likely missing some |
| 291 // audio segments. |
| 292 CHECK(audio_converter_->data_was_converted()); |
280 } | 293 } |
281 | 294 |
282 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | 295 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
283 | 296 |
284 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( | 297 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( |
285 const SpeechRecognitionResults& results) { | 298 const SpeechRecognitionResults& results) { |
286 FSMEventArgs event_args(EVENT_ENGINE_RESULT); | 299 FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
287 event_args.engine_results = results; | 300 event_args.engine_results = results; |
288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 301 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 302 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
516 return Abort( | 529 return Abort( |
517 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 530 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
518 } | 531 } |
519 | 532 |
520 // Audio converter shall provide audio based on these parameters as output. | 533 // Audio converter shall provide audio based on these parameters as output. |
521 // Hard coded, WebSpeech specific parameters are utilized here. | 534 // Hard coded, WebSpeech specific parameters are utilized here. |
522 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; | 535 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
523 AudioParameters output_parameters = AudioParameters( | 536 AudioParameters output_parameters = AudioParameters( |
524 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, | 537 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
525 kNumBitsPerAudioSample, frames_per_buffer); | 538 kNumBitsPerAudioSample, frames_per_buffer); |
| 539 DVLOG(1) << "SRI::output_parameters: " |
| 540 << output_parameters.AsHumanReadableString(); |
526 | 541 |
527 // Audio converter will receive audio based on these parameters as input. | 542 // Audio converter will receive audio based on these parameters as input. |
528 // On Windows we start by verifying that Core Audio is supported. If not, | 543 // On Windows we start by verifying that Core Audio is supported. If not, |
529 // the WaveIn API is used and we might as well avoid all audio conversations | 544 // the WaveIn API is used and we might as well avoid all audio conversations |
530 // since WaveIn does the conversion for us. | 545 // since WaveIn does the conversion for us. |
531 // TODO(henrika): this code should be moved to platform dependent audio | 546 // TODO(henrika): this code should be moved to platform dependent audio |
532 // managers. | 547 // managers. |
533 bool use_native_audio_params = true; | 548 bool use_native_audio_params = true; |
534 #if defined(OS_WIN) | 549 #if defined(OS_WIN) |
535 use_native_audio_params = media::CoreAudioUtil::IsSupported(); | 550 use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
536 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; | 551 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
537 #endif | 552 #endif |
538 | 553 |
539 AudioParameters input_parameters = output_parameters; | 554 AudioParameters input_parameters = output_parameters; |
540 if (use_native_audio_params && !unit_test_is_active) { | 555 if (use_native_audio_params && !unit_test_is_active) { |
541 // Use native audio parameters but avoid opening up at the native buffer | 556 // Use native audio parameters but avoid opening up at the native buffer |
542 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. | 557 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
543 // We rely on internal buffers in the audio back-end to fulfill this request | 558 // We rely on internal buffers in the audio back-end to fulfill this request |
544 // and the idea is to simplify the audio conversion since each Convert() | 559 // and the idea is to simplify the audio conversion since each Convert() |
545 // call will then render exactly one ProvideInput() call. | 560 // call will then render exactly one ProvideInput() call. |
546 // Due to implementation details in the audio converter, 2 milliseconds | 561 // in_params.sample_rate() |
547 // are added to the default frame size (100 ms) to ensure there is enough | |
548 // data to generate 100 ms of output when resampling. | |
549 frames_per_buffer = | 562 frames_per_buffer = |
550 ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; | 563 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
551 input_parameters.Reset(in_params.format(), | 564 input_parameters.Reset(in_params.format(), |
552 in_params.channel_layout(), | 565 in_params.channel_layout(), |
553 in_params.channels(), | 566 in_params.channels(), |
554 in_params.sample_rate(), | 567 in_params.sample_rate(), |
555 in_params.bits_per_sample(), | 568 in_params.bits_per_sample(), |
556 frames_per_buffer); | 569 frames_per_buffer); |
| 570 DVLOG(1) << "SRI::input_parameters: " |
| 571 << input_parameters.AsHumanReadableString(); |
557 } | 572 } |
558 | 573 |
559 // Create an audio converter which converts data between native input format | 574 // Create an audio converter which converts data between native input format |
560 // and WebSpeech specific output format. | 575 // and WebSpeech specific output format. |
561 audio_converter_.reset( | 576 audio_converter_.reset( |
562 new OnDataConverter(input_parameters, output_parameters)); | 577 new OnDataConverter(input_parameters, output_parameters)); |
563 | 578 |
564 audio_controller_ = AudioInputController::Create( | 579 audio_controller_ = AudioInputController::Create( |
565 audio_manager, this, input_parameters, device_id_, NULL); | 580 audio_manager, this, input_parameters, device_id_, NULL); |
566 | 581 |
(...skipping 245 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
812 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | 827 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
813 : event(event_value), | 828 : event(event_value), |
814 audio_data(NULL), | 829 audio_data(NULL), |
815 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { | 830 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
816 } | 831 } |
817 | 832 |
818 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | 833 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
819 } | 834 } |
820 | 835 |
821 } // namespace content | 836 } // namespace content |
OLD | NEW |