Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/speech_recognizer_impl.h" | 5 #include "content/browser/speech/speech_recognizer_impl.h" |
| 6 | 6 |
| 7 #include "base/basictypes.h" | 7 #include "base/basictypes.h" |
| 8 #include "base/bind.h" | 8 #include "base/bind.h" |
| 9 #include "base/time/time.h" | 9 #include "base/time/time.h" |
| 10 #include "content/browser/browser_main_loop.h" | 10 #include "content/browser/browser_main_loop.h" |
| (...skipping 24 matching lines...) Expand all Loading... | |
| 35 public: | 35 public: |
| 36 OnDataConverter(const AudioParameters& input_params, | 36 OnDataConverter(const AudioParameters& input_params, |
| 37 const AudioParameters& output_params); | 37 const AudioParameters& output_params); |
| 38 ~OnDataConverter() override; | 38 ~OnDataConverter() override; |
| 39 | 39 |
| 40 // Converts input audio |data| bus into an AudioChunk where the input format | 40 // Converts input audio |data| bus into an AudioChunk where the input format |
| 41 // is given by |input_parameters_| and the output format by | 41 // is given by |input_parameters_| and the output format by |
| 42 // |output_parameters_|. | 42 // |output_parameters_|. |
| 43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); | 43 scoped_refptr<AudioChunk> Convert(const AudioBus* data); |
| 44 | 44 |
| 45 bool data_was_converted() const { return data_was_converted_; } | |
| 46 | |
| 45 private: | 47 private: |
| 46 // media::AudioConverter::InputCallback implementation. | 48 // media::AudioConverter::InputCallback implementation. |
| 47 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; | 49 double ProvideInput(AudioBus* dest, base::TimeDelta buffer_delay) override; |
| 48 | 50 |
| 49 // Handles resampling, buffering, and channel mixing between input and output | 51 // Handles resampling, buffering, and channel mixing between input and output |
| 50 // parameters. | 52 // parameters. |
| 51 AudioConverter audio_converter_; | 53 AudioConverter audio_converter_; |
| 52 | 54 |
| 53 scoped_ptr<AudioBus> input_bus_; | 55 scoped_ptr<AudioBus> input_bus_; |
| 54 scoped_ptr<AudioBus> output_bus_; | 56 scoped_ptr<AudioBus> output_bus_; |
| 55 const AudioParameters input_parameters_; | 57 const AudioParameters input_parameters_; |
| 56 const AudioParameters output_parameters_; | 58 const AudioParameters output_parameters_; |
| 57 bool waiting_for_input_; | 59 bool data_was_converted_; |
| 58 | 60 |
| 59 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); | 61 DISALLOW_COPY_AND_ASSIGN(OnDataConverter); |
| 60 }; | 62 }; |
| 61 | 63 |
| 62 namespace { | 64 namespace { |
| 63 | 65 |
| 64 // The following constants are related to the volume level indicator shown in | 66 // The following constants are related to the volume level indicator shown in |
| 65 // the UI for recorded audio. | 67 // the UI for recorded audio. |
| 66 // Multiplier used when new volume is greater than previous level. | 68 // Multiplier used when new volume is greater than previous level. |
| 67 const float kUpSmoothingFactor = 1.0f; | 69 const float kUpSmoothingFactor = 1.0f; |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 112 // SpeechRecognizerImpl::OnDataConverter implementation | 114 // SpeechRecognizerImpl::OnDataConverter implementation |
| 113 | 115 |
| 114 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( | 116 SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
| 115 const AudioParameters& input_params, | 117 const AudioParameters& input_params, |
| 116 const AudioParameters& output_params) | 118 const AudioParameters& output_params) |
| 117 : audio_converter_(input_params, output_params, false), | 119 : audio_converter_(input_params, output_params, false), |
| 118 input_bus_(AudioBus::Create(input_params)), | 120 input_bus_(AudioBus::Create(input_params)), |
| 119 output_bus_(AudioBus::Create(output_params)), | 121 output_bus_(AudioBus::Create(output_params)), |
| 120 input_parameters_(input_params), | 122 input_parameters_(input_params), |
| 121 output_parameters_(output_params), | 123 output_parameters_(output_params), |
| 122 waiting_for_input_(false) { | 124 data_was_converted_(false) { |
| 123 audio_converter_.AddInput(this); | 125 audio_converter_.AddInput(this); |
| 126 audio_converter_.PrimeWithSilence(); | |
| 124 } | 127 } |
| 125 | 128 |
| 126 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { | 129 SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { |
| 127 // It should now be safe to unregister the converter since no more OnData() | 130 // It should now be safe to unregister the converter since no more OnData() |
| 128 // callbacks are outstanding at this point. | 131 // callbacks are outstanding at this point. |
| 129 audio_converter_.RemoveInput(this); | 132 audio_converter_.RemoveInput(this); |
| 130 } | 133 } |
| 131 | 134 |
| 132 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( | 135 scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( |
| 133 const AudioBus* data) { | 136 const AudioBus* data) { |
| 134 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); | 137 CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); |
| 135 | 138 data_was_converted_ = false; |
| 139 // Copy recorded audio to the |input_bus_| for later use in ProvideInput(). | |
| 136 data->CopyTo(input_bus_.get()); | 140 data->CopyTo(input_bus_.get()); |
| 137 | 141 // Convert the audio and place the result in |output_bus_|. This call will |
| 138 waiting_for_input_ = true; | 142 // result in a ProvideInput() callback where the actual input is provided. |
| 143 // However, it can happen that the converter contains enough cached data | |
| 144 // to return a result without calling ProvideInput(). The caller of this | |
| 145 // method should check the state of data_was_converted() and make an | |
| 146 // additional call if it is set to false at return. | |
| 147 // See http://crbug.com/506051 for details. | |
| 139 audio_converter_.Convert(output_bus_.get()); | 148 audio_converter_.Convert(output_bus_.get()); |
| 140 | 149 // Create an audio chunk based on the converted result. |
| 141 scoped_refptr<AudioChunk> chunk( | 150 scoped_refptr<AudioChunk> chunk( |
| 142 new AudioChunk(output_parameters_.GetBytesPerBuffer(), | 151 new AudioChunk(output_parameters_.GetBytesPerBuffer(), |
| 143 output_parameters_.bits_per_sample() / 8)); | 152 output_parameters_.bits_per_sample() / 8)); |
| 144 output_bus_->ToInterleaved(output_bus_->frames(), | 153 output_bus_->ToInterleaved(output_bus_->frames(), |
| 145 output_parameters_.bits_per_sample() / 8, | 154 output_parameters_.bits_per_sample() / 8, |
| 146 chunk->writable_data()); | 155 chunk->writable_data()); |
| 147 return chunk; | 156 return chunk; |
| 148 } | 157 } |
| 149 | 158 |
| 150 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( | 159 double SpeechRecognizerImpl::OnDataConverter::ProvideInput( |
| 151 AudioBus* dest, base::TimeDelta buffer_delay) { | 160 AudioBus* dest, base::TimeDelta buffer_delay) { |
| 152 // The audio converter should never ask for more than one bus in each call | |
| 153 // to Convert(). If so, we have a serious issue in our design since we might | |
| 154 // miss recorded chunks of 100 ms audio data. | |
| 155 CHECK(waiting_for_input_); | |
| 156 | |
| 157 // Read from the input bus to feed the converter. | 161 // Read from the input bus to feed the converter. |
| 158 input_bus_->CopyTo(dest); | 162 input_bus_->CopyTo(dest); |
| 159 | 163 // Indicate that the recorded audio has in fact been used by the converter. |
| 160 // |input_bus_| should only be provided once. | 164 data_was_converted_ = true; |
| 161 waiting_for_input_ = false; | |
| 162 return 1; | 165 return 1; |
| 163 } | 166 } |
| 164 | 167 |
| 165 // SpeechRecognizerImpl implementation | 168 // SpeechRecognizerImpl implementation |
| 166 | 169 |
| 167 SpeechRecognizerImpl::SpeechRecognizerImpl( | 170 SpeechRecognizerImpl::SpeechRecognizerImpl( |
| 168 SpeechRecognitionEventListener* listener, | 171 SpeechRecognitionEventListener* listener, |
| 169 int session_id, | 172 int session_id, |
| 170 bool continuous, | 173 bool continuous, |
| 171 bool provisional_results, | 174 bool provisional_results, |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 266 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 269 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 267 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 270 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 268 this, event_args)); | 271 this, event_args)); |
| 269 } | 272 } |
| 270 | 273 |
| 271 void SpeechRecognizerImpl::OnData(AudioInputController* controller, | 274 void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
| 272 const AudioBus* data) { | 275 const AudioBus* data) { |
| 273 // Convert audio from native format to fixed format used by WebSpeech. | 276 // Convert audio from native format to fixed format used by WebSpeech. |
| 274 FSMEventArgs event_args(EVENT_AUDIO_DATA); | 277 FSMEventArgs event_args(EVENT_AUDIO_DATA); |
| 275 event_args.audio_data = audio_converter_->Convert(data); | 278 event_args.audio_data = audio_converter_->Convert(data); |
| 276 | |
| 277 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 279 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 278 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 280 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 279 this, event_args)); | 281 this, event_args)); |
| 282 // See http://crbug.com/506051 regarding why one extra convert call can | |
|
DaleCurtis
2015/07/08 18:41:47
Oh, you could just do a while (!data_was_converted
henrika (OOO until Aug 14)
2015/07/09 10:06:09
Great point, tried that. But what then happens is:
| |
| 283 // sometimes be required. It should be a rare case. | |
| 284 if (!audio_converter_->data_was_converted()) { | |
| 285 DCHECK(false); | |
|
DaleCurtis
2015/07/08 18:41:47
Remove dcheck and dlog, I don't think it's unexpec
henrika (OOO until Aug 14)
2015/07/09 10:06:09
Done.
| |
| 286 DLOG(WARNING) << "One extra convert call is required"; | |
| 287 event_args.audio_data = audio_converter_->Convert(data); | |
| 288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | |
| 289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | |
| 290 this, event_args)); | |
| 291 } | |
| 292 // Something is seriously wrong here and we are most likely missing some | |
| 293 // audio segments. | |
| 294 CHECK(audio_converter_->data_was_converted()); | |
| 280 } | 295 } |
| 281 | 296 |
| 282 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} | 297 void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
| 283 | 298 |
| 284 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( | 299 void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( |
| 285 const SpeechRecognitionResults& results) { | 300 const SpeechRecognitionResults& results) { |
| 286 FSMEventArgs event_args(EVENT_ENGINE_RESULT); | 301 FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
| 287 event_args.engine_results = results; | 302 event_args.engine_results = results; |
| 288 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, | 303 BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 289 base::Bind(&SpeechRecognizerImpl::DispatchEvent, | 304 base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| (...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 516 return Abort( | 531 return Abort( |
| 517 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); | 532 SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO_CAPTURE)); |
| 518 } | 533 } |
| 519 | 534 |
| 520 // Audio converter shall provide audio based on these parameters as output. | 535 // Audio converter shall provide audio based on these parameters as output. |
| 521 // Hard coded, WebSpeech specific parameters are utilized here. | 536 // Hard coded, WebSpeech specific parameters are utilized here. |
| 522 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; | 537 int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
| 523 AudioParameters output_parameters = AudioParameters( | 538 AudioParameters output_parameters = AudioParameters( |
| 524 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, | 539 AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
| 525 kNumBitsPerAudioSample, frames_per_buffer); | 540 kNumBitsPerAudioSample, frames_per_buffer); |
| 541 DVLOG(1) << "SRI::output_parameters: " | |
| 542 << output_parameters.AsHumanReadableString(); | |
| 526 | 543 |
| 527 // Audio converter will receive audio based on these parameters as input. | 544 // Audio converter will receive audio based on these parameters as input. |
| 528 // On Windows we start by verifying that Core Audio is supported. If not, | 545 // On Windows we start by verifying that Core Audio is supported. If not, |
| 529 // the WaveIn API is used and we might as well avoid all audio conversations | 546 // the WaveIn API is used and we might as well avoid all audio conversations |
| 530 // since WaveIn does the conversion for us. | 547 // since WaveIn does the conversion for us. |
| 531 // TODO(henrika): this code should be moved to platform dependent audio | 548 // TODO(henrika): this code should be moved to platform dependent audio |
| 532 // managers. | 549 // managers. |
| 533 bool use_native_audio_params = true; | 550 bool use_native_audio_params = true; |
| 534 #if defined(OS_WIN) | 551 #if defined(OS_WIN) |
| 535 use_native_audio_params = media::CoreAudioUtil::IsSupported(); | 552 use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
| 536 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; | 553 DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
| 537 #endif | 554 #endif |
| 538 | 555 |
| 539 AudioParameters input_parameters = output_parameters; | 556 AudioParameters input_parameters = output_parameters; |
| 540 if (use_native_audio_params && !unit_test_is_active) { | 557 if (use_native_audio_params && !unit_test_is_active) { |
| 541 // Use native audio parameters but avoid opening up at the native buffer | 558 // Use native audio parameters but avoid opening up at the native buffer |
| 542 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. | 559 // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
| 543 // We rely on internal buffers in the audio back-end to fulfill this request | 560 // We rely on internal buffers in the audio back-end to fulfill this request |
| 544 // and the idea is to simplify the audio conversion since each Convert() | 561 // and the idea is to simplify the audio conversion since each Convert() |
| 545 // call will then render exactly one ProvideInput() call. | 562 // call will then render exactly one ProvideInput() call. |
| 546 // Due to implementation details in the audio converter, 2 milliseconds | 563 // in_params.sample_rate() |
| 547 // are added to the default frame size (100 ms) to ensure there is enough | |
| 548 // data to generate 100 ms of output when resampling. | |
| 549 frames_per_buffer = | 564 frames_per_buffer = |
| 550 ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; | 565 ((in_params.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; |
|
DaleCurtis
2015/07/08 18:39:46
One last thing, can verify that the chunk size for
henrika (OOO until Aug 14)
2015/07/09 10:06:09
SRI::output_parameters: format: 1 channels: 1 chan
| |
| 551 input_parameters.Reset(in_params.format(), | 566 input_parameters.Reset(in_params.format(), |
| 552 in_params.channel_layout(), | 567 in_params.channel_layout(), |
| 553 in_params.channels(), | 568 in_params.channels(), |
| 554 in_params.sample_rate(), | 569 in_params.sample_rate(), |
| 555 in_params.bits_per_sample(), | 570 in_params.bits_per_sample(), |
| 556 frames_per_buffer); | 571 frames_per_buffer); |
| 572 DVLOG(1) << "SRI::input_parameters: " | |
| 573 << input_parameters.AsHumanReadableString(); | |
| 557 } | 574 } |
| 558 | 575 |
| 559 // Create an audio converter which converts data between native input format | 576 // Create an audio converter which converts data between native input format |
| 560 // and WebSpeech specific output format. | 577 // and WebSpeech specific output format. |
| 561 audio_converter_.reset( | 578 audio_converter_.reset( |
| 562 new OnDataConverter(input_parameters, output_parameters)); | 579 new OnDataConverter(input_parameters, output_parameters)); |
| 563 | 580 |
| 564 audio_controller_ = AudioInputController::Create( | 581 audio_controller_ = AudioInputController::Create( |
| 565 audio_manager, this, input_parameters, device_id_, NULL); | 582 audio_manager, this, input_parameters, device_id_, NULL); |
| 566 | 583 |
| (...skipping 245 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 812 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) | 829 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 813 : event(event_value), | 830 : event(event_value), |
| 814 audio_data(NULL), | 831 audio_data(NULL), |
| 815 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { | 832 engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
| 816 } | 833 } |
| 817 | 834 |
| 818 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { | 835 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| 819 } | 836 } |
| 820 | 837 |
| 821 } // namespace content | 838 } // namespace content |
| OLD | NEW |