| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "content/browser/speech/endpointer/endpointer.h" | 5 #include "content/browser/speech/endpointer/endpointer.h" |
| 6 | 6 |
| 7 #include "base/time/time.h" | 7 #include "base/time/time.h" |
| 8 #include "content/browser/speech/audio_buffer.h" | 8 #include "content/browser/speech/audio_buffer.h" |
| 9 | 9 |
| 10 using base::Time; | 10 using base::Time; |
| 11 | 11 |
| 12 namespace { | 12 namespace { |
| 13 const int kFrameRate = 50; // 1 frame = 20ms of audio. | 13 const int kFrameRate = 50; // 1 frame = 20ms of audio. |
| 14 } | 14 } |
| 15 | 15 |
| 16 namespace content { | 16 namespace content { |
| 17 | 17 |
| 18 Endpointer::Endpointer(int sample_rate) | 18 Endpointer::Endpointer(int sample_rate) |
| 19 : speech_input_possibly_complete_silence_length_us_(-1), | 19 : speech_input_possibly_complete_silence_length_us_(-1), |
| 20 speech_input_complete_silence_length_us_(-1), | 20 speech_input_complete_silence_length_us_(-1), |
| 21 audio_frame_time_us_(0), | 21 audio_frame_time_us_(0), |
| 22 sample_rate_(sample_rate), | 22 sample_rate_(sample_rate), |
| 23 frame_size_(0) { | 23 frame_size_(0) { |
| 24 Reset(); | 24 Reset(); |
| 25 | 25 |
| 26 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); | 26 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); |
| 27 | 27 |
| 28 speech_input_minimum_length_us_ = | 28 speech_input_minimum_length_us_ = |
| 29 static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond); | 29 static_cast<int64_t>(1.7 * Time::kMicrosecondsPerSecond); |
| 30 speech_input_complete_silence_length_us_ = | 30 speech_input_complete_silence_length_us_ = |
| 31 static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond); | 31 static_cast<int64_t>(0.5 * Time::kMicrosecondsPerSecond); |
| 32 long_speech_input_complete_silence_length_us_ = -1; | 32 long_speech_input_complete_silence_length_us_ = -1; |
| 33 long_speech_length_us_ = -1; | 33 long_speech_length_us_ = -1; |
| 34 speech_input_possibly_complete_silence_length_us_ = | 34 speech_input_possibly_complete_silence_length_us_ = |
| 35 1 * Time::kMicrosecondsPerSecond; | 35 1 * Time::kMicrosecondsPerSecond; |
| 36 | 36 |
| 37 // Set the default configuration for Push To Talk mode. | 37 // Set the default configuration for Push To Talk mode. |
| 38 EnergyEndpointerParams ep_config; | 38 EnergyEndpointerParams ep_config; |
| 39 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); | 39 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); |
| 40 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); | 40 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); |
| 41 ep_config.set_endpoint_margin(0.2f); | 41 ep_config.set_endpoint_margin(0.2f); |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 78 | 78 |
| 79 void Endpointer::SetEnvironmentEstimationMode() { | 79 void Endpointer::SetEnvironmentEstimationMode() { |
| 80 Reset(); | 80 Reset(); |
| 81 energy_endpointer_.SetEnvironmentEstimationMode(); | 81 energy_endpointer_.SetEnvironmentEstimationMode(); |
| 82 } | 82 } |
| 83 | 83 |
| 84 void Endpointer::SetUserInputMode() { | 84 void Endpointer::SetUserInputMode() { |
| 85 energy_endpointer_.SetUserInputMode(); | 85 energy_endpointer_.SetUserInputMode(); |
| 86 } | 86 } |
| 87 | 87 |
| 88 EpStatus Endpointer::Status(int64 *time) { | 88 EpStatus Endpointer::Status(int64_t* time) { |
| 89 return energy_endpointer_.Status(time); | 89 return energy_endpointer_.Status(time); |
| 90 } | 90 } |
| 91 | 91 |
| 92 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { | 92 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { |
| 93 const int16* audio_data = raw_audio.SamplesData16(); | 93 const int16_t* audio_data = raw_audio.SamplesData16(); |
| 94 const int num_samples = raw_audio.NumSamples(); | 94 const int num_samples = raw_audio.NumSamples(); |
| 95 EpStatus ep_status = EP_PRE_SPEECH; | 95 EpStatus ep_status = EP_PRE_SPEECH; |
| 96 | 96 |
| 97 // Process the input data in blocks of frame_size_, dropping any incomplete | 97 // Process the input data in blocks of frame_size_, dropping any incomplete |
| 98 // frames at the end (which is ok since typically the caller will be recording | 98 // frames at the end (which is ok since typically the caller will be recording |
| 99 // audio in multiples of our frame size). | 99 // audio in multiples of our frame size). |
| 100 int sample_index = 0; | 100 int sample_index = 0; |
| 101 while (sample_index + frame_size_ <= num_samples) { | 101 while (sample_index + frame_size_ <= num_samples) { |
| 102 // Have the endpointer process the frame. | 102 // Have the endpointer process the frame. |
| 103 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, | 103 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, |
| 104 audio_data + sample_index, | 104 audio_data + sample_index, |
| 105 frame_size_, | 105 frame_size_, |
| 106 rms_out); | 106 rms_out); |
| 107 sample_index += frame_size_; | 107 sample_index += frame_size_; |
| 108 audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / | 108 audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / |
| 109 sample_rate_; | 109 sample_rate_; |
| 110 | 110 |
| 111 // Get the status of the endpointer. | 111 // Get the status of the endpointer. |
| 112 int64 ep_time; | 112 int64_t ep_time; |
| 113 ep_status = energy_endpointer_.Status(&ep_time); | 113 ep_status = energy_endpointer_.Status(&ep_time); |
| 114 | 114 |
| 115 // Handle state changes. | 115 // Handle state changes. |
| 116 if ((EP_SPEECH_PRESENT == ep_status) && | 116 if ((EP_SPEECH_PRESENT == ep_status) && |
| 117 (EP_POSSIBLE_ONSET == old_ep_status_)) { | 117 (EP_POSSIBLE_ONSET == old_ep_status_)) { |
| 118 speech_end_time_us_ = -1; | 118 speech_end_time_us_ = -1; |
| 119 waiting_for_speech_possibly_complete_timeout_ = false; | 119 waiting_for_speech_possibly_complete_timeout_ = false; |
| 120 waiting_for_speech_complete_timeout_ = false; | 120 waiting_for_speech_complete_timeout_ = false; |
| 121 // Trigger SpeechInputDidStart event on first detection. | 121 // Trigger SpeechInputDidStart event on first detection. |
| 122 if (false == speech_previously_detected_) { | 122 if (false == speech_previously_detected_) { |
| (...skipping 14 matching lines...) Expand all Loading... |
| 137 speech_input_possibly_complete_silence_length_us_)) { | 137 speech_input_possibly_complete_silence_length_us_)) { |
| 138 waiting_for_speech_possibly_complete_timeout_ = false; | 138 waiting_for_speech_possibly_complete_timeout_ = false; |
| 139 } | 139 } |
| 140 if (waiting_for_speech_complete_timeout_) { | 140 if (waiting_for_speech_complete_timeout_) { |
| 141 // The length of the silence timeout period can be held constant, or it | 141 // The length of the silence timeout period can be held constant, or it |
| 142 // can be changed after a fixed amount of time from the beginning of | 142 // can be changed after a fixed amount of time from the beginning of |
| 143 // speech. | 143 // speech. |
| 144 bool has_stepped_silence = | 144 bool has_stepped_silence = |
| 145 (long_speech_length_us_ > 0) && | 145 (long_speech_length_us_ > 0) && |
| 146 (long_speech_input_complete_silence_length_us_ > 0); | 146 (long_speech_input_complete_silence_length_us_ > 0); |
| 147 int64 requested_silence_length; | 147 int64_t requested_silence_length; |
| 148 if (has_stepped_silence && | 148 if (has_stepped_silence && |
| 149 (ep_time - speech_start_time_us_) > long_speech_length_us_) { | 149 (ep_time - speech_start_time_us_) > long_speech_length_us_) { |
| 150 requested_silence_length = | 150 requested_silence_length = |
| 151 long_speech_input_complete_silence_length_us_; | 151 long_speech_input_complete_silence_length_us_; |
| 152 } else { | 152 } else { |
| 153 requested_silence_length = | 153 requested_silence_length = |
| 154 speech_input_complete_silence_length_us_; | 154 speech_input_complete_silence_length_us_; |
| 155 } | 155 } |
| 156 | 156 |
| 157 // Speech complete timeout. | 157 // Speech complete timeout. |
| 158 if ((ep_time - speech_end_time_us_) > requested_silence_length) { | 158 if ((ep_time - speech_end_time_us_) > requested_silence_length) { |
| 159 waiting_for_speech_complete_timeout_ = false; | 159 waiting_for_speech_complete_timeout_ = false; |
| 160 speech_input_complete_ = true; | 160 speech_input_complete_ = true; |
| 161 } | 161 } |
| 162 } | 162 } |
| 163 } | 163 } |
| 164 old_ep_status_ = ep_status; | 164 old_ep_status_ = ep_status; |
| 165 } | 165 } |
| 166 return ep_status; | 166 return ep_status; |
| 167 } | 167 } |
| 168 | 168 |
| 169 } // namespace content | 169 } // namespace content |
| OLD | NEW |