OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/browser/speech/endpointer/endpointer.h" | 5 #include "content/browser/speech/endpointer/endpointer.h" |
6 | 6 |
7 #include "base/time/time.h" | 7 #include "base/time/time.h" |
8 #include "content/browser/speech/audio_buffer.h" | 8 #include "content/browser/speech/audio_buffer.h" |
9 | 9 |
10 using base::Time; | 10 using base::Time; |
11 | 11 |
12 namespace { | 12 namespace { |
13 const int kFrameRate = 50; // 1 frame = 20ms of audio. | 13 const int kFrameRate = 50; // 1 frame = 20ms of audio. |
14 } | 14 } |
15 | 15 |
16 namespace content { | 16 namespace content { |
17 | 17 |
18 Endpointer::Endpointer(int sample_rate) | 18 Endpointer::Endpointer(int sample_rate) |
19 : speech_input_possibly_complete_silence_length_us_(-1), | 19 : speech_input_possibly_complete_silence_length_us_(-1), |
20 speech_input_complete_silence_length_us_(-1), | 20 speech_input_complete_silence_length_us_(-1), |
21 audio_frame_time_us_(0), | 21 audio_frame_time_us_(0), |
22 sample_rate_(sample_rate), | 22 sample_rate_(sample_rate), |
23 frame_size_(0) { | 23 frame_size_(0) { |
24 Reset(); | 24 Reset(); |
25 | 25 |
26 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); | 26 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); |
27 | 27 |
28 speech_input_minimum_length_us_ = | 28 speech_input_minimum_length_us_ = |
29 static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond); | 29 static_cast<int64_t>(1.7 * Time::kMicrosecondsPerSecond); |
30 speech_input_complete_silence_length_us_ = | 30 speech_input_complete_silence_length_us_ = |
31 static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond); | 31 static_cast<int64_t>(0.5 * Time::kMicrosecondsPerSecond); |
32 long_speech_input_complete_silence_length_us_ = -1; | 32 long_speech_input_complete_silence_length_us_ = -1; |
33 long_speech_length_us_ = -1; | 33 long_speech_length_us_ = -1; |
34 speech_input_possibly_complete_silence_length_us_ = | 34 speech_input_possibly_complete_silence_length_us_ = |
35 1 * Time::kMicrosecondsPerSecond; | 35 1 * Time::kMicrosecondsPerSecond; |
36 | 36 |
37 // Set the default configuration for Push To Talk mode. | 37 // Set the default configuration for Push To Talk mode. |
38 EnergyEndpointerParams ep_config; | 38 EnergyEndpointerParams ep_config; |
39 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); | 39 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); |
40 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); | 40 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); |
41 ep_config.set_endpoint_margin(0.2f); | 41 ep_config.set_endpoint_margin(0.2f); |
(...skipping 36 matching lines...)
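The constructor above derives frame_size_ from the capture sample rate and kFrameRate, and ProcessAudio() later advances audio_frame_time_us_ by one frame's worth of microseconds per iteration. A minimal standalone sketch of that arithmetic (hypothetical local names, not part of this change; the 16000 Hz rate is assumed for illustration):

// Standalone sketch of the frame-timing arithmetic in the constructor and
// ProcessAudio(). With kFrameRate = 50, a 16000 Hz stream yields 320-sample
// frames, each covering 20 ms (20000 us) of audio.
#include <cstdint>
#include <cstdio>

namespace {
const int kFrameRate = 50;                       // 1 frame = 20ms of audio.
const int64_t kMicrosecondsPerSecond = 1000000;  // Mirrors base::Time.
}  // namespace

int main() {
  const int sample_rate = 16000;  // Assumed capture rate for illustration.
  const int frame_size =
      static_cast<int>(sample_rate / static_cast<float>(kFrameRate));
  const int64_t frame_duration_us =
      (frame_size * kMicrosecondsPerSecond) / sample_rate;
  std::printf("frame_size=%d samples, frame_duration=%lld us\n",
              frame_size, static_cast<long long>(frame_duration_us));
  return 0;
}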
78 | 78 |
79 void Endpointer::SetEnvironmentEstimationMode() { | 79 void Endpointer::SetEnvironmentEstimationMode() { |
80 Reset(); | 80 Reset(); |
81 energy_endpointer_.SetEnvironmentEstimationMode(); | 81 energy_endpointer_.SetEnvironmentEstimationMode(); |
82 } | 82 } |
83 | 83 |
84 void Endpointer::SetUserInputMode() { | 84 void Endpointer::SetUserInputMode() { |
85 energy_endpointer_.SetUserInputMode(); | 85 energy_endpointer_.SetUserInputMode(); |
86 } | 86 } |
87 | 87 |
88 EpStatus Endpointer::Status(int64 *time) { | 88 EpStatus Endpointer::Status(int64_t* time) { |
89 return energy_endpointer_.Status(time); | 89 return energy_endpointer_.Status(time); |
90 } | 90 } |
91 | 91 |
92 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { | 92 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { |
93 const int16* audio_data = raw_audio.SamplesData16(); | 93 const int16_t* audio_data = raw_audio.SamplesData16(); |
94 const int num_samples = raw_audio.NumSamples(); | 94 const int num_samples = raw_audio.NumSamples(); |
95 EpStatus ep_status = EP_PRE_SPEECH; | 95 EpStatus ep_status = EP_PRE_SPEECH; |
96 | 96 |
97 // Process the input data in blocks of frame_size_, dropping any incomplete | 97 // Process the input data in blocks of frame_size_, dropping any incomplete |
98 // frames at the end (which is ok since typically the caller will be recording | 98 // frames at the end (which is ok since typically the caller will be recording |
99 // audio in multiples of our frame size). | 99 // audio in multiples of our frame size). |
100 int sample_index = 0; | 100 int sample_index = 0; |
101 while (sample_index + frame_size_ <= num_samples) { | 101 while (sample_index + frame_size_ <= num_samples) { |
102 // Have the endpointer process the frame. | 102 // Have the endpointer process the frame. |
103 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, | 103 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, |
104 audio_data + sample_index, | 104 audio_data + sample_index, |
105 frame_size_, | 105 frame_size_, |
106 rms_out); | 106 rms_out); |
107 sample_index += frame_size_; | 107 sample_index += frame_size_; |
108 audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / | 108 audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / |
109 sample_rate_; | 109 sample_rate_; |
110 | 110 |
111 // Get the status of the endpointer. | 111 // Get the status of the endpointer. |
112 int64 ep_time; | 112 int64_t ep_time; |
113 ep_status = energy_endpointer_.Status(&ep_time); | 113 ep_status = energy_endpointer_.Status(&ep_time); |
114 | 114 |
115 // Handle state changes. | 115 // Handle state changes. |
116 if ((EP_SPEECH_PRESENT == ep_status) && | 116 if ((EP_SPEECH_PRESENT == ep_status) && |
117 (EP_POSSIBLE_ONSET == old_ep_status_)) { | 117 (EP_POSSIBLE_ONSET == old_ep_status_)) { |
118 speech_end_time_us_ = -1; | 118 speech_end_time_us_ = -1; |
119 waiting_for_speech_possibly_complete_timeout_ = false; | 119 waiting_for_speech_possibly_complete_timeout_ = false; |
120 waiting_for_speech_complete_timeout_ = false; | 120 waiting_for_speech_complete_timeout_ = false; |
121 // Trigger SpeechInputDidStart event on first detection. | 121 // Trigger SpeechInputDidStart event on first detection. |
122 if (false == speech_previously_detected_) { | 122 if (false == speech_previously_detected_) { |
(...skipping 14 matching lines...)
137 speech_input_possibly_complete_silence_length_us_)) { | 137 speech_input_possibly_complete_silence_length_us_)) { |
138 waiting_for_speech_possibly_complete_timeout_ = false; | 138 waiting_for_speech_possibly_complete_timeout_ = false; |
139 } | 139 } |
140 if (waiting_for_speech_complete_timeout_) { | 140 if (waiting_for_speech_complete_timeout_) { |
141 // The length of the silence timeout period can be held constant, or it | 141 // The length of the silence timeout period can be held constant, or it |
142 // can be changed after a fixed amount of time from the beginning of | 142 // can be changed after a fixed amount of time from the beginning of |
143 // speech. | 143 // speech. |
144 bool has_stepped_silence = | 144 bool has_stepped_silence = |
145 (long_speech_length_us_ > 0) && | 145 (long_speech_length_us_ > 0) && |
146 (long_speech_input_complete_silence_length_us_ > 0); | 146 (long_speech_input_complete_silence_length_us_ > 0); |
147 int64 requested_silence_length; | 147 int64_t requested_silence_length; |
148 if (has_stepped_silence && | 148 if (has_stepped_silence && |
149 (ep_time - speech_start_time_us_) > long_speech_length_us_) { | 149 (ep_time - speech_start_time_us_) > long_speech_length_us_) { |
150 requested_silence_length = | 150 requested_silence_length = |
151 long_speech_input_complete_silence_length_us_; | 151 long_speech_input_complete_silence_length_us_; |
152 } else { | 152 } else { |
153 requested_silence_length = | 153 requested_silence_length = |
154 speech_input_complete_silence_length_us_; | 154 speech_input_complete_silence_length_us_; |
155 } | 155 } |
156 | 156 |
157 // Speech complete timeout. | 157 // Speech complete timeout. |
158 if ((ep_time - speech_end_time_us_) > requested_silence_length) { | 158 if ((ep_time - speech_end_time_us_) > requested_silence_length) { |
159 waiting_for_speech_complete_timeout_ = false; | 159 waiting_for_speech_complete_timeout_ = false; |
160 speech_input_complete_ = true; | 160 speech_input_complete_ = true; |
161 } | 161 } |
162 } | 162 } |
163 } | 163 } |
164 old_ep_status_ = ep_status; | 164 old_ep_status_ = ep_status; |
165 } | 165 } |
166 return ep_status; | 166 return ep_status; |
167 } | 167 } |
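The end-of-speech branch in ProcessAudio() can switch from the normal silence timeout to a longer one once the utterance has exceeded long_speech_length_us_. A self-contained restatement of that decision, with the class members passed in as hypothetical parameters (the real logic lives inside the loop above):

// Returns true once silence after speech has lasted long enough for the
// input to be considered complete. Hypothetical free function; mirrors the
// waiting_for_speech_complete_timeout_ branch above.
#include <cstdint>

bool SpeechInputComplete(int64_t ep_time_us,
                         int64_t speech_start_time_us,
                         int64_t speech_end_time_us,
                         int64_t complete_silence_length_us,
                         int64_t long_speech_complete_silence_length_us,
                         int64_t long_speech_length_us) {
  // A "stepped" silence length is only in effect when both long-speech
  // parameters are configured (> 0).
  const bool has_stepped_silence =
      (long_speech_length_us > 0) &&
      (long_speech_complete_silence_length_us > 0);
  int64_t requested_silence_length_us;
  if (has_stepped_silence &&
      (ep_time_us - speech_start_time_us) > long_speech_length_us) {
    requested_silence_length_us = long_speech_complete_silence_length_us;
  } else {
    requested_silence_length_us = complete_silence_length_us;
  }
  return (ep_time_us - speech_end_time_us) > requested_silence_length_us;
}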
168 | 168 |
169 } // namespace content | 169 } // namespace content |
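From a caller's point of view, the typical sequence uses only the methods touched in this file: construct with the capture sample rate, call SetEnvironmentEstimationMode() while ambient noise is being measured, switch to SetUserInputMode() when the user starts speaking, then feed each captured AudioChunk through ProcessAudio(). A minimal sketch, assuming the includes resolve inside a Chromium checkout; the helper name is hypothetical and not part of this change:

// Hypothetical caller-side helper; only methods appearing in this file are
// used. Assumes content/browser/speech headers are available.
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/endpointer/endpointer.h"

namespace content {

// Feeds one captured chunk to an already-configured Endpointer and reports
// whether it currently detects speech.
bool FeedChunkAndCheckForSpeech(Endpointer* endpointer,
                                const AudioChunk& chunk) {
  float rms = 0.0f;
  const EpStatus status = endpointer->ProcessAudio(chunk, &rms);
  return status == EP_SPEECH_PRESENT;
}

}  // namespace content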