OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // The EnergyEndpointer class finds likely speech onset and offset points. |
| 6 // |
| 7 // The implementation described here is about the simplest possible. |
| 8 // It is based on timings of threshold crossings for overall signal |
| 9 // RMS. It is suitable for light weight applications. |
| 10 // |
| 11 // As written, the basic idea is that one specifies intervals that |
| 12 // must be occupied by super- and sub-threshold energy levels, and |
| 13 // defers decisions re onset and offset times until these |
| 14 // specifications have been met. Three basic intervals are tested: an |
| 15 // onset window, a speech-on window, and an offset window. We require |
| 16 // super-threshold to exceed some minimum total durations in the onset |
| 17 // and speech-on windows before declaring the speech onset time, and |
| 18 // we specify a required sub-threshold residency in the offset window |
| 19 // before declaring speech offset. As the various residency requirements are |
| 20 // met, the EnergyEndpointer instance assumes various states, and can return the |
| 21 // ID of these states to the client (see EpStatus below). |
| 22 // |
| 23 // The levels of the speech and background noise are continuously updated. It is |
| 24 // important that the background noise level be estimated initially for |
| 25 // robustness in noisy conditions. The first frames are assumed to be background |
| 26 // noise and a fast update rate is used for the noise level. The duration for |
| 27 // fast update is controlled by the fast_update_dur_ parameter. |
| 28 // |
| 29 // If used in noisy conditions, the endpointer should be started and run in the |
| 30 // EnvironmentEstimation mode, for at least 200ms, before switching to |
| 31 // UserInputMode. |
| 32 // Audio feedback contamination can appear in the input audio, if not cut |
| 33 // out or handled by echo cancellation. Audio feedback can trigger a false |
| 34 // accept. The false accepts can be ignored by setting |
| 35 // ep_contamination_rejection_period. |
| 36 |
| 37 #ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
| 38 #define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
| 39 |
| 40 #include "base/basictypes.h" |
| 41 #include "base/scoped_ptr.h" |
| 42 #include "chrome/browser/speech/endpointer/energy_endpointer_params.h" |
| 43 #include <vector> |
| 44 |
| 45 namespace speech_input { |
| 46 |
| 47 // Endpointer status codes. Numbering starts at 10 and increases by one per enumerator. |
| 48 enum EpStatus { |
| 49 EP_PRE_SPEECH = 10, |
| 50 EP_POSSIBLE_ONSET, |
| 51 EP_SPEECH_PRESENT, |
| 52 EP_POSSIBLE_OFFSET, |
| 53 EP_POST_SPEECH, |
| 54 }; |
| 55 |
| 56 class EnergyEndpointer { |
| 57 public: |
| 58 // The default construction MUST be followed by Init(), before any |
| 59 // other use can be made of the instance. |
| 60 EnergyEndpointer(); |
| 61 virtual ~EnergyEndpointer(); |
| 62 |
| 63 void Init(const EnergyEndpointerParams& params); |
| 64 |
| 65 // Start the endpointer. This should be called at the beginning of a session. |
| 66 void StartSession(); |
| 67 |
| 68 // Stop the endpointer. |
| 69 void EndSession(); |
| 70 |
| 71 // Start environment estimation. Audio will be used for environment estimation |
| 72 // i.e. noise level estimation. |
| 73 void SetEnvironmentEstimationMode(); |
| 74 |
| 75 // Start user input. This should be called when the user indicates start of |
| 76 // input, e.g. by pressing a button. |
| 77 void SetUserInputMode(); |
| 78 |
| 79 // Processes the next input frame (|num_samples| samples from |samples|, stamped |
| 80 // with |time_us|) and modifies the EnergyEndpointer status as appropriate. |
| 81 void ProcessAudioFrame(int64 time_us, const int16* samples, int num_samples); |
| 82 |
| 83 // Returns the current state of the EnergyEndpointer; the time corresponding |
| 84 // to the most recently computed frame is reported through |status_time_us|. |
| 85 EpStatus Status(int64* status_time_us) const; |
| 86 |
| 87 private: |
| 88 class HistoryRing; |
| 89 |
| 90 // Resets the endpointer internal state. If reset_threshold is true, the |
| 91 // state will be reset completely, including adaptive thresholds and the |
| 92 // removal of all history information. |
| 93 void Restart(bool reset_threshold); |
| 94 |
| 95 // Update internal speech and noise levels. |
| 96 void UpdateLevels(float rms); |
| 97 |
| 98 // Returns the number of frames (or frame number) corresponding to |
| 99 // the 'time' (in seconds). |
| 100 int TimeToFrame(float time) const; |
| 101 |
| 102 EpStatus status_; // The current state of this instance. |
| 103 float offset_confirm_dur_sec_; // Max on time allowed to confirm POST_SPEECH. |
| 104 int64 endpointer_time_us_; // Time of the most recently received audio frame. |
| 105 int64 fast_update_frames_; // Number of frames for initial level adaptation. |
| 106 int64 frame_counter_; // Number of frames seen. Used for initial adaptation. |
| 107 float max_window_dur_; // Largest search window size (seconds) |
| 108 float sample_rate_; // Sampling rate. |
| 109 |
| 110 // Ring buffers to hold the speech activity history. |
| 111 scoped_ptr<HistoryRing> history_; |
| 112 |
| 113 // Configuration parameters. |
| 114 EnergyEndpointerParams params_; |
| 115 |
| 116 // RMS which must be exceeded to conclude frame is speech. |
| 117 float decision_threshold_; |
| 118 |
| 119 // Flag to indicate that audio should be used to estimate the environment, |
| 120 // prior to receiving user input. |
| 121 bool estimating_environment_; |
| 122 |
| 123 // Estimate of the background noise level. Used externally for UI feedback. |
| 124 float noise_level_; |
| 125 |
| 126 // An adaptive threshold used to update decision_threshold_ when appropriate. |
| 127 float rms_adapt_; |
| 128 |
| 129 // Start lag corresponds to the highest fundamental frequency. |
| 130 int start_lag_; |
| 131 |
| 132 // End lag corresponds to the lowest fundamental frequency. |
| 133 int end_lag_; |
| 134 |
| 135 // Time when mode switched from environment estimation to user input. This |
| 136 // is used to time forced rejection of audio feedback contamination. |
| 137 int64 user_input_start_time_us_; |
| 138 |
| 139 DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); |
| 140 }; |
| 141 |
| 142 } // namespace speech_input |
| 143 |
| 144 #endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
OLD | NEW |