Index: chrome/browser/speech/endpointer/energy_endpointer.h |
diff --git a/chrome/browser/speech/endpointer/energy_endpointer.h b/chrome/browser/speech/endpointer/energy_endpointer.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..5a5c76f6f2fe203b675af75472de7b0458a8afc7 |
--- /dev/null |
+++ b/chrome/browser/speech/endpointer/energy_endpointer.h |
@@ -0,0 +1,144 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+// The EnergyEndpointer class finds likely speech onset and offset points. |
+// |
+// The implementation described here is about the simplest possible. |
+// It is based on timings of threshold crossings for overall signal |
+// RMS. It is suitable for light weight applications. |
+// |
+// As written, the basic idea is that one specifies intervals that |
+// must be occupied by super- and sub-threshold energy levels, and |
+// defers decisions re onset and offset times until these |
+// specifications have been met. Three basic intervals are tested: an |
+// onset window, a speech-on window, and an offset window. We require |
+// super-threshold to exceed some minimum total durations in the onset |
+// and speech-on windows before declaring the speech onset time, and |
+// we specify a required sub-threshold residency in the offset window |
+// before declaring speech offset. As the various residency requirements are |
+// met, the EnergyEndpointer instance assumes various states, and can return the |
+// ID of these states to the client (see EpStatus below). |
+// |
+// The levels of the speech and background noise are continuously updated. It is |
+// important that the background noise level be estimated initially for |
+// robustness in noisy conditions. The first frames are assumed to be background |
+// noise and a fast update rate is used for the noise level. The duration for |
+// fast update is controlled by the fast_update_dur_ parameter. |
+// |
+// If used in noisy conditions, the endpointer should be started and run in the |
+// EnvironmentEstimation mode, for at least 200ms, before switching to |
+// UserInputMode. |
+// Audio feedback contamination can appear in the input audio, if not cut |
+// out or handled by echo cancellation. Audio feedback can trigger a false |
+// accept. The false accepts can be ignored by setting |
+// ep_contamination_rejection_period. |
+ |
+#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
+#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
+ |
+#include "base/basictypes.h" |
+#include "base/scoped_ptr.h" |
+#include "chrome/browser/speech/endpointer/energy_endpointer_params.h" |
+#include <vector> |
+ |
+namespace speech_input { |
+ |
+// Endpointer status codes reported by EnergyEndpointer::Status(). |
+enum EpStatus { |
+ EP_PRE_SPEECH = 10, |
+ EP_POSSIBLE_ONSET, |
+ EP_SPEECH_PRESENT, |
+ EP_POSSIBLE_OFFSET, |
+ EP_POST_SPEECH, |
+}; |
+ |
+class EnergyEndpointer { |
+ public: |
+ // Default construction MUST be followed by a call to Init() before any |
+ // other use is made of the instance. |
+ EnergyEndpointer(); |
+ virtual ~EnergyEndpointer(); |
+ |
+ void Init(const EnergyEndpointerParams& params); |
+ |
+ // Starts the endpointer. This should be called at the beginning of a session. |
+ void StartSession(); |
+ |
+ // Stops the endpointer. |
+ void EndSession(); |
+ |
+ // Starts environment estimation. Incoming audio will be used to estimate the |
+ // environment, i.e. the background noise level. |
+ void SetEnvironmentEstimationMode(); |
+ |
+ // Starts user input mode. This should be called when the user indicates the |
+ // start of input, e.g. by pressing a button. |
+ void SetUserInputMode(); |
+ |
+ // Processes the next input frame and updates the EnergyEndpointer status as |
+ // appropriate based on the computation. |
+ void ProcessAudioFrame(int64 time_us, const int16* samples, int num_samples); |
+ |
+ // Returns the current state of the EnergyEndpointer; the time corresponding |
+ // to the most recently computed frame is reported via status_time_us. |
+ EpStatus Status(int64* status_time_us) const; |
+ |
+ private: |
+ class HistoryRing; |
+ |
+ // Resets the endpointer's internal state. If reset_threshold is true, the |
+ // state is reset completely, including the adaptive thresholds and the |
+ // removal of all history information. |
+ void Restart(bool reset_threshold); |
+ |
+ // Updates the internal speech and noise levels. |
+ void UpdateLevels(float rms); |
+ |
+ // Returns the number of frames (or frame number) corresponding to |
+ // the 'time' (in seconds). |
+ int TimeToFrame(float time) const; |
+ |
+ EpStatus status_; // The current state of this instance. |
+ float offset_confirm_dur_sec_; // Max on time allowed to confirm POST_SPEECH. |
+ int64 endpointer_time_us_; // Time of the most recently received audio frame. |
+ int64 fast_update_frames_; // Number of frames for initial level adaptation. |
+ int64 frame_counter_; // Number of frames seen. Used for initial adaptation. |
+ float max_window_dur_; // Largest search window size (seconds). |
+ float sample_rate_; // Sampling rate. |
+ |
+ // Ring buffers to hold the speech activity history. |
+ scoped_ptr<HistoryRing> history_; |
+ |
+ // Configuration parameters. |
+ EnergyEndpointerParams params_; |
+ |
+ // RMS which must be exceeded to conclude frame is speech. |
+ float decision_threshold_; |
+ |
+ // Flag to indicate that audio should be used to estimate environment, prior to |
+ // receiving user input. |
+ bool estimating_environment_; |
+ |
+ // Estimate of the background noise level. Used externally for UI feedback. |
+ float noise_level_; |
+ |
+ // An adaptive threshold used to update decision_threshold_ when appropriate. |
+ float rms_adapt_; |
+ |
+ // Start lag corresponds to the highest fundamental frequency. |
+ int start_lag_; |
+ |
+ // End lag corresponds to the lowest fundamental frequency. |
+ int end_lag_; |
+ |
+ // Time when mode switched from environment estimation to user input. This |
+ // is used to time forced rejection of audio feedback contamination. |
+ int64 user_input_start_time_us_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); |
+}; |
+ |
+} // namespace speech_input |
+ |
+#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |