chrome/browser/speech/endpointer/energy_endpointer.h - Issue 3117026: Add an endpointer for detecting end of speech.

Unified Diff: chrome/browser/speech/endpointer/energy_endpointer.h

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)

Patch Set: Merged with latest. Created 10 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « chrome/browser/speech/endpointer/endpointer_unittest.cc ('k') | chrome/browser/speech/endpointer/energy_endpointer.cc » ('j') | chrome/browser/speech/speech_recognizer.h » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/browser/speech/endpointer/energy_endpointer.h

diff --git a/chrome/browser/speech/endpointer/energy_endpointer.h b/chrome/browser/speech/endpointer/energy_endpointer.h

new file mode 100644

index 0000000000000000000000000000000000000000..5a5c76f6f2fe203b675af75472de7b0458a8afc7

--- /dev/null

+++ b/chrome/browser/speech/endpointer/energy_endpointer.h

@@ -0,0 +1,144 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+// The EnergyEndpointer class finds likely speech onset and offset points.

+//

+// The implementation described here is about the simplest possible.

+// It is based on timings of threshold crossings for overall signal

+// RMS. It is suitable for lightweight applications.

+//

+// As written, the basic idea is that one specifies intervals that

+// must be occupied by super- and sub-threshold energy levels, and

+// defers decisions re onset and offset times until these

+// specifications have been met. Three basic intervals are tested: an

+// onset window, a speech-on window, and an offset window. We require

+// super-threshold to exceed some minimum total durations in the onset

+// and speech-on windows before declaring the speech onset time, and

+// we specify a required sub-threshold residency in the offset window

+// before declaring speech offset. As the various residency requirements are

+// met, the EnergyEndpointer instance assumes various states, and can return the

+// ID of these states to the client (see EpStatus below).

+//

+// The levels of the speech and background noise are continuously updated. It is

+// important that the background noise level be estimated initially for

+// robustness in noisy conditions. The first frames are assumed to be background

+// noise and a fast update rate is used for the noise level. The duration for

+// fast update is controlled by the fast_update_dur_ parameter.

+//

+// If used in noisy conditions, the endpointer should be started and run in the

+// EnvironmentEstimation mode, for at least 200ms, before switching to

+// UserInputMode.

+// Audio feedback contamination can appear in the input audio, if not cut

+// out or handled by echo cancellation. Audio feedback can trigger a false

+// accept. The false accepts can be ignored by setting

+// ep_contamination_rejection_period.

+#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

+#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

+#include "base/basictypes.h"

+#include "base/scoped_ptr.h"

+#include "chrome/browser/speech/endpointer/energy_endpointer_params.h"

+#include <vector>

+namespace speech_input {

+// Endpointer status codes, as returned by EnergyEndpointer::Status(). Values are consecutive, starting at 10.

+enum EpStatus {

+ EP_PRE_SPEECH = 10,  // 10 (explicit start value).

+ EP_POSSIBLE_ONSET,  // 11. NOTE(review): the exact transition criteria are not visible in this header.

+ EP_SPEECH_PRESENT,  // 12.

+ EP_POSSIBLE_OFFSET,  // 13.

+ EP_POST_SPEECH,  // 14.

+};

+class EnergyEndpointer {  // Energy-based endpointer: finds likely speech onset and offset points.

+ public:

+ // The default construction MUST be followed by Init(), before any

+ // other use can be made of the instance.

+ EnergyEndpointer();

+ virtual ~EnergyEndpointer();  // NOTE(review): no other virtual member is declared in this class.

+ void Init(const EnergyEndpointerParams& params);  // Required before any other use (see constructor comment).

+ // Start the endpointer. This should be called at the beginning of a session.

+ void StartSession();

+ // Stop the endpointer.

+ void EndSession();

+ // Start environment estimation. Audio will be used for environment estimation,

+ // i.e. noise level estimation.

+ void SetEnvironmentEstimationMode();

+ // Start user input. This should be called when the user indicates start of

+ // input, e.g. by pressing a button.

+ void SetUserInputMode();

+ // Computes the next input frame and modifies EnergyEndpointer status as

+ // appropriate based on the computation. NOTE(review): |time_us| is presumably in microseconds; confirm.

+ void ProcessAudioFrame(int64 time_us, const int16* samples, int num_samples);

+ // Returns the current state of the EnergyEndpointer and the time

+ // corresponding to the most recently computed frame (presumably written to |status_time_us|; confirm).

+ EpStatus Status(int64* status_time_us) const;

+ private:

+ class HistoryRing;  // Forward declaration; the definition is not in this header.

+ // Resets the endpointer internal state. If reset_threshold is true, the

+ // state will be reset completely, including adaptive thresholds and the

+ // removal of all history information.

+ void Restart(bool reset_threshold);

+ // Update internal speech and noise levels.

+ void UpdateLevels(float rms);

+ // Returns the number of frames (or frame number) corresponding to

+ // the 'time' (in seconds).

+ int TimeToFrame(float time) const;

+ EpStatus status_; // The current state of this instance.

+ float offset_confirm_dur_sec_; // Max on time allowed to confirm POST_SPEECH.

+ int64 endpointer_time_us_; // Time of the most recently received audio frame.

+ int64 fast_update_frames_; // Number of frames for initial level adaptation.

+ int64 frame_counter_; // Number of frames seen. Used for initial adaptation.

+ float max_window_dur_; // Largest search window size (seconds).

+ float sample_rate_; // Sampling rate (presumably Hz; confirm).

+ // Ring buffers to hold the speech activity history.

+ scoped_ptr<HistoryRing> history_;

+ // Configuration parameters.

+ EnergyEndpointerParams params_;

+ // RMS which must be exceeded to conclude frame is speech.

+ float decision_threshold_;

+ // Flag to indicate that audio should be used to estimate environment, prior to

+ // receiving user input.

+ bool estimating_environment_;

+ // Estimate of the background noise level. Used externally for UI feedback. NOTE(review): no accessor is declared in this header.

+ float noise_level_;

+ // An adaptive threshold used to update decision_threshold_ when appropriate.

+ float rms_adapt_;

+ // Start lag corresponds to the highest fundamental frequency.

+ int start_lag_;

+ // End lag corresponds to the lowest fundamental frequency.

+ int end_lag_;

+ // Time when mode switched from environment estimation to user input. This

+ // is used to time forced rejection of audio feedback contamination.

+ int64 user_input_start_time_us_;

+ DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);  // Instances cannot be copied or assigned.

+};

+} // namespace speech_input

+#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_