chrome/browser/speech/endpointer/energy_endpointer.h - Issue 3117026: Add an endpointer for detecting end of speech.

Side by Side Diff: chrome/browser/speech/endpointer/energy_endpointer.h

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)

Patch Set: Merged with latest. Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « chrome/browser/speech/endpointer/endpointer_unittest.cc ('k') | chrome/browser/speech/endpointer/energy_endpointer.cc » ('j') | chrome/browser/speech/speech_recognizer.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 // The EnergyEndpointer class finds likely speech onset and offset points.

	6 //

	7 // The implementation described here is about the simplest possible.

	8 // It is based on timings of threshold crossings for overall signal

	9 // RMS. It is suitable for light weight applications.

	10 //

	11 // As written, the basic idea is that one specifies intervals that

	12 // must be occupied by super- and sub-threshold energy levels, and

	13 // defers decisions re onset and offset times until these

	14 // specifications have been met. Three basic intervals are tested: an

	15 // onset window, a speech-on window, and an offset window. We require

	16 // super-threshold to exceed some minimum total durations in the onset

	17 // and speech-on windows before declaring the speech onset time, and

	18 // we specify a required sub-threshold residency in the offset window

	19 // before declaring speech offset. As the various residency requirements are

	20 // met, the EnergyEndpointer instance assumes various states, and can return the

	21 // ID of these states to the client (see EpStatus below).

	22 //

	23 // The levels of the speech and background noise are continuously updated. It is

	24 // important that the background noise level be estimated initially for

	25 // robustness in noisy conditions. The first frames are assumed to be background

	26 // noise and a fast update rate is used for the noise level. The duration for

	27 // fast update is controlled by the fast_update_dur_ parameter.

	28 //

	29 // If used in noisy conditions, the endpointer should be started and run in the

	30 // EnvironmentEstimation mode, for at least 200ms, before switching to

	31 // UserInputMode.

	32 // Audio feedback contamination can appear in the input audio, if not cut

	33 // out or handled by echo cancellation. Audio feedback can trigger a false

	34 // accept. The false accepts can be ignored by setting

	35 // ep_contamination_rejection_period.

	36

	37 #ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

	38 #define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

	39

	40 #include "base/basictypes.h"

	41 #include "base/scoped_ptr.h"

	42 #include "chrome/browser/speech/endpointer/energy_endpointer_params.h"

	43 #include <vector>

	44

	45 namespace speech_input {

	46

	47 // Endpointer status codes, as reported by EnergyEndpointer::Status().

	48 enum EpStatus {

	49 EP_PRE_SPEECH = 10, // Values start at 10; the rest follow consecutively (11..14).

	50 EP_POSSIBLE_ONSET,

	51 EP_SPEECH_PRESENT,

	52 EP_POSSIBLE_OFFSET,

	53 EP_POST_SPEECH,

	54 };

	55

	56 class EnergyEndpointer { // Finds likely speech onset and offset points; see the file header comment.

	57 public:

	58 // Default construction alone does not make the instance usable: Init() MUST

	59 // be called before any other use is made of the instance.

	60 EnergyEndpointer();

	61 virtual ~EnergyEndpointer();

	62

	63 void Init(const EnergyEndpointerParams& params); // Required setup step after construction; receives the configuration parameters.

	64

	65 // Starts the endpointer. This should be called at the beginning of a session.

	66 void StartSession();

	67

	68 // Stops the endpointer.

	69 void EndSession();

	70

	71 // Starts environment estimation. Audio will be used for environment estimation,

	72 // i.e. noise level estimation.

	73 void SetEnvironmentEstimationMode();

	74

	75 // Starts user input. This should be called when the user indicates start of

	76 // input, e.g. by pressing a button.

	77 void SetUserInputMode();

	78

	79 // Processes the next input frame (|num_samples| 16-bit samples at |samples|) and

	80 // modifies the EnergyEndpointer status as appropriate based on the computation.

	81 void ProcessAudioFrame(int64 time_us, const int16* samples, int num_samples);

	82

	83 // Returns the current state of the EnergyEndpointer; |status_time_us| is for the

	84 // time corresponding to the most recently computed frame.

	85 EpStatus Status(int64* status_time_us) const;

	86

	87 private:

	88 class HistoryRing;

	89

	90 // Resets the endpointer internal state. If |reset_threshold| is true, the

	91 // state will be reset completely, including adaptive thresholds and the

	92 // removal of all history information.

	93 void Restart(bool reset_threshold);

	94

	95 // Updates the internal speech and noise levels from |rms|.

	96 void UpdateLevels(float rms);

	97

	98 // Returns the number of frames (or frame number) corresponding to

	99 // |time| (in seconds).

	100 int TimeToFrame(float time) const;

	101

	102 EpStatus status_; // The current state of this instance.

	103 float offset_confirm_dur_sec_; // Max on-time allowed to confirm POST_SPEECH.

	104 int64 endpointer_time_us_; // Time of the most recently received audio frame.

	105 int64 fast_update_frames_; // Number of frames for initial level adaptation.

	106 int64 frame_counter_; // Number of frames seen. Used for initial adaptation.

	107 float max_window_dur_; // Largest search window size (seconds).

	108 float sample_rate_; // Sampling rate.

	109

	110 // Ring buffers to hold the speech activity history.

	111 scoped_ptr<HistoryRing> history_;

	112

	113 // Configuration parameters.

	114 EnergyEndpointerParams params_;

	115

	116 // RMS which must be exceeded to conclude frame is speech.

	117 float decision_threshold_;

	118

	119 // Flag to indicate that audio should be used to estimate environment, prior to

	120 // receiving user input.

	121 bool estimating_environment_;

	122

	123 // Estimate of the background noise level. Used externally for UI feedback.

	124 float noise_level_;

	125

	126 // An adaptive threshold used to update decision_threshold_ when appropriate.

	127 float rms_adapt_;

	128

	129 // Start lag corresponds to the highest fundamental frequency.

	130 int start_lag_;

	131

	132 // End lag corresponds to the lowest fundamental frequency.

	133 int end_lag_;

	134

	135 // Time when mode switched from environment estimation to user input. This

	136 // is used to time forced rejection of audio feedback contamination.

	137 int64 user_input_start_time_us_;

	138

	139 DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);

	140 };

	141

	142 } // namespace speech_input

	143

	144 #endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

OLD	NEW