Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Side by Side Diff: chrome/browser/speech/endpointer/energy_endpointer.h

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)
Patch Set: Merged with latest. Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // The EnergyEndpointer class finds likely speech onset and offset points.
6 //
7 // The implementation described here is about the simplest possible.
8 // It is based on timings of threshold crossings for overall signal
9 // RMS. It is suitable for light weight applications.
10 //
11 // As written, the basic idea is that one specifies intervals that
12 // must be occupied by super- and sub-threshold energy levels, and
13 // defers decisions re onset and offset times until these
14 // specifications have been met. Three basic intervals are tested: an
15 // onset window, a speech-on window, and an offset window. We require
16 // super-threshold to exceed some minimum total durations in the onset
17 // and speech-on windows before declaring the speech onset time, and
18 // we specify a required sub-threshold residency in the offset window
19 // before declaring speech offset. As the various residency requirements are
20 // met, the EnergyEndpointer instance assumes various states, and can return the
21 // ID of these states to the client (see EpStatus below).
22 //
23 // The levels of the speech and background noise are continuously updated. It is
24 // important that the background noise level be estimated initially for
25 // robustness in noisy conditions. The first frames are assumed to be background
26 // noise and a fast update rate is used for the noise level. The duration for
27 // fast update is controlled by the fast_update_dur_ parameter.
28 //
29 // If used in noisy conditions, the endpointer should be started and run in the
30 // EnvironmentEstimation mode, for at least 200ms, before switching to
31 // UserInputMode.
32 // Audio feedback contamination can appear in the input audio, if not cut
33 // out or handled by echo cancellation. Audio feedback can trigger a false
34 // accept. The false accepts can be ignored by setting
35 // ep_contamination_rejection_period.
36
37 #ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
38 #define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
39
40 #include "base/basictypes.h"
41 #include "base/scoped_ptr.h"
42 #include "chrome/browser/speech/endpointer/energy_endpointer_params.h"
43 #include <vector>
44
45 namespace speech_input {
46
47 // Endpointer status codes, as reported by EnergyEndpointer::Status().
48 enum EpStatus {
49 EP_PRE_SPEECH = 10,
50 EP_POSSIBLE_ONSET,
51 EP_SPEECH_PRESENT,
52 EP_POSSIBLE_OFFSET,
53 EP_POST_SPEECH,
54 };
55
56 class EnergyEndpointer {
57 public:
58 // Default construction MUST be followed by a call to Init() before any
59 // other use is made of the instance.
60 EnergyEndpointer();
61 virtual ~EnergyEndpointer();
62
63 void Init(const EnergyEndpointerParams& params);
64
65 // Start the endpointer. This should be called at the beginning of a session.
66 void StartSession();
67
68 // Stop the endpointer.
69 void EndSession();
70
71 // Start environment estimation. Audio will be used for environment
72 // estimation, i.e. noise level estimation.
73 void SetEnvironmentEstimationMode();
74
75 // Start user input. This should be called when the user indicates start of
76 // input, e.g. by pressing a button.
77 void SetUserInputMode();
78
79 // Processes the next input frame and modifies the EnergyEndpointer status
80 // as appropriate based on the computation.
81 void ProcessAudioFrame(int64 time_us, const int16* samples, int num_samples);
82
83 // Returns the current state of the EnergyEndpointer and the time
84 // corresponding to the most recently computed frame.
85 EpStatus Status(int64* status_time_us) const;
86
87 private:
88 class HistoryRing;
89
90 // Resets the endpointer internal state. If reset_threshold is true, the
91 // state will be reset completely, including adaptive thresholds and the
92 // removal of all history information.
93 void Restart(bool reset_threshold);
94
95 // Update internal speech and noise levels.
96 void UpdateLevels(float rms);
97
98 // Returns the number of frames (or frame number) corresponding to
99 // the 'time' (in seconds).
100 int TimeToFrame(float time) const;
101
102 EpStatus status_; // The current state of this instance.
103 float offset_confirm_dur_sec_; // Max on time allowed to confirm POST_SPEECH.
104 int64 endpointer_time_us_; // Time of the most recently received audio frame.
105 int64 fast_update_frames_; // Number of frames for initial level adaptation.
106 int64 frame_counter_; // Number of frames seen. Used for initial adaptation.
107 float max_window_dur_; // Largest search window size (seconds).
108 float sample_rate_; // Sampling rate.
109
110 // Ring buffers to hold the speech activity history.
111 scoped_ptr<HistoryRing> history_;
112
113 // Configuration parameters.
114 EnergyEndpointerParams params_;
115
116 // RMS which must be exceeded to conclude frame is speech.
117 float decision_threshold_;
118
119 // Flag to indicate that audio should be used to estimate environment, prior
120 // to receiving user input.
121 bool estimating_environment_;
122
123 // Estimate of the background noise level. Used externally for UI feedback.
124 float noise_level_;
125
126 // An adaptive threshold used to update decision_threshold_ when appropriate.
127 float rms_adapt_;
128
129 // Start lag corresponds to the highest fundamental frequency.
130 int start_lag_;
131
132 // End lag corresponds to the lowest fundamental frequency.
133 int end_lag_;
134
135 // Time when mode switched from environment estimation to user input. This
136 // is used to time forced rejection of audio feedback contamination.
137 int64 user_input_start_time_us_;
138
139 DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);
140 };
141
142 } // namespace speech_input
143
144 #endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698