| Index: chrome/browser/speech/endpointer/energy_endpointer.h
|
| diff --git a/chrome/browser/speech/endpointer/energy_endpointer.h b/chrome/browser/speech/endpointer/energy_endpointer.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..5a5c76f6f2fe203b675af75472de7b0458a8afc7
|
| --- /dev/null
|
| +++ b/chrome/browser/speech/endpointer/energy_endpointer.h
|
| @@ -0,0 +1,144 @@
|
| +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +// The EnergyEndpointer class finds likely speech onset and offset points.
|
| +//
|
| +// The implementation described here is about the simplest possible.
|
| +// It is based on timings of threshold crossings for overall signal
|
| +// RMS. It is suitable for lightweight applications.
|
| +//
|
| +// As written, the basic idea is that one specifies intervals that
|
| +// must be occupied by super- and sub-threshold energy levels, and
|
| +// defers decisions re onset and offset times until these
|
| +// specifications have been met. Three basic intervals are tested: an
|
| +// onset window, a speech-on window, and an offset window. We require
|
| +// super-threshold to exceed some minimum total durations in the onset
|
| +// and speech-on windows before declaring the speech onset time, and
|
| +// we specify a required sub-threshold residency in the offset window
|
| +// before declaring speech offset. As the various residency requirements are
|
| +// met, the EnergyEndpointer instance assumes various states, and can return the
|
| +// ID of these states to the client (see EpStatus below).
|
| +//
|
| +// The levels of the speech and background noise are continuously updated. It is
|
| +// important that the background noise level be estimated initially for
|
| +// robustness in noisy conditions. The first frames are assumed to be background
|
| +// noise and a fast update rate is used for the noise level. The duration for
|
| +// fast update is controlled by the fast_update_dur_ parameter.
|
| +//
|
| +// If used in noisy conditions, the endpointer should be started and run in the
|
| +// EnvironmentEstimation mode, for at least 200ms, before switching to
|
| +// UserInputMode.
|
| +// Audio feedback contamination can appear in the input audio, if not cut
|
| +// out or handled by echo cancellation. Audio feedback can trigger a false
|
| +// accept. The false accepts can be ignored by setting
|
| +// ep_contamination_rejection_period.
|
| +
|
| +#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
|
| +#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
|
| +
|
| +#include "base/basictypes.h"
|
| +#include "base/scoped_ptr.h"
|
| +#include "chrome/browser/speech/endpointer/energy_endpointer_params.h"
|
| +#include <vector>
|
| +
|
| +namespace speech_input {
|
| +
|
| +// Endpointer status codes, numbered consecutively starting at 10.
|
| +enum EpStatus {
|
| + EP_PRE_SPEECH = 10,
|
| + EP_POSSIBLE_ONSET,
|
| + EP_SPEECH_PRESENT,
|
| + EP_POSSIBLE_OFFSET,
|
| + EP_POST_SPEECH,
|
| +};
|
| +
|
| +class EnergyEndpointer {
|
| + public:
|
| + // The default construction MUST be followed by Init(), before any
|
| + // other use can be made of the instance.
|
| + EnergyEndpointer();
|
| + virtual ~EnergyEndpointer();
|
| +
|
| + void Init(const EnergyEndpointerParams& params);
|
| +
|
| + // Start the endpointer. This should be called at the beginning of a session.
|
| + void StartSession();
|
| +
|
| + // Stop the endpointer.
|
| + void EndSession();
|
| +
|
| + // Start environment estimation. Audio will be used for environment estimation
|
| + // i.e. noise level estimation.
|
| + void SetEnvironmentEstimationMode();
|
| +
|
| + // Start user input. This should be called when the user indicates start of
|
| + // input, e.g. by pressing a button.
|
| + void SetUserInputMode();
|
| +
|
| + // Computes the next input frame and modifies EnergyEndpointer status as
|
| + // appropriate based on the computation.
|
| + void ProcessAudioFrame(int64 time_us, const int16* samples, int num_samples);
|
| +
|
| + // Returns the current state of the EnergyEndpointer and the time
|
| + // corresponding to the most recently computed frame.
|
| + EpStatus Status(int64* status_time_us) const;
|
| +
|
| + private:
|
| + class HistoryRing;
|
| +
|
| + // Resets the endpointer internal state. If reset_threshold is true, the
|
| + // state will be reset completely, including adaptive thresholds and the
|
| + // removal of all history information.
|
| + void Restart(bool reset_threshold);
|
| +
|
| + // Update internal speech and noise levels.
|
| + void UpdateLevels(float rms);
|
| +
|
| + // Returns the number of frames (or frame number) corresponding to
|
| + // the 'time' (in seconds).
|
| + int TimeToFrame(float time) const;
|
| +
|
| + EpStatus status_; // The current state of this instance.
|
| + float offset_confirm_dur_sec_; // Max on time allowed to confirm POST_SPEECH.
|
| + int64 endpointer_time_us_; // Time of the most recently received audio frame.
|
| + int64 fast_update_frames_; // Number of frames for initial level adaptation.
|
| + int64 frame_counter_; // Number of frames seen. Used for initial adaptation.
|
| + float max_window_dur_; // Largest search window size (seconds).
|
| + float sample_rate_; // Sampling rate.
|
| +
|
| + // Ring buffers to hold the speech activity history.
|
| + scoped_ptr<HistoryRing> history_;
|
| +
|
| + // Configuration parameters.
|
| + EnergyEndpointerParams params_;
|
| +
|
| + // RMS which must be exceeded to conclude frame is speech.
|
| + float decision_threshold_;
|
| +
|
| + // Flag to indicate that audio should be used to estimate environment, prior to
|
| + // receiving user input.
|
| + bool estimating_environment_;
|
| +
|
| + // Estimate of the background noise level. Used externally for UI feedback.
|
| + float noise_level_;
|
| +
|
| + // An adaptive threshold used to update decision_threshold_ when appropriate.
|
| + float rms_adapt_;
|
| +
|
| + // Start lag corresponds to the highest fundamental frequency.
|
| + int start_lag_;
|
| +
|
| + // End lag corresponds to the lowest fundamental frequency.
|
| + int end_lag_;
|
| +
|
| + // Time when mode switched from environment estimation to user input. This
|
| + // is used to time forced rejection of audio feedback contamination.
|
| + int64 user_input_start_time_us_;
|
| +
|
| + DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);
|
| +};
|
| +
|
| +} // namespace speech_input
|
| +
|
| +#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
|
|
|