chrome/browser/speech/endpointer/endpointer.cc - Issue 3117026: Add an endpointer for detecting end of speech.

Side by Side Diff: chrome/browser/speech/endpointer/endpointer.cc

Issue 3117026: Add an endpointer for detecting end of speech. (Closed)

Patch Set: Merged with latest. Created 10 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « chrome/browser/speech/endpointer/endpointer.h ('k') | chrome/browser/speech/endpointer/endpointer_unittest.cc » ('j') | chrome/browser/speech/speech_recognizer.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "chrome/browser/speech/endpointer/endpointer.h"

	6 #include "base/time.h"

	7

	8 using base::Time;

	9

	10 namespace {

	11 static const int kFrameRate = 50; // 1 frame = 20ms of audio.

	12 }

	13

	14 namespace speech_input {

	15

	16 Endpointer::Endpointer(int sample_rate)

	17 : speech_input_possibly_complete_silence_length_us_(-1),

	18 speech_input_complete_silence_length_us_(-1),

	19 audio_frame_time_us_(0),

	20 sample_rate_(sample_rate),

	21 frame_size_(0) {

	22 Reset();

	23

	24 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate));

	25

	26 speech_input_minimum_length_us_ =

	27 static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond);

	28 speech_input_complete_silence_length_us_ =

	29 static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond);

	30 long_speech_input_complete_silence_length_us_ = -1;

	31 long_speech_length_us_ = -1;

	32 speech_input_possibly_complete_silence_length_us_ =

	33 1 * Time::kMicrosecondsPerSecond;

	34

	35 // Set the default configuration for Push To Talk mode.

	36 EnergyEndpointerParams ep_config;

	37 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));

	38 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));

	39 ep_config.set_endpoint_margin(0.2f);

	40 ep_config.set_onset_window(0.15f);

	41 ep_config.set_speech_on_window(0.4f);

	42 ep_config.set_offset_window(0.15f);

	43 ep_config.set_onset_detect_dur(0.09f);

	44 ep_config.set_onset_confirm_dur(0.075f);

	45 ep_config.set_on_maintain_dur(0.10f);

	46 ep_config.set_offset_confirm_dur(0.12f);

	47 ep_config.set_decision_threshold(1000.0f);

	48 ep_config.set_min_decision_threshold(50.0f);

	49 ep_config.set_fast_update_dur(0.2f);

	50 ep_config.set_sample_rate(static_cast<float>(sample_rate));

	51 ep_config.set_min_fundamental_frequency(57.143f);

	52 ep_config.set_max_fundamental_frequency(400.0f);

	53 ep_config.set_contamination_rejection_period(0.25f);

	54 energy_endpointer_.Init(ep_config);

	55 }

	56

	57 void Endpointer::Reset() {

	58 old_ep_status_ = EP_PRE_SPEECH;

	59 waiting_for_speech_possibly_complete_timeout_ = false;

	60 waiting_for_speech_complete_timeout_ = false;

	61 speech_previously_detected_ = false;

	62 speech_input_complete_ = false;

	63 audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer.

	64 speech_end_time_us_ = -1;

	65 speech_start_time_us_ = -1;

	66 }

	67

	68 void Endpointer::StartSession() {

	69 Reset();

	70 energy_endpointer_.StartSession();

	71 }

	72

	73 void Endpointer::EndSession() {

	74 energy_endpointer_.EndSession();

	75 }

	76

	77 void Endpointer::SetEnvironmentEstimationMode() {

	78 Reset();

	79 energy_endpointer_.SetEnvironmentEstimationMode();

	80 }

	81

	82 void Endpointer::SetUserInputMode() {

	83 energy_endpointer_.SetUserInputMode();

	84 }

	85

	86 EpStatus Endpointer::Status(int64 *time) {

	87 return energy_endpointer_.Status(time);

	88 }

	89

	90 EpStatus Endpointer::ProcessAudio(const int16* audio_data, int num_samples) {

	91 EpStatus ep_status = EP_PRE_SPEECH;

	92

	93 // Process the input data in blocks of frame_size_, dropping any incomplete

	94 // frames at the end (which is ok since typically the caller will be recording

	95 // audio in multiples of our frame size).

	96 int sample_index = 0;

	97 while (sample_index + frame_size_ <= num_samples) {

	98 // Have the endpointer process the frame.

	99 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_,

	100 audio_data + sample_index,

	101 frame_size_);

	102 sample_index += frame_size_;

	103 audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) /

	104 sample_rate_;

	105

	106 // Get the status of the endpointer.

	107 int64 ep_time;

	108 ep_status = energy_endpointer_.Status(&ep_time);

	109

	110 // Handle state changes.

	111 if ((EP_SPEECH_PRESENT == ep_status) &&

	112 (EP_POSSIBLE_ONSET == old_ep_status_)) {

	113 speech_end_time_us_ = -1;

	114 waiting_for_speech_possibly_complete_timeout_ = false;

	115 waiting_for_speech_complete_timeout_ = false;

	116 // Trigger SpeechInputDidStart event on first detection.

	117 if (false == speech_previously_detected_) {

	118 speech_previously_detected_ = true;

	119 speech_start_time_us_ = ep_time;

	120 }

	121 }

	122 if ((EP_PRE_SPEECH == ep_status) &&

	123 (EP_POSSIBLE_OFFSET == old_ep_status_)) {

	124 speech_end_time_us_ = ep_time;

	125 waiting_for_speech_possibly_complete_timeout_ = true;

	126 waiting_for_speech_complete_timeout_ = true;

	127 }

	128 if (ep_time > speech_input_minimum_length_us_) {

	129 // Speech possibly complete timeout.

	130 if ((waiting_for_speech_possibly_complete_timeout_) &&

	131 (ep_time - speech_end_time_us_ >

	132 speech_input_possibly_complete_silence_length_us_)) {

	133 waiting_for_speech_possibly_complete_timeout_ = false;

	134 }

	135 if (waiting_for_speech_complete_timeout_) {

	136 // The length of the silence timeout period can be held constant, or it

	137 // can be changed after a fixed amount of time from the beginning of

	138 // speech.

	139 bool has_stepped_silence =

	140 (long_speech_length_us_ > 0) &&

	141 (long_speech_input_complete_silence_length_us_ > 0);

	142 int64 requested_silence_length;

	143 if (has_stepped_silence &&

	144 (ep_time - speech_start_time_us_) > long_speech_length_us_) {

	145 requested_silence_length =

	146 long_speech_input_complete_silence_length_us_;

	147 } else {

	148 requested_silence_length =

	149 speech_input_complete_silence_length_us_;

	150 }

	151

	152 // Speech complete timeout.

	153 if ((ep_time - speech_end_time_us_) > requested_silence_length) {

	154 waiting_for_speech_complete_timeout_ = false;

	155 speech_input_complete_ = true;

	156 }

	157 }

	158 }

	159 old_ep_status_ = ep_status;

	160 }

	161 return ep_status;

	162 }

	163

	164 } // namespace speech_input

OLD	NEW