Index: content/browser/speech/google_one_shot_remote_engine.cc
diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/google_one_shot_remote_engine.cc
similarity index 50%
rename from content/browser/speech/speech_recognition_request.cc
rename to content/browser/speech/google_one_shot_remote_engine.cc
index a14369976d45ff0c267ef0af7221f4cac547fc32..09c2b28d087cb2cdeaa8aebc9d038c9472b1b001 100644
--- a/content/browser/speech/speech_recognition_request.cc
+++ b/content/browser/speech/google_one_shot_remote_engine.cc
@@ -2,16 +2,18 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "content/browser/speech/speech_recognition_request.h"
+#include "content/browser/speech/google_one_shot_remote_engine.h"
 
 #include <vector>
 
 #include "base/json/json_reader.h"
+#include "base/memory/scoped_ptr.h"
 #include "base/string_number_conversions.h"
 #include "base/string_util.h"
 #include "base/values.h"
 #include "content/browser/speech/audio_buffer.h"
 #include "content/common/net/url_fetcher_impl.h"
+#include "content/public/common/speech_recognition_error.h"
 #include "content/public/common/speech_recognition_result.h"
 #include "net/base/escape.h"
 #include "net/base/load_flags.h"
@@ -19,6 +21,10 @@
 #include "net/url_request/url_request_context_getter.h"
 #include "net/url_request/url_request_status.h"
 
+using content::SpeechRecognitionError;
+using content::SpeechRecognitionHypothesis;
+using content::SpeechRecognitionResult;
+
 namespace {
 
 const char* const kDefaultSpeechRecognitionUrl =
@@ -27,13 +33,20 @@ const char* const kStatusString = "status";
 const char* const kHypothesesString = "hypotheses";
 const char* const kUtteranceString = "utterance";
 const char* const kConfidenceString = "confidence";
-
+const int kWebServiceStatusNoError = 0;
+const int kWebServiceStatusNoSpeech = 4;
+const int kWebServiceStatusNoMatch = 5;
+const int kDefaultConfigSampleRate = 8000;
+const int kDefaultConfigBitsPerSample = 16;
+const speech::AudioEncoder::Codec kDefaultAudioCodec =
+    speech::AudioEncoder::CODEC_FLAC;
 // TODO(satish): Remove this hardcoded value once the page is allowed to
 // set this via an attribute.
 const int kMaxResults = 6;
 
 bool ParseServerResponse(const std::string& response_body,
-                         content::SpeechRecognitionResult* result) {
+                         SpeechRecognitionResult* result,
+                         SpeechRecognitionError* error) {
   if (response_body.empty()) {
     LOG(WARNING) << "ParseServerResponse: Response was empty.";
     return false;
@@ -67,19 +80,21 @@ bool ParseServerResponse(const std::string& response_body,
 
   // Process the status.
   switch (status) {
-    case content::SPEECH_RECOGNITION_ERROR_NONE:
-    case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:
-    case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:
-      break;
-
-    default:
-      // Other status codes should not be returned by the server.
-      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
-      return false;
+    case kWebServiceStatusNoError:
+      break;
+    case kWebServiceStatusNoSpeech:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;
+      return false;
+    case kWebServiceStatusNoMatch:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;
+      return false;
+    default:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;
+      // Other status codes should not be returned by the server.
+      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
+      return false;
   }
 
-  result->error = static_cast<content::SpeechRecognitionErrorCode>(status);
-
   // Get the hypotheses.
   Value* hypotheses_value = NULL;
   if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
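
The switch above is the substantive change in this hunk: the old code cast the
server's numeric status directly into content::SpeechRecognitionErrorCode,
whereas the new code treats the wire values (0, 4, 5) as a separate vocabulary
and maps them onto the enum explicitly, reporting failures through the new
error out-parameter. A minimal sketch of the resulting contract; the JSON
bodies are assumptions inferred from the kStatusString, kHypothesesString,
kUtteranceString and kConfidenceString keys defined earlier:

// Sketch only, not part of the patch.
SpeechRecognitionResult result;
SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NONE);

// Web-service status 0: parsing proceeds to the hypotheses, returns true.
ParseServerResponse(
    "{\"status\":0,"
    "\"hypotheses\":[{\"utterance\":\"hello world\",\"confidence\":0.9}]}",
    &result, &error);

// Web-service status 4: returns false and reports a specific error code,
// instead of smuggling the raw status through result->error as before.
ParseServerResponse("{\"status\":4}", &result, &error);
// error.code is now content::SPEECH_RECOGNITION_ERROR_NO_SPEECH.
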
@@ -96,6 +111,8 @@ bool ParseServerResponse(const std::string& response_body,
 
   const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);
 
+  // For now we support only single shot recognition, so we are giving only a
+  // final result, consisting of one fragment (with one or more hypotheses).
   size_t index = 0;
   for (; index < hypotheses_list->GetSize(); ++index) {
     Value* hypothesis = NULL;
@@ -113,6 +130,7 @@ bool ParseServerResponse(const std::string& response_body,
     const DictionaryValue* hypothesis_value =
         static_cast<DictionaryValue*>(hypothesis);
     string16 utterance;
+
     if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
       LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
       break;
@@ -121,16 +139,14 @@ bool ParseServerResponse(const std::string& response_body,
     // It is not an error if the 'confidence' field is missing.
     double confidence = 0.0;
     hypothesis_value->GetDouble(kConfidenceString, &confidence);
-
-    result->hypotheses.push_back(content::SpeechRecognitionHypothesis(
-        utterance, confidence));
+    result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
+                                                             confidence));
   }
 
   if (index < hypotheses_list->GetSize()) {
     result->hypotheses.clear();
     return false;
   }
-
   return true;
 }
 
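
Note the all-or-nothing contract the loop's tail enforces: a single malformed
entry makes the loop break early, after which the hypotheses collected so far
are discarded and the parse fails, leaving error at whatever default the
caller chose (OnURLFetchComplete below initializes it to the network error).
A sketch under the same assumptions as the previous one:

// Sketch only: one hypothesis missing its "utterance" key invalidates the
// whole response, not just that entry.
SpeechRecognitionResult result;
SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);
bool ok = ParseServerResponse(
    "{\"status\":0,\"hypotheses\":["
    "{\"utterance\":\"first\",\"confidence\":0.8},"
    "{\"confidence\":0.5}]}",  // second entry has no utterance
    &result, &error);
// ok == false and result.hypotheses is empty.
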
@@ -138,28 +154,34 @@ bool ParseServerResponse(const std::string& response_body,
 
 namespace speech {
 
-int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;
+const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
+int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
 
-SpeechRecognitionRequest::SpeechRecognitionRequest(
-    net::URLRequestContextGetter* context, Delegate* delegate)
-    : url_context_(context),
-      delegate_(delegate) {
-  DCHECK(delegate);
+GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig()
+    : filter_profanities(false),
+      audio_sample_rate(kDefaultConfigSampleRate),
+      audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {
 }
 
-SpeechRecognitionRequest::~SpeechRecognitionRequest() {}
+GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {}
 
-void SpeechRecognitionRequest::Start(const std::string& language,
-                                     const std::string& grammar,
-                                     bool filter_profanities,
-                                     const std::string& hardware_info,
-                                     const std::string& origin_url,
-                                     const std::string& content_type) {
-  DCHECK(!url_fetcher_.get());
+GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
+    net::URLRequestContextGetter* context)
+    : url_context_(context) {
+}
 
-  std::vector<std::string> parts;
+GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
+
+void GoogleOneShotRemoteEngine::SetConfig(
+    const GoogleOneShotRemoteEngineConfig& config) {
+  config_ = config;
+}
+
+void GoogleOneShotRemoteEngine::StartRecognition() {
+  DCHECK(delegate());
+  DCHECK(!url_fetcher_.get());
+  std::string lang_param = config_.language;
 
-  std::string lang_param = language;
   if (lang_param.empty() && url_context_) {
     // If no language is provided then we use the first from the accepted
     // language list. If this list is empty then it defaults to "en-US".
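
The six positional parameters of the old Start() are replaced by a config
object set up front. A sketch of populating it; the field values here are
illustrative, not taken from the patch:

// Sketch only, not part of the patch.
GoogleOneShotRemoteEngineConfig config;
config.language = "";               // empty: fall back to the context's
                                    // accept-language list, then to "en-US"
config.grammar = "some-grammar";    // hypothetical; sent as lm= when non-empty
config.hardware_info = "";          // sent as xhw= only when non-empty
config.origin_url = "https://example.com/";  // becomes the request referrer
config.filter_profanities = true;   // pfilter=2 rather than pfilter=0
// audio_sample_rate (8000) and audio_num_bits_per_sample (16) keep the
// defaults assigned in the constructor above.
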
@@ -171,58 +193,108 @@ void SpeechRecognitionRequest::Start(const std::string& language,
     size_t separator = accepted_language_list.find_first_of(",;");
     lang_param = accepted_language_list.substr(0, separator);
   }
+
   if (lang_param.empty())
     lang_param = "en-US";
+
+  std::vector<std::string> parts;
   parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));
 
-  if (!grammar.empty())
-    parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));
-  if (!hardware_info.empty())
-    parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));
+  if (!config_.grammar.empty())
+    parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));
+
+  if (!config_.hardware_info.empty())
+    parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
+                                                        true));
   parts.push_back("maxresults=" + base::IntToString(kMaxResults));
-  parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");
+  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");
 
   GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));
 
+  encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
+                                      config_.audio_sample_rate,
+                                      config_.audio_num_bits_per_sample));
+  DCHECK(encoder_.get());
   url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,
                                             url,
                                             URLFetcherImpl::POST,
                                             this));
-  url_fetcher_->SetChunkedUpload(content_type);
+  url_fetcher_->SetChunkedUpload(encoder_->mime_type());
   url_fetcher_->SetRequestContext(url_context_);
-  url_fetcher_->SetReferrer(origin_url);
+  url_fetcher_->SetReferrer(config_.origin_url);
 
   // The speech recognition API does not require user identification as part
   // of requests, so we don't send cookies or auth data for these requests to
   // prevent any accidental connection between users who are logged into the
   // domain for other services (e.g. bookmark sync) with the speech requests.
-  url_fetcher_->SetLoadFlags(
-      net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |
-      net::LOAD_DO_NOT_SEND_AUTH_DATA);
+  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
+                             net::LOAD_DO_NOT_SEND_COOKIES |
+                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
   url_fetcher_->Start();
 }
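
For reference, with the illustrative config above (empty grammar and
hardware_info), JoinString(parts, '&') produces a query string such as:

    lang=en-US&maxresults=6&pfilter=2

which is concatenated directly onto kDefaultSpeechRecognitionUrl, whose value
is truncated out of the hunk context earlier in this diff.
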
 
-void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,
-                                                bool is_last_chunk) {
+void GoogleOneShotRemoteEngine::EndRecognition() {
+  url_fetcher_.reset();
+}
+
+void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
+  DCHECK(url_fetcher_.get());
+  DCHECK(encoder_.get());
+  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
+  encoder_->Encode(data);
+  scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
+  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
+}
+
+void GoogleOneShotRemoteEngine::AudioChunksEnded() {
   DCHECK(url_fetcher_.get());
-  url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);
+  DCHECK(encoder_.get());
+
+  // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
+  // of silence in case encoder had no data already.
+  std::vector<int16> samples(
+      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
+  AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),
+                         samples.size() * sizeof(int16),
+                         encoder_->bits_per_sample() / 8);
+  encoder_->Encode(dummy_chunk);
+  encoder_->Flush();
+  scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());
+  DCHECK(!encoded_dummy_data->IsEmpty());
+  encoder_.reset();
+
+  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
 }
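
With the default 8000 Hz, 16-bit configuration, the silence packet encoded
above works out to:

    8000 samples/s * (100 ms / 1000) = 800 samples
    800 samples * 2 bytes/sample     = 1600 bytes of PCM silence

which is FLAC-encoded, flushed, and appended as the final chunk (the 'true'
argument), satisfying the non-empty final buffer requirement noted in the
comment.
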
 
-void SpeechRecognitionRequest::OnURLFetchComplete(
+void GoogleOneShotRemoteEngine::OnURLFetchComplete(
     const content::URLFetcher* source) {
   DCHECK_EQ(url_fetcher_.get(), source);
-
-  content::SpeechRecognitionResult result;
+  SpeechRecognitionResult result;
+  SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);
   std::string data;
-  if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 ||
-      !source->GetResponseAsString(&data) ||
-      !ParseServerResponse(data, &result)) {
-    result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;
-  }
 
-  DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";
+  // The default error code in case of parse errors is NETWORK_FAILURE, however
+  // ParseServerResponse can change the error to a more appropriate one.
+  bool error_occurred = (!source->GetStatus().is_success() ||
+                         source->GetResponseCode() != 200 ||
+                         !source->GetResponseAsString(&data) ||
+                         !ParseServerResponse(data, &result, &error));
   url_fetcher_.reset();
-  delegate_->SetRecognitionResult(result);
+  if (error_occurred) {
+    DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
+    delegate()->OnSpeechRecognitionEngineError(error);
+  } else {
+    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
+    delegate()->OnSpeechRecognitionEngineResult(result);
+  }
+}
+
+bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
+  return url_fetcher_ != NULL;
+}
+
+int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
+  return kAudioPacketIntervalMs;
 }
 
 }  // namespace speech
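
Taken together, the renamed engine exposes a chunked-upload lifecycle in place
of the old one-call Start(). A minimal usage sketch, not part of the patch:
url_context and audio_chunk are hypothetical stand-ins, and the delegate is
attached through the engine's base interface, which is declared in the header
rather than in this file and must implement the two
OnSpeechRecognitionEngine{Result,Error} callbacks invoked above.

// Sketch only, under the assumptions stated above.
GoogleOneShotRemoteEngineConfig config;
config.language = "en-US";

scoped_ptr<GoogleOneShotRemoteEngine> engine(
    new GoogleOneShotRemoteEngine(url_context));
engine->SetConfig(config);
engine->StartRecognition();     // creates the encoder, opens the chunked POST

// Deliver raw PCM in GetDesiredAudioChunkDurationMs() == 100 ms chunks; each
// is FLAC-encoded and appended to the upload.
engine->TakeAudioChunk(audio_chunk);

engine->AudioChunksEnded();     // pads with silence, sends the final chunk
// IsRecognitionPending() remains true until OnURLFetchComplete() fires and
// the delegate receives either a result or an error.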