content/browser/speech/google_one_shot_remote_engine.cc - Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3)

Unified Diff: content/browser/speech/google_one_shot_remote_engine.cc

Issue 9663066: Refactoring of chrome speech recognition architecture (CL1.3) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Fixed according to Satish's (partial) review. Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« content/browser/speech/google_one_shot_remote_engine.h ('K') | « content/browser/speech/google_one_shot_remote_engine.h ('k') | content/browser/speech/google_one_shot_remote_engine_unittest.cc » ('j') | content/browser/speech/speech_recognition_engine.h » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: content/browser/speech/google_one_shot_remote_engine.cc

diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/google_one_shot_remote_engine.cc

similarity index 51%

rename from content/browser/speech/speech_recognition_request.cc

rename to content/browser/speech/google_one_shot_remote_engine.cc

index a14369976d45ff0c267ef0af7221f4cac547fc32..7cc7739f60d3f1ba9c6da793c2b30c07c291de52 100644

--- a/content/browser/speech/speech_recognition_request.cc

+++ b/content/browser/speech/google_one_shot_remote_engine.cc

@@ -2,11 +2,12 @@

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

-#include "content/browser/speech/speech_recognition_request.h"

+#include "content/browser/speech/google_one_shot_remote_engine.h"

#include <vector>

#include "base/json/json_reader.h"

+#include "base/memory/scoped_ptr.h"

#include "base/string_number_conversions.h"

#include "base/string_util.h"

#include "base/values.h"

@@ -19,6 +20,10 @@

#include "net/url_request/url_request_context_getter.h"

#include "net/url_request/url_request_status.h"

+using content::SpeechRecognitionError;

+using content::SpeechRecognitionHypothesis;

+using content::SpeechRecognitionResult;

namespace {

const char* const kDefaultSpeechRecognitionUrl =

@@ -27,13 +32,19 @@ const char* const kStatusString = "status";

const char* const kHypothesesString = "hypotheses";

const char* const kUtteranceString = "utterance";

const char* const kConfidenceString = "confidence";

+const int kGoogleSSFEStatusNoError = 0;

Satish 2012/03/21 13:29:48 SSFE to something else?

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+const int kGoogleSSFEStatusNoSpeech = 4;

+const int kGoogleSSFEStatusNoMatch = 5;

+const int kDefaultConfigSampleRate = 8000;

+const int kDefaultConfigBPS = 16;

Satish 2012/03/21 13:29:48 BPS -> BitsPerSample

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+const speech::AudioEncoder::Codec kCodec = speech::AudioEncoder::CODEC_FLAC;

Satish 2012/03/21 13:29:48 kCodec > kAudioCodec or kDefaultAudioCodec

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

// TODO(satish): Remove this hardcoded value once the page is allowed to

// set this via an attribute.

const int kMaxResults = 6;

bool ParseServerResponse(const std::string& response_body,

- content::SpeechRecognitionResult* result) {

+ SpeechRecognitionResult* result,

+ SpeechRecognitionError* error) {

if (response_body.empty()) {

LOG(WARNING) << "ParseServerResponse: Response was empty.";

return false;

@@ -67,19 +78,22 @@ bool ParseServerResponse(const std::string& response_body,

// Process the status.

switch (status) {

- case content::SPEECH_RECOGNITION_ERROR_NONE:

- case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:

- case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:

- break;

- default:

- // Other status codes should not be returned by the server.

- VLOG(1) << "ParseServerResponse: unexpected status code " << status;

- return false;

+ case kGoogleSSFEStatusNoError:

+ error->code = content::SPEECH_RECOGNITION_ERROR_NONE;

+ break;

+ case kGoogleSSFEStatusNoSpeech:

+ error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;

+ return false;

+ case kGoogleSSFEStatusNoMatch:

+ error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;

+ return false;

+ default:

+ error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;

+ // Other status codes should not be returned by the server.

+ VLOG(1) << "ParseServerResponse: unexpected status code " << status;

+ return false;

}

- result->error = static_cast<content::SpeechRecognitionErrorCode>(status);

// Get the hypotheses.

Value* hypotheses_value = NULL;

if (!response_object->Get(kHypothesesString, &hypotheses_value)) {

@@ -95,7 +109,8 @@ bool ParseServerResponse(const std::string& response_body,

}

const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);

+ // For now we support only single shot recognition, so we are giving only a

+ // final result, consisting of one fragment (with one or more hypotheses).

size_t index = 0;

for (; index < hypotheses_list->GetSize(); ++index) {

Value* hypothesis = NULL;

@@ -113,6 +128,7 @@ bool ParseServerResponse(const std::string& response_body,

const DictionaryValue* hypothesis_value =

static_cast<DictionaryValue*>(hypothesis);

string16 utterance;

if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {

LOG(WARNING) << "ParseServerResponse: Missing utterance value.";

break;

@@ -121,16 +137,14 @@ bool ParseServerResponse(const std::string& response_body,

// It is not an error if the 'confidence' field is missing.

double confidence = 0.0;

hypothesis_value->GetDouble(kConfidenceString, &confidence);

- result->hypotheses.push_back(content::SpeechRecognitionHypothesis(

- utterance, confidence));

+ result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,

+ confidence));

}

if (index < hypotheses_list->GetSize()) {

result->hypotheses.clear();

return false;

}

return true;

}

@@ -138,28 +152,35 @@ bool ParseServerResponse(const std::string& response_body,

namespace speech {

-int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;

+const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;

+int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;

-SpeechRecognitionRequest::SpeechRecognitionRequest(

- net::URLRequestContextGetter* context, Delegate* delegate)

- : url_context_(context),

- delegate_(delegate) {

- DCHECK(delegate);

+GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig()

+ : filter_profanities(false),

+ audio_sample_rate(kDefaultConfigSampleRate),

+ audio_num_bits_per_sample(kDefaultConfigBPS) {

}

-SpeechRecognitionRequest::~SpeechRecognitionRequest() {}

+GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {}

-void SpeechRecognitionRequest::Start(const std::string& language,

- const std::string& grammar,

- bool filter_profanities,

- const std::string& hardware_info,

- const std::string& origin_url,

- const std::string& content_type) {

- DCHECK(!url_fetcher_.get());

+GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(

+ net::URLRequestContextGetter* context)

+ : url_context_(context) {

+GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}

+void GoogleOneShotRemoteEngine::SetConfiguration(

Satish 2012/03/21 13:29:48 since the type name is Config how about renaming t

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+ const GoogleOneShotRemoteEngineConfig& config) {

+ config_ = config;

+void GoogleOneShotRemoteEngine::Initialize() {

Satish 2012/03/21 13:29:48 Reading this I got confused a bit why Initialize w

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+ DCHECK(delegate());

+ DCHECK(!url_fetcher_.get());

std::vector<std::string> parts;

Satish 2012/03/21 13:29:48 move this to line 199 where it is first used

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+ std::string lang_param = config_.language;

- std::string lang_param = language;

if (lang_param.empty() && url_context_) {

// If no language is provided then we use the first from the accepted

// language list. If this list is empty then it defaults to "en-US".

@@ -171,58 +192,105 @@ void SpeechRecognitionRequest::Start(const std::string& language,

size_t separator = accepted_language_list.find_first_of(",;");

lang_param = accepted_language_list.substr(0, separator);

}

if (lang_param.empty())

lang_param = "en-US";

parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

- if (!grammar.empty())

- parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));

- if (!hardware_info.empty())

- parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));

+ if (!config_.grammar.empty())

+ parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));

+ if (!config_.hardware_info.empty())

+ parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,

+ true));

parts.push_back("maxresults=" + base::IntToString(kMaxResults));

- parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");

+ parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

+ encoder_.reset(AudioEncoder::Create(kCodec, config_.audio_sample_rate,

+ config_.audio_num_bits_per_sample));

+ DCHECK(encoder_.get());

url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,

url,

URLFetcherImpl::POST,

this));

- url_fetcher_->SetChunkedUpload(content_type);

+ url_fetcher_->SetChunkedUpload(encoder_->mime_type());

url_fetcher_->SetRequestContext(url_context_);

- url_fetcher_->SetReferrer(origin_url);

+ url_fetcher_->SetReferrer(config_.origin_url);

// The speech recognition API does not require user identification as part

// of requests, so we don't send cookies or auth data for these requests to

// prevent any accidental connection between users who are logged into the

// domain for other services (e.g. bookmark sync) with the speech requests.

- url_fetcher_->SetLoadFlags(

- net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |

- net::LOAD_DO_NOT_SEND_AUTH_DATA);

+ url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |

+ net::LOAD_DO_NOT_SEND_COOKIES |

+ net::LOAD_DO_NOT_SEND_AUTH_DATA);

url_fetcher_->Start();

}

-void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,

- bool is_last_chunk) {

+// Resets url_fetcher_; not restricted to the case where results have already been retrieved.

Satish 2012/03/21 13:29:48 this comment isn't correct as this could be called

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+void GoogleOneShotRemoteEngine::Cleanup() {

+ url_fetcher_.reset();

+void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {

+ DCHECK(url_fetcher_.get());

+ DCHECK(encoder_.get());

+ DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);

+ encoder_->Encode(data);

+ scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());

+ url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);

+void GoogleOneShotRemoteEngine::AudioChunksEnded() {

DCHECK(url_fetcher_.get());

- url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);

+ DCHECK(encoder_.get());

+ // AppendChunkToUpload requires a non-empty final buffer. So we encode a packet

Satish 2012/03/21 13:29:48 suggest leaving a newline above full line comments

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+ // of silence in case encoder had no data already.

+ std::vector<int16> samples(

+ config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);

+ AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),

+ samples.size() * sizeof(int16),

+ encoder_->bits_per_sample() / 8);

+ encoder_->Encode(dummy_chunk);

+ encoder_->Flush();

+ scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());

+ DCHECK(!encoded_dummy_data->IsEmpty());

+ encoder_.reset();

+ url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);

}

-void SpeechRecognitionRequest::OnURLFetchComplete(

+void GoogleOneShotRemoteEngine::OnURLFetchComplete(

const content::URLFetcher* source) {

DCHECK_EQ(url_fetcher_.get(), source);

- content::SpeechRecognitionResult result;

+ SpeechRecognitionResult result;

+ SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);

std::string data;

- if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 ||

+ // The default error code in case of parse errors is SPEECH_RECOGNITION_ERROR_NETWORK, however

Satish 2012/03/21 13:29:48 ditto

Primiano Tucci (use gerrit) 2012/03/22 11:20:41 Done.

+ // ParseServerResponse can change the error to a more appropriate one.

+ if (!source->GetStatus().is_success() ||

+ source->GetResponseCode() != 200 ||

!source->GetResponseAsString(&data) ||

- !ParseServerResponse(data, &result)) {

- result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;

+ !ParseServerResponse(data, &result, &error)) {

+ DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;

+ delegate()->OnSpeechEngineError(error);

+ } else {

+ DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";

+ delegate()->OnSpeechEngineResult(result);

}

- DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";

url_fetcher_.reset();

- delegate_->SetRecognitionResult(result);

+bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {

+ return url_fetcher_ != NULL;

+int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {

+ return kAudioPacketIntervalMs;

}

} // namespace speech