Index: content/browser/speech/google_one_shot_remote_engine.cc
diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/google_one_shot_remote_engine.cc
similarity index 50%
rename from content/browser/speech/speech_recognition_request.cc
rename to content/browser/speech/google_one_shot_remote_engine.cc
index a14369976d45ff0c267ef0af7221f4cac547fc32..09c2b28d087cb2cdeaa8aebc9d038c9472b1b001 100644
--- a/content/browser/speech/speech_recognition_request.cc
+++ b/content/browser/speech/google_one_shot_remote_engine.cc
@@ -2,16 +2,18 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
-#include "content/browser/speech/speech_recognition_request.h"
+#include "content/browser/speech/google_one_shot_remote_engine.h"
 #include <vector>
 #include "base/json/json_reader.h"
+#include "base/memory/scoped_ptr.h"
 #include "base/string_number_conversions.h"
 #include "base/string_util.h"
 #include "base/values.h"
 #include "content/browser/speech/audio_buffer.h"
 #include "content/common/net/url_fetcher_impl.h"
+#include "content/public/common/speech_recognition_error.h"
 #include "content/public/common/speech_recognition_result.h"
 #include "net/base/escape.h"
 #include "net/base/load_flags.h"
@@ -19,6 +21,10 @@
 #include "net/url_request/url_request_context_getter.h"
 #include "net/url_request/url_request_status.h"
+using content::SpeechRecognitionError;
+using content::SpeechRecognitionHypothesis;
+using content::SpeechRecognitionResult;
+
 namespace {
 const char* const kDefaultSpeechRecognitionUrl =
@@ -27,13 +33,20 @@ const char* const kStatusString = "status";
 const char* const kHypothesesString = "hypotheses";
 const char* const kUtteranceString = "utterance";
 const char* const kConfidenceString = "confidence";
-
+const int kWebServiceStatusNoError = 0;
+const int kWebServiceStatusNoSpeech = 4;
+const int kWebServiceStatusNoMatch = 5;
+const int kDefaultConfigSampleRate = 8000;
+const int kDefaultConfigBitsPerSample = 16;
+const speech::AudioEncoder::Codec kDefaultAudioCodec =
+    speech::AudioEncoder::CODEC_FLAC;
 // TODO(satish): Remove this hardcoded value once the page is allowed to
 // set this via an attribute.
 const int kMaxResults = 6;
 bool ParseServerResponse(const std::string& response_body,
-                         content::SpeechRecognitionResult* result) {
+                         SpeechRecognitionResult* result,
+                         SpeechRecognitionError* error) {
   if (response_body.empty()) {
     LOG(WARNING) << "ParseServerResponse: Response was empty.";
     return false;
@@ -67,19 +80,21 @@ bool ParseServerResponse(const std::string& response_body,
   // Process the status.
   switch (status) {
-    case content::SPEECH_RECOGNITION_ERROR_NONE:
-    case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:
-    case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:
-      break;
-
-    default:
-      // Other status codes should not be returned by the server.
-      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
-      return false;
+    case kWebServiceStatusNoError:
+      break;
+    case kWebServiceStatusNoSpeech:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;
+      return false;
+    case kWebServiceStatusNoMatch:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;
+      return false;
+    default:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;
+      // Other status codes should not be returned by the server.
+      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
+      return false;
   }
-  result->error = static_cast<content::SpeechRecognitionErrorCode>(status);
-
   // Get the hypotheses.
   Value* hypotheses_value = NULL;
   if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
@@ -96,6 +111,8 @@ bool ParseServerResponse(const std::string& response_body,
   const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);
+  // For now we support only single shot recognition, so we are giving only a
+  // final result, consisting of one fragment (with one or more hypotheses).
   size_t index = 0;
   for (; index < hypotheses_list->GetSize(); ++index) {
     Value* hypothesis = NULL;
@@ -113,6 +130,7 @@ bool ParseServerResponse(const std::string& response_body,
     const DictionaryValue* hypothesis_value =
         static_cast<DictionaryValue*>(hypothesis);
     string16 utterance;
+
     if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
       LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
       break;
@@ -121,16 +139,14 @@ bool ParseServerResponse(const std::string& response_body,
     // It is not an error if the 'confidence' field is missing.
     double confidence = 0.0;
     hypothesis_value->GetDouble(kConfidenceString, &confidence);
-
-    result->hypotheses.push_back(content::SpeechRecognitionHypothesis(
-        utterance, confidence));
+    result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
+                                                             confidence));
   }
   if (index < hypotheses_list->GetSize()) {
     result->hypotheses.clear();
     return false;
   }
-
   return true;
 }
@@ -138,28 +154,34 @@ bool ParseServerResponse(const std::string& response_body,
 namespace speech {
-int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;
+const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
+int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
-SpeechRecognitionRequest::SpeechRecognitionRequest(
-    net::URLRequestContextGetter* context, Delegate* delegate)
-    : url_context_(context),
-      delegate_(delegate) {
-  DCHECK(delegate);
+GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig()
+    : filter_profanities(false),
+      audio_sample_rate(kDefaultConfigSampleRate),
+      audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {
 }
-SpeechRecognitionRequest::~SpeechRecognitionRequest() {}
+GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {}
-void SpeechRecognitionRequest::Start(const std::string& language,
-                                     const std::string& grammar,
-                                     bool filter_profanities,
-                                     const std::string& hardware_info,
-                                     const std::string& origin_url,
-                                     const std::string& content_type) {
-  DCHECK(!url_fetcher_.get());
+GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
+    net::URLRequestContextGetter* context)
+    : url_context_(context) {
+}
-  std::vector<std::string> parts;
+GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
+
+void GoogleOneShotRemoteEngine::SetConfig(
+    const GoogleOneShotRemoteEngineConfig& config) {
+  config_ = config;
+}
+
+void GoogleOneShotRemoteEngine::StartRecognition() {
+  DCHECK(delegate());
+  DCHECK(!url_fetcher_.get());
+  std::string lang_param = config_.language;
-  std::string lang_param = language;
   if (lang_param.empty() && url_context_) {
     // If no language is provided then we use the first from the accepted
     // language list. If this list is empty then it defaults to "en-US".
@@ -171,58 +193,108 @@ void SpeechRecognitionRequest::Start(const std::string& language,
     size_t separator = accepted_language_list.find_first_of(",;");
     lang_param = accepted_language_list.substr(0, separator);
   }
+
   if (lang_param.empty())
     lang_param = "en-US";
+
+  std::vector<std::string> parts;
   parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));
-  if (!grammar.empty())
-    parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));
-  if (!hardware_info.empty())
-    parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));
+  if (!config_.grammar.empty())
+    parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));
+
+  if (!config_.hardware_info.empty())
+    parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
+                                                        true));
   parts.push_back("maxresults=" + base::IntToString(kMaxResults));
-  parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");
+  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");
   GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));
+  encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
+                                      config_.audio_sample_rate,
+                                      config_.audio_num_bits_per_sample));
+  DCHECK(encoder_.get());
   url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,
                                             url,
                                             URLFetcherImpl::POST,
                                             this));
-  url_fetcher_->SetChunkedUpload(content_type);
+  url_fetcher_->SetChunkedUpload(encoder_->mime_type());
   url_fetcher_->SetRequestContext(url_context_);
-  url_fetcher_->SetReferrer(origin_url);
+  url_fetcher_->SetReferrer(config_.origin_url);
   // The speech recognition API does not require user identification as part
   // of requests, so we don't send cookies or auth data for these requests to
   // prevent any accidental connection between users who are logged into the
   // domain for other services (e.g. bookmark sync) with the speech requests.
-  url_fetcher_->SetLoadFlags(
-      net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |
-      net::LOAD_DO_NOT_SEND_AUTH_DATA);
+  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
+                             net::LOAD_DO_NOT_SEND_COOKIES |
+                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
   url_fetcher_->Start();
 }
-void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,
-                                                bool is_last_chunk) {
+void GoogleOneShotRemoteEngine::EndRecognition() {
+  url_fetcher_.reset();
+}
+
+void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
+  DCHECK(url_fetcher_.get());
+  DCHECK(encoder_.get());
+  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
+  encoder_->Encode(data);
+  scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
+  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
+}
+
+void GoogleOneShotRemoteEngine::AudioChunksEnded() {
   DCHECK(url_fetcher_.get());
-  url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);
+  DCHECK(encoder_.get());
+
+  // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
+  // of silence in case encoder had no data already.
+  std::vector<int16> samples(
+      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
+  AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),
+                         samples.size() * sizeof(int16),
+                         encoder_->bits_per_sample() / 8);
+  encoder_->Encode(dummy_chunk);
+  encoder_->Flush();
+  scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());
+  DCHECK(!encoded_dummy_data->IsEmpty());
+  encoder_.reset();
+
+  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
 }
-void SpeechRecognitionRequest::OnURLFetchComplete(
+void GoogleOneShotRemoteEngine::OnURLFetchComplete(
     const content::URLFetcher* source) {
   DCHECK_EQ(url_fetcher_.get(), source);
-
-  content::SpeechRecognitionResult result;
+  SpeechRecognitionResult result;
+  SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);
   std::string data;
-  if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 ||
-      !source->GetResponseAsString(&data) ||
-      !ParseServerResponse(data, &result)) {
-    result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;
-  }
-  DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";
+  // The default error code in case of parse errors is NETWORK_FAILURE, however
+  // ParseServerResponse can change the error to a more appropriate one.
+  bool error_occurred = (!source->GetStatus().is_success() ||
+                         source->GetResponseCode() != 200 ||
+                         !source->GetResponseAsString(&data) ||
+                         !ParseServerResponse(data, &result, &error));
   url_fetcher_.reset();
-  delegate_->SetRecognitionResult(result);
+  if (error_occurred) {
+    DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
+    delegate()->OnSpeechRecognitionEngineError(error);
+  } else {
+    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
+    delegate()->OnSpeechRecognitionEngineResult(result);
+  }
+}
+
+bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
+  return url_fetcher_ != NULL;
+}
+
+int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
+  return kAudioPacketIntervalMs;
 }
 }  // namespace speech
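
For reference, the status handling that this patch introduces in ParseServerResponse reduces to a small mapping from the web service's numeric "status" field to recognition errors. The sketch below is a minimal, self-contained C++ rendering of that switch; it is not part of the patch, and the RecognitionError enum and MapServerStatus are illustrative placeholders for the content::SpeechRecognitionErrorCode constants and the patched code above.

// Standalone sketch of the web-service status mapping used by
// ParseServerResponse above. Names are illustrative, not Chromium API.
#include <iostream>

enum RecognitionError {
  RECOGNITION_ERROR_NONE,
  RECOGNITION_ERROR_NO_SPEECH,  // content::SPEECH_RECOGNITION_ERROR_NO_SPEECH
  RECOGNITION_ERROR_NO_MATCH,   // content::SPEECH_RECOGNITION_ERROR_NO_MATCH
  RECOGNITION_ERROR_NETWORK     // content::SPEECH_RECOGNITION_ERROR_NETWORK
};

// Returns true when hypotheses should be parsed (web-service status 0);
// otherwise fills |error| the way the patched switch does: 4 -> no speech,
// 5 -> no match, any other status -> treated as a network/protocol error.
bool MapServerStatus(int status, RecognitionError* error) {
  switch (status) {
    case 0:  // kWebServiceStatusNoError
      return true;
    case 4:  // kWebServiceStatusNoSpeech
      *error = RECOGNITION_ERROR_NO_SPEECH;
      return false;
    case 5:  // kWebServiceStatusNoMatch
      *error = RECOGNITION_ERROR_NO_MATCH;
      return false;
    default:  // unexpected status codes are reported as network errors
      *error = RECOGNITION_ERROR_NETWORK;
      return false;
  }
}

int main() {
  RecognitionError error = RECOGNITION_ERROR_NONE;
  std::cout << MapServerStatus(0, &error) << "\n";                  // 1
  std::cout << MapServerStatus(4, &error) << " " << error << "\n";  // 0 1
  std::cout << MapServerStatus(7, &error) << " " << error << "\n";  // 0 3
  return 0;
}

In the patched engine this is the error that OnURLFetchComplete forwards through delegate()->OnSpeechRecognitionEngineError() when the fetch or the parse fails; on the success path the caller feeds audio via TakeAudioChunk() in kAudioPacketIntervalMs (100 ms) packets, as requested by GetDesiredAudioChunkDurationMs(), and AudioChunksEnded() flushes the encoder and appends the final chunk.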