Index: content/browser/speech/google_one_shot_remote_engine.cc
diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/google_one_shot_remote_engine.cc
similarity index 50%
rename from content/browser/speech/speech_recognition_request.cc
rename to content/browser/speech/google_one_shot_remote_engine.cc
index a14369976d45ff0c267ef0af7221f4cac547fc32..09c2b28d087cb2cdeaa8aebc9d038c9472b1b001 100644
--- a/content/browser/speech/speech_recognition_request.cc
+++ b/content/browser/speech/google_one_shot_remote_engine.cc
@@ -2,16 +2,18 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "content/browser/speech/speech_recognition_request.h"
+#include "content/browser/speech/google_one_shot_remote_engine.h"
 
 #include <vector>
 
 #include "base/json/json_reader.h"
+#include "base/memory/scoped_ptr.h"
 #include "base/string_number_conversions.h"
 #include "base/string_util.h"
 #include "base/values.h"
 #include "content/browser/speech/audio_buffer.h"
 #include "content/common/net/url_fetcher_impl.h"
+#include "content/public/common/speech_recognition_error.h"
 #include "content/public/common/speech_recognition_result.h"
 #include "net/base/escape.h"
 #include "net/base/load_flags.h"
@@ -19,6 +21,10 @@
 #include "net/url_request/url_request_context_getter.h"
 #include "net/url_request/url_request_status.h"
 
+using content::SpeechRecognitionError;
+using content::SpeechRecognitionHypothesis;
+using content::SpeechRecognitionResult;
+
 namespace {
 
 const char* const kDefaultSpeechRecognitionUrl =
@@ -27,13 +33,20 @@ const char* const kStatusString = "status";
 const char* const kHypothesesString = "hypotheses";
 const char* const kUtteranceString = "utterance";
 const char* const kConfidenceString = "confidence";
-
+const int kWebServiceStatusNoError = 0;
+const int kWebServiceStatusNoSpeech = 4;
+const int kWebServiceStatusNoMatch = 5;
+const int kDefaultConfigSampleRate = 8000;
+const int kDefaultConfigBitsPerSample = 16;
+const speech::AudioEncoder::Codec kDefaultAudioCodec =
+    speech::AudioEncoder::CODEC_FLAC;
 // TODO(satish): Remove this hardcoded value once the page is allowed to
 // set this via an attribute.
 const int kMaxResults = 6;
 
 bool ParseServerResponse(const std::string& response_body,
-                         content::SpeechRecognitionResult* result) {
+                         SpeechRecognitionResult* result,
+                         SpeechRecognitionError* error) {
   if (response_body.empty()) {
     LOG(WARNING) << "ParseServerResponse: Response was empty.";
     return false;
@@ -67,19 +80,21 @@ bool ParseServerResponse(const std::string& response_body,
 
   // Process the status.
   switch (status) {
-    case content::SPEECH_RECOGNITION_ERROR_NONE:
-    case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH:
-    case content::SPEECH_RECOGNITION_ERROR_NO_MATCH:
-      break;
-
-    default:
-      // Other status codes should not be returned by the server.
-      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
-      return false;
+    case kWebServiceStatusNoError:
+      break;
+    case kWebServiceStatusNoSpeech:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH;
+      return false;
+    case kWebServiceStatusNoMatch:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH;
+      return false;
+    default:
+      error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK;
+      // Other status codes should not be returned by the server.
+      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
+      return false;
   }
 
-  result->error = static_cast<content::SpeechRecognitionErrorCode>(status);
-
   // Get the hypotheses.
   Value* hypotheses_value = NULL;
   if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
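
The switch above is the substantive change in this hunk: the old code cast the
server's numeric status directly into content::SpeechRecognitionErrorCode,
whereas the new code treats the wire values (0, 4, 5) as a separate vocabulary
and maps them onto the enum explicitly, reporting failures through the new
error out-parameter. A minimal sketch of the resulting contract; the JSON
bodies are assumptions inferred from the kStatusString, kHypothesesString,
kUtteranceString and kConfidenceString keys defined earlier:

// Sketch only, not part of the patch.
SpeechRecognitionResult result;
SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NONE);

// Web-service status 0: parsing proceeds to the hypotheses, returns true.
ParseServerResponse(
    "{\"status\":0,"
    "\"hypotheses\":[{\"utterance\":\"hello world\",\"confidence\":0.9}]}",
    &result, &error);

// Web-service status 4: returns false and reports a specific error code,
// instead of smuggling the raw status through result->error as before.
ParseServerResponse("{\"status\":4}", &result, &error);
// error.code is now content::SPEECH_RECOGNITION_ERROR_NO_SPEECH.
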
@@ -96,6 +111,8 @@ bool ParseServerResponse(const std::string& response_body,
 
   const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value);
 
+  // For now we support only single shot recognition, so we are giving only a
+  // final result, consisting of one fragment (with one or more hypotheses).
   size_t index = 0;
   for (; index < hypotheses_list->GetSize(); ++index) {
     Value* hypothesis = NULL;
@@ -113,6 +130,7 @@ bool ParseServerResponse(const std::string& response_body,
     const DictionaryValue* hypothesis_value =
         static_cast<DictionaryValue*>(hypothesis);
     string16 utterance;
+
     if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
       LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
       break;
@@ -121,16 +139,14 @@ bool ParseServerResponse(const std::string& response_body,
     // It is not an error if the 'confidence' field is missing.
     double confidence = 0.0;
     hypothesis_value->GetDouble(kConfidenceString, &confidence);
-
-    result->hypotheses.push_back(content::SpeechRecognitionHypothesis(
-        utterance, confidence));
+    result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
+                                                             confidence));
   }
 
   if (index < hypotheses_list->GetSize()) {
     result->hypotheses.clear();
     return false;
   }
-
   return true;
 }
 
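
Note the all-or-nothing contract the loop's tail enforces: a single malformed
entry makes the loop break early, after which the hypotheses collected so far
are discarded and the parse fails, leaving error at whatever default the
caller chose (OnURLFetchComplete below initializes it to the network error).
A sketch under the same assumptions as the previous one:

// Sketch only: one hypothesis missing its "utterance" key invalidates the
// whole response, not just that entry.
SpeechRecognitionResult result;
SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);
bool ok = ParseServerResponse(
    "{\"status\":0,\"hypotheses\":["
    "{\"utterance\":\"first\",\"confidence\":0.8},"
    "{\"confidence\":0.5}]}",  // second entry has no utterance
    &result, &error);
// ok == false and result.hypotheses is empty.
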
@@ -138,28 +154,34 @@ bool ParseServerResponse(const std::string& response_body,
 
 namespace speech {
 
-int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0;
+const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
+int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
 
-SpeechRecognitionRequest::SpeechRecognitionRequest(
-    net::URLRequestContextGetter* context, Delegate* delegate)
-    : url_context_(context),
-      delegate_(delegate) {
-  DCHECK(delegate);
+GoogleOneShotRemoteEngineConfig::GoogleOneShotRemoteEngineConfig()
+    : filter_profanities(false),
+      audio_sample_rate(kDefaultConfigSampleRate),
+      audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {
 }
 
-SpeechRecognitionRequest::~SpeechRecognitionRequest() {}
+GoogleOneShotRemoteEngineConfig::~GoogleOneShotRemoteEngineConfig() {}
 
-void SpeechRecognitionRequest::Start(const std::string& language,
-                                     const std::string& grammar,
-                                     bool filter_profanities,
-                                     const std::string& hardware_info,
-                                     const std::string& origin_url,
-                                     const std::string& content_type) {
-  DCHECK(!url_fetcher_.get());
+GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
+    net::URLRequestContextGetter* context)
+    : url_context_(context) {
+}
 
-  std::vector<std::string> parts;
+GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
+
+void GoogleOneShotRemoteEngine::SetConfig(
+    const GoogleOneShotRemoteEngineConfig& config) {
+  config_ = config;
+}
+
+void GoogleOneShotRemoteEngine::StartRecognition() {
+  DCHECK(delegate());
+  DCHECK(!url_fetcher_.get());
+  std::string lang_param = config_.language;
 
-  std::string lang_param = language;
   if (lang_param.empty() && url_context_) {
     // If no language is provided then we use the first from the accepted
     // language list. If this list is empty then it defaults to "en-US".
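
The six positional parameters of the old Start() are replaced by a config
object set up front. A sketch of populating it; the field values here are
illustrative, not taken from the patch:

// Sketch only, not part of the patch.
GoogleOneShotRemoteEngineConfig config;
config.language = "";               // empty: fall back to the context's
                                    // accept-language list, then to "en-US"
config.grammar = "some-grammar";    // hypothetical; sent as lm= when non-empty
config.hardware_info = "";          // sent as xhw= only when non-empty
config.origin_url = "https://example.com/";  // becomes the request referrer
config.filter_profanities = true;   // pfilter=2 rather than pfilter=0
// audio_sample_rate (8000) and audio_num_bits_per_sample (16) keep the
// defaults assigned in the constructor above.
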
@@ -171,58 +193,108 @@ void SpeechRecognitionRequest::Start(const std::string& language,
     size_t separator = accepted_language_list.find_first_of(",;");
     lang_param = accepted_language_list.substr(0, separator);
   }
+
   if (lang_param.empty())
     lang_param = "en-US";
+
+  std::vector<std::string> parts;
   parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));
 
-  if (!grammar.empty())
-    parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true));
-  if (!hardware_info.empty())
-    parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true));
+  if (!config_.grammar.empty())
+    parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true));
+
+  if (!config_.hardware_info.empty())
+    parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
+                                                        true));
   parts.push_back("maxresults=" + base::IntToString(kMaxResults));
-  parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");
+  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");
 
   GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));
 
+  encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
+                                      config_.audio_sample_rate,
+                                      config_.audio_num_bits_per_sample));
+  DCHECK(encoder_.get());
   url_fetcher_.reset(URLFetcherImpl::Create(url_fetcher_id_for_tests,
                                             url,
                                             URLFetcherImpl::POST,
                                             this));
-  url_fetcher_->SetChunkedUpload(content_type);
+  url_fetcher_->SetChunkedUpload(encoder_->mime_type());
   url_fetcher_->SetRequestContext(url_context_);
-  url_fetcher_->SetReferrer(origin_url);
+  url_fetcher_->SetReferrer(config_.origin_url);
 
   // The speech recognition API does not require user identification as part
   // of requests, so we don't send cookies or auth data for these requests to
   // prevent any accidental connection between users who are logged into the
   // domain for other services (e.g. bookmark sync) with the speech requests.
-  url_fetcher_->SetLoadFlags(
-      net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |
-      net::LOAD_DO_NOT_SEND_AUTH_DATA);
+  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
+                             net::LOAD_DO_NOT_SEND_COOKIES |
+                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
   url_fetcher_->Start();
 }
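
For reference, with the illustrative config above (empty grammar and
hardware_info), JoinString(parts, '&') produces a query string such as:

    lang=en-US&maxresults=6&pfilter=2

which is concatenated directly onto kDefaultSpeechRecognitionUrl, whose value
is truncated out of the hunk context earlier in this diff.
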
 
-void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk,
-                                                bool is_last_chunk) {
+void GoogleOneShotRemoteEngine::EndRecognition() {
+  url_fetcher_.reset();
+}
+
+void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
+  DCHECK(url_fetcher_.get());
+  DCHECK(encoder_.get());
+  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
+  encoder_->Encode(data);
+  scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
+  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
+}
+
+void GoogleOneShotRemoteEngine::AudioChunksEnded() {
   DCHECK(url_fetcher_.get());
-  url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk);
+  DCHECK(encoder_.get());
+
+  // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
+  // of silence in case encoder had no data already.
+  std::vector<int16> samples(
+      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
+  AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]),
+                         samples.size() * sizeof(int16),
+                         encoder_->bits_per_sample() / 8);
+  encoder_->Encode(dummy_chunk);
+  encoder_->Flush();
+  scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear());
+  DCHECK(!encoded_dummy_data->IsEmpty());
+  encoder_.reset();
+
+  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
 }
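
With the default 8000 Hz, 16-bit configuration, the silence packet encoded
above works out to:

    8000 samples/s * (100 ms / 1000) = 800 samples
    800 samples * 2 bytes/sample     = 1600 bytes of PCM silence

which is FLAC-encoded, flushed, and appended as the final chunk (the 'true'
argument), satisfying the non-empty final buffer requirement noted in the
comment.
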
 
-void SpeechRecognitionRequest::OnURLFetchComplete(
+void GoogleOneShotRemoteEngine::OnURLFetchComplete(
     const content::URLFetcher* source) {
   DCHECK_EQ(url_fetcher_.get(), source);
-
-  content::SpeechRecognitionResult result;
+  SpeechRecognitionResult result;
+  SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK);
   std::string data;
-  if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 ||
-      !source->GetResponseAsString(&data) ||
-      !ParseServerResponse(data, &result)) {
-    result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK;
-  }
 
-  DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result.";
+  // The default error code in case of parse errors is NETWORK_FAILURE, however
+  // ParseServerResponse can change the error to a more appropriate one.
+  bool error_occurred = (!source->GetStatus().is_success() ||
+                         source->GetResponseCode() != 200 ||
+                         !source->GetResponseAsString(&data) ||
+                         !ParseServerResponse(data, &result, &error));
   url_fetcher_.reset();
-  delegate_->SetRecognitionResult(result);
+  if (error_occurred) {
+    DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
+    delegate()->OnSpeechRecognitionEngineError(error);
+  } else {
+    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
+    delegate()->OnSpeechRecognitionEngineResult(result);
+  }
+}
+
+bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
+  return url_fetcher_ != NULL;
+}
+
+int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
+  return kAudioPacketIntervalMs;
 }
 
 }  // namespace speech
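
Taken together, the renamed engine exposes a chunked-upload lifecycle in place
of the old one-call Start(). A minimal usage sketch, not part of the patch:
url_context and audio_chunk are hypothetical stand-ins, and the delegate is
attached through the engine's base interface, which is declared in the header
rather than in this file and must implement the two
OnSpeechRecognitionEngine{Result,Error} callbacks invoked above.

// Sketch only, under the assumptions stated above.
GoogleOneShotRemoteEngineConfig config;
config.language = "en-US";

scoped_ptr<GoogleOneShotRemoteEngine> engine(
    new GoogleOneShotRemoteEngine(url_context));
engine->SetConfig(config);
engine->StartRecognition();     // creates the encoder, opens the chunked POST

// Deliver raw PCM in GetDesiredAudioChunkDurationMs() == 100 ms chunks; each
// is FLAC-encoded and appended to the upload.
engine->TakeAudioChunk(audio_chunk);

engine->AudioChunksEnded();     // pads with silence, sends the final chunk
// IsRecognitionPending() remains true until OnURLFetchComplete() fires and
// the delegate receives either a result or an error.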