Chromium Code Reviews
| Index: content/browser/speech/google_ssfe_remote_engine.cc |
| diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/google_ssfe_remote_engine.cc |
| similarity index 53% |
| rename from content/browser/speech/speech_recognition_request.cc |
| rename to content/browser/speech/google_ssfe_remote_engine.cc |
| index a14369976d45ff0c267ef0af7221f4cac547fc32..98573af95e4c2941362c03ababeb3a5c0d1df9d0 100644 |
| --- a/content/browser/speech/speech_recognition_request.cc |
| +++ b/content/browser/speech/google_ssfe_remote_engine.cc |
| @@ -2,11 +2,12 @@ |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| -#include "content/browser/speech/speech_recognition_request.h" |
| +#include "content/browser/speech/google_ssfe_remote_engine.h" |
| #include <vector> |
| #include "base/json/json_reader.h" |
| +#include "base/memory/scoped_ptr.h" |
| #include "base/string_number_conversions.h" |
| #include "base/string_util.h" |
| #include "base/values.h" |
| @@ -19,6 +20,10 @@ |
| #include "net/url_request/url_request_context_getter.h" |
| #include "net/url_request/url_request_status.h" |
| +using content::SpeechRecognitionError; |
| +using content::SpeechRecognitionHypothesis; |
| +using content::SpeechRecognitionResult; |
| + |
| namespace { |
| const char* const kDefaultSpeechRecognitionUrl = |
| @@ -32,8 +37,13 @@ const char* const kConfidenceString = "confidence"; |
| // set this via an attribute. |
| const int kMaxResults = 6; |
| +const int SPEECH_API_STATUS_NO_ERROR = 0; |
|
Satish
2012/03/16 17:00:35
add comment here on what these are (i.e. codes returned by the server)
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| +const int SPEECH_API_STATUS_NO_SPEECH = 4; |
| +const int SPEECH_API_STATUS_NO_MATCH = 5; |
| + |
| bool ParseServerResponse(const std::string& response_body, |
| - content::SpeechRecognitionResult* result) { |
| + SpeechRecognitionResult* result, |
| + SpeechRecognitionError* error) { |
| if (response_body.empty()) { |
| LOG(WARNING) << "ParseServerResponse: Response was empty."; |
| return false; |
| @@ -67,19 +77,21 @@ bool ParseServerResponse(const std::string& response_body, |
| // Process the status. |
| switch (status) { |
| - case content::SPEECH_RECOGNITION_ERROR_NONE: |
| - case content::SPEECH_RECOGNITION_ERROR_NO_SPEECH: |
| - case content::SPEECH_RECOGNITION_ERROR_NO_MATCH: |
| - break; |
| - |
| - default: |
| - // Other status codes should not be returned by the server. |
| - VLOG(1) << "ParseServerResponse: unexpected status code " << status; |
| - return false; |
| + case SPEECH_API_STATUS_NO_ERROR: |
|
Satish
2012/03/16 17:00:35
should we set the error code to none in this case?
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + break; |
| + case SPEECH_API_STATUS_NO_SPEECH: |
| + error->code = content::SPEECH_RECOGNITION_ERROR_NO_SPEECH; |
| + return false; |
| + case SPEECH_API_STATUS_NO_MATCH: |
| + error->code = content::SPEECH_RECOGNITION_ERROR_NO_MATCH; |
| + return false; |
| + default: |
| + error->code = content::SPEECH_RECOGNITION_ERROR_NETWORK; |
| + // Other status codes should not be returned by the server. |
| + VLOG(1) << "ParseServerResponse: unexpected status code " << status; |
| + return false; |
| } |
| - result->error = static_cast<content::SpeechRecognitionErrorCode>(status); |
| - |
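For context, the status handling above reduces to the following standalone sketch. The enum values and constant names are illustrative stand-ins for content::SpeechRecognitionErrorCode and the file-level constants, and the NO_ERROR case sets the code to NONE, as requested in the review.

#include <iostream>

// Illustrative stand-in for content::SpeechRecognitionErrorCode.
enum SpeechRecognitionErrorCode {
  SPEECH_RECOGNITION_ERROR_NONE,
  SPEECH_RECOGNITION_ERROR_NO_SPEECH,
  SPEECH_RECOGNITION_ERROR_NO_MATCH,
  SPEECH_RECOGNITION_ERROR_NETWORK
};

// Returns true when the response is worth parsing further; on false the caller
// reports |*error|, which falls back to a network failure for unknown codes.
bool MapApiStatus(int status, SpeechRecognitionErrorCode* error) {
  const int kStatusNoError = 0;   // Codes returned by the remote speech API.
  const int kStatusNoSpeech = 4;
  const int kStatusNoMatch = 5;
  switch (status) {
    case kStatusNoError:
      *error = SPEECH_RECOGNITION_ERROR_NONE;
      return true;
    case kStatusNoSpeech:
      *error = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
      return false;
    case kStatusNoMatch:
      *error = SPEECH_RECOGNITION_ERROR_NO_MATCH;
      return false;
    default:
      // Other status codes should not be returned by the server.
      *error = SPEECH_RECOGNITION_ERROR_NETWORK;
      return false;
  }
}

int main() {
  SpeechRecognitionErrorCode error;
  std::cout << MapApiStatus(4, &error) << " " << error << "\n";  // Prints: 0 1
}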
| // Get the hypotheses. |
| Value* hypotheses_value = NULL; |
| if (!response_object->Get(kHypothesesString, &hypotheses_value)) { |
| @@ -95,7 +107,8 @@ bool ParseServerResponse(const std::string& response_body, |
| } |
| const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); |
| - |
| + // For now we support only single shot recognition, so we are giving only a |
| + // final result, consisting of one fragment (with one or more hypotheses). |
| size_t index = 0; |
| for (; index < hypotheses_list->GetSize(); ++index) { |
| Value* hypothesis = NULL; |
| @@ -113,6 +126,7 @@ bool ParseServerResponse(const std::string& response_body, |
| const DictionaryValue* hypothesis_value = |
| static_cast<DictionaryValue*>(hypothesis); |
| string16 utterance; |
| + |
| if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { |
| LOG(WARNING) << "ParseServerResponse: Missing utterance value."; |
| break; |
| @@ -121,45 +135,56 @@ bool ParseServerResponse(const std::string& response_body, |
| // It is not an error if the 'confidence' field is missing. |
| double confidence = 0.0; |
| hypothesis_value->GetDouble(kConfidenceString, &confidence); |
| - |
| - result->hypotheses.push_back(content::SpeechRecognitionHypothesis( |
| - utterance, confidence)); |
| + result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance, |
| + confidence)); |
| } |
| if (index < hypotheses_list->GetSize()) { |
| result->hypotheses.clear(); |
| return false; |
| } |
| - |
| return true; |
| } |
| } // namespace |
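The hypotheses loop above follows an all-or-nothing pattern: if any entry in the list is malformed, the partially filled result is discarded. A minimal standalone sketch of that pattern, with RawEntry standing in for the DictionaryValue lookups:

#include <iostream>
#include <string>
#include <vector>

struct Hypothesis {
  std::string utterance;
  double confidence;
};

// Stand-in for one hypothesis dictionary; has_utterance models whether
// GetString(kUtteranceString, ...) would succeed.
struct RawEntry {
  bool has_utterance;
  std::string utterance;
  double confidence;  // A missing confidence simply stays at 0.0.
};

bool CollectHypotheses(const std::vector<RawEntry>& entries,
                       std::vector<Hypothesis>* hypotheses) {
  size_t index = 0;
  for (; index < entries.size(); ++index) {
    if (!entries[index].has_utterance)
      break;  // Malformed entry: stop and reject the whole response.
    Hypothesis hypothesis;
    hypothesis.utterance = entries[index].utterance;
    hypothesis.confidence = entries[index].confidence;
    hypotheses->push_back(hypothesis);
  }
  if (index < entries.size()) {
    hypotheses->clear();  // Discard partial results.
    return false;
  }
  return true;
}

int main() {
  RawEntry good = { true, "hello world", 0.9 };
  RawEntry bad = { false, "", 0.0 };
  std::vector<RawEntry> entries;
  entries.push_back(good);
  entries.push_back(bad);
  std::vector<Hypothesis> hypotheses;
  std::cout << CollectHypotheses(entries, &hypotheses) << " "
            << hypotheses.size() << "\n";  // Prints: 0 0
}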
| + |
|
Satish
2012/03/16 17:00:35
remove extra newline
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| namespace speech { |
| -int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0; |
| +const int GoogleSSFERemoteEngine::kAudioPacketIntervalMs = 100; |
| +int GoogleSSFERemoteEngine::url_fetcher_id_for_tests = 0; |
| + |
| +GoogleSSFERemoteEngineConfig::GoogleSSFERemoteEngineConfig() |
| + : filter_profanities(false), |
|
Satish
2012/03/16 17:00:35
4 spaces before :
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + audio_sample_rate(8000), |
|
Satish
2012/03/16 17:00:35
these two values should probably be constants at the top of the file
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + audio_num_bits_per_sample(16) { |
| +} |
| + |
| +GoogleSSFERemoteEngineConfig::~GoogleSSFERemoteEngineConfig() {} |
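To illustrate the suggestion above, a minimal sketch of the config defaults with the two magic numbers hoisted into named constants; the constant names and the *Sketch struct name are hypothetical, not the ones used in the patch.

#include <iostream>

namespace {
// Hypothetical names; the patch's follow-up revision may use different ones.
const int kDefaultConfigSampleRate = 8000;
const int kDefaultConfigBitsPerSample = 16;
}  // namespace

struct GoogleSSFERemoteEngineConfigSketch {
  bool filter_profanities;
  int audio_sample_rate;
  int audio_num_bits_per_sample;
  GoogleSSFERemoteEngineConfigSketch()
      : filter_profanities(false),
        audio_sample_rate(kDefaultConfigSampleRate),
        audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {}
};

int main() {
  GoogleSSFERemoteEngineConfigSketch config;
  std::cout << config.audio_sample_rate << " Hz, "
            << config.audio_num_bits_per_sample << " bits\n";  // 8000 Hz, 16 bits
}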
| -SpeechRecognitionRequest::SpeechRecognitionRequest( |
| - net::URLRequestContextGetter* context, Delegate* delegate) |
| +GoogleSSFERemoteEngine::GoogleSSFERemoteEngine( |
| + net::URLRequestContextGetter* context) |
| : url_context_(context), |
| - delegate_(delegate) { |
| - DCHECK(delegate); |
| + codec_(AudioEncoder::CODEC_FLAC), |
| + encoder_(NULL) { |
|
Satish
2012/03/16 17:00:35
this should get autoinitialized?
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| } |
| -SpeechRecognitionRequest::~SpeechRecognitionRequest() {} |
| +GoogleSSFERemoteEngine::~GoogleSSFERemoteEngine() {} |
| -void SpeechRecognitionRequest::Start(const std::string& language, |
| - const std::string& grammar, |
| - bool filter_profanities, |
| - const std::string& hardware_info, |
| - const std::string& origin_url, |
| - const std::string& content_type) { |
| - DCHECK(!url_fetcher_.get()); |
| +void GoogleSSFERemoteEngine::SetConfiguration( |
| + const GoogleSSFERemoteEngineConfig& config) { |
| + config_ = config; |
| +} |
| +void GoogleSSFERemoteEngine::SpeechRecognitionBegins() { |
| + DCHECK(delegate()); |
| + DCHECK(!url_fetcher_.get()); |
| std::vector<std::string> parts; |
|
Satish
2012/03/16 17:00:35
move this to line 203 where it is getting used for the first time
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + encoder_.reset(AudioEncoder::Create(codec_, config_.audio_sample_rate, |
| + config_.audio_num_bits_per_sample)); |
| + DCHECK(encoder_.get()); |
| + std::string lang_param = config_.language; |
| - std::string lang_param = language; |
| if (lang_param.empty() && url_context_) { |
| // If no language is provided then we use the first from the accepted |
| // language list. If this list is empty then it defaults to "en-US". |
| @@ -171,16 +196,20 @@ void SpeechRecognitionRequest::Start(const std::string& language, |
| size_t separator = accepted_language_list.find_first_of(",;"); |
| lang_param = accepted_language_list.substr(0, separator); |
| } |
| + |
| if (lang_param.empty()) |
| lang_param = "en-US"; |
| + |
| parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); |
| - if (!grammar.empty()) |
| - parts.push_back("lm=" + net::EscapeQueryParamValue(grammar, true)); |
| - if (!hardware_info.empty()) |
| - parts.push_back("xhw=" + net::EscapeQueryParamValue(hardware_info, true)); |
| + if (!config_.grammar.empty()) |
| + parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammar, true)); |
| + |
| + if (!config_.hardware_info.empty()) |
| + parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info, |
| + true)); |
| parts.push_back("maxresults=" + base::IntToString(kMaxResults)); |
| - parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0"); |
| + parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0"); |
| GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); |
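As a side note, the parameter assembly above can be read as the following standalone sketch. EscapeQueryParam is a pass-through stand-in for net::EscapeQueryParamValue, and the real code additionally derives the fallback language from the request context's accept-language list before defaulting to "en-US".

#include <iostream>
#include <string>
#include <vector>

// Placeholder for net::EscapeQueryParamValue; a real implementation would
// percent-escape the value.
std::string EscapeQueryParam(const std::string& value) {
  return value;
}

std::string BuildRequestQuery(std::string lang, const std::string& grammar,
                              const std::string& hardware_info,
                              bool filter_profanities, int max_results) {
  std::vector<std::string> parts;
  if (lang.empty())
    lang = "en-US";  // Fallback when no language could be determined.
  parts.push_back("lang=" + EscapeQueryParam(lang));
  if (!grammar.empty())
    parts.push_back("lm=" + EscapeQueryParam(grammar));
  if (!hardware_info.empty())
    parts.push_back("xhw=" + EscapeQueryParam(hardware_info));
  parts.push_back("maxresults=" + std::to_string(max_results));
  parts.push_back(filter_profanities ? "pfilter=2" : "pfilter=0");

  std::string query;  // Equivalent of JoinString(parts, '&').
  for (size_t i = 0; i < parts.size(); ++i) {
    if (i > 0)
      query += "&";
    query += parts[i];
  }
  return query;
}

int main() {
  // "some-grammar" is an arbitrary placeholder value.
  std::cout << BuildRequestQuery("", "some-grammar", "", true, 6) << "\n";
  // Prints: lang=en-US&lm=some-grammar&maxresults=6&pfilter=2
}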
| @@ -188,41 +217,79 @@ void SpeechRecognitionRequest::Start(const std::string& language, |
| url, |
| URLFetcherImpl::POST, |
| this)); |
| - url_fetcher_->SetChunkedUpload(content_type); |
| + url_fetcher_->SetChunkedUpload(encoder_->mime_type()); |
| url_fetcher_->SetRequestContext(url_context_); |
| - url_fetcher_->SetReferrer(origin_url); |
| + url_fetcher_->SetReferrer(config_.origin_url); |
| // The speech recognition API does not require user identification as part |
| // of requests, so we don't send cookies or auth data for these requests to |
| // prevent any accidental connection between users who are logged into the |
| // domain for other services (e.g. bookmark sync) with the speech requests. |
| - url_fetcher_->SetLoadFlags( |
| - net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | |
| - net::LOAD_DO_NOT_SEND_AUTH_DATA); |
| + url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
| + net::LOAD_DO_NOT_SEND_COOKIES | |
| + net::LOAD_DO_NOT_SEND_AUTH_DATA); |
| url_fetcher_->Start(); |
| } |
| -void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk, |
| - bool is_last_chunk) { |
| +// Called only after the results have been retrieved. |
| +void GoogleSSFERemoteEngine::SpeechRecognitionEnds() { |
| + url_fetcher_.reset(); |
|
Satish
2012/03/16 17:00:35
do you expect callers to reuse this object across recognition sessions?
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
On some occasions it can be reused (e.g., the user
|
| +} |
| + |
| +void GoogleSSFERemoteEngine::PushSpeechAudio(const AudioChunk& data) { |
| DCHECK(url_fetcher_.get()); |
|
Satish
2012/03/16 17:00:35
also check for encoder_ as done below
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| - url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk); |
| + DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); |
| + encoder_->Encode(data); |
| + scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); |
| + url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false); |
| } |
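The per-chunk path above (encode, drain the encoder, append a non-final upload chunk) can be sketched as follows; Encoder and Uploader are illustrative stubs, not the Chromium AudioEncoder and URLFetcher classes.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

class Encoder {
 public:
  // Pretend-encodes by buffering the raw bytes; a real encoder would compress.
  void Encode(const std::vector<int16_t>& samples) {
    if (samples.empty())
      return;
    buffer_.append(reinterpret_cast<const char*>(&samples[0]),
                   samples.size() * sizeof(int16_t));
  }
  // Drains the internal buffer, like GetEncodedDataAndClear() in the patch.
  std::string TakeEncodedData() {
    std::string out;
    out.swap(buffer_);
    return out;
  }

 private:
  std::string buffer_;
};

class Uploader {
 public:
  void AppendChunk(const std::string& data, bool is_last) {
    std::cout << "chunk of " << data.size() << " bytes"
              << (is_last ? " (last)" : "") << "\n";
  }
};

// Called once per captured audio packet; the final chunk is appended
// separately when the audio stream completes.
void PushAudio(Encoder* encoder, Uploader* uploader,
               const std::vector<int16_t>& samples) {
  encoder->Encode(samples);
  uploader->AppendChunk(encoder->TakeEncodedData(), false /* is_last */);
}

int main() {
  Encoder encoder;
  Uploader uploader;
  std::vector<int16_t> packet(800, 0);  // One 100 ms packet of 8 kHz audio.
  PushAudio(&encoder, &uploader, packet);  // Prints: chunk of 1600 bytes
}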
| -void SpeechRecognitionRequest::OnURLFetchComplete( |
| +void GoogleSSFERemoteEngine::SpeechAudioStreamComplete() { |
| + DCHECK(url_fetcher_.get()); |
| + DCHECK(encoder_.get()); |
| + // UploadAudioChunk requires a non-empty final buffer. So we encode a packet |
| + // of silence in case encoder had no data already. |
| + std::vector<short> samples( |
|
Satish
2012/03/16 17:00:35
short -> int16 ?
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + config_.audio_sample_rate * kAudioPacketIntervalMs / 1000); |
| + AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]), |
| + samples.size() * sizeof(short), |
|
Satish
2012/03/16 17:00:35
ditto
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + encoder_->bits_per_sample() / 8); |
| + encoder_->Encode(dummy_chunk); |
| + encoder_->Flush(); |
| + scoped_ptr<AudioChunk> encoded_dummy_data(encoder_->GetEncodedDataAndClear()); |
| + DCHECK(!encoded_dummy_data->IsEmpty()); |
| + encoder_.reset(); |
| + |
| + url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true); |
| +} |
| + |
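For reference, a minimal standalone sketch of how the trailing silence packet above is sized, using int16 samples as the reviewer suggested; the constants mirror the 100 ms packet interval and the 8000 Hz default sample rate from the config.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int kAudioPacketIntervalMs = 100;  // One packet per 100 ms of audio.
  const int kSampleRate = 8000;            // Default sample rate in the config.

  // Zero-valued samples covering exactly one packet interval; this is the
  // non-empty final buffer the encoder flushes before the last upload chunk.
  std::vector<int16_t> silence(kSampleRate * kAudioPacketIntervalMs / 1000, 0);

  std::cout << silence.size() << " samples, "
            << silence.size() * sizeof(int16_t) << " bytes\n";
  // Prints: 800 samples, 1600 bytes
}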
| +void GoogleSSFERemoteEngine::OnURLFetchComplete( |
| const content::URLFetcher* source) { |
| DCHECK_EQ(url_fetcher_.get(), source); |
| - |
| - content::SpeechRecognitionResult result; |
| + SpeechRecognitionResult result; |
| + SpeechRecognitionError error(content::SPEECH_RECOGNITION_ERROR_NETWORK); |
| std::string data; |
| - if (!source->GetStatus().is_success() || source->GetResponseCode() != 200 || |
| + // The default error code in case of parse errors is NETWORK_FAILURE, however |
|
Satish
2012/03/16 17:00:35
add a newline before full line comments such as this one
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + // ParseServerResponse can change the error to a more appropriate one. |
| + if (!source->GetStatus().is_success() || |
|
Satish
2012/03/16 17:00:35
no need to align the ||, could just leave 1 space
Primiano Tucci (use gerrit)
2012/03/20 13:14:50
Done.
|
| + source->GetResponseCode() != 200 || |
| !source->GetResponseAsString(&data) || |
| - !ParseServerResponse(data, &result)) { |
| - result.error = content::SPEECH_RECOGNITION_ERROR_NETWORK; |
| + !ParseServerResponse(data, &result, &error)) { |
| + DVLOG(1) << "GoogleSSFERemoteEngine: Network Error " << error.code; |
| + delegate()->OnSpeechEngineError(error); |
| + } else { |
| + DVLOG(1) << "GoogleSSFERemoteEngine: Invoking delegate with result."; |
| + delegate()->OnSpeechEngineResult(result); |
| } |
| - |
| - DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result."; |
| url_fetcher_.reset(); |
| - delegate_->SetRecognitionResult(result); |
| +} |
| + |
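The completion handling above boils down to: the error defaults to a network failure, parsing may refine it, and exactly one of the two delegate callbacks fires. A minimal sketch with illustrative names rather than the real delegate API:

#include <iostream>
#include <string>

enum ErrorCode { ERROR_NETWORK, ERROR_NO_SPEECH, ERROR_NO_MATCH };

// Stand-in for ParseServerResponse: may refine |*error| before failing.
bool ParseBody(const std::string& body, std::string* result, ErrorCode* error) {
  if (body.empty())
    return false;  // Keeps the caller's default ERROR_NETWORK.
  if (body == "no-speech") {
    *error = ERROR_NO_SPEECH;
    return false;
  }
  *result = body;
  return true;
}

void HandleFetchComplete(bool http_ok, const std::string& body) {
  std::string result;
  ErrorCode error = ERROR_NETWORK;  // Default until parsing refines it.
  if (!http_ok || !ParseBody(body, &result, &error))
    std::cout << "error callback, code " << error << "\n";
  else
    std::cout << "result callback: " << result << "\n";
}

int main() {
  HandleFetchComplete(false, "hello");     // error callback, code 0
  HandleFetchComplete(true, "no-speech");  // error callback, code 1
  HandleFetchComplete(true, "hello");      // result callback: hello
}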
| +bool GoogleSSFERemoteEngine::IsRecognitionPending() const { |
| + return url_fetcher_ != NULL; |
| +} |
| + |
| +int GoogleSSFERemoteEngine::DesiredAudioChunkDurationMs() const { |
| + return kAudioPacketIntervalMs; |
| } |
| } // namespace speech |