chrome/browser/autofill/autofill_locale_model.cc - Issue 3226001: Detecting form locale

Unified Diff: chrome/browser/autofill/autofill_locale_model.cc

Issue 3226001: Detecting form locale (Closed) Base URL: http://src.chromium.org/git/chromium.git

Patch Set: Unit test for top websites Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: chrome/browser/autofill/autofill_locale_model.cc

diff --git a/chrome/browser/autofill/autofill_locale_model.cc b/chrome/browser/autofill/autofill_locale_model.cc

new file mode 100644

index 0000000000000000000000000000000000000000..253744ff55790ab408558aa865d4480558724ae5

--- /dev/null

+++ b/chrome/browser/autofill/autofill_locale_model.cc

@@ -0,0 +1,245 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "chrome/browser/autofill/autofill_locale_model.h"

+#include "base/string_util.h"

+#include "chrome/browser/autofill/form_structure.h"

+#include "chrome/browser/tab_contents/language_state.h"

+#include "chrome/browser/tab_contents/tab_contents.h"

+#include "chrome/common/chrome_constants.h"

+#include "net/base/registry_controlled_domain.h"

+#include "third_party/icu/public/common/unicode/locid.h"

+#include "third_party/icu/public/common/unicode/uloc.h"

+namespace {

+// Associates known registries with their typical regions and languages.

+struct AutoFillRegistry {

+ const char* const registry;  // Domain registry (lower case), e.g. "co.uk".

+ const char* const region;  // Region code for that registry, e.g. "GB".

+ const char* const languages;  // Comma-separated language codes, e.g. "fr, en".

+};

+// TODO(isherman): Expand this list.

+const AutoFillRegistry kAutoFillRegistries[] = {  // Copied into |registries_| by the AutoFillLocaleModel constructor.

+ {"au", "AU", "en"},

+ {"br", "BR", "pt"},

+ {"ca", "CA", "fr, en"},  // Several languages: order matters, see RegionFromRegistry().

+ {"co.jp", "JP", "ja"},

+ {"co.il", "IL", "he"},

+ {"co.nz", "NZ", "en"},

+ {"co.uk", "GB", "en"},

+ {"com.au", "AU", "en"},

+ {"com.br", "BR", "pt"},

+ {"com.mx", "MX", "es"},

+ {"il", "IL", "he"},

+ {"it", "IT", "it"},

+ {"jp", "JP", "ja"},

+ {"fr", "FR", "fr"},

+ {"mx", "MX", "es"},

+ {"ru", "RU", "ru"},

+};

+// Returns the registry (roughly, top level domain) for |url|, in lower case.

+std::string ExtractRegistry(const GURL& url) {

+ const size_t registry_length =

+ net::RegistryControlledDomainService::GetRegistryLength(url, false);  // NOTE(review): |false| presumably disallows unknown registries -- confirm.

+ if (registry_length == 0 || registry_length == std::string::npos)

+ return std::string();  // No registry length was reported for |url|.

+ const std::string host =

+ net::RegistryControlledDomainService::GetDomainAndRegistry(url);

+ DCHECK_LT(registry_length, host.size());

+ return StringToLowerASCII(host.substr(host.size() - registry_length));  // The last |registry_length| characters of |host|.

+// Converts |locale_id| into a canonical ICU |Locale|, with a couple of fixups:
+//  * An empty |locale_id| returns an empty result, rather than ICU's default
+//    canonicalization to "en-US".
+//  * A |locale_id| with an unknown language returns an empty result, and an
+//    unknown region is dropped, rather than ICU's default canonicalization to
+//    a bogus locale.  The result therefore never carries a region without a
+//    language, which is the invariant LanguageAndRegionToLocaleCode() DCHECKs.
+icu::Locale ConvertToICULocale(const std::string& locale_id) {
+  if (locale_id.empty())
+    return icu::Locale("");
+
+  // We aren't interested in locale keywords, so the only valid characters in
+  // the locale string are ASCII letters, hyphen, and underscore.  See
+  // http://userguide.icu-project.org/locale#TOC-Canonicalization
+  //
+  // The letter test is spelled out rather than calling isalpha(): callers pass
+  // strings derived from the page, and handing isalpha() a negative char (any
+  // non-ASCII byte where char is signed) is undefined behavior.  isalpha() also
+  // depends on the current C locale, whereas only ASCII letters are wanted.
+  std::string sanitized_locale;
+  sanitized_locale.reserve(locale_id.size());
+  for (size_t i = 0; i < locale_id.size(); ++i) {
+    const char c = locale_id[i];
+    const bool is_ascii_letter =
+        (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    if (is_ascii_letter || c == '-' || c == '_')
+      sanitized_locale.push_back(c);
+  }
+
+  icu::Locale canonical_locale =
+      icu::Locale::createCanonical(sanitized_locale.c_str());
+
+  // Without a usable language the region is of no use to the callers in this
+  // file, so return an empty locale instead of a language-less one.
+  std::string language = canonical_locale.getLanguage();
+  if (!AutoFillLocaleModel::IsValidLanguageTag(language))
+    return icu::Locale("");
+
+  std::string region = canonical_locale.getCountry();
+  if (!AutoFillLocaleModel::IsValidRegionTag(region))
+    region = std::string();
+
+  return icu::Locale(language.c_str(), region.c_str());
+}

+// Combines |language| and |region| into a single locale code.

+// Returns "en-US" if |language| is empty, and just |language| if |region| is.

+std::string LanguageAndRegionToLocaleCode(const std::string& language,

+ const std::string& region) {

+ // We should never get a non-empty region with an empty language.

+ DCHECK(!language.empty() || region.empty());

+ if (language.empty())

+ return "en-US";  // Default when nothing is known about the language.

+ if (region.empty())

+ return language;  // Language-only code, e.g. "fr".

+ return (language + "-" + region);  // e.g. "fr" and "CA" give "fr-CA".

+} // namespace

+AutoFillLocaleModel::AutoFillLocaleModel(const TabContents* tab_contents)

+ : tab_contents_(tab_contents),  // May be NULL; UpdateLocale() checks before dereferencing.

+ tab_language_determined_(false) {  // UpdateLocale() DCHECKs that this has become true.

+ // Populate the registry map: registry -> {region, list of languages}.

+ for (size_t i = 0; i < arraysize(kAutoFillRegistries); ++i) {

+ AutoFillRegistryInfo info;

+ info.region = kAutoFillRegistries[i].region;

+ SplitString(kAutoFillRegistries[i].languages, ',', &info.languages);  // NOTE(review): presumably trims whitespace, so "fr, en" gives "fr", "en" -- confirm.

+ registries_[kAutoFillRegistries[i].registry] = info;  // Keyed by the registry string, e.g. "co.uk".

+ }

+AutoFillLocaleModel::~AutoFillLocaleModel() {  // NOTE(review): body is not visible in this view; presumably empty -- confirm.

+// static. Returns true iff ICU's ISO language list contains |language_tag|.

+bool AutoFillLocaleModel::IsValidLanguageTag(const std::string& language_tag) {

+ // Special case: "und" is an officially registered language tag for

+ // "Undetermined", but it tells us nothing, so it is treated as invalid.

+ if (language_tag == chrome::kUnknownLanguageCode)  // NOTE(review): assumes kUnknownLanguageCode is "und" -- confirm.

+ return false;

+ // |languages| is a NULL-terminated array of C-strings.

+ for (const char* const* languages = uloc_getISOLanguages(); languages[0];

+ ++languages) {

+ if (languages[0] == language_tag)  // Compares contents (C-string vs. std::string), not pointers.

+ return true;

+ }

+ return false;  // |language_tag| is not in the list.

+// static. Returns true iff ICU's ISO country list contains |region_tag|.

+bool AutoFillLocaleModel::IsValidRegionTag(const std::string& region_tag) {

+ // |regions| is a NULL-terminated array of C-strings.

+ for (const char* const* regions = uloc_getISOCountries(); regions[0];

+ ++regions) {

+ if (regions[0] == region_tag)  // Compares contents (C-string vs. std::string), not pointers.

+ return true;

+ }

+ return false;  // |region_tag| is not in the list.

+// Returns the region for |registry| if |*guessed_language| is empty (and sets

+// it to the registry's first language) or is among the registry's languages.

+// Otherwise, or if |registry| is unknown, returns an empty string.

+std::string AutoFillLocaleModel::RegionFromRegistry(

+ const std::string& registry,

+ std::string* guessed_language) const {

+ AutoFillRegistryMap::const_iterator result = registries_.find(registry);

+ if (result == registries_.end())

+ return std::string();  // Unknown registry; |*guessed_language| is left untouched.

+ const AutoFillRegistryInfo& info = result->second;

+ for (size_t i = 0; i < info.languages.size(); ++i) {

+ // For each language spoken in the region, check to see if our current guess

+ // at the language matches. Since we accept the first match, the order in

+ // which the languages are listed matters. If we previously had no guess

+ // for the language, update the guess.

+ if (guessed_language->empty()) {

+ *guessed_language = info.languages[i];  // Only reachable on the first iteration, since we return right after.

+ return info.region;

+ } else if (*guessed_language == info.languages[i]) {

+ return info.region;

+ }

+ return std::string();  // No listed language matched the current guess.

+// Sets |form|'s locale from its DOM locale, the tab's original language and

+// URL registries. TODO(isherman): also consider the page's current language

+// (after translation), e.g. to help us figure out the address's language.

+void AutoFillLocaleModel::UpdateLocale(FormStructure* form) const {

+ DCHECK(tab_language_determined_);

+ icu::Locale dom_locale = ConvertToICULocale(form->locale());

+ const std::string detected_locale_str = tab_contents_?

+ tab_contents_->language_state().original_language() : std::string();  // Without a tab there is no detected language.

+ icu::Locale detected_locale = ConvertToICULocale(detected_locale_str);

+ std::string result_language;

+ std::string result_region;

+ // If we got a non-empty locale from the DOM, use that. Moreover, we don't

+ // trust the locales "en" and "en-US" -- these can occur with international

+ // forms due to copy/pasted or otherwise careless HTML. We'll ignore them

+ // for now, but come back to them if we find no other useful signals.

+ if ((strlen(dom_locale.getLanguage()) != 0) &&

+ dom_locale != icu::Locale::getEnglish() &&

+ dom_locale != icu::Locale::getUS()) {

+ result_language = dom_locale.getLanguage();

+ result_region = dom_locale.getCountry();

+ }

+ // If we didn't get a language from the DOM, use the CLD-detected locale.

+ // If we did get a language but not a region from the DOM, try using the

+ // CLD-detected region.

+ if (result_language.empty()) {

+ result_language = detected_locale.getLanguage();

+ result_region = detected_locale.getCountry();

+ } else if (result_language == detected_locale.getLanguage() &&

+ result_region.empty()) {

+ result_region = detected_locale.getCountry();

+ }

+ // If we still don't have a region, see if we can guess it based on the url's

+ // registry. Try the form's source url first, then the target url, then the

+ // tab's registry (which might be different if the form is being framed).

+ if (result_region.empty()) {

+ result_region = RegionFromRegistry(ExtractRegistry(form->source_url()),

+ &result_language);  // May also fill in |result_language| if it is still empty.

+ }

+ if (result_region.empty()) {

+ result_region = RegionFromRegistry(ExtractRegistry(form->target_url()),

+ &result_language);

+ }

+ if (result_region.empty() && tab_contents_) {

+ result_region = RegionFromRegistry(ExtractRegistry(tab_contents_->GetURL()),

+ &result_language);

+ }

+ // If we _still_ don't have a region, see if we ignored an untrusted region --

+ // e.g. "en-US" -- specified by the DOM. Only do this if our

+ // current guess at the language matches what the DOM specified.

+ if (result_region.empty() && (result_language.empty() ||

+ result_language == dom_locale.getLanguage())){

+ result_language = dom_locale.getLanguage();

+ result_region = dom_locale.getCountry();

+ }

+ form->set_locale(

+ LanguageAndRegionToLocaleCode(result_language, result_region));  // Yields "en-US" if no language was found.

« no previous file with comments | « chrome/browser/autofill/autofill_locale_model.h ('k') | chrome/browser/autofill/autofill_locale_model_unittest.cc » ('j') | no next file with comments »