| Index: chrome/browser/autofill/autofill_locale_model.cc
|
| diff --git a/chrome/browser/autofill/autofill_locale_model.cc b/chrome/browser/autofill/autofill_locale_model.cc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..253744ff55790ab408558aa865d4480558724ae5
|
| --- /dev/null
|
| +++ b/chrome/browser/autofill/autofill_locale_model.cc
|
| @@ -0,0 +1,245 @@
|
| +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include "chrome/browser/autofill/autofill_locale_model.h"
|
| +
|
| +#include "base/string_util.h"
|
| +#include "chrome/browser/autofill/form_structure.h"
|
| +#include "chrome/browser/tab_contents/language_state.h"
|
| +#include "chrome/browser/tab_contents/tab_contents.h"
|
| +#include "chrome/common/chrome_constants.h"
|
| +#include "net/base/registry_controlled_domain.h"
|
| +#include "third_party/icu/public/common/unicode/locid.h"
|
| +#include "third_party/icu/public/common/unicode/uloc.h"
|
| +
|
| +namespace {
|
| +
|
| +// Associates known registries with their typical regions and languages.
|
| +struct AutoFillRegistry {
|
| + const char* const registry;  // Lookup key, e.g. "co.uk".
|
| + const char* const region;  // Region paired with the registry, e.g. "GB".
|
| + const char* const languages;  // Comma-separated codes, e.g. "fr, en".
|
| +};
|
| +
|
| +// TODO(isherman): Expand this list.
|
| +const AutoFillRegistry kAutoFillRegistries[] = {  // {registry, region, languages}
|
| + {"au", "AU", "en"},
|
| + {"br", "BR", "pt"},
|
| + {"ca", "CA", "fr, en"},  // Order matters: the first language is the default guess.
|
| + {"co.jp", "JP", "ja"},
|
| + {"co.il", "IL", "he"},
|
| + {"co.nz", "NZ", "en"},
|
| + {"co.uk", "GB", "en"},
|
| + {"com.au", "AU", "en"},
|
| + {"com.br", "BR", "pt"},
|
| + {"com.mx", "MX", "es"},
|
| + {"il", "IL", "he"},
|
| + {"it", "IT", "it"},
|
| + {"jp", "JP", "ja"},
|
| + {"fr", "FR", "fr"},
|
| + {"mx", "MX", "es"},
|
| + {"ru", "RU", "ru"},
|
| +};
|
| +
|
| +// Returns the lower-cased registry (roughly, the top level domain) of |url|.
|
| +// Yields an empty string when the reported registry length is 0 or npos.
|
| +std::string ExtractRegistry(const GURL& url) {
|
| +  typedef net::RegistryControlledDomainService DomainService;
|
| +
|
| +  const size_t length = DomainService::GetRegistryLength(url, false);
|
| +  const bool has_registry = (length != 0) && (length != std::string::npos);
|
| +  if (!has_registry)
|
| +    return std::string();
|
| +
|
| +  // The registry is the trailing |length| characters of the combined
|
| +  // domain-and-registry string.
|
| +  const std::string domain = DomainService::GetDomainAndRegistry(url);
|
| +  DCHECK_LT(length, domain.size());
|
| +
|
| +  const std::string registry(domain, domain.size() - length);
|
| +  return StringToLowerASCII(registry);
|
| +}
|
| +
|
| +// Converts |locale_id| into a canonical ICU |Locale|, with a few fixups:
|
| +//  * An empty |locale_id| -- or one with no usable characters left after
|
| +//    sanitizing -- returns an empty result, rather than relying on ICU's
|
| +//    handling of an empty identifier.
|
| +//  * An unknown language yields an empty result, and an unknown region is
|
| +//    dropped, rather than ICU's default canonicalization to a bogus locale.
|
| +//  * A region is never returned without a language, so callers can rely on
|
| +//    "non-empty region implies non-empty language".
|
| +icu::Locale ConvertToICULocale(const std::string& locale_id) {
|
| +  if (locale_id.empty())
|
| +    return icu::Locale("");
|
| +
|
| +  // We aren't interested in locale keywords, so the only valid characters in
|
| +  // the locale string are ASCII letters, hyphen, and underscore. See
|
| +  // http://userguide.icu-project.org/locale#TOC-Canonicalization
|
| +  //
|
| +  // The letters are tested with explicit ranges instead of isalpha():
|
| +  // isalpha() is locale-dependent and has undefined behavior when handed a
|
| +  // negative |char|, which any non-ASCII byte is on signed-char platforms.
|
| +  std::string sanitized_locale;
|
| +  sanitized_locale.reserve(locale_id.size());
|
| +  for (size_t i = 0; i < locale_id.size(); ++i) {
|
| +    const char c = locale_id[i];
|
| +    const bool is_ascii_letter =
|
| +        (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
|
| +    if (is_ascii_letter || c == '-' || c == '_')
|
| +      sanitized_locale.push_back(c);
|
| +  }
|
| +
|
| +  // Nothing usable survived sanitizing; treat it like an empty |locale_id|.
|
| +  if (sanitized_locale.empty())
|
| +    return icu::Locale("");
|
| +
|
| +  icu::Locale canonical_locale =
|
| +      icu::Locale::createCanonical(sanitized_locale.c_str());
|
| +
|
| +  std::string language = canonical_locale.getLanguage();
|
| +  if (!AutoFillLocaleModel::IsValidLanguageTag(language))
|
| +    language.clear();
|
| +
|
| +  // A region is only meaningful alongside a valid language; discard it
|
| +  // otherwise so we never hand back a language-less locale such as "_US".
|
| +  std::string region = canonical_locale.getCountry();
|
| +  if (language.empty() || !AutoFillLocaleModel::IsValidRegionTag(region))
|
| +    region.clear();
|
| +
|
| +  return icu::Locale(language.c_str(), region.c_str());
|
| +}
|
| +
|
| +// Joins |language| and |region| into one locale code of the form
|
| +// "language-region", or just "language" when |region| is empty.
|
| +// Falls back to "en-US" when |language| is empty.
|
| +std::string LanguageAndRegionToLocaleCode(const std::string& language,
|
| +                                          const std::string& region) {
|
| +  // A region without a language is never expected here.
|
| +  DCHECK(!(language.empty() && !region.empty()));
|
| +
|
| +  if (language.empty())
|
| +    return "en-US";
|
| +
|
| +  std::string locale_code(language);
|
| +  if (!region.empty()) {
|
| +    locale_code += "-";
|
| +    locale_code += region;
|
| +  }
|
| +  return locale_code;
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| +// Stores |tab_contents| and builds the registry lookup map from the static
|
| +// |kAutoFillRegistries| table. The tab language starts out undetermined.
|
| +AutoFillLocaleModel::AutoFillLocaleModel(const TabContents* tab_contents)
|
| +    : tab_contents_(tab_contents),
|
| +      tab_language_determined_(false) {
|
| +  const size_t registry_count = arraysize(kAutoFillRegistries);
|
| +  for (size_t index = 0; index < registry_count; ++index) {
|
| +    const AutoFillRegistry& entry = kAutoFillRegistries[index];
|
| +
|
| +    // Split the comma-separated language list into individual codes.
|
| +    AutoFillRegistryInfo registry_info;
|
| +    registry_info.region = entry.region;
|
| +    SplitString(entry.languages, ',', &registry_info.languages);
|
| +
|
| +    registries_[entry.registry] = registry_info;
|
| +  }
|
| +}
|
| +
|
| +// Nothing to release explicitly.
|
| +AutoFillLocaleModel::~AutoFillLocaleModel() {}
|
| +
|
| +// static
|
| +// Returns true if |language_tag| appears in ICU's list of ISO languages and
|
| +// is not the "unknown language" code.
|
| +bool AutoFillLocaleModel::IsValidLanguageTag(const std::string& language_tag) {
|
| +  // "und" is an officially registered tag meaning "Undetermined"; it is
|
| +  // explicitly rejected here.
|
| +  if (language_tag == chrome::kUnknownLanguageCode)
|
| +    return false;
|
| +
|
| +  // ICU hands back a NULL-terminated array of C-strings; walk it until the
|
| +  // terminator looking for an exact match.
|
| +  const char* const* entry = uloc_getISOLanguages();
|
| +  while (*entry) {
|
| +    if (*entry == language_tag)
|
| +      return true;
|
| +    ++entry;
|
| +  }
|
| +  return false;
|
| +}
|
| +
|
| +// static
|
| +// Returns true if |region_tag| appears in ICU's list of ISO countries.
|
| +bool AutoFillLocaleModel::IsValidRegionTag(const std::string& region_tag) {
|
| +  // ICU hands back a NULL-terminated array of C-strings; walk it until the
|
| +  // terminator looking for an exact match.
|
| +  const char* const* entry = uloc_getISOCountries();
|
| +  while (*entry) {
|
| +    if (*entry == region_tag)
|
| +      return true;
|
| +    ++entry;
|
| +  }
|
| +  return false;
|
| +}
|
| +
|
| +// Looks up |registry| in the registry map and returns its region when the
|
| +// current language guess is compatible with it:
|
| +//  * If |guessed_language| is empty, it is set to the first language listed
|
| +//    for the registry and the registry's region is returned.
|
| +//  * If |guessed_language| is one of the registry's languages, the
|
| +//    registry's region is returned.
|
| +// In every other case (unknown registry, no listed languages, or a language
|
| +// mismatch) an empty string is returned and |guessed_language| is untouched.
|
| +std::string AutoFillLocaleModel::RegionFromRegistry(
|
| +    const std::string& registry,
|
| +    std::string* guessed_language) const {
|
| +  const AutoFillRegistryMap::const_iterator entry = registries_.find(registry);
|
| +  if (entry == registries_.end())
|
| +    return std::string();
|
| +
|
| +  const AutoFillRegistryInfo& registry_info = entry->second;
|
| +  const size_t language_count = registry_info.languages.size();
|
| +  if (language_count == 0)
|
| +    return std::string();
|
| +
|
| +  // With no prior guess, adopt the registry's first language. This is why
|
| +  // the order in which the languages are listed matters.
|
| +  if (guessed_language->empty()) {
|
| +    *guessed_language = registry_info.languages[0];
|
| +    return registry_info.region;
|
| +  }
|
| +
|
| +  // Otherwise the existing guess has to match one of the languages spoken in
|
| +  // the region.
|
| +  for (size_t index = 0; index < language_count; ++index) {
|
| +    if (registry_info.languages[index] == *guessed_language)
|
| +      return registry_info.region;
|
| +  }
|
| +
|
| +  return std::string();
|
| +}
|
| +
|
| +// Works out the best-guess locale for |form| and stores it on the form via
|
| +// set_locale(). Signals are consulted in this order: the locale already
|
| +// recorded on the form (from the DOM), the tab's originally detected
|
| +// language, and finally the registries of the URLs involved.
|
| +//
|
| +// TODO(isherman): We use the page's original language to detect the form's
|
| +// language. We might want to also use the page's current language (after
|
| +// translation), e.g. to help us figure out the address's language.
|
| +void AutoFillLocaleModel::UpdateLocale(FormStructure* form) const {
|
| +  DCHECK(tab_language_determined_);
|
| +
|
| +  icu::Locale dom_locale = ConvertToICULocale(form->locale());
|
| +  const std::string dom_language = dom_locale.getLanguage();
|
| +  const std::string dom_region = dom_locale.getCountry();
|
| +
|
| +  std::string detected_locale_id;
|
| +  if (tab_contents_)
|
| +    detected_locale_id = tab_contents_->language_state().original_language();
|
| +  icu::Locale detected_locale = ConvertToICULocale(detected_locale_id);
|
| +  const std::string detected_language = detected_locale.getLanguage();
|
| +  const std::string detected_region = detected_locale.getCountry();
|
| +
|
| +  // A non-empty DOM locale is preferred, except for "en" and "en-US": these
|
| +  // can show up on international forms due to copy/pasted or otherwise
|
| +  // careless HTML. They are set aside for now and revisited at the end if no
|
| +  // other useful signal turns up.
|
| +  const bool dom_is_generic_english =
|
| +      dom_locale == icu::Locale::getEnglish() ||
|
| +      dom_locale == icu::Locale::getUS();
|
| +  const bool trust_dom_locale =
|
| +      !dom_language.empty() && !dom_is_generic_english;
|
| +
|
| +  std::string language;
|
| +  std::string region;
|
| +  if (!trust_dom_locale) {
|
| +    // No usable language from the DOM: use the CLD-detected locale.
|
| +    language = detected_language;
|
| +    region = detected_region;
|
| +  } else {
|
| +    language = dom_language;
|
| +    region = dom_region;
|
| +    // The DOM gave a language but no region: borrow the CLD-detected region,
|
| +    // provided both sources agree on the language.
|
| +    if (region.empty() && language == detected_language)
|
| +      region = detected_region;
|
| +  }
|
| +
|
| +  // Still no region: try to guess it from a URL registry. The form's source
|
| +  // url is tried first, then the target url, then the tab's url (which might
|
| +  // differ if the form is being framed).
|
| +  if (region.empty())
|
| +    region = RegionFromRegistry(ExtractRegistry(form->source_url()), &language);
|
| +  if (region.empty())
|
| +    region = RegionFromRegistry(ExtractRegistry(form->target_url()), &language);
|
| +  if (region.empty() && tab_contents_) {
|
| +    region = RegionFromRegistry(ExtractRegistry(tab_contents_->GetURL()),
|
| +                                &language);
|
| +  }
|
| +
|
| +  // _Still_ no region: fall back to whatever the DOM specified, including an
|
| +  // untrusted locale such as "en-US" that was set aside above. Only do this
|
| +  // when the current language guess is empty or agrees with the DOM.
|
| +  const bool language_agrees_with_dom =
|
| +      language.empty() || language == dom_language;
|
| +  if (region.empty() && language_agrees_with_dom) {
|
| +    language = dom_language;
|
| +    region = dom_region;
|
| +  }
|
| +
|
| +  form->set_locale(LanguageAndRegionToLocaleCode(language, region));
|
| +}
|
|
|