Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(242)

Unified Diff: chrome/browser/autofill/autofill_locale_model.cc

Issue 3226001: Detecting form locale (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Unit test for top websites Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/autofill/autofill_locale_model.cc
diff --git a/chrome/browser/autofill/autofill_locale_model.cc b/chrome/browser/autofill/autofill_locale_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..253744ff55790ab408558aa865d4480558724ae5
--- /dev/null
+++ b/chrome/browser/autofill/autofill_locale_model.cc
@@ -0,0 +1,245 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/autofill/autofill_locale_model.h"
+
+#include "base/string_util.h"
+#include "chrome/browser/autofill/form_structure.h"
+#include "chrome/browser/tab_contents/language_state.h"
+#include "chrome/browser/tab_contents/tab_contents.h"
+#include "chrome/common/chrome_constants.h"
+#include "net/base/registry_controlled_domain.h"
+#include "third_party/icu/public/common/unicode/locid.h"
+#include "third_party/icu/public/common/unicode/uloc.h"
+
+namespace {
+
+// Associates known registries with their typical regions and languages.
+struct AutoFillRegistry {
+ // Registry suffix used as the lookup key, e.g. "co.uk".
+ const char* const registry;
+ // Region code paired with the registry, e.g. "GB".
+ const char* const region;
+ // Comma-separated language codes for the region, e.g. "fr, en".
+ const char* const languages;
+};
+
+// Table of registries consulted when guessing a form's region. Each entry is
+// {registry, region, languages}, where |languages| is a comma-separated list.
+// The order of the languages within an entry matters: when there is no prior
+// language guess, the first listed language is adopted (e.g. "fr" for "ca").
+// TODO(isherman): Expand this list.
+const AutoFillRegistry kAutoFillRegistries[] = {
+ {"au", "AU", "en"},
+ {"br", "BR", "pt"},
+ {"ca", "CA", "fr, en"},
+ {"co.jp", "JP", "ja"},
+ {"co.il", "IL", "he"},
+ {"co.nz", "NZ", "en"},
+ {"co.uk", "GB", "en"},
+ {"com.au", "AU", "en"},
+ {"com.br", "BR", "pt"},
+ {"com.mx", "MX", "es"},
+ {"il", "IL", "he"},
+ {"it", "IT", "it"},
+ {"jp", "JP", "ja"},
+ {"fr", "FR", "fr"},
+ {"mx", "MX", "es"},
+ {"ru", "RU", "ru"},
+};
+
+// Returns the lower-cased registry (roughly, the top level domain) of |url|.
+// Returns an empty string when GetRegistryLength() reports a length of 0 or
+// std::string::npos for |url|.
+std::string ExtractRegistry(const GURL& url) {
+  const size_t length =
+      net::RegistryControlledDomainService::GetRegistryLength(url, false);
+  const bool has_registry = (length != 0 && length != std::string::npos);
+  if (!has_registry)
+    return std::string();
+
+  // The registry is the trailing |length| characters of the domain.
+  const std::string domain =
+      net::RegistryControlledDomainService::GetDomainAndRegistry(url);
+  DCHECK_LT(length, domain.size());
+
+  const size_t registry_start = domain.size() - length;
+  return StringToLowerASCII(domain.substr(registry_start));
+}
+
+// Converts |locale_id| into a canonical ICU |Locale|, with a couple of fixups:
+//  * An empty |locale_id| returns an empty result, rather than ICU's default
+//    canonicalization to "en-US".
+//  * A |locale_id| with an unknown language returns an empty result, and an
+//    unknown region is dropped from the result, rather than ICU's default
+//    canonicalization to a bogus locale.
+// The returned locale therefore never has a region without a language.
+icu::Locale ConvertToICULocale(const std::string& locale_id) {
+  if (locale_id.empty())
+    return icu::Locale("");
+
+  // We aren't interested in locale keywords, so the only valid characters in
+  // the locale string are ASCII letters, hyphen, and underscore.  See
+  // http://userguide.icu-project.org/locale#TOC-Canonicalization
+  //
+  // Letters are matched with explicit ASCII ranges rather than isalpha():
+  // isalpha() is locale-dependent, and passing it a plain |char| that holds a
+  // negative (non-ASCII) value is undefined behavior.
+  std::string sanitized_locale;
+  sanitized_locale.reserve(locale_id.size());
+  for (size_t i = 0; i < locale_id.size(); ++i) {
+    const char c = locale_id[i];
+    const bool is_ascii_letter =
+        (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    if (is_ascii_letter || c == '-' || c == '_')
+      sanitized_locale.push_back(c);
+  }
+
+  const icu::Locale canonical_locale =
+      icu::Locale::createCanonical(sanitized_locale.c_str());
+
+  // A region is meaningless without a language.  Returning one anyway would
+  // let a region-only locale reach code that asserts a non-empty region always
+  // comes with a non-empty language, so bail out with an empty locale instead.
+  const std::string language = canonical_locale.getLanguage();
+  if (!AutoFillLocaleModel::IsValidLanguageTag(language))
+    return icu::Locale("");
+
+  std::string region = canonical_locale.getCountry();
+  if (!AutoFillLocaleModel::IsValidRegionTag(region))
+    region.clear();
+
+  return icu::Locale(language.c_str(), region.c_str());
+}
+
+// Joins |language| and |region| into a single locale code of the form
+// "language-region", or just "language" when |region| is empty.
+// Falls back to "en-US" when |language| is empty.
+std::string LanguageAndRegionToLocaleCode(const std::string& language,
+                                          const std::string& region) {
+  // A region is only expected alongside a language.
+  DCHECK(!language.empty() || region.empty());
+
+  if (language.empty())
+    return "en-US";
+
+  std::string locale_code(language);
+  if (!region.empty()) {
+    locale_code += "-";
+    locale_code += region;
+  }
+  return locale_code;
+}
+
+} // namespace
+
+// Stores |tab_contents|, marks the tab language as not yet determined, and
+// builds the registry lookup map from the static |kAutoFillRegistries| table.
+AutoFillLocaleModel::AutoFillLocaleModel(const TabContents* tab_contents)
+    : tab_contents_(tab_contents),
+      tab_language_determined_(false) {
+  for (size_t i = 0; i < arraysize(kAutoFillRegistries); ++i) {
+    const AutoFillRegistry& entry = kAutoFillRegistries[i];
+
+    // Split the comma-separated language list into individual codes.
+    AutoFillRegistryInfo info;
+    info.region = entry.region;
+    SplitString(entry.languages, ',', &info.languages);
+
+    registries_[entry.registry] = info;
+  }
+}
+
+// The destructor body is empty; no explicit cleanup is performed here.
+AutoFillLocaleModel::~AutoFillLocaleModel() {
+}
+
+// Returns true if |language_tag| appears in ICU's list of ISO language codes.
+// The "Undetermined" tag is always rejected.
+// static
+bool AutoFillLocaleModel::IsValidLanguageTag(const std::string& language_tag) {
+  // Special case: "und" is an officially registered language tag for
+  // "Undetermined", so it is rejected up front.
+  if (language_tag == chrome::kUnknownLanguageCode)
+    return false;
+
+  // uloc_getISOLanguages() yields a NULL-terminated array of C-strings.
+  const char* const* candidate = uloc_getISOLanguages();
+  while (*candidate) {
+    if (*candidate == language_tag)
+      return true;
+    ++candidate;
+  }
+  return false;
+}
+
+// Returns true if |region_tag| appears in ICU's list of ISO country codes.
+// static
+bool AutoFillLocaleModel::IsValidRegionTag(const std::string& region_tag) {
+  // uloc_getISOCountries() yields a NULL-terminated array of C-strings.
+  const char* const* candidate = uloc_getISOCountries();
+  while (*candidate) {
+    if (*candidate == region_tag)
+      return true;
+    ++candidate;
+  }
+  return false;
+}
+
+// Looks up |registry| in the registry map.  If |guessed_language| is empty, it
+// is set to the first language listed for |registry| and the associated region
+// is returned.  If |guessed_language| is non-empty and is one of the languages
+// listed for |registry|, the associated region is returned.  Otherwise --
+// including when |registry| is unknown -- returns an empty string.
+std::string AutoFillLocaleModel::RegionFromRegistry(
+    const std::string& registry,
+    std::string* guessed_language) const {
+  const AutoFillRegistryMap::const_iterator entry = registries_.find(registry);
+  if (entry == registries_.end())
+    return std::string();
+
+  const AutoFillRegistryInfo& info = entry->second;
+  if (info.languages.empty())
+    return std::string();
+
+  // With no prior guess, adopt the first listed language.  This is why the
+  // order in which the languages are listed matters.
+  if (guessed_language->empty()) {
+    *guessed_language = info.languages[0];
+    return info.region;
+  }
+
+  // Otherwise, only accept the registry's region if our current guess at the
+  // language is one of the languages listed for it.
+  for (size_t i = 0; i < info.languages.size(); ++i) {
+    if (info.languages[i] == *guessed_language)
+      return info.region;
+  }
+  return std::string();
+}
+
+// Determines the locale for |form| and stores it via form->set_locale().
+// Signals are consulted in this order:
+//  1. The locale declared in the DOM, unless it is "en" or "en-US".
+//  2. The original page language reported by the tab's language state.
+//  3. The registries of the form's source url, the form's target url, and the
+//     tab's url, used to guess a missing region (and language, if unknown).
+//  4. The DOM locale that was ignored in step 1.
+// If no language is found, the stored locale defaults to "en-US".
+//
+// TODO(isherman): We use the page's original language to detect the form's
+// language. We might want to also use the page's current language (after
+// translation), e.g. to help us figure out the address's language.
+void AutoFillLocaleModel::UpdateLocale(FormStructure* form) const {
+ DCHECK(tab_language_determined_);
+
+ // |dom_locale| comes from the form itself; |detected_locale| comes from the
+ // tab's language state, and is empty when there is no tab.
+ icu::Locale dom_locale = ConvertToICULocale(form->locale());
+ const std::string detected_locale_str = tab_contents_?
+ tab_contents_->language_state().original_language() : std::string();
+ icu::Locale detected_locale = ConvertToICULocale(detected_locale_str);
+
+ std::string result_language;
+ std::string result_region;
+
+ // If we got a non-empty locale from the DOM, use that. Moreover, we don't
+ // trust the locales "en" and "en-US" -- these can occur with international
+ // forms due to copy/pasted or otherwise careless HTML. We'll ignore them
+ // for now, but come back to them if we find no other useful signals.
+ if ((strlen(dom_locale.getLanguage()) != 0) &&
+ dom_locale != icu::Locale::getEnglish() &&
+ dom_locale != icu::Locale::getUS()) {
+ result_language = dom_locale.getLanguage();
+ result_region = dom_locale.getCountry();
+ }
+
+ // If we didn't get a language from the DOM, use the CLD-detected locale.
+ // If we did get a language but not a region from the DOM, try using the
+ // CLD-detected region, provided the two languages agree.
+ if (result_language.empty()) {
+ result_language = detected_locale.getLanguage();
+ result_region = detected_locale.getCountry();
+ } else if (result_language == detected_locale.getLanguage() &&
+ result_region.empty()) {
+ result_region = detected_locale.getCountry();
+ }
+
+ // If we still don't have a region, see if we can guess it based on the url's
+ // registry. Try the form's source url first, then the target url, then the
+ // tab's registry (which might be different if the form is being framed).
+ // Note that RegionFromRegistry() may also fill in |result_language| when it
+ // is still empty.
+ if (result_region.empty()) {
+ result_region = RegionFromRegistry(ExtractRegistry(form->source_url()),
+ &result_language);
+ }
+ if (result_region.empty()) {
+ result_region = RegionFromRegistry(ExtractRegistry(form->target_url()),
+ &result_language);
+ }
+ if (result_region.empty() && tab_contents_) {
+ result_region = RegionFromRegistry(ExtractRegistry(tab_contents_->GetURL()),
+ &result_language);
+ }
+
+ // If we _still_ don't have a region, see if we ignored an untrusted region --
+ // e.g. "en-US" -- specified by the DOM. Only do this if our
+ // current guess at the language matches what the DOM specified.
+ if (result_region.empty() && (result_language.empty() ||
+ result_language == dom_locale.getLanguage())){
+ result_language = dom_locale.getLanguage();
+ result_region = dom_locale.getCountry();
+ }
+
+ // An empty |result_language| at this point yields the "en-US" default.
+ form->set_locale(
+ LanguageAndRegionToLocaleCode(result_language, result_region));
+}
« no previous file with comments | « chrome/browser/autofill/autofill_locale_model.h ('k') | chrome/browser/autofill/autofill_locale_model_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698