Index: chrome/browser/autofill/autofill_locale_model.cc |
diff --git a/chrome/browser/autofill/autofill_locale_model.cc b/chrome/browser/autofill/autofill_locale_model.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..253744ff55790ab408558aa865d4480558724ae5 |
--- /dev/null |
+++ b/chrome/browser/autofill/autofill_locale_model.cc |
@@ -0,0 +1,245 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "chrome/browser/autofill/autofill_locale_model.h" |
+ |
+#include "base/string_util.h" |
+#include "chrome/browser/autofill/form_structure.h" |
+#include "chrome/browser/tab_contents/language_state.h" |
+#include "chrome/browser/tab_contents/tab_contents.h" |
+#include "chrome/common/chrome_constants.h" |
+#include "net/base/registry_controlled_domain.h" |
+#include "third_party/icu/public/common/unicode/locid.h" |
+#include "third_party/icu/public/common/unicode/uloc.h" |
+ |
+namespace { |
+ |
+// Associates a known registry with its typical region and language list. |
+struct AutoFillRegistry { |
+ const char* const registry; |
+ const char* const region; |
+ const char* const languages; |
+}; |
+ |
+// An entry's languages are comma-separated. TODO(isherman): Expand this list. |
+const AutoFillRegistry kAutoFillRegistries[] = { |
+ {"au", "AU", "en"}, |
+ {"br", "BR", "pt"}, |
+ {"ca", "CA", "fr, en"}, |
+ {"co.jp", "JP", "ja"}, |
+ {"co.il", "IL", "he"}, |
+ {"co.nz", "NZ", "en"}, |
+ {"co.uk", "GB", "en"}, |
+ {"com.au", "AU", "en"}, |
+ {"com.br", "BR", "pt"}, |
+ {"com.mx", "MX", "es"}, |
+ {"il", "IL", "he"}, |
+ {"it", "IT", "it"}, |
+ {"jp", "JP", "ja"}, |
+ {"fr", "FR", "fr"}, |
+ {"mx", "MX", "es"}, |
+ {"ru", "RU", "ru"}, |
+}; |
+ |
+// Returns the lower-cased registry (roughly, top level domain) for |url|, or an |
+// empty string if GetRegistryLength() reports a length of 0 or npos. |
+std::string ExtractRegistry(const GURL& url) { |
+  typedef net::RegistryControlledDomainService DomainService; |
+  const size_t registry_length = DomainService::GetRegistryLength(url, false); |
+  const bool has_registry = |
+      registry_length != 0 && registry_length != std::string::npos; |
+  if (!has_registry) |
+    return std::string(); |
+  const std::string host = DomainService::GetDomainAndRegistry(url); |
+  DCHECK_LT(registry_length, host.size()); |
+  return StringToLowerASCII(host.substr(host.size() - registry_length)); |
+} |
+ |
+// Converts |locale_id| into a canonical ICU |Locale|, with a couple of fixups: |
+//  * An empty |locale_id| returns an empty result, rather than ICU's default |
+//    canonicalization to "en-US". |
+//  * An unknown language or region is replaced by an empty one, rather than |
+//    keeping ICU's default canonicalization to a bogus locale. |
+icu::Locale ConvertToICULocale(const std::string& locale_id) { |
+  if (locale_id.empty()) |
+    return icu::Locale(""); |
+ |
+  // We aren't interested in locale keywords, so the only valid characters in |
+  // the locale string are ASCII letters, hyphen, and underscore.  See |
+  // http://userguide.icu-project.org/locale#TOC-Canonicalization |
+  // The letter test is explicit because isalpha() is locale-dependent and has |
+  // undefined behavior for negative |char| values, i.e. non-ASCII bytes. |
+  std::string sanitized; |
+  for (size_t i = 0; i < locale_id.size(); ++i) { |
+    const char c = locale_id[i]; |
+    const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); |
+    if (is_letter || c == '-' || c == '_') |
+      sanitized.push_back(c); |
+  } |
+ |
+  const icu::Locale canonical = icu::Locale::createCanonical(sanitized.c_str()); |
+ |
+  std::string language = canonical.getLanguage(); |
+  if (!AutoFillLocaleModel::IsValidLanguageTag(language)) |
+    language = std::string(); |
+ |
+  std::string region = canonical.getCountry(); |
+  if (!AutoFillLocaleModel::IsValidRegionTag(region)) |
+    region = std::string(); |
+ |
+  return icu::Locale(language.c_str(), region.c_str()); |
+} |
+ |
+// Joins |language| and |region| as "language-region" (or just |language| when |
+// |region| is empty).  An empty |language| yields the default "en-US". |
+std::string LanguageAndRegionToLocaleCode(const std::string& language, |
+                                          const std::string& region) { |
+  // A region that comes without a language is never expected here. |
+  DCHECK(!language.empty() || region.empty()); |
+ |
+  if (language.empty()) |
+    return "en-US"; |
+ |
+  std::string locale_code(language); |
+  if (!region.empty()) |
+    locale_code += "-" + region; |
+  return locale_code; |
+} |
+ |
+} // namespace |
+ |
+AutoFillLocaleModel::AutoFillLocaleModel(const TabContents* tab_contents) |
+    : tab_contents_(tab_contents), |
+      tab_language_determined_(false) { |
+  // Index the static |kAutoFillRegistries| table by registry for lookup. |
+  for (size_t i = 0; i < arraysize(kAutoFillRegistries); ++i) { |
+    const AutoFillRegistry& entry = kAutoFillRegistries[i]; |
+    AutoFillRegistryInfo registry_info; |
+    registry_info.region = entry.region; |
+    SplitString(entry.languages, ',', &registry_info.languages); |
+    registries_[entry.registry] = registry_info; |
+  } |
+} |
+ |
+AutoFillLocaleModel::~AutoFillLocaleModel() { |
+} |
+ |
+// static |
+bool AutoFillLocaleModel::IsValidLanguageTag(const std::string& language_tag) { |
+  // Special case: although "und" is an officially registered language tag (for |
+  // "Undetermined"), it is not accepted as a valid language here. |
+  if (language_tag == chrome::kUnknownLanguageCode) |
+    return false; |
+ |
+  // uloc_getISOLanguages() yields a NULL-terminated array of C-strings. |
+  const char* const* iso_language = uloc_getISOLanguages(); |
+  for (; *iso_language; ++iso_language) { |
+    if (language_tag == *iso_language) |
+      return true; |
+  } |
+  return false; |
+} |
+ |
+// static |
+bool AutoFillLocaleModel::IsValidRegionTag(const std::string& region_tag) { |
+  // uloc_getISOCountries() yields a NULL-terminated array of C-strings. |
+  const char* const* iso_region = uloc_getISOCountries(); |
+  for (; *iso_region; ++iso_region) { |
+    if (region_tag == *iso_region) |
+      return true; |
+  } |
+  return false; |
+} |
+ |
+// Returns the region associated with |registry| if |guessed_language| is empty |
+// or is one of the languages associated with |registry|; otherwise, returns an |
+// empty string.  An empty |guessed_language| is set to the first such language. |
+std::string AutoFillLocaleModel::RegionFromRegistry( |
+    const std::string& registry, |
+    std::string* guessed_language) const { |
+  AutoFillRegistryMap::const_iterator entry = registries_.find(registry); |
+  if (entry == registries_.end()) |
+    return std::string(); |
+ |
+  const AutoFillRegistryInfo& registry_info = entry->second; |
+  const size_t language_count = registry_info.languages.size(); |
+  // With no prior guess, adopt the first language listed for the registry; |
+  // hence the order in which the languages are listed matters. |
+  if (guessed_language->empty() && language_count > 0) { |
+    *guessed_language = registry_info.languages[0]; |
+    return registry_info.region; |
+  } |
+ |
+  // Otherwise, the current guess must match one of the registry's languages. |
+  for (size_t i = 0; i < language_count; ++i) { |
+    if (registry_info.languages[i] == *guessed_language) |
+      return registry_info.region; |
+  } |
+  return std::string(); |
+} |
+ |
+// TODO(isherman): We use the page's original language to detect the form's |
+// language.  We might want to also use the page's current language (after |
+// translation), e.g. to help us figure out the address's language. |
+void AutoFillLocaleModel::UpdateLocale(FormStructure* form) const { |
+  DCHECK(tab_language_determined_); |
+ |
+  const icu::Locale dom_locale = ConvertToICULocale(form->locale()); |
+  const std::string dom_language = dom_locale.getLanguage(); |
+  const std::string dom_region = dom_locale.getCountry(); |
+ |
+  // Without a tab, there is no detected language to consult. |
+  std::string detected_locale_str; |
+  if (tab_contents_) |
+    detected_locale_str = tab_contents_->language_state().original_language(); |
+  const icu::Locale detected_locale = ConvertToICULocale(detected_locale_str); |
+  const std::string detected_language = detected_locale.getLanguage(); |
+  const std::string detected_region = detected_locale.getCountry(); |
+ |
+  // Start from the DOM locale when it is non-empty.  However, we don't trust |
+  // the locales "en" and "en-US" -- these can occur with international forms |
+  // due to copy/pasted or otherwise careless HTML.  We'll ignore them for now, |
+  // but come back to them if we find no other useful signals. |
+  const bool dom_locale_is_trusted = dom_locale != icu::Locale::getEnglish() && |
+                                     dom_locale != icu::Locale::getUS(); |
+  std::string language; |
+  std::string region; |
+  if (!dom_language.empty() && dom_locale_is_trusted) { |
+    language = dom_language; |
+    region = dom_region; |
+  } |
+ |
+  // If we didn't get a language from the DOM, use the CLD-detected locale. |
+  // If we did get a language but not a region from the DOM, try using the |
+  // CLD-detected region, provided that the two languages agree. |
+  if (language.empty()) { |
+    language = detected_language; |
+    region = detected_region; |
+  } else if (region.empty() && language == detected_language) { |
+    region = detected_region; |
+  } |
+ |
+  // If we still don't have a region, see if we can guess it based on a url's |
+  // registry.  Try the form's source url first, then the target url, then the |
+  // tab's url (which might be different if the form is being framed). |
+  if (region.empty()) |
+    region = RegionFromRegistry(ExtractRegistry(form->source_url()), &language); |
+  if (region.empty()) |
+    region = RegionFromRegistry(ExtractRegistry(form->target_url()), &language); |
+  if (region.empty() && tab_contents_) { |
+    const std::string tab_registry = ExtractRegistry(tab_contents_->GetURL()); |
+    region = RegionFromRegistry(tab_registry, &language); |
+  } |
+ |
+  // If we _still_ don't have a region, fall back to the untrusted DOM locale |
+  // -- e.g. "en-US" -- that we may have ignored above.  Only do this if our |
+  // current guess at the language does not contradict what the DOM specified. |
+  const bool language_agrees = language.empty() || language == dom_language; |
+  if (region.empty() && language_agrees) { |
+    language = dom_language; |
+    region = dom_region; |
+  } |
+ |
+  form->set_locale(LanguageAndRegionToLocaleCode(language, region)); |
+} |