OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "chrome/browser/autofill/autofill_locale_model.h" |
| 6 |
| 7 #include "base/string_util.h" |
| 8 #include "chrome/browser/autofill/form_structure.h" |
| 9 #include "chrome/browser/tab_contents/language_state.h" |
| 10 #include "chrome/browser/tab_contents/tab_contents.h" |
| 11 #include "chrome/common/chrome_constants.h" |
| 12 #include "net/base/registry_controlled_domain.h" |
| 13 #include "third_party/icu/public/common/unicode/locid.h" |
| 14 #include "third_party/icu/public/common/unicode/uloc.h" |
| 15 |
| 16 namespace { |
| 17 |
// Associates a known registry with its typical region and languages.
struct AutoFillRegistry {
  // Registry suffix in lowercase, without a leading dot (e.g. "co.uk").
  const char* const registry;

  // Upper-case region code associated with the registry (e.g. "GB").
  const char* const region;

  // Comma-separated list of language codes associated with the registry.
  // Order matters: when there is no prior guess at the language, the first
  // entry is used.
  const char* const languages;
};

// Table of the registries we know about.
// TODO(isherman): Expand this list.
const AutoFillRegistry kAutoFillRegistries[] = {
  { "au",     "AU", "en" },
  { "br",     "BR", "pt" },
  { "ca",     "CA", "fr, en" },
  { "co.jp",  "JP", "ja" },
  { "co.il",  "IL", "he" },
  { "co.nz",  "NZ", "en" },
  { "co.uk",  "GB", "en" },
  { "com.au", "AU", "en" },
  { "com.br", "BR", "pt" },
  { "com.mx", "MX", "es" },
  { "il",     "IL", "he" },
  { "it",     "IT", "it" },
  { "jp",     "JP", "ja" },
  { "fr",     "FR", "fr" },
  { "mx",     "MX", "es" },
  { "ru",     "RU", "ru" },
};
| 44 |
| 45 // Returns the registry (roughly, top level domain) for |url|. |
| 46 std::string ExtractRegistry(const GURL& url) { |
| 47 const size_t registry_length = |
| 48 net::RegistryControlledDomainService::GetRegistryLength(url, false); |
| 49 if (registry_length == 0 || registry_length == std::string::npos) |
| 50 return std::string(); |
| 51 |
| 52 const std::string host = |
| 53 net::RegistryControlledDomainService::GetDomainAndRegistry(url); |
| 54 DCHECK_LT(registry_length, host.size()); |
| 55 |
| 56 return StringToLowerASCII(host.substr(host.size() - registry_length)); |
| 57 } |
| 58 |
| 59 // Converts |locale_id| into a canonical ICU |Locale|, with a couple of fixups: |
| 60 // * An empty |locale_id| returns an empty result, rather than ICU's default |
| 61 // canonicalization to "en-US". |
| 62 // * Likewise, a |locale_id| with an unknown language or region returns an empty |
| 63 // result, rather than ICU's default canonicalization to a bogus locale. |
| 64 icu::Locale ConvertToICULocale(const std::string& locale_id) { |
| 65 if (locale_id.empty()) |
| 66 return icu::Locale(""); |
| 67 |
| 68 scoped_array<char> sanitized_locale(new char[locale_id.size() + 1]); |
| 69 size_t sanitized_length = 0; |
| 70 for (size_t i = 0; i < locale_id.size(); ++i) { |
| 71 // We aren't interested in locale keywords, so the only valid characters in |
| 72 // the locale string are ASCII letters, hyphen, and underscore. See |
| 73 // http://userguide.icu-project.org/locale#TOC-Canonicalization |
| 74 if (isalpha(locale_id[i]) || locale_id[i] == '-' || locale_id[i] == '_') { |
| 75 sanitized_locale.get()[sanitized_length++] = locale_id[i]; |
| 76 } |
| 77 } |
| 78 sanitized_locale.get()[sanitized_length] = '\0'; |
| 79 |
| 80 icu::Locale canonical_locale = |
| 81 icu::Locale::createCanonical(sanitized_locale.get()); |
| 82 |
| 83 std::string language = canonical_locale.getLanguage(); |
| 84 if (!AutoFillLocaleModel::IsValidLanguageTag(language)) |
| 85 language = std::string(); |
| 86 |
| 87 std::string region = canonical_locale.getCountry(); |
| 88 if (!AutoFillLocaleModel::IsValidRegionTag(region)) |
| 89 region = std::string(); |
| 90 |
| 91 return icu::Locale(language.c_str(), region.c_str()); |
| 92 } |
| 93 |
| 94 // Combines |language| and |region| into a single locale code. |
| 95 // Returns "en-US" as a default if the |language| is empty. |
| 96 std::string LanguageAndRegionToLocaleCode(const std::string& language, |
| 97 const std::string& region) { |
| 98 // We should never get a non-empty region with an empty language. |
| 99 DCHECK(!language.empty() || region.empty()); |
| 100 |
| 101 if (language.empty()) |
| 102 return "en-US"; |
| 103 |
| 104 if (region.empty()) |
| 105 return language; |
| 106 |
| 107 return (language + "-" + region); |
| 108 } |
| 109 |
| 110 } // namespace |
| 111 |
| 112 AutoFillLocaleModel::AutoFillLocaleModel(const TabContents* tab_contents) |
| 113 : tab_contents_(tab_contents), |
| 114 tab_language_determined_(false) { |
| 115 // Populate the registry map. |
| 116 for (size_t i = 0; i < arraysize(kAutoFillRegistries); ++i) { |
| 117 AutoFillRegistryInfo info; |
| 118 info.region = kAutoFillRegistries[i].region; |
| 119 SplitString(kAutoFillRegistries[i].languages, ',', &info.languages); |
| 120 |
| 121 registries_[kAutoFillRegistries[i].registry] = info; |
| 122 } |
| 123 } |
| 124 |
| 125 AutoFillLocaleModel::~AutoFillLocaleModel() { |
| 126 } |
| 127 |
| 128 // static |
| 129 bool AutoFillLocaleModel::IsValidLanguageTag(const std::string& language_tag) { |
| 130 // Special case: "und" is an officially registered language tag for |
| 131 // "Undetermined". |
| 132 if (language_tag == chrome::kUnknownLanguageCode) |
| 133 return false; |
| 134 |
| 135 // |languages| is a NULL-terminated array of C-strings |
| 136 for (const char* const* languages = uloc_getISOLanguages(); languages[0]; |
| 137 ++languages) { |
| 138 if (languages[0] == language_tag) |
| 139 return true; |
| 140 } |
| 141 return false; |
| 142 } |
| 143 |
| 144 // static |
| 145 bool AutoFillLocaleModel::IsValidRegionTag(const std::string& region_tag) { |
| 146 // |regions| is a NULL-terminated array of C-strings |
| 147 for (const char* const* regions = uloc_getISOCountries(); regions[0]; |
| 148 ++regions) { |
| 149 if (regions[0] == region_tag) |
| 150 return true; |
| 151 } |
| 152 return false; |
| 153 } |
| 154 |
| 155 // If |guessed_language| is empty or the languages associated with |registry| |
| 156 // include |guessed_language|, returns the region associated with |registry|. |
| 157 // Otherwise, returns an empty string. |
| 158 std::string AutoFillLocaleModel::RegionFromRegistry( |
| 159 const std::string& registry, |
| 160 std::string* guessed_language) const { |
| 161 AutoFillRegistryMap::const_iterator result = registries_.find(registry); |
| 162 if (result == registries_.end()) |
| 163 return std::string(); |
| 164 |
| 165 const AutoFillRegistryInfo& info = result->second; |
| 166 for (size_t i = 0; i < info.languages.size(); ++i) { |
| 167 // For each language spoken in the region, check to see if our current guess |
| 168 // at the language matches. Since we accept the first match, the order in |
| 169 // which the langauges are listed matters. If we previously had no guess |
| 170 // for the language, update the guess. |
| 171 if (guessed_language->empty()) { |
| 172 *guessed_language = info.languages[i]; |
| 173 return info.region; |
| 174 } else if (*guessed_language == info.languages[i]) { |
| 175 return info.region; |
| 176 } |
| 177 } |
| 178 |
| 179 return std::string(); |
| 180 } |
| 181 |
| 182 // TODO(isherman): We use the page's original language to detect the form's |
| 183 // language. We might want to also use the page's current language (after |
| 184 // translation), e.g. to help us figure out the address's language. |
| 185 void AutoFillLocaleModel::UpdateLocale(FormStructure* form) const { |
| 186 DCHECK(tab_language_determined_); |
| 187 |
| 188 icu::Locale dom_locale = ConvertToICULocale(form->locale()); |
| 189 const std::string detected_locale_str = tab_contents_? |
| 190 tab_contents_->language_state().original_language() : std::string(); |
| 191 icu::Locale detected_locale = ConvertToICULocale(detected_locale_str); |
| 192 |
| 193 std::string result_language; |
| 194 std::string result_region; |
| 195 |
| 196 // If we got a non-empty locale from the DOM, use that. Moreover, we don't |
| 197 // trust the locales "en" and "en-US" -- these can occur with international |
| 198 // forms due to copy/pasted or otherwise careless HTML. We'll ignore them |
| 199 // for now, but come back to them if we find no other useful signals. |
| 200 if ((strlen(dom_locale.getLanguage()) != 0) && |
| 201 dom_locale != icu::Locale::getEnglish() && |
| 202 dom_locale != icu::Locale::getUS()) { |
| 203 result_language = dom_locale.getLanguage(); |
| 204 result_region = dom_locale.getCountry(); |
| 205 } |
| 206 |
| 207 // If we didn't get a language from the DOM, use the CLD-detected locale. |
| 208 // If we did get a language but not a region form the DOM, try using the |
| 209 // CLD-detected region. |
| 210 if (result_language.empty()) { |
| 211 result_language = detected_locale.getLanguage(); |
| 212 result_region = detected_locale.getCountry(); |
| 213 } else if (result_language == detected_locale.getLanguage() && |
| 214 result_region.empty()) { |
| 215 result_region = detected_locale.getCountry(); |
| 216 } |
| 217 |
| 218 // If we still don't have a region, see if we can guess it based on the url's |
| 219 // registry. Try the form's source url first, then the target url, then the |
| 220 // tab's registry (which might be different if the form is being framed). |
| 221 if (result_region.empty()) { |
| 222 result_region = RegionFromRegistry(ExtractRegistry(form->source_url()), |
| 223 &result_language); |
| 224 } |
| 225 if (result_region.empty()) { |
| 226 result_region = RegionFromRegistry(ExtractRegistry(form->target_url()), |
| 227 &result_language); |
| 228 } |
| 229 if (result_region.empty() && tab_contents_) { |
| 230 result_region = RegionFromRegistry(ExtractRegistry(tab_contents_->GetURL()), |
| 231 &result_language); |
| 232 } |
| 233 |
| 234 // If we _still_ don't have a region, see if we ignored an untrusted region -- |
| 235 // e.g. "en-US" -- specified by the DOM. Only do this if our |
| 236 // current guess at the language matches what the DOM specified. |
| 237 if (result_region.empty() && (result_language.empty() || |
| 238 result_language == dom_locale.getLanguage())){ |
| 239 result_language = dom_locale.getLanguage(); |
| 240 result_region = dom_locale.getCountry(); |
| 241 } |
| 242 |
| 243 form->set_locale( |
| 244 LanguageAndRegionToLocaleCode(result_language, result_region)); |
| 245 } |
OLD | NEW |