Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(133)

Side by Side Diff: chrome/browser/autofill/autofill_locale_model.cc

Issue 3226001: Detecting form locale (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Unit test for top websites Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/autofill/autofill_locale_model.h"
6
7 #include "base/string_util.h"
8 #include "chrome/browser/autofill/form_structure.h"
9 #include "chrome/browser/tab_contents/language_state.h"
10 #include "chrome/browser/tab_contents/tab_contents.h"
11 #include "chrome/common/chrome_constants.h"
12 #include "net/base/registry_controlled_domain.h"
13 #include "third_party/icu/public/common/unicode/locid.h"
14 #include "third_party/icu/public/common/unicode/uloc.h"
15
16 namespace {
17
// One row of the static registry table: ties a known registry to the region
// and language(s) typically associated with it.
struct AutoFillRegistry {
  // The registry string, e.g. "co.uk".
  char const* const registry;
  // The region code paired with the registry, e.g. "GB".
  char const* const region;
  // One or more language codes separated by commas, e.g. "fr, en".
  char const* const languages;
};
24
25 // TODO(isherman): Expand this list.
26 const AutoFillRegistry kAutoFillRegistries[] = {
27 {"au", "AU", "en"},
28 {"br", "BR", "pt"},
29 {"ca", "CA", "fr, en"},
30 {"co.jp", "JP", "ja"},
31 {"co.il", "IL", "he"},
32 {"co.nz", "NZ", "en"},
33 {"co.uk", "GB", "en"},
34 {"com.au", "AU", "en"},
35 {"com.br", "BR", "pt"},
36 {"com.mx", "MX", "es"},
37 {"il", "IL", "he"},
38 {"it", "IT", "it"},
39 {"jp", "JP", "ja"},
40 {"fr", "FR", "fr"},
41 {"mx", "MX", "es"},
42 {"ru", "RU", "ru"},
43 };
44
45 // Returns the registry (roughly, top level domain) for |url|.
46 std::string ExtractRegistry(const GURL& url) {
47 const size_t registry_length =
48 net::RegistryControlledDomainService::GetRegistryLength(url, false);
49 if (registry_length == 0 || registry_length == std::string::npos)
50 return std::string();
51
52 const std::string host =
53 net::RegistryControlledDomainService::GetDomainAndRegistry(url);
54 DCHECK_LT(registry_length, host.size());
55
56 return StringToLowerASCII(host.substr(host.size() - registry_length));
57 }
58
59 // Converts |locale_id| into a canonical ICU |Locale|, with a couple of fixups:
60 // * An empty |locale_id| returns an empty result, rather than ICU's default
61 // canonicalization to "en-US".
62 // * Likewise, a |locale_id| with an unknown language or region returns an empty
63 // result, rather than ICU's default canonicalization to a bogus locale.
64 icu::Locale ConvertToICULocale(const std::string& locale_id) {
65 if (locale_id.empty())
66 return icu::Locale("");
67
68 scoped_array<char> sanitized_locale(new char[locale_id.size() + 1]);
69 size_t sanitized_length = 0;
70 for (size_t i = 0; i < locale_id.size(); ++i) {
71 // We aren't interested in locale keywords, so the only valid characters in
72 // the locale string are ASCII letters, hyphen, and underscore. See
73 // http://userguide.icu-project.org/locale#TOC-Canonicalization
74 if (isalpha(locale_id[i]) || locale_id[i] == '-' || locale_id[i] == '_') {
75 sanitized_locale.get()[sanitized_length++] = locale_id[i];
76 }
77 }
78 sanitized_locale.get()[sanitized_length] = '\0';
79
80 icu::Locale canonical_locale =
81 icu::Locale::createCanonical(sanitized_locale.get());
82
83 std::string language = canonical_locale.getLanguage();
84 if (!AutoFillLocaleModel::IsValidLanguageTag(language))
85 language = std::string();
86
87 std::string region = canonical_locale.getCountry();
88 if (!AutoFillLocaleModel::IsValidRegionTag(region))
89 region = std::string();
90
91 return icu::Locale(language.c_str(), region.c_str());
92 }
93
94 // Combines |language| and |region| into a single locale code.
95 // Returns "en-US" as a default if the |language| is empty.
96 std::string LanguageAndRegionToLocaleCode(const std::string& language,
97 const std::string& region) {
98 // We should never get a non-empty region with an empty language.
99 DCHECK(!language.empty() || region.empty());
100
101 if (language.empty())
102 return "en-US";
103
104 if (region.empty())
105 return language;
106
107 return (language + "-" + region);
108 }
109
110 } // namespace
111
112 AutoFillLocaleModel::AutoFillLocaleModel(const TabContents* tab_contents)
113 : tab_contents_(tab_contents),
114 tab_language_determined_(false) {
115 // Populate the registry map.
116 for (size_t i = 0; i < arraysize(kAutoFillRegistries); ++i) {
117 AutoFillRegistryInfo info;
118 info.region = kAutoFillRegistries[i].region;
119 SplitString(kAutoFillRegistries[i].languages, ',', &info.languages);
120
121 registries_[kAutoFillRegistries[i].registry] = info;
122 }
123 }
124
125 AutoFillLocaleModel::~AutoFillLocaleModel() {
126 }
127
128 // static
129 bool AutoFillLocaleModel::IsValidLanguageTag(const std::string& language_tag) {
130 // Special case: "und" is an officially registered language tag for
131 // "Undetermined".
132 if (language_tag == chrome::kUnknownLanguageCode)
133 return false;
134
135 // |languages| is a NULL-terminated array of C-strings
136 for (const char* const* languages = uloc_getISOLanguages(); languages[0];
137 ++languages) {
138 if (languages[0] == language_tag)
139 return true;
140 }
141 return false;
142 }
143
144 // static
145 bool AutoFillLocaleModel::IsValidRegionTag(const std::string& region_tag) {
146 // |regions| is a NULL-terminated array of C-strings
147 for (const char* const* regions = uloc_getISOCountries(); regions[0];
148 ++regions) {
149 if (regions[0] == region_tag)
150 return true;
151 }
152 return false;
153 }
154
155 // If |guessed_language| is empty or the languages associated with |registry|
156 // include |guessed_language|, returns the region associated with |registry|.
157 // Otherwise, returns an empty string.
158 std::string AutoFillLocaleModel::RegionFromRegistry(
159 const std::string& registry,
160 std::string* guessed_language) const {
161 AutoFillRegistryMap::const_iterator result = registries_.find(registry);
162 if (result == registries_.end())
163 return std::string();
164
165 const AutoFillRegistryInfo& info = result->second;
166 for (size_t i = 0; i < info.languages.size(); ++i) {
167 // For each language spoken in the region, check to see if our current guess
168 // at the language matches. Since we accept the first match, the order in
169 // which the langauges are listed matters. If we previously had no guess
170 // for the language, update the guess.
171 if (guessed_language->empty()) {
172 *guessed_language = info.languages[i];
173 return info.region;
174 } else if (*guessed_language == info.languages[i]) {
175 return info.region;
176 }
177 }
178
179 return std::string();
180 }
181
182 // TODO(isherman): We use the page's original language to detect the form's
183 // language. We might want to also use the page's current language (after
184 // translation), e.g. to help us figure out the address's language.
185 void AutoFillLocaleModel::UpdateLocale(FormStructure* form) const {
186 DCHECK(tab_language_determined_);
187
188 icu::Locale dom_locale = ConvertToICULocale(form->locale());
189 const std::string detected_locale_str = tab_contents_?
190 tab_contents_->language_state().original_language() : std::string();
191 icu::Locale detected_locale = ConvertToICULocale(detected_locale_str);
192
193 std::string result_language;
194 std::string result_region;
195
196 // If we got a non-empty locale from the DOM, use that. Moreover, we don't
197 // trust the locales "en" and "en-US" -- these can occur with international
198 // forms due to copy/pasted or otherwise careless HTML. We'll ignore them
199 // for now, but come back to them if we find no other useful signals.
200 if ((strlen(dom_locale.getLanguage()) != 0) &&
201 dom_locale != icu::Locale::getEnglish() &&
202 dom_locale != icu::Locale::getUS()) {
203 result_language = dom_locale.getLanguage();
204 result_region = dom_locale.getCountry();
205 }
206
207 // If we didn't get a language from the DOM, use the CLD-detected locale.
208 // If we did get a language but not a region form the DOM, try using the
209 // CLD-detected region.
210 if (result_language.empty()) {
211 result_language = detected_locale.getLanguage();
212 result_region = detected_locale.getCountry();
213 } else if (result_language == detected_locale.getLanguage() &&
214 result_region.empty()) {
215 result_region = detected_locale.getCountry();
216 }
217
218 // If we still don't have a region, see if we can guess it based on the url's
219 // registry. Try the form's source url first, then the target url, then the
220 // tab's registry (which might be different if the form is being framed).
221 if (result_region.empty()) {
222 result_region = RegionFromRegistry(ExtractRegistry(form->source_url()),
223 &result_language);
224 }
225 if (result_region.empty()) {
226 result_region = RegionFromRegistry(ExtractRegistry(form->target_url()),
227 &result_language);
228 }
229 if (result_region.empty() && tab_contents_) {
230 result_region = RegionFromRegistry(ExtractRegistry(tab_contents_->GetURL()),
231 &result_language);
232 }
233
234 // If we _still_ don't have a region, see if we ignored an untrusted region --
235 // e.g. "en-US" -- specified by the DOM. Only do this if our
236 // current guess at the language matches what the DOM specified.
237 if (result_region.empty() && (result_language.empty() ||
238 result_language == dom_locale.getLanguage())){
239 result_language = dom_locale.getLanguage();
240 result_region = dom_locale.getCountry();
241 }
242
243 form->set_locale(
244 LanguageAndRegionToLocaleCode(result_language, result_region));
245 }
OLDNEW
« no previous file with comments | « chrome/browser/autofill/autofill_locale_model.h ('k') | chrome/browser/autofill/autofill_locale_model_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698