OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "components/autofill/core/browser/autofill_profile_comparator.h" | |
6 | |
7 #include <algorithm> | |
8 #include <vector> | |
9 | |
10 #include "base/i18n/char_iterator.h" | |
11 #include "base/strings/string_split.h" | |
12 #include "base/strings/string_util.h" | |
13 #include "base/strings/utf_string_conversion_utils.h" | |
14 #include "base/strings/utf_string_conversions.h" | |
15 #include "components/autofill/core/browser/autofill_data_util.h" | |
16 #include "third_party/libphonenumber/phonenumber_api.h" | |
17 | |
18 namespace autofill { | |
19 namespace { | |
20 | |
21 const base::char16 kSpace[] = {L' ', L'\0'}; | |
22 | |
23 } // namespace | |
24 | |
25 AutofillProfileComparator::AutofillProfileComparator( | |
26 const base::StringPiece& app_locale) | |
27 : app_locale_(app_locale.data(), app_locale.size()) { | |
28 // Use ICU transliteration to remove diacritics and fold case. | |
29 // See http://userguide.icu-project.org/transforms/general | |
30 UErrorCode status = U_ZERO_ERROR; | |
31 std::unique_ptr<icu::Transliterator> transliterator( | |
32 icu::Transliterator::createInstance( | |
33 "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD, | |
34 status)); | |
35 if (U_FAILURE(status) || transliterator == nullptr) { | |
36 // TODO(rogerm): Add a histogram to count how often this happens. | |
37 LOG(ERROR) << "Failed to create ICU Transliterator: " | |
38 << u_errorName(status); | |
39 } | |
40 | |
41 transliterator_ = std::move(transliterator); | |
42 } | |
43 | |
44 AutofillProfileComparator::~AutofillProfileComparator() {} | |
45 | |
46 base::string16 AutofillProfileComparator::NormalizeForComparison( | |
47 base::StringPiece16 text, | |
48 AutofillProfileComparator::WhitespaceSpec whitespace_spec) const { | |
49 // This algorithm is not designed to be perfect, we could get arbitrarily | |
50 // fancy here trying to canonicalize address lines. Instead, this is designed | |
51 // to handle common cases for all types of data (addresses and names) without | |
52 // the need of domain-specific logic. | |
53 // | |
54 // 1. Convert punctuation to spaces and normalize all whitespace to spaces. | |
55 // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing | |
56 // space will be trimmed off outside of the end of the loop). | |
57 // | |
58 // 2. Collapse consecutive punctuation/whitespace characters to a single | |
59 // space. We pretend the string has already started with whitespace in | |
60 // order to trim leading spaces. | |
61 // | |
62 // 3. Remove diacritics (accents and other non-spacing marks) and perform | |
63 // case folding to lower-case. | |
64 base::string16 result; | |
65 result.reserve(text.length()); | |
66 bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE); | |
67 for (base::i18n::UTF16CharIterator iter(text.data(), text.length()); | |
68 !iter.end(); iter.Advance()) { | |
69 switch (u_charType(iter.get())) { | |
70 // Punctuation | |
71 case U_DASH_PUNCTUATION: | |
72 case U_START_PUNCTUATION: | |
73 case U_END_PUNCTUATION: | |
74 case U_CONNECTOR_PUNCTUATION: | |
75 case U_OTHER_PUNCTUATION: | |
76 // Whitespace | |
77 case U_CONTROL_CHAR: // To escape the '\n' character. | |
78 case U_SPACE_SEPARATOR: | |
79 case U_LINE_SEPARATOR: | |
80 case U_PARAGRAPH_SEPARATOR: | |
81 if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) { | |
82 result.push_back(' '); | |
83 previous_was_whitespace = true; | |
84 } | |
85 break; | |
86 | |
87 default: | |
88 previous_was_whitespace = false; | |
89 base::WriteUnicodeCharacter(iter.get(), &result); | |
90 break; | |
91 } | |
92 } | |
93 | |
94 // Trim off trailing whitespace if we left one. | |
95 if (previous_was_whitespace && !result.empty()) | |
96 result.resize(result.size() - 1); | |
97 | |
98 if (transliterator_ == nullptr) | |
99 return result; | |
100 | |
101 icu::UnicodeString value = icu::UnicodeString(result.data(), result.length()); | |
102 transliterator_->transliterate(value); | |
103 return base::string16(value.getBuffer(), value.length()); | |
104 } | |
105 | |
106 bool AutofillProfileComparator::AreMergeable(const AutofillProfile& p1, | |
107 const AutofillProfile& p2) const { | |
108 // Sorted in order to relative expense of the tests to fail early and cheaply | |
109 // if possible. | |
110 return HaveMergeableEmailAddresses(p1, p2) && | |
111 HaveMergeableCompanyNames(p1, p2) && | |
112 HaveMergeablePhoneNumbers(p1, p2) && HaveMergeableNames(p1, p2) && | |
113 HaveMergeableAddresses(p1, p2); | |
114 } | |
115 | |
116 // static | |
117 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens( | |
118 base::StringPiece16 s) { | |
119 std::vector<base::StringPiece16> tokens = base::SplitStringPiece( | |
120 s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); | |
121 return std::set<base::StringPiece16>(tokens.begin(), tokens.end()); | |
122 } | |
123 | |
124 // static | |
125 bool AutofillProfileComparator::HaveSameTokens(base::StringPiece16 s1, | |
126 base::StringPiece16 s2) { | |
127 std::set<base::StringPiece16> t1 = UniqueTokens(s1); | |
128 std::set<base::StringPiece16> t2 = UniqueTokens(s2); | |
129 | |
130 // Note: std::include() expects the items in each range to be in sorted order, | |
131 // hence the use of std::set<> instead of std::unordered_set<>. | |
132 return std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()) || | |
133 std::includes(t2.begin(), t2.end(), t1.begin(), t1.end()); | |
134 } | |
135 | |
136 // static | |
137 std::set<base::string16> AutofillProfileComparator::GetNameVariants( | |
138 const base::string16& name) { | |
139 const size_t kMaxSupportedNameParts = 8; | |
140 | |
141 std::vector<base::string16> name_parts = base::SplitString( | |
142 name, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); | |
143 | |
144 // Limit the number of parts we support (to constrain memory usage); | |
145 if (name_parts.size() > kMaxSupportedNameParts) | |
146 return {name}; | |
147 | |
148 // Start with the empty string as a variant. | |
149 std::set<base::string16> variants = {base::EmptyString16()}; | |
150 | |
151 // For each name part, add a variant of all the already existing variants that | |
152 // appends this name part and one that appends the initial of this name part. | |
153 // Duplicates will be discarded when they're added to the variants set. | |
154 for (const base::string16& part : name_parts) { | |
155 if (part.empty()) | |
156 continue; | |
157 std::vector<base::string16> new_variants; | |
158 for (const base::string16& variant : variants) { | |
159 new_variants.push_back(base::CollapseWhitespace( | |
160 base::JoinString({variant, part}, kSpace), true)); | |
161 new_variants.push_back(base::CollapseWhitespace( | |
162 base::JoinString({variant, part.substr(0, 1)}, kSpace), true)); | |
163 } | |
164 variants.insert(new_variants.begin(), new_variants.end()); | |
165 } | |
166 return variants; | |
167 } | |
168 | |
169 bool AutofillProfileComparator::IsNameVariantOf( | |
170 const base::string16& full_name_1, | |
171 const base::string16& full_name_2) const { | |
172 data_util::NameParts name_1_parts = data_util::SplitName(full_name_1); | |
173 // Build the variants of full_name_1`s given and middle names, and hang on | |
174 // to its family name. | |
175 const std::set<base::string16> given_name_variants = | |
176 GetNameVariants(name_1_parts.given); | |
177 const std::set<base::string16> middle_name_variants = | |
178 GetNameVariants(name_1_parts.middle); | |
179 const base::string16& family_name = name_1_parts.family; | |
sebsg
2016/06/10 15:12:13
Hum, I'm really on the fence on whether I would wa
Roger McFarlane (Chromium)
2016/06/10 16:19:44
Done.
| |
180 | |
181 // Iterate over all full name variants of profile 2 and see if any of them | |
182 // match the full name from profile 1. | |
183 for (const base::string16& given_name : given_name_variants) { | |
184 for (const base::string16& middle_name : middle_name_variants) { | |
185 base::string16 candidate = base::CollapseWhitespace( | |
186 base::JoinString({given_name, middle_name, family_name}, kSpace), | |
187 true); | |
188 if (candidate == full_name_2) | |
189 return true; | |
190 } | |
191 } | |
192 | |
193 // There was not match found. | |
194 return false; | |
195 } | |
196 | |
197 bool AutofillProfileComparator::HaveMergeableNames( | |
198 const AutofillProfile& p1, | |
199 const AutofillProfile& p2) const { | |
200 base::string16 full_name_1 = | |
201 NormalizeForComparison(p1.GetInfo(AutofillType(NAME_FULL), app_locale_)); | |
202 base::string16 full_name_2 = | |
203 NormalizeForComparison(p2.GetInfo(AutofillType(NAME_FULL), app_locale_)); | |
204 | |
205 // Is it reasonable to merge the names from p1 and p2. | |
206 return full_name_1.empty() || full_name_2.empty() || | |
207 (full_name_1 == full_name_2) || | |
208 IsNameVariantOf(full_name_1, full_name_2) || | |
209 IsNameVariantOf(full_name_2, full_name_1); | |
210 } | |
211 | |
212 bool AutofillProfileComparator::HaveMergeableEmailAddresses( | |
213 const AutofillProfile& p1, | |
214 const AutofillProfile& p2) const { | |
215 const base::string16& email_1 = | |
216 p1.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_); | |
217 const base::string16& email_2 = | |
218 p2.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_); | |
219 return email_1.empty() || email_2.empty() || | |
220 case_insensitive_compare_.StringsEqual(email_1, email_2); | |
221 } | |
222 | |
223 bool AutofillProfileComparator::HaveMergeableCompanyNames( | |
224 const AutofillProfile& p1, | |
225 const AutofillProfile& p2) const { | |
226 const base::string16& company_name_1 = NormalizeForComparison( | |
227 p1.GetInfo(AutofillType(COMPANY_NAME), app_locale_)); | |
228 const base::string16& company_name_2 = NormalizeForComparison( | |
229 p2.GetInfo(AutofillType(COMPANY_NAME), app_locale_)); | |
230 return company_name_1.empty() || company_name_2.empty() || | |
231 HaveSameTokens(company_name_1, company_name_2); | |
232 } | |
233 | |
234 bool AutofillProfileComparator::HaveMergeablePhoneNumbers( | |
235 const AutofillProfile& p1, | |
236 const AutofillProfile& p2) const { | |
237 // We work with the raw phone numbers to avoid losing any helpful information | |
238 // as we parse. | |
239 const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER); | |
240 const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER); | |
241 | |
242 // Are the two phone numbers trivially mergeable? | |
243 if (raw_phone_1.empty() || raw_phone_2.empty() || | |
244 raw_phone_1 == raw_phone_2) { | |
245 return true; | |
246 } | |
247 | |
248 // TODO(rogerm): Modify ::autofill::i18n::PhoneNumbersMatch to support | |
249 // SHORT_NSN_MATCH and just call that instead of accessing the underlying | |
250 // utility library directly? | |
251 | |
252 // The phone number util library needs the numbers in utf8. | |
253 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1); | |
254 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2); | |
255 | |
256 // Parse and compare the phone numbers. | |
257 using ::i18n::phonenumbers::PhoneNumberUtil; | |
258 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance(); | |
259 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) { | |
260 case PhoneNumberUtil::INVALID_NUMBER: | |
261 case PhoneNumberUtil::NO_MATCH: | |
262 return false; | |
263 case PhoneNumberUtil::SHORT_NSN_MATCH: | |
264 case PhoneNumberUtil::NSN_MATCH: | |
265 case PhoneNumberUtil::EXACT_MATCH: | |
266 return true; | |
267 } | |
268 | |
269 NOTREACHED(); | |
270 return false; | |
271 } | |
272 | |
273 bool AutofillProfileComparator::HaveMergeableAddresses( | |
274 const AutofillProfile& p1, | |
275 const AutofillProfile& p2) const { | |
276 // If the address are not in the same country, then they're not the same. If | |
277 // one of the address countries is unknown/invalid the comparison continues. | |
278 const base::string16& country1 = p1.GetInfo( | |
279 AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_); | |
280 const base::string16& country2 = p2.GetInfo( | |
281 AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_); | |
282 if (!country1.empty() && !country2.empty() && | |
283 !case_insensitive_compare_.StringsEqual(country1, country2)) { | |
284 return false; | |
285 } | |
286 | |
287 // TODO(rogerm): Lookup the normalization rules for the (common) country of | |
288 // the address. The rules should be applied post NormalizeForComparison to | |
289 // the state, city, and address bag of words comparisons. | |
290 | |
291 // Zip | |
292 // ---- | |
293 // If the addresses are definitely not in the same zip/area code then we're | |
294 // done. Otherwise,the comparison continues. | |
295 const base::string16& zip1 = NormalizeForComparison( | |
296 p1.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_), | |
297 DISCARD_WHITESPACE); | |
298 const base::string16& zip2 = NormalizeForComparison( | |
299 p2.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_), | |
300 DISCARD_WHITESPACE); | |
301 if (!zip1.empty() && !zip2.empty() && | |
302 zip1.find(zip2) == base::string16::npos && | |
303 zip2.find(zip1) == base::string16::npos) { | |
304 return false; | |
305 } | |
306 | |
307 // State | |
308 // ------ | |
309 // Heuristic: If the match is between non-empty zip codes then we can infer | |
310 // that the two state strings are intended to have the same meaning. This | |
311 // handles the cases where we have invalid or poorly formed data in one of the | |
312 // state values (like "Select one", or "CA - California"). Otherwise, we | |
313 // actually have to check if the states map to the the same set of tokens. | |
314 const base::string16& state1 = NormalizeForComparison( | |
315 p1.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_)); | |
316 const base::string16& state2 = NormalizeForComparison( | |
317 p2.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_)); | |
318 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(state1, state2)) { | |
319 return false; | |
320 } | |
321 | |
322 // City | |
323 // ------ | |
324 // Heuristic: If the match is between non-empty zip codes then we can infer | |
325 // that the two city strings are intended to have the same meaning. This | |
326 // handles the cases where we have a city vs one of its suburbs. Otherwise, we | |
327 // actually have to check if the cities map to the the same set of tokens. | |
328 const base::string16& city1 = NormalizeForComparison( | |
329 p1.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_)); | |
330 const base::string16& city2 = NormalizeForComparison( | |
331 p2.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_)); | |
332 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(city1, city2)) { | |
333 return false; | |
334 } | |
335 | |
336 // Address | |
337 // -------- | |
338 // Heuristic: Use bag of words comparison on the post-normalized addresses. | |
339 const base::string16& address1 = NormalizeForComparison( | |
340 p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)); | |
341 const base::string16& address2 = NormalizeForComparison( | |
342 p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)); | |
343 if (!HaveSameTokens(address1, address2)) { | |
344 return false; | |
345 } | |
346 | |
347 return true; | |
348 } | |
349 | |
350 } // namespace autofill | |
OLD | NEW |