Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(286)

Side by Side Diff: components/autofill/core/browser/autofill_profile_comparator.cc

Issue 2041413004: Add an AutofillProfileComparator class. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Rebase Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/autofill/core/browser/autofill_profile_comparator.h"
6
7 #include <algorithm>
8 #include <vector>
9
10 #include "base/i18n/char_iterator.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversion_utils.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "components/autofill/core/browser/autofill_data_util.h"
16 #include "third_party/libphonenumber/phonenumber_api.h"
17
18 namespace autofill {
19 namespace {
20
// Delimiter (a single space, NUL-terminated) used in this file to split
// strings into tokens and to join name parts back together.
21 const base::char16 kSpace[] = {L' ', L'\0'};
22
23 } // namespace
24
25 AutofillProfileComparator::AutofillProfileComparator(
26 const base::StringPiece& app_locale)
27 : app_locale_(app_locale.data(), app_locale.size()) {
28 // Use ICU transliteration to remove diacritics and fold case.
29 // See http://userguide.icu-project.org/transforms/general
30 UErrorCode status = U_ZERO_ERROR;
31 std::unique_ptr<icu::Transliterator> transliterator(
32 icu::Transliterator::createInstance(
33 "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD,
34 status));
35 if (U_FAILURE(status) || transliterator == nullptr) {
36 // TODO(rogerm): Add a histogram to count how often this happens.
37 LOG(ERROR) << "Failed to create ICU Transliterator: "
38 << u_errorName(status);
39 }
40
41 transliterator_ = std::move(transliterator);
42 }
43
44 AutofillProfileComparator::~AutofillProfileComparator() {}
45
46 base::string16 AutofillProfileComparator::NormalizeForComparison(
47 base::StringPiece16 text,
48 AutofillProfileComparator::WhitespaceSpec whitespace_spec) const {
49 // This algorithm is not designed to be perfect, we could get arbitrarily
50 // fancy here trying to canonicalize address lines. Instead, this is designed
51 // to handle common cases for all types of data (addresses and names) without
52 // the need of domain-specific logic.
53 //
54 // 1. Convert punctuation to spaces and normalize all whitespace to spaces.
55 // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing
56 // space will be trimmed off outside of the end of the loop).
57 //
58 // 2. Collapse consecutive punctuation/whitespace characters to a single
59 // space. We pretend the string has already started with whitespace in
60 // order to trim leading spaces.
61 //
62 // 3. Remove diacritics (accents and other non-spacing marks) and perform
63 // case folding to lower-case.
64 base::string16 result;
65 result.reserve(text.length());
66 bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE);
67 for (base::i18n::UTF16CharIterator iter(text.data(), text.length());
68 !iter.end(); iter.Advance()) {
69 switch (u_charType(iter.get())) {
70 // Punctuation
71 case U_DASH_PUNCTUATION:
72 case U_START_PUNCTUATION:
73 case U_END_PUNCTUATION:
74 case U_CONNECTOR_PUNCTUATION:
75 case U_OTHER_PUNCTUATION:
76 // Whitespace
77 case U_CONTROL_CHAR: // To escape the '\n' character.
78 case U_SPACE_SEPARATOR:
79 case U_LINE_SEPARATOR:
80 case U_PARAGRAPH_SEPARATOR:
81 if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) {
82 result.push_back(' ');
83 previous_was_whitespace = true;
84 }
85 break;
86
87 default:
88 previous_was_whitespace = false;
89 base::WriteUnicodeCharacter(iter.get(), &result);
90 break;
91 }
92 }
93
94 // Trim off trailing whitespace if we left one.
95 if (previous_was_whitespace && !result.empty())
96 result.resize(result.size() - 1);
97
98 if (transliterator_ == nullptr)
99 return result;
100
101 icu::UnicodeString value = icu::UnicodeString(result.data(), result.length());
102 transliterator_->transliterate(value);
103 return base::string16(value.getBuffer(), value.length());
104 }
105
106 bool AutofillProfileComparator::AreMergeable(const AutofillProfile& p1,
107 const AutofillProfile& p2) const {
108 // Sorted in order to relative expense of the tests to fail early and cheaply
109 // if possible.
110 return HaveMergeableEmailAddresses(p1, p2) &&
111 HaveMergeableCompanyNames(p1, p2) &&
112 HaveMergeablePhoneNumbers(p1, p2) && HaveMergeableNames(p1, p2) &&
113 HaveMergeableAddresses(p1, p2);
114 }
115
116 // static
117 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens(
118 base::StringPiece16 s) {
119 std::vector<base::StringPiece16> tokens = base::SplitStringPiece(
120 s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
121 return std::set<base::StringPiece16>(tokens.begin(), tokens.end());
122 }
123
124 // static
125 bool AutofillProfileComparator::HaveSameTokens(base::StringPiece16 s1,
126 base::StringPiece16 s2) {
127 std::set<base::StringPiece16> t1 = UniqueTokens(s1);
128 std::set<base::StringPiece16> t2 = UniqueTokens(s2);
129
130 // Note: std::include() expects the items in each range to be in sorted order,
131 // hence the use of std::set<> instead of std::unordered_set<>.
132 return std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()) ||
133 std::includes(t2.begin(), t2.end(), t1.begin(), t1.end());
134 }
135
136 // static
137 std::set<base::string16> AutofillProfileComparator::GetNamePartVariants(
138 const base::string16& name_part) {
139 const size_t kMaxSupportedSubNames = 8;
140
141 std::vector<base::string16> sub_names = base::SplitString(
142 name_part, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
143
144 // Limit the number of sub-names we support (to constrain memory usage);
145 if (sub_names.size() > kMaxSupportedSubNames)
146 return {name_part};
147
148 // Start with the empty string as a variant.
149 std::set<base::string16> variants = {base::EmptyString16()};
150
151 // For each sub-name, add a variant of all the already existing variants that
152 // appends this sub-name and one that appends the initial of this sub-name.
153 // Duplicates will be discarded when they're added to the variants set.
154 for (const base::string16& sub_name : sub_names) {
155 if (sub_name.empty()) continue;
156 std::vector<base::string16> new_variants;
157 for (const base::string16& variant : variants) {
158 new_variants.push_back(base::CollapseWhitespace(
159 base::JoinString({variant, sub_name}, kSpace), true));
160 new_variants.push_back(base::CollapseWhitespace(
161 base::JoinString({variant, sub_name.substr(0, 1)}, kSpace), true));
162 }
163 variants.insert(new_variants.begin(), new_variants.end());
164 }
165
166 // As a common case, also add the variant that just concatenates all of the
167 // initials.
168 base::string16 initials;
169 for (const base::string16& sub_name : sub_names) {
170 if (sub_name.empty()) continue;
171 initials.push_back(sub_name[0]);
172 }
173 variants.insert(initials);
174
175 // And, we're done.
176 return variants;
177 }
178
179 bool AutofillProfileComparator::IsNameVariantOf(
180 const base::string16& full_name_1,
181 const base::string16& full_name_2) const {
182 data_util::NameParts name_1_parts = data_util::SplitName(full_name_1);
183
184 // Build the variants of full_name_1`s given, middle and family names.
185 //
186 // TODO(rogerm): Figure out whether or not we should break apart a compound
187 // family name into variants (crbug/619051)
188 const std::set<base::string16> given_name_variants =
189 GetNamePartVariants(name_1_parts.given);
190 const std::set<base::string16> middle_name_variants =
191 GetNamePartVariants(name_1_parts.middle);
192 const base::string16& family_name = name_1_parts.family;
193
194 // Iterate over all full name variants of profile 2 and see if any of them
195 // match the full name from profile 1.
196 for (const base::string16& given_name : given_name_variants) {
197 for (const base::string16& middle_name : middle_name_variants) {
198 base::string16 candidate = base::CollapseWhitespace(
199 base::JoinString({given_name, middle_name, family_name}, kSpace),
200 true);
201 if (candidate == full_name_2)
202 return true;
203 }
204 }
205
206 // Also check if the name is just composed of the user's initials. For
207 // example, "thomas jefferson miller" could be composed as "tj miller".
208 if (!name_1_parts.given.empty() && !name_1_parts.middle.empty()) {
209 base::string16 initials;
210 initials.push_back(name_1_parts.given[0]);
211 initials.push_back(name_1_parts.middle[0]);
212 base::string16 candidate = base::CollapseWhitespace(
213 base::JoinString({initials, family_name}, kSpace), true);
214 if (candidate == full_name_2)
215 return true;
216 }
217
218 // There was no match found.
219 return false;
220 }
221
222 bool AutofillProfileComparator::HaveMergeableNames(
223 const AutofillProfile& p1,
224 const AutofillProfile& p2) const {
225 base::string16 full_name_1 =
226 NormalizeForComparison(p1.GetInfo(AutofillType(NAME_FULL), app_locale_));
227 base::string16 full_name_2 =
228 NormalizeForComparison(p2.GetInfo(AutofillType(NAME_FULL), app_locale_));
229
230 // Is it reasonable to merge the names from p1 and p2.
231 return full_name_1.empty() || full_name_2.empty() ||
232 (full_name_1 == full_name_2) ||
233 IsNameVariantOf(full_name_1, full_name_2) ||
234 IsNameVariantOf(full_name_2, full_name_1);
235 }
236
237 bool AutofillProfileComparator::HaveMergeableEmailAddresses(
238 const AutofillProfile& p1,
239 const AutofillProfile& p2) const {
240 const base::string16& email_1 =
241 p1.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_);
242 const base::string16& email_2 =
243 p2.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_);
244 return email_1.empty() || email_2.empty() ||
245 case_insensitive_compare_.StringsEqual(email_1, email_2);
246 }
247
248 bool AutofillProfileComparator::HaveMergeableCompanyNames(
249 const AutofillProfile& p1,
250 const AutofillProfile& p2) const {
251 const base::string16& company_name_1 = NormalizeForComparison(
252 p1.GetInfo(AutofillType(COMPANY_NAME), app_locale_));
253 const base::string16& company_name_2 = NormalizeForComparison(
254 p2.GetInfo(AutofillType(COMPANY_NAME), app_locale_));
255 return company_name_1.empty() || company_name_2.empty() ||
256 HaveSameTokens(company_name_1, company_name_2);
257 }
258
259 bool AutofillProfileComparator::HaveMergeablePhoneNumbers(
260 const AutofillProfile& p1,
261 const AutofillProfile& p2) const {
262 // We work with the raw phone numbers to avoid losing any helpful information
263 // as we parse.
264 const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);
265 const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);
266
267 // Are the two phone numbers trivially mergeable?
268 if (raw_phone_1.empty() || raw_phone_2.empty() ||
269 raw_phone_1 == raw_phone_2) {
270 return true;
271 }
272
273 // TODO(rogerm): Modify ::autofill::i18n::PhoneNumbersMatch to support
274 // SHORT_NSN_MATCH and just call that instead of accessing the underlying
275 // utility library directly?
276
277 // The phone number util library needs the numbers in utf8.
278 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1);
279 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2);
280
281 // Parse and compare the phone numbers.
282 using ::i18n::phonenumbers::PhoneNumberUtil;
283 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();
284 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) {
285 case PhoneNumberUtil::INVALID_NUMBER:
286 case PhoneNumberUtil::NO_MATCH:
287 return false;
288 case PhoneNumberUtil::SHORT_NSN_MATCH:
289 case PhoneNumberUtil::NSN_MATCH:
290 case PhoneNumberUtil::EXACT_MATCH:
291 return true;
292 }
293
294 NOTREACHED();
295 return false;
296 }
297
298 bool AutofillProfileComparator::HaveMergeableAddresses(
299 const AutofillProfile& p1,
300 const AutofillProfile& p2) const {
301 // If the address are not in the same country, then they're not the same. If
302 // one of the address countries is unknown/invalid the comparison continues.
303 const base::string16& country1 = p1.GetInfo(
304 AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_);
305 const base::string16& country2 = p2.GetInfo(
306 AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_);
307 if (!country1.empty() && !country2.empty() &&
308 !case_insensitive_compare_.StringsEqual(country1, country2)) {
309 return false;
310 }
311
312 // TODO(rogerm): Lookup the normalization rules for the (common) country of
313 // the address. The rules should be applied post NormalizeForComparison to
314 // the state, city, and address bag of words comparisons.
315
316 // Zip
317 // ----
318 // If the addresses are definitely not in the same zip/area code then we're
319 // done. Otherwise,the comparison continues.
320 const base::string16& zip1 = NormalizeForComparison(
321 p1.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_),
322 DISCARD_WHITESPACE);
323 const base::string16& zip2 = NormalizeForComparison(
324 p2.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_),
325 DISCARD_WHITESPACE);
326 if (!zip1.empty() && !zip2.empty() &&
327 zip1.find(zip2) == base::string16::npos &&
328 zip2.find(zip1) == base::string16::npos) {
329 return false;
330 }
331
332 // State
333 // ------
334 // Heuristic: If the match is between non-empty zip codes then we can infer
335 // that the two state strings are intended to have the same meaning. This
336 // handles the cases where we have invalid or poorly formed data in one of the
337 // state values (like "Select one", or "CA - California"). Otherwise, we
338 // actually have to check if the states map to the the same set of tokens.
339 const base::string16& state1 = NormalizeForComparison(
340 p1.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_));
341 const base::string16& state2 = NormalizeForComparison(
342 p2.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_));
343 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(state1, state2)) {
344 return false;
345 }
346
347 // City
348 // ------
349 // Heuristic: If the match is between non-empty zip codes then we can infer
350 // that the two city strings are intended to have the same meaning. This
351 // handles the cases where we have a city vs one of its suburbs. Otherwise, we
352 // actually have to check if the cities map to the the same set of tokens.
353 const base::string16& city1 = NormalizeForComparison(
354 p1.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_));
355 const base::string16& city2 = NormalizeForComparison(
356 p2.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_));
357 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(city1, city2)) {
358 return false;
359 }
360
361 // Address
362 // --------
363 // Heuristic: Use bag of words comparison on the post-normalized addresses.
364 const base::string16& address1 = NormalizeForComparison(
365 p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_));
366 const base::string16& address2 = NormalizeForComparison(
367 p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_));
368 if (!HaveSameTokens(address1, address2)) {
369 return false;
370 }
371
372 return true;
373 }
374
375 } // namespace autofill
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698