Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(371)

Side by Side Diff: components/autofill/core/browser/autofill_profile_comparator.cc

Issue 2041413004: Add an AutofillProfileComparator class. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix gyp files. Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/autofill/core/browser/autofill_profile_comparator.h"
6
7 #include <algorithm>
8 #include <vector>
9
10 #include "base/i18n/char_iterator.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversion_utils.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "third_party/libphonenumber/phonenumber_api.h"
16
17 namespace autofill {
18 namespace {
19
20 const base::char16 kSpace[] = {L' ', L'\0'};
21
22 } // namespace
23
24 AutofillProfileComparator::AutofillProfileComparator() {
25 // Use ICU transliteration to remove diacritics and fold case.
26 // See http://userguide.icu-project.org/transforms/general
27 UErrorCode status = U_ZERO_ERROR;
28 std::unique_ptr<icu::Transliterator> transliterator(
29 icu::Transliterator::createInstance(
30 "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD,
31 status));
32 if (U_FAILURE(status) || transliterator == nullptr) {
33 // TODO(rogerm): Add a histogram to count how often this happens.
34 LOG(ERROR) << "Failed to create ICU Transliterator: "
35 << u_errorName(status);
36 }
37
38 transliterator_ = std::move(transliterator);
39 }
40
41 AutofillProfileComparator::~AutofillProfileComparator() {}
42
43 base::string16 AutofillProfileComparator::NormalizeForComparison(
44 base::StringPiece16 text,
45 AutofillProfileComparator::WhitespaceSpec whitespace_spec) const {
46 // This algorithm is not designed to be perfect, we could get arbitrarily
47 // fancy here trying to canonicalize address lines. Instead, this is designed
48 // to handle common cases for all types of data (addresses and names) without
49 // the need of domain-specific logic.
50 //
51 // 1. Convert punctuation to spaces and normalize all whitespace to spaces.
52 // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing
53 // space will be trimmed off outside of the end of the loop).
54 //
55 // 2. Collapse consecutive punctuation/whitespace characters to a single
56 // space. We pretend the string has already started with whitespace in
57 // order to trim leading spaces.
58 //
59 // 3. Remove diacritics (accents and other non-spacing marks) and perform
60 // case folding to lower-case.
61 base::string16 result;
62 result.reserve(text.length());
63 bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE);
64 for (base::i18n::UTF16CharIterator iter(text.data(), text.length());
65 !iter.end(); iter.Advance()) {
66 switch (u_charType(iter.get())) {
67 // Punctuation
68 case U_DASH_PUNCTUATION:
69 case U_START_PUNCTUATION:
70 case U_END_PUNCTUATION:
71 case U_CONNECTOR_PUNCTUATION:
72 case U_OTHER_PUNCTUATION:
73 // Whitespace
74 case U_CONTROL_CHAR: // To escape the '\n' character.
75 case U_SPACE_SEPARATOR:
76 case U_LINE_SEPARATOR:
77 case U_PARAGRAPH_SEPARATOR:
78 if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) {
79 result.push_back(' ');
80 previous_was_whitespace = true;
81 }
82 break;
83
84 default:
85 previous_was_whitespace = false;
86 base::WriteUnicodeCharacter(iter.get(), &result);
87 break;
88 }
89 }
90
91 // Trim off trailing whitespace if we left one.
92 if (previous_was_whitespace && !result.empty())
93 result.resize(result.size() - 1);
94
95 if (transliterator_ == nullptr)
96 return result;
97
98 icu::UnicodeString value = icu::UnicodeString(result.data(), result.length());
99 transliterator_->transliterate(value);
100 return base::string16(value.getBuffer(), value.length());
101 }
102
103 bool AutofillProfileComparator::IsMergeable(const AutofillProfile& p1,
104 const AutofillProfile& p2) const {
105 // Sorted in order to relative expense of the tests to fail early and cheaply
106 // if possible.
107 return HaveMergeableEmailAddresses(p1, p2) &&
108 HaveMergeableCompanyNames(p1, p2) &&
109 HaveMergeablePhoneNumbers(p1, p2) && HaveMergeableNames(p1, p2) &&
110 HaveMergeableAddresses(p1, p2);
111 }
112
113 std::set<base::string16> AutofillProfileComparator::GetNameVariants(
114 const base::string16& name) const {
115 const size_t kMaxSupportedNameParts = 8;
116
117 std::vector<base::string16> name_parts = base::SplitString(
118 name, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
119
120 // Limit the number of parts we support (to constrain memory usage);
121 if (name_parts.size() > kMaxSupportedNameParts)
122 return {name};
123
124 // Start with the empty string as a variant.
125 std::set<base::string16> variants = {base::EmptyString16()};
126
127 // For each name part, add a variant of all the already existing variants that
128 // appends this name part and one that appends the initial of this name part.
129 // Duplicates will be discarded when they're added to the variants set.
130 for (const base::string16& part : name_parts) {
131 if (part.empty())
132 continue;
133 std::vector<base::string16> new_variants;
134 for (const base::string16& variant : variants) {
135 new_variants.push_back(base::CollapseWhitespace(
136 base::JoinString({variant, part}, kSpace), true));
137 new_variants.push_back(base::CollapseWhitespace(
138 base::JoinString({variant, part.substr(0, 1)}, kSpace), true));
139 }
140 variants.insert(new_variants.begin(), new_variants.end());
141 }
142 return variants;
143 }
144
145 // static
146 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens(
147 base::StringPiece16 s) {
148 std::vector<base::StringPiece16> tokens = base::SplitStringPiece(
149 s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
150 return std::set<base::StringPiece16>(tokens.begin(), tokens.end());
151 }
152
153 // static
154 bool AutofillProfileComparator::HaveSameTokens(base::StringPiece16 s1,
155 base::StringPiece16 s2) {
156 std::set<base::StringPiece16> t1 = UniqueTokens(s1);
157 std::set<base::StringPiece16> t2 = UniqueTokens(s2);
158
159 // Note: std::include() expects the items in each range to be in sorted order,
160 // hence the use of std::set<> instead of std::unordered_set<>.
161 return std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()) ||
162 std::includes(t2.begin(), t2.end(), t1.begin(), t1.end());
163 }
164
165 bool AutofillProfileComparator::IsNameVariantOf(
166 const AutofillProfile& profile,
167 const base::string16& full_name) const {
168 // Build the variants of profile 2's first and middle name, as well as the
169 // last name or profile 2.
170 const std::set<base::string16> first_name_variants =
171 GetNameVariants(NormalizeForComparison(profile.GetRawInfo(NAME_FIRST)));
172 const std::set<base::string16> middle_name_variants =
173 GetNameVariants(NormalizeForComparison(profile.GetRawInfo(NAME_MIDDLE)));
174 const base::string16 last_name =
175 NormalizeForComparison(profile.GetRawInfo(NAME_LAST));
176
177 // Iterate over all full name variants of profile 2 and see if any of them
178 // match the full name from profile 1.
179 for (const base::string16& first_name : first_name_variants) {
180 for (const base::string16& middle_name : middle_name_variants) {
181 base::string16 candidate = base::CollapseWhitespace(
182 base::JoinString({first_name, middle_name, last_name}, kSpace), true);
183 if (candidate == full_name)
184 return true;
185 }
186 }
187
188 // There was not match found.
189 return false;
190 }
191
192 bool AutofillProfileComparator::HaveMergeableNames(
193 const AutofillProfile& p1,
194 const AutofillProfile& p2) const {
195 // Build the full name for profile 1.
196 base::string16 full_name_1 = NormalizeForComparison(
197 base::JoinString({p1.GetRawInfo(NAME_FIRST), p1.GetRawInfo(NAME_MIDDLE),
198 p1.GetRawInfo(NAME_LAST)},
199 kSpace));
200
201 // Build the full name for profile 2.
202 base::string16 full_name_2 = NormalizeForComparison(
203 base::JoinString({p2.GetRawInfo(NAME_FIRST), p2.GetRawInfo(NAME_MIDDLE),
204 p2.GetRawInfo(NAME_LAST)},
205 kSpace));
206
207 // Is it reasonable to merge the names from p1 and p2.
208 return full_name_1.empty() || full_name_2.empty() ||
209 (full_name_1 == full_name_2) || IsNameVariantOf(p2, full_name_1) ||
210 IsNameVariantOf(p1, full_name_2);
211 }
212
213 bool AutofillProfileComparator::HaveMergeableEmailAddresses(
214 const AutofillProfile& p1,
215 const AutofillProfile& p2) const {
216 const base::string16& email_1 = p1.GetRawInfo(EMAIL_ADDRESS);
217 const base::string16& email_2 = p2.GetRawInfo(EMAIL_ADDRESS);
218 return email_1.empty() || email_2.empty() ||
219 case_insensitive_compare_.StringsEqual(email_1, email_2);
220 }
221
222 bool AutofillProfileComparator::HaveMergeableCompanyNames(
223 const AutofillProfile& p1,
224 const AutofillProfile& p2) const {
225 const base::string16& company_name_1 =
226 NormalizeForComparison(p1.GetRawInfo(COMPANY_NAME));
227 const base::string16& company_name_2 =
228 NormalizeForComparison(p2.GetRawInfo(COMPANY_NAME));
229 return company_name_1.empty() || company_name_2.empty() ||
230 HaveSameTokens(company_name_1, company_name_2);
231 }
232
233 bool AutofillProfileComparator::HaveMergeablePhoneNumbers(
234 const AutofillProfile& p1,
235 const AutofillProfile& p2) const {
236 using ::i18n::phonenumbers::PhoneNumberUtil;
237
238 // Are the two phone numbers trivially mergeable?
239 const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);
240 const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);
241 if (raw_phone_1.empty() || raw_phone_2.empty() ||
242 raw_phone_1 == raw_phone_2) {
243 return true;
244 }
245
246 // The phone numbers util library needs the numbers in utf8.
247 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1);
248 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2);
249
250 // Parse and compare the phone numbers.
251 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();
252 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) {
253 case PhoneNumberUtil::INVALID_NUMBER:
254 case PhoneNumberUtil::NO_MATCH:
255 return false;
256 case PhoneNumberUtil::SHORT_NSN_MATCH:
257 case PhoneNumberUtil::NSN_MATCH:
258 case PhoneNumberUtil::EXACT_MATCH:
259 return true;
260 }
261
262 NOTREACHED();
263 return false;
264 }
265
266 bool AutofillProfileComparator::HaveMergeableAddresses(
267 const AutofillProfile& p1,
268 const AutofillProfile& p2) const {
269 // If the address are not in the same country, then they're not the same. If
270 // one of the address countries is unknown/invalid the comparison continues.
271 const base::string16& country1 =
272 p1.GetInfo(AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), "en-US");
273 const base::string16& country2 =
274 p2.GetInfo(AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), "en-US");
275 if (!country1.empty() && !country2.empty() &&
276 !case_insensitive_compare_.StringsEqual(country1, country2)) {
277 LOG(ERROR) << country1 << "!=" << country2;
278 return false;
279 }
280
281 // TODO(rogerm): Lookup the normalization rules for the country of the
282 // address.
283
284 // Zip
285 // ----
286 // If the addresses are definitely not in the same zip/area code then we're
287 // done. Otherwise,the comparison continues.
288 const base::string16& zip1 = NormalizeForComparison(
289 p1.GetRawInfo(ADDRESS_HOME_ZIP), DISCARD_WHITESPACE);
290 const base::string16& zip2 = NormalizeForComparison(
291 p2.GetRawInfo(ADDRESS_HOME_ZIP), DISCARD_WHITESPACE);
292 if (!zip1.empty() && !zip2.empty() &&
293 zip1.find(zip2) == base::string16::npos &&
294 zip2.find(zip1) == base::string16::npos) {
295 return false;
296 }
297
298 // State
299 // ------
300 // Heuristic: If the match is between non-empty zip codes then we can infer
301 // that the two state strings are intended to have the same meaning. This
302 // handles the cases where we have invalid or poorly formed data in
303 // one of the state values (like "Select one", or "CA - California").
304 // Otherwise, we actually have to check if the states map to the the same set
305 // of
306 // tokens.
307 const base::string16& state1 =
308 NormalizeForComparison(p1.GetRawInfo(ADDRESS_HOME_STATE));
309 const base::string16& state2 =
310 NormalizeForComparison(p2.GetRawInfo(ADDRESS_HOME_STATE));
311 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(state1, state2)) {
312 return false;
313 }
314
315 // City
316 // ------
317 // Heuristic: If the match is between non-empty zip codes then we can infer
318 // that the two city strings are intended to have the same meaning. This
319 // handles the cases where we have a city vs one of its suburbs.
320 // Otherwise, we actually have to check if the cities map to the the same set
321 // of
322 // tokens.
323 const base::string16& city1 =
324 NormalizeForComparison(p1.GetRawInfo(ADDRESS_HOME_CITY));
325 const base::string16& city2 =
326 NormalizeForComparison(p2.GetRawInfo(ADDRESS_HOME_CITY));
327 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(city1, city2)) {
328 return false;
329 }
330
331 // Address
332 const base::string16& address1 = NormalizeForComparison(base::JoinString(
333 {p1.GetRawInfo(ADDRESS_HOME_LINE1), p1.GetRawInfo(ADDRESS_HOME_LINE2)},
334 kSpace));
335 const base::string16& address2 = NormalizeForComparison(base::JoinString(
336 {p2.GetRawInfo(ADDRESS_HOME_LINE1), p2.GetRawInfo(ADDRESS_HOME_LINE2)},
337 kSpace));
338 if (!HaveSameTokens(address1, address2)) {
339 return false;
340 }
341
342 return true;
343 }
344
345 } // namespace autofill
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698