components/autofill/core/browser/autofill_profile_comparator.cc - Issue 2041413004: Add an AutofillProfileComparator class.

Side by Side Diff: components/autofill/core/browser/autofill_profile_comparator.cc

Issue 2041413004: Add an AutofillProfileComparator class. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rebase Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « components/autofill/core/browser/autofill_profile_comparator.h ('k') | components/autofill/core/browser/autofill_profile_comparator_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2016 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "components/autofill/core/browser/autofill_profile_comparator.h"

	6

	7 #include <algorithm>

	8 #include <vector>

	9

	10 #include "base/i18n/char_iterator.h"

	11 #include "base/strings/string_split.h"

	12 #include "base/strings/string_util.h"

	13 #include "base/strings/utf_string_conversion_utils.h"

	14 #include "base/strings/utf_string_conversions.h"

	15 #include "components/autofill/core/browser/autofill_data_util.h"

	16 #include "third_party/libphonenumber/phonenumber_api.h"

	17

	18 namespace autofill {

	19 namespace {

	20

	21 const base::char16 kSpace[] = {L' ', L'\0'};

	22

	23 } // namespace

	24

	25 AutofillProfileComparator::AutofillProfileComparator(

	26 const base::StringPiece& app_locale)

	27 : app_locale_(app_locale.data(), app_locale.size()) {

	28 // Use ICU transliteration to remove diacritics and fold case.

	29 // See http://userguide.icu-project.org/transforms/general

	30 UErrorCode status = U_ZERO_ERROR;

	31 std::unique_ptr<icu::Transliterator> transliterator(

	32 icu::Transliterator::createInstance(

	33 "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD,

	34 status));

	35 if (U_FAILURE(status) \|\| transliterator == nullptr) {

	36 // TODO(rogerm): Add a histogram to count how often this happens.

	37 LOG(ERROR) << "Failed to create ICU Transliterator: "

	38 << u_errorName(status);

	39 }

	40

	41 transliterator_ = std::move(transliterator);

	42 }

	43

	44 AutofillProfileComparator::~AutofillProfileComparator() {}

	45

	46 base::string16 AutofillProfileComparator::NormalizeForComparison(

	47 base::StringPiece16 text,

	48 AutofillProfileComparator::WhitespaceSpec whitespace_spec) const {

	49 // This algorithm is not designed to be perfect, we could get arbitrarily

	50 // fancy here trying to canonicalize address lines. Instead, this is designed

	51 // to handle common cases for all types of data (addresses and names) without

	52 // the need of domain-specific logic.

	53 //

	54 // 1. Convert punctuation to spaces and normalize all whitespace to spaces.

	55 // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing

	56 // space will be trimmed off outside of the end of the loop).

	57 //

	58 // 2. Collapse consecutive punctuation/whitespace characters to a single

	59 // space. We pretend the string has already started with whitespace in

	60 // order to trim leading spaces.

	61 //

	62 // 3. Remove diacritics (accents and other non-spacing marks) and perform

	63 // case folding to lower-case.

	64 base::string16 result;

	65 result.reserve(text.length());

	66 bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE);

	67 for (base::i18n::UTF16CharIterator iter(text.data(), text.length());

	68 !iter.end(); iter.Advance()) {

	69 switch (u_charType(iter.get())) {

	70 // Punctuation

	71 case U_DASH_PUNCTUATION:

	72 case U_START_PUNCTUATION:

	73 case U_END_PUNCTUATION:

	74 case U_CONNECTOR_PUNCTUATION:

	75 case U_OTHER_PUNCTUATION:

	76 // Whitespace

	77 case U_CONTROL_CHAR: // To escape the '\n' character.

	78 case U_SPACE_SEPARATOR:

	79 case U_LINE_SEPARATOR:

	80 case U_PARAGRAPH_SEPARATOR:

	81 if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) {

	82 result.push_back(' ');

	83 previous_was_whitespace = true;

	84 }

	85 break;

	86

	87 default:

	88 previous_was_whitespace = false;

	89 base::WriteUnicodeCharacter(iter.get(), &result);

	90 break;

	91 }

	92 }

	93

	94 // Trim off trailing whitespace if we left one.

	95 if (previous_was_whitespace && !result.empty())

	96 result.resize(result.size() - 1);

	97

	98 if (transliterator_ == nullptr)

	99 return result;

	100

	101 icu::UnicodeString value = icu::UnicodeString(result.data(), result.length());

	102 transliterator_->transliterate(value);

	103 return base::string16(value.getBuffer(), value.length());

	104 }

	105

	106 bool AutofillProfileComparator::AreMergeable(const AutofillProfile& p1,

	107 const AutofillProfile& p2) const {

	108 // Sorted in order to relative expense of the tests to fail early and cheaply

	109 // if possible.

	110 return HaveMergeableEmailAddresses(p1, p2) &&

	111 HaveMergeableCompanyNames(p1, p2) &&

	112 HaveMergeablePhoneNumbers(p1, p2) && HaveMergeableNames(p1, p2) &&

	113 HaveMergeableAddresses(p1, p2);

	114 }

	115

	116 // static

	117 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens(

	118 base::StringPiece16 s) {

	119 std::vector<base::StringPiece16> tokens = base::SplitStringPiece(

	120 s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

	121 return std::set<base::StringPiece16>(tokens.begin(), tokens.end());

	122 }

	123

	124 // static

	125 bool AutofillProfileComparator::HaveSameTokens(base::StringPiece16 s1,

	126 base::StringPiece16 s2) {

	127 std::set<base::StringPiece16> t1 = UniqueTokens(s1);

	128 std::set<base::StringPiece16> t2 = UniqueTokens(s2);

	129

	130 // Note: std::include() expects the items in each range to be in sorted order,

	131 // hence the use of std::set<> instead of std::unordered_set<>.

	132 return std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()) \|\|

	133 std::includes(t2.begin(), t2.end(), t1.begin(), t1.end());

	134 }

	135

	136 // static

	137 std::set<base::string16> AutofillProfileComparator::GetNamePartVariants(

	138 const base::string16& name_part) {

	139 const size_t kMaxSupportedSubNames = 8;

	140

	141 std::vector<base::string16> sub_names = base::SplitString(

	142 name_part, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

	143

	144 // Limit the number of sub-names we support (to constrain memory usage);

	145 if (sub_names.size() > kMaxSupportedSubNames)

	146 return {name_part};

	147

	148 // Start with the empty string as a variant.

	149 std::set<base::string16> variants = {base::EmptyString16()};

	150

	151 // For each sub-name, add a variant of all the already existing variants that

	152 // appends this sub-name and one that appends the initial of this sub-name.

	153 // Duplicates will be discarded when they're added to the variants set.

	154 for (const base::string16& sub_name : sub_names) {

	155 if (sub_name.empty()) continue;

	156 std::vector<base::string16> new_variants;

	157 for (const base::string16& variant : variants) {

	158 new_variants.push_back(base::CollapseWhitespace(

	159 base::JoinString({variant, sub_name}, kSpace), true));

	160 new_variants.push_back(base::CollapseWhitespace(

	161 base::JoinString({variant, sub_name.substr(0, 1)}, kSpace), true));

	162 }

	163 variants.insert(new_variants.begin(), new_variants.end());

	164 }

	165

	166 // As a common case, also add the variant that just concatenates all of the

	167 // initials.

	168 base::string16 initials;

	169 for (const base::string16& sub_name : sub_names) {

	170 if (sub_name.empty()) continue;

	171 initials.push_back(sub_name[0]);

	172 }

	173 variants.insert(initials);

	174

	175 // And, we're done.

	176 return variants;

	177 }

	178

	179 bool AutofillProfileComparator::IsNameVariantOf(

	180 const base::string16& full_name_1,

	181 const base::string16& full_name_2) const {

	182 data_util::NameParts name_1_parts = data_util::SplitName(full_name_1);

	183

	184 // Build the variants of full_name_1`s given, middle and family names.

	185 //

	186 // TODO(rogerm): Figure out whether or not we should break apart a compound

	187 // family name into variants (crbug/619051)

	188 const std::set<base::string16> given_name_variants =

	189 GetNamePartVariants(name_1_parts.given);

	190 const std::set<base::string16> middle_name_variants =

	191 GetNamePartVariants(name_1_parts.middle);

	192 const base::string16& family_name = name_1_parts.family;

	193

	194 // Iterate over all full name variants of profile 2 and see if any of them

	195 // match the full name from profile 1.

	196 for (const base::string16& given_name : given_name_variants) {

	197 for (const base::string16& middle_name : middle_name_variants) {

	198 base::string16 candidate = base::CollapseWhitespace(

	199 base::JoinString({given_name, middle_name, family_name}, kSpace),

	200 true);

	201 if (candidate == full_name_2)

	202 return true;

	203 }

	204 }

	205

	206 // Also check if the name is just composed of the user's initials. For

	207 // example, "thomas jefferson miller" could be composed as "tj miller".

	208 if (!name_1_parts.given.empty() && !name_1_parts.middle.empty()) {

	209 base::string16 initials;

	210 initials.push_back(name_1_parts.given[0]);

	211 initials.push_back(name_1_parts.middle[0]);

	212 base::string16 candidate = base::CollapseWhitespace(

	213 base::JoinString({initials, family_name}, kSpace), true);

	214 if (candidate == full_name_2)

	215 return true;

	216 }

	217

	218 // There was no match found.

	219 return false;

	220 }

	221

	222 bool AutofillProfileComparator::HaveMergeableNames(

	223 const AutofillProfile& p1,

	224 const AutofillProfile& p2) const {

	225 base::string16 full_name_1 =

	226 NormalizeForComparison(p1.GetInfo(AutofillType(NAME_FULL), app_locale_));

	227 base::string16 full_name_2 =

	228 NormalizeForComparison(p2.GetInfo(AutofillType(NAME_FULL), app_locale_));

	229

	230 // Is it reasonable to merge the names from p1 and p2.

	231 return full_name_1.empty() \|\| full_name_2.empty() \|\|

	232 (full_name_1 == full_name_2) \|\|

	233 IsNameVariantOf(full_name_1, full_name_2) \|\|

	234 IsNameVariantOf(full_name_2, full_name_1);

	235 }

	236

	237 bool AutofillProfileComparator::HaveMergeableEmailAddresses(

	238 const AutofillProfile& p1,

	239 const AutofillProfile& p2) const {

	240 const base::string16& email_1 =

	241 p1.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_);

	242 const base::string16& email_2 =

	243 p2.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_);

	244 return email_1.empty() \|\| email_2.empty() \|\|

	245 case_insensitive_compare_.StringsEqual(email_1, email_2);

	246 }

	247

	248 bool AutofillProfileComparator::HaveMergeableCompanyNames(

	249 const AutofillProfile& p1,

	250 const AutofillProfile& p2) const {

	251 const base::string16& company_name_1 = NormalizeForComparison(

	252 p1.GetInfo(AutofillType(COMPANY_NAME), app_locale_));

	253 const base::string16& company_name_2 = NormalizeForComparison(

	254 p2.GetInfo(AutofillType(COMPANY_NAME), app_locale_));

	255 return company_name_1.empty() \|\| company_name_2.empty() \|\|

	256 HaveSameTokens(company_name_1, company_name_2);

	257 }

	258

	259 bool AutofillProfileComparator::HaveMergeablePhoneNumbers(

	260 const AutofillProfile& p1,

	261 const AutofillProfile& p2) const {

	262 // We work with the raw phone numbers to avoid losing any helpful information

	263 // as we parse.

	264 const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);

	265 const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);

	266

	267 // Are the two phone numbers trivially mergeable?

	268 if (raw_phone_1.empty() \|\| raw_phone_2.empty() \|\|

	269 raw_phone_1 == raw_phone_2) {

	270 return true;

	271 }

	272

	273 // TODO(rogerm): Modify ::autofill::i18n::PhoneNumbersMatch to support

	274 // SHORT_NSN_MATCH and just call that instead of accessing the underlying

	275 // utility library directly?

	276

	277 // The phone number util library needs the numbers in utf8.

	278 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1);

	279 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2);

	280

	281 // Parse and compare the phone numbers.

	282 using ::i18n::phonenumbers::PhoneNumberUtil;

	283 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();

	284 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) {

	285 case PhoneNumberUtil::INVALID_NUMBER:

	286 case PhoneNumberUtil::NO_MATCH:

	287 return false;

	288 case PhoneNumberUtil::SHORT_NSN_MATCH:

	289 case PhoneNumberUtil::NSN_MATCH:

	290 case PhoneNumberUtil::EXACT_MATCH:

	291 return true;

	292 }

	293

	294 NOTREACHED();

	295 return false;

	296 }

	297

	298 bool AutofillProfileComparator::HaveMergeableAddresses(

	299 const AutofillProfile& p1,

	300 const AutofillProfile& p2) const {

	301 // If the address are not in the same country, then they're not the same. If

	302 // one of the address countries is unknown/invalid the comparison continues.

	303 const base::string16& country1 = p1.GetInfo(

	304 AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_);

	305 const base::string16& country2 = p2.GetInfo(

	306 AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_);

	307 if (!country1.empty() && !country2.empty() &&

	308 !case_insensitive_compare_.StringsEqual(country1, country2)) {

	309 return false;

	310 }

	311

	312 // TODO(rogerm): Lookup the normalization rules for the (common) country of

	313 // the address. The rules should be applied post NormalizeForComparison to

	314 // the state, city, and address bag of words comparisons.

	315

	316 // Zip

	317 // ----

	318 // If the addresses are definitely not in the same zip/area code then we're

	319 // done. Otherwise,the comparison continues.

	320 const base::string16& zip1 = NormalizeForComparison(

	321 p1.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_),

	322 DISCARD_WHITESPACE);

	323 const base::string16& zip2 = NormalizeForComparison(

	324 p2.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_),

	325 DISCARD_WHITESPACE);

	326 if (!zip1.empty() && !zip2.empty() &&

	327 zip1.find(zip2) == base::string16::npos &&

	328 zip2.find(zip1) == base::string16::npos) {

	329 return false;

	330 }

	331

	332 // State

	333 // ------

	334 // Heuristic: If the match is between non-empty zip codes then we can infer

	335 // that the two state strings are intended to have the same meaning. This

	336 // handles the cases where we have invalid or poorly formed data in one of the

	337 // state values (like "Select one", or "CA - California"). Otherwise, we

	338 // actually have to check if the states map to the the same set of tokens.

	339 const base::string16& state1 = NormalizeForComparison(

	340 p1.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_));

	341 const base::string16& state2 = NormalizeForComparison(

	342 p2.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_));

	343 if ((zip1.empty() \|\| zip2.empty()) && !HaveSameTokens(state1, state2)) {

	344 return false;

	345 }

	346

	347 // City

	348 // ------

	349 // Heuristic: If the match is between non-empty zip codes then we can infer

	350 // that the two city strings are intended to have the same meaning. This

	351 // handles the cases where we have a city vs one of its suburbs. Otherwise, we

	352 // actually have to check if the cities map to the the same set of tokens.

	353 const base::string16& city1 = NormalizeForComparison(

	354 p1.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_));

	355 const base::string16& city2 = NormalizeForComparison(

	356 p2.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_));

	357 if ((zip1.empty() \|\| zip2.empty()) && !HaveSameTokens(city1, city2)) {

	358 return false;

	359 }

	360

	361 // Address

	362 // --------

	363 // Heuristic: Use bag of words comparison on the post-normalized addresses.

	364 const base::string16& address1 = NormalizeForComparison(

	365 p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_));

	366 const base::string16& address2 = NormalizeForComparison(

	367 p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_));

	368 if (!HaveSameTokens(address1, address2)) {

	369 return false;

	370 }

	371

	372 return true;

	373 }

	374

	375 } // namespace autofill

OLD	NEW