components/autofill/core/browser/autofill_profile_comparator.cc - Issue 2041413004: Add an AutofillProfileComparator class.

Side by Side Diff: components/autofill/core/browser/autofill_profile_comparator.cc

Issue 2041413004: Add an AutofillProfileComparator class. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fix gyp files. Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« components/autofill/core/browser/autofill_profile_comparator.h ('K') | « components/autofill/core/browser/autofill_profile_comparator.h ('k') | components/autofill/core/browser/autofill_profile_comparator_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2016 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "components/autofill/core/browser/autofill_profile_comparator.h"

	6

	7 #include <algorithm>

	8 #include <vector>

	9

	10 #include "base/i18n/char_iterator.h"

	11 #include "base/strings/string_split.h"

	12 #include "base/strings/string_util.h"

	13 #include "base/strings/utf_string_conversion_utils.h"

	14 #include "base/strings/utf_string_conversions.h"

	15 #include "third_party/libphonenumber/phonenumber_api.h"

	16

	17 namespace autofill {

namespace {

// NUL-terminated, single-space UTF-16 string. The functions in this file pass
// it as the delimiter when splitting strings into tokens and as the separator
// when joining name/address parts back together.
const base::char16 kSpace[] = {L' ', L'\0'};

}  // namespace

	23

	24 AutofillProfileComparator::AutofillProfileComparator() {

	25 // Use ICU transliteration to remove diacritics and fold case.

	26 // See http://userguide.icu-project.org/transforms/general

	27 UErrorCode status = U_ZERO_ERROR;

	28 std::unique_ptr<icu::Transliterator> transliterator(

	29 icu::Transliterator::createInstance(

	30 "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD,

	31 status));

	32 if (U_FAILURE(status) \|\| transliterator == nullptr) {

	33 // TODO(rogerm): Add a histogram to count how often this happens.

	34 LOG(ERROR) << "Failed to create ICU Transliterator: "

	35 << u_errorName(status);

	36 }

	37

	38 transliterator_ = std::move(transliterator);

	39 }

	40

	41 AutofillProfileComparator::~AutofillProfileComparator() {}

	42

	43 base::string16 AutofillProfileComparator::NormalizeForComparison(

	44 base::StringPiece16 text,

	45 AutofillProfileComparator::WhitespaceSpec whitespace_spec) const {

	46 // This algorithm is not designed to be perfect, we could get arbitrarily

	47 // fancy here trying to canonicalize address lines. Instead, this is designed

	48 // to handle common cases for all types of data (addresses and names) without

	49 // the need of domain-specific logic.

	50 //

	51 // 1. Convert punctuation to spaces and normalize all whitespace to spaces.

	52 // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing

	53 // space will be trimmed off outside of the end of the loop).

	54 //

	55 // 2. Collapse consecutive punctuation/whitespace characters to a single

	56 // space. We pretend the string has already started with whitespace in

	57 // order to trim leading spaces.

	58 //

	59 // 3. Remove diacritics (accents and other non-spacing marks) and perform

	60 // case folding to lower-case.

	61 base::string16 result;

	62 result.reserve(text.length());

	63 bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE);

	64 for (base::i18n::UTF16CharIterator iter(text.data(), text.length());

	65 !iter.end(); iter.Advance()) {

	66 switch (u_charType(iter.get())) {

	67 // Punctuation

	68 case U_DASH_PUNCTUATION:

	69 case U_START_PUNCTUATION:

	70 case U_END_PUNCTUATION:

	71 case U_CONNECTOR_PUNCTUATION:

	72 case U_OTHER_PUNCTUATION:

	73 // Whitespace

	74 case U_CONTROL_CHAR: // To escape the '\n' character.

	75 case U_SPACE_SEPARATOR:

	76 case U_LINE_SEPARATOR:

	77 case U_PARAGRAPH_SEPARATOR:

	78 if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) {

	79 result.push_back(' ');

	80 previous_was_whitespace = true;

	81 }

	82 break;

	83

	84 default:

	85 previous_was_whitespace = false;

	86 base::WriteUnicodeCharacter(iter.get(), &result);

	87 break;

	88 }

	89 }

	90

	91 // Trim off trailing whitespace if we left one.

	92 if (previous_was_whitespace && !result.empty())

	93 result.resize(result.size() - 1);

	94

	95 if (transliterator_ == nullptr)

	96 return result;

	97

	98 icu::UnicodeString value = icu::UnicodeString(result.data(), result.length());

	99 transliterator_->transliterate(value);

	100 return base::string16(value.getBuffer(), value.length());

	101 }

	102

	103 bool AutofillProfileComparator::IsMergeable(const AutofillProfile& p1,

	104 const AutofillProfile& p2) const {

	105 // Sorted in order to relative expense of the tests to fail early and cheaply

	106 // if possible.

	107 return HaveMergeableEmailAddresses(p1, p2) &&

	108 HaveMergeableCompanyNames(p1, p2) &&

	109 HaveMergeablePhoneNumbers(p1, p2) && HaveMergeableNames(p1, p2) &&

	110 HaveMergeableAddresses(p1, p2);

	111 }

	112

	113 std::set<base::string16> AutofillProfileComparator::GetNameVariants(

	114 const base::string16& name) const {

	115 const size_t kMaxSupportedNameParts = 8;

	116

	117 std::vector<base::string16> name_parts = base::SplitString(

	118 name, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

	119

	120 // Limit the number of parts we support (to constrain memory usage);

	121 if (name_parts.size() > kMaxSupportedNameParts)

	122 return {name};

	123

	124 // Start with the empty string as a variant.

	125 std::set<base::string16> variants = {base::EmptyString16()};

	126

	127 // For each name part, add a variant of all the already existing variants that

	128 // appends this name part and one that appends the initial of this name part.

	129 // Duplicates will be discarded when they're added to the variants set.

	130 for (const base::string16& part : name_parts) {

	131 if (part.empty())

	132 continue;

	133 std::vector<base::string16> new_variants;

	134 for (const base::string16& variant : variants) {

	135 new_variants.push_back(base::CollapseWhitespace(

	136 base::JoinString({variant, part}, kSpace), true));

	137 new_variants.push_back(base::CollapseWhitespace(

	138 base::JoinString({variant, part.substr(0, 1)}, kSpace), true));

	139 }

	140 variants.insert(new_variants.begin(), new_variants.end());

	141 }

	142 return variants;

	143 }

	144

	145 // static

	146 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens(

	147 base::StringPiece16 s) {

	148 std::vector<base::StringPiece16> tokens = base::SplitStringPiece(

	149 s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

	150 return std::set<base::StringPiece16>(tokens.begin(), tokens.end());

	151 }

	152

	153 // static

	154 bool AutofillProfileComparator::HaveSameTokens(base::StringPiece16 s1,

	155 base::StringPiece16 s2) {

	156 std::set<base::StringPiece16> t1 = UniqueTokens(s1);

	157 std::set<base::StringPiece16> t2 = UniqueTokens(s2);

	158

	159 // Note: std::include() expects the items in each range to be in sorted order,

	160 // hence the use of std::set<> instead of std::unordered_set<>.

	161 return std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()) \|\|

	162 std::includes(t2.begin(), t2.end(), t1.begin(), t1.end());

	163 }

	164

	165 bool AutofillProfileComparator::IsNameVariantOf(

	166 const AutofillProfile& profile,

	167 const base::string16& full_name) const {

	168 // Build the variants of profile 2's first and middle name, as well as the

	169 // last name or profile 2.

	170 const std::set<base::string16> first_name_variants =

	171 GetNameVariants(NormalizeForComparison(profile.GetRawInfo(NAME_FIRST)));

	172 const std::set<base::string16> middle_name_variants =

	173 GetNameVariants(NormalizeForComparison(profile.GetRawInfo(NAME_MIDDLE)));

	174 const base::string16 last_name =

	175 NormalizeForComparison(profile.GetRawInfo(NAME_LAST));

	176

	177 // Iterate over all full name variants of profile 2 and see if any of them

	178 // match the full name from profile 1.

	179 for (const base::string16& first_name : first_name_variants) {

	180 for (const base::string16& middle_name : middle_name_variants) {

	181 base::string16 candidate = base::CollapseWhitespace(

	182 base::JoinString({first_name, middle_name, last_name}, kSpace), true);

	183 if (candidate == full_name)

	184 return true;

	185 }

	186 }

	187

	188 // There was not match found.

	189 return false;

	190 }

	191

	192 bool AutofillProfileComparator::HaveMergeableNames(

	193 const AutofillProfile& p1,

	194 const AutofillProfile& p2) const {

	195 // Build the full name for profile 1.

	196 base::string16 full_name_1 = NormalizeForComparison(

	197 base::JoinString({p1.GetRawInfo(NAME_FIRST), p1.GetRawInfo(NAME_MIDDLE),

	198 p1.GetRawInfo(NAME_LAST)},

	199 kSpace));

	200

	201 // Build the full name for profile 2.

	202 base::string16 full_name_2 = NormalizeForComparison(

	203 base::JoinString({p2.GetRawInfo(NAME_FIRST), p2.GetRawInfo(NAME_MIDDLE),

	204 p2.GetRawInfo(NAME_LAST)},

	205 kSpace));

	206

	207 // Is it reasonable to merge the names from p1 and p2.

	208 return full_name_1.empty() \|\| full_name_2.empty() \|\|

	209 (full_name_1 == full_name_2) \|\| IsNameVariantOf(p2, full_name_1) \|\|

	210 IsNameVariantOf(p1, full_name_2);

	211 }

	212

	213 bool AutofillProfileComparator::HaveMergeableEmailAddresses(

	214 const AutofillProfile& p1,

	215 const AutofillProfile& p2) const {

	216 const base::string16& email_1 = p1.GetRawInfo(EMAIL_ADDRESS);

	217 const base::string16& email_2 = p2.GetRawInfo(EMAIL_ADDRESS);

	218 return email_1.empty() \|\| email_2.empty() \|\|

	219 case_insensitive_compare_.StringsEqual(email_1, email_2);

	220 }

	221

	222 bool AutofillProfileComparator::HaveMergeableCompanyNames(

	223 const AutofillProfile& p1,

	224 const AutofillProfile& p2) const {

	225 const base::string16& company_name_1 =

	226 NormalizeForComparison(p1.GetRawInfo(COMPANY_NAME));

	227 const base::string16& company_name_2 =

	228 NormalizeForComparison(p2.GetRawInfo(COMPANY_NAME));

	229 return company_name_1.empty() \|\| company_name_2.empty() \|\|

	230 HaveSameTokens(company_name_1, company_name_2);

	231 }

	232

	233 bool AutofillProfileComparator::HaveMergeablePhoneNumbers(

	234 const AutofillProfile& p1,

	235 const AutofillProfile& p2) const {

	236 using ::i18n::phonenumbers::PhoneNumberUtil;

	237

	238 // Are the two phone numbers trivially mergeable?

	239 const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);

	240 const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);

	241 if (raw_phone_1.empty() \|\| raw_phone_2.empty() \|\|

	242 raw_phone_1 == raw_phone_2) {

	243 return true;

	244 }

	245

	246 // The phone numbers util library needs the numbers in utf8.

	247 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1);

	248 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2);

	249

	250 // Parse and compare the phone numbers.

	251 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();

	252 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) {

	253 case PhoneNumberUtil::INVALID_NUMBER:

	254 case PhoneNumberUtil::NO_MATCH:

	255 return false;

	256 case PhoneNumberUtil::SHORT_NSN_MATCH:

	257 case PhoneNumberUtil::NSN_MATCH:

	258 case PhoneNumberUtil::EXACT_MATCH:

	259 return true;

	260 }

	261

	262 NOTREACHED();

	263 return false;

	264 }

	265

	266 bool AutofillProfileComparator::HaveMergeableAddresses(

	267 const AutofillProfile& p1,

	268 const AutofillProfile& p2) const {

	269 // If the address are not in the same country, then they're not the same. If

	270 // one of the address countries is unknown/invalid the comparison continues.

	271 const base::string16& country1 =

	272 p1.GetInfo(AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), "en-US");

	273 const base::string16& country2 =

	274 p2.GetInfo(AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), "en-US");

	275 if (!country1.empty() && !country2.empty() &&

	276 !case_insensitive_compare_.StringsEqual(country1, country2)) {

	277 LOG(ERROR) << country1 << "!=" << country2;

	278 return false;

	279 }

	280

	281 // TODO(rogerm): Lookup the normalization rules for the country of the

	282 // address.

	283

	284 // Zip

	285 // ----

	286 // If the addresses are definitely not in the same zip/area code then we're

	287 // done. Otherwise,the comparison continues.

	288 const base::string16& zip1 = NormalizeForComparison(

	289 p1.GetRawInfo(ADDRESS_HOME_ZIP), DISCARD_WHITESPACE);

	290 const base::string16& zip2 = NormalizeForComparison(

	291 p2.GetRawInfo(ADDRESS_HOME_ZIP), DISCARD_WHITESPACE);

	292 if (!zip1.empty() && !zip2.empty() &&

	293 zip1.find(zip2) == base::string16::npos &&

	294 zip2.find(zip1) == base::string16::npos) {

	295 return false;

	296 }

	297

	298 // State

	299 // ------

	300 // Heuristic: If the match is between non-empty zip codes then we can infer

	301 // that the two state strings are intended to have the same meaning. This

	302 // handles the cases where we have invalid or poorly formed data in

	303 // one of the state values (like "Select one", or "CA - California").

	304 // Otherwise, we actually have to check if the states map to the the same set

	305 // of

	306 // tokens.

	307 const base::string16& state1 =

	308 NormalizeForComparison(p1.GetRawInfo(ADDRESS_HOME_STATE));

	309 const base::string16& state2 =

	310 NormalizeForComparison(p2.GetRawInfo(ADDRESS_HOME_STATE));

	311 if ((zip1.empty() \|\| zip2.empty()) && !HaveSameTokens(state1, state2)) {

	312 return false;

	313 }

	314

	315 // City

	316 // ------

	317 // Heuristic: If the match is between non-empty zip codes then we can infer

	318 // that the two city strings are intended to have the same meaning. This

	319 // handles the cases where we have a city vs one of its suburbs.

	320 // Otherwise, we actually have to check if the cities map to the the same set

	321 // of

	322 // tokens.

	323 const base::string16& city1 =

	324 NormalizeForComparison(p1.GetRawInfo(ADDRESS_HOME_CITY));

	325 const base::string16& city2 =

	326 NormalizeForComparison(p2.GetRawInfo(ADDRESS_HOME_CITY));

	327 if ((zip1.empty() \|\| zip2.empty()) && !HaveSameTokens(city1, city2)) {

	328 return false;

	329 }

	330

	331 // Address

	332 const base::string16& address1 = NormalizeForComparison(base::JoinString(

	333 {p1.GetRawInfo(ADDRESS_HOME_LINE1), p1.GetRawInfo(ADDRESS_HOME_LINE2)},

	334 kSpace));

	335 const base::string16& address2 = NormalizeForComparison(base::JoinString(

	336 {p2.GetRawInfo(ADDRESS_HOME_LINE1), p2.GetRawInfo(ADDRESS_HOME_LINE2)},

	337 kSpace));

	338 if (!HaveSameTokens(address1, address2)) {

	339 return false;

	340 }

	341

	342 return true;

	343 }

	344

	345 } // namespace autofill

OLD	NEW