Index: components/autofill/core/browser/autofill_profile_comparator.cc |
diff --git a/components/autofill/core/browser/autofill_profile_comparator.cc b/components/autofill/core/browser/autofill_profile_comparator.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..b01c60d27edeac67dc944308d9e3396ed4544a5c |
--- /dev/null |
+++ b/components/autofill/core/browser/autofill_profile_comparator.cc |
@@ -0,0 +1,350 @@ |
+// Copyright 2016 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "components/autofill/core/browser/autofill_profile_comparator.h" |
+ |
+#include <algorithm> |
+#include <vector> |
+ |
+#include "base/i18n/char_iterator.h" |
+#include "base/strings/string_split.h" |
+#include "base/strings/string_util.h" |
+#include "base/strings/utf_string_conversion_utils.h" |
+#include "base/strings/utf_string_conversions.h" |
+#include "components/autofill/core/browser/autofill_data_util.h" |
+#include "third_party/libphonenumber/phonenumber_api.h" |
+ |
+namespace autofill { |
+namespace { |
+ |
+const base::char16 kSpace[] = {L' ', L'\0'}; |
+ |
+} // namespace |
+ |
+AutofillProfileComparator::AutofillProfileComparator( |
+ const base::StringPiece& app_locale) |
+ : app_locale_(app_locale.data(), app_locale.size()) { |
+ // Use ICU transliteration to remove diacritics and fold case. |
+ // See http://userguide.icu-project.org/transforms/general |
+ UErrorCode status = U_ZERO_ERROR; |
+ std::unique_ptr<icu::Transliterator> transliterator( |
+ icu::Transliterator::createInstance( |
+ "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD, |
+ status)); |
+ if (U_FAILURE(status) || transliterator == nullptr) { |
+ // TODO(rogerm): Add a histogram to count how often this happens. |
+ LOG(ERROR) << "Failed to create ICU Transliterator: " |
+ << u_errorName(status); |
+ } |
+ |
+ transliterator_ = std::move(transliterator); |
+} |
+ |
+AutofillProfileComparator::~AutofillProfileComparator() {} |
+ |
+base::string16 AutofillProfileComparator::NormalizeForComparison( |
+ base::StringPiece16 text, |
+ AutofillProfileComparator::WhitespaceSpec whitespace_spec) const { |
+ // This algorithm is not designed to be perfect, we could get arbitrarily |
+ // fancy here trying to canonicalize address lines. Instead, this is designed |
+ // to handle common cases for all types of data (addresses and names) without |
+ // the need of domain-specific logic. |
+ // |
+ // 1. Convert punctuation to spaces and normalize all whitespace to spaces. |
+ // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing |
+ // space will be trimmed off outside of the end of the loop). |
+ // |
+ // 2. Collapse consecutive punctuation/whitespace characters to a single |
+ // space. We pretend the string has already started with whitespace in |
+ // order to trim leading spaces. |
+ // |
+ // 3. Remove diacritics (accents and other non-spacing marks) and perform |
+ // case folding to lower-case. |
+ base::string16 result; |
+ result.reserve(text.length()); |
+ bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE); |
+ for (base::i18n::UTF16CharIterator iter(text.data(), text.length()); |
+ !iter.end(); iter.Advance()) { |
+ switch (u_charType(iter.get())) { |
+ // Punctuation |
+ case U_DASH_PUNCTUATION: |
+ case U_START_PUNCTUATION: |
+ case U_END_PUNCTUATION: |
+ case U_CONNECTOR_PUNCTUATION: |
+ case U_OTHER_PUNCTUATION: |
+ // Whitespace |
+ case U_CONTROL_CHAR: // To escape the '\n' character. |
+ case U_SPACE_SEPARATOR: |
+ case U_LINE_SEPARATOR: |
+ case U_PARAGRAPH_SEPARATOR: |
+ if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) { |
+ result.push_back(' '); |
+ previous_was_whitespace = true; |
+ } |
+ break; |
+ |
+ default: |
+ previous_was_whitespace = false; |
+ base::WriteUnicodeCharacter(iter.get(), &result); |
+ break; |
+ } |
+ } |
+ |
+ // Trim off trailing whitespace if we left one. |
+ if (previous_was_whitespace && !result.empty()) |
+ result.resize(result.size() - 1); |
+ |
+ if (transliterator_ == nullptr) |
+ return result; |
+ |
+ icu::UnicodeString value = icu::UnicodeString(result.data(), result.length()); |
+ transliterator_->transliterate(value); |
+ return base::string16(value.getBuffer(), value.length()); |
+} |
+ |
+bool AutofillProfileComparator::AreMergeable(const AutofillProfile& p1, |
+ const AutofillProfile& p2) const { |
+ // Sorted in order to relative expense of the tests to fail early and cheaply |
+ // if possible. |
+ return HaveMergeableEmailAddresses(p1, p2) && |
+ HaveMergeableCompanyNames(p1, p2) && |
+ HaveMergeablePhoneNumbers(p1, p2) && HaveMergeableNames(p1, p2) && |
+ HaveMergeableAddresses(p1, p2); |
+} |
+ |
+// static |
+std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens( |
+ base::StringPiece16 s) { |
+ std::vector<base::StringPiece16> tokens = base::SplitStringPiece( |
+ s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); |
+ return std::set<base::StringPiece16>(tokens.begin(), tokens.end()); |
+} |
+ |
+// static |
+bool AutofillProfileComparator::HaveSameTokens(base::StringPiece16 s1, |
+ base::StringPiece16 s2) { |
+ std::set<base::StringPiece16> t1 = UniqueTokens(s1); |
+ std::set<base::StringPiece16> t2 = UniqueTokens(s2); |
+ |
+ // Note: std::include() expects the items in each range to be in sorted order, |
+ // hence the use of std::set<> instead of std::unordered_set<>. |
+ return std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()) || |
+ std::includes(t2.begin(), t2.end(), t1.begin(), t1.end()); |
+} |
+ |
+// static |
+std::set<base::string16> AutofillProfileComparator::GetNameVariants( |
+ const base::string16& name) { |
+ const size_t kMaxSupportedNameParts = 8; |
+ |
+ std::vector<base::string16> name_parts = base::SplitString( |
+ name, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); |
+ |
+ // Limit the number of parts we support (to constrain memory usage); |
+ if (name_parts.size() > kMaxSupportedNameParts) |
+ return {name}; |
+ |
+ // Start with the empty string as a variant. |
+ std::set<base::string16> variants = {base::EmptyString16()}; |
+ |
+ // For each name part, add a variant of all the already existing variants that |
+ // appends this name part and one that appends the initial of this name part. |
+ // Duplicates will be discarded when they're added to the variants set. |
+ for (const base::string16& part : name_parts) { |
+ if (part.empty()) |
+ continue; |
+ std::vector<base::string16> new_variants; |
+ for (const base::string16& variant : variants) { |
+ new_variants.push_back(base::CollapseWhitespace( |
+ base::JoinString({variant, part}, kSpace), true)); |
+ new_variants.push_back(base::CollapseWhitespace( |
+ base::JoinString({variant, part.substr(0, 1)}, kSpace), true)); |
+ } |
+ variants.insert(new_variants.begin(), new_variants.end()); |
+ } |
+ return variants; |
+} |
+ |
+bool AutofillProfileComparator::IsNameVariantOf( |
+ const base::string16& full_name_1, |
+ const base::string16& full_name_2) const { |
+ data_util::NameParts name_1_parts = data_util::SplitName(full_name_1); |
+ // Build the variants of full_name_1`s given and middle names, and hang on |
+ // to its family name. |
+ const std::set<base::string16> given_name_variants = |
+ GetNameVariants(name_1_parts.given); |
+ const std::set<base::string16> middle_name_variants = |
+ GetNameVariants(name_1_parts.middle); |
+ const base::string16& family_name = name_1_parts.family; |
sebsg
2016/06/10 15:12:13
Hum, I'm really on the fence on whether I would wa
Roger McFarlane (Chromium)
2016/06/10 16:19:44
Done.
|
+ |
+ // Iterate over all full name variants of profile 2 and see if any of them |
+ // match the full name from profile 1. |
+ for (const base::string16& given_name : given_name_variants) { |
+ for (const base::string16& middle_name : middle_name_variants) { |
+ base::string16 candidate = base::CollapseWhitespace( |
+ base::JoinString({given_name, middle_name, family_name}, kSpace), |
+ true); |
+ if (candidate == full_name_2) |
+ return true; |
+ } |
+ } |
+ |
+ // There was not match found. |
+ return false; |
+} |
+ |
+bool AutofillProfileComparator::HaveMergeableNames( |
+ const AutofillProfile& p1, |
+ const AutofillProfile& p2) const { |
+ base::string16 full_name_1 = |
+ NormalizeForComparison(p1.GetInfo(AutofillType(NAME_FULL), app_locale_)); |
+ base::string16 full_name_2 = |
+ NormalizeForComparison(p2.GetInfo(AutofillType(NAME_FULL), app_locale_)); |
+ |
+ // Is it reasonable to merge the names from p1 and p2. |
+ return full_name_1.empty() || full_name_2.empty() || |
+ (full_name_1 == full_name_2) || |
+ IsNameVariantOf(full_name_1, full_name_2) || |
+ IsNameVariantOf(full_name_2, full_name_1); |
+} |
+ |
+bool AutofillProfileComparator::HaveMergeableEmailAddresses( |
+ const AutofillProfile& p1, |
+ const AutofillProfile& p2) const { |
+ const base::string16& email_1 = |
+ p1.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_); |
+ const base::string16& email_2 = |
+ p2.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_); |
+ return email_1.empty() || email_2.empty() || |
+ case_insensitive_compare_.StringsEqual(email_1, email_2); |
+} |
+ |
+bool AutofillProfileComparator::HaveMergeableCompanyNames( |
+ const AutofillProfile& p1, |
+ const AutofillProfile& p2) const { |
+ const base::string16& company_name_1 = NormalizeForComparison( |
+ p1.GetInfo(AutofillType(COMPANY_NAME), app_locale_)); |
+ const base::string16& company_name_2 = NormalizeForComparison( |
+ p2.GetInfo(AutofillType(COMPANY_NAME), app_locale_)); |
+ return company_name_1.empty() || company_name_2.empty() || |
+ HaveSameTokens(company_name_1, company_name_2); |
+} |
+ |
+bool AutofillProfileComparator::HaveMergeablePhoneNumbers( |
+ const AutofillProfile& p1, |
+ const AutofillProfile& p2) const { |
+ // We work with the raw phone numbers to avoid losing any helpful information |
+ // as we parse. |
+ const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER); |
+ const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER); |
+ |
+ // Are the two phone numbers trivially mergeable? |
+ if (raw_phone_1.empty() || raw_phone_2.empty() || |
+ raw_phone_1 == raw_phone_2) { |
+ return true; |
+ } |
+ |
+ // TODO(rogerm): Modify ::autofill::i18n::PhoneNumbersMatch to support |
+ // SHORT_NSN_MATCH and just call that instead of accessing the underlying |
+ // utility library directly? |
+ |
+ // The phone number util library needs the numbers in utf8. |
+ const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1); |
+ const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2); |
+ |
+ // Parse and compare the phone numbers. |
+ using ::i18n::phonenumbers::PhoneNumberUtil; |
+ PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance(); |
+ switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) { |
+ case PhoneNumberUtil::INVALID_NUMBER: |
+ case PhoneNumberUtil::NO_MATCH: |
+ return false; |
+ case PhoneNumberUtil::SHORT_NSN_MATCH: |
+ case PhoneNumberUtil::NSN_MATCH: |
+ case PhoneNumberUtil::EXACT_MATCH: |
+ return true; |
+ } |
+ |
+ NOTREACHED(); |
+ return false; |
+} |
+ |
+bool AutofillProfileComparator::HaveMergeableAddresses( |
+ const AutofillProfile& p1, |
+ const AutofillProfile& p2) const { |
+ // If the address are not in the same country, then they're not the same. If |
+ // one of the address countries is unknown/invalid the comparison continues. |
+ const base::string16& country1 = p1.GetInfo( |
+ AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_); |
+ const base::string16& country2 = p2.GetInfo( |
+ AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), app_locale_); |
+ if (!country1.empty() && !country2.empty() && |
+ !case_insensitive_compare_.StringsEqual(country1, country2)) { |
+ return false; |
+ } |
+ |
+ // TODO(rogerm): Lookup the normalization rules for the (common) country of |
+ // the address. The rules should be applied post NormalizeForComparison to |
+ // the state, city, and address bag of words comparisons. |
+ |
+ // Zip |
+ // ---- |
+ // If the addresses are definitely not in the same zip/area code then we're |
+ // done. Otherwise,the comparison continues. |
+ const base::string16& zip1 = NormalizeForComparison( |
+ p1.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_), |
+ DISCARD_WHITESPACE); |
+ const base::string16& zip2 = NormalizeForComparison( |
+ p2.GetInfo(AutofillType(ADDRESS_HOME_ZIP), app_locale_), |
+ DISCARD_WHITESPACE); |
+ if (!zip1.empty() && !zip2.empty() && |
+ zip1.find(zip2) == base::string16::npos && |
+ zip2.find(zip1) == base::string16::npos) { |
+ return false; |
+ } |
+ |
+ // State |
+ // ------ |
+ // Heuristic: If the match is between non-empty zip codes then we can infer |
+ // that the two state strings are intended to have the same meaning. This |
+ // handles the cases where we have invalid or poorly formed data in one of the |
+ // state values (like "Select one", or "CA - California"). Otherwise, we |
+ // actually have to check if the states map to the the same set of tokens. |
+ const base::string16& state1 = NormalizeForComparison( |
+ p1.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_)); |
+ const base::string16& state2 = NormalizeForComparison( |
+ p2.GetInfo(AutofillType(ADDRESS_HOME_STATE), app_locale_)); |
+ if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(state1, state2)) { |
+ return false; |
+ } |
+ |
+ // City |
+ // ------ |
+ // Heuristic: If the match is between non-empty zip codes then we can infer |
+ // that the two city strings are intended to have the same meaning. This |
+ // handles the cases where we have a city vs one of its suburbs. Otherwise, we |
+ // actually have to check if the cities map to the the same set of tokens. |
+ const base::string16& city1 = NormalizeForComparison( |
+ p1.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_)); |
+ const base::string16& city2 = NormalizeForComparison( |
+ p2.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_)); |
+ if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(city1, city2)) { |
+ return false; |
+ } |
+ |
+ // Address |
+ // -------- |
+ // Heuristic: Use bag of words comparison on the post-normalized addresses. |
+ const base::string16& address1 = NormalizeForComparison( |
+ p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)); |
+ const base::string16& address2 = NormalizeForComparison( |
+ p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)); |
+ if (!HaveSameTokens(address1, address2)) { |
+ return false; |
+ } |
+ |
+ return true; |
+} |
+ |
+} // namespace autofill |