| OLD | NEW |
| (Empty) | |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "components/autofill/core/browser/autofill_profile_comparator.h" |
| 6 |
| 7 #include <algorithm> |
| 8 #include <vector> |
| 9 |
| 10 #include "base/i18n/char_iterator.h" |
| 11 #include "base/strings/string_split.h" |
| 12 #include "base/strings/string_util.h" |
| 13 #include "base/strings/utf_string_conversion_utils.h" |
| 14 #include "base/strings/utf_string_conversions.h" |
| 15 #include "third_party/libphonenumber/phonenumber_api.h" |
| 16 |
| 17 namespace autofill { |
| 18 namespace { |
| 19 |
| 20 const base::char16 kSpace[] = {L' ', L'\0'}; |
| 21 |
| 22 } // namespace |
| 23 |
| 24 AutofillProfileComparator::AutofillProfileComparator() { |
| 25 // Use ICU transliteration to remove diacritics and fold case. |
| 26 // See http://userguide.icu-project.org/transforms/general |
| 27 UErrorCode status = U_ZERO_ERROR; |
| 28 std::unique_ptr<icu::Transliterator> transliterator( |
| 29 icu::Transliterator::createInstance( |
| 30 "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD, |
| 31 status)); |
| 32 if (U_FAILURE(status) || transliterator == nullptr) { |
| 33 // TODO(rogerm): Add a histogram to count how often this happens. |
| 34 LOG(ERROR) << "Failed to create ICU Transliterator: " |
| 35 << u_errorName(status); |
| 36 } |
| 37 |
| 38 transliterator_ = std::move(transliterator); |
| 39 } |
| 40 |
| 41 AutofillProfileComparator::~AutofillProfileComparator() {} |
| 42 |
| 43 base::string16 AutofillProfileComparator::NormalizeForComparison( |
| 44 base::StringPiece16 text, |
| 45 AutofillProfileComparator::WhitespaceSpec whitespace_spec) const { |
| 46 // This algorithm is not designed to be perfect, we could get arbitrarily |
| 47 // fancy here trying to canonicalize address lines. Instead, this is designed |
| 48 // to handle common cases for all types of data (addresses and names) without |
| 49 // the need of domain-specific logic. |
| 50 // |
| 51 // 1. Convert punctuation to spaces and normalize all whitespace to spaces. |
| 52 // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing |
| 53 // space will be trimmed off outside of the end of the loop). |
| 54 // |
| 55 // 2. Collapse consecutive punctuation/whitespace characters to a single |
| 56 // space. We pretend the string has already started with whitespace in |
| 57 // order to trim leading spaces. |
| 58 // |
| 59 // 3. Remove diacritics (accents and other non-spacing marks) and perform |
| 60 // case folding to lower-case. |
| 61 base::string16 result; |
| 62 result.reserve(text.length()); |
| 63 bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE); |
| 64 for (base::i18n::UTF16CharIterator iter(text.data(), text.length()); |
| 65 !iter.end(); iter.Advance()) { |
| 66 switch (u_charType(iter.get())) { |
| 67 // Punctuation |
| 68 case U_DASH_PUNCTUATION: |
| 69 case U_START_PUNCTUATION: |
| 70 case U_END_PUNCTUATION: |
| 71 case U_CONNECTOR_PUNCTUATION: |
| 72 case U_OTHER_PUNCTUATION: |
| 73 // Whitespace |
| 74 case U_CONTROL_CHAR: // To escape the '\n' character. |
| 75 case U_SPACE_SEPARATOR: |
| 76 case U_LINE_SEPARATOR: |
| 77 case U_PARAGRAPH_SEPARATOR: |
| 78 if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) { |
| 79 result.push_back(' '); |
| 80 previous_was_whitespace = true; |
| 81 } |
| 82 break; |
| 83 |
| 84 default: |
| 85 previous_was_whitespace = false; |
| 86 base::WriteUnicodeCharacter(iter.get(), &result); |
| 87 break; |
| 88 } |
| 89 } |
| 90 |
| 91 // Trim off trailing whitespace if we left one. |
| 92 if (previous_was_whitespace && !result.empty()) |
| 93 result.resize(result.size() - 1); |
| 94 |
| 95 if (transliterator_ == nullptr) |
| 96 return result; |
| 97 |
| 98 icu::UnicodeString value = icu::UnicodeString(result.data(), result.length()); |
| 99 transliterator_->transliterate(value); |
| 100 return base::string16(value.getBuffer(), value.length()); |
| 101 } |
| 102 |
| 103 bool AutofillProfileComparator::IsMergeable(const AutofillProfile& p1, |
| 104 const AutofillProfile& p2) const { |
| 105 // Sorted in order to relative expense of the tests to fail early and cheaply |
| 106 // if possible. |
| 107 return HaveMergeableEmailAddresses(p1, p2) && |
| 108 HaveMergeableCompanyNames(p1, p2) && |
| 109 HaveMergeablePhoneNumbers(p1, p2) && HaveMergeableNames(p1, p2) && |
| 110 HaveMergeableAddresses(p1, p2); |
| 111 } |
| 112 |
| 113 std::set<base::string16> AutofillProfileComparator::GetNameVariants( |
| 114 const base::string16& name) const { |
| 115 const size_t kMaxSupportedNameParts = 8; |
| 116 |
| 117 std::vector<base::string16> name_parts = base::SplitString( |
| 118 name, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); |
| 119 |
| 120 // Limit the number of parts we support (to constrain memory usage); |
| 121 if (name_parts.size() > kMaxSupportedNameParts) |
| 122 return {name}; |
| 123 |
| 124 // Start with the empty string as a variant. |
| 125 std::set<base::string16> variants = {base::EmptyString16()}; |
| 126 |
| 127 // For each name part, add a variant of all the already existing variants that |
| 128 // appends this name part and one that appends the initial of this name part. |
| 129 // Duplicates will be discarded when they're added to the variants set. |
| 130 for (const base::string16& part : name_parts) { |
| 131 if (part.empty()) |
| 132 continue; |
| 133 std::vector<base::string16> new_variants; |
| 134 for (const base::string16& variant : variants) { |
| 135 new_variants.push_back(base::CollapseWhitespace( |
| 136 base::JoinString({variant, part}, kSpace), true)); |
| 137 new_variants.push_back(base::CollapseWhitespace( |
| 138 base::JoinString({variant, part.substr(0, 1)}, kSpace), true)); |
| 139 } |
| 140 variants.insert(new_variants.begin(), new_variants.end()); |
| 141 } |
| 142 return variants; |
| 143 } |
| 144 |
| 145 // static |
| 146 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens( |
| 147 base::StringPiece16 s) { |
| 148 std::vector<base::StringPiece16> tokens = base::SplitStringPiece( |
| 149 s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); |
| 150 return std::set<base::StringPiece16>(tokens.begin(), tokens.end()); |
| 151 } |
| 152 |
| 153 // static |
| 154 bool AutofillProfileComparator::HaveSameTokens(base::StringPiece16 s1, |
| 155 base::StringPiece16 s2) { |
| 156 std::set<base::StringPiece16> t1 = UniqueTokens(s1); |
| 157 std::set<base::StringPiece16> t2 = UniqueTokens(s2); |
| 158 |
| 159 // Note: std::include() expects the items in each range to be in sorted order, |
| 160 // hence the use of std::set<> instead of std::unordered_set<>. |
| 161 return std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()) || |
| 162 std::includes(t2.begin(), t2.end(), t1.begin(), t1.end()); |
| 163 } |
| 164 |
| 165 bool AutofillProfileComparator::IsNameVariantOf( |
| 166 const AutofillProfile& profile, |
| 167 const base::string16& full_name) const { |
| 168 // Build the variants of profile 2's first and middle name, as well as the |
| 169 // last name or profile 2. |
| 170 const std::set<base::string16> first_name_variants = |
| 171 GetNameVariants(NormalizeForComparison(profile.GetRawInfo(NAME_FIRST))); |
| 172 const std::set<base::string16> middle_name_variants = |
| 173 GetNameVariants(NormalizeForComparison(profile.GetRawInfo(NAME_MIDDLE))); |
| 174 const base::string16 last_name = |
| 175 NormalizeForComparison(profile.GetRawInfo(NAME_LAST)); |
| 176 |
| 177 // Iterate over all full name variants of profile 2 and see if any of them |
| 178 // match the full name from profile 1. |
| 179 for (const base::string16& first_name : first_name_variants) { |
| 180 for (const base::string16& middle_name : middle_name_variants) { |
| 181 base::string16 candidate = base::CollapseWhitespace( |
| 182 base::JoinString({first_name, middle_name, last_name}, kSpace), true); |
| 183 if (candidate == full_name) |
| 184 return true; |
| 185 } |
| 186 } |
| 187 |
| 188 // There was not match found. |
| 189 return false; |
| 190 } |
| 191 |
| 192 bool AutofillProfileComparator::HaveMergeableNames( |
| 193 const AutofillProfile& p1, |
| 194 const AutofillProfile& p2) const { |
| 195 // Build the full name for profile 1. |
| 196 base::string16 full_name_1 = NormalizeForComparison( |
| 197 base::JoinString({p1.GetRawInfo(NAME_FIRST), p1.GetRawInfo(NAME_MIDDLE), |
| 198 p1.GetRawInfo(NAME_LAST)}, |
| 199 kSpace)); |
| 200 |
| 201 // Build the full name for profile 2. |
| 202 base::string16 full_name_2 = NormalizeForComparison( |
| 203 base::JoinString({p2.GetRawInfo(NAME_FIRST), p2.GetRawInfo(NAME_MIDDLE), |
| 204 p2.GetRawInfo(NAME_LAST)}, |
| 205 kSpace)); |
| 206 |
| 207 // Is it reasonable to merge the names from p1 and p2. |
| 208 return full_name_1.empty() || full_name_2.empty() || |
| 209 (full_name_1 == full_name_2) || IsNameVariantOf(p2, full_name_1) || |
| 210 IsNameVariantOf(p1, full_name_2); |
| 211 } |
| 212 |
| 213 bool AutofillProfileComparator::HaveMergeableEmailAddresses( |
| 214 const AutofillProfile& p1, |
| 215 const AutofillProfile& p2) const { |
| 216 const base::string16& email_1 = p1.GetRawInfo(EMAIL_ADDRESS); |
| 217 const base::string16& email_2 = p2.GetRawInfo(EMAIL_ADDRESS); |
| 218 return email_1.empty() || email_2.empty() || |
| 219 case_insensitive_compare_.StringsEqual(email_1, email_2); |
| 220 } |
| 221 |
| 222 bool AutofillProfileComparator::HaveMergeableCompanyNames( |
| 223 const AutofillProfile& p1, |
| 224 const AutofillProfile& p2) const { |
| 225 const base::string16& company_name_1 = |
| 226 NormalizeForComparison(p1.GetRawInfo(COMPANY_NAME)); |
| 227 const base::string16& company_name_2 = |
| 228 NormalizeForComparison(p2.GetRawInfo(COMPANY_NAME)); |
| 229 return company_name_1.empty() || company_name_2.empty() || |
| 230 HaveSameTokens(company_name_1, company_name_2); |
| 231 } |
| 232 |
| 233 bool AutofillProfileComparator::HaveMergeablePhoneNumbers( |
| 234 const AutofillProfile& p1, |
| 235 const AutofillProfile& p2) const { |
| 236 using ::i18n::phonenumbers::PhoneNumberUtil; |
| 237 |
| 238 // Are the two phone numbers trivially mergeable? |
| 239 const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER); |
| 240 const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER); |
| 241 if (raw_phone_1.empty() || raw_phone_2.empty() || |
| 242 raw_phone_1 == raw_phone_2) { |
| 243 return true; |
| 244 } |
| 245 |
| 246 // The phone numbers util library needs the numbers in utf8. |
| 247 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1); |
| 248 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2); |
| 249 |
| 250 // Parse and compare the phone numbers. |
| 251 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance(); |
| 252 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) { |
| 253 case PhoneNumberUtil::INVALID_NUMBER: |
| 254 case PhoneNumberUtil::NO_MATCH: |
| 255 return false; |
| 256 case PhoneNumberUtil::SHORT_NSN_MATCH: |
| 257 case PhoneNumberUtil::NSN_MATCH: |
| 258 case PhoneNumberUtil::EXACT_MATCH: |
| 259 return true; |
| 260 } |
| 261 |
| 262 NOTREACHED(); |
| 263 return false; |
| 264 } |
| 265 |
| 266 bool AutofillProfileComparator::HaveMergeableAddresses( |
| 267 const AutofillProfile& p1, |
| 268 const AutofillProfile& p2) const { |
| 269 // If the address are not in the same country, then they're not the same. If |
| 270 // one of the address countries is unknown/invalid the comparison continues. |
| 271 const base::string16& country1 = |
| 272 p1.GetInfo(AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), "en-US"); |
| 273 const base::string16& country2 = |
| 274 p2.GetInfo(AutofillType(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE), "en-US"); |
| 275 if (!country1.empty() && !country2.empty() && |
| 276 !case_insensitive_compare_.StringsEqual(country1, country2)) { |
| 277 LOG(ERROR) << country1 << "!=" << country2; |
| 278 return false; |
| 279 } |
| 280 |
| 281 // TODO(rogerm): Lookup the normalization rules for the country of the |
| 282 // address. |
| 283 |
| 284 // Zip |
| 285 // ---- |
| 286 // If the addresses are definitely not in the same zip/area code then we're |
| 287 // done. Otherwise,the comparison continues. |
| 288 const base::string16& zip1 = NormalizeForComparison( |
| 289 p1.GetRawInfo(ADDRESS_HOME_ZIP), DISCARD_WHITESPACE); |
| 290 const base::string16& zip2 = NormalizeForComparison( |
| 291 p2.GetRawInfo(ADDRESS_HOME_ZIP), DISCARD_WHITESPACE); |
| 292 if (!zip1.empty() && !zip2.empty() && |
| 293 zip1.find(zip2) == base::string16::npos && |
| 294 zip2.find(zip1) == base::string16::npos) { |
| 295 return false; |
| 296 } |
| 297 |
| 298 // State |
| 299 // ------ |
| 300 // Heuristic: If the match is between non-empty zip codes then we can infer |
| 301 // that the two state strings are intended to have the same meaning. This |
| 302 // handles the cases where we have invalid or poorly formed data in |
| 303 // one of the state values (like "Select one", or "CA - California"). |
| 304 // Otherwise, we actually have to check if the states map to the the same set |
| 305 // of |
| 306 // tokens. |
| 307 const base::string16& state1 = |
| 308 NormalizeForComparison(p1.GetRawInfo(ADDRESS_HOME_STATE)); |
| 309 const base::string16& state2 = |
| 310 NormalizeForComparison(p2.GetRawInfo(ADDRESS_HOME_STATE)); |
| 311 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(state1, state2)) { |
| 312 return false; |
| 313 } |
| 314 |
| 315 // City |
| 316 // ------ |
| 317 // Heuristic: If the match is between non-empty zip codes then we can infer |
| 318 // that the two city strings are intended to have the same meaning. This |
| 319 // handles the cases where we have a city vs one of its suburbs. |
| 320 // Otherwise, we actually have to check if the cities map to the the same set |
| 321 // of |
| 322 // tokens. |
| 323 const base::string16& city1 = |
| 324 NormalizeForComparison(p1.GetRawInfo(ADDRESS_HOME_CITY)); |
| 325 const base::string16& city2 = |
| 326 NormalizeForComparison(p2.GetRawInfo(ADDRESS_HOME_CITY)); |
| 327 if ((zip1.empty() || zip2.empty()) && !HaveSameTokens(city1, city2)) { |
| 328 return false; |
| 329 } |
| 330 |
| 331 // Address |
| 332 const base::string16& address1 = NormalizeForComparison(base::JoinString( |
| 333 {p1.GetRawInfo(ADDRESS_HOME_LINE1), p1.GetRawInfo(ADDRESS_HOME_LINE2)}, |
| 334 kSpace)); |
| 335 const base::string16& address2 = NormalizeForComparison(base::JoinString( |
| 336 {p2.GetRawInfo(ADDRESS_HOME_LINE1), p2.GetRawInfo(ADDRESS_HOME_LINE2)}, |
| 337 kSpace)); |
| 338 if (!HaveSameTokens(address1, address2)) { |
| 339 return false; |
| 340 } |
| 341 |
| 342 return true; |
| 343 } |
| 344 |
| 345 } // namespace autofill |
| OLD | NEW |