components/autofill/core/browser/autofill_profile_comparator.cc - Issue 2137533002: Embed address normalization rewriting rules.

Side by Side Diff: components/autofill/core/browser/autofill_profile_comparator.cc

Issue 2137533002: Embed address normalization rewriting rules. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « components/autofill/core/browser/autofill_profile_comparator.h ('k') | components/autofill/core/browser/autofill_profile_comparator_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/autofill/core/browser/autofill_profile_comparator.h"	5 #include "components/autofill/core/browser/autofill_profile_comparator.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <vector>	8 #include <vector>

9	9

10 #include "base/i18n/case_conversion.h"	10 #include "base/i18n/case_conversion.h"

11 #include "base/i18n/char_iterator.h"	11 #include "base/i18n/char_iterator.h"

12 #include "base/strings/string_piece.h"	12 #include "base/strings/string_piece.h"

13 #include "base/strings/string_split.h"	13 #include "base/strings/string_split.h"

14 #include "base/strings/string_util.h"	14 #include "base/strings/string_util.h"

15 #include "base/strings/utf_string_conversion_utils.h"	15 #include "base/strings/utf_string_conversion_utils.h"

16 #include "base/strings/utf_string_conversions.h"	16 #include "base/strings/utf_string_conversions.h"

	17 #include "components/autofill/core/browser/address_rewriter.h"

17 #include "components/autofill/core/browser/autofill_country.h"	18 #include "components/autofill/core/browser/autofill_country.h"

18 #include "components/autofill/core/browser/autofill_data_util.h"	19 #include "components/autofill/core/browser/autofill_data_util.h"

19 #include "components/autofill/core/browser/state_names.h"	20 #include "components/autofill/core/browser/state_names.h"

20 #include "third_party/libphonenumber/phonenumber_api.h"	21 #include "third_party/libphonenumber/phonenumber_api.h"

21	22

22 using i18n::phonenumbers::PhoneNumberUtil;	23 using i18n::phonenumbers::PhoneNumberUtil;

23 using base::UTF16ToUTF8;	24 using base::UTF16ToUTF8;

24 using base::UTF8ToUTF16;	25 using base::UTF8ToUTF16;

25	26

26 namespace autofill {	27 namespace autofill {

27 namespace {	28 namespace {

28	29

29 const base::char16 kSpace[] = {L' ', L'\0'};	30 const base::char16 kSpace[] = {L' ', L'\0'};

30 const base::char16 kUS[] = {L'U', L'S', L'\0'};

31	31

32 bool ContainsNewline(base::StringPiece16 text) {	32 bool ContainsNewline(base::StringPiece16 text) {

33 return text.find('\n') != base::StringPiece16::npos;	33 return text.find('\n') != base::StringPiece16::npos;

34 }	34 }

35	35

36 std::ostream& operator<<(std::ostream& os,	36 std::ostream& operator<<(std::ostream& os,

37 const ::i18n::phonenumbers::PhoneNumber& n) {	37 const ::i18n::phonenumbers::PhoneNumber& n) {

38 os << "country_code: " << n.country_code() << " "	38 os << "country_code: " << n.country_code() << " "

39 << "national_number: " << n.national_number();	39 << "national_number: " << n.national_number();

40 if (n.has_extension())	40 if (n.has_extension())

(...skipping 316 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
357 const base::string16& state2 = p2.GetInfo(kState, app_locale_);	357 const base::string16& state2 = p2.GetInfo(kState, app_locale_);

358 if (state1.empty()) {	358 if (state1.empty()) {

359 address->SetInfo(kState, state2, app_locale_);	359 address->SetInfo(kState, state2, app_locale_);

360 } else if (state2.empty()) {	360 } else if (state2.empty()) {

361 address->SetInfo(kState, state1, app_locale_);	361 address->SetInfo(kState, state1, app_locale_);

362 } else {	362 } else {

363 address->SetInfo(kState, (state2.size() < state1.size() ? state2 : state1),	363 address->SetInfo(kState, (state2.size() < state1.size() ? state2 : state1),

364 app_locale_);	364 app_locale_);

365 }	365 }

366	366

	367 AddressRewriter rewriter = AddressRewriter::ForCountryCode(country_code);

	368

367 // One of the cities is empty or one of the cities has a subset of tokens from	369 // One of the cities is empty or one of the cities has a subset of tokens from

368 // the other. Pick the city name with more tokens; this is usually the most	370 // the other. Pick the city name with more tokens; this is usually the most

369 // explicit one.	371 // explicit one.

370 const AutofillType kCity(ADDRESS_HOME_CITY);	372 const AutofillType kCity(ADDRESS_HOME_CITY);

371 const base::string16& city1 = p1.GetInfo(kCity, app_locale_);	373 const base::string16& city1 = p1.GetInfo(kCity, app_locale_);

372 const base::string16& city2 = p2.GetInfo(kCity, app_locale_);	374 const base::string16& city2 = p2.GetInfo(kCity, app_locale_);

373 if (city1.empty()) {	375 if (city1.empty()) {

374 address->SetInfo(kCity, city2, app_locale_);	376 address->SetInfo(kCity, city2, app_locale_);

375 } else if (city2.empty()) {	377 } else if (city2.empty()) {

376 address->SetInfo(kCity, city1, app_locale_);	378 address->SetInfo(kCity, city1, app_locale_);

377 } else {	379 } else {

378 // Prefer the one with more tokens.	380 // Prefer the one with more tokens, making sure to apply address

379 CompareTokensResult result = CompareTokens(NormalizeForComparison(city1),	381 // normalization and rewriting before doing the comparison.

380 NormalizeForComparison(city2));	382 CompareTokensResult result =

	383 CompareTokens(rewriter.Rewrite(NormalizeForComparison(city1)),

	384 rewriter.Rewrite(NormalizeForComparison(city2)));

381 switch (result) {	385 switch (result) {

382 case SAME_TOKENS:	386 case SAME_TOKENS:

383 // They have the same set of unique tokens. Let's pick the more recently	387 // They have the same set of unique tokens. Let's pick the more recently

384 // used one.	388 // used one.

385 address->SetInfo(kCity, (p2.use_date() > p1.use_date() ? city2 : city1),	389 address->SetInfo(kCity, (p2.use_date() > p1.use_date() ? city2 : city1),

386 app_locale_);	390 app_locale_);

387 break;	391 break;

388 case S1_CONTAINS_S2:	392 case S1_CONTAINS_S2:

389 // city1 has more unique tokens than city2.	393 // city1 has more unique tokens than city2.

390 address->SetInfo(kCity, city1, app_locale_);	394 address->SetInfo(kCity, city1, app_locale_);

(...skipping 24 matching lines...) Expand all Loading...
415 } else {	419 } else {

416 // Prefer the multi-line address if one is multi-line and the other isn't.	420 // Prefer the multi-line address if one is multi-line and the other isn't.

417 bool address1_multiline = ContainsNewline(address1);	421 bool address1_multiline = ContainsNewline(address1);

418 bool address2_multiline = ContainsNewline(address2);	422 bool address2_multiline = ContainsNewline(address2);

419 if (address1_multiline && !address2_multiline) {	423 if (address1_multiline && !address2_multiline) {

420 address->SetInfo(kStreetAddress, address1, app_locale_);	424 address->SetInfo(kStreetAddress, address1, app_locale_);

421 } else if (address2_multiline && !address1_multiline) {	425 } else if (address2_multiline && !address1_multiline) {

422 address->SetInfo(kStreetAddress, address2, app_locale_);	426 address->SetInfo(kStreetAddress, address2, app_locale_);

423 } else {	427 } else {

424 // Prefer the one with more tokens if they're both single-line or both	428 // Prefer the one with more tokens if they're both single-line or both

425 // multi-line addresses.	429 // multi-line addresses, making sure to apply address normalization and

426 CompareTokensResult result = CompareTokens(	430 // rewriting before doing the comparison.

427 NormalizeForComparison(address1), NormalizeForComparison(address2));	431 CompareTokensResult result =

	432 CompareTokens(rewriter.Rewrite(NormalizeForComparison(address1)),

	433 rewriter.Rewrite(NormalizeForComparison(address2)));

428 switch (result) {	434 switch (result) {

429 case SAME_TOKENS:	435 case SAME_TOKENS:

430 // They have the same set of unique tokens. Let's pick the more recently	436 // They have the same set of unique tokens. Let's pick the more recently

431 // used one.	437 // used one.

432 address->SetInfo(	438 address->SetInfo(

433 kStreetAddress,	439 kStreetAddress,

434 (p2.use_date() > p1.use_date() ? address2 : address1),	440 (p2.use_date() > p1.use_date() ? address2 : address1),

435 app_locale_);	441 app_locale_);

436 break;	442 break;

437 case S1_CONTAINS_S2:	443 case S1_CONTAINS_S2:

(...skipping 244 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
682 const base::string16& zip1 = NormalizeForComparison(	688 const base::string16& zip1 = NormalizeForComparison(

683 p1.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);	689 p1.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);

684 const base::string16& zip2 = NormalizeForComparison(	690 const base::string16& zip2 = NormalizeForComparison(

685 p2.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);	691 p2.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);

686 if (!zip1.empty() && !zip2.empty() &&	692 if (!zip1.empty() && !zip2.empty() &&

687 zip1.find(zip2) == base::string16::npos &&	693 zip1.find(zip2) == base::string16::npos &&

688 zip2.find(zip1) == base::string16::npos) {	694 zip2.find(zip1) == base::string16::npos) {

689 return false;	695 return false;

690 }	696 }

691	697

	698 AddressRewriter rewriter =

	699 AddressRewriter::ForCountryCode(country1.empty() ? country2 : country1);

	700

692 // State	701 // State

693 // ------	702 // ------

694 // Heuristic: States are mergeable if one is a (possibly empty) bag of words	703 // Heuristic: States are mergeable if one is a (possibly empty) bag of words

695 // subset of the other.	704 // subset of the other.

696 //	705 //

697 // TODO(rogerm): If the match is between non-empty zip codes then we can infer	706 // TODO(rogerm): If the match is between non-empty zip codes then we can infer

698 // that the two state strings are intended to have the same meaning. This	707 // that the two state strings are intended to have the same meaning. This

699 // handles the cases where we have invalid or poorly formed data in one of the	708 // handles the cases where we have invalid or poorly formed data in one of the

700 // state values (like "Select one", or "CA - California").	709 // state values (like "Select one", or "CA - California").

701 const AutofillType kState(ADDRESS_HOME_STATE);	710 const AutofillType kState(ADDRESS_HOME_STATE);

702 const base::string16& state1 =	711 const base::string16& state1 =

703 NormalizeForComparison(p1.GetInfo(kState, app_locale_));	712 rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kState, app_locale_)));

704 const base::string16& state2 =	713 const base::string16& state2 =

705 NormalizeForComparison(p2.GetInfo(kState, app_locale_));	714 rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kState, app_locale_)));

706 if (!IsMatchingState(GetNonEmptyOf(p1, p2, kCountryCode), state1, state2) &&	715 if (CompareTokens(state1, state2) == DIFFERENT_TOKENS) {

707 CompareTokens(state1, state2) == DIFFERENT_TOKENS) {

708 return false;	716 return false;

709 }	717 }

710	718

711 // City	719 // City

712 // ------	720 // ------

713 // Heuristic: Cities are mergeable if one is a (possibly empty) bag of words	721 // Heuristic: Cities are mergeable if one is a (possibly empty) bag of words

714 // subset of the other.	722 // subset of the other.

715 //	723 //

716 // TODO(rogerm): If the match is between non-empty zip codes then we can infer	724 // TODO(rogerm): If the match is between non-empty zip codes then we can infer

717 // that the two city strings are intended to have the same meaning. This	725 // that the two city strings are intended to have the same meaning. This

718 // handles the cases where we have a city vs one of its suburbs.	726 // handles the cases where we have a city vs one of its suburbs.

719 const base::string16& city1 = NormalizeForComparison(	727 const base::string16& city1 = rewriter.Rewrite(NormalizeForComparison(

720 p1.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_));	728 p1.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_)));

721 const base::string16& city2 = NormalizeForComparison(	729 const base::string16& city2 = rewriter.Rewrite(NormalizeForComparison(

722 p2.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_));	730 p2.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_)));

723 if (CompareTokens(city1, city2) == DIFFERENT_TOKENS) {	731 if (CompareTokens(city1, city2) == DIFFERENT_TOKENS) {

724 return false;	732 return false;

725 }	733 }

726	734

727 // Address	735 // Address

728 // --------	736 // --------

729 // Heuristic: Street addresses are mergeable if one is a (possibly empty) bag	737 // Heuristic: Street addresses are mergeable if one is a (possibly empty) bag

730 // of words subset of the other.	738 // of words subset of the other.

731 const base::string16& address1 = NormalizeForComparison(	739 const base::string16& address1 = rewriter.Rewrite(NormalizeForComparison(

732 p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_));	740 p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)));

733 const base::string16& address2 = NormalizeForComparison(	741 const base::string16& address2 = rewriter.Rewrite(NormalizeForComparison(

734 p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_));	742 p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)));

735 if (CompareTokens(address1, address2) == DIFFERENT_TOKENS) {	743 if (CompareTokens(address1, address2) == DIFFERENT_TOKENS) {

736 return false;	744 return false;

737 }	745 }

738	746

739 return true;	747 return true;

740 }	748 }

741	749

742 bool AutofillProfileComparator::IsMatchingState(

743 const base::string16& country_code,

744 const base::string16& state1,

745 const base::string16& state2) const {

746 if (state1 == state2)

747 return true;

748

749 if (country_code != kUS)

750 return false;

751

752 // TODO(rogerm): Generalize this to all locales using string equivalence rules.

753 base::string16 name, abbreviation;

754 autofill::state_names::GetNameAndAbbreviation(state1, &name, &abbreviation);

755 if (abbreviation.empty()) {

756 // state1 wasn't recognized. There's no need to compare it to state2

757 return false;

758 }

759

760 return state2 == name \|\| state2 == abbreviation;

761 }

762

763 } // namespace autofill	750 } // namespace autofill

OLD	NEW