Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(103)

Side by Side Diff: components/autofill/core/browser/autofill_profile_comparator.cc

Issue 2493253002: [autofill] Add address comparison/merge logic for dependent locality and sorting codes (Closed)
Patch Set: fix try bots (redux) Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/autofill/core/browser/autofill_profile_comparator.h" 5 #include "components/autofill/core/browser/autofill_profile_comparator.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <vector> 8 #include <vector>
9 9
10 #include "base/i18n/case_conversion.h" 10 #include "base/i18n/case_conversion.h"
(...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after
309 const base::string16* best = nullptr; 309 const base::string16* best = nullptr;
310 310
311 DCHECK(HaveMergeableCompanyNames(p1, p2)) 311 DCHECK(HaveMergeableCompanyNames(p1, p2))
312 << "Company names are not mergeable: '" << c1 << "' vs '" << c2 << "'"; 312 << "Company names are not mergeable: '" << c1 << "' vs '" << c2 << "'";
313 313
314 CompareTokensResult result = 314 CompareTokensResult result =
315 CompareTokens(NormalizeForComparison(c1), NormalizeForComparison(c2)); 315 CompareTokens(NormalizeForComparison(c1), NormalizeForComparison(c2));
316 switch (result) { 316 switch (result) {
317 case DIFFERENT_TOKENS: 317 case DIFFERENT_TOKENS:
318 default: 318 default:
319 NOTREACHED(); 319 NOTREACHED() << "Unexpected mismatch: '" << c1 << "' vs '" << c2 << "'";
320 return false; 320 return false;
321 case S1_CONTAINS_S2: 321 case S1_CONTAINS_S2:
322 best = &c1; 322 best = &c1;
323 break; 323 break;
324 case S2_CONTAINS_S1: 324 case S2_CONTAINS_S1:
325 best = &c2; 325 best = &c2;
326 break; 326 break;
327 case SAME_TOKENS: 327 case SAME_TOKENS:
328 best = p2.use_date() > p1.use_date() ? &c2 : &c1; 328 best = p2.use_date() > p1.use_date() ? &c2 : &c1;
329 break; 329 break;
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after
494 case S1_CONTAINS_S2: 494 case S1_CONTAINS_S2:
495 // city1 has more unique tokens than city2. 495 // city1 has more unique tokens than city2.
496 address->SetInfo(kCity, city1, app_locale_); 496 address->SetInfo(kCity, city1, app_locale_);
497 break; 497 break;
498 case S2_CONTAINS_S1: 498 case S2_CONTAINS_S1:
499 // city2 has more unique tokens than city1. 499 // city2 has more unique tokens than city1.
500 address->SetInfo(kCity, city2, app_locale_); 500 address->SetInfo(kCity, city2, app_locale_);
501 break; 501 break;
502 case DIFFERENT_TOKENS: 502 case DIFFERENT_TOKENS:
503 default: 503 default:
504 // The addresses aren't mergeable and we shouldn't be doing any of 504 // The cities aren't mergeable and we shouldn't be doing any of
505 // this. 505 // this.
506 NOTREACHED(); 506 NOTREACHED() << "Unexpected mismatch: '" << city1 << "' vs '" << city2
507 << "'";
507 return false; 508 return false;
508 } 509 }
509 } 510 }
510 511
512 // One of the dependent localities is empty or one of the localities has a
513 // subset of tokens from the other. Pick the locality name with more tokens;
514 // this is usually the most explicit one.
515 const AutofillType kDependentLocality(ADDRESS_HOME_DEPENDENT_LOCALITY);
516 const base::string16& locality1 = p1.GetInfo(kDependentLocality, app_locale_);
517 const base::string16& locality2 = p2.GetInfo(kDependentLocality, app_locale_);
518 if (locality1.empty()) {
519 address->SetInfo(kDependentLocality, locality2, app_locale_);
520 } else if (locality2.empty()) {
521 address->SetInfo(kDependentLocality, locality1, app_locale_);
522 } else {
523 // Prefer the one with more tokens, making sure to apply address
524 // normalization and rewriting before doing the comparison.
525 CompareTokensResult result =
526 CompareTokens(rewriter.Rewrite(NormalizeForComparison(locality1)),
527 rewriter.Rewrite(NormalizeForComparison(locality2)));
528 switch (result) {
529 case SAME_TOKENS:
530 // They have the same set of unique tokens. Let's pick the more recently
531 // used one.
532 address->SetInfo(
533 kDependentLocality,
534 (p2.use_date() > p1.use_date() ? locality2 : locality1),
535 app_locale_);
536 break;
537 case S1_CONTAINS_S2:
538 // locality1 has more unique tokens than locality2.
539 address->SetInfo(kDependentLocality, locality1, app_locale_);
540 break;
541 case S2_CONTAINS_S1:
542 // locality2 has more unique tokens than locality1.
543 address->SetInfo(kDependentLocality, locality2, app_locale_);
544 break;
545 case DIFFERENT_TOKENS:
546 default:
547 // The localities aren't mergeable and we shouldn't be doing any of
548 // this.
549 NOTREACHED() << "Unexpected mismatch: '" << locality1 << "' vs '"
550 << locality2 << "'";
551 return false;
552 }
553 }
554
555 // One of the sorting codes is empty, they are the same, or one is a substring
556 // of the other. We prefer the most recently used sorting code.
557 const AutofillType kSortingCode(ADDRESS_HOME_SORTING_CODE);
558 const base::string16& sorting1 = p1.GetInfo(kSortingCode, app_locale_);
559 const base::string16& sorting2 = p2.GetInfo(kSortingCode, app_locale_);
560 if (sorting1.empty()) {
561 address->SetInfo(kSortingCode, sorting2, app_locale_);
562 } else if (sorting2.empty()) {
563 address->SetInfo(kSortingCode, sorting1, app_locale_);
564 } else {
565 address->SetInfo(kSortingCode,
566 (p2.use_date() > p1.use_date() ? sorting2 : sorting1),
567 app_locale_);
568 }
569
511 // One of the addresses is empty or one of the addresses has a subset of 570 // One of the addresses is empty or one of the addresses has a subset of
512 // tokens from the other. Prefer the more verbosely expressed one. 571 // tokens from the other. Prefer the more verbosely expressed one.
513 const AutofillType kStreetAddress(ADDRESS_HOME_STREET_ADDRESS); 572 const AutofillType kStreetAddress(ADDRESS_HOME_STREET_ADDRESS);
514 const base::string16& address1 = p1.GetInfo(kStreetAddress, app_locale_); 573 const base::string16& address1 = p1.GetInfo(kStreetAddress, app_locale_);
515 const base::string16& address2 = p2.GetInfo(kStreetAddress, app_locale_); 574 const base::string16& address2 = p2.GetInfo(kStreetAddress, app_locale_);
516 // If one of the addresses is empty then use the other. 575 // If one of the addresses is empty then use the other.
517 if (address1.empty()) { 576 if (address1.empty()) {
518 address->SetInfo(kStreetAddress, address2, app_locale_); 577 address->SetInfo(kStreetAddress, address2, app_locale_);
519 } else if (address2.empty()) { 578 } else if (address2.empty()) {
520 address->SetInfo(kStreetAddress, address1, app_locale_); 579 address->SetInfo(kStreetAddress, address1, app_locale_);
(...skipping 20 matching lines...) Expand all
541 kStreetAddress, 600 kStreetAddress,
542 (p2.use_date() > p1.use_date() ? address2 : address1), 601 (p2.use_date() > p1.use_date() ? address2 : address1),
543 app_locale_); 602 app_locale_);
544 break; 603 break;
545 case S1_CONTAINS_S2: 604 case S1_CONTAINS_S2:
546 // address1 has more unique tokens than address2. 605 // address1 has more unique tokens than address2.
547 address->SetInfo(kStreetAddress, address1, app_locale_); 606 address->SetInfo(kStreetAddress, address1, app_locale_);
548 break; 607 break;
549 case S2_CONTAINS_S1: 608 case S2_CONTAINS_S1:
550 // address2 has more unique tokens than address1. 609 // address2 has more unique tokens than address1.
551 address->SetInfo(kStreetAddress, address1, app_locale_); 610 address->SetInfo(kStreetAddress, address2, app_locale_);
Roger McFarlane (Chromium) 2016/11/21 18:39:51 Latent copy-paste bug. Should have been address2 h
sebsg 2016/11/21 18:48:44 Good catch :)
552 break; 611 break;
553 case DIFFERENT_TOKENS: 612 case DIFFERENT_TOKENS:
554 default: 613 default:
555 // The addresses aren't mergeable and we shouldn't be doing any of 614 // The addresses aren't mergeable and we shouldn't be doing any of
556 // this. 615 // this.
557 NOTREACHED(); 616 NOTREACHED() << "Unexpected mismatch: '" << address1 << "' vs '"
617 << address2 << "'";
558 return false; 618 return false;
559 } 619 }
560 } 620 }
561 } 621 }
562 return true; 622 return true;
563 } 623 }
564 624
565 // static 625 // static
566 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens( 626 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens(
567 base::StringPiece16 s) { 627 base::StringPiece16 s) {
(...skipping 196 matching lines...) Expand 10 before | Expand all | Expand 10 after
764 // SHORT_NSN_MATCH and just call that instead of accessing the underlying 824 // SHORT_NSN_MATCH and just call that instead of accessing the underlying
765 // utility library directly? 825 // utility library directly?
766 826
767 // The phone number util library needs the numbers in utf8. 827 // The phone number util library needs the numbers in utf8.
768 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1); 828 const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1);
769 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2); 829 const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2);
770 830
771 // Parse and compare the phone numbers. 831 // Parse and compare the phone numbers.
772 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance(); 832 PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();
773 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) { 833 switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) {
774 case PhoneNumberUtil::INVALID_NUMBER:
775 case PhoneNumberUtil::NO_MATCH:
776 return false;
777 case PhoneNumberUtil::SHORT_NSN_MATCH: 834 case PhoneNumberUtil::SHORT_NSN_MATCH:
778 case PhoneNumberUtil::NSN_MATCH: 835 case PhoneNumberUtil::NSN_MATCH:
779 case PhoneNumberUtil::EXACT_MATCH: 836 case PhoneNumberUtil::EXACT_MATCH:
780 return true; 837 return true;
838 case PhoneNumberUtil::INVALID_NUMBER:
839 case PhoneNumberUtil::NO_MATCH:
840 return false;
841 default:
842 NOTREACHED();
843 return false;
781 } 844 }
782
783 NOTREACHED();
784 return false;
785 } 845 }
786 846
787 bool AutofillProfileComparator::HaveMergeableAddresses( 847 bool AutofillProfileComparator::HaveMergeableAddresses(
788 const AutofillProfile& p1, 848 const AutofillProfile& p1,
789 const AutofillProfile& p2) const { 849 const AutofillProfile& p2) const {
790 // If the addresses are not in the same country, then they're not the same. If 850 // If the addresses are not in the same country, then they're not the same. If
791 // one of the address countries is unknown/invalid the comparison continues. 851 // one of the address countries is unknown/invalid the comparison continues.
792 const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE); 852 const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE);
793 const base::string16& country1 = p1.GetInfo(kCountryCode, app_locale_); 853 const base::string16& country1 = p1.GetInfo(kCountryCode, app_locale_);
794 const base::string16& country2 = p2.GetInfo(kCountryCode, app_locale_); 854 const base::string16& country2 = p2.GetInfo(kCountryCode, app_locale_);
795 if (!country1.empty() && !country2.empty() && 855 if (!country1.empty() && !country2.empty() &&
796 !case_insensitive_compare_.StringsEqual(country1, country2)) { 856 !case_insensitive_compare_.StringsEqual(country1, country2)) {
797 return false; 857 return false;
798 } 858 }
799 859
800 // TODO(rogerm): Lookup the normalization rules for the (common) country of
801 // the address. The rules should be applied post NormalizeForComparison to
802 // the state, city, and address bag of words comparisons.
803
804 // Zip 860 // Zip
805 // ---- 861 // ----
806 // If the addresses are definitely not in the same zip/area code then we're 862 // If the addresses are definitely not in the same zip/area code then we're
806 // done. Otherwise, the comparison continues. 863 // done. Otherwise, the comparison continues.
808 const AutofillType kZipCode(ADDRESS_HOME_ZIP); 864 const AutofillType kZipCode(ADDRESS_HOME_ZIP);
809 const base::string16& zip1 = NormalizeForComparison( 865 const base::string16& zip1 = NormalizeForComparison(
810 p1.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE); 866 p1.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);
811 const base::string16& zip2 = NormalizeForComparison( 867 const base::string16& zip2 = NormalizeForComparison(
812 p2.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE); 868 p2.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);
813 if (!zip1.empty() && !zip2.empty() && 869 if (!zip1.empty() && !zip2.empty() &&
814 zip1.find(zip2) == base::string16::npos && 870 zip1.find(zip2) == base::string16::npos &&
815 zip2.find(zip1) == base::string16::npos) { 871 zip2.find(zip1) == base::string16::npos) {
816 return false; 872 return false;
817 } 873 }
818 874
875 // Use the token rewrite rules for the (common) country of the address to
876 // transform equivalent substrings to a representative token for comparison.
819 AddressRewriter rewriter = 877 AddressRewriter rewriter =
820 AddressRewriter::ForCountryCode(country1.empty() ? country2 : country1); 878 AddressRewriter::ForCountryCode(country1.empty() ? country2 : country1);
821 879
822 // State 880 // State
823 // ------ 881 // ------
824 // Heuristic: States are mergeable if one is a (possibly empty) bag of words 882 // Heuristic: States are mergeable if one is a (possibly empty) bag of words
825 // subset of the other. 883 // subset of the other.
826 // 884 //
827 // TODO(rogerm): If the match is between non-empty zip codes then we can infer 885 // TODO(rogerm): If the match is between non-empty zip codes then we can infer
828 // that the two state strings are intended to have the same meaning. This 886 // that the two state strings are intended to have the same meaning. This
829 // handles the cases where we have invalid or poorly formed data in one of the 887 // handles the cases where we have invalid or poorly formed data in one of the
830 // state values (like "Select one", or "CA - California"). 888 // state values (like "Select one", or "CA - California").
831 const AutofillType kState(ADDRESS_HOME_STATE); 889 const AutofillType kState(ADDRESS_HOME_STATE);
832 const base::string16& state1 = 890 const base::string16& state1 =
833 rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kState, app_locale_))); 891 rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kState, app_locale_)));
834 const base::string16& state2 = 892 const base::string16& state2 =
835 rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kState, app_locale_))); 893 rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kState, app_locale_)));
836 if (CompareTokens(state1, state2) == DIFFERENT_TOKENS) { 894 if (CompareTokens(state1, state2) == DIFFERENT_TOKENS) {
837 return false; 895 return false;
838 } 896 }
839 897
840 // City 898 // City
841 // ------ 899 // ------
842 // Heuristic: Cities are mergeable if one is a (possibly empty) bag of words 900 // Heuristic: Cities are mergeable if one is a (possibly empty) bag of words
843 // subset of the other. 901 // subset of the other.
844 // 902 //
845 // TODO(rogerm): If the match is between non-empty zip codes then we can infer 903 // TODO(rogerm): If the match is between non-empty zip codes then we can infer
846 // that the two city strings are intended to have the same meaning. This 904 // that the two city strings are intended to have the same meaning. This
847 // handles the cases where we have a city vs one of its suburbs. 905 // handles the cases where we have a city vs one of its suburbs.
848 const base::string16& city1 = rewriter.Rewrite(NormalizeForComparison( 906 const AutofillType kCity(ADDRESS_HOME_CITY);
849 p1.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_))); 907 const base::string16& city1 =
850 const base::string16& city2 = rewriter.Rewrite(NormalizeForComparison( 908 rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kCity, app_locale_)));
851 p2.GetInfo(AutofillType(ADDRESS_HOME_CITY), app_locale_))); 909 const base::string16& city2 =
910 rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kCity, app_locale_)));
852 if (CompareTokens(city1, city2) == DIFFERENT_TOKENS) { 911 if (CompareTokens(city1, city2) == DIFFERENT_TOKENS) {
853 return false; 912 return false;
854 } 913 }
855 914
915 // Dependent Locality
916 // -------------------
917 // Heuristic: Dependent Localities are mergeable if one is a (possibly empty)
918 // bag of words subset of the other.
919 const AutofillType kDependentLocality(ADDRESS_HOME_DEPENDENT_LOCALITY);
920 const base::string16& locality1 = rewriter.Rewrite(
921 NormalizeForComparison(p1.GetInfo(kDependentLocality, app_locale_)));
922 const base::string16& locality2 = rewriter.Rewrite(
923 NormalizeForComparison(p2.GetInfo(kDependentLocality, app_locale_)));
924 if (CompareTokens(locality1, locality2) == DIFFERENT_TOKENS) {
925 return false;
926 }
927
928 // Sorting Code
929 // -------------
930 // Heuristic: Sorting codes are mergeable if one is empty or one is a
931 // substring of the other, post normalization and whitespace removed. This
932 // is similar to postal/zip codes.
933 const AutofillType kSortingCode(ADDRESS_HOME_SORTING_CODE);
934 const base::string16& sorting1 = NormalizeForComparison(
935 p1.GetInfo(kSortingCode, app_locale_), DISCARD_WHITESPACE);
936 const base::string16& sorting2 = NormalizeForComparison(
937 p2.GetInfo(kSortingCode, app_locale_), DISCARD_WHITESPACE);
938 if (!sorting1.empty() && !sorting2.empty() &&
939 sorting1.find(sorting2) == base::string16::npos &&
940 sorting2.find(sorting1) == base::string16::npos) {
941 return false;
942 }
943
856 // Address 944 // Address
857 // -------- 945 // --------
858 // Heuristic: Street addresses are mergeable if one is a (possibly empty) bag 946 // Heuristic: Street addresses are mergeable if one is a (possibly empty) bag
859 // of words subset of the other. 947 // of words subset of the other.
860 const base::string16& address1 = rewriter.Rewrite(NormalizeForComparison( 948 const base::string16& address1 = rewriter.Rewrite(NormalizeForComparison(
861 p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_))); 949 p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)));
862 const base::string16& address2 = rewriter.Rewrite(NormalizeForComparison( 950 const base::string16& address2 = rewriter.Rewrite(NormalizeForComparison(
863 p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_))); 951 p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)));
864 if (CompareTokens(address1, address2) == DIFFERENT_TOKENS) { 952 if (CompareTokens(address1, address2) == DIFFERENT_TOKENS) {
865 return false; 953 return false;
866 } 954 }
867 955
868 return true; 956 return true;
869 } 957 }
870 958
871 } // namespace autofill 959 } // namespace autofill
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698