components/autofill/core/browser/autofill_profile.cc - Issue 2013063002: Remove diacritics when normalizing autofill profile strings for comparison.

Side by Side Diff: components/autofill/core/browser/autofill_profile.cc

Issue 2013063002: Remove diacritics when normalizing autofill profile strings for comparison. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Simplify normalization loop and expand test cases. Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « components/autofill/core/browser/autofill_manager_unittest.cc ('k') | components/autofill/core/browser/autofill_profile_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2013 The Chromium Authors. All rights reserved.	1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/autofill/core/browser/autofill_profile.h"	5 #include "components/autofill/core/browser/autofill_profile.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <functional>	8 #include <functional>

9 #include <map>	9 #include <map>

10 #include <memory>	10 #include <memory>

(...skipping 18 matching lines...) Expand all Loading...
29 #include "components/autofill/core/browser/autofill_type.h"	29 #include "components/autofill/core/browser/autofill_type.h"

30 #include "components/autofill/core/browser/contact_info.h"	30 #include "components/autofill/core/browser/contact_info.h"

31 #include "components/autofill/core/browser/phone_number.h"	31 #include "components/autofill/core/browser/phone_number.h"

32 #include "components/autofill/core/browser/phone_number_i18n.h"	32 #include "components/autofill/core/browser/phone_number_i18n.h"

33 #include "components/autofill/core/browser/state_names.h"	33 #include "components/autofill/core/browser/state_names.h"

34 #include "components/autofill/core/browser/validation.h"	34 #include "components/autofill/core/browser/validation.h"

35 #include "components/autofill/core/common/autofill_l10n_util.h"	35 #include "components/autofill/core/common/autofill_l10n_util.h"

36 #include "components/autofill/core/common/form_field_data.h"	36 #include "components/autofill/core/common/form_field_data.h"

37 #include "grit/components_strings.h"	37 #include "grit/components_strings.h"

38 #include "third_party/icu/source/common/unicode/uchar.h"	38 #include "third_party/icu/source/common/unicode/uchar.h"

	39 #include "third_party/icu/source/common/unicode/utypes.h"

	40 #include "third_party/icu/source/i18n/unicode/translit.h"

39 #include "third_party/libaddressinput/chromium/addressinput_util.h"	41 #include "third_party/libaddressinput/chromium/addressinput_util.h"

40 #include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_data.h"	42 #include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_data.h"

41 #include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_formatter.h"	43 #include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_formatter.h"

42 #include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_metadata.h"	44 #include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_metadata.h"

43 #include "ui/base/l10n/l10n_util.h"	45 #include "ui/base/l10n/l10n_util.h"

44	46

45 using base::ASCIIToUTF16;	47 using base::ASCIIToUTF16;

46 using base::UTF16ToUTF8;	48 using base::UTF16ToUTF8;

47 using i18n::addressinput::AddressData;	49 using i18n::addressinput::AddressData;

48 using i18n::addressinput::AddressField;	50 using i18n::addressinput::AddressField;

(...skipping 179 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
228 bool operator()(const base::string16& phone) {	230 bool operator()(const base::string16& phone) {

229 return i18n::PhoneNumbersMatch(phone, phone_, country_code_, app_locale_);	231 return i18n::PhoneNumbersMatch(phone, phone_, country_code_, app_locale_);

230 }	232 }

231	233

232 private:	234 private:

233 base::string16 phone_;	235 base::string16 phone_;

234 std::string country_code_;	236 std::string country_code_;

235 std::string app_locale_;	237 std::string app_locale_;

236 };	238 };

237	239

	240 base::string16 NormalizeForComparison(const base::string16& text) {

	241 using icu::UnicodeString;

	242 using icu::Transliterator;

	243

	244 // Use ICU transliteration to remove diacritics and fold case.

	245 // See http://userguide.icu-project.org/transforms/general

	246 UErrorCode status = U_ZERO_ERROR;

	247 std::unique_ptr<Transliterator> transliterator(Transliterator::createInstance(

	248 "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD, status));

	249 if (U_FAILURE(status) || transliterator == nullptr) {

	250 LOG(ERROR) << "Failed to create ICU Transliterator: "

	251 << u_errorName(status);

	252 return text;

	253 }

	254

	255 UnicodeString value = UnicodeString(text.data(), text.length());

	256 transliterator->transliterate(value);

	257

	258 return base::string16(value.getBuffer(), value.length());

	259 }

	260

238 } // namespace	261 } // namespace

239	262

240 AutofillProfile::AutofillProfile(const std::string& guid,	263 AutofillProfile::AutofillProfile(const std::string& guid,

241 const std::string& origin)	264 const std::string& origin)

242 : AutofillDataModel(guid, origin),	265 : AutofillDataModel(guid, origin),

243 record_type_(LOCAL_PROFILE),	266 record_type_(LOCAL_PROFILE),

244 phone_number_(this) {	267 phone_number_(this) {

245 }	268 }

246	269

247 AutofillProfile::AutofillProfile(RecordType type, const std::string& server_id)	270 AutofillProfile::AutofillProfile(RecordType type, const std::string& server_id)

(...skipping 539 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
787 (base::Time::Now() - use_date()).InDays());	810 (base::Time::Now() - use_date()).InDays());

788 RecordUse();	811 RecordUse();

789 }	812 }

790	813

791 // static	814 // static

792 base::string16 AutofillProfile::CanonicalizeProfileString(	815 base::string16 AutofillProfile::CanonicalizeProfileString(

793 const base::string16& str) {	816 const base::string16& str) {

794 base::string16 ret;	817 base::string16 ret;

795 ret.reserve(str.size());	818 ret.reserve(str.size());

796	819

797 bool previous_was_whitespace = false;	820 // This algorithm is not designed to be perfect, we could get arbitrarily

	821 // fancy here trying to canonicalize address lines. Instead, this is designed

	822 // to handle common cases for all types of data (addresses and names) without

	823 // the need of domain-specific logic.

	824 //

	825 // 1. Convert punctuation to spaces and normalize all whitespace to spaces.

	826 // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing

	827 // space will be trimmed off outside of the end of the loop).

	828 //

	829 // 2. Collapse consecutive punctuation/whitespace characters to a single

	830 // space. We pretend the string has already started with whitespace in

	831 // order to trim leading spaces.

	832 //

	833 // 3. Remove diacritics (accents and other non-spacing marks) and perform

	834 // case folding to lower-case.

798	835

799 // This algorithm isn't designed to be perfect, we could get arbitrarily	836 bool previous_was_whitespace = true;

800 // fancy here trying to canonicalize address lines. Instead, this is designed	837 for (base::i18n::UTF16CharIterator iter(&str); !iter.end(); iter.Advance()) {

801 // to handle common cases for all types of data (addresses and names)

802 // without the need of domain-specific logic.

803 base::i18n::UTF16CharIterator iter(&str);

804 while (!iter.end()) {

805 switch (u_charType(iter.get())) {	838 switch (u_charType(iter.get())) {

	839 // Punctuation

806 case U_DASH_PUNCTUATION:	840 case U_DASH_PUNCTUATION:

807 case U_START_PUNCTUATION:	841 case U_START_PUNCTUATION:

808 case U_END_PUNCTUATION:	842 case U_END_PUNCTUATION:

809 case U_CONNECTOR_PUNCTUATION:	843 case U_CONNECTOR_PUNCTUATION:

810 case U_OTHER_PUNCTUATION:	844 case U_OTHER_PUNCTUATION:

811 // Convert punctuation to spaces. This will convert "Mid-Island Plz."	845 // Whitespace

812 // -> "Mid Island Plz" (the trailing space will be trimmed off at the	846 case U_CONTROL_CHAR: // To escape the '\n' character.

813 // end of the loop).	847 case U_SPACE_SEPARATOR:

	848 case U_LINE_SEPARATOR:

	849 case U_PARAGRAPH_SEPARATOR:

814 if (!previous_was_whitespace) {	850 if (!previous_was_whitespace) {

815 ret.push_back(' ');	851 ret.push_back(' ');

816 previous_was_whitespace = true;	852 previous_was_whitespace = true;

817 }	853 }

818 break;	854 break;

819	855

820 case U_CONTROL_CHAR: // To escape the '\n' character.

821 case U_SPACE_SEPARATOR:

822 case U_LINE_SEPARATOR:

823 case U_PARAGRAPH_SEPARATOR:

824 // Convert sequences of spaces to single spaces.

825 if (!previous_was_whitespace) {

826 ret.push_back(' ');

827 previous_was_whitespace = true;

828 }

829 break;

830

831 case U_UPPERCASE_LETTER:

832 case U_TITLECASE_LETTER:

833 previous_was_whitespace = false;

834 base::WriteUnicodeCharacter(u_tolower(iter.get()), &ret);

835 break;

836

837 default:	856 default:

838 previous_was_whitespace = false;	857 previous_was_whitespace = false;

839 base::WriteUnicodeCharacter(iter.get(), &ret);	858 base::WriteUnicodeCharacter(iter.get(), &ret);

840 break;	859 break;

841 }	860 }

842 iter.Advance();

843 }	861 }

844	862

845 // Trim off trailing whitespace if we left one.	863 // Trim off trailing whitespace if we left one.

846 if (previous_was_whitespace)	864 if (previous_was_whitespace && !ret.empty())

847 ret.resize(ret.size() - 1);	865 ret.resize(ret.size() - 1);

848	866

849 return ret;	867 // Remove diacritics and perform case folding.

	868 return NormalizeForComparison(ret);

850 }	869 }

851	870

852 // static	871 // static

853 bool AutofillProfile::AreProfileStringsSimilar(const base::string16& a,	872 bool AutofillProfile::AreProfileStringsSimilar(const base::string16& a,

854 const base::string16& b) {	873 const base::string16& b) {

855 return CanonicalizeProfileString(a) == CanonicalizeProfileString(b);	874 return CanonicalizeProfileString(a) == CanonicalizeProfileString(b);

856 }	875 }

857	876

858 void AutofillProfile::GetSupportedTypes(	877 void AutofillProfile::GetSupportedTypes(

859 ServerFieldTypeSet* supported_types) const {	878 ServerFieldTypeSet* supported_types) const {

(...skipping 224 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1084 << " " << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_CITY)) << " "	1103 << " " << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_CITY)) << " "

1085 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_STATE)) << " "	1104 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_STATE)) << " "

1086 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_ZIP)) << " "	1105 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_ZIP)) << " "

1087 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_SORTING_CODE)) << " "	1106 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_SORTING_CODE)) << " "

1088 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_COUNTRY)) << " "	1107 << UTF16ToUTF8(profile.GetRawInfo(ADDRESS_HOME_COUNTRY)) << " "

1089 << profile.language_code() << " "	1108 << profile.language_code() << " "

1090 << UTF16ToUTF8(profile.GetRawInfo(PHONE_HOME_WHOLE_NUMBER));	1109 << UTF16ToUTF8(profile.GetRawInfo(PHONE_HOME_WHOLE_NUMBER));

1091 }	1110 }

1092	1111

1093 } // namespace autofill	1112 } // namespace autofill

OLD	NEW