Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(779)

Unified Diff: components/autofill/core/browser/autofill_profile.cc

Issue 2013063002: Remove diacritics when normalizing autofill profile strings for comparison. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Simplify normalization loop and expand test cases. Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/autofill/core/browser/autofill_profile.cc
diff --git a/components/autofill/core/browser/autofill_profile.cc b/components/autofill/core/browser/autofill_profile.cc
index 0f69602a9265b7a33035cbcbb7552847a1b150ad..2f3b6c34d6720b79e4a5ce06a4ffb3b25f804599 100644
--- a/components/autofill/core/browser/autofill_profile.cc
+++ b/components/autofill/core/browser/autofill_profile.cc
@@ -36,6 +36,8 @@
#include "components/autofill/core/common/form_field_data.h"
#include "grit/components_strings.h"
#include "third_party/icu/source/common/unicode/uchar.h"
+#include "third_party/icu/source/common/unicode/utypes.h"
+#include "third_party/icu/source/i18n/unicode/translit.h"
#include "third_party/libaddressinput/chromium/addressinput_util.h"
#include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_data.h"
#include "third_party/libaddressinput/src/cpp/include/libaddressinput/address_formatter.h"
@@ -235,6 +237,27 @@ class FindByPhone {
std::string app_locale_;
};
+// Returns |text| with diacritics (non-spacing marks) removed and case folded
+// to lower case, for use when comparing profile strings. If the ICU
+// transliterator cannot be created, logs an error and returns |text|
+// unchanged.
+base::string16 NormalizeForComparison(const base::string16& text) {
+  // Compound ICU transform: decompose (NFD), drop non-spacing marks,
+  // lower-case, then recompose (NFC).
+  // See http://userguide.icu-project.org/transforms/general
+  UErrorCode error_code = U_ZERO_ERROR;
+  std::unique_ptr<icu::Transliterator> normalizer(
+      icu::Transliterator::createInstance(
+          "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD,
+          error_code));
+  const bool creation_failed = !normalizer || U_FAILURE(error_code);
+  if (creation_failed) {
+    LOG(ERROR) << "Failed to create ICU Transliterator: "
+               << u_errorName(error_code);
+    return text;
+  }
+
+  // Run the transform on a UnicodeString built from the input, then convert
+  // the result back to a string16.
+  icu::UnicodeString normalized(text.data(), text.length());
+  normalizer->transliterate(normalized);
+
+  return base::string16(normalized.getBuffer(), normalized.length());
+}
+
} // namespace
AutofillProfile::AutofillProfile(const std::string& guid,
@@ -794,59 +817,55 @@ base::string16 AutofillProfile::CanonicalizeProfileString(
base::string16 ret;
ret.reserve(str.size());
- bool previous_was_whitespace = false;
-
- // This algorithm isn't designed to be perfect, we could get arbitrarily
+ // This algorithm is not designed to be perfect, we could get arbitrarily
// fancy here trying to canonicalize address lines. Instead, this is designed
- // to handle common cases for all types of data (addresses and names)
- // without the need of domain-specific logic.
- base::i18n::UTF16CharIterator iter(&str);
- while (!iter.end()) {
+ // to handle common cases for all types of data (addresses and names) without
+ // the need of domain-specific logic.
+ //
+ // 1. Convert punctuation to spaces and normalize all whitespace to spaces.
+ // This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing
+ // space is trimmed off after the loop).
+ //
+ // 2. Collapse consecutive punctuation/whitespace characters to a single
+ // space. We pretend the string has already started with whitespace in
+ // order to trim leading spaces.
+ //
+ // 3. Remove diacritics (accents and other non-spacing marks) and perform
+ // case folding to lower-case.
+
+ bool previous_was_whitespace = true;
+ for (base::i18n::UTF16CharIterator iter(&str); !iter.end(); iter.Advance()) {
switch (u_charType(iter.get())) {
+ // Punctuation
case U_DASH_PUNCTUATION:
case U_START_PUNCTUATION:
case U_END_PUNCTUATION:
case U_CONNECTOR_PUNCTUATION:
case U_OTHER_PUNCTUATION:
- // Convert punctuation to spaces. This will convert "Mid-Island Plz."
- // -> "Mid Island Plz" (the trailing space will be trimmed off at the
- // end of the loop).
- if (!previous_was_whitespace) {
- ret.push_back(' ');
- previous_was_whitespace = true;
- }
- break;
-
+ // Whitespace
case U_CONTROL_CHAR: // To escape the '\n' character.
case U_SPACE_SEPARATOR:
case U_LINE_SEPARATOR:
case U_PARAGRAPH_SEPARATOR:
- // Convert sequences of spaces to single spaces.
if (!previous_was_whitespace) {
ret.push_back(' ');
previous_was_whitespace = true;
}
break;
- case U_UPPERCASE_LETTER:
- case U_TITLECASE_LETTER:
- previous_was_whitespace = false;
- base::WriteUnicodeCharacter(u_tolower(iter.get()), &ret);
- break;
-
default:
previous_was_whitespace = false;
base::WriteUnicodeCharacter(iter.get(), &ret);
break;
}
- iter.Advance();
}
// Trim off trailing whitespace if we left one.
- if (previous_was_whitespace)
+ if (previous_was_whitespace && !ret.empty())
ret.resize(ret.size() - 1);
- return ret;
+ // Remove diacritics and perform case folding.
+ return NormalizeForComparison(ret);
}
// static

Powered by Google App Engine
This is Rietveld 408576698