Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(109)

Unified Diff: components/autofill/core/browser/autofill_data_util.cc

Issue 2132103002: Split CJK full names into name parts correctly. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix build on Windows. Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/autofill/core/browser/autofill_data_util.cc
diff --git a/components/autofill/core/browser/autofill_data_util.cc b/components/autofill/core/browser/autofill_data_util.cc
index 7387ed737ee06a4c2928a891cda050d0f829e724..d8b081208d0cb458d9eb9cea239d07f066d969e6 100644
--- a/components/autofill/core/browser/autofill_data_util.cc
+++ b/components/autofill/core/browser/autofill_data_util.cc
@@ -4,12 +4,14 @@
#include "components/autofill/core/browser/autofill_data_util.h"
+#include <algorithm>
#include <vector>
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/core/browser/field_types.h"
+#include "third_party/icu/source/common/unicode/uscript.h"
namespace autofill {
namespace data_util {
@@ -31,6 +33,28 @@ const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",
"la", "le", "mc", "san", "st",
"ter", "van", "von"};
+// The common and non-ambiguous CJK surnames (last names) that have more than
+// one character.
+const char* common_cjk_multi_char_surnames[] = {
+ // Korean, taken from the list of surnames:
+ // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
+ "남궁", "사공", "서문", "선우", "제갈", "황보", "독고", "망절",
+
+ // Chinese, taken from the top 10 Chinese 2-character surnames:
+ // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
+ // Simplified Chinese (mostly mainland China)
+ "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",
+ // Traditional Chinese (mostly Taiwan)
+ "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"
+};
+
+// All Korean surnames that have more than one character, even the
+// rare/ambiguous ones.
+const char* korean_multi_char_surnames[] = {
+ "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",
+ "소봉", "어금", "장곡", "제갈", "황목", "황보"
+};
+
// Returns true if |set| contains |element|, modulo a final period.
bool ContainsString(const char* const set[],
size_t set_size,
@@ -74,6 +98,127 @@ void StripSuffixes(std::vector<base::string16>* name_tokens) {
}
}
+// Find whether |name| starts with any of the strings from the array
+// |prefixes|. The returned value is the length of the prefix found, or 0 if
+// none is found.
+size_t StartsWithAny(base::StringPiece16 name, const char** prefixes,
+ size_t prefix_count) {
+ base::string16 buffer;
+ for (size_t i = 0; i < prefix_count; i++) {
+ buffer.clear();
+ base::UTF8ToUTF16(prefixes[i], strlen(prefixes[i]), &buffer);
+ if (base::StartsWith(name, buffer, base::CompareCase::SENSITIVE)) {
+ return buffer.size();
+ }
+ }
+ return 0;
+}
+
+// Returns true if |c| is a CJK (Chinese, Japanese, Korean) character, for any
+// of the CJK alphabets.
+bool IsCJK(base::char16 c) {
+ static const std::set<UScriptCode> kCjkScripts {
+ USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)
+ USCRIPT_HANGUL, // Korean alphabet
+ USCRIPT_KATAKANA, // A Japanese syllabary
+ USCRIPT_HIRAGANA, // A Japanese syllabary
+ USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used
+ };
+ UErrorCode error = U_ZERO_ERROR;
+ UScriptCode script = uscript_getScript(c, &error);
+ return kCjkScripts.find(script) != kCjkScripts.end();
+}
+
+// Returns true if |name| looks like a CJK name (or some kind of mish-mash of
+// the three, at least). The name is considered to be a CJK name if it is only
+// CJK characters or spaces.
+//
+// Chinese and Japanese names are usually spelled out using the Han characters
+// (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,
+// also referred to as Unihan. Korean names are usually spelled out in the
+// Korean alphabet (Hangul), although they do have a Han equivalent as well.
+bool IsCJKName(const base::string16& name) {
+ for (base::char16 c : name) {
+ if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Returns true if |c| is a Korean Hangul character.
+bool IsHangul(base::char16 c) {
+ UErrorCode error = U_ZERO_ERROR;
+ return uscript_getScript(c, &error) == USCRIPT_HANGUL;
+}
+
+// Returns true if |name| looks like a Korean name, made up entirely of Hangul
+// characters or spaces.
+bool IsHangulName(const base::string16& name) {
+ for (base::char16 c : name) {
+ if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Tries to split a Chinese, Japanese, or Korean name into its given name &
+// surname parts, and puts the result in |parts|. If splitting did not work for
+// whatever reason, returns false.
+bool SplitCJKName(const std::vector<base::string16>& name_tokens,
+ NameParts* parts) {
+ // The convention for CJK languages is to put the surname (last name) first,
+ // and the given name (first name) second. In a continuous text, there is
+ // normally no space between the two parts of the name. When entering their
+ // name into a field, though, some people add a space to disambiguate. CJK
+ // names (almost) never have a middle name.
+ //
+ // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana,
+ // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with
+ // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates)
+ if (name_tokens.size() == 1) {
+ // There is no space between the surname and given name. Try to infer where
+ // to separate between the two. Most Chinese and Korean surnames have only
+ // one character, but there are a few that have 2. If the name does not
+ // start with a surname from a known list, default to 1 character.
+ //
+ // TODO(crbug.com/89111): Japanese names with no space will be mis-split,
+ // since we don't have a list of Japanese last names. In the Han alphabet,
+ // it might also be difficult for us to differentiate between Chinese &
+ // Japanese names.
+ const base::string16& name = name_tokens.front();
+ const bool is_korean = IsHangulName(name);
+ size_t surname_length = 0;
+ if (is_korean && name.size() > 3) {
+ // 4-character Korean names are more likely to be 2/2 than 1/3, so use
+ // the full list of Korean 2-char surnames. (instead of only the common
+ // ones)
+ surname_length = std::max<size_t>(
+ 1, StartsWithAny(name, korean_multi_char_surnames,
+ arraysize(korean_multi_char_surnames)));
+ } else {
+ // Default to 1 character if the surname is not in
+ // |common_cjk_multi_char_surnames|.
+ surname_length = std::max<size_t>(
+ 1, StartsWithAny(name, common_cjk_multi_char_surnames,
+ arraysize(common_cjk_multi_char_surnames)));
+ }
+ parts->family = name.substr(0, surname_length);
+ parts->given = name.substr(surname_length);
+ return true;
+ }
+ if (name_tokens.size() == 2) {
+ // The user entered a space between the two name parts. This makes our job
+ // easier. Family name first, given name second.
+ parts->family = name_tokens[0];
+ parts->given = name_tokens[1];
+ return true;
+ }
+ // We don't know what to do if there are more than 2 tokens.
+ return false;
+}
+
} // namespace
NameParts SplitName(const base::string16& name) {
@@ -82,12 +227,21 @@ NameParts SplitName(const base::string16& name) {
base::SPLIT_WANT_NONEMPTY);
StripPrefixes(&name_tokens);
+ NameParts parts;
+
+ // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have
+ // the given name before the surname, and should be treated as special cases
+ // too.
+
+ // Treat CJK names differently.
+ if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {
+ return parts;
+ }
+
// Don't assume "Ma" is a suffix in John Ma.
if (name_tokens.size() > 2)
StripSuffixes(&name_tokens);
- NameParts parts;
-
if (name_tokens.empty()) {
// Bad things have happened; just assume the whole thing is a given name.
parts.given = name;
« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698