components/autofill/core/browser/autofill_data_util.cc - Issue 2132103002: Split CJK full names into name parts correctly.

Unified Diff: components/autofill/core/browser/autofill_data_util.cc

Issue 2132103002: Split CJK full names into name parts correctly. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Improve precision for splitting Korean names. Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | components/autofill/core/browser/autofill_data_util_unittest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: components/autofill/core/browser/autofill_data_util.cc

diff --git a/components/autofill/core/browser/autofill_data_util.cc b/components/autofill/core/browser/autofill_data_util.cc

index 7387ed737ee06a4c2928a891cda050d0f829e724..df3fbf0798aaa54fb0abe6ccf155cbfc6e3472b9 100644

--- a/components/autofill/core/browser/autofill_data_util.cc

+++ b/components/autofill/core/browser/autofill_data_util.cc

@@ -10,6 +10,7 @@

#include "base/strings/string_util.h"

#include "base/strings/utf_string_conversions.h"

#include "components/autofill/core/browser/field_types.h"

+#include "third_party/icu/source/common/unicode/uscript.h"

namespace autofill {

namespace data_util {

@@ -31,6 +32,26 @@ const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",

"la", "le", "mc", "san", "st",

"ter", "van", "von"};

+// The most common CJK surnames (last names) that have more than one character.

Jinsuk Kim 2016/07/13 21:51:09 In fact it contains two groups. How about updating

nicolaso 2016/07/14 18:12:43 Done.

+const char* common_cjk_multi_char_surnames[] = {

+ // Korean, taken from the list of surnames:

+ // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D

+ "남궁", "사공", "서문", "선우", "제갈", "황보", "독고", "망절",

+ // Chinese, taken from the top 10 Chinese 2-character surnames:

+ // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93

+ // Simplified Chinese (mostly mainland China)

+ "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",

+ // Traditional Chinese (mostly Taiwan)

+ "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"

+};

+// All Korean surnames that have more than one character, even the rare ones.

+const char* korean_multi_char_surnames[] = {

+ "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",

+ "소봉", "어금", "장곡", "제갈", "황목", "황보"

+};

// Returns true if |set| contains |element|, modulo a final period.

bool ContainsString(const char* const set[],

size_t set_size,

@@ -74,20 +95,156 @@ void StripSuffixes(std::vector<base::string16>* name_tokens) {

}

+// Find whether |name| starts with any of the strings from the array

+// |prefixes|. The returned value is the length of the prefix found, or 0 if

+// none is found.

+size_t StartsWithAny(base::StringPiece16 name, const char** prefixes,

+ size_t prefix_count) {

+ base::string16 buffer;

+ for (size_t i = 0; i < prefix_count; i++) {

+ buffer.clear();

+ base::UTF8ToUTF16(prefixes[i], strlen(prefixes[i]), &buffer);

+ if (base::StartsWith(name, buffer, base::CompareCase::SENSITIVE)) {

+ return buffer.size();

+ }

+ return 0;

+// Returns true if |c| is a CJK (Chinese, Japanese, Korean) character, for any

+// of the CJK scripts.

+bool IsCJK(base::char16 c) {

+ static const std::set<UScriptCode> kCjkScripts {

+ USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)

+ USCRIPT_HANGUL, // Korean alphabet

+ USCRIPT_KATAKANA, // A Japanese syllabary

+ USCRIPT_HIRAGANA, // A Japanese syllabary

+ USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used

+ };

+ UErrorCode error = U_ZERO_ERROR;

+ UScriptCode script = uscript_getScript(c, &error);

+ return kCjkScripts.find(script) != kCjkScripts.end();

+// Returns true if |name| looks like a CJK name (or some kind of mish-mash of

+// the three, at least). The name is considered to be a CJK name if it consists

+// only of CJK characters or spaces.

+//

+// Chinese and Japanese names are usually spelled out using the Han characters

+// (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,

+// also referred to as Unihan. Korean names are usually spelled out in the

+// Korean alphabet (Hangul), although they do have a Han equivalent as well.

+bool IsCJKName(const base::string16& name) {

+ for (base::char16 c : name) {

+ if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {

+ return false;

+ }

+ return true;

+// Returns true if |c| is a Korean Hangul character.

+bool IsHangul(base::char16 c) {

+ UErrorCode error = U_ZERO_ERROR;

+ return uscript_getScript(c, &error) == USCRIPT_HANGUL;

+// Returns true if |name| looks like a Korean name, made up entirely of Hangul

+// characters or spaces.

+bool IsHangulName(const base::string16& name) {

+ for (base::char16 c : name) {

+ if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {

+ return false;

+ }

+ return true;

+// Tries to split a Chinese, Japanese, or Korean name into its given name &

+// surname parts, and puts the result in |parts|. If splitting did not work for

+// whatever reason, returns false.

+bool SplitCJKName(const std::vector<base::string16>& name_tokens,

+ NameParts* parts) {

+ // The convention for CJK languages is to put the surname (last name) first,

+ // and the given name (first name) second. In a continuous text, there is

+ // normally no space between the two parts of the name. When entering their

+ // name into a field, though, some people add a space to disambiguate. CJK

+ // names (almost) never have a middle name.

+ //

+ // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana,

+ // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with

+ // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates)

+ if (name_tokens.size() == 1) {

+ // There is no space between the surname and given name. Try to infer where

+ // to separate between the two. Most Chinese and Korean surnames have only

+ // one character, but there are a few that have 2. If the name does not

+ // start with a surname from a known list, default to 1 character.

+ //

+ // TODO(crbug.com/89111): Japanese names with no space will be mis-split,

+ // since we don't have a list of Japanese last names. In the Han script,

+ // it might also be difficult for us to differentiate between Chinese &

+ // Japanese names.

+ const base::string16& name = name_tokens.front();

+ const bool is_korean = IsHangulName(name);

+ // Korean full names always have at least 3 characters. So, if there are

+ // fewer than three, it's only a given name.

+ if (is_korean && name.size() < 3) {

+ parts->given = name;

+ } else {

+ size_t surname_length = 0;

+ if (is_korean && name.size() > 3) {

+ // 4-character Korean names are more likely to be 2/2 than 1/3, so use

+ // the full list of Korean 2-char surnames (instead of only the common

+ // ones).

+ surname_length = std::max<size_t>(

+ 1, StartsWithAny(name, korean_multi_char_surnames,

+ arraysize(korean_multi_char_surnames)));

+ }

+ else {

+ // Default to 1 character if the surname is not in

+ // |common_cjk_multi_char_surnames|.

+ surname_length = std::max<size_t>(

+ 1, StartsWithAny(name, common_cjk_multi_char_surnames,

+ arraysize(common_cjk_multi_char_surnames)));

+ }

+ parts->family = name.substr(0, surname_length);

+ parts->given = name.substr(surname_length);

+ }

+ return true;

+ } else if (name_tokens.size() == 2) {

gogerald1 2016/07/14 18:28:52 break this 'else if' into a separate 'if' statemen

nicolaso 2016/07/14 20:32:17 Done.

+ // The user entered a space between the two name parts. This makes our job

+ // easier. Family name first, given name second.

+ parts->family = name_tokens[0];

+ parts->given = name_tokens[1];

+ return true;

+ }

+ // We don't know what to do if there are more than 2 tokens.

+ return false;

} // namespace

NameParts SplitName(const base::string16& name) {

std::vector<base::string16> name_tokens =

- base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE,

+ base::SplitString(name, base::UTF8ToUTF16(" ,"), base::KEEP_WHITESPACE,

base::SPLIT_WANT_NONEMPTY);

StripPrefixes(&name_tokens);

+ NameParts parts;

+ // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have

+ // the surname before the given name, and should be treated as special cases

+ // too.

+ // Treat CJK names differently.

+ if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {

+ return parts;

+ }

// Don't assume "Ma" is a suffix in John Ma.

if (name_tokens.size() > 2)

StripSuffixes(&name_tokens);

- NameParts parts;

if (name_tokens.empty()) {

// Bad things have happened; just assume the whole thing is a given name.

parts.given = name;