components/autofill/core/browser/autofill_data_util.cc - Issue 2132103002: Split CJK full names into name parts correctly.

Unified Diff: components/autofill/core/browser/autofill_data_util.cc

Issue 2132103002: Split CJK full names into name parts correctly. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Move a TODO. Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: components/autofill/core/browser/autofill_data_util.cc

diff --git a/components/autofill/core/browser/autofill_data_util.cc b/components/autofill/core/browser/autofill_data_util.cc

index 7387ed737ee06a4c2928a891cda050d0f829e724..7f4d0b2d576793251471a04f0729838d27d0f127 100644

--- a/components/autofill/core/browser/autofill_data_util.cc

+++ b/components/autofill/core/browser/autofill_data_util.cc

@@ -10,6 +10,7 @@

#include "base/strings/string_util.h"

#include "base/strings/utf_string_conversions.h"

#include "components/autofill/core/browser/field_types.h"

+#include "third_party/icu/source/common/unicode/uscript.h"

namespace autofill {

namespace data_util {

@@ -31,6 +32,21 @@ const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",

"la", "le", "mc", "san", "st",

"ter", "van", "von"};

+// CJK surnames (last names) that have more than one character.

+const char* cjk_multi_char_surnames[] = {

+ // Korean, taken from the list of registered surnames:

+ // https://namu.wiki/w/%ED%95%9C%EA%B5%AD%EC%9D%98%20%EC%84%B1%EC%94%A8#s-6

Jinsuk Kim 2016/07/12 22:50:48 Consider changing the reference to the article in

nicolaso 2016/07/13 16:31:06 Done.

+ "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",

+ "소봉", "어금", "장곡", "제갈", "황목", "황보",

+ // Chinese, taken from the top 10 Chinese compound surnames:

+ // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93

+ // Simplified Chinese (mostly mainland China)

+ "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",

+ // Traditional Chinese (mostly Taiwan)

+ "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"

+};

// Returns true if |set| contains |element|, modulo a final period.

bool ContainsString(const char* const set[],

size_t set_size,

@@ -74,20 +90,143 @@ void StripSuffixes(std::vector<base::string16>* name_tokens) {

}

+// Returns true if |c| is a CJK (Chinese, Japanese, Korean) character, for any

+// of the CJK scripts.

+bool IsCJK(base::char16 c) {

+ static const std::set<UScriptCode> kCjkScripts {

+ USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)

+ USCRIPT_HANGUL, // Korean alphabet

+ USCRIPT_KATAKANA, // A Japanese syllabary

+ USCRIPT_HIRAGANA, // A Japanese syllabary

+ USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used

+ };

+ UErrorCode error = U_ZERO_ERROR;

+ UScriptCode script = uscript_getScript(c, &error);

+ return kCjkScripts.find(script) != kCjkScripts.end();

+// Returns true if |name| looks like a CJK name (or some kind of mish-mash of

+// the three, at least). The name is considered to be a CJK name if it is only

+// CJK characters or spaces.

+//

+// Chinese and Japanese names are usually spelled out using the Han characters

+// (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,

+// also referred to as Unihan. Korean names are usually spelled out in the

+// Korean alphabet (Hangul), although they do have a Han equivalent as well.

+bool IsCJKName(const base::string16& name) {

+ for (base::char16 c : name) {

+ if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {

+ return false;

+ }

+ return true;

+// Returns true if |c| is a Korean Hangul character.

+bool IsHangul(base::char16 c) {

+ UErrorCode error = U_ZERO_ERROR;

+ return uscript_getScript(c, &error) == USCRIPT_HANGUL;

+// Returns true if |name| looks like a Korean name, made up entirely of Hangul

+// characters or spaces.

+bool IsHangulName(const base::string16& name) {

+ for (base::char16 c : name) {

+ if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {

+ return false;

+ }

+ return true;

+// Tries to split a Chinese, Japanese, or Korean name into its given name &

+// surname parts, and puts the result in |parts|. If splitting did not work for

+// whatever reason, returns false.

+bool SplitCJKName(const std::vector<base::string16>& name_tokens,

+ NameParts* parts) {

+ // The convention for CJK languages is to put the surname (last name) first,

+ // and the given name (first name) second. In a continuous text, there is

+ // normally no space between the two parts of the name. When entering their

+ // name into a field, though, some people add a space to disambiguate. CJK

+ // names (almost) never have a middle name.

+ //

+ // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana,

+ // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with

+ // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates)

+ if (name_tokens.size() == 1) {

+ // There is no space between the surname and given name. Try to infer where

+ // to separate between the two. Most Chinese and Korean surnames have only

+ // one character, but there are a few that have 2. If the name does not

+ // start with a surname from a known list, default to 1 character.

+ //

+ // TODO(crbug.com/89111): Japanese names with no space will be mis-split,

+ // since we don't have a list of Japanese last names. In the Han script,

+ // it might also be difficult for us to differentiate between Chinese &

+ // Japanese names.

+ const base::string16& name = name_tokens.front();

+ const bool is_korean = IsHangulName(name);

+ // Korean full names always have at least 3 characters. So, if there are

Jinsuk Kim 2016/07/12 22:50:48 This is not entirely true. Names with 1-char surna

nicolaso 2016/07/13 16:31:06 Should we check that the first character is a ver

Jinsuk Kim 2016/07/13 21:51:08 I wonder if it is safe to assume that people in mo

+ // fewer than three, it's only a given name.

+ if (is_korean && name.size() < 3) {

+ parts->given = name;

+ } else {

+ size_t surname_length = 1;

+ if (is_korean && name.size() == 4) {

+ // 4-character Korean full names default to a 2-character surname. It's

Jinsuk Kim 2016/07/12 22:50:48 You should still check if the first 2-character wo

nicolaso 2016/07/13 16:31:06 I used two separate lists, |common_cjk_multi_char_

Jinsuk Kim 2016/07/13 21:51:08 Acknowledged. Looks good.

+ // definitely a 2/2 split.

+ surname_length = 2;

+ } else {

+ base::string16 surname;

+ // Try to find the surname in |cjk_multi_char_surnames|.

+ for (size_t i = 0; i < arraysize(cjk_multi_char_surnames); i++) {

+ surname.clear();

+ base::UTF8ToUTF16(cjk_multi_char_surnames[i],

+ strlen(cjk_multi_char_surnames[i]),

+ &surname);

+ if (base::StartsWith(name, surname, base::CompareCase::SENSITIVE)) {

Jinsuk Kim 2016/07/12 22:50:48 This is more complicated because the name is more

nicolaso 2016/07/13 16:31:06 I renamed |cjk_multi_char_surnames| to |common_cjk

I renamed |cjk_multi_char_surnames| to |common_cjk_multi_char_surnames|, and removed the surnames you listed in A). Also added 강전희 as a test case.

Jinsuk Kim 2016/07/13 21:51:08 Looks good. How about adding a comment on the test

On 2016/07/13 16:31:06, nicolaso wrote: > On 2016/07/12 22:50:48, Jinsuk wrote: > > This is more complicated because the name is more likely to be 1/2 than 2/1 > > statistics wise. I suggest the input be categorized in following cases: > > > > A) 1/2: Checking the first character against the list of 1-char surname list > > will give much better result. A name like "강전희(Kang Jeon Hee)" (an old friend > of > > mine) has the first 2 letters that can be a surname in > |cjk_multi_char_surnames| > > but actually her surname is only 'Kang' which is one of the most common > surnames > > in Korea, and her given name is 'Jeon Hee'. Same goes for "동방", "소봉", "어금", > > "장곡", and "황목" whose first letter is a surname more common that whole 2-letter > > one. In fact I've never heard/met people with these 2-char surnames. They are > > quite obscure ones. > > https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1 > > shows that there are about 70 families of the name "어금". Likewise, There are > > only 70 동방, 50 소봉, 17 장곡, 12 강전 families, etc. > > > > B) 2/1: "남궁", "사공", "서문", "선우", "제갈" also have the first char that can be a > > surname but the whole 2-char one is much more likely to be a right surname. > > > > C) Easy decision 2/1: "독고", "망절" don't need such checking since their first > > letter cannot be a surname by itself. > > > > And I'm not really certain about "황보" - it can be either A) or B). I'd suggest > > B). > > > > Note that this is a subjective rule, but I don't think it's very biased. > > > > In conclusion, the surname to suggest for each input are: > > > > A) "강전", "동방", "소봉", "어금", "장곡", "황목" : first char only: i.e., "강", "동", "소", > > "어", "장", "황" > > B) "남궁", "사공", "서문", "선우", "제갈", "황보" : the whole 2 chars > > C) "독고", "망절" : ditto > > > > I renamed |cjk_multi_char_surnames| to |common_cjk_multi_char_surnames|, and > removed the surnames you listed in A). Also added 강전희 as a test case.

Looks good. How about adding a comment on the test case "강전희"? It tests a different flow in your logic. Something along the lines of "...choose 강 over 강전 since the former is much more common".

+ surname_length = surname.size();

+ break;

+ }

+ parts->family = name.substr(0, surname_length);

+ parts->given = name.substr(surname_length);

+ }

+ return true;

+ } else if (name_tokens.size() == 2) {

+ // The user entered a space between the two name parts. This makes our job

+ // easier. Family name first, given name second.

+ parts->family = name_tokens[0];

+ parts->given = name_tokens[1];

+ return true;

+ }

+ // We don't know what to do if there are more than 2 tokens.

+ return false;

} // namespace

NameParts SplitName(const base::string16& name) {

std::vector<base::string16> name_tokens =

- base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE,

+ base::SplitString(name, base::UTF8ToUTF16(" ,"), base::KEEP_WHITESPACE,

base::SPLIT_WANT_NONEMPTY);

StripPrefixes(&name_tokens);

+ NameParts parts;

+ // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have

+ // the surname before the given name, and should be treated as special cases

+ // too.

+ // Treat CJK names differently.

+ if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {

+ return parts;

+ }

// Don't assume "Ma" is a suffix in John Ma.

if (name_tokens.size() > 2)

StripSuffixes(&name_tokens);

- NameParts parts;

if (name_tokens.empty()) {

// Bad things have happened; just assume the whole thing is a given name.

parts.given = name;

« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | no next file with comments »