Chromium Code Reviews| Index: components/autofill/core/browser/autofill_data_util.cc |
| diff --git a/components/autofill/core/browser/autofill_data_util.cc b/components/autofill/core/browser/autofill_data_util.cc |
| index 7387ed737ee06a4c2928a891cda050d0f829e724..7f4d0b2d576793251471a04f0729838d27d0f127 100644 |
| --- a/components/autofill/core/browser/autofill_data_util.cc |
| +++ b/components/autofill/core/browser/autofill_data_util.cc |
| @@ -10,6 +10,7 @@ |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "components/autofill/core/browser/field_types.h" |
| +#include "third_party/icu/source/common/unicode/uscript.h" |
| namespace autofill { |
| namespace data_util { |
| @@ -31,6 +32,21 @@ const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di", |
| "la", "le", "mc", "san", "st", |
| "ter", "van", "von"}; |
| +// CJK surnames (last names) that have more than one character. Both the |
| +// pointers and the pointed-to strings are const, so the table is immutable. |
| +const char* const cjk_multi_char_surnames[] = { |
| +    // Korean, taken from the list of registered surnames: |
| +    // https://namu.wiki/w/%ED%95%9C%EA%B5%AD%EC%9D%98%20%EC%84%B1%EC%94%A8#s-6 |
|
Jinsuk Kim
2016/07/12 22:50:48
Consider changing the reference to the article in
nicolaso
2016/07/13 16:31:06
Done.
|
| +    "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우", |
| +    "소봉", "어금", "장곡", "제갈", "황목", "황보", |
| + |
| +    // Chinese, taken from the list of common compound (multi-character) |
| +    // surnames: |
| +    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93 |
| +    // Simplified Chinese (mostly mainland China) |
| +    "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木", |
| +    // Traditional Chinese (mostly Taiwan) |
| +    "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯" |
| +}; |
| + |
| // Returns true if |set| contains |element|, modulo a final period. |
| bool ContainsString(const char* const set[], |
| size_t set_size, |
| @@ -74,20 +90,143 @@ void StripSuffixes(std::vector<base::string16>* name_tokens) { |
| } |
| } |
| +// Returns true if |c| belongs to one of the scripts used to write Chinese, |
| +// Japanese or Korean: Han, Hangul, Katakana, Hiragana or Bopomofo. |
| +// |
| +// The script is looked up through ICU for the single UTF-16 code unit |c|. |
| +// NOTE(review): a lone half of a surrogate pair is presumably not reported as |
| +// one of these scripts, so supplementary-plane ideographs would not count as |
| +// CJK here -- confirm if such names need to be supported. |
| +bool IsCJK(base::char16 c) { |
| +  UErrorCode error = U_ZERO_ERROR; |
| +  // A plain switch keeps the membership test allocation-free and needs no |
| +  // static container (and therefore no static initializer or destructor). |
| +  switch (uscript_getScript(c, &error)) { |
| +    case USCRIPT_HAN:       // CJK logographs, used by all 3 (rarely Korean) |
| +    case USCRIPT_HANGUL:    // Korean alphabet |
| +    case USCRIPT_KATAKANA:  // A Japanese syllabary |
| +    case USCRIPT_HIRAGANA:  // A Japanese syllabary |
| +    case USCRIPT_BOPOMOFO:  // Chinese semisyllabary, rarely used |
| +      return true; |
| +    default: |
| +      return false; |
| +  } |
| +} |
| + |
| +// Decides whether |name| is written purely in CJK scripts. Every code unit |
| +// must either satisfy IsCJK() or be Unicode whitespace; a single other |
| +// character makes the whole name non-CJK. A name mixing the three languages |
| +// still qualifies, and an empty |name| trivially does too. |
| +// |
| +// Background: Chinese and Japanese names are normally written with Han |
| +// logographs (the "CJK Unified Ideographs" Unicode block, also known as |
| +// Unihan). Korean names are normally written in the Korean alphabet (Hangul), |
| +// although they have a Han equivalent as well. |
| +bool IsCJKName(const base::string16& name) { |
| +  for (size_t i = 0; i < name.size(); ++i) { |
| +    const base::char16 unit = name[i]; |
| +    if (IsCJK(unit) || base::IsUnicodeWhitespace(unit)) { |
| +      continue; |
| +    } |
| +    return false; |
| +  } |
| +  return true; |
| +} |
| + |
| +// Reports whether ICU classifies |c| under the Hangul (Korean) script. |
| +bool IsHangul(base::char16 c) { |
| +  UErrorCode status = U_ZERO_ERROR; |
| +  const UScriptCode script = uscript_getScript(c, &status); |
| +  return script == USCRIPT_HANGUL; |
| +} |
| + |
| +// Decides whether |name| looks Korean: every code unit must be Hangul or |
| +// Unicode whitespace. An empty |name| trivially passes. |
| +bool IsHangulName(const base::string16& name) { |
| +  for (auto it = name.begin(); it != name.end(); ++it) { |
| +    const bool acceptable = IsHangul(*it) || base::IsUnicodeWhitespace(*it); |
| +    if (!acceptable) { |
| +      return false; |
| +    } |
| +  } |
| +  return true; |
| +} |
| + |
| +// Splits a CJK name into |parts|: one token is split by surname heuristics, |
| +// two tokens are taken as family then given. Returns false, leaving |parts| |
| +// untouched, for any other token count (including zero). |
| +bool SplitCJKName(const std::vector<base::string16>& name_tokens, |
| + NameParts* parts) { |
| + // The convention for CJK languages is to put the surname (last name) first, |
| + // and the given name (first name) second. In a continuous text, there is |
| + // normally no space between the two parts of the name. When entering their |
| + // name into a field, though, some people add a space to disambiguate. CJK |
| + // names (almost) never have a middle name. |
| + // |
| + // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana, |
| + // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with |
| + // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates) |
| + if (name_tokens.size() == 1) { |
| + // There is no space between the surname and given name. Try to infer where |
| + // to separate between the two. Most Chinese and Korean surnames have only |
| + // one character, but there are a few that have 2. If the name does not |
| + // start with a surname from a known list, default to 1 character. |
| + // |
| + // TODO(crbug.com/89111): Japanese names with no space will be mis-split, |
| + // since we don't have a list of Japanese last names. In the Han alphabet, |
| + // it might also be difficult for us to differentiate between Chinese & |
| + // Japanese names. |
| + const base::string16& name = name_tokens.front(); |
| + const bool is_korean = IsHangulName(name); |
| + // Korean full names always have at least 3 characters. So, if there are |
|
Jinsuk Kim
2016/07/12 22:50:48
This is not entirely true. Names with 1-char surna
nicolaso
2016/07/13 16:31:06
Should we check that the first character is a ver
Jinsuk Kim
2016/07/13 21:51:08
I wonder if it is safe to assume that people in mo
|
| + // less than three, it's only a given name. |
| + if (is_korean && name.size() < 3) { |
| + parts->given = name; |
| + } else { |
| + size_t surname_length = 1; |
| + if (is_korean && name.size() == 4) { |
| + // 4-character Korean full names default to a 2-character surname. It's |
|
Jinsuk Kim
2016/07/12 22:50:48
You should still check if the first 2-character wo
nicolaso
2016/07/13 16:31:06
I used two separate lists, |common_cjk_multi_char_
Jinsuk Kim
2016/07/13 21:51:08
Acknowledged. Looks good.
|
| + // definitely a 2/2 split. |
| + surname_length = 2; |
| + } else { |
| + base::string16 surname; |
| + // Try to find the surname in |cjk_multi_char_surnames|. |
| + for (size_t i = 0; i < arraysize(cjk_multi_char_surnames); i++) { |
| + surname.clear(); |
| + base::UTF8ToUTF16(cjk_multi_char_surnames[i], |
| + strlen(cjk_multi_char_surnames[i]), |
| + &surname); |
| + if (base::StartsWith(name, surname, base::CompareCase::SENSITIVE)) { |
|
Jinsuk Kim
2016/07/12 22:50:48
This is more complicated because the name is more
nicolaso
2016/07/13 16:31:06
I renamed |cjk_multi_char_surnames| to |common_cjk
Jinsuk Kim
2016/07/13 21:51:08
Looks good. How about adding a comment on the test
|
| + surname_length = surname.size(); |
| + break; |
| + } |
| + } |
| + } |
| + parts->family = name.substr(0, surname_length); |
| + parts->given = name.substr(surname_length); |
| + } |
| + return true; |
| + } else if (name_tokens.size() == 2) { |
| + // The user entered a space between the two name parts. This makes our job |
| + // easier. Family name first, given name second. |
| + parts->family = name_tokens[0]; |
| + parts->given = name_tokens[1]; |
| + return true; |
| + } |
| + // Zero tokens or more than two: no rule applies here, so report failure. |
| + return false; |
| +} |
| + |
| } // namespace |
| NameParts SplitName(const base::string16& name) { |
| std::vector<base::string16> name_tokens = |
| - base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE, |
| + base::SplitString(name, base::UTF8ToUTF16(" ,"), base::KEEP_WHITESPACE, |
| base::SPLIT_WANT_NONEMPTY); |
| StripPrefixes(&name_tokens); |
| + NameParts parts; |
| + |
| + // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have |
| + // the given name before the surname, and should be treated as special cases |
| + // too. |
| + |
| + // Treat CJK names differently. |
| + if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) { |
| + return parts; |
| + } |
| + |
| // Don't assume "Ma" is a suffix in John Ma. |
| if (name_tokens.size() > 2) |
| StripSuffixes(&name_tokens); |
| - NameParts parts; |
| - |
| if (name_tokens.empty()) { |
| // Bad things have happened; just assume the whole thing is a given name. |
| parts.given = name; |