components/autofill/core/browser/autofill_data_util.cc - Issue 2132103002: Split CJK full names into name parts correctly.

Side by Side Diff: components/autofill/core/browser/autofill_data_util.cc

Issue 2132103002: Split CJK full names into name parts correctly. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Improve precision for splitting Korean names. Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/autofill/core/browser/autofill_data_util.h"	5 #include "components/autofill/core/browser/autofill_data_util.h"

6	6

7 #include <vector>	7 #include <vector>

8	8

9 #include "base/strings/string_split.h"	9 #include "base/strings/string_split.h"

10 #include "base/strings/string_util.h"	10 #include "base/strings/string_util.h"

11 #include "base/strings/utf_string_conversions.h"	11 #include "base/strings/utf_string_conversions.h"

12 #include "components/autofill/core/browser/field_types.h"	12 #include "components/autofill/core/browser/field_types.h"

	13 #include "third_party/icu/source/common/unicode/uscript.h"

13	14

14 namespace autofill {	15 namespace autofill {

15 namespace data_util {	16 namespace data_util {

16	17

17 namespace {	18 namespace {

18 const char* const name_prefixes[] = {	19 const char* const name_prefixes[] = {

19 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",	20 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",

20 "captain", "col", "cpt", "dr", "gen", "general", "lcdr",	21 "captain", "col", "cpt", "dr", "gen", "general", "lcdr",

21 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",	22 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",

22 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",	23 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",

23 "rev", "sen", "st"};	24 "rev", "sen", "st"};

24	25

25 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",	26 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",

26 "iii", "iv", "ix", "jr", "m.a", "m.d",	27 "iii", "iv", "ix", "jr", "m.a", "m.d",

27 "ma", "md", "ms", "ph.d", "phd", "sr",	28 "ma", "md", "ms", "ph.d", "phd", "sr",

28 "v", "vi", "vii", "viii", "x"};	29 "v", "vi", "vii", "viii", "x"};

29	30

30 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",	31 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",

31 "la", "le", "mc", "san", "st",	32 "la", "le", "mc", "san", "st",

32 "ter", "van", "von"};	33 "ter", "van", "von"};

33	34

	35 // The most common CJK surnames (last names) that have more than one character.
	Jinsuk Kim 2016/07/13 21:51:09 In fact it contains two groups. How about updating the comment "s/common/common or non-ambiguous/" so other people reading the code won't stop and stare at "독고"/"망절" nicolaso 2016/07/14 18:12:43 On 2016/07/13 21:51:09, Jinsuk wrote: > In fact it contains two groups. How about updating the comment "s/common/common > or non-ambiguous/" so other people reading the code won't stop and stare at > "독고"/"망절" Done.
	36 const char* common_cjk_multi_char_surnames[] = {

	37 // Korean, taken from the list of surnames:

	38 // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D

	39 "남궁", "사공", "서문", "선우", "제갈", "황보", "독고", "망절",

	40

	41 // Chinese, taken from the top 10 Chinese 2-character surnames:

	42 // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93

	43 // Simplified Chinese (mostly mainland China)

	44 "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",

	45 // Traditional Chinese (mostly Taiwan)

	46 "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"

	47 };

	48

	49 // All Korean surnames that have more than one character, even the rare ones.

	50 const char* korean_multi_char_surnames[] = {

	51 "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",

	52 "소봉", "어금", "장곡", "제갈", "황목", "황보"

	53 };

	54

34 // Returns true if |set| contains |element|, modulo a final period.	55 // Returns true if |set| contains |element|, modulo a final period.

35 bool ContainsString(const char* const set[],	56 bool ContainsString(const char* const set[],

36 size_t set_size,	57 size_t set_size,

37 const base::string16& element) {	58 const base::string16& element) {

38 if (!base::IsStringASCII(element))	59 if (!base::IsStringASCII(element))

39 return false;	60 return false;

40	61

41 base::string16 trimmed_element;	62 base::string16 trimmed_element;

42 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element);	63 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element);

43	64

(...skipping 23 matching lines...) Expand all Loading...
67 void StripSuffixes(std::vector<base::string16>* name_tokens) {	88 void StripSuffixes(std::vector<base::string16>* name_tokens) {

68 while (!name_tokens->empty()) {	89 while (!name_tokens->empty()) {

69 if (!ContainsString(name_suffixes, arraysize(name_suffixes),	90 if (!ContainsString(name_suffixes, arraysize(name_suffixes),

70 name_tokens->back())) {	91 name_tokens->back())) {

71 break;	92 break;

72 }	93 }

73 name_tokens->pop_back();	94 name_tokens->pop_back();

74 }	95 }

75 }	96 }

76	97

	98 // Find whether |name| starts with any of the strings from the array

	99 // |prefixes|. The returned value is the length of the prefix found, or 0 if

	100 // none is found.

	101 size_t StartsWithAny(base::StringPiece16 name, const char** prefixes,

	102 size_t prefix_count) {

	103 base::string16 buffer;

	104 for (size_t i = 0; i < prefix_count; i++) {

	105 buffer.clear();

	106 base::UTF8ToUTF16(prefixes[i], strlen(prefixes[i]), &buffer);

	107 if (base::StartsWith(name, buffer, base::CompareCase::SENSITIVE)) {

	108 return buffer.size();

	109 }

	110 }

	111 return 0;

	112 }

	113

	114 // Returns true if |c| is a CJK (Chinese, Japanese, Korean) character, for any

	115 // of the CJK alphabets.

	116 bool IsCJK(base::char16 c) {

	117 static const std::set<UScriptCode> kCjkScripts {

	118 USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)

	119 USCRIPT_HANGUL, // Korean alphabet

	120 USCRIPT_KATAKANA, // A Japanese syllabary

	121 USCRIPT_HIRAGANA, // A Japanese syllabary

	122 USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used

	123 };

	124 UErrorCode error = U_ZERO_ERROR;

	125 UScriptCode script = uscript_getScript(c, &error);

	126 return kCjkScripts.find(script) != kCjkScripts.end();

	127 }

	128

	129 // Returns true if |name| looks like a CJK name (or some kind of mish-mash of

	130 // the three, at least). The name is considered to be a CJK name if it is only

	131 // CJK characters or spaces.

	132 //

	133 // Chinese and Japanese names are usually spelled out using the Han characters

	134 // (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,

	135 // also referred to as Unihan. Korean names are usually spelled out in the

	136 // Korean alphabet (Hangul), although they do have a Han equivalent as well.

	137 bool IsCJKName(const base::string16& name) {

	138 for (base::char16 c : name) {

	139 if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {

	140 return false;

	141 }

	142 }

	143 return true;

	144 }

	145

	146 // Returns true if |c| is a Korean Hangul character.

	147 bool IsHangul(base::char16 c) {

	148 UErrorCode error = U_ZERO_ERROR;

	149 return uscript_getScript(c, &error) == USCRIPT_HANGUL;

	150 }

	151

	152 // Returns true if |name| looks like a Korean name, made up entirely of Hangul

	153 // characters or spaces.

	154 bool IsHangulName(const base::string16& name) {

	155 for (base::char16 c : name) {

	156 if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {

	157 return false;

	158 }

	159 }

	160 return true;

	161 }

	162

	163 // Tries to split a Chinese, Japanese, or Korean name into its given name &

	164 // surname parts, and puts the result in |parts|. If splitting did not work for

	165 // whatever reason, returns false.

	166 bool SplitCJKName(const std::vector<base::string16>& name_tokens,

	167 NameParts* parts) {

	168 // The convention for CJK languages is to put the surname (last name) first,

	169 // and the given name (first name) second. In a continuous text, there is

	170 // normally no space between the two parts of the name. When entering their

	171 // name into a field, though, some people add a space to disambiguate. CJK

	172 // names (almost) never have a middle name.

	173 //

	174 // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana,

	175 // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with

	176 // the western ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates)

	177 if (name_tokens.size() == 1) {

	178 // There is no space between the surname and given name. Try to infer where

	179 // to separate between the two. Most Chinese and Korean surnames have only

	180 // one character, but there are a few that have 2. If the name does not

	181 // start with a surname from a known list, default to 1 character.

	182 //

	183 // TODO(crbug.com/89111): Japanese names with no space will be mis-split,

	184 // since we don't have a list of Japanese last names. In the Han alphabet,

	185 // it might also be difficult for us to differentiate between Chinese &

	186 // Japanese names.

	187 const base::string16& name = name_tokens.front();

	188 const bool is_korean = IsHangulName(name);

	189 // Korean full names always have at least 3 characters. So, if there are

	190 // less than three, it's only a given name.

	191 if (is_korean && name.size() < 3) {

	192 parts->given = name;

	193 } else {

	194 size_t surname_length = 0;

	195 if (is_korean && name.size() > 3) {

	196 // 4-character Korean names are more likely to be 2/2 than 1/3, so use

	197 // the full list of Korean 2-char surnames. (instead of only the common

	198 // ones)

	199 surname_length = std::max<size_t>(

	200 1, StartsWithAny(name, korean_multi_char_surnames,

	201 arraysize(korean_multi_char_surnames)));

	202 }

	203 else {

	204 // Default to 1 character if the surname is not in

	205 // |common_cjk_multi_char_surnames|.

	206 surname_length = std::max<size_t>(

	207 1, StartsWithAny(name, common_cjk_multi_char_surnames,

	208 arraysize(common_cjk_multi_char_surnames)));

	209 }

	210 parts->family = name.substr(0, surname_length);

	211 parts->given = name.substr(surname_length);

	212 }

	213 return true;

	214 } else if (name_tokens.size() == 2) {
	gogerald1 2016/07/14 18:28:52 break this 'else if' into a separate 'if' statement since you have returned, nicolaso 2016/07/14 20:32:17 On 2016/07/14 18:28:52, gogerald1 wrote: > break this 'else if' into a separate 'if' statement since you have returned, Done.
	215 // The user entered a space between the two name parts. This makes our job

	216 // easier. Family name first, given name second.

	217 parts->family = name_tokens[0];

	218 parts->given = name_tokens[1];

	219 return true;

	220 }

	221 // We don't know what to do if there are more than 2 tokens.

	222 return false;

	223 }

	224

77 } // namespace	225 } // namespace

78	226

79 NameParts SplitName(const base::string16& name) {	227 NameParts SplitName(const base::string16& name) {

80 std::vector<base::string16> name_tokens =	228 std::vector<base::string16> name_tokens =

81 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE,	229 base::SplitString(name, base::UTF8ToUTF16(" ,"), base::KEEP_WHITESPACE,

82 base::SPLIT_WANT_NONEMPTY);	230 base::SPLIT_WANT_NONEMPTY);

83 StripPrefixes(&name_tokens);	231 StripPrefixes(&name_tokens);

84	232

	233 NameParts parts;

	234

	235 // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have

	236 // the given name before the surname, and should be treated as special cases

	237 // too.

	238

	239 // Treat CJK names differently.

	240 if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {

	241 return parts;

	242 }

	243

85 // Don't assume "Ma" is a suffix in John Ma.	244 // Don't assume "Ma" is a suffix in John Ma.

86 if (name_tokens.size() > 2)	245 if (name_tokens.size() > 2)

87 StripSuffixes(&name_tokens);	246 StripSuffixes(&name_tokens);

88	247

89 NameParts parts;

90

91 if (name_tokens.empty()) {	248 if (name_tokens.empty()) {

92 // Bad things have happened; just assume the whole thing is a given name.	249 // Bad things have happened; just assume the whole thing is a given name.

93 parts.given = name;	250 parts.given = name;

94 return parts;	251 return parts;

95 }	252 }

96	253

97 // Only one token, assume given name.	254 // Only one token, assume given name.

98 if (name_tokens.size() == 1) {	255 if (name_tokens.size() == 1) {

99 parts.given = name_tokens[0];	256 parts.given = name_tokens[0];

100 return parts;	257 return parts;

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
163 profile.GetRawInfo(autofill::NAME_LAST);	320 profile.GetRawInfo(autofill::NAME_LAST);

164 if (!full_name.compare(candidate)) {	321 if (!full_name.compare(candidate)) {

165 return true;	322 return true;

166 }	323 }

167	324

168 return false;	325 return false;

169 }	326 }

170	327

171 } // namespace data_util	328 } // namespace data_util

172 } // namespace autofill	329 } // namespace autofill

OLD	NEW