components/autofill/core/browser/autofill_data_util.cc - Issue 2132103002: Split CJK full names into name parts correctly.

Side by Side Diff: components/autofill/core/browser/autofill_data_util.cc

Issue 2132103002: Split CJK full names into name parts correctly. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/autofill/core/browser/autofill_data_util.h"	5 #include "components/autofill/core/browser/autofill_data_util.h"

6	6

7 #include <vector>	7 #include <vector>

8	8

9 #include "base/strings/string_split.h"	9 #include "base/strings/string_split.h"

10 #include "base/strings/string_util.h"	10 #include "base/strings/string_util.h"

11 #include "base/strings/utf_string_conversions.h"	11 #include "base/strings/utf_string_conversions.h"

12 #include "components/autofill/core/browser/field_types.h"	12 #include "components/autofill/core/browser/field_types.h"

	13 #include "third_party/icu/source/common/unicode/uscript.h"

13	14

14 namespace autofill {	15 namespace autofill {

15 namespace data_util {	16 namespace data_util {

16	17

17 namespace {	18 namespace {

18 const char* const name_prefixes[] = {	19 const char* const name_prefixes[] = {

19 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",	20 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",

20 "captain", "col", "cpt", "dr", "gen", "general", "lcdr",	21 "captain", "col", "cpt", "dr", "gen", "general", "lcdr",

21 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",	22 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",

22 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",	23 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",

23 "rev", "sen", "st"};	24 "rev", "sen", "st"};

24	25

25 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",	26 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",

26 "iii", "iv", "ix", "jr", "m.a", "m.d",	27 "iii", "iv", "ix", "jr", "m.a", "m.d",

27 "ma", "md", "ms", "ph.d", "phd", "sr",	28 "ma", "md", "ms", "ph.d", "phd", "sr",

28 "v", "vi", "vii", "viii", "x"};	29 "v", "vi", "vii", "viii", "x"};

29	30

30 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",	31 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",

31 "la", "le", "mc", "san", "st",	32 "la", "le", "mc", "san", "st",

32 "ter", "van", "von"};	33 "ter", "van", "von"};

33	34

	35 // CJK surnames (last names) that have more than one character.

	36 const char* cjk_multi_char_surnames[] = {

	37 // Korean, taken from the list of registered surnames:

	38 // https://namu.wiki/w/%ED%95%9C%EA%B5%AD%EC%9D%98%20%EC%84%B1%EC%94%A8#s-6

	39 "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",

	40 "소봉", "어금", "장곡", "제갈", "황목", "황보",

	41

	42 // Chinese, taken from the top 10 Chinese surnames:

	43 // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93

	44 // Simplified Chinese (mostly mainland China)

	45 "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",

	46 // Traditional Chinese (mostly Taiwan)

	47 "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"

	48 };

	49

34 // Returns true if \|set\| contains \|element\|, modulo a final period.	50 // Returns true if \|set\| contains \|element\|, modulo a final period.

35 bool ContainsString(const char* const set[],	51 bool ContainsString(const char* const set[],

36 size_t set_size,	52 size_t set_size,

37 const base::string16& element) {	53 const base::string16& element) {

38 if (!base::IsStringASCII(element))	54 if (!base::IsStringASCII(element))

39 return false;	55 return false;

40	56

41 base::string16 trimmed_element;	57 base::string16 trimmed_element;

42 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element);	58 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element);

43	59

(...skipping 23 matching lines...) Expand all Loading...
67 void StripSuffixes(std::vector<base::string16>* name_tokens) {	83 void StripSuffixes(std::vector<base::string16>* name_tokens) {

68 while (!name_tokens->empty()) {	84 while (!name_tokens->empty()) {

69 if (!ContainsString(name_suffixes, arraysize(name_suffixes),	85 if (!ContainsString(name_suffixes, arraysize(name_suffixes),

70 name_tokens->back())) {	86 name_tokens->back())) {

71 break;	87 break;

72 }	88 }

73 name_tokens->pop_back();	89 name_tokens->pop_back();

74 }	90 }

75 }	91 }

76	92

	93 // Returns true if \|c\| is a CJK (Chinese, Japanese, Korean) character, for any

	94 // of the CJK alphabets.

	95 bool IsCJK(base::char16 c) {

	96 static const std::set<UScriptCode> kCjkScripts {

	97 USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)

	98 USCRIPT_HANGUL, // Korean alphabet

	99 USCRIPT_KATAKANA, // A Japanese syllabary

	100 USCRIPT_HIRAGANA, // A Japanese syllabary

	101 USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used

	102 };

	103 UErrorCode error = U_ZERO_ERROR;

	104 UScriptCode script = uscript_getScript(c, &error);

	105 return kCjkScripts.find(script) != kCjkScripts.end();

	106 }

	107

	108 // Returns true if \|name\| looks like a CJK name (or some kind of mish-mash of

	109 // the three, at least).

	110 //

	111 // Chinese and Japanese names are usually spelled out using the Han characters

	112 // (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,

	113 // also referred to as Unihan. Korean names are usually spelled out in the

	114 // Korean alphabet (Hangul), although they do have a Han equivalent as well.

	115 bool IsCJKName(const base::string16& name) {

	116 for (base::char16 c : name) {

	117 if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {

	118 return false;

	119 }

	120 }

	121 return true;

	122 }

	123

	124 // Returns true if \|c\| is a Korean Hangul character.

	125 bool IsHangul(base::char16 c) {

	126 UErrorCode error = U_ZERO_ERROR;

	127 return uscript_getScript(c, &error) == USCRIPT_HANGUL;

	128 }

	129

	130 // Returns true if \|name\| looks like a Korean name, in Hangul characters.
	Mathieu 2016/07/11 13:49:15: "in Hangul characters" -> "made up entirely of Hangul characters or spaces"? nicolaso 2016/07/11 15:28:54: On 2016/07/11 13:49:15, Mathieu Perreault wrote: > "in Hangul characters" -> "made up entirely of Hangul characters or spaces"? Done.
	131 bool IsHangulName(const base::string16& name) {

	132 for (base::char16 c : name) {

	133 if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {

	134 return false;

	135 }

	136 }

	137 return true;

	138 }

	139

	140 // Tries to split a Chinese, Japanese, or Korean name into its given name &

	141 // surname parts, and puts the result in \|parts\|. If splitting did not work for

	142 // whatever reason, returns false.

	143 bool SplitCJKName(const std::vector<base::string16>& name_tokens,

	144 NameParts* parts) {

	145 // The convention for CJK languages is to put the surname (last name) first,

	146 // and the given name (first name) second. In a continuous text, there is

	147 // normally no space between the two parts of the name. When entering their

	148 // name into a field, though, some people add a space to disambiguate. CJK

	149 // names (almost) never have a middle name.

	150 if (name_tokens.size() == 1) {

	151 // FIXME(89111): Japanese names with no space will be mis-split, since we
	Mathieu 2016/07/11 13:49:15: TODO(crbug.com/89111) nicolaso 2016/07/11 15:28:54: On 2016/07/11 13:49:15, Mathieu Perreault wrote: > TODO(crbug.com/89111) Done.
	152 // don't have a list of Japanese last names. In the Han alphabet, it might

	153 // also be difficult for us to differentiate between Chinese & Japanese

	154 // names.

	155 //

	156 // There is no space between the surname and given name. Try to infer where

	157 // to separate between the two. Most Chinese and Korean surnames have only

	158 // one character, but there are a few that have 2. If the name does not

	159 // start with a surname from a known list, default to 1 character.

	160 const base::string16& name = name_tokens.front();

	161 const bool is_korean = IsHangulName(name);

	162 // Korean full names always have at least 3 characters. So, if there are

	163 // less than three, it's only a given name.

	164 if (is_korean && name.size() < 3) {

	165 parts->given = name;

	166 } else {

	167 size_t surname_length = 1;

	168 if (is_korean && name.size() == 4) {

	169 // 4-character Korean full names default to a 2-character surname. It's

	170 // definitely a 2/2 split.

	171 surname_length = 2;

	172 } else {

	173 base::string16 surname;

	174 // Try to find the surname in \|cjk_multi_char_surnames\|.

	175 for (size_t i = 0; i < arraysize(cjk_multi_char_surnames); i++) {

	176 surname.clear();

	177 base::UTF8ToUTF16(cjk_multi_char_surnames[i],

	178 strlen(cjk_multi_char_surnames[i]),

	179 &surname);

	180 if (base::StartsWith(name, surname, base::CompareCase::SENSITIVE)) {

	181 surname_length = surname.size();

	182 break;

	183 }

	184 }

	185 }

	186 parts->family = name.substr(0, surname_length);

	187 parts->given = name.substr(surname_length);

	188 }

	189 return true;

	190 } else if (name_tokens.size() == 2) {

	191 // The user entered a space between the two name parts. This makes our job

	192 // easier. Family name first, given name second.

	193 parts->family = name_tokens[0];

	194 parts->given = name_tokens[1];

	195 return true;

	196 }

	197 // We don't know what to do if there are more than 2 tokens.

	198 return false;

	199 }

	200

77 } // namespace	201 } // namespace

78	202

79 NameParts SplitName(const base::string16& name) {	203 NameParts SplitName(const base::string16& name) {

80 std::vector<base::string16> name_tokens =	204 std::vector<base::string16> name_tokens =

81 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE,	205 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE,

82 base::SPLIT_WANT_NONEMPTY);	206 base::SPLIT_WANT_NONEMPTY);

83 StripPrefixes(&name_tokens);	207 StripPrefixes(&name_tokens);

84	208

	209 NameParts parts;

	210

	211 // TODO(89111): Hungarian, Tamil, Telugu, and Vietnamese also have the
	Mathieu 2016/07/11 13:49:15: format is TODO(crbug.com/89111) nicolaso 2016/07/11 15:28:54: On 2016/07/11 13:49:15, Mathieu Perreault wrote: > format is TODO(crbug.com/89111) Done.
	212 // given name before the surname, and should be treated as special cases too.

	213

	214 // Treat CJK names differently.

	215 if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {
	Mathieu 2016/07/11 13:49:15: Are there cases where a name is made up of CJK characters of different scripts? If not, I'm thinking IsCJKName could instead return the script, which could be used in SplitCJKName (would avoid going over the name twice). Let me know if such a thing could be possible, thanks nicolaso 2016/07/11 15:28:54: On 2016/07/11 13:49:15, Mathieu Perreault wrote: > Are there cases where a name is made up of CJK characters of different scripts? > If not, I'm thinking IsCJKName could instead return the script, which could be > used in SplitCJKName (would avoid going over the name twice). > > Let me know if such a thing could be possible, thanks Off the top of my head, the only one I can think of, is someone whose name is like "Bruce Takeshi". Japanese uses one script for Japanese names, and another for foreign names. "Bruce" and "Takeshi" could be in different scripts. As far as performance is concerned (since we re-traverse the name twice, with `IsHangulName()'), I don't think we need to worry too much. Most CJK names are made up of about 3 characters. A quick benchmark also says `uscript_getScript(c)' is even faster than looking up `c' in an `std::unordered_map', meaning that `IsCJK(c)' and `IsHangul(c)' are pretty fast as well.
	216 return parts;

	217 }

	218

85 // Don't assume "Ma" is a suffix in John Ma.	219 // Don't assume "Ma" is a suffix in John Ma.

86 if (name_tokens.size() > 2)	220 if (name_tokens.size() > 2)

87 StripSuffixes(&name_tokens);	221 StripSuffixes(&name_tokens);

88	222

89 NameParts parts;

90

91 if (name_tokens.empty()) {	223 if (name_tokens.empty()) {

92 // Bad things have happened; just assume the whole thing is a given name.	224 // Bad things have happened; just assume the whole thing is a given name.

93 parts.given = name;	225 parts.given = name;

94 return parts;	226 return parts;

95 }	227 }

96	228

97 // Only one token, assume given name.	229 // Only one token, assume given name.

98 if (name_tokens.size() == 1) {	230 if (name_tokens.size() == 1) {

99 parts.given = name_tokens[0];	231 parts.given = name_tokens[0];

100 return parts;	232 return parts;

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
163 profile.GetRawInfo(autofill::NAME_LAST);	295 profile.GetRawInfo(autofill::NAME_LAST);

164 if (!full_name.compare(candidate)) {	296 if (!full_name.compare(candidate)) {

165 return true;	297 return true;

166 }	298 }

167	299

168 return false;	300 return false;

169 }	301 }

170	302

171 } // namespace data_util	303 } // namespace data_util

172 } // namespace autofill	304 } // namespace autofill

OLD	NEW

« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | no next file with comments »