components/autofill/core/browser/autofill_data_util.cc - Issue 2132103002: Split CJK full names into name parts correctly.

Side by Side Diff: components/autofill/core/browser/autofill_data_util.cc

Issue 2132103002: Split CJK full names into name parts correctly. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fix build on Windows. Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/autofill/core/browser/autofill_data_util.h"	5 #include "components/autofill/core/browser/autofill_data_util.h"

6	6

	7 #include <algorithm>

7 #include <vector>	8 #include <vector>

8	9

9 #include "base/strings/string_split.h"	10 #include "base/strings/string_split.h"

10 #include "base/strings/string_util.h"	11 #include "base/strings/string_util.h"

11 #include "base/strings/utf_string_conversions.h"	12 #include "base/strings/utf_string_conversions.h"

12 #include "components/autofill/core/browser/field_types.h"	13 #include "components/autofill/core/browser/field_types.h"

	14 #include "third_party/icu/source/common/unicode/uscript.h"

13	15

14 namespace autofill {	16 namespace autofill {

15 namespace data_util {	17 namespace data_util {

16	18

17 namespace {	19 namespace {

18 const char* const name_prefixes[] = {	20 const char* const name_prefixes[] = {

19 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",	21 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",

20 "captain", "col", "cpt", "dr", "gen", "general", "lcdr",	22 "captain", "col", "cpt", "dr", "gen", "general", "lcdr",

21 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",	23 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",

22 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",	24 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",

23 "rev", "sen", "st"};	25 "rev", "sen", "st"};

24	26

25 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",	27 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",

26 "iii", "iv", "ix", "jr", "m.a", "m.d",	28 "iii", "iv", "ix", "jr", "m.a", "m.d",

27 "ma", "md", "ms", "ph.d", "phd", "sr",	29 "ma", "md", "ms", "ph.d", "phd", "sr",

28 "v", "vi", "vii", "viii", "x"};	30 "v", "vi", "vii", "viii", "x"};

29	31

30 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",	32 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",

31 "la", "le", "mc", "san", "st",	33 "la", "le", "mc", "san", "st",

32 "ter", "van", "von"};	34 "ter", "van", "von"};

33	35

	36 // The common and non-ambiguous CJK surnames (last names) that have more than

	37 // one character.

	38 const char* common_cjk_multi_char_surnames[] = {

	39 // Korean, taken from the list of surnames:

	40 // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D

	41 "남궁", "사공", "서문", "선우", "제갈", "황보", "독고", "망절",

	42

	43 // Chinese, taken from the top 10 Chinese 2-character surnames:

	44 // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93

	45 // Simplified Chinese (mostly mainland China)

	46 "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",

	47 // Traditional Chinese (mostly Taiwan)

	48 "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"

	49 };

	50

	51 // All Korean surnames that have more than one character, even the

	52 // rare/ambiguous ones.

	53 const char* korean_multi_char_surnames[] = {

	54 "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",

	55 "소봉", "어금", "장곡", "제갈", "황목", "황보"

	56 };

	57

34 // Returns true if \|set\| contains \|element\|, modulo a final period.	58 // Returns true if \|set\| contains \|element\|, modulo a final period.

35 bool ContainsString(const char* const set[],	59 bool ContainsString(const char* const set[],

36 size_t set_size,	60 size_t set_size,

37 const base::string16& element) {	61 const base::string16& element) {

38 if (!base::IsStringASCII(element))	62 if (!base::IsStringASCII(element))

39 return false;	63 return false;

40	64

41 base::string16 trimmed_element;	65 base::string16 trimmed_element;

42 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element);	66 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element);

43	67

(...skipping 23 matching lines...) Expand all Loading...
67 void StripSuffixes(std::vector<base::string16>* name_tokens) {	91 void StripSuffixes(std::vector<base::string16>* name_tokens) {

68 while (!name_tokens->empty()) {	92 while (!name_tokens->empty()) {

69 if (!ContainsString(name_suffixes, arraysize(name_suffixes),	93 if (!ContainsString(name_suffixes, arraysize(name_suffixes),

70 name_tokens->back())) {	94 name_tokens->back())) {

71 break;	95 break;

72 }	96 }

73 name_tokens->pop_back();	97 name_tokens->pop_back();

74 }	98 }

75 }	99 }

76	100

	101 // Find whether \|name\| starts with any of the strings from the array

	102 // \|prefixes\|. The returned value is the length of the prefix found, or 0 if

	103 // none is found.

	104 size_t StartsWithAny(base::StringPiece16 name, const char** prefixes,

	105 size_t prefix_count) {

	106 base::string16 buffer;

	107 for (size_t i = 0; i < prefix_count; i++) {

	108 buffer.clear();

	109 base::UTF8ToUTF16(prefixes[i], strlen(prefixes[i]), &buffer);

	110 if (base::StartsWith(name, buffer, base::CompareCase::SENSITIVE)) {

	111 return buffer.size();

	112 }

	113 }

	114 return 0;

	115 }

	116

	117 // Returns true if \|c\| is a CJK (Chinese, Japanese, Korean) character, for any

	118 // of the CJK alphabets.

	119 bool IsCJK(base::char16 c) {

	120 static const std::set<UScriptCode> kCjkScripts {

	121 USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)

	122 USCRIPT_HANGUL, // Korean alphabet

	123 USCRIPT_KATAKANA, // A Japanese syllabary

	124 USCRIPT_HIRAGANA, // A Japanese syllabary

	125 USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used

	126 };

	127 UErrorCode error = U_ZERO_ERROR;

	128 UScriptCode script = uscript_getScript(c, &error);

	129 return kCjkScripts.find(script) != kCjkScripts.end();

	130 }

	131

	132 // Returns true if \|name\| looks like a CJK name (or some kind of mish-mash of

	133 // the three, at least). The name is considered to be a CJK name if it is only

	134 // CJK characters or spaces.

	135 //

	136 // Chinese and Japanese names are usually spelled out using the Han characters

	137 // (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,

	138 // also referred to as Unihan. Korean names are usually spelled out in the

	139 // Korean alphabet (Hangul), although they do have a Han equivalent as well.

	140 bool IsCJKName(const base::string16& name) {

	141 for (base::char16 c : name) {

	142 if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {

	143 return false;

	144 }

	145 }

	146 return true;

	147 }

	148

	149 // Returns true if \|c\| is a Korean Hangul character.

	150 bool IsHangul(base::char16 c) {

	151 UErrorCode error = U_ZERO_ERROR;

	152 return uscript_getScript(c, &error) == USCRIPT_HANGUL;

	153 }

	154

	155 // Returns true if \|name\| looks like a Korean name, made up entirely of Hangul

	156 // characters or spaces.

	157 bool IsHangulName(const base::string16& name) {

	158 for (base::char16 c : name) {

	159 if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {

	160 return false;

	161 }

	162 }

	163 return true;

	164 }

	165

	166 // Tries to split a Chinese, Japanese, or Korean name into its given name &

	167 // surname parts, and puts the result in \|parts\|. If splitting did not work for

	168 // whatever reason, returns false.

	169 bool SplitCJKName(const std::vector<base::string16>& name_tokens,

	170 NameParts* parts) {

	171 // The convention for CJK languages is to put the surname (last name) first,

	172 // and the given name (first name) second. In a continuous text, there is

	173 // normally no space between the two parts of the name. When entering their

	174 // name into a field, though, some people add a space to disambiguate. CJK

	175 // names (almost) never have a middle name.

	176 //

	177 // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana,

	178 // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with

	179 // the western ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates)

	180 if (name_tokens.size() == 1) {

	181 // There is no space between the surname and given name. Try to infer where

	182 // to separate between the two. Most Chinese and Korean surnames have only

	183 // one character, but there are a few that have 2. If the name does not

	184 // start with a surname from a known list, default to 1 character.

	185 //

	186 // TODO(crbug.com/89111): Japanese names with no space will be mis-split,

	187 // since we don't have a list of Japanese last names. In the Han alphabet,

	188 // it might also be difficult for us to differentiate between Chinese &

	189 // Japanese names.

	190 const base::string16& name = name_tokens.front();

	191 const bool is_korean = IsHangulName(name);

	192 size_t surname_length = 0;

	193 if (is_korean && name.size() > 3) {

	194 // 4-character Korean names are more likely to be 2/2 than 1/3, so use

	195 // the full list of Korean 2-char surnames. (instead of only the common

	196 // ones)

	197 surname_length = std::max<size_t>(

	198 1, StartsWithAny(name, korean_multi_char_surnames,

	199 arraysize(korean_multi_char_surnames)));

	200 } else {

	201 // Default to 1 character if the surname is not in

	202 // \|common_cjk_multi_char_surnames\|.

	203 surname_length = std::max<size_t>(

	204 1, StartsWithAny(name, common_cjk_multi_char_surnames,

	205 arraysize(common_cjk_multi_char_surnames)));

	206 }

	207 parts->family = name.substr(0, surname_length);

	208 parts->given = name.substr(surname_length);

	209 return true;

	210 }

	211 if (name_tokens.size() == 2) {

	212 // The user entered a space between the two name parts. This makes our job

	213 // easier. Family name first, given name second.

	214 parts->family = name_tokens[0];

	215 parts->given = name_tokens[1];

	216 return true;

	217 }

	218 // We don't know what to do if there are more than 2 tokens.

	219 return false;

	220 }

	221

77 } // namespace	222 } // namespace

78	223

79 NameParts SplitName(const base::string16& name) {	224 NameParts SplitName(const base::string16& name) {

80 std::vector<base::string16> name_tokens =	225 std::vector<base::string16> name_tokens =

81 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE,	226 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE,

82 base::SPLIT_WANT_NONEMPTY);	227 base::SPLIT_WANT_NONEMPTY);

83 StripPrefixes(&name_tokens);	228 StripPrefixes(&name_tokens);

84	229

	230 NameParts parts;

	231

	232 // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have

	233 // the surname before the given name, and should be treated as special cases

	234 // too.

	235

	236 // Treat CJK names differently.

	237 if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {

	238 return parts;

	239 }

	240

85 // Don't assume "Ma" is a suffix in John Ma.	241 // Don't assume "Ma" is a suffix in John Ma.

86 if (name_tokens.size() > 2)	242 if (name_tokens.size() > 2)

87 StripSuffixes(&name_tokens);	243 StripSuffixes(&name_tokens);

88	244

89 NameParts parts;

90

91 if (name_tokens.empty()) {	245 if (name_tokens.empty()) {

92 // Bad things have happened; just assume the whole thing is a given name.	246 // Bad things have happened; just assume the whole thing is a given name.

93 parts.given = name;	247 parts.given = name;

94 return parts;	248 return parts;

95 }	249 }

96	250

97 // Only one token, assume given name.	251 // Only one token, assume given name.

98 if (name_tokens.size() == 1) {	252 if (name_tokens.size() == 1) {

99 parts.given = name_tokens[0];	253 parts.given = name_tokens[0];

100 return parts;	254 return parts;

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
163 profile.GetRawInfo(autofill::NAME_LAST);	317 profile.GetRawInfo(autofill::NAME_LAST);

164 if (!full_name.compare(candidate)) {	318 if (!full_name.compare(candidate)) {

165 return true;	319 return true;

166 }	320 }

167	321

168 return false;	322 return false;

169 }	323 }

170	324

171 } // namespace data_util	325 } // namespace data_util

172 } // namespace autofill	326 } // namespace autofill

OLD	NEW

« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | no next file with comments »