Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(341)

Side by Side Diff: components/autofill/core/browser/autofill_data_util.cc

Issue 2132103002: Split CJK full names into name parts correctly. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Move a TODO. Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/autofill/core/browser/autofill_data_util.h" 5 #include "components/autofill/core/browser/autofill_data_util.h"
6 6
7 #include <vector> 7 #include <vector>
8 8
9 #include "base/strings/string_split.h" 9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h" 10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h" 11 #include "base/strings/utf_string_conversions.h"
12 #include "components/autofill/core/browser/field_types.h" 12 #include "components/autofill/core/browser/field_types.h"
13 #include "third_party/icu/source/common/unicode/uscript.h"
13 14
14 namespace autofill { 15 namespace autofill {
15 namespace data_util { 16 namespace data_util {
16 17
17 namespace { 18 namespace {
18 const char* const name_prefixes[] = { 19 const char* const name_prefixes[] = {
19 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt", 20 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",
20 "captain", "col", "cpt", "dr", "gen", "general", "lcdr", 21 "captain", "col", "cpt", "dr", "gen", "general", "lcdr",
21 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg", 22 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",
22 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend", 23 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",
23 "rev", "sen", "st"}; 24 "rev", "sen", "st"};
24 25
25 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii", 26 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",
26 "iii", "iv", "ix", "jr", "m.a", "m.d", 27 "iii", "iv", "ix", "jr", "m.a", "m.d",
27 "ma", "md", "ms", "ph.d", "phd", "sr", 28 "ma", "md", "ms", "ph.d", "phd", "sr",
28 "v", "vi", "vii", "viii", "x"}; 29 "v", "vi", "vii", "viii", "x"};
29 30
30 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di", 31 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",
31 "la", "le", "mc", "san", "st", 32 "la", "le", "mc", "san", "st",
32 "ter", "van", "von"}; 33 "ter", "van", "von"};
33 34
35 // CJK surnames (last names) that have more than one character.
36 const char* cjk_multi_char_surnames[] = {
37 // Korean, taken from the list of registered surnames:
38 // https://namu.wiki/w/%ED%95%9C%EA%B5%AD%EC%9D%98%20%EC%84%B1%EC%94%A8#s-6
Jinsuk Kim 2016/07/12 22:50:48 Consider changing the reference to the article in
nicolaso 2016/07/13 16:31:06 Done.
39 "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",
40 "소봉", "어금", "장곡", "제갈", "황목", "황보",
41
42 // Chinese, taken from the top 10 Chinese compound surnames:
43 // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
44 // Simplified Chinese (mostly mainland China)
45 "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",
46 // Traditional Chinese (mostly Taiwan)
47 "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"
48 };
49
34 // Returns true if |set| contains |element|, modulo a final period. 50 // Returns true if |set| contains |element|, modulo a final period.
35 bool ContainsString(const char* const set[], 51 bool ContainsString(const char* const set[],
36 size_t set_size, 52 size_t set_size,
37 const base::string16& element) { 53 const base::string16& element) {
38 if (!base::IsStringASCII(element)) 54 if (!base::IsStringASCII(element))
39 return false; 55 return false;
40 56
41 base::string16 trimmed_element; 57 base::string16 trimmed_element;
42 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element); 58 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element);
43 59
(...skipping 23 matching lines...) Expand all
67 void StripSuffixes(std::vector<base::string16>* name_tokens) { 83 void StripSuffixes(std::vector<base::string16>* name_tokens) {
68 while (!name_tokens->empty()) { 84 while (!name_tokens->empty()) {
69 if (!ContainsString(name_suffixes, arraysize(name_suffixes), 85 if (!ContainsString(name_suffixes, arraysize(name_suffixes),
70 name_tokens->back())) { 86 name_tokens->back())) {
71 break; 87 break;
72 } 88 }
73 name_tokens->pop_back(); 89 name_tokens->pop_back();
74 } 90 }
75 } 91 }
76 92
93 // Returns true if |c| is a CJK (Chinese, Japanese, Korean) character, for any
94 // of the CJK scripts.
95 bool IsCJK(base::char16 c) {
96 static const std::set<UScriptCode> kCjkScripts {
97 USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)
98 USCRIPT_HANGUL, // Korean alphabet
99 USCRIPT_KATAKANA, // A Japanese syllabary
100 USCRIPT_HIRAGANA, // A Japanese syllabary
101 USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used
102 };
103 UErrorCode error = U_ZERO_ERROR;
104 UScriptCode script = uscript_getScript(c, &error);
105 return kCjkScripts.find(script) != kCjkScripts.end();
106 }
107
108 // Returns true if |name| looks like a CJK name (or some kind of mish-mash of
109 // the three, at least). The name is considered to be a CJK name if it is only
110 // CJK characters or spaces.
111 //
112 // Chinese and Japanese names are usually spelled out using the Han characters
113 // (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,
114 // also referred to as Unihan. Korean names are usually spelled out in the
115 // Korean alphabet (Hangul), although they do have a Han equivalent as well.
116 bool IsCJKName(const base::string16& name) {
117 for (base::char16 c : name) {
118 if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {
119 return false;
120 }
121 }
122 return true;
123 }
124
125 // Returns true if |c| is a Korean Hangul character.
126 bool IsHangul(base::char16 c) {
127 UErrorCode error = U_ZERO_ERROR;
128 return uscript_getScript(c, &error) == USCRIPT_HANGUL;
129 }
130
131 // Returns true if |name| looks like a Korean name, made up entirely of Hangul
132 // characters or spaces.
133 bool IsHangulName(const base::string16& name) {
134 for (base::char16 c : name) {
135 if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {
136 return false;
137 }
138 }
139 return true;
140 }
141
142 // Tries to split a Chinese, Japanese, or Korean name into its given name &
143 // surname parts, and puts the result in |parts|. If splitting did not work for
144 // whatever reason, returns false.
145 bool SplitCJKName(const std::vector<base::string16>& name_tokens,
146 NameParts* parts) {
147 // The convention for CJK languages is to put the surname (last name) first,
148 // and the given name (first name) second. In a continuous text, there is
149 // normally no space between the two parts of the name. When entering their
150 // name into a field, though, some people add a space to disambiguate. CJK
151 // names (almost) never have a middle name.
152 //
153 // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana,
154 // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with
155 // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates)
156 if (name_tokens.size() == 1) {
157 // There is no space between the surname and given name. Try to infer where
158 // to separate between the two. Most Chinese and Korean surnames have only
159 // one character, but there are a few that have 2. If the name does not
160 // start with a surname from a known list, default to 1 character.
161 //
162 // TODO(crbug.com/89111): Japanese names with no space will be mis-split,
163 // since we don't have a list of Japanese last names. In the Han script,
164 // it might also be difficult for us to differentiate between Chinese &
165 // Japanese names.
166 const base::string16& name = name_tokens.front();
167 const bool is_korean = IsHangulName(name);
168 // Korean full names always have at least 3 characters. So, if there are
Jinsuk Kim 2016/07/12 22:50:48 This is not entirely true. Names with 1-char surna
nicolaso 2016/07/13 16:31:06 Should we check that the first character is a ver
Jinsuk Kim 2016/07/13 21:51:08 I wonder if it is safe to assume that people in mo
169 // fewer than three, it's only a given name.
170 if (is_korean && name.size() < 3) {
171 parts->given = name;
172 } else {
173 size_t surname_length = 1;
174 if (is_korean && name.size() == 4) {
175 // 4-character Korean full names default to a 2-character surname. It's
Jinsuk Kim 2016/07/12 22:50:48 You should still check if the first 2-character wo
nicolaso 2016/07/13 16:31:06 I used two separate lists, |common_cjk_multi_char_
Jinsuk Kim 2016/07/13 21:51:08 Acknowledged. Looks good.
176 // definitely a 2/2 split.
177 surname_length = 2;
178 } else {
179 base::string16 surname;
180 // Try to find the surname in |cjk_multi_char_surnames|.
181 for (size_t i = 0; i < arraysize(cjk_multi_char_surnames); i++) {
182 surname.clear();
183 base::UTF8ToUTF16(cjk_multi_char_surnames[i],
184 strlen(cjk_multi_char_surnames[i]),
185 &surname);
186 if (base::StartsWith(name, surname, base::CompareCase::SENSITIVE)) {
Jinsuk Kim 2016/07/12 22:50:48 This is more complicated because the name is more
nicolaso 2016/07/13 16:31:06 I renamed |cjk_multi_char_surnames| to |common_cjk
Jinsuk Kim 2016/07/13 21:51:08 Looks good. How about adding a comment on the test
187 surname_length = surname.size();
188 break;
189 }
190 }
191 }
192 parts->family = name.substr(0, surname_length);
193 parts->given = name.substr(surname_length);
194 }
195 return true;
196 } else if (name_tokens.size() == 2) {
197 // The user entered a space between the two name parts. This makes our job
198 // easier. Family name first, given name second.
199 parts->family = name_tokens[0];
200 parts->given = name_tokens[1];
201 return true;
202 }
203 // We don't know what to do if there are more than 2 tokens.
204 return false;
205 }
206
77 } // namespace 207 } // namespace
78 208
79 NameParts SplitName(const base::string16& name) { 209 NameParts SplitName(const base::string16& name) {
80 std::vector<base::string16> name_tokens = 210 std::vector<base::string16> name_tokens =
81 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE, 211 base::SplitString(name, base::UTF8ToUTF16(" ,"), base::KEEP_WHITESPACE,
82 base::SPLIT_WANT_NONEMPTY); 212 base::SPLIT_WANT_NONEMPTY);
83 StripPrefixes(&name_tokens); 213 StripPrefixes(&name_tokens);
84 214
215 NameParts parts;
216
217 // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have
218 // the surname before the given name, and should be treated as special cases
219 // too.
220
221 // Treat CJK names differently.
222 if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {
223 return parts;
224 }
225
85 // Don't assume "Ma" is a suffix in John Ma. 226 // Don't assume "Ma" is a suffix in John Ma.
86 if (name_tokens.size() > 2) 227 if (name_tokens.size() > 2)
87 StripSuffixes(&name_tokens); 228 StripSuffixes(&name_tokens);
88 229
89 NameParts parts;
90
91 if (name_tokens.empty()) { 230 if (name_tokens.empty()) {
92 // Bad things have happened; just assume the whole thing is a given name. 231 // Bad things have happened; just assume the whole thing is a given name.
93 parts.given = name; 232 parts.given = name;
94 return parts; 233 return parts;
95 } 234 }
96 235
97 // Only one token, assume given name. 236 // Only one token, assume given name.
98 if (name_tokens.size() == 1) { 237 if (name_tokens.size() == 1) {
99 parts.given = name_tokens[0]; 238 parts.given = name_tokens[0];
100 return parts; 239 return parts;
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
163 profile.GetRawInfo(autofill::NAME_LAST); 302 profile.GetRawInfo(autofill::NAME_LAST);
164 if (!full_name.compare(candidate)) { 303 if (!full_name.compare(candidate)) {
165 return true; 304 return true;
166 } 305 }
167 306
168 return false; 307 return false;
169 } 308 }
170 309
171 } // namespace data_util 310 } // namespace data_util
172 } // namespace autofill 311 } // namespace autofill
OLDNEW
« no previous file with comments | « no previous file | components/autofill/core/browser/autofill_data_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698