OLD | NEW |
---|---|
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/autofill/core/browser/autofill_data_util.h" | 5 #include "components/autofill/core/browser/autofill_data_util.h" |
6 | 6 |
7 #include <vector> | 7 #include <vector> |
8 | 8 |
9 #include "base/strings/string_split.h" | 9 #include "base/strings/string_split.h" |
10 #include "base/strings/string_util.h" | 10 #include "base/strings/string_util.h" |
11 #include "base/strings/utf_string_conversions.h" | 11 #include "base/strings/utf_string_conversions.h" |
12 #include "components/autofill/core/browser/field_types.h" | 12 #include "components/autofill/core/browser/field_types.h" |
13 #include "third_party/icu/source/common/unicode/uscript.h" | |
13 | 14 |
14 namespace autofill { | 15 namespace autofill { |
15 namespace data_util { | 16 namespace data_util { |
16 | 17 |
17 namespace { | 18 namespace { |
18 const char* const name_prefixes[] = { | 19 const char* const name_prefixes[] = { |
19 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt", | 20 "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt", |
20 "captain", "col", "cpt", "dr", "gen", "general", "lcdr", | 21 "captain", "col", "cpt", "dr", "gen", "general", "lcdr", |
21 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg", | 22 "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg", |
22 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend", | 23 "mr", "mrs", "ms", "pastor", "prof", "rep", "reverend", |
23 "rev", "sen", "st"}; | 24 "rev", "sen", "st"}; |
24 | 25 |
25 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii", | 26 const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii", |
26 "iii", "iv", "ix", "jr", "m.a", "m.d", | 27 "iii", "iv", "ix", "jr", "m.a", "m.d", |
27 "ma", "md", "ms", "ph.d", "phd", "sr", | 28 "ma", "md", "ms", "ph.d", "phd", "sr", |
28 "v", "vi", "vii", "viii", "x"}; | 29 "v", "vi", "vii", "viii", "x"}; |
29 | 30 |
30 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di", | 31 const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di", |
31 "la", "le", "mc", "san", "st", | 32 "la", "le", "mc", "san", "st", |
32 "ter", "van", "von"}; | 33 "ter", "van", "von"}; |
33 | 34 |
35 // CJK surnames (family names) that have more than one character. Each entry | |
// is converted with base::UTF8ToUTF16 and matched as a prefix of the full | |
// name in SplitCJKName(). | |
// NOTE(review): unlike name_prefixes, name_suffixes and family_name_prefixes | |
// above, this table is declared `const char*` rather than `const char* const`. | |
36 const char* cjk_multi_char_surnames[] = { | |
37 // Korean, taken from the list of registered surnames: | |
38 // https://namu.wiki/w/%ED%95%9C%EA%B5%AD%EC%9D%98%20%EC%84%B1%EC%94%A8#s-6 | |
Jinsuk Kim
2016/07/12 22:50:48
Consider changing the reference to the article in
nicolaso
2016/07/13 16:31:06
Done.
| |
39 "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우", | |
40 "소봉", "어금", "장곡", "제갈", "황목", "황보", | |
41 | |
42 // Chinese, taken from the list of common compound (multi-character) surnames: | |
43 // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93 | |
44 // Simplified Chinese (mostly mainland China) | |
45 "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木", | |
46 // Traditional Chinese (mostly Taiwan) | |
47 "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯" | |
48 }; | |
49 | |
34 // Returns true if |set| contains |element|, modulo a final period. | 50 // Returns true if |set| contains |element|, modulo a final period. |
35 bool ContainsString(const char* const set[], | 51 bool ContainsString(const char* const set[], |
36 size_t set_size, | 52 size_t set_size, |
37 const base::string16& element) { | 53 const base::string16& element) { |
38 if (!base::IsStringASCII(element)) | 54 if (!base::IsStringASCII(element)) |
39 return false; | 55 return false; |
40 | 56 |
41 base::string16 trimmed_element; | 57 base::string16 trimmed_element; |
42 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element); | 58 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element); |
43 | 59 |
(...skipping 23 matching lines...) Expand all Loading... | |
67 void StripSuffixes(std::vector<base::string16>* name_tokens) { | 83 void StripSuffixes(std::vector<base::string16>* name_tokens) { |
68 while (!name_tokens->empty()) { | 84 while (!name_tokens->empty()) { |
69 if (!ContainsString(name_suffixes, arraysize(name_suffixes), | 85 if (!ContainsString(name_suffixes, arraysize(name_suffixes), |
70 name_tokens->back())) { | 86 name_tokens->back())) { |
71 break; | 87 break; |
72 } | 88 } |
73 name_tokens->pop_back(); | 89 name_tokens->pop_back(); |
74 } | 90 } |
75 } | 91 } |
76 | 92 |
93 // Returns true if |c| is a CJK (Chinese, Japanese, Korean) character, i.e. if | |
94 // uscript_getScript() maps it to one of the scripts listed in |kCjkScripts|. | |
// NOTE(review): |c| is a single base::char16; presumably that is one UTF-16 | |
// code unit, so characters encoded as surrogate pairs would not be classified | |
// as CJK here -- confirm. | |
// NOTE(review): the UErrorCode written by uscript_getScript() is never | |
// inspected, and std::set is used although <set> is not among the includes | |
// visible at the top of this file. | |
95 bool IsCJK(base::char16 c) { | |
96 static const std::set<UScriptCode> kCjkScripts { | |
97 USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean) | |
98 USCRIPT_HANGUL, // Korean alphabet | |
99 USCRIPT_KATAKANA, // A Japanese syllabary | |
100 USCRIPT_HIRAGANA, // A Japanese syllabary | |
101 USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used | |
102 }; | |
103 UErrorCode error = U_ZERO_ERROR; | |
104 UScriptCode script = uscript_getScript(c, &error); | |
105 return kCjkScripts.find(script) != kCjkScripts.end(); | |
106 } | |
107 | |
108 // Returns true if |name| looks like a CJK name (or some kind of mish-mash of | |
109 // the three, at least). The name is considered to be a CJK name if every | |
110 // character is either CJK (per IsCJK) or Unicode whitespace. | |
// An empty |name| also returns true, because the loop below never rejects it. | |
111 // | |
112 // Chinese and Japanese names are usually spelled out using the Han characters | |
113 // (logographs), which constitute the "CJK Unified Ideographs" block in Unicode, | |
114 // also referred to as Unihan. Korean names are usually spelled out in the | |
115 // Korean alphabet (Hangul), although they do have a Han equivalent as well. | |
116 bool IsCJKName(const base::string16& name) { | |
117 for (base::char16 c : name) { | |
118 if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) { | |
119 return false; | |
120 } | |
121 } | |
122 return true; | |
123 } | |
124 | |
125 // Returns true if |c| is a Korean Hangul character, i.e. if | |
// uscript_getScript() reports USCRIPT_HANGUL for it. | |
// NOTE(review): the UErrorCode written by uscript_getScript() is not checked. | |
126 bool IsHangul(base::char16 c) { | |
127 UErrorCode error = U_ZERO_ERROR; | |
128 return uscript_getScript(c, &error) == USCRIPT_HANGUL; | |
129 } | |
130 | |
131 // Returns true if |name| looks like a Korean name, made up entirely of Hangul | |
132 // characters (per IsHangul) or Unicode whitespace. An empty |name| also | |
// returns true, because the loop below never rejects it. | |
133 bool IsHangulName(const base::string16& name) { | |
134 for (base::char16 c : name) { | |
135 if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) { | |
136 return false; | |
137 } | |
138 } | |
139 return true; | |
140 } | |
141 | |
142 // Tries to split a Chinese, Japanese, or Korean name into its given name & | |
143 // surname parts, and puts the result in |parts|. If splitting did not work for | |
144 // whatever reason, returns false. | |
// Returns true when |name_tokens| holds exactly one or two tokens; for any | |
// other token count it returns false and leaves |parts| untouched. | |
145 bool SplitCJKName(const std::vector<base::string16>& name_tokens, | |
146 NameParts* parts) { | |
147 // The convention for CJK languages is to put the surname (last name) first, | |
148 // and the given name (first name) second. In a continuous text, there is | |
149 // normally no space between the two parts of the name. When entering their | |
150 // name into a field, though, some people add a space to disambiguate. CJK | |
151 // names (almost) never have a middle name. | |
152 // | |
153 // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana, | |
154 // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with | |
155 // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates) | |
156 if (name_tokens.size() == 1) { | |
157 // There is no space between the surname and given name. Try to infer where | |
158 // to separate between the two. Most Chinese and Korean surnames have only | |
159 // one character, but there are a few that have 2. If the name does not | |
160 // start with a surname from a known list, default to 1 character. | |
161 // | |
162 // TODO(crbug.com/89111): Japanese names with no space will be mis-split, | |
163 // since we don't have a list of Japanese last names. In the Han script, | |
164 // it might also be difficult for us to differentiate between Chinese & | |
165 // Japanese names. | |
166 const base::string16& name = name_tokens.front(); | |
167 const bool is_korean = IsHangulName(name); | |
168 // Korean full names always have at least 3 characters. So, if there are | |
Jinsuk Kim
2016/07/12 22:50:48
This is not entirely true. Names with 1-char surna
nicolaso
2016/07/13 16:31:06
Should we check that the first character is a ver
Jinsuk Kim
2016/07/13 21:51:08
I wonder if it is safe to assume that people in mo
| |
169 // fewer than three, it's only a given name (|parts->family| is not set). | |
170 if (is_korean && name.size() < 3) { | |
171 parts->given = name; | |
172 } else { | |
173 size_t surname_length = 1; | |
174 if (is_korean && name.size() == 4) { | |
175 // 4-character Korean full names default to a 2-character surname. It's | |
Jinsuk Kim
2016/07/12 22:50:48
You should still check if the first 2-character wo
nicolaso
2016/07/13 16:31:06
I used two separate lists, |common_cjk_multi_char_
Jinsuk Kim
2016/07/13 21:51:08
Acknowledged. Looks good.
| |
176 // definitely a 2/2 split. | |
177 surname_length = 2; | |
178 } else { | |
179 base::string16 surname; | |
180 // Try to find the surname in |cjk_multi_char_surnames|. | |
181 for (size_t i = 0; i < arraysize(cjk_multi_char_surnames); i++) { | |
182 surname.clear(); | |
183 base::UTF8ToUTF16(cjk_multi_char_surnames[i], | |
184 strlen(cjk_multi_char_surnames[i]), | |
185 &surname); | |
186 if (base::StartsWith(name, surname, base::CompareCase::SENSITIVE)) { | |
Jinsuk Kim
2016/07/12 22:50:48
This is more complicated because the name is more
nicolaso
2016/07/13 16:31:06
I renamed |cjk_multi_char_surnames| to |common_cjk
Jinsuk Kim
2016/07/13 21:51:08
Looks good. How about adding a comment on the test
| |
187 surname_length = surname.size(); | |
188 break; | |
189 } | |
190 } | |
191 } | |
192 parts->family = name.substr(0, surname_length); | |
193 parts->given = name.substr(surname_length); | |
194 } | |
195 return true; | |
196 } else if (name_tokens.size() == 2) { | |
197 // The user entered a space between the two name parts. This makes our job | |
198 // easier. Family name first, given name second. | |
199 parts->family = name_tokens[0]; | |
200 parts->given = name_tokens[1]; | |
201 return true; | |
202 } | |
203 // We don't know what to do if there are more than 2 tokens. | |
204 return false; | |
205 } | |
206 | |
77 } // namespace | 207 } // namespace |
78 | 208 |
79 NameParts SplitName(const base::string16& name) { | 209 NameParts SplitName(const base::string16& name) { |
80 std::vector<base::string16> name_tokens = | 210 std::vector<base::string16> name_tokens = |
81 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE, | 211 base::SplitString(name, base::UTF8ToUTF16(" ,"), base::KEEP_WHITESPACE, |
82 base::SPLIT_WANT_NONEMPTY); | 212 base::SPLIT_WANT_NONEMPTY); |
83 StripPrefixes(&name_tokens); | 213 StripPrefixes(&name_tokens); |
84 | 214 |
215 NameParts parts; | |
216 | |
217 // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have | |
218 // the surname before the given name, and should be treated as special cases | |
219 // too. | |
220 | |
221 // Treat CJK names differently. | |
222 if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) { | |
223 return parts; | |
224 } | |
225 | |
85 // Don't assume "Ma" is a suffix in John Ma. | 226 // Don't assume "Ma" is a suffix in John Ma. |
86 if (name_tokens.size() > 2) | 227 if (name_tokens.size() > 2) |
87 StripSuffixes(&name_tokens); | 228 StripSuffixes(&name_tokens); |
88 | 229 |
89 NameParts parts; | |
90 | |
91 if (name_tokens.empty()) { | 230 if (name_tokens.empty()) { |
92 // Bad things have happened; just assume the whole thing is a given name. | 231 // Bad things have happened; just assume the whole thing is a given name. |
93 parts.given = name; | 232 parts.given = name; |
94 return parts; | 233 return parts; |
95 } | 234 } |
96 | 235 |
97 // Only one token, assume given name. | 236 // Only one token, assume given name. |
98 if (name_tokens.size() == 1) { | 237 if (name_tokens.size() == 1) { |
99 parts.given = name_tokens[0]; | 238 parts.given = name_tokens[0]; |
100 return parts; | 239 return parts; |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
163 profile.GetRawInfo(autofill::NAME_LAST); | 302 profile.GetRawInfo(autofill::NAME_LAST); |
164 if (!full_name.compare(candidate)) { | 303 if (!full_name.compare(candidate)) { |
165 return true; | 304 return true; |
166 } | 305 } |
167 | 306 |
168 return false; | 307 return false; |
169 } | 308 } |
170 | 309 |
171 } // namespace data_util | 310 } // namespace data_util |
172 } // namespace autofill | 311 } // namespace autofill |
OLD | NEW |