OLD | NEW |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/autofill/core/browser/autofill_data_util.h" | 5 #include "components/autofill/core/browser/autofill_data_util.h" |
6 | 6 |
| 7 #include <algorithm> |
7 #include <vector> | 8 #include <vector> |
8 | 9 |
9 #include "base/strings/string_split.h" | 10 #include "base/strings/string_split.h" |
10 #include "base/strings/string_util.h" | 11 #include "base/strings/string_util.h" |
11 #include "base/strings/utf_string_conversions.h" | 12 #include "base/strings/utf_string_conversions.h" |
12 #include "components/autofill/core/browser/field_types.h" | 13 #include "components/autofill/core/browser/field_types.h" |
| 14 #include "third_party/icu/source/common/unicode/uscript.h" |
13 | 15 |
14 namespace autofill { | 16 namespace autofill { |
15 namespace data_util { | 17 namespace data_util { |
16 | 18 |
17 namespace { | 19 namespace { |
// Lower-case titles, ranks, and ordinals that can precede a name. Entries are
// spelled without a trailing period.
const char* const name_prefixes[] = {
    "1lt", "1st", "2lt", "2nd", "3rd",
    "admiral",
    "capt", "captain", "col", "cpt",
    "dr",
    "gen", "general",
    "lcdr", "lt", "ltc", "ltg", "ltjg",
    "maj", "major", "mg", "mr", "mrs", "ms",
    "pastor", "prof",
    "rep", "reverend", "rev",
    "sen", "st",
};

// Lower-case generational markers and degrees that can follow a name. These
// are the tokens StripSuffixes() removes from the end of a token list.
const char* const name_suffixes[] = {
    "b.a", "ba",
    "d.d.s", "dds",
    "i", "ii", "iii", "iv", "ix",
    "jr",
    "m.a", "m.d", "ma", "md", "ms",
    "ph.d", "phd",
    "sr",
    "v", "vi", "vii", "viii", "x",
};

// Lower-case particles that can introduce a multi-word family name.
const char* const family_name_prefixes[] = {
    "d'", "de", "del", "der", "di",
    "la", "le",
    "mc",
    "san", "st",
    "ter",
    "van", "von",
};
33 | 35 |
// Common, unambiguous CJK surnames (last names) that are longer than a single
// character. SplitCJKName() checks un-spaced names against this list.
const char* common_cjk_multi_char_surnames[] = {
    // Korean, from the list of surnames at:
    // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
    "남궁", "사공", "서문", "선우",
    "제갈", "황보", "독고", "망절",

    // Chinese, from the most common 2-character surnames listed at:
    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
    // Simplified Chinese (mostly mainland China).
    "欧阳", "令狐", "皇甫", "上官", "司徒",
    "诸葛", "司马", "宇文", "呼延", "端木",
    // Traditional Chinese (mostly Taiwan).
    "張簡", "歐陽", "諸葛", "申屠",
    "尉遲", "司馬", "軒轅", "夏侯",
};

// Every Korean surname that is longer than a single character, including the
// rare or ambiguous ones.
const char* korean_multi_char_surnames[] = {
    "강전", "남궁", "독고", "동방", "망절", "사공", "서문",
    "선우", "소봉", "어금", "장곡", "제갈", "황목", "황보",
};
| 57 |
34 // Returns true if |set| contains |element|, modulo a final period. | 58 // Returns true if |set| contains |element|, modulo a final period. |
35 bool ContainsString(const char* const set[], | 59 bool ContainsString(const char* const set[], |
36 size_t set_size, | 60 size_t set_size, |
37 const base::string16& element) { | 61 const base::string16& element) { |
38 if (!base::IsStringASCII(element)) | 62 if (!base::IsStringASCII(element)) |
39 return false; | 63 return false; |
40 | 64 |
41 base::string16 trimmed_element; | 65 base::string16 trimmed_element; |
42 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element); | 66 base::TrimString(element, base::ASCIIToUTF16("."), &trimmed_element); |
43 | 67 |
(...skipping 23 matching lines...) Expand all Loading... |
67 void StripSuffixes(std::vector<base::string16>* name_tokens) { | 91 void StripSuffixes(std::vector<base::string16>* name_tokens) { |
68 while (!name_tokens->empty()) { | 92 while (!name_tokens->empty()) { |
69 if (!ContainsString(name_suffixes, arraysize(name_suffixes), | 93 if (!ContainsString(name_suffixes, arraysize(name_suffixes), |
70 name_tokens->back())) { | 94 name_tokens->back())) { |
71 break; | 95 break; |
72 } | 96 } |
73 name_tokens->pop_back(); | 97 name_tokens->pop_back(); |
74 } | 98 } |
75 } | 99 } |
76 | 100 |
| 101 // Find whether |name| starts with any of the strings from the array |
| 102 // |prefixes|. The returned value is the length of the prefix found, or 0 if |
| 103 // none is found. |
| 104 size_t StartsWithAny(base::StringPiece16 name, const char** prefixes, |
| 105 size_t prefix_count) { |
| 106 base::string16 buffer; |
| 107 for (size_t i = 0; i < prefix_count; i++) { |
| 108 buffer.clear(); |
| 109 base::UTF8ToUTF16(prefixes[i], strlen(prefixes[i]), &buffer); |
| 110 if (base::StartsWith(name, buffer, base::CompareCase::SENSITIVE)) { |
| 111 return buffer.size(); |
| 112 } |
| 113 } |
| 114 return 0; |
| 115 } |
| 116 |
| 117 // Returns true if |c| is a CJK (Chinese, Japanese, Korean) character, for any |
| 118 // of the CJK alphabets. |
| 119 bool IsCJK(base::char16 c) { |
| 120 static const std::set<UScriptCode> kCjkScripts { |
| 121 USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean) |
| 122 USCRIPT_HANGUL, // Korean alphabet |
| 123 USCRIPT_KATAKANA, // A Japanese syllabary |
| 124 USCRIPT_HIRAGANA, // A Japanese syllabary |
| 125 USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used |
| 126 }; |
| 127 UErrorCode error = U_ZERO_ERROR; |
| 128 UScriptCode script = uscript_getScript(c, &error); |
| 129 return kCjkScripts.find(script) != kCjkScripts.end(); |
| 130 } |
| 131 |
| 132 // Returns true if |name| looks like a CJK name (or some kind of mish-mash of |
| 133 // the three, at least). The name is considered to be a CJK name if it is only |
| 134 // CJK characters or spaces. |
| 135 // |
| 136 // Chinese and Japanese names are usually spelled out using the Han characters |
| 137 // (logographs), which constitute the "CJK Unified Ideographs" block in Unicode, |
| 138 // also referred to as Unihan. Korean names are usually spelled out in the |
| 139 // Korean alphabet (Hangul), although they do have a Han equivalent as well. |
| 140 bool IsCJKName(const base::string16& name) { |
| 141 for (base::char16 c : name) { |
| 142 if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) { |
| 143 return false; |
| 144 } |
| 145 } |
| 146 return true; |
| 147 } |
| 148 |
| 149 // Returns true if |c| is a Korean Hangul character. |
| 150 bool IsHangul(base::char16 c) { |
| 151 UErrorCode error = U_ZERO_ERROR; |
| 152 return uscript_getScript(c, &error) == USCRIPT_HANGUL; |
| 153 } |
| 154 |
| 155 // Returns true if |name| looks like a Korean name, made up entirely of Hangul |
| 156 // characters or spaces. |
| 157 bool IsHangulName(const base::string16& name) { |
| 158 for (base::char16 c : name) { |
| 159 if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) { |
| 160 return false; |
| 161 } |
| 162 } |
| 163 return true; |
| 164 } |
| 165 |
| 166 // Tries to split a Chinese, Japanese, or Korean name into its given name & |
| 167 // surname parts, and puts the result in |parts|. If splitting did not work for |
| 168 // whatever reason, returns false. |
| 169 bool SplitCJKName(const std::vector<base::string16>& name_tokens, |
| 170 NameParts* parts) { |
| 171 // The convention for CJK languages is to put the surname (last name) first, |
| 172 // and the given name (first name) second. In a continuous text, there is |
| 173 // normally no space between the two parts of the name. When entering their |
| 174 // name into a field, though, some people add a space to disambiguate. CJK |
| 175 // names (almost) never have a middle name. |
| 176 // |
| 177 // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana, |
| 178 // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with |
| 179 // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates) |
| 180 if (name_tokens.size() == 1) { |
| 181 // There is no space between the surname and given name. Try to infer where |
| 182 // to separate between the two. Most Chinese and Korean surnames have only |
| 183 // one character, but there are a few that have 2. If the name does not |
| 184 // start with a surname from a known list, default to 1 character. |
| 185 // |
| 186 // TODO(crbug.com/89111): Japanese names with no space will be mis-split, |
| 187 // since we don't have a list of Japanese last names. In the Han alphabet, |
| 188 // it might also be difficult for us to differentiate between Chinese & |
| 189 // Japanese names. |
| 190 const base::string16& name = name_tokens.front(); |
| 191 const bool is_korean = IsHangulName(name); |
| 192 size_t surname_length = 0; |
| 193 if (is_korean && name.size() > 3) { |
| 194 // 4-character Korean names are more likely to be 2/2 than 1/3, so use |
| 195 // the full list of Korean 2-char surnames. (instead of only the common |
| 196 // ones) |
| 197 surname_length = std::max<size_t>( |
| 198 1, StartsWithAny(name, korean_multi_char_surnames, |
| 199 arraysize(korean_multi_char_surnames))); |
| 200 } else { |
| 201 // Default to 1 character if the surname is not in |
| 202 // |common_cjk_multi_char_surnames|. |
| 203 surname_length = std::max<size_t>( |
| 204 1, StartsWithAny(name, common_cjk_multi_char_surnames, |
| 205 arraysize(common_cjk_multi_char_surnames))); |
| 206 } |
| 207 parts->family = name.substr(0, surname_length); |
| 208 parts->given = name.substr(surname_length); |
| 209 return true; |
| 210 } |
| 211 if (name_tokens.size() == 2) { |
| 212 // The user entered a space between the two name parts. This makes our job |
| 213 // easier. Family name first, given name second. |
| 214 parts->family = name_tokens[0]; |
| 215 parts->given = name_tokens[1]; |
| 216 return true; |
| 217 } |
| 218 // We don't know what to do if there are more than 2 tokens. |
| 219 return false; |
| 220 } |
| 221 |
77 } // namespace | 222 } // namespace |
78 | 223 |
79 NameParts SplitName(const base::string16& name) { | 224 NameParts SplitName(const base::string16& name) { |
80 std::vector<base::string16> name_tokens = | 225 std::vector<base::string16> name_tokens = |
81 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE, | 226 base::SplitString(name, base::ASCIIToUTF16(" ,"), base::KEEP_WHITESPACE, |
82 base::SPLIT_WANT_NONEMPTY); | 227 base::SPLIT_WANT_NONEMPTY); |
83 StripPrefixes(&name_tokens); | 228 StripPrefixes(&name_tokens); |
84 | 229 |
| 230 NameParts parts; |
| 231 |
| 232 // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have |
| 233 // the surname before the given name, and should be treated as special cases |
| 234 // too. |
| 235 |
| 236 // Treat CJK names differently. |
| 237 if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) { |
| 238 return parts; |
| 239 } |
| 240 |
85 // Don't assume "Ma" is a suffix in John Ma. | 241 // Don't assume "Ma" is a suffix in John Ma. |
86 if (name_tokens.size() > 2) | 242 if (name_tokens.size() > 2) |
87 StripSuffixes(&name_tokens); | 243 StripSuffixes(&name_tokens); |
88 | 244 |
89 NameParts parts; | |
90 | |
91 if (name_tokens.empty()) { | 245 if (name_tokens.empty()) { |
92 // Bad things have happened; just assume the whole thing is a given name. | 246 // Bad things have happened; just assume the whole thing is a given name. |
93 parts.given = name; | 247 parts.given = name; |
94 return parts; | 248 return parts; |
95 } | 249 } |
96 | 250 |
97 // Only one token, assume given name. | 251 // Only one token, assume given name. |
98 if (name_tokens.size() == 1) { | 252 if (name_tokens.size() == 1) { |
99 parts.given = name_tokens[0]; | 253 parts.given = name_tokens[0]; |
100 return parts; | 254 return parts; |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
163 profile.GetRawInfo(autofill::NAME_LAST); | 317 profile.GetRawInfo(autofill::NAME_LAST); |
164 if (!full_name.compare(candidate)) { | 318 if (!full_name.compare(candidate)) { |
165 return true; | 319 return true; |
166 } | 320 } |
167 | 321 |
168 return false; | 322 return false; |
169 } | 323 } |
170 | 324 |
171 } // namespace data_util | 325 } // namespace data_util |
172 } // namespace autofill | 326 } // namespace autofill |
OLD | NEW |