OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2012-2013, International Business Machines | 3 * Copyright (C) 2012-2014, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #include "unicode/uchar.h" | 10 #include "unicode/uchar.h" |
11 #include "unicode/utf16.h" | 11 #include "unicode/utf16.h" |
12 | 12 |
13 #include "identifier_info.h" | 13 #include "identifier_info.h" |
14 #include "mutex.h" | 14 #include "mutex.h" |
15 #include "scriptset.h" | 15 #include "scriptset.h" |
16 #include "ucln_in.h" | 16 #include "ucln_in.h" |
17 #include "uvector.h" | 17 #include "uvector.h" |
18 | 18 |
19 U_NAMESPACE_BEGIN | 19 U_NAMESPACE_BEGIN |
20 | 20 |
21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | 21 static UnicodeSet *ASCII; |
| 22 static ScriptSet *JAPANESE; |
| 23 static ScriptSet *CHINESE; |
| 24 static ScriptSet *KOREAN; |
| 25 static ScriptSet *CONFUSABLE_WITH_LATIN; |
| 26 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; |
22 | 27 |
23 static UMutex gInitMutex = U_MUTEX_INITIALIZER; | |
24 static UBool gStaticsAreInitialized = FALSE; | |
25 | 28 |
26 UnicodeSet *IdentifierInfo::ASCII; | 29 U_CDECL_BEGIN |
27 ScriptSet *IdentifierInfo::JAPANESE; | 30 static UBool U_CALLCONV |
28 ScriptSet *IdentifierInfo::CHINESE; | 31 IdentifierInfo_cleanup(void) { |
29 ScriptSet *IdentifierInfo::KOREAN; | |
30 ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; | |
31 | |
32 UBool IdentifierInfo::cleanup() { | |
33 delete ASCII; | 32 delete ASCII; |
34 ASCII = NULL; | 33 ASCII = NULL; |
35 delete JAPANESE; | 34 delete JAPANESE; |
36 JAPANESE = NULL; | 35 JAPANESE = NULL; |
37 delete CHINESE; | 36 delete CHINESE; |
38 CHINESE = NULL; | 37 CHINESE = NULL; |
39 delete KOREAN; | 38 delete KOREAN; |
40 KOREAN = NULL; | 39 KOREAN = NULL; |
41 delete CONFUSABLE_WITH_LATIN; | 40 delete CONFUSABLE_WITH_LATIN; |
42 CONFUSABLE_WITH_LATIN = NULL; | 41 CONFUSABLE_WITH_LATIN = NULL; |
43 gStaticsAreInitialized = FALSE; | 42 gIdentifierInfoInitOnce.reset(); |
44 return TRUE; | 43 return TRUE; |
45 } | 44 } |
46 | 45 |
47 U_CDECL_BEGIN | 46 static void U_CALLCONV |
48 static UBool U_CALLCONV | 47 IdentifierInfo_init(UErrorCode &status) { |
49 IdentifierInfo_cleanup(void) { | 48 ASCII = new UnicodeSet(0, 0x7f); |
50 return IdentifierInfo::cleanup(); | 49 JAPANESE = new ScriptSet(); |
| 50 CHINESE = new ScriptSet(); |
| 51 KOREAN = new ScriptSet(); |
| 52 CONFUSABLE_WITH_LATIN = new ScriptSet(); |
| 53 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL |
| 54 || CONFUSABLE_WITH_LATIN == NULL) { |
| 55 status = U_MEMORY_ALLOCATION_ERROR; |
| 56 return; |
| 57 } |
| 58 ASCII->freeze(); |
| 59 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HI
RAGANA, status) |
| 60 .set(USCRIPT_KATAKANA, status); |
| 61 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOP
OMOFO, status); |
| 62 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANG
UL, status); |
| 63 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, stat
us) |
| 64 .set(USCRIPT_CHEROKEE, status); |
| 65 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup)
; |
51 } | 66 } |
52 U_CDECL_END | 67 U_CDECL_END |
53 | 68 |
54 | 69 |
55 IdentifierInfo::IdentifierInfo(UErrorCode &status): | 70 IdentifierInfo::IdentifierInfo(UErrorCode &status): |
56 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), | 71 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), |
57 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL)
{ | 72 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL)
{ |
| 73 umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); |
58 if (U_FAILURE(status)) { | 74 if (U_FAILURE(status)) { |
59 return; | 75 return; |
60 } | 76 } |
61 { | 77 |
62 Mutex lock(&gInitMutex); | |
63 if (!gStaticsAreInitialized) { | |
64 ASCII = new UnicodeSet(0, 0x7f); | |
65 JAPANESE = new ScriptSet(); | |
66 CHINESE = new ScriptSet(); | |
67 KOREAN = new ScriptSet(); | |
68 CONFUSABLE_WITH_LATIN = new ScriptSet(); | |
69 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN =
= NULL | |
70 || CONFUSABLE_WITH_LATIN == NULL) { | |
71 status = U_MEMORY_ALLOCATION_ERROR; | |
72 return; | |
73 } | |
74 ASCII->freeze(); | |
75 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(US
CRIPT_HIRAGANA, status) | |
76 .set(USCRIPT_KATAKANA, status); | |
77 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USC
RIPT_BOPOMOFO, status); | |
78 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCR
IPT_HANGUL, status); | |
79 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GRE
EK, status) | |
80 .set(USCRIPT_CHEROKEE, status); | |
81 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_
cleanup); | |
82 gStaticsAreInitialized = TRUE; | |
83 } | |
84 } | |
85 fIdentifier = new UnicodeString(); | 78 fIdentifier = new UnicodeString(); |
86 fRequiredScripts = new ScriptSet(); | 79 fRequiredScripts = new ScriptSet(); |
87 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL
, &status); | 80 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL
, &status); |
88 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); | 81 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); |
89 fCommonAmongAlternates = new ScriptSet(); | 82 fCommonAmongAlternates = new ScriptSet(); |
90 fNumerics = new UnicodeSet(); | 83 fNumerics = new UnicodeSet(); |
91 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); | 84 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); |
92 | 85 |
93 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL ||
fScriptSetSet == NULL || | 86 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL ||
fScriptSetSet == NULL || |
94 fCommonAmongAlternates == NULL || fNumerics == NUL
L || fIdentifierProfile == NULL)) { | 87 fCommonAmongAlternates == NULL || fNumerics == NUL
L || fIdentifierProfile == NULL)) { |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
135 ScriptSet scriptsForCP; | 128 ScriptSet scriptsForCP; |
136 UChar32 cp; | 129 UChar32 cp; |
137 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { | 130 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { |
138 cp = identifier.char32At(i); | 131 cp = identifier.char32At(i); |
139 // Store a representative character for each kind of decimal digit | 132 // Store a representative character for each kind of decimal digit |
140 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { | 133 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { |
141 // Just store the zero character as a representative for comparison.
Unicode guarantees it is cp - value | 134 // Just store the zero character as a representative for comparison.
Unicode guarantees it is cp - value |
142 fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); | 135 fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); |
143 } | 136 } |
144 UScriptCode extensions[500]; | 137 UScriptCode extensions[500]; |
145 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LE
NGTHOF(extensions), &status); | 138 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UP
RV_LENGTHOF(extensions), &status); |
146 if (U_FAILURE(status)) { | 139 if (U_FAILURE(status)) { |
147 return *this; | 140 return *this; |
148 } | 141 } |
149 scriptsForCP.resetAll(); | 142 scriptsForCP.resetAll(); |
150 for (int32_t j=0; j<extensionsCount; j++) { | 143 for (int32_t j=0; j<extensionsCount; j++) { |
151 scriptsForCP.set(extensions[j], status); | 144 scriptsForCP.set(extensions[j], status); |
152 } | 145 } |
153 scriptsForCP.reset(USCRIPT_COMMON, status); | 146 scriptsForCP.reset(USCRIPT_COMMON, status); |
154 scriptsForCP.reset(USCRIPT_INHERITED, status); | 147 scriptsForCP.reset(USCRIPT_INHERITED, status); |
155 switch (scriptsForCP.countMembers()) { | 148 switch (scriptsForCP.countMembers()) { |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
240 // This is a bit tricky. We look at a number of factors. | 233 // This is a bit tricky. We look at a number of factors. |
241 // The number of scripts in the text. | 234 // The number of scripts in the text. |
242 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa];
[Arab Syrc]) | 235 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa];
[Arab Syrc]) |
243 // Plus number of alternates otherwise (this only works because we only test
cardinality up to 2.) | 236 // Plus number of alternates otherwise (this only works because we only test
cardinality up to 2.) |
244 | 237 |
245 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken
out at the | 238 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken
out at the |
246 // time it is created, in setIdentifier(). | 239 // time it is created, in setIdentifier(). |
247 int32_t cardinalityPlus = fRequiredScripts->countMembers() + | 240 int32_t cardinalityPlus = fRequiredScripts->countMembers() + |
248 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSe
tSet) : 1); | 241 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSe
tSet) : 1); |
249 if (cardinalityPlus < 2) { | 242 if (cardinalityPlus < 2) { |
250 return USPOOF_HIGHLY_RESTRICTIVE; | 243 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; |
251 } | 244 } |
252 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlte
rnates(*CHINESE, *fRequiredScripts) | 245 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlte
rnates(*CHINESE, *fRequiredScripts) |
253 || containsWithAlternates(*KOREAN, *fRequiredScripts)) { | 246 || containsWithAlternates(*KOREAN, *fRequiredScripts)) { |
254 return USPOOF_HIGHLY_RESTRICTIVE; | 247 return USPOOF_HIGHLY_RESTRICTIVE; |
255 } | 248 } |
256 if (cardinalityPlus == 2 && | 249 if (cardinalityPlus == 2 && |
257 fRequiredScripts->test(USCRIPT_LATIN, status) && | 250 fRequiredScripts->test(USCRIPT_LATIN, status) && |
258 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { | 251 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { |
259 return USPOOF_MODERATELY_RESTRICTIVE; | 252 return USPOOF_MODERATELY_RESTRICTIVE; |
260 } | 253 } |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
309 dest.append(separator); | 302 dest.append(separator); |
310 } | 303 } |
311 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); | 304 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); |
312 ss->displayScripts(dest); | 305 ss->displayScripts(dest); |
313 } | 306 } |
314 return dest; | 307 return dest; |
315 } | 308 } |
316 | 309 |
317 U_NAMESPACE_END | 310 U_NAMESPACE_END |
318 | 311 |
OLD | NEW |