OLD | NEW |
1 /* | 1 /* |
2 *************************************************************************** | 2 *************************************************************************** |
3 * Copyright (C) 2008-2014, International Business Machines Corporation | 3 * Copyright (C) 2008-2015, International Business Machines Corporation |
4 * and others. All Rights Reserved. | 4 * and others. All Rights Reserved. |
5 *************************************************************************** | 5 *************************************************************************** |
6 * file name: uspoof.h | 6 * file name: uspoof.h |
7 * encoding: US-ASCII | 7 * encoding: US-ASCII |
8 * tab size: 8 (not used) | 8 * tab size: 8 (not used) |
9 * indentation:4 | 9 * indentation:4 |
10 * | 10 * |
11 * created on: 2008Feb13 | 11 * created on: 2008Feb13 |
12 * created by: Andy Heninger | 12 * created by: Andy Heninger |
13 * | 13 * |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
132 * a set of identifiers, and then quickly test whether a new identifier is | 132 * a set of identifiers, and then quickly test whether a new identifier is |
133 * confusable with an identifier already in the set. The uspoof_getSkeleton() | 133 * confusable with an identifier already in the set. The uspoof_getSkeleton() |
134 * family of functions will produce the skeleton from an identifier. | 134 * family of functions will produce the skeleton from an identifier. |
135 * | 135 * |
136 * Note that skeletons are not guaranteed to be stable between versions | 136 * Note that skeletons are not guaranteed to be stable between versions |
137 * of Unicode or ICU, so applications should not rely on creating a permanen
t, | 137 * of Unicode or ICU, so applications should not rely on creating a permanen
t, |
138 * or difficult to update, database of skeletons. Instabilities result from | 138 * or difficult to update, database of skeletons. Instabilities result from |
139 * identifying new pairs or sequences of characters that are visually | 139 * identifying new pairs or sequences of characters that are visually |
140 * confusable, and thus must be mapped to the same skeleton character(s). | 140 * confusable, and thus must be mapped to the same skeleton character(s). |
141 * | 141 * |
| 142 * Skeletons are computed using the algorithm and data described in Unicode UAX
39. |
| 143 * The latest proposed update, UAX 39 Version 8 draft 1, says "the tables SL, S
A, and ML |
| 144 * were still problematic, and discouraged from use in [Unicode] 7.0. |
| 145 * They were thus removed from version 8.0" |
| 146 * |
| 147 * In light of this, the default mapping data included with ICU 55 uses the |
| 148 * Unicode 7 MA (Multi Script, Any Case) table data for the other type options |
| 149 * (Single Script, Any Case), (Single Script, Lower Case) and (Multi Script, Lo
wer Case). |
142 */ | 150 */ |
143 | 151 |
144 struct USpoofChecker; | 152 struct USpoofChecker; |
145 typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker
*/ | 153 typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker
*/ |
146 | 154 |
147 /** | 155 /** |
148 * Enum for the kinds of checks that USpoofChecker can perform. | 156 * Enum for the kinds of checks that USpoofChecker can perform. |
149 * These enum values are used both to select the set of checks that | 157 * These enum values are used both to select the set of checks that |
150 * will be performed, and to report results from the check function. | 158 * will be performed, and to report results from the check function. |
151 * | 159 * |
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
263 * for returned identifier restriction levels in check results. | 271 * for returned identifier restriction levels in check results. |
264 * @stable ICU 51 | 272 * @stable ICU 51 |
265 */ | 273 */ |
266 typedef enum URestrictionLevel { | 274 typedef enum URestrictionLevel { |
267 /** | 275 /** |
268 * Only ASCII characters: U+0000..U+007F | 276 * Only ASCII characters: U+0000..U+007F |
269 * | 277 * |
270 * @stable ICU 51 | 278 * @stable ICU 51 |
271 */ | 279 */ |
272 USPOOF_ASCII = 0x10000000, | 280 USPOOF_ASCII = 0x10000000, |
273 #ifndef U_HIDE_DRAFT_API | |
274 /** | 281 /** |
275 * All characters in each identifier must be from a single script. | 282 * All characters in each identifier must be from a single script. |
276 * | 283 * |
277 * @draft ICU 53 | 284 * @stable ICU 53 |
278 */ | 285 */ |
279 USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000, | 286 USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000, |
280 #endif /* U_HIDE_DRAFT_API */ | |
281 /** | 287 /** |
282 * All characters in each identifier must be from a single script, or fr
om the combinations: Latin + Han + | 288 * All characters in each identifier must be from a single script, or fr
om the combinations: Latin + Han + |
283 * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul.
Note that this level will satisfy the | 289 * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul.
Note that this level will satisfy the |
284 * vast majority of Latin-script users; also that TR36 has ASCII instead
of Latin. | 290 * vast majority of Latin-script users; also that TR36 has ASCII instead
of Latin. |
285 * | 291 * |
286 * @stable ICU 51 | 292 * @stable ICU 51 |
287 */ | 293 */ |
288 USPOOF_HIGHLY_RESTRICTIVE = 0x30000000, | 294 USPOOF_HIGHLY_RESTRICTIVE = 0x30000000, |
289 /** | 295 /** |
290 * Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Other
wise, the same as Highly Restrictive. | 296 * Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Other
wise, the same as Highly Restrictive. |
291 * | 297 * |
292 * @stable ICU 51 | 298 * @stable ICU 51 |
293 */ | 299 */ |
294 USPOOF_MODERATELY_RESTRICTIVE = 0x40000000, | 300 USPOOF_MODERATELY_RESTRICTIVE = 0x40000000, |
295 /** | 301 /** |
296 * Allow arbitrary mixtures of scripts. Otherwise, the same as Moderatel
y Restrictive. | 302 * Allow arbitrary mixtures of scripts. Otherwise, the same as Moderatel
y Restrictive. |
297 * | 303 * |
298 * @stable ICU 51 | 304 * @stable ICU 51 |
299 */ | 305 */ |
300 USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000, | 306 USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000, |
301 /** | 307 /** |
302 * Any valid identifiers, including characters outside of the Identifier
Profile. | 308 * Any valid identifiers, including characters outside of the Identifier
Profile. |
303 * | 309 * |
304 * @stable ICU 51 | 310 * @stable ICU 51 |
305 */ | 311 */ |
306 USPOOF_UNRESTRICTIVE = 0x60000000, | 312 USPOOF_UNRESTRICTIVE = 0x60000000, |
307 #ifndef U_HIDE_DRAFT_API | |
308 /** | 313 /** |
309 * Mask for selecting the Restriction Level bits from the return value
of uspoof_check(). | 314 * Mask for selecting the Restriction Level bits from the return value
of uspoof_check(). |
310 * | 315 * |
311 * @draft ICU 53 | 316 * @stable ICU 53 |
312 */ | 317 */ |
313 USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000 | 318 USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000 |
314 #endif /* U_HIDE_DRAFT_API */ | |
315 } URestrictionLevel; | 319 } URestrictionLevel; |
316 | 320 |
317 /** | 321 /** |
318 * Create a Unicode Spoof Checker, configured to perform all | 322 * Create a Unicode Spoof Checker, configured to perform all |
319 * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. | 323 * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. |
320 * Note that additional checks may be added in the future, | 324 * Note that additional checks may be added in the future, |
321 * resulting in changes to the default checking behavior. | 325 * resulting in changes to the default checking behavior. |
322 * | 326 * |
323 * @param status The error code, set if this function encounters a problem. | 327 * @param status The error code, set if this function encounters a problem. |
324 * @return the newly created Spoof Checker | 328 * @return the newly created Spoof Checker |
(...skipping 540 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
865 * Using skeletons directly makes it possible to quickly check | 869 * Using skeletons directly makes it possible to quickly check |
866 * whether an identifier is confusable with any of some large | 870 * whether an identifier is confusable with any of some large |
867 * set of existing identifiers, by creating an efficiently | 871 * set of existing identifiers, by creating an efficiently |
868 * searchable collection of the skeletons. | 872 * searchable collection of the skeletons. |
869 * | 873 * |
870 * @param sc The USpoofChecker | 874 * @param sc The USpoofChecker |
871 * @param type The type of skeleton, corresponding to which | 875 * @param type The type of skeleton, corresponding to which |
872 * of the Unicode confusable data tables to use. | 876 * of the Unicode confusable data tables to use. |
873 * The default is Mixed-Script, Lowercase. | 877 * The default is Mixed-Script, Lowercase. |
874 * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and | 878 * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and |
875 * USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed. | 879 * USPOOF_ANY_CASE. The two flags may be ORed. |
876 * @param id The input identifier whose skeleton will be computed. | 880 * @param id The input identifier whose skeleton will be computed. |
877 * @param length The length of the input identifier, expressed in 16 bit | 881 * @param length The length of the input identifier, expressed in 16 bit |
878 * UTF-16 code units, or -1 if the string is zero terminated. | 882 * UTF-16 code units, or -1 if the string is zero terminated. |
879 * @param dest The output buffer, to receive the skeleton string. | 883 * @param dest The output buffer, to receive the skeleton string. |
880 * @param destCapacity The length of the output buffer, in 16 bit units. | 884 * @param destCapacity The length of the output buffer, in 16 bit units. |
881 * The destCapacity may be zero, in which case the function will | 885 * The destCapacity may be zero, in which case the function will |
882 * return the actual length of the skeleton. | 886 * return the actual length of the skeleton. |
883 * @param status The error code, set if an error occurred while attempting to | 887 * @param status The error code, set if an error occurred while attempting to |
884 * perform the check. | 888 * perform the check. |
885 * @return The length of the skeleton string. The returned length | 889 * @return The length of the skeleton string. The returned length |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
947 * Using skeletons directly makes it possible to quickly check | 951 * Using skeletons directly makes it possible to quickly check |
948 * whether an identifier is confusable with any of some large | 952 * whether an identifier is confusable with any of some large |
949 * set of existing identifiers, by creating an efficiently | 953 * set of existing identifiers, by creating an efficiently |
950 * searchable collection of the skeletons. | 954 * searchable collection of the skeletons. |
951 * | 955 * |
952 * @param sc The USpoofChecker. | 956 * @param sc The USpoofChecker. |
953 * @param type The type of skeleton, corresponding to which | 957 * @param type The type of skeleton, corresponding to which |
954 * of the Unicode confusable data tables to use. | 958 * of the Unicode confusable data tables to use. |
955 * The default is Mixed-Script, Lowercase. | 959 * The default is Mixed-Script, Lowercase. |
956 * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and | 960 * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and |
957 * USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed. | 961 * USPOOF_ANY_CASE. The two flags may be ORed. |
958 * @param id The input identifier whose skeleton will be computed. | 962 * @param id The input identifier whose skeleton will be computed. |
959 * @param dest The output identifier, to receive the skeleton string. | 963 * @param dest The output identifier, to receive the skeleton string. |
960 * @param status The error code, set if an error occurred while attempting to | 964 * @param status The error code, set if an error occurred while attempting to |
961 * perform the check. | 965 * perform the check. |
962 * @return A reference to the destination (skeleton) string. | 966 * @return A reference to the destination (skeleton) string. |
963 * | 967 * |
964 * @stable ICU 4.2 | 968 * @stable ICU 4.2 |
965 */ | 969 */ |
966 U_I18N_API icu::UnicodeString & U_EXPORT2 | 970 U_I18N_API icu::UnicodeString & U_EXPORT2 |
967 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, | 971 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1055 */ | 1059 */ |
1056 U_STABLE int32_t U_EXPORT2 | 1060 U_STABLE int32_t U_EXPORT2 |
1057 uspoof_serialize(USpoofChecker *sc, | 1061 uspoof_serialize(USpoofChecker *sc, |
1058 void *data, int32_t capacity, | 1062 void *data, int32_t capacity, |
1059 UErrorCode *status); | 1063 UErrorCode *status); |
1060 | 1064 |
1061 | 1065 |
1062 #endif | 1066 #endif |
1063 | 1067 |
1064 #endif /* USPOOF_H */ | 1068 #endif /* USPOOF_H */ |
OLD | NEW |