OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2004-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: ucase.h |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2004aug30 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * Low-level Unicode character/string case mapping code. |
| 17 */ |
| 18 |
| 19 #ifndef __UCASE_H__ |
| 20 #define __UCASE_H__ |
| 21 |
| 22 #include "unicode/utypes.h" |
| 23 #include "unicode/uset.h" |
| 24 #include "uset_imp.h" |
| 25 #include "udataswp.h" |
| 26 |
| 27 U_CDECL_BEGIN |
| 28 |
| 29 /* library API -------------------------------------------------------------- */ |
| 30 |
| 31 struct UCaseProps; /* forward declaration only: this header never defines the struct body, so users of the header handle it through pointers */ |
| 32 typedef struct UCaseProps UCaseProps; |
| 33 |
| 34 U_CAPI const UCaseProps * U_EXPORT2 |
| 35 ucase_getSingleton(void); /* takes no arguments; the result is a pointer to const UCaseProps, the same type as every csp parameter in this header */ |
| 36 |
| 37 U_CAPI int32_t U_EXPORT2 |
| 38 ucase_swap(const UDataSwapper *ds, |
| 39 const void *inData, int32_t length, void *outData, |
| 40 UErrorCode *pErrorCode); /* NOTE(review): presumably the data-swapping entry point for the "ucase" data file, driven by the UDataSwapper from udataswp.h -- confirm in the implementation */ |
| 41 |
| 42 U_CFUNC void U_EXPORT2 |
| 43 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *
pErrorCode); /* returns void. NOTE(review): presumably reports results through sa and failures through *pErrorCode -- confirm in the implementation */ |
| 44 |
| 45 /** |
| 46 * Requires non-NULL locale ID but otherwise does the equivalent of |
| 47 * checking for language codes as if uloc_getLanguage() were called: |
| 48 * Accepts both 2- and 3-letter codes and accepts case variants. |
| 49 * The result is meant to be one of the UCASE_LOC_ casing locale types declared right after this function. NOTE(review): locCache presumably follows the rules documented for ucase_toFullLower() (initialize to 0, may be NULL) -- confirm. */ |
| 50 U_CFUNC int32_t |
| 51 ucase_getCaseLocale(const char *locale, int32_t *locCache); |
| 52 |
| 53 /* Casing locale types for ucase_getCaseLocale; no explicit initializers, so the values are 0..4 in declaration order */ |
| 54 enum { |
| 55 UCASE_LOC_UNKNOWN, /* =0 */ |
| 56 UCASE_LOC_ROOT, /* =1 */ |
| 57 UCASE_LOC_TURKISH, /* =2 */ |
| 58 UCASE_LOC_LITHUANIAN, /* =3 */ |
| 59 UCASE_LOC_DUTCH /* =4 */ |
| 60 }; |
| 61 |
| 62 /** |
| 63 * Bit mask for getting just the options from a string compare options word |
| 64 * that are relevant for case-insensitive string comparison. |
| 65 * See uchar.h. Also includes _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. |
| 66 * @internal |
| 67 */ |
| 68 #define _STRCASECMP_OPTIONS_MASK 0xffff /* keeps the low 16 bits of the options word */ |
| 69 |
| 70 /** |
| 71 * Bit mask for getting just the options from a string compare options word |
| 72 * that are relevant for case folding (of a single string or code point). |
| 73 * See uchar.h. |
| 74 * @internal |
| 75 */ |
| 76 #define _FOLD_CASE_OPTIONS_MASK 0xff /* keeps the low 8 bits, a subset of the bits kept by _STRCASECMP_OPTIONS_MASK (0xffff) */ |
| 77 |
| 78 /* single-code point functions: each takes one code point c and returns one UChar32 (the ucase_toFull...() string case mapping functions further down use a different result convention) */ |
| 79 |
| 80 U_CAPI UChar32 U_EXPORT2 |
| 81 ucase_tolower(const UCaseProps *csp, UChar32 c); |
| 82 |
| 83 U_CAPI UChar32 U_EXPORT2 |
| 84 ucase_toupper(const UCaseProps *csp, UChar32 c); |
| 85 |
| 86 U_CAPI UChar32 U_EXPORT2 |
| 87 ucase_totitle(const UCaseProps *csp, UChar32 c); |
| 88 |
| 89 U_CAPI UChar32 U_EXPORT2 |
| 90 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options); /* NOTE(review): options presumably uses only the bits selected by _FOLD_CASE_OPTIONS_MASK -- confirm */ |
| 91 |
| 92 /** |
| 93 * Adds all simple case mappings and the full case folding for c to sa, |
| 94 * and also adds special case closure mappings. |
| 95 * c itself is not added. |
| 96 * For example, the mappings |
| 97 * - for s include long s |
| 98 * - for sharp s include ss |
| 99 * - for k include the Kelvin sign |
| 100 * @param sa Set adder that receives the mappings; nothing is returned to the caller directly. */ |
| 101 U_CFUNC void U_EXPORT2 |
| 102 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa); |
| 103 |
| 104 /** |
| 105 * Maps the string to single code points and adds the associated case closure |
| 106 * mappings. |
| 107 * The string is mapped to code points if it is their full case folding string. |
| 108 * In other words, this performs a reverse full case folding and then |
| 109 * adds the case closure items of the resulting code points. |
| 110 * If the string is found and its closure applied, then |
| 111 * the string itself is added as well as part of its code points' closure. |
| 112 * The length argument must be >=0 (NOTE(review): so presumably a negative length is not accepted as a "NUL-terminated" marker here -- confirm). |
| 113 * |
| 114 * @return TRUE if the string was found |
| 115 */ |
| 116 U_CFUNC UBool U_EXPORT2 |
| 117 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length
, const USetAdder *sa); |
| 118 |
| 119 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE (the 2-bit type constants defined later in this header) */ |
| 120 U_CAPI int32_t U_EXPORT2 |
| 121 ucase_getType(const UCaseProps *csp, UChar32 c); |
| 122 |
| 123 /** @return same as ucase_getType(), or <0 if c is case-ignorable */ |
| 124 U_CAPI int32_t U_EXPORT2 |
| 125 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c); |
| 126 |
| 127 U_CAPI UBool U_EXPORT2 |
| 128 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c); /* NOTE(review): presumably true when the dot type of c is UCASE_SOFT_DOTTED -- confirm */ |
| 129 |
| 130 U_CAPI UBool U_EXPORT2 |
| 131 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c); /* NOTE(review): presumably reports the UCASE_SENSITIVE bit of c -- confirm */ |
| 132 |
| 133 /* string case mapping functions */ |
| 134 |
| 135 /** |
| 136 * Iterator function for string case mappings, which need to look at the |
| 137 * context (surrounding text) of a given character for conditional mappings. |
| 138 * |
| 139 * The iterator only needs to go backward or forward away from the |
| 140 * character in question. It does not use any indexes on this interface. |
| 141 * It does not support random access or an arbitrary change of |
| 142 * iteration direction. |
| 143 * |
| 144 * The code point being case-mapped itself is never returned by |
| 145 * this iterator. |
| 146 * |
| 147 * @param context A pointer to the iterator's working data. |
| 148 * @param dir If <0 then start iterating backward from the character; |
| 149 * if >0 then start iterating forward from the character; |
| 150 * if 0 then continue iterating in the current direction. |
| 151 * @return Next code point, or <0 when the iteration is done. |
| 152 * Note: this typedef names a function type (not a pointer type), which is why the functions in this header take it as UCaseContextIterator *iter. */ |
| 153 typedef UChar32 U_CALLCONV |
| 154 UCaseContextIterator(void *context, int8_t dir); |
| 155 |
| 156 /** |
| 157 * Sample struct which may be used by some implementations of |
| 158 * UCaseContextIterator. |
| 159 * This header does not document the individual fields. NOTE(review): dir presumably mirrors the iterator's dir argument, and b1..b3 look like spare bytes -- confirm in the iterator implementations. */ |
| 160 struct UCaseContext { |
| 161 void *p; |
| 162 int32_t start, index, limit; |
| 163 int32_t cpStart, cpLimit; |
| 164 int8_t dir; |
| 165 int8_t b1, b2, b3; |
| 166 }; |
| 167 typedef struct UCaseContext UCaseContext; |
| 168 |
| 169 enum { |
| 170 /** |
| 171 * For string case mappings, a single character (a code point) is mapped |
| 172 * either to itself (in which case in-place mapping functions do nothing), |
| 173 * or to another single code point, or to a string. |
| 174 * Aside from the string contents, these are indicated with a single int32_t |
| 175 * value as follows: |
| 176 * |
| 177 * Mapping to self: Negative values (~self instead of -self to support U+0000) |
| 178 * |
| 179 * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH |
| 180 * |
| 181 * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is |
| 182 * returned. Note that the string result may indeed have zero length. |
| 183 */ |
| 184 UCASE_MAX_STRING_LENGTH=0x1f /* 31 */ |
| 185 }; |
| 186 |
| 187 /** |
| 188 * Get the full lowercase mapping for c. |
| 189 * |
| 190 * @param csp Case mapping properties. |
| 191 * @param c Character to be mapped. |
| 192 * @param iter Character iterator, used for context-sensitive mappings. |
| 193 * See UCaseContextIterator for details. |
| 194 * If iter==NULL then a context-independent result is returned. |
| 195 * @param context Pointer to be passed into iter. |
| 196 * @param pString If the mapping result is a string, then the pointer is |
| 197 * written to *pString. |
| 198 * @param locale Locale ID for locale-dependent mappings. |
| 199 * @param locCache Initialize to 0; may be used to cache the result of parsing |
| 200 * the locale ID for subsequent calls. |
| 201 * Can be NULL. |
| 202 * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH: a negative value is ~c (c maps to itself), 0..UCASE_MAX_STRING_LENGTH is the length of the string whose pointer is written to *pString, and any larger value is the single result code point. |
| 203 * |
| 204 * @see UCaseContextIterator |
| 205 * @see UCASE_MAX_STRING_LENGTH |
| 206 * @internal |
| 207 */ |
| 208 U_CAPI int32_t U_EXPORT2 |
| 209 ucase_toFullLower(const UCaseProps *csp, UChar32 c, |
| 210 UCaseContextIterator *iter, void *context, |
| 211 const UChar **pString, |
| 212 const char *locale, int32_t *locCache); |
| 213 |
| 214 U_CAPI int32_t U_EXPORT2 |
| 215 ucase_toFullUpper(const UCaseProps *csp, UChar32 c, |
| 216 UCaseContextIterator *iter, void *context, |
| 217 const UChar **pString, |
| 218 const char *locale, int32_t *locCache); /* same parameter list and return type as ucase_toFullLower(). NOTE(review): presumably the uppercase counterpart with the same result convention -- confirm */ |
| 219 |
| 220 U_CAPI int32_t U_EXPORT2 |
| 221 ucase_toFullTitle(const UCaseProps *csp, UChar32 c, |
| 222 UCaseContextIterator *iter, void *context, |
| 223 const UChar **pString, |
| 224 const char *locale, int32_t *locCache); /* same parameter list and return type as ucase_toFullLower(). NOTE(review): presumably the titlecase counterpart with the same result convention -- confirm */ |
| 225 |
| 226 U_CAPI int32_t U_EXPORT2 |
| 227 ucase_toFullFolding(const UCaseProps *csp, UChar32 c, |
| 228 const UChar **pString, |
| 229 uint32_t options); /* unlike the three functions above, takes no iterator, context or locale arguments but an options word. NOTE(review): presumably uses the UCASE_MAX_STRING_LENGTH result convention and _FOLD_CASE_OPTIONS_MASK options -- confirm */ |
| 230 |
| 231 U_CFUNC int32_t U_EXPORT2 |
| 232 ucase_hasBinaryProperty(UChar32 c, UProperty which); /* takes no csp argument and is declared to return int32_t rather than UBool. NOTE(review): presumably nonzero when c has the binary property which -- confirm */ |
| 233 |
| 234 |
| 235 U_CDECL_BEGIN /* NOTE(review): the U_CDECL_BEGIN near the top of this header is still open at this point, so this BEGIN/END pair is nested inside it */ |
| 236 |
| 237 /** Function type sharing the parameter list and return type of ucase_toFullLower(), ucase_toFullUpper() and ucase_toFullTitle(). |
| 238 * @internal |
| 239 */ |
| 240 typedef int32_t U_CALLCONV |
| 241 UCaseMapFull(const UCaseProps *csp, UChar32 c, |
| 242 UCaseContextIterator *iter, void *context, |
| 243 const UChar **pString, |
| 244 const char *locale, int32_t *locCache); |
| 245 |
| 246 U_CDECL_END |
| 247 |
| 248 /* file definitions --------------------------------------------------------- */ |
| 249 |
| 250 #define UCASE_DATA_NAME "ucase" |
| 251 #define UCASE_DATA_TYPE "icu" |
| 252 |
| 253 /* format "cAsE": the four UCASE_FMT_ values are the ASCII codes of these four letters, in order */ |
| 254 #define UCASE_FMT_0 0x63 /* 'c' */ |
| 255 #define UCASE_FMT_1 0x41 /* 'A' */ |
| 256 #define UCASE_FMT_2 0x53 /* 'S' */ |
| 257 #define UCASE_FMT_3 0x45 /* 'E' */ |
| 258 |
| 259 /* indexes into indexes[]; the first five enumerators take the values 0..4 in declaration order */ |
| 260 enum { |
| 261 UCASE_IX_INDEX_TOP, /* =0 */ |
| 262 UCASE_IX_LENGTH, /* =1 */ |
| 263 UCASE_IX_TRIE_SIZE, /* =2 */ |
| 264 UCASE_IX_EXC_LENGTH, /* =3 */ |
| 265 UCASE_IX_UNFOLD_LENGTH, /* =4 */ |
| 266 |
| 267 UCASE_IX_MAX_FULL_LENGTH=15, /* indexes 5..14 have no named constant in this enum */ |
| 268 UCASE_IX_TOP=16 /* one more than the highest index named here */ |
| 269 }; |
| 270 |
| 271 /* definitions for 16-bit case properties word ------------------------------ */ |
| 272 |
| 273 /* 2-bit constants for types of cased characters */ |
| 274 #define UCASE_TYPE_MASK 3 /* bits 1..0 of the properties word */ |
| 275 enum { |
| 276 UCASE_NONE, /* =0 */ |
| 277 UCASE_LOWER, /* =1 */ |
| 278 UCASE_UPPER, /* =2 */ |
| 279 UCASE_TITLE /* =3 */ |
| 280 }; |
| 281 |
| 282 #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK) /* masks props down to one of the four enum values above */ |
| 283 |
| 284 #define UCASE_SENSITIVE 4 /* bit 2 */ |
| 285 #define UCASE_EXCEPTION 8 /* bit 3 */ |
| 286 |
| 287 #define UCASE_DOT_MASK 0x30 /* bits 5..4; NOTE(review): "cc" in the comments below presumably means canonical combining class -- confirm */ |
| 288 enum { |
| 289 UCASE_NO_DOT=0, /* normal characters with cc=0 */ |
| 290 UCASE_SOFT_DOTTED=0x10, /* soft-dotted characters with cc=0 */ |
| 291 UCASE_ABOVE=0x20, /* "above" accents with cc=230 */ |
| 292 UCASE_OTHER_ACCENT=0x30 /* other accent character (0<cc!=230) */ |
| 293 }; |
| 294 |
| 295 /* no exception: bits 15..6 are a 10-bit signed case mapping delta */ |
| 296 #define UCASE_DELTA_SHIFT 6 |
| 297 #define UCASE_DELTA_MASK 0xffc0 /* bits 15..6 */ |
| 298 #define UCASE_MAX_DELTA 0x1ff /* 511 */ |
| 299 #define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1) /* -512 */ |
| 300 |
| 301 #define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT) /* the cast to int16_t is applied before the shift; for negative deltas this right-shifts a negative value, which C leaves implementation-defined, so the macro relies on a sign-extending (arithmetic) shift */ |
| 302 |
| 303 /* case-ignorable uses one of the delta bits, see gencase/store.c */ |
| 304 #define UCASE_CASE_IGNORABLE 0x40 /* bit 6, the lowest bit covered by UCASE_DELTA_MASK */ |
| 305 |
| 306 /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ |
| 307 #define UCASE_EXC_SHIFT 4 |
| 308 #define UCASE_EXC_MASK 0xfff0 /* bits 15..4 */ |
| 309 #define UCASE_MAX_EXCEPTIONS 0x1000 /* 4096, the number of values a 12-bit index can take */ |
| 310 |
| 311 /* definitions for 16-bit main exceptions word ------------------------------ */ |
| 312 |
| 313 /* first 8 bits indicate values in optional slots; the enumerators below number those slots 0..7 (NOTE(review): presumably also the bit numbers of the slot flags -- confirm) */ |
| 314 enum { |
| 315 UCASE_EXC_LOWER, /* =0 */ |
| 316 UCASE_EXC_FOLD, /* =1 */ |
| 317 UCASE_EXC_UPPER, /* =2 */ |
| 318 UCASE_EXC_TITLE, /* =3 */ |
| 319 UCASE_EXC_4, /* reserved */ |
| 320 UCASE_EXC_5, /* reserved */ |
| 321 UCASE_EXC_CLOSURE, /* =6 */ |
| 322 UCASE_EXC_FULL_MAPPINGS, /* =7 */ |
| 323 UCASE_EXC_ALL_SLOTS /* one past the last slot (=8) */ |
| 324 }; |
| 325 |
| 326 /* each slot is 2 uint16_t instead of 1 */ |
| 327 #define UCASE_EXC_DOUBLE_SLOTS 0x100 /* bit 8 */ |
| 328 |
| 329 /* reserved: exception bits 10..9 */ |
| 330 |
| 331 #define UCASE_EXC_CASE_IGNORABLE 0x800 /* bit 11 */ |
| 332 |
| 333 /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */ |
| 334 #define UCASE_EXC_DOT_SHIFT 8 |
| 335 |
| 336 /* normally stored in the main word, but pushed out for larger exception indexes */ |
| 337 #define UCASE_EXC_DOT_MASK 0x3000 /* bits 13..12, i.e. 0x30<<8 */ |
| 338 enum { |
| 339 UCASE_EXC_NO_DOT=0, |
| 340 UCASE_EXC_SOFT_DOTTED=0x1000, /* UCASE_SOFT_DOTTED (0x10) shifted left by UCASE_EXC_DOT_SHIFT */ |
| 341 UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */ |
| 342 UCASE_EXC_OTHER_ACCENT=0x3000 /* other accent character (0<cc!=230) */ |
| 343 }; |
| 344 |
| 345 /* complex/conditional mappings */ |
| 346 #define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000 /* bit 14 */ |
| 347 #define UCASE_EXC_CONDITIONAL_FOLD 0x8000 /* bit 15 */ |
| 348 |
| 349 /* definitions for lengths word for full case mappings: each mask selects one 4-bit field */ |
| 350 #define UCASE_FULL_LOWER 0xf /* bits 3..0 */ |
| 351 #define UCASE_FULL_FOLDING 0xf0 /* bits 7..4 */ |
| 352 #define UCASE_FULL_UPPER 0xf00 /* bits 11..8 */ |
| 353 #define UCASE_FULL_TITLE 0xf000 /* bits 15..12 */ |
| 354 |
| 355 /* maximum lengths */ |
| 356 #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf) /* 60: four length fields of at most 0xf each */ |
| 357 #define UCASE_CLOSURE_MAX_LENGTH 0xf /* 15 */ |
| 358 |
| 359 /* constants for reverse case folding ("unfold") data; NOTE(review): presumably positions of header fields at the start of that data -- confirm in the implementation */ |
| 360 enum { |
| 361 UCASE_UNFOLD_ROWS, /* =0 */ |
| 362 UCASE_UNFOLD_ROW_WIDTH, /* =1 */ |
| 363 UCASE_UNFOLD_STRING_WIDTH /* =2 */ |
| 364 }; |
| 365 |
| 366 U_CDECL_END |
| 367 |
| 368 #endif |
OLD | NEW |