OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2004-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: ucase.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2004aug30 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * Low-level Unicode character/string case mapping code. |
| 17 * Much code moved here (and modified) from uchar.c. |
| 18 */ |
| 19 |
| 20 #include "unicode/utypes.h" |
| 21 #include "unicode/uset.h" |
| 22 #include "unicode/udata.h" /* UDataInfo */ |
| 23 #include "ucmndata.h" /* DataHeader */ |
| 24 #include "udatamem.h" |
| 25 #include "umutex.h" |
| 26 #include "uassert.h" |
| 27 #include "cmemory.h" |
| 28 #include "utrie2.h" |
| 29 #include "ucase.h" |
| 30 #include "ucln_cmn.h" |
| 31 |
| 32 struct UCaseProps { /* case-mapping property data consulted by the functions in this file */ |
| 33 UDataMemory *mem; /* NOTE(review): presumably the loaded data item backing the pointers below; not used in the code visible here -- confirm */ |
| 34 const int32_t *indexes; /* NOTE(review): presumably the index array of the data file; not used in the code visible here -- confirm */ |
| 35 const uint16_t *exceptions; /* exception words; GET_EXCEPTIONS() offsets into this array by props>>UCASE_EXC_SHIFT */ |
| 36 const UChar *unfold; /* reverse case folding table searched by ucase_addStringCaseClosure(); may be NULL */ |
| 37 |
| 38 UTrie2 trie; /* maps each code point to its 16-bit props word (read with UTRIE2_GET16) */ |
| 39 uint8_t formatVersion[4]; /* NOTE(review): presumably the data format version; not used in the code visible here -- confirm */ |
| 40 }; |
| 41 |
| 42 /* ucase_props_data.c is machine-generated by gencase --csource */ |
| 43 #include "ucase_props_data.c" |
| 44 |
| 45 /* UCaseProps singleton ----------------------------------------------------- */ |
| 46 |
| 47 U_CAPI const UCaseProps * U_EXPORT2 |
| 48 ucase_getSingleton() { |
| 49 return &ucase_props_singleton; |
| 50 } |
| 51 |
| 52 /* set of property starts for UnicodeSet ------------------------------------ */ |
| 53 |
| 54 static UBool U_CALLCONV |
| 55 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32
_t value) { |
| 56 /* add the start code point to the USet */ |
| 57 const USetAdder *sa=(const USetAdder *)context; |
| 58 sa->add(sa->set, start); |
| 59 return TRUE; |
| 60 } |
| 61 |
| 62 U_CFUNC void U_EXPORT2 |
| 63 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *
pErrorCode) { |
| 64 if(U_FAILURE(*pErrorCode)) { |
| 65 return; |
| 66 } |
| 67 |
| 68 /* add the start code point of each same-value range of the trie */ |
| 69 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa); |
| 70 |
| 71 /* add code points with hardcoded properties, plus the ones following them *
/ |
| 72 |
| 73 /* (none right now, see comment below) */ |
| 74 |
| 75 /* |
| 76 * Omit code points with hardcoded specialcasing properties |
| 77 * because we do not build property UnicodeSets for them right now. |
| 78 */ |
| 79 } |
| 80 |
| 81 /* data access primitives --------------------------------------------------- */ |
| 82 |
/* address of the exception word selected by the upper bits (>>UCASE_EXC_SHIFT) of a props word */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])

/* non-zero if the props word has the UCASE_EXCEPTION bit set */
#define PROPS_HAS_EXCEPTION(props) (UCASE_EXCEPTION&(props))
| 86 |
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 *
 * The table is generated two bits at a time: for a group whose higher bits
 * already contribute n set bits, the four values with low bits 00, 01, 10, 11
 * have n, n+1, n+1 and n+2 set bits.
 */
#define FLAGS_OFFSET_4(n) \
    (n), (n)+1, (n)+1, (n)+2
#define FLAGS_OFFSET_16(n) \
    FLAGS_OFFSET_4(n), FLAGS_OFFSET_4((n)+1), FLAGS_OFFSET_4((n)+1), FLAGS_OFFSET_4((n)+2)
#define FLAGS_OFFSET_64(n) \
    FLAGS_OFFSET_16(n), FLAGS_OFFSET_16((n)+1), FLAGS_OFFSET_16((n)+1), FLAGS_OFFSET_16((n)+2)

static const uint8_t flagsOffset[256]={
    FLAGS_OFFSET_64(0), FLAGS_OFFSET_64(1), FLAGS_OFFSET_64(1), FLAGS_OFFSET_64(2)
};

#undef FLAGS_OFFSET_64
#undef FLAGS_OFFSET_16
#undef FLAGS_OFFSET_4
| 106 |
/* non-zero if bit idx is set in flags, i.e. the optional slot idx is present */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))

/*
 * Number of slots that precede slot idx: the count of set bits below bit idx
 * in flags. idx must be small enough that the mask stays below 256,
 * the size of flagsOffset[].
 */
#define SLOT_OFFSET(flags, idx) (flagsOffset[(flags)&((1<<(idx))-1)])

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Slots are one uint16_t wide, or two (high half first) when excWord has
 * UCASE_EXC_DOUBLE_SLOTS set.
 *
 * The macro expands to a single do { } while(0) statement so that it is safe
 * inside unbraced if/else; write it with a trailing semicolon.
 * pExc16 and value are evaluated more than once and must be plain lvalues.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
| 128 |
| 129 /* simple case mappings ----------------------------------------------------- */ |
| 130 |
| 131 U_CAPI UChar32 U_EXPORT2 |
| 132 ucase_tolower(const UCaseProps *csp, UChar32 c) { |
| 133 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 134 if(!PROPS_HAS_EXCEPTION(props)) { |
| 135 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
| 136 c+=UCASE_GET_DELTA(props); |
| 137 } |
| 138 } else { |
| 139 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
| 140 uint16_t excWord=*pe++; |
| 141 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
| 142 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); |
| 143 } |
| 144 } |
| 145 return c; |
| 146 } |
| 147 |
| 148 U_CAPI UChar32 U_EXPORT2 |
| 149 ucase_toupper(const UCaseProps *csp, UChar32 c) { |
| 150 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 151 if(!PROPS_HAS_EXCEPTION(props)) { |
| 152 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { |
| 153 c+=UCASE_GET_DELTA(props); |
| 154 } |
| 155 } else { |
| 156 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
| 157 uint16_t excWord=*pe++; |
| 158 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { |
| 159 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); |
| 160 } |
| 161 } |
| 162 return c; |
| 163 } |
| 164 |
| 165 U_CAPI UChar32 U_EXPORT2 |
| 166 ucase_totitle(const UCaseProps *csp, UChar32 c) { |
| 167 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 168 if(!PROPS_HAS_EXCEPTION(props)) { |
| 169 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { |
| 170 c+=UCASE_GET_DELTA(props); |
| 171 } |
| 172 } else { |
| 173 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
| 174 uint16_t excWord=*pe++; |
| 175 int32_t idx; |
| 176 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { |
| 177 idx=UCASE_EXC_TITLE; |
| 178 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { |
| 179 idx=UCASE_EXC_UPPER; |
| 180 } else { |
| 181 return c; |
| 182 } |
| 183 GET_SLOT_VALUE(excWord, idx, pe, c); |
| 184 } |
| 185 return c; |
| 186 } |
| 187 |
| 188 static const UChar iDot[2] = { 0x69, 0x307 }; |
| 189 static const UChar jDot[2] = { 0x6a, 0x307 }; |
| 190 static const UChar iOgonekDot[3] = { 0x12f, 0x307 }; |
| 191 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 }; |
| 192 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 }; |
| 193 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 }; |
| 194 |
| 195 |
| 196 U_CFUNC void U_EXPORT2 |
| 197 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) { |
| 198 uint16_t props; |
| 199 |
| 200 /* |
| 201 * Hardcode the case closure of i and its relatives and ignore the |
| 202 * data file data for these characters. |
| 203 * The Turkic dotless i and dotted I with their case mapping conditions |
| 204 * and case folding option make the related characters behave specially. |
| 205 * This code matches their closure behavior to their case folding behavior. |
| 206 */ |
| 207 |
| 208 switch(c) { |
| 209 case 0x49: |
| 210 /* regular i and I are in one equivalence class */ |
| 211 sa->add(sa->set, 0x69); |
| 212 return; |
| 213 case 0x69: |
| 214 sa->add(sa->set, 0x49); |
| 215 return; |
| 216 case 0x130: |
| 217 /* dotted I is in a class with <0069 0307> (for canonical equivalence wi
th <0049 0307>) */ |
| 218 sa->addString(sa->set, iDot, 2); |
| 219 return; |
| 220 case 0x131: |
| 221 /* dotless i is in a class by itself */ |
| 222 return; |
| 223 default: |
| 224 /* otherwise use the data file data */ |
| 225 break; |
| 226 } |
| 227 |
| 228 props=UTRIE2_GET16(&csp->trie, c); |
| 229 if(!PROPS_HAS_EXCEPTION(props)) { |
| 230 if(UCASE_GET_TYPE(props)!=UCASE_NONE) { |
| 231 /* add the one simple case mapping, no matter what type it is */ |
| 232 int32_t delta=UCASE_GET_DELTA(props); |
| 233 if(delta!=0) { |
| 234 sa->add(sa->set, c+delta); |
| 235 } |
| 236 } |
| 237 } else { |
| 238 /* |
| 239 * c has exceptions, so there may be multiple simple and/or |
| 240 * full case mappings. Add them all. |
| 241 */ |
| 242 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props); |
| 243 const UChar *closure; |
| 244 uint16_t excWord=*pe++; |
| 245 int32_t idx, closureLength, fullLength, length; |
| 246 |
| 247 pe0=pe; |
| 248 |
| 249 /* add all simple case mappings */ |
| 250 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { |
| 251 if(HAS_SLOT(excWord, idx)) { |
| 252 pe=pe0; |
| 253 GET_SLOT_VALUE(excWord, idx, pe, c); |
| 254 sa->add(sa->set, c); |
| 255 } |
| 256 } |
| 257 |
| 258 /* get the closure string pointer & length */ |
| 259 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { |
| 260 pe=pe0; |
| 261 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); |
| 262 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved
*/ |
| 263 closure=(const UChar *)pe+1; /* behind this slot, unless there are f
ull case mappings */ |
| 264 } else { |
| 265 closureLength=0; |
| 266 closure=NULL; |
| 267 } |
| 268 |
| 269 /* add the full case folding */ |
| 270 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
| 271 pe=pe0; |
| 272 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); |
| 273 |
| 274 /* start of full case mapping strings */ |
| 275 ++pe; |
| 276 |
| 277 fullLength&=0xffff; /* bits 16 and higher are reserved */ |
| 278 |
| 279 /* skip the lowercase result string */ |
| 280 pe+=fullLength&UCASE_FULL_LOWER; |
| 281 fullLength>>=4; |
| 282 |
| 283 /* add the full case folding string */ |
| 284 length=fullLength&0xf; |
| 285 if(length!=0) { |
| 286 sa->addString(sa->set, (const UChar *)pe, length); |
| 287 pe+=length; |
| 288 } |
| 289 |
| 290 /* skip the uppercase and titlecase strings */ |
| 291 fullLength>>=4; |
| 292 pe+=fullLength&0xf; |
| 293 fullLength>>=4; |
| 294 pe+=fullLength; |
| 295 |
| 296 closure=(const UChar *)pe; /* behind full case mappings */ |
| 297 } |
| 298 |
| 299 /* add each code point in the closure string */ |
| 300 for(idx=0; idx<closureLength;) { |
| 301 U16_NEXT_UNSAFE(closure, idx, c); |
| 302 sa->add(sa->set, c); |
| 303 } |
| 304 } |
| 305 } |
| 306 |
| 307 /* |
| 308 * compare s, which has a length, with t, which has a maximum length or is NUL-t
erminated |
| 309 * must be length>0 and max>0 and length<=max |
| 310 */ |
| 311 static U_INLINE int32_t |
| 312 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) { |
| 313 int32_t c1, c2; |
| 314 |
| 315 max-=length; /* we require length<=max, so no need to decrement max in the l
oop */ |
| 316 do { |
| 317 c1=*s++; |
| 318 c2=*t++; |
| 319 if(c2==0) { |
| 320 return 1; /* reached the end of t but not of s */ |
| 321 } |
| 322 c1-=c2; |
| 323 if(c1!=0) { |
| 324 return c1; /* return difference result */ |
| 325 } |
| 326 } while(--length>0); |
| 327 /* ends with length==0 */ |
| 328 |
| 329 if(max==0 || *t==0) { |
| 330 return 0; /* equal to length of both strings */ |
| 331 } else { |
| 332 return -max; /* return lengh difference */ |
| 333 } |
| 334 } |
| 335 |
| 336 U_CFUNC UBool U_EXPORT2 |
| 337 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length
, const USetAdder *sa) { |
| 338 const UChar *unfold, *p; |
| 339 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWid
th; |
| 340 |
| 341 if(csp->unfold==NULL || s==NULL) { |
| 342 return FALSE; /* no reverse case folding data, or no string */ |
| 343 } |
| 344 if(length<=1) { |
| 345 /* the string is too short to find any match */ |
| 346 /* |
| 347 * more precise would be: |
| 348 * if(!u_strHasMoreChar32Than(s, length, 1)) |
| 349 * but this does not make much practical difference because |
| 350 * a single supplementary code point would just not be found |
| 351 */ |
| 352 return FALSE; |
| 353 } |
| 354 |
| 355 unfold=csp->unfold; |
| 356 unfoldRows=unfold[UCASE_UNFOLD_ROWS]; |
| 357 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH]; |
| 358 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH]; |
| 359 unfold+=unfoldRowWidth; |
| 360 |
| 361 if(length>unfoldStringWidth) { |
| 362 /* the string is too long to find any match */ |
| 363 return FALSE; |
| 364 } |
| 365 |
| 366 /* do a binary search for the string */ |
| 367 start=0; |
| 368 limit=unfoldRows; |
| 369 while(start<limit) { |
| 370 i=(start+limit)/2; |
| 371 p=unfold+(i*unfoldRowWidth); |
| 372 result=strcmpMax(s, length, p, unfoldStringWidth); |
| 373 |
| 374 if(result==0) { |
| 375 /* found the string: add each code point, and its case closure */ |
| 376 UChar32 c; |
| 377 |
| 378 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) { |
| 379 U16_NEXT_UNSAFE(p, i, c); |
| 380 sa->add(sa->set, c); |
| 381 ucase_addCaseClosure(csp, c, sa); |
| 382 } |
| 383 return TRUE; |
| 384 } else if(result<0) { |
| 385 limit=i; |
| 386 } else /* result>0 */ { |
| 387 start=i+1; |
| 388 } |
| 389 } |
| 390 |
| 391 return FALSE; /* string not found */ |
| 392 } |
| 393 |
| 394 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ |
| 395 U_CAPI int32_t U_EXPORT2 |
| 396 ucase_getType(const UCaseProps *csp, UChar32 c) { |
| 397 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 398 return UCASE_GET_TYPE(props); |
| 399 } |
| 400 |
| 401 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */ |
| 402 U_CAPI int32_t U_EXPORT2 |
| 403 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) { |
| 404 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 405 int32_t type=UCASE_GET_TYPE(props); |
| 406 if(props&UCASE_EXCEPTION) { |
| 407 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
| 408 if(*pe&UCASE_EXC_CASE_IGNORABLE) { |
| 409 type|=4; |
| 410 } |
| 411 } else if(type==UCASE_NONE && (props&UCASE_CASE_IGNORABLE)) { |
| 412 type|=4; |
| 413 } |
| 414 return type; |
| 415 } |
| 416 |
| 417 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ |
| 418 static U_INLINE int32_t |
| 419 getDotType(const UCaseProps *csp, UChar32 c) { |
| 420 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 421 if(!PROPS_HAS_EXCEPTION(props)) { |
| 422 return props&UCASE_DOT_MASK; |
| 423 } else { |
| 424 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
| 425 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; |
| 426 } |
| 427 } |
| 428 |
| 429 U_CAPI UBool U_EXPORT2 |
| 430 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) { |
| 431 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED); |
| 432 } |
| 433 |
| 434 U_CAPI UBool U_EXPORT2 |
| 435 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) { |
| 436 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 437 return (UBool)((props&UCASE_SENSITIVE)!=0); |
| 438 } |
| 439 |
| 440 /* string casing ------------------------------------------------------------ */ |
| 441 |
| 442 /* |
| 443 * These internal functions form the core of string case mappings. |
| 444 * They map single code points to result code points or strings and take |
| 445 * all necessary conditions (context, locale ID, options) into account. |
| 446 * |
| 447 * They do not iterate over the source or write to the destination |
| 448 * so that the same functions are useful for non-standard string storage, |
| 449 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. |
| 450 * For the same reason, the "surrounding text" context is passed in as a |
| 451 * UCaseContextIterator which does not make any assumptions about |
| 452 * the underlying storage. |
| 453 * |
| 454 * This section contains helper functions that check for conditions |
| 455 * in the input text surrounding the current code point |
| 456 * according to SpecialCasing.txt. |
| 457 * |
| 458 * Each helper function gets the index |
| 459 * - after the current code point if it looks at following text |
| 460 * - before the current code point if it looks at preceding text |
| 461 * |
| 462 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: |
| 463 * |
| 464 * Final_Sigma |
| 465 * C is preceded by a sequence consisting of |
| 466 * a cased letter and a case-ignorable sequence, |
| 467 * and C is not followed by a sequence consisting of |
| 468 * an ignorable sequence and then a cased letter. |
| 469 * |
| 470 * More_Above |
| 471 * C is followed by one or more characters of combining class 230 (ABOVE) |
| 472 * in the combining character sequence. |
| 473 * |
| 474 * After_Soft_Dotted |
| 475 * The last preceding character with combining class of zero before C |
| 476 * was Soft_Dotted, |
| 477 * and there is no intervening combining character class 230 (ABOVE). |
| 478 * |
| 479 * Before_Dot |
| 480 * C is followed by combining dot above (U+0307). |
| 481 * Any sequence of characters with a combining class that is neither 0 nor 230 |
| 482 * may intervene between the current character and the combining dot above. |
| 483 * |
| 484 * The erratum from 2002-10-31 adds the condition |
| 485 * |
| 486 * After_I |
| 487 * The last preceding base character was an uppercase I, and there is no |
| 488 * intervening combining character class 230 (ABOVE). |
| 489 * |
| 490 * (See Jitterbug 2344 and the comments on After_I below.) |
| 491 * |
| 492 * Helper definitions in Unicode 3.2 UAX 21: |
| 493 * |
| 494 * D1. A character C is defined to be cased |
| 495 * if it meets any of the following criteria: |
| 496 * |
| 497 * - The general category of C is Titlecase Letter (Lt) |
| 498 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase |
| 499 * - Given D = NFD(C), then it is not the case that: |
| 500 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) |
| 501 * (This third criterion does not add any characters to the list |
| 502 * for Unicode 3.2. Ignored.) |
| 503 * |
| 504 * D2. A character C is defined to be case-ignorable |
| 505 * if it meets either of the following criteria: |
| 506 * |
| 507 * - The general category of C is |
| 508 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or |
| 509 * Letter Modifier (Lm), or Symbol Modifier (Sk) |
| 510 * - C is one of the following characters |
| 511 * U+0027 APOSTROPHE |
| 512 * U+00AD SOFT HYPHEN (SHY) |
| 513 * U+2019 RIGHT SINGLE QUOTATION MARK |
| 514 * (the preferred character for apostrophe) |
| 515 * |
| 516 * D3. A case-ignorable sequence is a sequence of |
| 517 * zero or more case-ignorable characters. |
| 518 */ |
| 519 |
/*
 * Letter tests for locale IDs: each is_x(c) accepts the lowercase and the
 * uppercase form of one letter. The argument may be evaluated twice.
 */
#define is_either(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_a(c) is_either(c, 'a', 'A')
#define is_d(c) is_either(c, 'd', 'D')
#define is_e(c) is_either(c, 'e', 'E')
#define is_i(c) is_either(c, 'i', 'I')
#define is_l(c) is_either(c, 'l', 'L')
#define is_n(c) is_either(c, 'n', 'N')
#define is_r(c) is_either(c, 'r', 'R')
#define is_t(c) is_either(c, 't', 'T')
#define is_u(c) is_either(c, 'u', 'U')
#define is_z(c) is_either(c, 'z', 'Z')

/* end of the language subtag: underscore, hyphen, or the terminating NUL */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
| 533 |
| 534 /** |
| 535 * Requires non-NULL locale ID but otherwise does the equivalent of |
| 536 * checking for language codes as if uloc_getLanguage() were called: |
| 537 * Accepts both 2- and 3-letter codes and accepts case variants. |
| 538 */ |
| 539 U_CFUNC int32_t |
| 540 ucase_getCaseLocale(const char *locale, int32_t *locCache) { |
| 541 int32_t result; |
| 542 char c; |
| 543 |
| 544 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) { |
| 545 return result; |
| 546 } |
| 547 |
| 548 result=UCASE_LOC_ROOT; |
| 549 |
| 550 /* |
| 551 * This function used to use uloc_getLanguage(), but the current code |
| 552 * removes the dependency of this low-level code on uloc implementation code |
| 553 * and is faster because not the whole locale ID has to be |
| 554 * examined and copied/transformed. |
| 555 * |
| 556 * Because this code does not want to depend on uloc, the caller must |
| 557 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault(). |
| 558 */ |
| 559 c=*locale++; |
| 560 if(is_t(c)) { |
| 561 /* tr or tur? */ |
| 562 c=*locale++; |
| 563 if(is_u(c)) { |
| 564 c=*locale++; |
| 565 } |
| 566 if(is_r(c)) { |
| 567 c=*locale; |
| 568 if(is_sep(c)) { |
| 569 result=UCASE_LOC_TURKISH; |
| 570 } |
| 571 } |
| 572 } else if(is_a(c)) { |
| 573 /* az or aze? */ |
| 574 c=*locale++; |
| 575 if(is_z(c)) { |
| 576 c=*locale++; |
| 577 if(is_e(c)) { |
| 578 c=*locale; |
| 579 } |
| 580 if(is_sep(c)) { |
| 581 result=UCASE_LOC_TURKISH; |
| 582 } |
| 583 } |
| 584 } else if(is_l(c)) { |
| 585 /* lt or lit? */ |
| 586 c=*locale++; |
| 587 if(is_i(c)) { |
| 588 c=*locale++; |
| 589 } |
| 590 if(is_t(c)) { |
| 591 c=*locale; |
| 592 if(is_sep(c)) { |
| 593 result=UCASE_LOC_LITHUANIAN; |
| 594 } |
| 595 } |
| 596 } else if(is_n(c)) { |
| 597 /* nl or nld? */ |
| 598 c=*locale++; |
| 599 if(is_l(c)) { |
| 600 c=*locale++; |
| 601 if(is_d(c)) { |
| 602 c=*locale; |
| 603 } |
| 604 if(is_sep(c)) { |
| 605 result=UCASE_LOC_DUTCH; |
| 606 } |
| 607 } |
| 608 } |
| 609 |
| 610 if(locCache!=NULL) { |
| 611 *locCache=result; |
| 612 } |
| 613 return result; |
| 614 } |
| 615 |
| 616 /* |
| 617 * Is followed by |
| 618 * {case-ignorable}* cased |
| 619 * ? |
| 620 * (dir determines looking forward/backward) |
| 621 * If a character is case-ignorable, it is skipped regardless of whether |
| 622 * it is also cased or not. |
| 623 */ |
| 624 static UBool |
| 625 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void
*context, int8_t dir) { |
| 626 UChar32 c; |
| 627 |
| 628 if(iter==NULL) { |
| 629 return FALSE; |
| 630 } |
| 631 |
| 632 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { |
| 633 int32_t type=ucase_getTypeOrIgnorable(csp, c); |
| 634 if(type&4) { |
| 635 /* case-ignorable, continue with the loop */ |
| 636 } else if(type!=UCASE_NONE) { |
| 637 return TRUE; /* followed by cased letter */ |
| 638 } else { |
| 639 return FALSE; /* uncased and not case-ignorable */ |
| 640 } |
| 641 } |
| 642 |
| 643 return FALSE; /* not followed by cased letter */ |
| 644 } |
| 645 |
| 646 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */ |
| 647 static UBool |
| 648 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *
context) { |
| 649 UChar32 c; |
| 650 int32_t dotType; |
| 651 int8_t dir; |
| 652 |
| 653 if(iter==NULL) { |
| 654 return FALSE; |
| 655 } |
| 656 |
| 657 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { |
| 658 dotType=getDotType(csp, c); |
| 659 if(dotType==UCASE_SOFT_DOTTED) { |
| 660 return TRUE; /* preceded by TYPE_i */ |
| 661 } else if(dotType!=UCASE_OTHER_ACCENT) { |
| 662 return FALSE; /* preceded by different base character (not TYPE_i),
or intervening cc==230 */ |
| 663 } |
| 664 } |
| 665 |
| 666 return FALSE; /* not preceded by TYPE_i */ |
| 667 } |
| 668 |
| 669 /* |
| 670 * See Jitterbug 2344: |
| 671 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above |
| 672 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because |
| 673 * we made those releases compatible with Unicode 3.2 which had not fixed |
| 674 * a related bug in SpecialCasing.txt. |
| 675 * |
| 676 * From the Jitterbug 2344 text: |
| 677 * ... this bug is listed as a Unicode erratum |
| 678 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html |
| 679 * <quote> |
| 680 * There are two errors in SpecialCasing.txt. |
| 681 * 1. Missing semicolons on two lines. ... [irrelevant for ICU] |
| 682 * 2. An incorrect context definition. Correct as follows: |
| 683 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE |
| 684 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE |
| 685 * --- |
| 686 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE |
| 687 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE |
| 688 * where the context After_I is defined as: |
| 689 * The last preceding base character was an uppercase I, and there is no |
| 690 * intervening combining character class 230 (ABOVE). |
| 691 * </quote> |
| 692 * |
| 693 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: |
| 694 * |
| 695 * # When lowercasing, remove dot_above in the sequence I + dot_above, which wil
l turn into i. |
| 696 * # This matches the behavior of the canonically equivalent I-dot_above |
| 697 * |
| 698 * See also the description in this place in older versions of uchar.c (revision
1.100). |
| 699 * |
| 700 * Markus W. Scherer 2003-feb-15 |
| 701 */ |
| 702 |
| 703 /* Is preceded by base character 'I' with no intervening cc=230 ? */ |
| 704 static UBool |
| 705 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context)
{ |
| 706 UChar32 c; |
| 707 int32_t dotType; |
| 708 int8_t dir; |
| 709 |
| 710 if(iter==NULL) { |
| 711 return FALSE; |
| 712 } |
| 713 |
| 714 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { |
| 715 if(c==0x49) { |
| 716 return TRUE; /* preceded by I */ |
| 717 } |
| 718 dotType=getDotType(csp, c); |
| 719 if(dotType!=UCASE_OTHER_ACCENT) { |
| 720 return FALSE; /* preceded by different base character (not I), or in
tervening cc==230 */ |
| 721 } |
| 722 } |
| 723 |
| 724 return FALSE; /* not preceded by I */ |
| 725 } |
| 726 |
| 727 /* Is followed by one or more cc==230 ? */ |
| 728 static UBool |
| 729 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *c
ontext) { |
| 730 UChar32 c; |
| 731 int32_t dotType; |
| 732 int8_t dir; |
| 733 |
| 734 if(iter==NULL) { |
| 735 return FALSE; |
| 736 } |
| 737 |
| 738 for(dir=1; (c=iter(context, dir))>=0; dir=0) { |
| 739 dotType=getDotType(csp, c); |
| 740 if(dotType==UCASE_ABOVE) { |
| 741 return TRUE; /* at least one cc==230 following */ |
| 742 } else if(dotType!=UCASE_OTHER_ACCENT) { |
| 743 return FALSE; /* next base character, no more cc==230 following */ |
| 744 } |
| 745 } |
| 746 |
| 747 return FALSE; /* no more cc==230 following */ |
| 748 } |
| 749 |
| 750 /* Is followed by a dot above (without cc==230 in between) ? */ |
| 751 static UBool |
| 752 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *co
ntext) { |
| 753 UChar32 c; |
| 754 int32_t dotType; |
| 755 int8_t dir; |
| 756 |
| 757 if(iter==NULL) { |
| 758 return FALSE; |
| 759 } |
| 760 |
| 761 for(dir=1; (c=iter(context, dir))>=0; dir=0) { |
| 762 if(c==0x307) { |
| 763 return TRUE; |
| 764 } |
| 765 dotType=getDotType(csp, c); |
| 766 if(dotType!=UCASE_OTHER_ACCENT) { |
| 767 return FALSE; /* next base character or cc==230 in between */ |
| 768 } |
| 769 } |
| 770 |
| 771 return FALSE; /* no dot above following */ |
| 772 } |
| 773 |
| 774 U_CAPI int32_t U_EXPORT2 |
| 775 ucase_toFullLower(const UCaseProps *csp, UChar32 c, |
| 776 UCaseContextIterator *iter, void *context, |
| 777 const UChar **pString, |
| 778 const char *locale, int32_t *locCache) |
| 779 { |
| 780 UChar32 result=c; |
| 781 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 782 if(!PROPS_HAS_EXCEPTION(props)) { |
| 783 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
| 784 result=c+UCASE_GET_DELTA(props); |
| 785 } |
| 786 } else { |
| 787 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; |
| 788 uint16_t excWord=*pe++; |
| 789 int32_t full; |
| 790 |
| 791 pe2=pe; |
| 792 |
| 793 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { |
| 794 /* use hardcoded conditions and mappings */ |
| 795 int32_t loc=ucase_getCaseLocale(locale, locCache); |
| 796 |
| 797 /* |
| 798 * Test for conditional mappings first |
| 799 * (otherwise the unconditional default mappings are always taken)
, |
| 800 * then test for characters that have unconditional mappings in Spec
ialCasing.txt, |
| 801 * then get the UnicodeData.txt mappings. |
| 802 */ |
| 803 if( loc==UCASE_LOC_LITHUANIAN && |
| 804 /* base characters, find accents above */ |
| 805 (((c==0x49 || c==0x4a || c==0x12e) && |
| 806 isFollowedByMoreAbove(csp, iter, context)) || |
| 807 /* precomposed with accent above, no need to find one */ |
| 808 (c==0xcc || c==0xcd || c==0x128)) |
| 809 ) { |
| 810 /* |
| 811 # Lithuanian |
| 812 |
| 813 # Lithuanian retains the dot in a lowercase i when followed
by accents. |
| 814 |
| 815 # Introduce an explicit dot above when lowercasing capital I
's and J's |
| 816 # whenever there are more accents above. |
| 817 # (of the accents used in Lithuanian: grave, acute, tilde ab
ove, and ogonek) |
| 818 |
| 819 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL
LETTER I |
| 820 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL
LETTER J |
| 821 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL
LETTER I WITH OGONEK |
| 822 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER
I WITH GRAVE |
| 823 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER
I WITH ACUTE |
| 824 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER
I WITH TILDE |
| 825 */ |
| 826 switch(c) { |
| 827 case 0x49: /* LATIN CAPITAL LETTER I */ |
| 828 *pString=iDot; |
| 829 return 2; |
| 830 case 0x4a: /* LATIN CAPITAL LETTER J */ |
| 831 *pString=jDot; |
| 832 return 2; |
| 833 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ |
| 834 *pString=iOgonekDot; |
| 835 return 2; |
| 836 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ |
| 837 *pString=iDotGrave; |
| 838 return 3; |
| 839 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ |
| 840 *pString=iDotAcute; |
| 841 return 3; |
| 842 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ |
| 843 *pString=iDotTilde; |
| 844 return 3; |
| 845 default: |
| 846 return 0; /* will not occur */ |
| 847 } |
| 848 /* # Turkish and Azeri */ |
| 849 } else if(loc==UCASE_LOC_TURKISH && c==0x130) { |
| 850 /* |
| 851 # I and i-dotless; I-dot and i are case pairs in Turkish and
Azeri |
| 852 # The following rules handle those cases. |
| 853 |
| 854 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT
ABOVE |
| 855 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT
ABOVE |
| 856 */ |
| 857 return 0x69; |
| 858 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp,
iter, context)) { |
| 859 /* |
| 860 # When lowercasing, remove dot_above in the sequence I + dot
_above, which will turn into i. |
| 861 # This matches the behavior of the canonically equivalent I-
dot_above |
| 862 |
| 863 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE |
| 864 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE |
| 865 */ |
| 866 return 0; /* remove the dot (continue without output) */ |
| 867 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove
(csp, iter, context)) { |
| 868 /* |
| 869 # When lowercasing, unless an I is before a dot_above, it tu
rns into a dotless i. |
| 870 |
| 871 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL L
ETTER I |
| 872 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL L
ETTER I |
| 873 */ |
| 874 return 0x131; |
| 875 } else if(c==0x130) { |
| 876 /* |
| 877 # Preserve canonical equivalence for I with dot. Turkic is h
andled below. |
| 878 |
| 879 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH D
OT ABOVE |
| 880 */ |
| 881 *pString=iDot; |
| 882 return 2; |
| 883 } else if( c==0x3a3 && |
| 884 !isFollowedByCasedLetter(csp, iter, context, 1) && |
| 885 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=pr
eceded */ |
| 886 ) { |
| 887 /* greek capital sigma maps depending on surrounding cased lette
rs (see SpecialCasing.txt) */ |
| 888 /* |
| 889 # Special case for final form of sigma |
| 890 |
| 891 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER
SIGMA |
| 892 */ |
| 893 return 0x3c2; /* greek small final sigma */ |
| 894 } else { |
| 895 /* no known conditional special case mapping, use a normal mappi
ng */ |
| 896 } |
| 897 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
| 898 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); |
| 899 full&=UCASE_FULL_LOWER; |
| 900 if(full!=0) { |
| 901 /* set the output pointer to the lowercase mapping */ |
| 902 *pString=pe+1; |
| 903 |
| 904 /* return the string length */ |
| 905 return full; |
| 906 } |
| 907 } |
| 908 |
| 909 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
| 910 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); |
| 911 } |
| 912 } |
| 913 |
| 914 return (result==c) ? ~result : result; |
| 915 } |
| 916 |
| 917 /* internal */ |
| 918 static int32_t |
| 919 toUpperOrTitle(const UCaseProps *csp, UChar32 c, |
| 920 UCaseContextIterator *iter, void *context, |
| 921 const UChar **pString, |
| 922 const char *locale, int32_t *locCache, |
| 923 UBool upperNotTitle) { |
| 924 UChar32 result=c; |
| 925 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 926 if(!PROPS_HAS_EXCEPTION(props)) { |
| 927 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { |
| 928 result=c+UCASE_GET_DELTA(props); |
| 929 } |
| 930 } else { |
| 931 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; |
| 932 uint16_t excWord=*pe++; |
| 933 int32_t full, idx; |
| 934 |
| 935 pe2=pe; |
| 936 |
| 937 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { |
| 938 /* use hardcoded conditions and mappings */ |
| 939 int32_t loc=ucase_getCaseLocale(locale, locCache); |
| 940 |
| 941 if(loc==UCASE_LOC_TURKISH && c==0x69) { |
| 942 /* |
| 943 # Turkish and Azeri |
| 944 |
| 945 # I and i-dotless; I-dot and i are case pairs in Turkish and
Azeri |
| 946 # The following rules handle those cases. |
| 947 |
| 948 # When uppercasing, i turns into a dotted capital I |
| 949 |
| 950 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I |
| 951 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I |
| 952 */ |
| 953 return 0x130; |
| 954 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftD
otted(csp, iter, context)) { |
| 955 /* |
| 956 # Lithuanian |
| 957 |
| 958 # Lithuanian retains the dot in a lowercase i when followed
by accents. |
| 959 |
| 960 # Remove DOT ABOVE after "i" with upper or titlecase |
| 961 |
| 962 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE |
| 963 */ |
| 964 return 0; /* remove the dot (continue without output) */ |
| 965 } else { |
| 966 /* no known conditional special case mapping, use a normal mappi
ng */ |
| 967 } |
| 968 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
| 969 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); |
| 970 |
| 971 /* start of full case mapping strings */ |
| 972 ++pe; |
| 973 |
| 974 /* skip the lowercase and case-folding result strings */ |
| 975 pe+=full&UCASE_FULL_LOWER; |
| 976 full>>=4; |
| 977 pe+=full&0xf; |
| 978 full>>=4; |
| 979 |
| 980 if(upperNotTitle) { |
| 981 full&=0xf; |
| 982 } else { |
| 983 /* skip the uppercase result string */ |
| 984 pe+=full&0xf; |
| 985 full=(full>>4)&0xf; |
| 986 } |
| 987 |
| 988 if(full!=0) { |
| 989 /* set the output pointer to the result string */ |
| 990 *pString=pe; |
| 991 |
| 992 /* return the string length */ |
| 993 return full; |
| 994 } |
| 995 } |
| 996 |
| 997 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { |
| 998 idx=UCASE_EXC_TITLE; |
| 999 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { |
| 1000 /* here, titlecase is same as uppercase */ |
| 1001 idx=UCASE_EXC_UPPER; |
| 1002 } else { |
| 1003 return ~c; |
| 1004 } |
| 1005 GET_SLOT_VALUE(excWord, idx, pe2, result); |
| 1006 } |
| 1007 |
| 1008 return (result==c) ? ~result : result; |
| 1009 } |
| 1010 |
| 1011 U_CAPI int32_t U_EXPORT2 |
| 1012 ucase_toFullUpper(const UCaseProps *csp, UChar32 c, |
| 1013 UCaseContextIterator *iter, void *context, |
| 1014 const UChar **pString, |
| 1015 const char *locale, int32_t *locCache) { |
| 1016 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE
); |
| 1017 } |
| 1018 |
| 1019 U_CAPI int32_t U_EXPORT2 |
| 1020 ucase_toFullTitle(const UCaseProps *csp, UChar32 c, |
| 1021 UCaseContextIterator *iter, void *context, |
| 1022 const UChar **pString, |
| 1023 const char *locale, int32_t *locCache) { |
| 1024 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALS
E); |
| 1025 } |
| 1026 |
| 1027 /* case folding ------------------------------------------------------------- */ |
| 1028 |
| 1029 /* |
| 1030 * Case folding is similar to lowercasing. |
| 1031 * The result may be a simple mapping, i.e., a single code point, or |
| 1032 * a full mapping, i.e., a string. |
| 1033 * If the case folding for a code point is the same as its simple (1:1) lowercas
e mapping, |
| 1034 * then only the lowercase mapping is stored. |
| 1035 * |
| 1036 * Some special cases are hardcoded because their conditions cannot be |
| 1037 * parsed and processed from CaseFolding.txt. |
| 1038 * |
| 1039 * Unicode 3.2 CaseFolding.txt specifies for its status field: |
| 1040 |
| 1041 # C: common case folding, common mappings shared by both simple and full mapping
s. |
| 1042 # F: full case folding, mappings that cause strings to grow in length. Multiple
characters are separated by spaces. |
| 1043 # S: simple case folding, mappings to single characters where different from F. |
| 1044 # T: special case for uppercase I and dotted uppercase I |
| 1045 # - For non-Turkic languages, this mapping is normally not used. |
| 1046 # - For Turkic languages (tr, az), this mapping can be used instead of the no
rmal mapping for these characters. |
| 1047 # |
| 1048 # Usage: |
| 1049 # A. To do a simple case folding, use the mappings with status C + S. |
| 1050 # B. To do a full case folding, use the mappings with status C + F. |
| 1051 # |
| 1052 # The mappings with status T can be used or omitted depending on the desired
case-folding |
| 1053 # behavior. (The default option is to exclude them.) |
| 1054 |
| 1055 * Unicode 3.2 has 'T' mappings as follows: |
| 1056 |
| 1057 0049; T; 0131; # LATIN CAPITAL LETTER I |
| 1058 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
| 1059 |
| 1060 * while the default mappings for these code points are: |
| 1061 |
| 1062 0049; C; 0069; # LATIN CAPITAL LETTER I |
| 1063 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
| 1064 |
| 1065 * U+0130 has no simple case folding (simple-case-folds to itself). |
| 1066 */ |
| 1067 |
| 1068 /* return the simple case folding mapping for c */ |
| 1069 U_CAPI UChar32 U_EXPORT2 |
| 1070 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) { |
| 1071 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 1072 if(!PROPS_HAS_EXCEPTION(props)) { |
| 1073 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
| 1074 c+=UCASE_GET_DELTA(props); |
| 1075 } |
| 1076 } else { |
| 1077 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
| 1078 uint16_t excWord=*pe++; |
| 1079 int32_t idx; |
| 1080 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { |
| 1081 /* special case folding mappings, hardcoded */ |
| 1082 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { |
| 1083 /* default mappings */ |
| 1084 if(c==0x49) { |
| 1085 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ |
| 1086 return 0x69; |
| 1087 } else if(c==0x130) { |
| 1088 /* no simple case folding for U+0130 */ |
| 1089 return c; |
| 1090 } |
| 1091 } else { |
| 1092 /* Turkic mappings */ |
| 1093 if(c==0x49) { |
| 1094 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ |
| 1095 return 0x131; |
| 1096 } else if(c==0x130) { |
| 1097 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
| 1098 return 0x69; |
| 1099 } |
| 1100 } |
| 1101 } |
| 1102 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { |
| 1103 idx=UCASE_EXC_FOLD; |
| 1104 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
| 1105 idx=UCASE_EXC_LOWER; |
| 1106 } else { |
| 1107 return c; |
| 1108 } |
| 1109 GET_SLOT_VALUE(excWord, idx, pe, c); |
| 1110 } |
| 1111 return c; |
| 1112 } |
| 1113 |
| 1114 /* |
| 1115 * Issue for canonical caseless match (UAX #21): |
| 1116 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve |
| 1117 * canonical equivalence, unlike default-option casefolding. |
| 1118 * For example, I-grave and I + grave fold to strings that are not canonically |
| 1119 * equivalent. |
| 1120 * For more details, see the comment in unorm_compare() in unorm.cpp |
| 1121 * and the intermediate prototype changes for Jitterbug 2021. |
| 1122 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) |
| 1123 * |
| 1124 * This did not get fixed because it appears that it is not possible to fix |
| 1125 * it for uppercase and lowercase characters (I-grave vs. i-grave) |
| 1126 * together in a way that they still fold to common result strings. |
| 1127 */ |
| 1128 |
| 1129 U_CAPI int32_t U_EXPORT2 |
| 1130 ucase_toFullFolding(const UCaseProps *csp, UChar32 c, |
| 1131 const UChar **pString, |
| 1132 uint32_t options) |
| 1133 { |
| 1134 UChar32 result=c; |
| 1135 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
| 1136 if(!PROPS_HAS_EXCEPTION(props)) { |
| 1137 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
| 1138 result=c+UCASE_GET_DELTA(props); |
| 1139 } |
| 1140 } else { |
| 1141 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; |
| 1142 uint16_t excWord=*pe++; |
| 1143 int32_t full, idx; |
| 1144 |
| 1145 pe2=pe; |
| 1146 |
| 1147 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { |
| 1148 /* use hardcoded conditions and mappings */ |
| 1149 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { |
| 1150 /* default mappings */ |
| 1151 if(c==0x49) { |
| 1152 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ |
| 1153 return 0x69; |
| 1154 } else if(c==0x130) { |
| 1155 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABO
VE */ |
| 1156 *pString=iDot; |
| 1157 return 2; |
| 1158 } |
| 1159 } else { |
| 1160 /* Turkic mappings */ |
| 1161 if(c==0x49) { |
| 1162 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ |
| 1163 return 0x131; |
| 1164 } else if(c==0x130) { |
| 1165 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
| 1166 return 0x69; |
| 1167 } |
| 1168 } |
| 1169 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
| 1170 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); |
| 1171 |
| 1172 /* start of full case mapping strings */ |
| 1173 ++pe; |
| 1174 |
| 1175 /* skip the lowercase result string */ |
| 1176 pe+=full&UCASE_FULL_LOWER; |
| 1177 full=(full>>4)&0xf; |
| 1178 |
| 1179 if(full!=0) { |
| 1180 /* set the output pointer to the result string */ |
| 1181 *pString=pe; |
| 1182 |
| 1183 /* return the string length */ |
| 1184 return full; |
| 1185 } |
| 1186 } |
| 1187 |
| 1188 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { |
| 1189 idx=UCASE_EXC_FOLD; |
| 1190 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
| 1191 idx=UCASE_EXC_LOWER; |
| 1192 } else { |
| 1193 return ~c; |
| 1194 } |
| 1195 GET_SLOT_VALUE(excWord, idx, pe2, result); |
| 1196 } |
| 1197 |
| 1198 return (result==c) ? ~result : result; |
| 1199 } |
| 1200 |
| 1201 /* case mapping properties API ---------------------------------------------- */ |
| 1202 |
/*
 * Address of the built-in case properties singleton.
 * The expansion is parenthesized so that it is safe in any expression
 * context, e.g. GET_CASE_PROPS()->member.
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
| 1204 |
| 1205 /* public API (see uchar.h) */ |
| 1206 |
| 1207 U_CAPI UBool U_EXPORT2 |
| 1208 u_isULowercase(UChar32 c) { |
| 1209 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c)); |
| 1210 } |
| 1211 |
| 1212 U_CAPI UBool U_EXPORT2 |
| 1213 u_isUUppercase(UChar32 c) { |
| 1214 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c)); |
| 1215 } |
| 1216 |
| 1217 /* Transforms the Unicode character to its lower case equivalent.*/ |
| 1218 U_CAPI UChar32 U_EXPORT2 |
| 1219 u_tolower(UChar32 c) { |
| 1220 return ucase_tolower(GET_CASE_PROPS(), c); |
| 1221 } |
| 1222 |
| 1223 /* Transforms the Unicode character to its upper case equivalent.*/ |
| 1224 U_CAPI UChar32 U_EXPORT2 |
| 1225 u_toupper(UChar32 c) { |
| 1226 return ucase_toupper(GET_CASE_PROPS(), c); |
| 1227 } |
| 1228 |
| 1229 /* Transforms the Unicode character to its title case equivalent.*/ |
| 1230 U_CAPI UChar32 U_EXPORT2 |
| 1231 u_totitle(UChar32 c) { |
| 1232 return ucase_totitle(GET_CASE_PROPS(), c); |
| 1233 } |
| 1234 |
| 1235 /* return the simple case folding mapping for c */ |
| 1236 U_CAPI UChar32 U_EXPORT2 |
| 1237 u_foldCase(UChar32 c, uint32_t options) { |
| 1238 return ucase_fold(GET_CASE_PROPS(), c, options); |
| 1239 } |
| 1240 |
| 1241 U_CFUNC int32_t U_EXPORT2 |
| 1242 ucase_hasBinaryProperty(UChar32 c, UProperty which) { |
| 1243 /* case mapping properties */ |
| 1244 const UChar *resultString; |
| 1245 int32_t locCache; |
| 1246 const UCaseProps *csp=GET_CASE_PROPS(); |
| 1247 if(csp==NULL) { |
| 1248 return FALSE; |
| 1249 } |
| 1250 switch(which) { |
| 1251 case UCHAR_LOWERCASE: |
| 1252 return (UBool)(UCASE_LOWER==ucase_getType(csp, c)); |
| 1253 case UCHAR_UPPERCASE: |
| 1254 return (UBool)(UCASE_UPPER==ucase_getType(csp, c)); |
| 1255 case UCHAR_SOFT_DOTTED: |
| 1256 return ucase_isSoftDotted(csp, c); |
| 1257 case UCHAR_CASE_SENSITIVE: |
| 1258 return ucase_isCaseSensitive(csp, c); |
| 1259 case UCHAR_CASED: |
| 1260 return (UBool)(UCASE_NONE!=ucase_getType(csp, c)); |
| 1261 case UCHAR_CASE_IGNORABLE: |
| 1262 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2); |
| 1263 /* |
| 1264 * Note: The following Changes_When_Xyz are defined as testing whether |
| 1265 * the NFD form of the input changes when Xyz-case-mapped. |
| 1266 * However, this simpler implementation of these properties, |
| 1267 * ignoring NFD, passes the tests. |
| 1268 * The implementation needs to be changed if the tests start failing. |
| 1269 * When that happens, optimizations should be used to work with the |
| 1270 * per-single-code point ucase_toFullXyz() functions unless |
| 1271 * the NFD form has more than one code point, |
| 1272 * and the property starts set needs to be the union of the |
| 1273 * start sets for normalization and case mappings. |
| 1274 */ |
| 1275 case UCHAR_CHANGES_WHEN_LOWERCASED: |
| 1276 locCache=UCASE_LOC_ROOT; |
| 1277 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "",
&locCache)>=0); |
| 1278 case UCHAR_CHANGES_WHEN_UPPERCASED: |
| 1279 locCache=UCASE_LOC_ROOT; |
| 1280 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "",
&locCache)>=0); |
| 1281 case UCHAR_CHANGES_WHEN_TITLECASED: |
| 1282 locCache=UCASE_LOC_ROOT; |
| 1283 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "",
&locCache)>=0); |
| 1284 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ |
| 1285 case UCHAR_CHANGES_WHEN_CASEMAPPED: |
| 1286 locCache=UCASE_LOC_ROOT; |
| 1287 return (UBool)( |
| 1288 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>
=0 || |
| 1289 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>
=0 || |
| 1290 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>
=0); |
| 1291 default: |
| 1292 return FALSE; |
| 1293 } |
| 1294 } |
OLD | NEW |