OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2005-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: ucasemap.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2005may06 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * Case mapping service object and functions using it. |
| 17 */ |
| 18 |
| 19 #include "unicode/utypes.h" |
| 20 #include "unicode/uloc.h" |
| 21 #include "unicode/ustring.h" |
| 22 #include "unicode/ucasemap.h" |
| 23 #if !UCONFIG_NO_BREAK_ITERATION |
| 24 #include "unicode/ubrk.h" |
| 25 #include "unicode/utext.h" |
| 26 #endif |
| 27 #include "cmemory.h" |
| 28 #include "cstring.h" |
| 29 #include "ucase.h" |
| 30 #include "ustr_imp.h" |
| 31 |
| 32 /* UCaseMap service object -------------------------------------------------- */ |
| 33 |
| 34 U_CAPI UCaseMap * U_EXPORT2 |
| 35 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { |
| 36 UCaseMap *csm; |
| 37 |
| 38 if(U_FAILURE(*pErrorCode)) { |
| 39 return NULL; |
| 40 } |
| 41 |
| 42 csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap)); |
| 43 if(csm==NULL) { |
| 44 return NULL; |
| 45 } |
| 46 uprv_memset(csm, 0, sizeof(UCaseMap)); |
| 47 |
| 48 csm->csp=ucase_getSingleton(); |
| 49 ucasemap_setLocale(csm, locale, pErrorCode); |
| 50 if(U_FAILURE(*pErrorCode)) { |
| 51 uprv_free(csm); |
| 52 return NULL; |
| 53 } |
| 54 |
| 55 csm->options=options; |
| 56 return csm; |
| 57 } |
| 58 |
| 59 U_CAPI void U_EXPORT2 |
| 60 ucasemap_close(UCaseMap *csm) { |
| 61 if(csm!=NULL) { |
| 62 #if !UCONFIG_NO_BREAK_ITERATION |
| 63 ubrk_close(csm->iter); |
| 64 #endif |
| 65 uprv_free(csm); |
| 66 } |
| 67 } |
| 68 |
| 69 U_CAPI const char * U_EXPORT2 |
| 70 ucasemap_getLocale(const UCaseMap *csm) { |
| 71 return csm->locale; |
| 72 } |
| 73 |
| 74 U_CAPI uint32_t U_EXPORT2 |
| 75 ucasemap_getOptions(const UCaseMap *csm) { |
| 76 return csm->options; |
| 77 } |
| 78 |
| 79 U_CAPI void U_EXPORT2 |
| 80 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { |
| 81 int32_t length; |
| 82 |
| 83 if(U_FAILURE(*pErrorCode)) { |
| 84 return; |
| 85 } |
| 86 |
| 87 length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErro
rCode); |
| 88 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) { |
| 89 *pErrorCode=U_ZERO_ERROR; |
| 90 /* we only really need the language code for case mappings */ |
| 91 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale
), pErrorCode); |
| 92 } |
| 93 if(length==sizeof(csm->locale)) { |
| 94 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 95 } |
| 96 csm->locCache=0; |
| 97 if(U_SUCCESS(*pErrorCode)) { |
| 98 ucase_getCaseLocale(csm->locale, &csm->locCache); |
| 99 } else { |
| 100 csm->locale[0]=0; |
| 101 } |
| 102 } |
| 103 |
| 104 U_CAPI void U_EXPORT2 |
| 105 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { |
| 106 csm->options=options; |
| 107 } |
| 108 |
| 109 #if !UCONFIG_NO_BREAK_ITERATION |
| 110 |
| 111 U_CAPI const UBreakIterator * U_EXPORT2 |
| 112 ucasemap_getBreakIterator(const UCaseMap *csm) { |
| 113 return csm->iter; |
| 114 } |
| 115 |
| 116 U_CAPI void U_EXPORT2 |
| 117 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode
*pErrorCode) { |
| 118 ubrk_close(csm->iter); |
| 119 csm->iter=iterToAdopt; |
| 120 } |
| 121 |
| 122 #endif |
| 123 |
| 124 /* UTF-8 string case mappings ----------------------------------------------- */ |
| 125 |
| 126 /* TODO(markus): Move to a new, separate utf8case.c file. */ |
| 127 |
| 128 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ |
| 129 static U_INLINE int32_t |
| 130 appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, |
| 131 int32_t result, const UChar *s) { |
| 132 UChar32 c; |
| 133 int32_t length, destLength; |
| 134 UErrorCode errorCode; |
| 135 |
| 136 /* decode the result */ |
| 137 if(result<0) { |
| 138 /* (not) original code point */ |
| 139 c=~result; |
| 140 length=-1; |
| 141 } else if(result<=UCASE_MAX_STRING_LENGTH) { |
| 142 c=U_SENTINEL; |
| 143 length=result; |
| 144 } else { |
| 145 c=result; |
| 146 length=-1; |
| 147 } |
| 148 |
| 149 if(destIndex<destCapacity) { |
| 150 /* append the result */ |
| 151 if(length<0) { |
| 152 /* code point */ |
| 153 UBool isError=FALSE; |
| 154 U8_APPEND(dest, destIndex, destCapacity, c, isError); |
| 155 if(isError) { |
| 156 /* overflow, nothing written */ |
| 157 destIndex+=U8_LENGTH(c); |
| 158 } |
| 159 } else { |
| 160 /* string */ |
| 161 errorCode=U_ZERO_ERROR; |
| 162 u_strToUTF8( |
| 163 (char *)(dest+destIndex), destCapacity-destIndex, &destLength, |
| 164 s, length, |
| 165 &errorCode); |
| 166 destIndex+=destLength; |
| 167 /* we might have an overflow, but we know the actual length */ |
| 168 } |
| 169 } else { |
| 170 /* preflight */ |
| 171 if(length<0) { |
| 172 destIndex+=U8_LENGTH(c); |
| 173 } else { |
| 174 errorCode=U_ZERO_ERROR; |
| 175 u_strToUTF8( |
| 176 NULL, 0, &destLength, |
| 177 s, length, |
| 178 &errorCode); |
| 179 destIndex+=destLength; |
| 180 } |
| 181 } |
| 182 return destIndex; |
| 183 } |
| 184 |
| 185 static UChar32 U_CALLCONV |
| 186 utf8_caseContextIterator(void *context, int8_t dir) { |
| 187 UCaseContext *csc=(UCaseContext *)context; |
| 188 UChar32 c; |
| 189 |
| 190 if(dir<0) { |
| 191 /* reset for backward iteration */ |
| 192 csc->index=csc->cpStart; |
| 193 csc->dir=dir; |
| 194 } else if(dir>0) { |
| 195 /* reset for forward iteration */ |
| 196 csc->index=csc->cpLimit; |
| 197 csc->dir=dir; |
| 198 } else { |
| 199 /* continue current iteration direction */ |
| 200 dir=csc->dir; |
| 201 } |
| 202 |
| 203 if(dir<0) { |
| 204 if(csc->start<csc->index) { |
| 205 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c); |
| 206 return c; |
| 207 } |
| 208 } else { |
| 209 if(csc->index<csc->limit) { |
| 210 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c); |
| 211 return c; |
| 212 } |
| 213 } |
| 214 return U_SENTINEL; |
| 215 } |
| 216 |
| 217 /* |
| 218 * Case-maps [srcStart..srcLimit[ but takes |
| 219 * context [0..srcLength[ into account. |
| 220 */ |
| 221 static int32_t |
| 222 _caseMap(const UCaseMap *csm, UCaseMapFull *map, |
| 223 uint8_t *dest, int32_t destCapacity, |
| 224 const uint8_t *src, UCaseContext *csc, |
| 225 int32_t srcStart, int32_t srcLimit, |
| 226 UErrorCode *pErrorCode) { |
| 227 const UChar *s; |
| 228 UChar32 c, c2 = 0; |
| 229 int32_t srcIndex, destIndex; |
| 230 int32_t locCache; |
| 231 |
| 232 locCache=csm->locCache; |
| 233 |
| 234 /* case mapping loop */ |
| 235 srcIndex=srcStart; |
| 236 destIndex=0; |
| 237 while(srcIndex<srcLimit) { |
| 238 csc->cpStart=srcIndex; |
| 239 U8_NEXT(src, srcIndex, srcLimit, c); |
| 240 csc->cpLimit=srcIndex; |
| 241 if(c<0) { |
| 242 int32_t i=csc->cpStart; |
| 243 while(destIndex<destCapacity && i<srcIndex) { |
| 244 dest[destIndex++]=src[i++]; |
| 245 } |
| 246 continue; |
| 247 } |
| 248 c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locC
ache); |
| 249 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_L
ENGTH<c && (c2=c)<=0x7f)) { |
| 250 /* fast path version of appendResult() for ASCII results */ |
| 251 dest[destIndex++]=(uint8_t)c2; |
| 252 } else { |
| 253 destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
| 254 } |
| 255 } |
| 256 |
| 257 if(destIndex>destCapacity) { |
| 258 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 259 } |
| 260 return destIndex; |
| 261 } |
| 262 |
| 263 #if !UCONFIG_NO_BREAK_ITERATION |
| 264 |
| 265 /* |
| 266 * Internal titlecasing function. |
| 267 */ |
| 268 static int32_t |
| 269 _toTitle(UCaseMap *csm, |
| 270 uint8_t *dest, int32_t destCapacity, |
| 271 const uint8_t *src, UCaseContext *csc, |
| 272 int32_t srcLength, |
| 273 UErrorCode *pErrorCode) { |
| 274 UText utext=UTEXT_INITIALIZER; |
| 275 const UChar *s; |
| 276 UChar32 c; |
| 277 int32_t prev, titleStart, titleLimit, idx, destIndex, length; |
| 278 UBool isFirstIndex; |
| 279 |
| 280 utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode); |
| 281 if(U_FAILURE(*pErrorCode)) { |
| 282 return 0; |
| 283 } |
| 284 if(csm->iter==NULL) { |
| 285 csm->iter=ubrk_open(UBRK_WORD, csm->locale, |
| 286 NULL, 0, |
| 287 pErrorCode); |
| 288 } |
| 289 ubrk_setUText(csm->iter, &utext, pErrorCode); |
| 290 if(U_FAILURE(*pErrorCode)) { |
| 291 utext_close(&utext); |
| 292 return 0; |
| 293 } |
| 294 |
| 295 /* set up local variables */ |
| 296 destIndex=0; |
| 297 prev=0; |
| 298 isFirstIndex=TRUE; |
| 299 |
| 300 /* titlecasing loop */ |
| 301 while(prev<srcLength) { |
| 302 /* find next index where to titlecase */ |
| 303 if(isFirstIndex) { |
| 304 isFirstIndex=FALSE; |
| 305 idx=ubrk_first(csm->iter); |
| 306 } else { |
| 307 idx=ubrk_next(csm->iter); |
| 308 } |
| 309 if(idx==UBRK_DONE || idx>srcLength) { |
| 310 idx=srcLength; |
| 311 } |
| 312 |
| 313 /* |
| 314 * Unicode 4 & 5 section 3.13 Default Case Operations: |
| 315 * |
| 316 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standar
d Annex |
| 317 * #29, "Text Boundaries." Between each pair of word boundaries, find th
e first |
| 318 * cased character F. If F exists, map F to default_title(F); then map e
ach |
| 319 * subsequent character C to default_lower(C). |
| 320 * |
| 321 * In this implementation, segment [prev..index[ into 3 parts: |
| 322 * a) uncased characters (copy as-is) [prev..titleStart[ |
| 323 * b) first case letter (titlecase) [titleStart..titleLimit[ |
| 324 * c) subsequent characters (lowercase) [titleLimit..ind
ex[ |
| 325 */ |
| 326 if(prev<idx) { |
| 327 /* find and copy uncased characters [prev..titleStart[ */ |
| 328 titleStart=titleLimit=prev; |
| 329 U8_NEXT(src, titleLimit, idx, c); |
| 330 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==
ucase_getType(csm->csp, c)) { |
| 331 /* Adjust the titlecasing index (titleStart) to the next cased c
haracter. */ |
| 332 for(;;) { |
| 333 titleStart=titleLimit; |
| 334 if(titleLimit==idx) { |
| 335 /* |
| 336 * only uncased characters in [prev..index[ |
| 337 * stop with titleStart==titleLimit==index |
| 338 */ |
| 339 break; |
| 340 } |
| 341 U8_NEXT(src, titleLimit, idx, c); |
| 342 if(UCASE_NONE!=ucase_getType(csm->csp, c)) { |
| 343 break; /* cased letter at [titleStart..titleLimit[ */ |
| 344 } |
| 345 } |
| 346 length=titleStart-prev; |
| 347 if(length>0) { |
| 348 if((destIndex+length)<=destCapacity) { |
| 349 uprv_memcpy(dest+destIndex, src+prev, length); |
| 350 } |
| 351 destIndex+=length; |
| 352 } |
| 353 } |
| 354 |
| 355 if(titleStart<titleLimit) { |
| 356 /* titlecase c which is from [titleStart..titleLimit[ */ |
| 357 csc->cpStart=titleStart; |
| 358 csc->cpLimit=titleLimit; |
| 359 c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc,
&s, csm->locale, &csm->locCache); |
| 360 destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
| 361 |
| 362 |
| 363 /* Special case Dutch IJ titlecasing */ |
| 364 if ( titleStart+1 < idx && |
| 365 ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LO
C_DUTCH && |
| 366 ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 )
&& |
| 367 ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006
A )) { |
| 368 c=0x004A; |
| 369 destIndex=appendResult(dest, destIndex, destCapacity
, c, s); |
| 370 titleLimit++; |
| 371 } |
| 372 /* lowercase [titleLimit..index[ */ |
| 373 if(titleLimit<idx) { |
| 374 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { |
| 375 /* Normal operation: Lowercase the rest of the word. */ |
| 376 destIndex+= |
| 377 _caseMap( |
| 378 csm, ucase_toFullLower, |
| 379 dest+destIndex, destCapacity-destIndex, |
| 380 src, csc, |
| 381 titleLimit, idx, |
| 382 pErrorCode); |
| 383 } else { |
| 384 /* Optionally just copy the rest of the word unchanged.
*/ |
| 385 length=idx-titleLimit; |
| 386 if((destIndex+length)<=destCapacity) { |
| 387 uprv_memcpy(dest+destIndex, src+titleLimit, length); |
| 388 } |
| 389 destIndex+=length; |
| 390 } |
| 391 } |
| 392 } |
| 393 } |
| 394 |
| 395 prev=idx; |
| 396 } |
| 397 |
| 398 if(destIndex>destCapacity) { |
| 399 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 400 } |
| 401 utext_close(&utext); |
| 402 return destIndex; |
| 403 } |
| 404 |
| 405 #endif |
| 406 |
| 407 static int32_t |
| 408 utf8_foldCase(const UCaseProps *csp, |
| 409 uint8_t *dest, int32_t destCapacity, |
| 410 const uint8_t *src, int32_t srcLength, |
| 411 uint32_t options, |
| 412 UErrorCode *pErrorCode) { |
| 413 int32_t srcIndex, destIndex; |
| 414 |
| 415 const UChar *s; |
| 416 UChar32 c, c2; |
| 417 int32_t start; |
| 418 |
| 419 /* case mapping loop */ |
| 420 srcIndex=destIndex=0; |
| 421 while(srcIndex<srcLength) { |
| 422 start=srcIndex; |
| 423 U8_NEXT(src, srcIndex, srcLength, c); |
| 424 if(c<0) { |
| 425 while(destIndex<destCapacity && start<srcIndex) { |
| 426 dest[destIndex++]=src[start++]; |
| 427 } |
| 428 continue; |
| 429 } |
| 430 c=ucase_toFullFolding(csp, c, &s, options); |
| 431 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_L
ENGTH<c && (c2=c)<=0x7f)) { |
| 432 /* fast path version of appendResult() for ASCII results */ |
| 433 dest[destIndex++]=(uint8_t)c2; |
| 434 } else { |
| 435 destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
| 436 } |
| 437 } |
| 438 |
| 439 if(destIndex>destCapacity) { |
| 440 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 441 } |
| 442 return destIndex; |
| 443 } |
| 444 |
| 445 /* |
| 446 * Implement argument checking and buffer handling |
| 447 * for string case mapping as a common function. |
| 448 */ |
| 449 |
| 450 /* common internal function for public API functions */ |
| 451 |
| 452 static int32_t |
| 453 caseMap(const UCaseMap *csm, |
| 454 uint8_t *dest, int32_t destCapacity, |
| 455 const uint8_t *src, int32_t srcLength, |
| 456 int32_t toWhichCase, |
| 457 UErrorCode *pErrorCode) { |
| 458 int32_t destLength; |
| 459 |
| 460 /* check argument values */ |
| 461 if(U_FAILURE(*pErrorCode)) { |
| 462 return 0; |
| 463 } |
| 464 if( destCapacity<0 || |
| 465 (dest==NULL && destCapacity>0) || |
| 466 src==NULL || |
| 467 srcLength<-1 |
| 468 ) { |
| 469 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 470 return 0; |
| 471 } |
| 472 |
| 473 /* get the string length */ |
| 474 if(srcLength==-1) { |
| 475 srcLength=(int32_t)uprv_strlen((const char *)src); |
| 476 } |
| 477 |
| 478 /* check for overlapping source and destination */ |
| 479 if( dest!=NULL && |
| 480 ((src>=dest && src<(dest+destCapacity)) || |
| 481 (dest>=src && dest<(src+srcLength))) |
| 482 ) { |
| 483 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 484 return 0; |
| 485 } |
| 486 |
| 487 destLength=0; |
| 488 |
| 489 if(toWhichCase==FOLD_CASE) { |
| 490 destLength=utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, |
| 491 csm->options, pErrorCode); |
| 492 } else { |
| 493 UCaseContext csc={ NULL }; |
| 494 |
| 495 csc.p=(void *)src; |
| 496 csc.limit=srcLength; |
| 497 |
| 498 if(toWhichCase==TO_LOWER) { |
| 499 destLength=_caseMap(csm, ucase_toFullLower, |
| 500 dest, destCapacity, |
| 501 src, &csc, |
| 502 0, srcLength, |
| 503 pErrorCode); |
| 504 } else if(toWhichCase==TO_UPPER) { |
| 505 destLength=_caseMap(csm, ucase_toFullUpper, |
| 506 dest, destCapacity, |
| 507 src, &csc, |
| 508 0, srcLength, |
| 509 pErrorCode); |
| 510 } else /* if(toWhichCase==TO_TITLE) */ { |
| 511 #if UCONFIG_NO_BREAK_ITERATION |
| 512 *pErrorCode=U_UNSUPPORTED_ERROR; |
| 513 #else |
| 514 /* UCaseMap is actually non-const in toTitle() APIs. */ |
| 515 UCaseMap *tmp = (UCaseMap *)csm; |
| 516 destLength=_toTitle(tmp, dest, destCapacity, |
| 517 src, &csc, srcLength, |
| 518 pErrorCode); |
| 519 #endif |
| 520 } |
| 521 } |
| 522 |
| 523 return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode); |
| 524 } |
| 525 |
| 526 /* public API functions */ |
| 527 |
| 528 U_CAPI int32_t U_EXPORT2 |
| 529 ucasemap_utf8ToLower(const UCaseMap *csm, |
| 530 char *dest, int32_t destCapacity, |
| 531 const char *src, int32_t srcLength, |
| 532 UErrorCode *pErrorCode) { |
| 533 return caseMap(csm, |
| 534 (uint8_t *)dest, destCapacity, |
| 535 (const uint8_t *)src, srcLength, |
| 536 TO_LOWER, pErrorCode); |
| 537 } |
| 538 |
| 539 U_CAPI int32_t U_EXPORT2 |
| 540 ucasemap_utf8ToUpper(const UCaseMap *csm, |
| 541 char *dest, int32_t destCapacity, |
| 542 const char *src, int32_t srcLength, |
| 543 UErrorCode *pErrorCode) { |
| 544 return caseMap(csm, |
| 545 (uint8_t *)dest, destCapacity, |
| 546 (const uint8_t *)src, srcLength, |
| 547 TO_UPPER, pErrorCode); |
| 548 } |
| 549 |
| 550 #if !UCONFIG_NO_BREAK_ITERATION |
| 551 |
| 552 U_CAPI int32_t U_EXPORT2 |
| 553 ucasemap_utf8ToTitle(UCaseMap *csm, |
| 554 char *dest, int32_t destCapacity, |
| 555 const char *src, int32_t srcLength, |
| 556 UErrorCode *pErrorCode) { |
| 557 return caseMap(csm, |
| 558 (uint8_t *)dest, destCapacity, |
| 559 (const uint8_t *)src, srcLength, |
| 560 TO_TITLE, pErrorCode); |
| 561 } |
| 562 |
| 563 #endif |
| 564 |
| 565 U_CAPI int32_t U_EXPORT2 |
| 566 ucasemap_utf8FoldCase(const UCaseMap *csm, |
| 567 char *dest, int32_t destCapacity, |
| 568 const char *src, int32_t srcLength, |
| 569 UErrorCode *pErrorCode) { |
| 570 return caseMap(csm, |
| 571 (uint8_t *)dest, destCapacity, |
| 572 (const uint8_t *)src, srcLength, |
| 573 FOLD_CASE, pErrorCode); |
| 574 } |
OLD | NEW |