OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2002-2006, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: uiter.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2002jan18 |
| 14 * created by: Markus W. Scherer |
| 15 */ |
| 16 |
| 17 #include "unicode/utypes.h" |
| 18 #include "unicode/ustring.h" |
| 19 #include "unicode/chariter.h" |
| 20 #include "unicode/rep.h" |
| 21 #include "unicode/uiter.h" |
| 22 #include "cstring.h" |
| 23 |
| 24 U_NAMESPACE_USE |
| 25 |
/* Evaluates to true when the integer expression n has its lowest bit clear. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * Evaluates to true when the address held in pointer expression p is even.
 * p is parenthesized so that the cast applies to the whole argument,
 * e.g. IS_POINTER_EVEN(q+1) tests the address of q+1 rather than (size_t)q + 1.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
| 28 |
| 29 U_CDECL_BEGIN |
| 30 |
| 31 /* No-Op UCharIterator implementation for illegal input --------------------- */ |
| 32 |
| 33 static int32_t U_CALLCONV |
| 34 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) { |
| 35 return 0; |
| 36 } |
| 37 |
| 38 static int32_t U_CALLCONV |
| 39 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*orig
in*/) { |
| 40 return 0; |
| 41 } |
| 42 |
| 43 static UBool U_CALLCONV |
| 44 noopHasNext(UCharIterator * /*iter*/) { |
| 45 return FALSE; |
| 46 } |
| 47 |
| 48 static UChar32 U_CALLCONV |
| 49 noopCurrent(UCharIterator * /*iter*/) { |
| 50 return U_SENTINEL; |
| 51 } |
| 52 |
| 53 static uint32_t U_CALLCONV |
| 54 noopGetState(const UCharIterator * /*iter*/) { |
| 55 return UITER_NO_STATE; |
| 56 } |
| 57 |
| 58 static void U_CALLCONV |
| 59 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCod
e) { |
| 60 *pErrorCode=U_UNSUPPORTED_ERROR; |
| 61 } |
| 62 |
| 63 static const UCharIterator noopIterator={ |
| 64 0, 0, 0, 0, 0, 0, |
| 65 noopGetIndex, |
| 66 noopMove, |
| 67 noopHasNext, |
| 68 noopHasNext, |
| 69 noopCurrent, |
| 70 noopCurrent, |
| 71 noopCurrent, |
| 72 NULL, |
| 73 noopGetState, |
| 74 noopSetState |
| 75 }; |
| 76 |
| 77 /* UCharIterator implementation for simple strings -------------------------- */ |
| 78 |
| 79 /* |
| 80 * This is an implementation of a code unit (UChar) iterator |
| 81 * for UChar * strings. |
| 82 * |
| 83 * The UCharIterator.context field holds a pointer to the string. |
| 84 */ |
| 85 |
| 86 static int32_t U_CALLCONV |
| 87 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { |
| 88 switch(origin) { |
| 89 case UITER_ZERO: |
| 90 return 0; |
| 91 case UITER_START: |
| 92 return iter->start; |
| 93 case UITER_CURRENT: |
| 94 return iter->index; |
| 95 case UITER_LIMIT: |
| 96 return iter->limit; |
| 97 case UITER_LENGTH: |
| 98 return iter->length; |
| 99 default: |
| 100 /* not a valid origin */ |
| 101 /* Should never get here! */ |
| 102 return -1; |
| 103 } |
| 104 } |
| 105 |
| 106 static int32_t U_CALLCONV |
| 107 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origi
n) { |
| 108 int32_t pos; |
| 109 |
| 110 switch(origin) { |
| 111 case UITER_ZERO: |
| 112 pos=delta; |
| 113 break; |
| 114 case UITER_START: |
| 115 pos=iter->start+delta; |
| 116 break; |
| 117 case UITER_CURRENT: |
| 118 pos=iter->index+delta; |
| 119 break; |
| 120 case UITER_LIMIT: |
| 121 pos=iter->limit+delta; |
| 122 break; |
| 123 case UITER_LENGTH: |
| 124 pos=iter->length+delta; |
| 125 break; |
| 126 default: |
| 127 return -1; /* Error */ |
| 128 } |
| 129 |
| 130 if(pos<iter->start) { |
| 131 pos=iter->start; |
| 132 } else if(pos>iter->limit) { |
| 133 pos=iter->limit; |
| 134 } |
| 135 |
| 136 return iter->index=pos; |
| 137 } |
| 138 |
| 139 static UBool U_CALLCONV |
| 140 stringIteratorHasNext(UCharIterator *iter) { |
| 141 return iter->index<iter->limit; |
| 142 } |
| 143 |
| 144 static UBool U_CALLCONV |
| 145 stringIteratorHasPrevious(UCharIterator *iter) { |
| 146 return iter->index>iter->start; |
| 147 } |
| 148 |
| 149 static UChar32 U_CALLCONV |
| 150 stringIteratorCurrent(UCharIterator *iter) { |
| 151 if(iter->index<iter->limit) { |
| 152 return ((const UChar *)(iter->context))[iter->index]; |
| 153 } else { |
| 154 return U_SENTINEL; |
| 155 } |
| 156 } |
| 157 |
| 158 static UChar32 U_CALLCONV |
| 159 stringIteratorNext(UCharIterator *iter) { |
| 160 if(iter->index<iter->limit) { |
| 161 return ((const UChar *)(iter->context))[iter->index++]; |
| 162 } else { |
| 163 return U_SENTINEL; |
| 164 } |
| 165 } |
| 166 |
| 167 static UChar32 U_CALLCONV |
| 168 stringIteratorPrevious(UCharIterator *iter) { |
| 169 if(iter->index>iter->start) { |
| 170 return ((const UChar *)(iter->context))[--iter->index]; |
| 171 } else { |
| 172 return U_SENTINEL; |
| 173 } |
| 174 } |
| 175 |
| 176 static uint32_t U_CALLCONV |
| 177 stringIteratorGetState(const UCharIterator *iter) { |
| 178 return (uint32_t)iter->index; |
| 179 } |
| 180 |
| 181 static void U_CALLCONV |
| 182 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCo
de) { |
| 183 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 184 /* do nothing */ |
| 185 } else if(iter==NULL) { |
| 186 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 187 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) { |
| 188 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 189 } else { |
| 190 iter->index=(int32_t)state; |
| 191 } |
| 192 } |
| 193 |
| 194 static const UCharIterator stringIterator={ |
| 195 0, 0, 0, 0, 0, 0, |
| 196 stringIteratorGetIndex, |
| 197 stringIteratorMove, |
| 198 stringIteratorHasNext, |
| 199 stringIteratorHasPrevious, |
| 200 stringIteratorCurrent, |
| 201 stringIteratorNext, |
| 202 stringIteratorPrevious, |
| 203 NULL, |
| 204 stringIteratorGetState, |
| 205 stringIteratorSetState |
| 206 }; |
| 207 |
| 208 U_CAPI void U_EXPORT2 |
| 209 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { |
| 210 if(iter!=0) { |
| 211 if(s!=0 && length>=-1) { |
| 212 *iter=stringIterator; |
| 213 iter->context=s; |
| 214 if(length>=0) { |
| 215 iter->length=length; |
| 216 } else { |
| 217 iter->length=u_strlen(s); |
| 218 } |
| 219 iter->limit=iter->length; |
| 220 } else { |
| 221 *iter=noopIterator; |
| 222 } |
| 223 } |
| 224 } |
| 225 |
| 226 /* UCharIterator implementation for UTF-16BE strings ------------------------ */ |
| 227 |
| 228 /* |
| 229 * This is an implementation of a code unit (UChar) iterator |
| 230 * for UTF-16BE strings, i.e., strings in byte-vectors where |
| 231 * each UChar is stored as a big-endian pair of bytes. |
| 232 * |
| 233 * The UCharIterator.context field holds a pointer to the string. |
| 234 * Everything works just like with a normal UChar iterator (uiter_setString), |
| 235 * except that UChars are assembled from byte pairs. |
| 236 */ |
| 237 |
| 238 /* internal helper function */ |
| 239 static inline UChar32 |
| 240 utf16BEIteratorGet(UCharIterator *iter, int32_t index) { |
| 241 const uint8_t *p=(const uint8_t *)iter->context; |
| 242 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1]; |
| 243 } |
| 244 |
| 245 static UChar32 U_CALLCONV |
| 246 utf16BEIteratorCurrent(UCharIterator *iter) { |
| 247 int32_t index; |
| 248 |
| 249 if((index=iter->index)<iter->limit) { |
| 250 return utf16BEIteratorGet(iter, index); |
| 251 } else { |
| 252 return U_SENTINEL; |
| 253 } |
| 254 } |
| 255 |
| 256 static UChar32 U_CALLCONV |
| 257 utf16BEIteratorNext(UCharIterator *iter) { |
| 258 int32_t index; |
| 259 |
| 260 if((index=iter->index)<iter->limit) { |
| 261 iter->index=index+1; |
| 262 return utf16BEIteratorGet(iter, index); |
| 263 } else { |
| 264 return U_SENTINEL; |
| 265 } |
| 266 } |
| 267 |
| 268 static UChar32 U_CALLCONV |
| 269 utf16BEIteratorPrevious(UCharIterator *iter) { |
| 270 int32_t index; |
| 271 |
| 272 if((index=iter->index)>iter->start) { |
| 273 iter->index=--index; |
| 274 return utf16BEIteratorGet(iter, index); |
| 275 } else { |
| 276 return U_SENTINEL; |
| 277 } |
| 278 } |
| 279 |
| 280 static const UCharIterator utf16BEIterator={ |
| 281 0, 0, 0, 0, 0, 0, |
| 282 stringIteratorGetIndex, |
| 283 stringIteratorMove, |
| 284 stringIteratorHasNext, |
| 285 stringIteratorHasPrevious, |
| 286 utf16BEIteratorCurrent, |
| 287 utf16BEIteratorNext, |
| 288 utf16BEIteratorPrevious, |
| 289 NULL, |
| 290 stringIteratorGetState, |
| 291 stringIteratorSetState |
| 292 }; |
| 293 |
| 294 /* |
| 295 * Count the number of UChars in a UTF-16BE string before a terminating UChar NU
L, |
| 296 * i.e., before a pair of 0 bytes where the first 0 byte is at an even |
| 297 * offset from s. |
| 298 */ |
| 299 static int32_t |
| 300 utf16BE_strlen(const char *s) { |
| 301 if(IS_POINTER_EVEN(s)) { |
| 302 /* |
| 303 * even-aligned, call u_strlen(s) |
| 304 * we are probably on a little-endian machine, but searching for UChar N
UL |
| 305 * does not care about endianness |
| 306 */ |
| 307 return u_strlen((const UChar *)s); |
| 308 } else { |
| 309 /* odd-aligned, search for pair of 0 bytes */ |
| 310 const char *p=s; |
| 311 |
| 312 while(!(*p==0 && p[1]==0)) { |
| 313 p+=2; |
| 314 } |
| 315 return (int32_t)((p-s)/2); |
| 316 } |
| 317 } |
| 318 |
| 319 U_CAPI void U_EXPORT2 |
| 320 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) { |
| 321 if(iter!=NULL) { |
| 322 /* allow only even-length strings (the input length counts bytes) */ |
| 323 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) { |
| 324 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1
) */ |
| 325 length>>=1; |
| 326 |
| 327 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) { |
| 328 /* big-endian machine and 2-aligned UTF-16BE string: use normal
UChar iterator */ |
| 329 uiter_setString(iter, (const UChar *)s, length); |
| 330 return; |
| 331 } |
| 332 |
| 333 *iter=utf16BEIterator; |
| 334 iter->context=s; |
| 335 if(length>=0) { |
| 336 iter->length=length; |
| 337 } else { |
| 338 iter->length=utf16BE_strlen(s); |
| 339 } |
| 340 iter->limit=iter->length; |
| 341 } else { |
| 342 *iter=noopIterator; |
| 343 } |
| 344 } |
| 345 } |
| 346 |
| 347 /* UCharIterator wrapper around CharacterIterator --------------------------- */ |
| 348 |
| 349 /* |
| 350 * This is wrapper code around a C++ CharacterIterator to |
| 351 * look like a C UCharIterator. |
| 352 * |
| 353 * The UCharIterator.context field holds a pointer to the CharacterIterator. |
| 354 */ |
| 355 |
| 356 static int32_t U_CALLCONV |
| 357 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { |
| 358 switch(origin) { |
| 359 case UITER_ZERO: |
| 360 return 0; |
| 361 case UITER_START: |
| 362 return ((CharacterIterator *)(iter->context))->startIndex(); |
| 363 case UITER_CURRENT: |
| 364 return ((CharacterIterator *)(iter->context))->getIndex(); |
| 365 case UITER_LIMIT: |
| 366 return ((CharacterIterator *)(iter->context))->endIndex(); |
| 367 case UITER_LENGTH: |
| 368 return ((CharacterIterator *)(iter->context))->getLength(); |
| 369 default: |
| 370 /* not a valid origin */ |
| 371 /* Should never get here! */ |
| 372 return -1; |
| 373 } |
| 374 } |
| 375 |
| 376 static int32_t U_CALLCONV |
| 377 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin or
igin) { |
| 378 switch(origin) { |
| 379 case UITER_ZERO: |
| 380 ((CharacterIterator *)(iter->context))->setIndex(delta); |
| 381 return ((CharacterIterator *)(iter->context))->getIndex(); |
| 382 case UITER_START: |
| 383 case UITER_CURRENT: |
| 384 case UITER_LIMIT: |
| 385 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIte
rator::EOrigin)origin); |
| 386 case UITER_LENGTH: |
| 387 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(
iter->context))->getLength()+delta); |
| 388 return ((CharacterIterator *)(iter->context))->getIndex(); |
| 389 default: |
| 390 /* not a valid origin */ |
| 391 /* Should never get here! */ |
| 392 return -1; |
| 393 } |
| 394 } |
| 395 |
| 396 static UBool U_CALLCONV |
| 397 characterIteratorHasNext(UCharIterator *iter) { |
| 398 return ((CharacterIterator *)(iter->context))->hasNext(); |
| 399 } |
| 400 |
| 401 static UBool U_CALLCONV |
| 402 characterIteratorHasPrevious(UCharIterator *iter) { |
| 403 return ((CharacterIterator *)(iter->context))->hasPrevious(); |
| 404 } |
| 405 |
| 406 static UChar32 U_CALLCONV |
| 407 characterIteratorCurrent(UCharIterator *iter) { |
| 408 UChar32 c; |
| 409 |
| 410 c=((CharacterIterator *)(iter->context))->current(); |
| 411 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) { |
| 412 return c; |
| 413 } else { |
| 414 return U_SENTINEL; |
| 415 } |
| 416 } |
| 417 |
| 418 static UChar32 U_CALLCONV |
| 419 characterIteratorNext(UCharIterator *iter) { |
| 420 if(((CharacterIterator *)(iter->context))->hasNext()) { |
| 421 return ((CharacterIterator *)(iter->context))->nextPostInc(); |
| 422 } else { |
| 423 return U_SENTINEL; |
| 424 } |
| 425 } |
| 426 |
| 427 static UChar32 U_CALLCONV |
| 428 characterIteratorPrevious(UCharIterator *iter) { |
| 429 if(((CharacterIterator *)(iter->context))->hasPrevious()) { |
| 430 return ((CharacterIterator *)(iter->context))->previous(); |
| 431 } else { |
| 432 return U_SENTINEL; |
| 433 } |
| 434 } |
| 435 |
| 436 static uint32_t U_CALLCONV |
| 437 characterIteratorGetState(const UCharIterator *iter) { |
| 438 return ((CharacterIterator *)(iter->context))->getIndex(); |
| 439 } |
| 440 |
| 441 static void U_CALLCONV |
| 442 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErro
rCode) { |
| 443 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 444 /* do nothing */ |
| 445 } else if(iter==NULL || iter->context==NULL) { |
| 446 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 447 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex(
) || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) { |
| 448 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 449 } else { |
| 450 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state); |
| 451 } |
| 452 } |
| 453 |
| 454 static const UCharIterator characterIteratorWrapper={ |
| 455 0, 0, 0, 0, 0, 0, |
| 456 characterIteratorGetIndex, |
| 457 characterIteratorMove, |
| 458 characterIteratorHasNext, |
| 459 characterIteratorHasPrevious, |
| 460 characterIteratorCurrent, |
| 461 characterIteratorNext, |
| 462 characterIteratorPrevious, |
| 463 NULL, |
| 464 characterIteratorGetState, |
| 465 characterIteratorSetState |
| 466 }; |
| 467 |
| 468 U_CAPI void U_EXPORT2 |
| 469 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) { |
| 470 if(iter!=0) { |
| 471 if(charIter!=0) { |
| 472 *iter=characterIteratorWrapper; |
| 473 iter->context=charIter; |
| 474 } else { |
| 475 *iter=noopIterator; |
| 476 } |
| 477 } |
| 478 } |
| 479 |
| 480 /* UCharIterator wrapper around Replaceable --------------------------------- */ |
| 481 |
| 482 /* |
| 483 * This is an implementation of a code unit (UChar) iterator |
| 484 * based on a Replaceable object. |
| 485 * |
| 486 * The UCharIterator.context field holds a pointer to the Replaceable. |
| 487 * UCharIterator.length and UCharIterator.index hold Replaceable.length() |
| 488 * and the iteration index. |
| 489 */ |
| 490 |
| 491 static UChar32 U_CALLCONV |
| 492 replaceableIteratorCurrent(UCharIterator *iter) { |
| 493 if(iter->index<iter->limit) { |
| 494 return ((Replaceable *)(iter->context))->charAt(iter->index); |
| 495 } else { |
| 496 return U_SENTINEL; |
| 497 } |
| 498 } |
| 499 |
| 500 static UChar32 U_CALLCONV |
| 501 replaceableIteratorNext(UCharIterator *iter) { |
| 502 if(iter->index<iter->limit) { |
| 503 return ((Replaceable *)(iter->context))->charAt(iter->index++); |
| 504 } else { |
| 505 return U_SENTINEL; |
| 506 } |
| 507 } |
| 508 |
| 509 static UChar32 U_CALLCONV |
| 510 replaceableIteratorPrevious(UCharIterator *iter) { |
| 511 if(iter->index>iter->start) { |
| 512 return ((Replaceable *)(iter->context))->charAt(--iter->index); |
| 513 } else { |
| 514 return U_SENTINEL; |
| 515 } |
| 516 } |
| 517 |
| 518 static const UCharIterator replaceableIterator={ |
| 519 0, 0, 0, 0, 0, 0, |
| 520 stringIteratorGetIndex, |
| 521 stringIteratorMove, |
| 522 stringIteratorHasNext, |
| 523 stringIteratorHasPrevious, |
| 524 replaceableIteratorCurrent, |
| 525 replaceableIteratorNext, |
| 526 replaceableIteratorPrevious, |
| 527 NULL, |
| 528 stringIteratorGetState, |
| 529 stringIteratorSetState |
| 530 }; |
| 531 |
| 532 U_CAPI void U_EXPORT2 |
| 533 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) { |
| 534 if(iter!=0) { |
| 535 if(rep!=0) { |
| 536 *iter=replaceableIterator; |
| 537 iter->context=rep; |
| 538 iter->limit=iter->length=rep->length(); |
| 539 } else { |
| 540 *iter=noopIterator; |
| 541 } |
| 542 } |
| 543 } |
| 544 |
| 545 /* UCharIterator implementation for UTF-8 strings --------------------------- */ |
| 546 |
| 547 /* |
| 548 * Possible, probably necessary only for an implementation for arbitrary |
| 549 * converters: |
| 550 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text. |
| 551 * This would require to turn reservedFn into a close function and |
| 552 * to introduce a uiter_close(iter). |
| 553 */ |
| 554 |
| 555 #define UITER_CNV_CAPACITY 16 |
| 556 |
| 557 /* |
| 558 * Minimal implementation: |
| 559 * Maintain a single-UChar buffer for an additional surrogate. |
| 560 * The caller must not modify start and limit because they are used internally. |
| 561 * |
| 562 * Use UCharIterator fields as follows: |
| 563 * context pointer to UTF-8 string |
| 564 * length UTF-16 length of the string; -1 until lazy evaluation |
| 565 * start current UTF-8 index |
| 566 * index current UTF-16 index; may be -1="unknown" after setState() |
| 567 * limit UTF-8 length of the string |
| 568 * reservedField supplementary code point |
| 569 * |
| 570 * Since UCharIterator delivers 16-bit code units, the iteration can be |
| 571 * currently in the middle of the byte sequence for a supplementary code point. |
| 572 * In this case, reservedField will contain that code point and start will |
| 573 * point to after the corresponding byte sequence. The UTF-16 index will be |
| 574 * one less than what it would otherwise be corresponding to the UTF-8 index. |
| 575 * Otherwise, reservedField will be 0. |
| 576 */ |
| 577 |
| 578 /* |
| 579 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: |
| 580 * Add implementations that do not call strlen() for iteration but check for NUL
. |
| 581 */ |
| 582 |
| 583 static int32_t U_CALLCONV |
| 584 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { |
| 585 switch(origin) { |
| 586 case UITER_ZERO: |
| 587 case UITER_START: |
| 588 return 0; |
| 589 case UITER_CURRENT: |
| 590 if(iter->index<0) { |
| 591 /* the current UTF-16 index is unknown after setState(), count from
the beginning */ |
| 592 const uint8_t *s; |
| 593 UChar32 c; |
| 594 int32_t i, limit, index; |
| 595 |
| 596 s=(const uint8_t *)iter->context; |
| 597 i=index=0; |
| 598 limit=iter->start; /* count up to the UTF-8 index */ |
| 599 while(i<limit) { |
| 600 U8_NEXT(s, i, limit, c); |
| 601 if(c<=0xffff) { |
| 602 ++index; |
| 603 } else { |
| 604 index+=2; |
| 605 } |
| 606 } |
| 607 |
| 608 iter->start=i; /* just in case setState() did not get us to a code p
oint boundary */ |
| 609 if(i==iter->limit) { |
| 610 iter->length=index; /* in case it was <0 or wrong */ |
| 611 } |
| 612 if(iter->reservedField!=0) { |
| 613 --index; /* we are in the middle of a supplementary code point *
/ |
| 614 } |
| 615 iter->index=index; |
| 616 } |
| 617 return iter->index; |
| 618 case UITER_LIMIT: |
| 619 case UITER_LENGTH: |
| 620 if(iter->length<0) { |
| 621 const uint8_t *s; |
| 622 UChar32 c; |
| 623 int32_t i, limit, length; |
| 624 |
| 625 s=(const uint8_t *)iter->context; |
| 626 if(iter->index<0) { |
| 627 /* |
| 628 * the current UTF-16 index is unknown after setState(), |
| 629 * we must first count from the beginning to here |
| 630 */ |
| 631 i=length=0; |
| 632 limit=iter->start; |
| 633 |
| 634 /* count from the beginning to the current index */ |
| 635 while(i<limit) { |
| 636 U8_NEXT(s, i, limit, c); |
| 637 if(c<=0xffff) { |
| 638 ++length; |
| 639 } else { |
| 640 length+=2; |
| 641 } |
| 642 } |
| 643 |
| 644 /* assume i==limit==iter->start, set the UTF-16 index */ |
| 645 iter->start=i; /* just in case setState() did not get us to a co
de point boundary */ |
| 646 iter->index= iter->reservedField!=0 ? length-1 : length; |
| 647 } else { |
| 648 i=iter->start; |
| 649 length=iter->index; |
| 650 if(iter->reservedField!=0) { |
| 651 ++length; |
| 652 } |
| 653 } |
| 654 |
| 655 /* count from the current index to the end */ |
| 656 limit=iter->limit; |
| 657 while(i<limit) { |
| 658 U8_NEXT(s, i, limit, c); |
| 659 if(c<=0xffff) { |
| 660 ++length; |
| 661 } else { |
| 662 length+=2; |
| 663 } |
| 664 } |
| 665 iter->length=length; |
| 666 } |
| 667 return iter->length; |
| 668 default: |
| 669 /* not a valid origin */ |
| 670 /* Should never get here! */ |
| 671 return -1; |
| 672 } |
| 673 } |
| 674 |
| 675 static int32_t U_CALLCONV |
| 676 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin)
{ |
| 677 const uint8_t *s; |
| 678 UChar32 c; |
| 679 int32_t pos; /* requested UTF-16 index */ |
| 680 int32_t i; /* UTF-8 index */ |
| 681 UBool havePos; |
| 682 |
| 683 /* calculate the requested UTF-16 index */ |
| 684 switch(origin) { |
| 685 case UITER_ZERO: |
| 686 case UITER_START: |
| 687 pos=delta; |
| 688 havePos=TRUE; |
| 689 /* iter->index<0 (unknown) is possible */ |
| 690 break; |
| 691 case UITER_CURRENT: |
| 692 if(iter->index>=0) { |
| 693 pos=iter->index+delta; |
| 694 havePos=TRUE; |
| 695 } else { |
| 696 /* the current UTF-16 index is unknown after setState(), use only de
lta */ |
| 697 pos=0; |
| 698 havePos=FALSE; |
| 699 } |
| 700 break; |
| 701 case UITER_LIMIT: |
| 702 case UITER_LENGTH: |
| 703 if(iter->length>=0) { |
| 704 pos=iter->length+delta; |
| 705 havePos=TRUE; |
| 706 } else { |
| 707 /* pin to the end, avoid counting the length */ |
| 708 iter->index=-1; |
| 709 iter->start=iter->limit; |
| 710 iter->reservedField=0; |
| 711 if(delta>=0) { |
| 712 return UITER_UNKNOWN_INDEX; |
| 713 } else { |
| 714 /* the current UTF-16 index is unknown, use only delta */ |
| 715 pos=0; |
| 716 havePos=FALSE; |
| 717 } |
| 718 } |
| 719 break; |
| 720 default: |
| 721 return -1; /* Error */ |
| 722 } |
| 723 |
| 724 if(havePos) { |
| 725 /* shortcuts: pinning to the edges of the string */ |
| 726 if(pos<=0) { |
| 727 iter->index=iter->start=iter->reservedField=0; |
| 728 return 0; |
| 729 } else if(iter->length>=0 && pos>=iter->length) { |
| 730 iter->index=iter->length; |
| 731 iter->start=iter->limit; |
| 732 iter->reservedField=0; |
| 733 return iter->index; |
| 734 } |
| 735 |
| 736 /* minimize the number of U8_NEXT/PREV operations */ |
| 737 if(iter->index<0 || pos<iter->index/2) { |
| 738 /* go forward from the start instead of backward from the current in
dex */ |
| 739 iter->index=iter->start=iter->reservedField=0; |
| 740 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { |
| 741 /* |
| 742 * if we have the UTF-16 index and length and the new position is |
| 743 * closer to the end than the current index, |
| 744 * then go backward from the end instead of forward from the current
index |
| 745 */ |
| 746 iter->index=iter->length; |
| 747 iter->start=iter->limit; |
| 748 iter->reservedField=0; |
| 749 } |
| 750 |
| 751 delta=pos-iter->index; |
| 752 if(delta==0) { |
| 753 return iter->index; /* nothing to do */ |
| 754 } |
| 755 } else { |
| 756 /* move relative to unknown UTF-16 index */ |
| 757 if(delta==0) { |
| 758 return UITER_UNKNOWN_INDEX; /* nothing to do */ |
| 759 } else if(-delta>=iter->start) { |
| 760 /* moving backwards by more UChars than there are UTF-8 bytes, pin t
o 0 */ |
| 761 iter->index=iter->start=iter->reservedField=0; |
| 762 return 0; |
| 763 } else if(delta>=(iter->limit-iter->start)) { |
| 764 /* moving forward by more UChars than the remaining UTF-8 bytes, pin
to the end */ |
| 765 iter->index=iter->length; /* may or may not be <0 (unknown) */ |
| 766 iter->start=iter->limit; |
| 767 iter->reservedField=0; |
| 768 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX; |
| 769 } |
| 770 } |
| 771 |
| 772 /* delta!=0 */ |
| 773 |
| 774 /* move towards the requested position, pin to the edges of the string */ |
| 775 s=(const uint8_t *)iter->context; |
| 776 pos=iter->index; /* could be <0 (unknown) */ |
| 777 i=iter->start; |
| 778 if(delta>0) { |
| 779 /* go forward */ |
| 780 int32_t limit=iter->limit; |
| 781 if(iter->reservedField!=0) { |
| 782 iter->reservedField=0; |
| 783 ++pos; |
| 784 --delta; |
| 785 } |
| 786 while(delta>0 && i<limit) { |
| 787 U8_NEXT(s, i, limit, c); |
| 788 if(c<0xffff) { |
| 789 ++pos; |
| 790 --delta; |
| 791 } else if(delta>=2) { |
| 792 pos+=2; |
| 793 delta-=2; |
| 794 } else /* delta==1 */ { |
| 795 /* stop in the middle of a supplementary code point */ |
| 796 iter->reservedField=c; |
| 797 ++pos; |
| 798 break; /* delta=0; */ |
| 799 } |
| 800 } |
| 801 if(i==limit) { |
| 802 if(iter->length<0 && iter->index>=0) { |
| 803 iter->length= iter->reservedField==0 ? pos : pos+1; |
| 804 } else if(iter->index<0 && iter->length>=0) { |
| 805 iter->index= iter->reservedField==0 ? iter->length : iter->lengt
h-1; |
| 806 } |
| 807 } |
| 808 } else /* delta<0 */ { |
| 809 /* go backward */ |
| 810 if(iter->reservedField!=0) { |
| 811 iter->reservedField=0; |
| 812 i-=4; /* we stayed behind the supplementary code point; go before it
now */ |
| 813 --pos; |
| 814 ++delta; |
| 815 } |
| 816 while(delta<0 && i>0) { |
| 817 U8_PREV(s, 0, i, c); |
| 818 if(c<0xffff) { |
| 819 --pos; |
| 820 ++delta; |
| 821 } else if(delta<=-2) { |
| 822 pos-=2; |
| 823 delta+=2; |
| 824 } else /* delta==-1 */ { |
| 825 /* stop in the middle of a supplementary code point */ |
| 826 i+=4; /* back to behind this supplementary code point for consis
tent state */ |
| 827 iter->reservedField=c; |
| 828 --pos; |
| 829 break; /* delta=0; */ |
| 830 } |
| 831 } |
| 832 } |
| 833 |
| 834 iter->start=i; |
| 835 if(iter->index>=0) { |
| 836 return iter->index=pos; |
| 837 } else { |
| 838 /* we started with index<0 (unknown) so pos is bogus */ |
| 839 if(i<=1) { |
| 840 return iter->index=i; /* reached the beginning */ |
| 841 } else { |
| 842 /* we still don't know the UTF-16 index */ |
| 843 return UITER_UNKNOWN_INDEX; |
| 844 } |
| 845 } |
| 846 } |
| 847 |
| 848 static UBool U_CALLCONV |
| 849 utf8IteratorHasNext(UCharIterator *iter) { |
| 850 return iter->start<iter->limit || iter->reservedField!=0; |
| 851 } |
| 852 |
| 853 static UBool U_CALLCONV |
| 854 utf8IteratorHasPrevious(UCharIterator *iter) { |
| 855 return iter->start>0; |
| 856 } |
| 857 |
| 858 static UChar32 U_CALLCONV |
| 859 utf8IteratorCurrent(UCharIterator *iter) { |
| 860 if(iter->reservedField!=0) { |
| 861 return U16_TRAIL(iter->reservedField); |
| 862 } else if(iter->start<iter->limit) { |
| 863 const uint8_t *s=(const uint8_t *)iter->context; |
| 864 UChar32 c; |
| 865 int32_t i=iter->start; |
| 866 |
| 867 U8_NEXT(s, i, iter->limit, c); |
| 868 if(c<0) { |
| 869 return 0xfffd; |
| 870 } else if(c<=0xffff) { |
| 871 return c; |
| 872 } else { |
| 873 return U16_LEAD(c); |
| 874 } |
| 875 } else { |
| 876 return U_SENTINEL; |
| 877 } |
| 878 } |
| 879 |
| 880 static UChar32 U_CALLCONV |
| 881 utf8IteratorNext(UCharIterator *iter) { |
| 882 int32_t index; |
| 883 |
| 884 if(iter->reservedField!=0) { |
| 885 UChar trail=U16_TRAIL(iter->reservedField); |
| 886 iter->reservedField=0; |
| 887 if((index=iter->index)>=0) { |
| 888 iter->index=index+1; |
| 889 } |
| 890 return trail; |
| 891 } else if(iter->start<iter->limit) { |
| 892 const uint8_t *s=(const uint8_t *)iter->context; |
| 893 UChar32 c; |
| 894 |
| 895 U8_NEXT(s, iter->start, iter->limit, c); |
| 896 if((index=iter->index)>=0) { |
| 897 iter->index=++index; |
| 898 if(iter->length<0 && iter->start==iter->limit) { |
| 899 iter->length= c<=0xffff ? index : index+1; |
| 900 } |
| 901 } else if(iter->start==iter->limit && iter->length>=0) { |
| 902 iter->index= c<=0xffff ? iter->length : iter->length-1; |
| 903 } |
| 904 if(c<0) { |
| 905 return 0xfffd; |
| 906 } else if(c<=0xffff) { |
| 907 return c; |
| 908 } else { |
| 909 iter->reservedField=c; |
| 910 return U16_LEAD(c); |
| 911 } |
| 912 } else { |
| 913 return U_SENTINEL; |
| 914 } |
| 915 } |
| 916 |
| 917 static UChar32 U_CALLCONV |
| 918 utf8IteratorPrevious(UCharIterator *iter) { |
| 919 int32_t index; |
| 920 |
| 921 if(iter->reservedField!=0) { |
| 922 UChar lead=U16_LEAD(iter->reservedField); |
| 923 iter->reservedField=0; |
| 924 iter->start-=4; /* we stayed behind the supplementary code point; go bef
ore it now */ |
| 925 if((index=iter->index)>0) { |
| 926 iter->index=index-1; |
| 927 } |
| 928 return lead; |
| 929 } else if(iter->start>0) { |
| 930 const uint8_t *s=(const uint8_t *)iter->context; |
| 931 UChar32 c; |
| 932 |
| 933 U8_PREV(s, 0, iter->start, c); |
| 934 if((index=iter->index)>0) { |
| 935 iter->index=index-1; |
| 936 } else if(iter->start<=1) { |
| 937 iter->index= c<=0xffff ? iter->start : iter->start+1; |
| 938 } |
| 939 if(c<0) { |
| 940 return 0xfffd; |
| 941 } else if(c<=0xffff) { |
| 942 return c; |
| 943 } else { |
| 944 iter->start+=4; /* back to behind this supplementary code point for
consistent state */ |
| 945 iter->reservedField=c; |
| 946 return U16_TRAIL(c); |
| 947 } |
| 948 } else { |
| 949 return U_SENTINEL; |
| 950 } |
| 951 } |
| 952 |
| 953 static uint32_t U_CALLCONV |
| 954 utf8IteratorGetState(const UCharIterator *iter) { |
| 955 uint32_t state=(uint32_t)(iter->start<<1); |
| 956 if(iter->reservedField!=0) { |
| 957 state|=1; |
| 958 } |
| 959 return state; |
| 960 } |
| 961 |
| 962 static void U_CALLCONV |
| 963 utf8IteratorSetState(UCharIterator *iter, |
| 964 uint32_t state, |
| 965 UErrorCode *pErrorCode) |
| 966 { |
| 967 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 968 /* do nothing */ |
| 969 } else if(iter==NULL) { |
| 970 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 971 } else if(state==utf8IteratorGetState(iter)) { |
| 972 /* setting to the current state: no-op */ |
| 973 } else { |
| 974 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ |
| 975 state&=1; /* 1 if in surrogate pair, must be index>=4 */ |
| 976 |
| 977 if((state==0 ? index<0 : index<4) || iter->limit<index) { |
| 978 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 979 } else { |
| 980 iter->start=index; /* restore UTF-8 byte index */ |
| 981 if(index<=1) { |
| 982 iter->index=index; |
| 983 } else { |
| 984 iter->index=-1; /* unknown UTF-16 index */ |
| 985 } |
| 986 if(state==0) { |
| 987 iter->reservedField=0; |
| 988 } else { |
| 989 /* verified index>=4 above */ |
| 990 UChar32 c; |
| 991 U8_PREV((const uint8_t *)iter->context, 0, index, c); |
| 992 if(c<=0xffff) { |
| 993 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 994 } else { |
| 995 iter->reservedField=c; |
| 996 } |
| 997 } |
| 998 } |
| 999 } |
| 1000 } |
| 1001 |
| 1002 static const UCharIterator utf8Iterator={ |
| 1003 0, 0, 0, 0, 0, 0, |
| 1004 utf8IteratorGetIndex, |
| 1005 utf8IteratorMove, |
| 1006 utf8IteratorHasNext, |
| 1007 utf8IteratorHasPrevious, |
| 1008 utf8IteratorCurrent, |
| 1009 utf8IteratorNext, |
| 1010 utf8IteratorPrevious, |
| 1011 NULL, |
| 1012 utf8IteratorGetState, |
| 1013 utf8IteratorSetState |
| 1014 }; |
| 1015 |
| 1016 U_CAPI void U_EXPORT2 |
| 1017 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { |
| 1018 if(iter!=0) { |
| 1019 if(s!=0 && length>=-1) { |
| 1020 *iter=utf8Iterator; |
| 1021 iter->context=s; |
| 1022 if(length>=0) { |
| 1023 iter->limit=length; |
| 1024 } else { |
| 1025 iter->limit=(int32_t)uprv_strlen(s); |
| 1026 } |
| 1027 iter->length= iter->limit<=1 ? iter->limit : -1; |
| 1028 } else { |
| 1029 *iter=noopIterator; |
| 1030 } |
| 1031 } |
| 1032 } |
| 1033 |
| 1034 /* Helper functions --------------------------------------------------------- */ |
| 1035 |
| 1036 U_CAPI UChar32 U_EXPORT2 |
| 1037 uiter_current32(UCharIterator *iter) { |
| 1038 UChar32 c, c2; |
| 1039 |
| 1040 c=iter->current(iter); |
| 1041 if(UTF_IS_SURROGATE(c)) { |
| 1042 if(UTF_IS_SURROGATE_FIRST(c)) { |
| 1043 /* |
| 1044 * go to the next code unit |
| 1045 * we know that we are not at the limit because c!=U_SENTINEL |
| 1046 */ |
| 1047 iter->move(iter, 1, UITER_CURRENT); |
| 1048 if(UTF_IS_SECOND_SURROGATE(c2=iter->current(iter))) { |
| 1049 c=UTF16_GET_PAIR_VALUE(c, c2); |
| 1050 } |
| 1051 |
| 1052 /* undo index movement */ |
| 1053 iter->move(iter, -1, UITER_CURRENT); |
| 1054 } else { |
| 1055 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) { |
| 1056 c=UTF16_GET_PAIR_VALUE(c2, c); |
| 1057 } |
| 1058 if(c2>=0) { |
| 1059 /* undo index movement */ |
| 1060 iter->move(iter, 1, UITER_CURRENT); |
| 1061 } |
| 1062 } |
| 1063 } |
| 1064 return c; |
| 1065 } |
| 1066 |
| 1067 U_CAPI UChar32 U_EXPORT2 |
| 1068 uiter_next32(UCharIterator *iter) { |
| 1069 UChar32 c, c2; |
| 1070 |
| 1071 c=iter->next(iter); |
| 1072 if(UTF_IS_FIRST_SURROGATE(c)) { |
| 1073 if(UTF_IS_SECOND_SURROGATE(c2=iter->next(iter))) { |
| 1074 c=UTF16_GET_PAIR_VALUE(c, c2); |
| 1075 } else if(c2>=0) { |
| 1076 /* unmatched first surrogate, undo index movement */ |
| 1077 iter->move(iter, -1, UITER_CURRENT); |
| 1078 } |
| 1079 } |
| 1080 return c; |
| 1081 } |
| 1082 |
| 1083 U_CAPI UChar32 U_EXPORT2 |
| 1084 uiter_previous32(UCharIterator *iter) { |
| 1085 UChar32 c, c2; |
| 1086 |
| 1087 c=iter->previous(iter); |
| 1088 if(UTF_IS_SECOND_SURROGATE(c)) { |
| 1089 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) { |
| 1090 c=UTF16_GET_PAIR_VALUE(c2, c); |
| 1091 } else if(c2>=0) { |
| 1092 /* unmatched second surrogate, undo index movement */ |
| 1093 iter->move(iter, 1, UITER_CURRENT); |
| 1094 } |
| 1095 } |
| 1096 return c; |
| 1097 } |
| 1098 |
| 1099 U_CAPI uint32_t U_EXPORT2 |
| 1100 uiter_getState(const UCharIterator *iter) { |
| 1101 if(iter==NULL || iter->getState==NULL) { |
| 1102 return UITER_NO_STATE; |
| 1103 } else { |
| 1104 return iter->getState(iter); |
| 1105 } |
| 1106 } |
| 1107 |
| 1108 U_CAPI void U_EXPORT2 |
| 1109 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { |
| 1110 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 1111 /* do nothing */ |
| 1112 } else if(iter==NULL) { |
| 1113 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1114 } else if(iter->setState==NULL) { |
| 1115 *pErrorCode=U_UNSUPPORTED_ERROR; |
| 1116 } else { |
| 1117 iter->setState(iter, state, pErrorCode); |
| 1118 } |
| 1119 } |
| 1120 |
| 1121 U_CDECL_END |
OLD | NEW |