OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2010-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * utf16collationiterator.cpp |
| 7 * |
| 8 * created on: 2010oct27 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "charstr.h" |
| 17 #include "cmemory.h" |
| 18 #include "collation.h" |
| 19 #include "collationdata.h" |
| 20 #include "collationfcd.h" |
| 21 #include "collationiterator.h" |
| 22 #include "normalizer2impl.h" |
| 23 #include "uassert.h" |
| 24 #include "utf16collationiterator.h" |
| 25 |
| 26 U_NAMESPACE_BEGIN |
| 27 |
| 28 UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &oth
er, |
| 29 const UChar *newText) |
| 30 : CollationIterator(other), |
| 31 start(newText), |
| 32 pos(newText + (other.pos - other.start)), |
| 33 limit(other.limit == NULL ? NULL : newText + (other.limit - other.star
t)) { |
| 34 } |
| 35 |
| 36 UTF16CollationIterator::~UTF16CollationIterator() {} |
| 37 |
| 38 UBool |
| 39 UTF16CollationIterator::operator==(const CollationIterator &other) const { |
| 40 if(!CollationIterator::operator==(other)) { return FALSE; } |
| 41 const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &
>(other); |
| 42 // Compare the iterator state but not the text: Assume that the caller does
that. |
| 43 return (pos - start) == (o.pos - o.start); |
| 44 } |
| 45 |
| 46 void |
| 47 UTF16CollationIterator::resetToOffset(int32_t newOffset) { |
| 48 reset(); |
| 49 pos = start + newOffset; |
| 50 } |
| 51 |
| 52 int32_t |
| 53 UTF16CollationIterator::getOffset() const { |
| 54 return (int32_t)(pos - start); |
| 55 } |
| 56 |
| 57 uint32_t |
| 58 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { |
| 59 if(pos == limit) { |
| 60 c = U_SENTINEL; |
| 61 return Collation::FALLBACK_CE32; |
| 62 } |
| 63 c = *pos++; |
| 64 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); |
| 65 } |
| 66 |
| 67 UChar |
| 68 UTF16CollationIterator::handleGetTrailSurrogate() { |
| 69 if(pos == limit) { return 0; } |
| 70 UChar trail; |
| 71 if(U16_IS_TRAIL(trail = *pos)) { ++pos; } |
| 72 return trail; |
| 73 } |
| 74 |
| 75 UBool |
| 76 UTF16CollationIterator::foundNULTerminator() { |
| 77 if(limit == NULL) { |
| 78 limit = --pos; |
| 79 return TRUE; |
| 80 } else { |
| 81 return FALSE; |
| 82 } |
| 83 } |
| 84 |
| 85 UChar32 |
| 86 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { |
| 87 if(pos == limit) { |
| 88 return U_SENTINEL; |
| 89 } |
| 90 UChar32 c = *pos; |
| 91 if(c == 0 && limit == NULL) { |
| 92 limit = pos; |
| 93 return U_SENTINEL; |
| 94 } |
| 95 ++pos; |
| 96 UChar trail; |
| 97 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) { |
| 98 ++pos; |
| 99 return U16_GET_SUPPLEMENTARY(c, trail); |
| 100 } else { |
| 101 return c; |
| 102 } |
| 103 } |
| 104 |
| 105 UChar32 |
| 106 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { |
| 107 if(pos == start) { |
| 108 return U_SENTINEL; |
| 109 } |
| 110 UChar32 c = *--pos; |
| 111 UChar lead; |
| 112 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) { |
| 113 --pos; |
| 114 return U16_GET_SUPPLEMENTARY(lead, c); |
| 115 } else { |
| 116 return c; |
| 117 } |
| 118 } |
| 119 |
| 120 void |
| 121 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCo
de*/) { |
| 122 while(num > 0 && pos != limit) { |
| 123 UChar32 c = *pos; |
| 124 if(c == 0 && limit == NULL) { |
| 125 limit = pos; |
| 126 break; |
| 127 } |
| 128 ++pos; |
| 129 --num; |
| 130 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) { |
| 131 ++pos; |
| 132 } |
| 133 } |
| 134 } |
| 135 |
| 136 void |
| 137 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorC
ode*/) { |
| 138 while(num > 0 && pos != start) { |
| 139 UChar32 c = *--pos; |
| 140 --num; |
| 141 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) { |
| 142 --pos; |
| 143 } |
| 144 } |
| 145 } |
| 146 |
| 147 // FCDUTF16CollationIterator ----------------------------------------------- *** |
| 148 |
| 149 FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIter
ator &other, |
| 150 const UChar *newText) |
| 151 : UTF16CollationIterator(other), |
| 152 rawStart(newText), |
| 153 segmentStart(newText + (other.segmentStart - other.rawStart)), |
| 154 segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segm
entLimit - other.rawStart)), |
| 155 rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - o
ther.rawStart)), |
| 156 nfcImpl(other.nfcImpl), |
| 157 normalized(other.normalized), |
| 158 checkDir(other.checkDir) { |
| 159 if(checkDir != 0 || other.start == other.segmentStart) { |
| 160 start = newText + (other.start - other.rawStart); |
| 161 pos = newText + (other.pos - other.rawStart); |
| 162 limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawS
tart); |
| 163 } else { |
| 164 start = normalized.getBuffer(); |
| 165 pos = start + (other.pos - other.start); |
| 166 limit = start + normalized.length(); |
| 167 } |
| 168 } |
| 169 |
| 170 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {} |
| 171 |
| 172 UBool |
| 173 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const { |
| 174 // Skip the UTF16CollationIterator and call its parent. |
| 175 if(!CollationIterator::operator==(other)) { return FALSE; } |
| 176 const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIter
ator &>(other); |
| 177 // Compare the iterator state but not the text: Assume that the caller does
that. |
| 178 if(checkDir != o.checkDir) { return FALSE; } |
| 179 if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart))
{ return FALSE; } |
| 180 if(checkDir != 0 || start == segmentStart) { |
| 181 return (pos - rawStart) == (o.pos - o.rawStart); |
| 182 } else { |
| 183 return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) && |
| 184 (pos - start) == (o.pos - o.start); |
| 185 } |
| 186 } |
| 187 |
| 188 void |
| 189 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) { |
| 190 reset(); |
| 191 start = segmentStart = pos = rawStart + newOffset; |
| 192 limit = rawLimit; |
| 193 checkDir = 1; |
| 194 } |
| 195 |
| 196 int32_t |
| 197 FCDUTF16CollationIterator::getOffset() const { |
| 198 if(checkDir != 0 || start == segmentStart) { |
| 199 return (int32_t)(pos - rawStart); |
| 200 } else if(pos == start) { |
| 201 return (int32_t)(segmentStart - rawStart); |
| 202 } else { |
| 203 return (int32_t)(segmentLimit - rawStart); |
| 204 } |
| 205 } |
| 206 |
| 207 uint32_t |
| 208 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { |
| 209 for(;;) { |
| 210 if(checkDir > 0) { |
| 211 if(pos == limit) { |
| 212 c = U_SENTINEL; |
| 213 return Collation::FALLBACK_CE32; |
| 214 } |
| 215 c = *pos++; |
| 216 if(CollationFCD::hasTccc(c)) { |
| 217 if(CollationFCD::maybeTibetanCompositeVowel(c) || |
| 218 (pos != limit && CollationFCD::hasLccc(*pos))) { |
| 219 --pos; |
| 220 if(!nextSegment(errorCode)) { |
| 221 c = U_SENTINEL; |
| 222 return Collation::FALLBACK_CE32; |
| 223 } |
| 224 c = *pos++; |
| 225 } |
| 226 } |
| 227 break; |
| 228 } else if(checkDir == 0 && pos != limit) { |
| 229 c = *pos++; |
| 230 break; |
| 231 } else { |
| 232 switchToForward(); |
| 233 } |
| 234 } |
| 235 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); |
| 236 } |
| 237 |
| 238 UBool |
| 239 FCDUTF16CollationIterator::foundNULTerminator() { |
| 240 if(limit == NULL) { |
| 241 limit = rawLimit = --pos; |
| 242 return TRUE; |
| 243 } else { |
| 244 return FALSE; |
| 245 } |
| 246 } |
| 247 |
| 248 UChar32 |
| 249 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) { |
| 250 UChar32 c; |
| 251 for(;;) { |
| 252 if(checkDir > 0) { |
| 253 if(pos == limit) { |
| 254 return U_SENTINEL; |
| 255 } |
| 256 c = *pos++; |
| 257 if(CollationFCD::hasTccc(c)) { |
| 258 if(CollationFCD::maybeTibetanCompositeVowel(c) || |
| 259 (pos != limit && CollationFCD::hasLccc(*pos))) { |
| 260 --pos; |
| 261 if(!nextSegment(errorCode)) { |
| 262 return U_SENTINEL; |
| 263 } |
| 264 c = *pos++; |
| 265 } |
| 266 } else if(c == 0 && limit == NULL) { |
| 267 limit = rawLimit = --pos; |
| 268 return U_SENTINEL; |
| 269 } |
| 270 break; |
| 271 } else if(checkDir == 0 && pos != limit) { |
| 272 c = *pos++; |
| 273 break; |
| 274 } else { |
| 275 switchToForward(); |
| 276 } |
| 277 } |
| 278 UChar trail; |
| 279 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) { |
| 280 ++pos; |
| 281 return U16_GET_SUPPLEMENTARY(c, trail); |
| 282 } else { |
| 283 return c; |
| 284 } |
| 285 } |
| 286 |
| 287 UChar32 |
| 288 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) { |
| 289 UChar32 c; |
| 290 for(;;) { |
| 291 if(checkDir < 0) { |
| 292 if(pos == start) { |
| 293 return U_SENTINEL; |
| 294 } |
| 295 c = *--pos; |
| 296 if(CollationFCD::hasLccc(c)) { |
| 297 if(CollationFCD::maybeTibetanCompositeVowel(c) || |
| 298 (pos != start && CollationFCD::hasTccc(*(pos - 1)))) { |
| 299 ++pos; |
| 300 if(!previousSegment(errorCode)) { |
| 301 return U_SENTINEL; |
| 302 } |
| 303 c = *--pos; |
| 304 } |
| 305 } |
| 306 break; |
| 307 } else if(checkDir == 0 && pos != start) { |
| 308 c = *--pos; |
| 309 break; |
| 310 } else { |
| 311 switchToBackward(); |
| 312 } |
| 313 } |
| 314 UChar lead; |
| 315 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) { |
| 316 --pos; |
| 317 return U16_GET_SUPPLEMENTARY(lead, c); |
| 318 } else { |
| 319 return c; |
| 320 } |
| 321 } |
| 322 |
| 323 void |
| 324 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCo
de) { |
| 325 // Specify the class to avoid a virtual-function indirection. |
| 326 // In Java, we would declare this class final. |
| 327 while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) { |
| 328 --num; |
| 329 } |
| 330 } |
| 331 |
| 332 void |
| 333 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorC
ode) { |
| 334 // Specify the class to avoid a virtual-function indirection. |
| 335 // In Java, we would declare this class final. |
| 336 while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >=
0) { |
| 337 --num; |
| 338 } |
| 339 } |
| 340 |
| 341 void |
| 342 FCDUTF16CollationIterator::switchToForward() { |
| 343 U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit)); |
| 344 if(checkDir < 0) { |
| 345 // Turn around from backward checking. |
| 346 start = segmentStart = pos; |
| 347 if(pos == segmentLimit) { |
| 348 limit = rawLimit; |
| 349 checkDir = 1; // Check forward. |
| 350 } else { // pos < segmentLimit |
| 351 checkDir = 0; // Stay in FCD segment. |
| 352 } |
| 353 } else { |
| 354 // Reached the end of the FCD segment. |
| 355 if(start == segmentStart) { |
| 356 // The input text segment is FCD, extend it forward. |
| 357 } else { |
| 358 // The input text segment needed to be normalized. |
| 359 // Switch to checking forward from it. |
| 360 pos = start = segmentStart = segmentLimit; |
| 361 // Note: If this segment is at the end of the input text, |
| 362 // then it might help to return FALSE to indicate that, so that |
| 363 // we do not have to re-check and normalize when we turn around and
go backwards. |
| 364 // However, that would complicate the call sites for an optimization
of an unusual case. |
| 365 } |
| 366 limit = rawLimit; |
| 367 checkDir = 1; |
| 368 } |
| 369 } |
| 370 |
| 371 UBool |
| 372 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) { |
| 373 if(U_FAILURE(errorCode)) { return FALSE; } |
| 374 U_ASSERT(checkDir > 0 && pos != limit); |
| 375 // The input text [segmentStart..pos[ passes the FCD check. |
| 376 const UChar *p = pos; |
| 377 uint8_t prevCC = 0; |
| 378 for(;;) { |
| 379 // Fetch the next character's fcd16 value. |
| 380 const UChar *q = p; |
| 381 uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit); |
| 382 uint8_t leadCC = (uint8_t)(fcd16 >> 8); |
| 383 if(leadCC == 0 && q != pos) { |
| 384 // FCD boundary before the [q, p[ character. |
| 385 limit = segmentLimit = q; |
| 386 break; |
| 387 } |
| 388 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanComp
ositeVowel(fcd16))) { |
| 389 // Fails FCD check. Find the next FCD boundary and normalize. |
| 390 do { |
| 391 q = p; |
| 392 } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff); |
| 393 if(!normalize(pos, q, errorCode)) { return FALSE; } |
| 394 pos = start; |
| 395 break; |
| 396 } |
| 397 prevCC = (uint8_t)fcd16; |
| 398 if(p == rawLimit || prevCC == 0) { |
| 399 // FCD boundary after the last character. |
| 400 limit = segmentLimit = p; |
| 401 break; |
| 402 } |
| 403 } |
| 404 U_ASSERT(pos != limit); |
| 405 checkDir = 0; |
| 406 return TRUE; |
| 407 } |
| 408 |
| 409 void |
| 410 FCDUTF16CollationIterator::switchToBackward() { |
| 411 U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start)); |
| 412 if(checkDir > 0) { |
| 413 // Turn around from forward checking. |
| 414 limit = segmentLimit = pos; |
| 415 if(pos == segmentStart) { |
| 416 start = rawStart; |
| 417 checkDir = -1; // Check backward. |
| 418 } else { // pos > segmentStart |
| 419 checkDir = 0; // Stay in FCD segment. |
| 420 } |
| 421 } else { |
| 422 // Reached the start of the FCD segment. |
| 423 if(start == segmentStart) { |
| 424 // The input text segment is FCD, extend it backward. |
| 425 } else { |
| 426 // The input text segment needed to be normalized. |
| 427 // Switch to checking backward from it. |
| 428 pos = limit = segmentLimit = segmentStart; |
| 429 } |
| 430 start = rawStart; |
| 431 checkDir = -1; |
| 432 } |
| 433 } |
| 434 |
| 435 UBool |
| 436 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) { |
| 437 if(U_FAILURE(errorCode)) { return FALSE; } |
| 438 U_ASSERT(checkDir < 0 && pos != start); |
| 439 // The input text [pos..segmentLimit[ passes the FCD check. |
| 440 const UChar *p = pos; |
| 441 uint8_t nextCC = 0; |
| 442 for(;;) { |
| 443 // Fetch the previous character's fcd16 value. |
| 444 const UChar *q = p; |
| 445 uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p); |
| 446 uint8_t trailCC = (uint8_t)fcd16; |
| 447 if(trailCC == 0 && q != pos) { |
| 448 // FCD boundary after the [p, q[ character. |
| 449 start = segmentStart = q; |
| 450 break; |
| 451 } |
| 452 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || |
| 453 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))
) { |
| 454 // Fails FCD check. Find the previous FCD boundary and normalize. |
| 455 do { |
| 456 q = p; |
| 457 } while(fcd16 > 0xff && p != rawStart && |
| 458 (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0); |
| 459 if(!normalize(q, pos, errorCode)) { return FALSE; } |
| 460 pos = limit; |
| 461 break; |
| 462 } |
| 463 nextCC = (uint8_t)(fcd16 >> 8); |
| 464 if(p == rawStart || nextCC == 0) { |
| 465 // FCD boundary before the following character. |
| 466 start = segmentStart = p; |
| 467 break; |
| 468 } |
| 469 } |
| 470 U_ASSERT(pos != start); |
| 471 checkDir = 0; |
| 472 return TRUE; |
| 473 } |
| 474 |
| 475 UBool |
| 476 FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorC
ode &errorCode) { |
| 477 // NFD without argument checking. |
| 478 U_ASSERT(U_SUCCESS(errorCode)); |
| 479 nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode); |
| 480 if(U_FAILURE(errorCode)) { return FALSE; } |
| 481 // Switch collation processing into the FCD buffer |
| 482 // with the result of normalizing [segmentStart, segmentLimit[. |
| 483 segmentStart = from; |
| 484 segmentLimit = to; |
| 485 start = normalized.getBuffer(); |
| 486 limit = start + normalized.length(); |
| 487 return TRUE; |
| 488 } |
| 489 |
| 490 U_NAMESPACE_END |
| 491 |
| 492 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |