OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2012-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * uitercollationiterator.cpp |
| 7 * |
| 8 * created on: 2012sep23 (from utf16collationiterator.cpp) |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "unicode/uiter.h" |
| 17 #include "charstr.h" |
| 18 #include "cmemory.h" |
| 19 #include "collation.h" |
| 20 #include "collationdata.h" |
| 21 #include "collationfcd.h" |
| 22 #include "collationiterator.h" |
| 23 #include "normalizer2impl.h" |
| 24 #include "uassert.h" |
| 25 #include "uitercollationiterator.h" |
| 26 |
| 27 U_NAMESPACE_BEGIN |
| 28 |
| 29 UIterCollationIterator::~UIterCollationIterator() {} |
| 30 |
| 31 void |
| 32 UIterCollationIterator::resetToOffset(int32_t newOffset) { |
| 33 reset(); |
| 34 iter.move(&iter, newOffset, UITER_START); |
| 35 } |
| 36 |
| 37 int32_t |
| 38 UIterCollationIterator::getOffset() const { |
| 39 return iter.getIndex(&iter, UITER_CURRENT); |
| 40 } |
| 41 |
| 42 uint32_t |
| 43 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { |
| 44 c = iter.next(&iter); |
| 45 if(c < 0) { |
| 46 return Collation::FALLBACK_CE32; |
| 47 } |
| 48 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); |
| 49 } |
| 50 |
| 51 UChar |
| 52 UIterCollationIterator::handleGetTrailSurrogate() { |
| 53 UChar32 trail = iter.next(&iter); |
| 54 if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); } |
| 55 return (UChar)trail; |
| 56 } |
| 57 |
| 58 UChar32 |
| 59 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { |
| 60 return uiter_next32(&iter); |
| 61 } |
| 62 |
| 63 UChar32 |
| 64 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { |
| 65 return uiter_previous32(&iter); |
| 66 } |
| 67 |
| 68 void |
| 69 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCo
de*/) { |
| 70 while(num > 0 && (uiter_next32(&iter)) >= 0) { |
| 71 --num; |
| 72 } |
| 73 } |
| 74 |
| 75 void |
| 76 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorC
ode*/) { |
| 77 while(num > 0 && (uiter_previous32(&iter)) >= 0) { |
| 78 --num; |
| 79 } |
| 80 } |
| 81 |
| 82 // FCDUIterCollationIterator ----------------------------------------------- *** |
| 83 |
| 84 FCDUIterCollationIterator::~FCDUIterCollationIterator() {} |
| 85 |
| 86 void |
| 87 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) { |
| 88 UIterCollationIterator::resetToOffset(newOffset); |
| 89 start = newOffset; |
| 90 state = ITER_CHECK_FWD; |
| 91 } |
| 92 |
| 93 int32_t |
| 94 FCDUIterCollationIterator::getOffset() const { |
| 95 if(state <= ITER_CHECK_BWD) { |
| 96 return iter.getIndex(&iter, UITER_CURRENT); |
| 97 } else if(state == ITER_IN_FCD_SEGMENT) { |
| 98 return pos; |
| 99 } else if(pos == 0) { |
| 100 return start; |
| 101 } else { |
| 102 return limit; |
| 103 } |
| 104 } |
| 105 |
| 106 uint32_t |
| 107 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { |
| 108 for(;;) { |
| 109 if(state == ITER_CHECK_FWD) { |
| 110 c = iter.next(&iter); |
| 111 if(c < 0) { |
| 112 return Collation::FALLBACK_CE32; |
| 113 } |
| 114 if(CollationFCD::hasTccc(c)) { |
| 115 if(CollationFCD::maybeTibetanCompositeVowel(c) || |
| 116 CollationFCD::hasLccc(iter.current(&iter))) { |
| 117 iter.previous(&iter); |
| 118 if(!nextSegment(errorCode)) { |
| 119 c = U_SENTINEL; |
| 120 return Collation::FALLBACK_CE32; |
| 121 } |
| 122 continue; |
| 123 } |
| 124 } |
| 125 break; |
| 126 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) { |
| 127 c = iter.next(&iter); |
| 128 ++pos; |
| 129 U_ASSERT(c >= 0); |
| 130 break; |
| 131 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length())
{ |
| 132 c = normalized[pos++]; |
| 133 break; |
| 134 } else { |
| 135 switchToForward(); |
| 136 } |
| 137 } |
| 138 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); |
| 139 } |
| 140 |
| 141 UChar |
| 142 FCDUIterCollationIterator::handleGetTrailSurrogate() { |
| 143 if(state <= ITER_IN_FCD_SEGMENT) { |
| 144 UChar32 trail = iter.next(&iter); |
| 145 if(U16_IS_TRAIL(trail)) { |
| 146 if(state == ITER_IN_FCD_SEGMENT) { ++pos; } |
| 147 } else if(trail >= 0) { |
| 148 iter.previous(&iter); |
| 149 } |
| 150 return (UChar)trail; |
| 151 } else { |
| 152 U_ASSERT(pos < normalized.length()); |
| 153 UChar trail; |
| 154 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } |
| 155 return trail; |
| 156 } |
| 157 } |
| 158 |
| 159 UChar32 |
| 160 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) { |
| 161 UChar32 c; |
| 162 for(;;) { |
| 163 if(state == ITER_CHECK_FWD) { |
| 164 c = iter.next(&iter); |
| 165 if(c < 0) { |
| 166 return c; |
| 167 } |
| 168 if(CollationFCD::hasTccc(c)) { |
| 169 if(CollationFCD::maybeTibetanCompositeVowel(c) || |
| 170 CollationFCD::hasLccc(iter.current(&iter))) { |
| 171 iter.previous(&iter); |
| 172 if(!nextSegment(errorCode)) { |
| 173 return U_SENTINEL; |
| 174 } |
| 175 continue; |
| 176 } |
| 177 } |
| 178 if(U16_IS_LEAD(c)) { |
| 179 UChar32 trail = iter.next(&iter); |
| 180 if(U16_IS_TRAIL(trail)) { |
| 181 return U16_GET_SUPPLEMENTARY(c, trail); |
| 182 } else if(trail >= 0) { |
| 183 iter.previous(&iter); |
| 184 } |
| 185 } |
| 186 return c; |
| 187 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) { |
| 188 c = uiter_next32(&iter); |
| 189 pos += U16_LENGTH(c); |
| 190 U_ASSERT(c >= 0); |
| 191 return c; |
| 192 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length())
{ |
| 193 c = normalized.char32At(pos); |
| 194 pos += U16_LENGTH(c); |
| 195 return c; |
| 196 } else { |
| 197 switchToForward(); |
| 198 } |
| 199 } |
| 200 } |
| 201 |
| 202 UChar32 |
| 203 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) { |
| 204 UChar32 c; |
| 205 for(;;) { |
| 206 if(state == ITER_CHECK_BWD) { |
| 207 c = iter.previous(&iter); |
| 208 if(c < 0) { |
| 209 start = pos = 0; |
| 210 state = ITER_IN_FCD_SEGMENT; |
| 211 return U_SENTINEL; |
| 212 } |
| 213 if(CollationFCD::hasLccc(c)) { |
| 214 UChar32 prev = U_SENTINEL; |
| 215 if(CollationFCD::maybeTibetanCompositeVowel(c) || |
| 216 CollationFCD::hasTccc(prev = iter.previous(&iter))) { |
| 217 iter.next(&iter); |
| 218 if(prev >= 0) { |
| 219 iter.next(&iter); |
| 220 } |
| 221 if(!previousSegment(errorCode)) { |
| 222 return U_SENTINEL; |
| 223 } |
| 224 continue; |
| 225 } |
| 226 // hasLccc(trail)=true for all trail surrogates |
| 227 if(U16_IS_TRAIL(c)) { |
| 228 if(prev < 0) { |
| 229 prev = iter.previous(&iter); |
| 230 } |
| 231 if(U16_IS_LEAD(prev)) { |
| 232 return U16_GET_SUPPLEMENTARY(prev, c); |
| 233 } |
| 234 } |
| 235 if(prev >= 0) { |
| 236 iter.next(&iter); |
| 237 } |
| 238 } |
| 239 return c; |
| 240 } else if(state == ITER_IN_FCD_SEGMENT && pos != start) { |
| 241 c = uiter_previous32(&iter); |
| 242 pos -= U16_LENGTH(c); |
| 243 U_ASSERT(c >= 0); |
| 244 return c; |
| 245 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) { |
| 246 c = normalized.char32At(pos - 1); |
| 247 pos -= U16_LENGTH(c); |
| 248 return c; |
| 249 } else { |
| 250 switchToBackward(); |
| 251 } |
| 252 } |
| 253 } |
| 254 |
| 255 void |
| 256 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCo
de) { |
| 257 // Specify the class to avoid a virtual-function indirection. |
| 258 // In Java, we would declare this class final. |
| 259 while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) { |
| 260 --num; |
| 261 } |
| 262 } |
| 263 |
| 264 void |
| 265 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorC
ode) { |
| 266 // Specify the class to avoid a virtual-function indirection. |
| 267 // In Java, we would declare this class final. |
| 268 while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >=
0) { |
| 269 --num; |
| 270 } |
| 271 } |
| 272 |
| 273 void |
| 274 FCDUIterCollationIterator::switchToForward() { |
| 275 U_ASSERT(state == ITER_CHECK_BWD || |
| 276 (state == ITER_IN_FCD_SEGMENT && pos == limit) || |
| 277 (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length())); |
| 278 if(state == ITER_CHECK_BWD) { |
| 279 // Turn around from backward checking. |
| 280 start = pos = iter.getIndex(&iter, UITER_CURRENT); |
| 281 if(pos == limit) { |
| 282 state = ITER_CHECK_FWD; // Check forward. |
| 283 } else { // pos < limit |
| 284 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment. |
| 285 } |
| 286 } else { |
| 287 // Reached the end of the FCD segment. |
| 288 if(state == ITER_IN_FCD_SEGMENT) { |
| 289 // The input text segment is FCD, extend it forward. |
| 290 } else { |
| 291 // The input text segment needed to be normalized. |
| 292 // Switch to checking forward from it. |
| 293 if(state == IN_NORM_ITER_AT_START) { |
| 294 iter.move(&iter, limit - start, UITER_CURRENT); |
| 295 } |
| 296 start = limit; |
| 297 } |
| 298 state = ITER_CHECK_FWD; |
| 299 } |
| 300 } |
| 301 |
| 302 UBool |
| 303 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) { |
| 304 if(U_FAILURE(errorCode)) { return FALSE; } |
| 305 U_ASSERT(state == ITER_CHECK_FWD); |
| 306 // The input text [start..(iter index)[ passes the FCD check. |
| 307 pos = iter.getIndex(&iter, UITER_CURRENT); |
| 308 // Collect the characters being checked, in case they need to be normalized. |
| 309 UnicodeString s; |
| 310 uint8_t prevCC = 0; |
| 311 for(;;) { |
| 312 // Fetch the next character and its fcd16 value. |
| 313 UChar32 c = uiter_next32(&iter); |
| 314 if(c < 0) { break; } |
| 315 uint16_t fcd16 = nfcImpl.getFCD16(c); |
| 316 uint8_t leadCC = (uint8_t)(fcd16 >> 8); |
| 317 if(leadCC == 0 && !s.isEmpty()) { |
| 318 // FCD boundary before this character. |
| 319 uiter_previous32(&iter); |
| 320 break; |
| 321 } |
| 322 s.append(c); |
| 323 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanComp
ositeVowel(fcd16))) { |
| 324 // Fails FCD check. Find the next FCD boundary and normalize. |
| 325 for(;;) { |
| 326 c = uiter_next32(&iter); |
| 327 if(c < 0) { break; } |
| 328 if(nfcImpl.getFCD16(c) <= 0xff) { |
| 329 uiter_previous32(&iter); |
| 330 break; |
| 331 } |
| 332 s.append(c); |
| 333 } |
| 334 if(!normalize(s, errorCode)) { return FALSE; } |
| 335 start = pos; |
| 336 limit = pos + s.length(); |
| 337 state = IN_NORM_ITER_AT_LIMIT; |
| 338 pos = 0; |
| 339 return TRUE; |
| 340 } |
| 341 prevCC = (uint8_t)fcd16; |
| 342 if(prevCC == 0) { |
| 343 // FCD boundary after the last character. |
| 344 break; |
| 345 } |
| 346 } |
| 347 limit = pos + s.length(); |
| 348 U_ASSERT(pos != limit); |
| 349 iter.move(&iter, -s.length(), UITER_CURRENT); |
| 350 state = ITER_IN_FCD_SEGMENT; |
| 351 return TRUE; |
| 352 } |
| 353 |
| 354 void |
| 355 FCDUIterCollationIterator::switchToBackward() { |
| 356 U_ASSERT(state == ITER_CHECK_FWD || |
| 357 (state == ITER_IN_FCD_SEGMENT && pos == start) || |
| 358 (state >= IN_NORM_ITER_AT_LIMIT && pos == 0)); |
| 359 if(state == ITER_CHECK_FWD) { |
| 360 // Turn around from forward checking. |
| 361 limit = pos = iter.getIndex(&iter, UITER_CURRENT); |
| 362 if(pos == start) { |
| 363 state = ITER_CHECK_BWD; // Check backward. |
| 364 } else { // pos > start |
| 365 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment. |
| 366 } |
| 367 } else { |
| 368 // Reached the start of the FCD segment. |
| 369 if(state == ITER_IN_FCD_SEGMENT) { |
| 370 // The input text segment is FCD, extend it backward. |
| 371 } else { |
| 372 // The input text segment needed to be normalized. |
| 373 // Switch to checking backward from it. |
| 374 if(state == IN_NORM_ITER_AT_LIMIT) { |
| 375 iter.move(&iter, start - limit, UITER_CURRENT); |
| 376 } |
| 377 limit = start; |
| 378 } |
| 379 state = ITER_CHECK_BWD; |
| 380 } |
| 381 } |
| 382 |
| 383 UBool |
| 384 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) { |
| 385 if(U_FAILURE(errorCode)) { return FALSE; } |
| 386 U_ASSERT(state == ITER_CHECK_BWD); |
| 387 // The input text [(iter index)..limit[ passes the FCD check. |
| 388 pos = iter.getIndex(&iter, UITER_CURRENT); |
| 389 // Collect the characters being checked, in case they need to be normalized. |
| 390 UnicodeString s; |
| 391 uint8_t nextCC = 0; |
| 392 for(;;) { |
| 393 // Fetch the previous character and its fcd16 value. |
| 394 UChar32 c = uiter_previous32(&iter); |
| 395 if(c < 0) { break; } |
| 396 uint16_t fcd16 = nfcImpl.getFCD16(c); |
| 397 uint8_t trailCC = (uint8_t)fcd16; |
| 398 if(trailCC == 0 && !s.isEmpty()) { |
| 399 // FCD boundary after this character. |
| 400 uiter_next32(&iter); |
| 401 break; |
| 402 } |
| 403 s.append(c); |
| 404 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || |
| 405 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))
) { |
| 406 // Fails FCD check. Find the previous FCD boundary and normalize. |
| 407 while(fcd16 > 0xff) { |
| 408 c = uiter_previous32(&iter); |
| 409 if(c < 0) { break; } |
| 410 fcd16 = nfcImpl.getFCD16(c); |
| 411 if(fcd16 == 0) { |
| 412 (void)uiter_next32(&iter); |
| 413 break; |
| 414 } |
| 415 s.append(c); |
| 416 } |
| 417 s.reverse(); |
| 418 if(!normalize(s, errorCode)) { return FALSE; } |
| 419 limit = pos; |
| 420 start = pos - s.length(); |
| 421 state = IN_NORM_ITER_AT_START; |
| 422 pos = normalized.length(); |
| 423 return TRUE; |
| 424 } |
| 425 nextCC = (uint8_t)(fcd16 >> 8); |
| 426 if(nextCC == 0) { |
| 427 // FCD boundary before the following character. |
| 428 break; |
| 429 } |
| 430 } |
| 431 start = pos - s.length(); |
| 432 U_ASSERT(pos != start); |
| 433 iter.move(&iter, s.length(), UITER_CURRENT); |
| 434 state = ITER_IN_FCD_SEGMENT; |
| 435 return TRUE; |
| 436 } |
| 437 |
| 438 UBool |
| 439 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCo
de) { |
| 440 // NFD without argument checking. |
| 441 U_ASSERT(U_SUCCESS(errorCode)); |
| 442 nfcImpl.decompose(s, normalized, errorCode); |
| 443 return U_SUCCESS(errorCode); |
| 444 } |
| 445 |
| 446 U_NAMESPACE_END |
| 447 |
| 448 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |