OLD | NEW |
(Empty) | |
| 1 /* |
| 2 *************************************************************************** |
| 3 * Copyright (C) 1999-2010 International Business Machines Corporation |
| 4 * and others. All rights reserved. |
| 5 *************************************************************************** |
| 6 */ |
| 7 // |
| 8 // file: rbbi.c Contains the implementation of the rule based break iterato
r |
| 9 // runtime engine and the API implementation for |
| 10 // class RuleBasedBreakIterator |
| 11 // |
| 12 |
| 13 #include <typeinfo> // for 'typeid' to work |
| 14 |
| 15 #include "unicode/utypes.h" |
| 16 |
| 17 #if !UCONFIG_NO_BREAK_ITERATION |
| 18 |
| 19 #include "unicode/rbbi.h" |
| 20 #include "unicode/schriter.h" |
| 21 #include "unicode/uchriter.h" |
| 22 #include "unicode/udata.h" |
| 23 #include "unicode/uclean.h" |
| 24 #include "rbbidata.h" |
| 25 #include "rbbirb.h" |
| 26 #include "cmemory.h" |
| 27 #include "cstring.h" |
| 28 #include "umutex.h" |
| 29 #include "ucln_cmn.h" |
| 30 #include "brkeng.h" |
| 31 |
| 32 #include "uassert.h" |
| 33 #include "uvector.h" |
| 34 |
| 35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be inclu
ded. |
| 36 #if U_LOCAL_SERVICE_HOOK |
| 37 #include "localsvc.h" |
| 38 #endif |
| 39 |
| 40 #ifdef RBBI_DEBUG |
| 41 static UBool fTrace = FALSE; |
| 42 #endif |
| 43 |
| 44 U_NAMESPACE_BEGIN |
| 45 |
| 46 // The state number of the starting state |
| 47 #define START_STATE 1 |
| 48 |
| 49 // The state-transition value indicating "stop" |
| 50 #define STOP_STATE 0 |
| 51 |
| 52 |
| 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) |
| 54 |
| 55 |
| 56 //======================================================================= |
| 57 // constructors |
| 58 //======================================================================= |
| 59 |
| 60 /** |
| 61 * Constructs a RuleBasedBreakIterator that uses the already-created |
| 62 * tables object that is passed in as a parameter. |
| 63 */ |
| 64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
&status) |
| 65 { |
| 66 init(); |
| 67 fData = new RBBIDataWrapper(data, status); // status checked in constructor |
| 68 if (U_FAILURE(status)) {return;} |
| 69 if(fData == 0) { |
| 70 status = U_MEMORY_ALLOCATION_ERROR; |
| 71 return; |
| 72 } |
| 73 } |
| 74 |
| 75 /** |
| 76 * Same as above but does not adopt memory |
| 77 */ |
| 78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum
EDontAdopt, UErrorCode &status) |
| 79 { |
| 80 init(); |
| 81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // s
tatus checked in constructor |
| 82 if (U_FAILURE(status)) {return;} |
| 83 if(fData == 0) { |
| 84 status = U_MEMORY_ALLOCATION_ERROR; |
| 85 return; |
| 86 } |
| 87 } |
| 88 |
| 89 //------------------------------------------------------------------------------
- |
| 90 // |
| 91 // Constructor from a UDataMemory handle to precompiled break rules |
| 92 // stored in an ICU data file. |
| 93 // |
| 94 //------------------------------------------------------------------------------
- |
| 95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
tus) |
| 96 { |
| 97 init(); |
| 98 fData = new RBBIDataWrapper(udm, status); // status checked in constructor |
| 99 if (U_FAILURE(status)) {return;} |
| 100 if(fData == 0) { |
| 101 status = U_MEMORY_ALLOCATION_ERROR; |
| 102 return; |
| 103 } |
| 104 } |
| 105 |
| 106 |
| 107 |
| 108 //------------------------------------------------------------------------------
- |
| 109 // |
| 110 // Constructor from a set of rules supplied as a string. |
| 111 // |
| 112 //------------------------------------------------------------------------------
- |
| 113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, |
| 114 UParseError &parseError
, |
| 115 UErrorCode &status) |
| 116 { |
| 117 init(); |
| 118 if (U_FAILURE(status)) {return;} |
| 119 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) |
| 120 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status
); |
| 121 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method
that |
| 122 // creates and returns a complete RBBI. From here, in a constructor,
we |
| 123 // can't just return the object created by the builder factory, hence |
| 124 // the assignment of the factory created object to "this". |
| 125 if (U_SUCCESS(status)) { |
| 126 *this = *bi; |
| 127 delete bi; |
| 128 } |
| 129 } |
| 130 |
| 131 |
| 132 //------------------------------------------------------------------------------
- |
| 133 // |
| 134 // Default Constructor. Create an empty shell that can be set up later. |
| 135 // Used when creating a RuleBasedBreakIterator from a
set |
| 136 // of rules. |
| 137 //------------------------------------------------------------------------------
- |
| 138 RuleBasedBreakIterator::RuleBasedBreakIterator() { |
| 139 init(); |
| 140 } |
| 141 |
| 142 |
| 143 //------------------------------------------------------------------------------
- |
| 144 // |
| 145 // Copy constructor. Will produce a break iterator with the same behavior, |
| 146 // and which iterates over the same text, as the one passed
in. |
| 147 // |
| 148 //------------------------------------------------------------------------------
- |
| 149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
er) |
| 150 : BreakIterator(other) |
| 151 { |
| 152 this->init(); |
| 153 *this = other; |
| 154 } |
| 155 |
| 156 |
| 157 /** |
| 158 * Destructor |
| 159 */ |
| 160 RuleBasedBreakIterator::~RuleBasedBreakIterator() { |
| 161 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
| 162 // fCharIter was adopted from the outside. |
| 163 delete fCharIter; |
| 164 } |
| 165 fCharIter = NULL; |
| 166 delete fSCharIter; |
| 167 fCharIter = NULL; |
| 168 delete fDCharIter; |
| 169 fDCharIter = NULL; |
| 170 |
| 171 utext_close(fText); |
| 172 |
| 173 if (fData != NULL) { |
| 174 fData->removeReference(); |
| 175 fData = NULL; |
| 176 } |
| 177 if (fCachedBreakPositions) { |
| 178 uprv_free(fCachedBreakPositions); |
| 179 fCachedBreakPositions = NULL; |
| 180 } |
| 181 if (fLanguageBreakEngines) { |
| 182 delete fLanguageBreakEngines; |
| 183 fLanguageBreakEngines = NULL; |
| 184 } |
| 185 if (fUnhandledBreakEngine) { |
| 186 delete fUnhandledBreakEngine; |
| 187 fUnhandledBreakEngine = NULL; |
| 188 } |
| 189 } |
| 190 |
| 191 /** |
| 192 * Assignment operator. Sets this iterator to have the same behavior, |
| 193 * and iterate over the same text, as the one passed in. |
| 194 */ |
| 195 RuleBasedBreakIterator& |
| 196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { |
| 197 if (this == &that) { |
| 198 return *this; |
| 199 } |
| 200 reset(); // Delete break cache information |
| 201 fBreakType = that.fBreakType; |
| 202 if (fLanguageBreakEngines != NULL) { |
| 203 delete fLanguageBreakEngines; |
| 204 fLanguageBreakEngines = NULL; // Just rebuild for now |
| 205 } |
| 206 // TODO: clone fLanguageBreakEngines from "that" |
| 207 UErrorCode status = U_ZERO_ERROR; |
| 208 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); |
| 209 |
| 210 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
| 211 delete fCharIter; |
| 212 } |
| 213 fCharIter = NULL; |
| 214 |
| 215 if (that.fCharIter != NULL ) { |
| 216 // This is a little bit tricky - it will intially appear that |
| 217 // this->fCharIter is adopted, even if that->fCharIter was |
| 218 // not adopted. That's ok. |
| 219 fCharIter = that.fCharIter->clone(); |
| 220 } |
| 221 |
| 222 if (fData != NULL) { |
| 223 fData->removeReference(); |
| 224 fData = NULL; |
| 225 } |
| 226 if (that.fData != NULL) { |
| 227 fData = that.fData->addReference(); |
| 228 } |
| 229 |
| 230 return *this; |
| 231 } |
| 232 |
| 233 |
| 234 |
| 235 //----------------------------------------------------------------------------- |
| 236 // |
| 237 // init() Shared initialization routine. Used by all the constructors. |
| 238 // Initializes all fields, leaving the object in a consistent sta
te. |
| 239 // |
| 240 //----------------------------------------------------------------------------- |
| 241 void RuleBasedBreakIterator::init() { |
| 242 UErrorCode status = U_ZERO_ERROR; |
| 243 fBufferClone = FALSE; |
| 244 fText = utext_openUChars(NULL, NULL, 0, &status); |
| 245 fCharIter = NULL; |
| 246 fSCharIter = NULL; |
| 247 fDCharIter = NULL; |
| 248 fData = NULL; |
| 249 fLastRuleStatusIndex = 0; |
| 250 fLastStatusIndexValid = TRUE; |
| 251 fDictionaryCharCount = 0; |
| 252 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives re
asonable |
| 253 // dictionary behavior for Break Itera
tors that are |
| 254 // built from rules. Even better woul
d be the ability to |
| 255 // declare the type in the rules. |
| 256 |
| 257 fCachedBreakPositions = NULL; |
| 258 fLanguageBreakEngines = NULL; |
| 259 fUnhandledBreakEngine = NULL; |
| 260 fNumCachedBreakPositions = 0; |
| 261 fPositionInCache = 0; |
| 262 |
| 263 #ifdef RBBI_DEBUG |
| 264 static UBool debugInitDone = FALSE; |
| 265 if (debugInitDone == FALSE) { |
| 266 char *debugEnv = getenv("U_RBBIDEBUG"); |
| 267 if (debugEnv && uprv_strstr(debugEnv, "trace")) { |
| 268 fTrace = TRUE; |
| 269 } |
| 270 debugInitDone = TRUE; |
| 271 } |
| 272 #endif |
| 273 } |
| 274 |
| 275 |
| 276 |
| 277 //----------------------------------------------------------------------------- |
| 278 // |
| 279 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same |
| 280 // behavior, and iterating over the same text, as this one. |
| 281 // Virtual function: does the right thing with subclasses. |
| 282 // |
| 283 //----------------------------------------------------------------------------- |
| 284 BreakIterator* |
| 285 RuleBasedBreakIterator::clone(void) const { |
| 286 return new RuleBasedBreakIterator(*this); |
| 287 } |
| 288 |
| 289 /** |
| 290 * Equality operator. Returns TRUE if both BreakIterators are of the |
| 291 * same class, have the same behavior, and iterate over the same text. |
| 292 */ |
| 293 UBool |
| 294 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { |
| 295 if (typeid(*this) != typeid(that)) { |
| 296 return FALSE; |
| 297 } |
| 298 |
| 299 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; |
| 300 |
| 301 if (!utext_equals(fText, that2.fText)) { |
| 302 // The two break iterators are operating on different text, |
| 303 // or have a different interation position. |
| 304 return FALSE; |
| 305 }; |
| 306 |
| 307 // TODO: need a check for when in a dictionary region at different offsets. |
| 308 |
| 309 if (that2.fData == fData || |
| 310 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { |
| 311 // The two break iterators are using the same rules. |
| 312 return TRUE; |
| 313 } |
| 314 return FALSE; |
| 315 } |
| 316 |
| 317 /** |
| 318 * Compute a hash code for this BreakIterator |
| 319 * @return A hash code |
| 320 */ |
| 321 int32_t |
| 322 RuleBasedBreakIterator::hashCode(void) const { |
| 323 int32_t hash = 0; |
| 324 if (fData != NULL) { |
| 325 hash = fData->hashCode(); |
| 326 } |
| 327 return hash; |
| 328 } |
| 329 |
| 330 |
| 331 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { |
| 332 if (U_FAILURE(status)) { |
| 333 return; |
| 334 } |
| 335 reset(); |
| 336 fText = utext_clone(fText, ut, FALSE, TRUE, &status); |
| 337 |
| 338 // Set up a dummy CharacterIterator to be returned if anyone |
| 339 // calls getText(). With input from UText, there is no reasonable |
| 340 // way to return a characterIterator over the actual input text. |
| 341 // Return one over an empty string instead - this is the closest |
| 342 // we can come to signaling a failure. |
| 343 // (GetText() is obsolete, this failure is sort of OK) |
| 344 if (fDCharIter == NULL) { |
| 345 static const UChar c = 0; |
| 346 fDCharIter = new UCharCharacterIterator(&c, 0); |
| 347 if (fDCharIter == NULL) { |
| 348 status = U_MEMORY_ALLOCATION_ERROR; |
| 349 return; |
| 350 } |
| 351 } |
| 352 |
| 353 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
| 354 // existing fCharIter was adopted from the outside. Delete it now. |
| 355 delete fCharIter; |
| 356 } |
| 357 fCharIter = fDCharIter; |
| 358 |
| 359 this->first(); |
| 360 } |
| 361 |
| 362 |
| 363 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const
{ |
| 364 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); |
| 365 return result; |
| 366 } |
| 367 |
| 368 |
| 369 |
| 370 /** |
| 371 * Returns the description used to create this iterator |
| 372 */ |
| 373 const UnicodeString& |
| 374 RuleBasedBreakIterator::getRules() const { |
| 375 if (fData != NULL) { |
| 376 return fData->getRuleSourceString(); |
| 377 } else { |
| 378 static const UnicodeString *s; |
| 379 if (s == NULL) { |
| 380 // TODO: something more elegant here. |
| 381 // perhaps API should return the string by value. |
| 382 // Note: thread unsafe init & leak are semi-ok, better than |
| 383 // what was before. Sould be cleaned up, though. |
| 384 s = new UnicodeString; |
| 385 } |
| 386 return *s; |
| 387 } |
| 388 } |
| 389 |
| 390 //======================================================================= |
| 391 // BreakIterator overrides |
| 392 //======================================================================= |
| 393 |
| 394 /** |
| 395 * Return a CharacterIterator over the text being analyzed. |
| 396 */ |
| 397 CharacterIterator& |
| 398 RuleBasedBreakIterator::getText() const { |
| 399 return *fCharIter; |
| 400 } |
| 401 |
| 402 /** |
| 403 * Set the iterator to analyze a new piece of text. This function resets |
| 404 * the current iteration position to the beginning of the text. |
| 405 * @param newText An iterator over the text to analyze. |
| 406 */ |
| 407 void |
| 408 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { |
| 409 // If we are holding a CharacterIterator adopted from a |
| 410 // previous call to this function, delete it now. |
| 411 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
| 412 delete fCharIter; |
| 413 } |
| 414 |
| 415 fCharIter = newText; |
| 416 UErrorCode status = U_ZERO_ERROR; |
| 417 reset(); |
| 418 if (newText==NULL || newText->startIndex() != 0) { |
| 419 // startIndex !=0 wants to be an error, but there's no way to report it. |
| 420 // Make the iterator text be an empty string. |
| 421 fText = utext_openUChars(fText, NULL, 0, &status); |
| 422 } else { |
| 423 fText = utext_openCharacterIterator(fText, newText, &status); |
| 424 } |
| 425 this->first(); |
| 426 } |
| 427 |
| 428 /** |
| 429 * Set the iterator to analyze a new piece of text. This function resets |
| 430 * the current iteration position to the beginning of the text. |
| 431 * @param newText An iterator over the text to analyze. |
| 432 */ |
| 433 void |
| 434 RuleBasedBreakIterator::setText(const UnicodeString& newText) { |
| 435 UErrorCode status = U_ZERO_ERROR; |
| 436 reset(); |
| 437 fText = utext_openConstUnicodeString(fText, &newText, &status); |
| 438 |
| 439 // Set up a character iterator on the string. |
| 440 // Needed in case someone calls getText(). |
| 441 // Can not, unfortunately, do this lazily on the (probably never) |
| 442 // call to getText(), because getText is const. |
| 443 if (fSCharIter == NULL) { |
| 444 fSCharIter = new StringCharacterIterator(newText); |
| 445 } else { |
| 446 fSCharIter->setText(newText); |
| 447 } |
| 448 |
| 449 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
| 450 // old fCharIter was adopted from the outside. Delete it. |
| 451 delete fCharIter; |
| 452 } |
| 453 fCharIter = fSCharIter; |
| 454 |
| 455 this->first(); |
| 456 } |
| 457 |
| 458 |
| 459 |
| 460 /** |
| 461 * Sets the current iteration position to the beginning of the text. |
| 462 * @return The offset of the beginning of the text. |
| 463 */ |
| 464 int32_t RuleBasedBreakIterator::first(void) { |
| 465 reset(); |
| 466 fLastRuleStatusIndex = 0; |
| 467 fLastStatusIndexValid = TRUE; |
| 468 //if (fText == NULL) |
| 469 // return BreakIterator::DONE; |
| 470 |
| 471 utext_setNativeIndex(fText, 0); |
| 472 return 0; |
| 473 } |
| 474 |
| 475 /** |
| 476 * Sets the current iteration position to the end of the text. |
| 477 * @return The text's past-the-end offset. |
| 478 */ |
| 479 int32_t RuleBasedBreakIterator::last(void) { |
| 480 reset(); |
| 481 if (fText == NULL) { |
| 482 fLastRuleStatusIndex = 0; |
| 483 fLastStatusIndexValid = TRUE; |
| 484 return BreakIterator::DONE; |
| 485 } |
| 486 |
| 487 fLastStatusIndexValid = FALSE; |
| 488 int32_t pos = (int32_t)utext_nativeLength(fText); |
| 489 utext_setNativeIndex(fText, pos); |
| 490 return pos; |
| 491 } |
| 492 |
| 493 /** |
| 494 * Advances the iterator either forward or backward the specified number of step
s. |
| 495 * Negative values move backward, and positive values move forward. This is |
| 496 * equivalent to repeatedly calling next() or previous(). |
| 497 * @param n The number of steps to move. The sign indicates the direction |
| 498 * (negative is backwards, and positive is forwards). |
| 499 * @return The character offset of the boundary position n boundaries away from |
| 500 * the current one. |
| 501 */ |
| 502 int32_t RuleBasedBreakIterator::next(int32_t n) { |
| 503 int32_t result = current(); |
| 504 while (n > 0) { |
| 505 result = next(); |
| 506 --n; |
| 507 } |
| 508 while (n < 0) { |
| 509 result = previous(); |
| 510 ++n; |
| 511 } |
| 512 return result; |
| 513 } |
| 514 |
| 515 /** |
| 516 * Advances the iterator to the next boundary position. |
| 517 * @return The position of the first boundary after this one. |
| 518 */ |
| 519 int32_t RuleBasedBreakIterator::next(void) { |
| 520 // if we have cached break positions and we're still in the range |
| 521 // covered by them, just move one step forward in the cache |
| 522 if (fCachedBreakPositions != NULL) { |
| 523 if (fPositionInCache < fNumCachedBreakPositions - 1) { |
| 524 ++fPositionInCache; |
| 525 int32_t pos = fCachedBreakPositions[fPositionInCache]; |
| 526 utext_setNativeIndex(fText, pos); |
| 527 return pos; |
| 528 } |
| 529 else { |
| 530 reset(); |
| 531 } |
| 532 } |
| 533 |
| 534 int32_t startPos = current(); |
| 535 int32_t result = handleNext(fData->fForwardTable); |
| 536 if (fDictionaryCharCount > 0) { |
| 537 result = checkDictionary(startPos, result, FALSE); |
| 538 } |
| 539 return result; |
| 540 } |
| 541 |
| 542 /** |
| 543 * Advances the iterator backwards, to the last boundary preceding this one. |
| 544 * @return The position of the last boundary position preceding this one. |
| 545 */ |
| 546 int32_t RuleBasedBreakIterator::previous(void) { |
| 547 int32_t result; |
| 548 int32_t startPos; |
| 549 |
| 550 // if we have cached break positions and we're still in the range |
| 551 // covered by them, just move one step backward in the cache |
| 552 if (fCachedBreakPositions != NULL) { |
| 553 if (fPositionInCache > 0) { |
| 554 --fPositionInCache; |
| 555 // If we're at the beginning of the cache, need to reevaluate the |
| 556 // rule status |
| 557 if (fPositionInCache <= 0) { |
| 558 fLastStatusIndexValid = FALSE; |
| 559 } |
| 560 int32_t pos = fCachedBreakPositions[fPositionInCache]; |
| 561 utext_setNativeIndex(fText, pos); |
| 562 return pos; |
| 563 } |
| 564 else { |
| 565 reset(); |
| 566 } |
| 567 } |
| 568 |
| 569 // if we're already sitting at the beginning of the text, return DONE |
| 570 if (fText == NULL || (startPos = current()) == 0) { |
| 571 fLastRuleStatusIndex = 0; |
| 572 fLastStatusIndexValid = TRUE; |
| 573 return BreakIterator::DONE; |
| 574 } |
| 575 |
| 576 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { |
| 577 result = handlePrevious(fData->fReverseTable); |
| 578 if (fDictionaryCharCount > 0) { |
| 579 result = checkDictionary(result, startPos, TRUE); |
| 580 } |
| 581 return result; |
| 582 } |
| 583 |
| 584 // old rule syntax |
| 585 // set things up. handlePrevious() will back us up to some valid |
| 586 // break position before the current position (we back our internal |
| 587 // iterator up one step to prevent handlePrevious() from returning |
| 588 // the current position), but not necessarily the last one before |
| 589 |
| 590 // where we started |
| 591 |
| 592 int32_t start = current(); |
| 593 |
| 594 UTEXT_PREVIOUS32(fText); |
| 595 int32_t lastResult = handlePrevious(fData->fReverseTable); |
| 596 if (lastResult == UBRK_DONE) { |
| 597 lastResult = 0; |
| 598 utext_setNativeIndex(fText, 0); |
| 599 } |
| 600 result = lastResult; |
| 601 int32_t lastTag = 0; |
| 602 UBool breakTagValid = FALSE; |
| 603 |
| 604 // iterate forward from the known break position until we pass our |
| 605 // starting point. The last break position before the starting |
| 606 // point is our return value |
| 607 |
| 608 for (;;) { |
| 609 result = next(); |
| 610 if (result == BreakIterator::DONE || result >= start) { |
| 611 break; |
| 612 } |
| 613 lastResult = result; |
| 614 lastTag = fLastRuleStatusIndex; |
| 615 breakTagValid = TRUE; |
| 616 } |
| 617 |
| 618 // fLastBreakTag wants to have the value for section of text preceding |
| 619 // the result position that we are to return (in lastResult.) If |
| 620 // the backwards rules overshot and the above loop had to do two or more |
| 621 // next()s to move up to the desired return position, we will have a valid |
| 622 // tag value. But, if handlePrevious() took us to exactly the correct result
positon, |
| 623 // we wont have a tag value for that position, which is only set by handleNe
xt(). |
| 624 |
| 625 // set the current iteration position to be the last break position |
| 626 // before where we started, and then return that value |
| 627 utext_setNativeIndex(fText, lastResult); |
| 628 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() |
| 629 fLastStatusIndexValid = breakTagValid; |
| 630 |
| 631 // No need to check the dictionary; it will have been handled by |
| 632 // next() |
| 633 |
| 634 return lastResult; |
| 635 } |
| 636 |
| 637 /** |
| 638 * Sets the iterator to refer to the first boundary position following |
| 639 * the specified position. |
| 640 * @offset The position from which to begin searching for a break position. |
| 641 * @return The position of the first break after the current position. |
| 642 */ |
| 643 int32_t RuleBasedBreakIterator::following(int32_t offset) { |
| 644 // if we have cached break positions and offset is in the range |
| 645 // covered by them, use them |
| 646 // TODO: could use binary search |
| 647 // TODO: what if offset is outside range, but break is not? |
| 648 if (fCachedBreakPositions != NULL) { |
| 649 if (offset >= fCachedBreakPositions[0] |
| 650 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1])
{ |
| 651 fPositionInCache = 0; |
| 652 // We are guaranteed not to leave the array due to range test above |
| 653 while (offset >= fCachedBreakPositions[fPositionInCache]) { |
| 654 ++fPositionInCache; |
| 655 } |
| 656 int32_t pos = fCachedBreakPositions[fPositionInCache]; |
| 657 utext_setNativeIndex(fText, pos); |
| 658 return pos; |
| 659 } |
| 660 else { |
| 661 reset(); |
| 662 } |
| 663 } |
| 664 |
| 665 // if the offset passed in is already past the end of the text, |
| 666 // just return DONE; if it's before the beginning, return the |
| 667 // text's starting offset |
| 668 fLastRuleStatusIndex = 0; |
| 669 fLastStatusIndexValid = TRUE; |
| 670 if (fText == NULL || offset >= utext_nativeLength(fText)) { |
| 671 last(); |
| 672 return next(); |
| 673 } |
| 674 else if (offset < 0) { |
| 675 return first(); |
| 676 } |
| 677 |
| 678 // otherwise, set our internal iteration position (temporarily) |
| 679 // to the position passed in. If this is the _beginning_ position, |
| 680 // then we can just use next() to get our return value |
| 681 |
| 682 int32_t result = 0; |
| 683 |
| 684 if (fData->fSafeRevTable != NULL) { |
| 685 // new rule syntax |
| 686 utext_setNativeIndex(fText, offset); |
| 687 // move forward one codepoint to prepare for moving back to a |
| 688 // safe point. |
| 689 // this handles offset being between a supplementary character |
| 690 UTEXT_NEXT32(fText); |
| 691 // handlePrevious will move most of the time to < 1 boundary away |
| 692 handlePrevious(fData->fSafeRevTable); |
| 693 int32_t result = next(); |
| 694 while (result <= offset) { |
| 695 result = next(); |
| 696 } |
| 697 return result; |
| 698 } |
| 699 if (fData->fSafeFwdTable != NULL) { |
| 700 // backup plan if forward safe table is not available |
| 701 utext_setNativeIndex(fText, offset); |
| 702 UTEXT_PREVIOUS32(fText); |
| 703 // handle next will give result >= offset |
| 704 handleNext(fData->fSafeFwdTable); |
| 705 // previous will give result 0 or 1 boundary away from offset, |
| 706 // most of the time |
| 707 // we have to |
| 708 int32_t oldresult = previous(); |
| 709 while (oldresult > offset) { |
| 710 int32_t result = previous(); |
| 711 if (result <= offset) { |
| 712 return oldresult; |
| 713 } |
| 714 oldresult = result; |
| 715 } |
| 716 int32_t result = next(); |
| 717 if (result <= offset) { |
| 718 return next(); |
| 719 } |
| 720 return result; |
| 721 } |
| 722 // otherwise, we have to sync up first. Use handlePrevious() to back |
| 723 // up to a known break position before the specified position (if |
| 724 // we can determine that the specified position is a break position, |
| 725 // we don't back up at all). This may or may not be the last break |
| 726 // position at or before our starting position. Advance forward |
| 727 // from here until we've passed the starting position. The position |
| 728 // we stop on will be the first break position after the specified one. |
| 729 // old rule syntax |
| 730 |
| 731 utext_setNativeIndex(fText, offset); |
| 732 if (offset==0 || |
| 733 (offset==1 && utext_getNativeIndex(fText)==0)) { |
| 734 return next(); |
| 735 } |
| 736 result = previous(); |
| 737 |
| 738 while (result != BreakIterator::DONE && result <= offset) { |
| 739 result = next(); |
| 740 } |
| 741 |
| 742 return result; |
| 743 } |
| 744 |
| 745 /** |
| 746 * Sets the iterator to refer to the last boundary position before the |
| 747 * specified position. |
| 748 * @offset The position to begin searching for a break from. |
| 749 * @return The position of the last boundary before the starting position. |
| 750 */ |
| 751 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { |
| 752 // if we have cached break positions and offset is in the range |
| 753 // covered by them, use them |
| 754 if (fCachedBreakPositions != NULL) { |
| 755 // TODO: binary search? |
| 756 // TODO: What if offset is outside range, but break is not? |
| 757 if (offset > fCachedBreakPositions[0] |
| 758 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]
) { |
| 759 fPositionInCache = 0; |
| 760 while (fPositionInCache < fNumCachedBreakPositions |
| 761 && offset > fCachedBreakPositions[fPositionInCache]) |
| 762 ++fPositionInCache; |
| 763 --fPositionInCache; |
| 764 // If we're at the beginning of the cache, need to reevaluate the |
| 765 // rule status |
| 766 if (fPositionInCache <= 0) { |
| 767 fLastStatusIndexValid = FALSE; |
| 768 } |
| 769 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache])
; |
| 770 return fCachedBreakPositions[fPositionInCache]; |
| 771 } |
| 772 else { |
| 773 reset(); |
| 774 } |
| 775 } |
| 776 |
| 777 // if the offset passed in is already past the end of the text, |
| 778 // just return DONE; if it's before the beginning, return the |
| 779 // text's starting offset |
| 780 if (fText == NULL || offset > utext_nativeLength(fText)) { |
| 781 // return BreakIterator::DONE; |
| 782 return last(); |
| 783 } |
| 784 else if (offset < 0) { |
| 785 return first(); |
| 786 } |
| 787 |
| 788 // if we start by updating the current iteration position to the |
| 789 // position specified by the caller, we can just use previous() |
| 790 // to carry out this operation |
| 791 |
| 792 if (fData->fSafeFwdTable != NULL) { |
| 793 // new rule syntax |
| 794 utext_setNativeIndex(fText, offset); |
| 795 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 796 if (newOffset != offset) { |
| 797 // Will come here if specified offset was not a code point boundary
AND |
| 798 // the underlying implmentation is using UText, which snaps any no
n-code-point-boundary |
| 799 // indices to the containing code point. |
| 800 // For breakitereator::preceding only, these non-code-point indices
need to be moved |
| 801 // up to refer to the following codepoint. |
| 802 UTEXT_NEXT32(fText); |
| 803 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 804 } |
| 805 |
| 806 // TODO: (synwee) would it be better to just check for being in the mid
dle of a surrogate pair, |
| 807 // rather than adjusting the position unconditionally? |
| 808 // (Change would interact with safe rules.) |
| 809 // TODO: change RBBI behavior for off-boundary indices to match that of
UText? |
| 810 // affects only preceding(), seems cleaner, but is slightly diffe
rent. |
| 811 UTEXT_PREVIOUS32(fText); |
| 812 handleNext(fData->fSafeFwdTable); |
| 813 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 814 while (result >= offset) { |
| 815 result = previous(); |
| 816 } |
| 817 return result; |
| 818 } |
| 819 if (fData->fSafeRevTable != NULL) { |
| 820 // backup plan if forward safe table is not available |
| 821 // TODO: check whether this path can be discarded |
| 822 // It's probably OK to say that rules must supply both safe tabl
es |
| 823 // if they use safe tables at all. We have certainly never d
escribed |
| 824 // to anyone how to work with just one safe table. |
| 825 utext_setNativeIndex(fText, offset); |
| 826 UTEXT_NEXT32(fText); |
| 827 |
| 828 // handle previous will give result <= offset |
| 829 handlePrevious(fData->fSafeRevTable); |
| 830 |
| 831 // next will give result 0 or 1 boundary away from offset, |
| 832 // most of the time |
| 833 // we have to |
| 834 int32_t oldresult = next(); |
| 835 while (oldresult < offset) { |
| 836 int32_t result = next(); |
| 837 if (result >= offset) { |
| 838 return oldresult; |
| 839 } |
| 840 oldresult = result; |
| 841 } |
| 842 int32_t result = previous(); |
| 843 if (result >= offset) { |
| 844 return previous(); |
| 845 } |
| 846 return result; |
| 847 } |
| 848 |
| 849 // old rule syntax |
| 850 utext_setNativeIndex(fText, offset); |
| 851 return previous(); |
| 852 } |
| 853 |
| 854 /** |
| 855 * Returns true if the specfied position is a boundary position. As a side |
| 856 * effect, leaves the iterator pointing to the first boundary position at |
| 857 * or after "offset". |
| 858 * @param offset the offset to check. |
| 859 * @return True if "offset" is a boundary position. |
| 860 */ |
| 861 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { |
| 862 // the beginning index of the iterator is always a boundary position by defi
nition |
| 863 if (offset == 0) { |
| 864 first(); // For side effects on current position, tag values. |
| 865 return TRUE; |
| 866 } |
| 867 |
| 868 if (offset == (int32_t)utext_nativeLength(fText)) { |
| 869 last(); // For side effects on current position, tag values. |
| 870 return TRUE; |
| 871 } |
| 872 |
| 873 // out-of-range indexes are never boundary positions |
| 874 if (offset < 0) { |
| 875 first(); // For side effects on current position, tag values. |
| 876 return FALSE; |
| 877 } |
| 878 |
| 879 if (offset > utext_nativeLength(fText)) { |
| 880 last(); // For side effects on current position, tag values. |
| 881 return FALSE; |
| 882 } |
| 883 |
| 884 // otherwise, we can use following() on the position before the specified |
| 885 // one and return true if the position we get back is the one the user |
| 886 // specified |
| 887 utext_previous32From(fText, offset); |
| 888 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 889 UBool result = following(backOne) == offset; |
| 890 return result; |
| 891 } |
| 892 |
| 893 /** |
| 894 * Returns the current iteration position. |
| 895 * @return The current iteration position. |
| 896 */ |
| 897 int32_t RuleBasedBreakIterator::current(void) const { |
| 898 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 899 return pos; |
| 900 } |
| 901 |
| 902 //======================================================================= |
| 903 // implementation |
| 904 //======================================================================= |
| 905 |
//
//  RBBIRunMode - tracks which phase the break state machine is in.
//                The machine runs one extra iteration before the first
//                character and one after the last character of the user text;
//                characters are fetched from the text only in RBBI_RUN mode.
//
enum RBBIRunMode {
    RBBI_START = 0,   // extra iteration before the first char of input
    RBBI_RUN   = 1,   // processing characters of the user text
    RBBI_END   = 2    // extra iteration after the end of the user text
};
| 916 |
| 917 |
| 918 //------------------------------------------------------------------------------
----- |
| 919 // |
| 920 // handleNext(stateTable) |
| 921 // This method is the actual implementation of the rbbi next() method. |
| 922 // This method initializes the state machine to state 1 |
| 923 // and advances through the text character by character until we reach the e
nd |
| 924 // of the text or the state machine transitions to state 0. We update our r
eturn |
| 925 // value every time the state machine passes through an accepting state. |
| 926 // |
| 927 //------------------------------------------------------------------------------
----- |
| 928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { |
| 929 int32_t state; |
| 930 int16_t category = 0; |
| 931 RBBIRunMode mode; |
| 932 |
| 933 RBBIStateTableRow *row; |
| 934 UChar32 c; |
| 935 int32_t lookaheadStatus = 0; |
| 936 int32_t lookaheadTagIdx = 0; |
| 937 int32_t result = 0; |
| 938 int32_t initialPosition = 0; |
| 939 int32_t lookaheadResult = 0; |
| 940 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEA
D_HARD_BREAK) != 0; |
| 941 const char *tableData = statetable->fTableData; |
| 942 uint32_t tableRowLen = statetable->fRowLen; |
| 943 |
| 944 #ifdef RBBI_DEBUG |
| 945 if (fTrace) { |
| 946 RBBIDebugPuts("Handle Next pos char state category"); |
| 947 } |
| 948 #endif |
| 949 |
| 950 // No matter what, handleNext alway correctly sets the break tag value. |
| 951 fLastStatusIndexValid = TRUE; |
| 952 fLastRuleStatusIndex = 0; |
| 953 |
| 954 // if we're already at the end of the text, return DONE. |
| 955 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 956 result = initialPosition; |
| 957 c = UTEXT_NEXT32(fText); |
| 958 if (fData == NULL || c==U_SENTINEL) { |
| 959 return BreakIterator::DONE; |
| 960 } |
| 961 |
| 962 // Set the initial state for the state machine |
| 963 state = START_STATE; |
| 964 row = (RBBIStateTableRow *) |
| 965 //(statetable->fTableData + (statetable->fRowLen * state)); |
| 966 (tableData + tableRowLen * state); |
| 967 |
| 968 |
| 969 mode = RBBI_RUN; |
| 970 if (statetable->fFlags & RBBI_BOF_REQUIRED) { |
| 971 category = 2; |
| 972 mode = RBBI_START; |
| 973 } |
| 974 |
| 975 |
| 976 // loop until we reach the end of the text or transition to state 0 |
| 977 // |
| 978 for (;;) { |
| 979 if (c == U_SENTINEL) { |
| 980 // Reached end of input string. |
| 981 if (mode == RBBI_END) { |
| 982 // We have already run the loop one last time with the |
| 983 // character set to the psueudo {eof} value. Now it is time |
| 984 // to unconditionally bail out. |
| 985 if (lookaheadResult > result) { |
| 986 // We ran off the end of the string with a pending look-ahea
d match. |
| 987 // Treat this as if the look-ahead condition had been met, a
nd return |
| 988 // the match at the / position from the look-ahead rule. |
| 989 result = lookaheadResult; |
| 990 fLastRuleStatusIndex = lookaheadTagIdx; |
| 991 lookaheadStatus = 0; |
| 992 } |
| 993 break; |
| 994 } |
| 995 // Run the loop one last time with the fake end-of-input character c
ategory. |
| 996 mode = RBBI_END; |
| 997 category = 1; |
| 998 } |
| 999 |
| 1000 // |
| 1001 // Get the char category. An incoming category of 1 or 2 means that |
| 1002 // we are preset for doing the beginning or end of input, and |
| 1003 // that we shouldn't get a category from an actual text input chara
cter. |
| 1004 // |
| 1005 if (mode == RBBI_RUN) { |
| 1006 // look up the current character's character category, which tells u
s |
| 1007 // which column in the state table to look at. |
| 1008 // Note: the 16 in UTRIE_GET16 refers to the size of the data being
returned, |
| 1009 // not the size of the character going in, which is a UChar32
. |
| 1010 // |
| 1011 UTRIE_GET16(&fData->fTrie, c, category); |
| 1012 |
| 1013 // Check the dictionary bit in the character's category. |
| 1014 // Counter is only used by dictionary based iterators (subclasses
). |
| 1015 // Chars that need to be handled by a dictionary have a flag bit
set |
| 1016 // in their category values. |
| 1017 // |
| 1018 if ((category & 0x4000) != 0) { |
| 1019 fDictionaryCharCount++; |
| 1020 // And off the dictionary flag bit. |
| 1021 category &= ~0x4000; |
| 1022 } |
| 1023 } |
| 1024 |
| 1025 #ifdef RBBI_DEBUG |
| 1026 if (fTrace) { |
| 1027 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fTe
xt)); |
| 1028 if (0x20<=c && c<0x7f) { |
| 1029 RBBIDebugPrintf("\"%c\" ", c); |
| 1030 } else { |
| 1031 RBBIDebugPrintf("%5x ", c); |
| 1032 } |
| 1033 RBBIDebugPrintf("%3d %3d\n", state, category); |
| 1034 } |
| 1035 #endif |
| 1036 |
| 1037 // State Transition - move machine to its next state |
| 1038 // |
| 1039 state = row->fNextState[category]; |
| 1040 row = (RBBIStateTableRow *) |
| 1041 // (statetable->fTableData + (statetable->fRowLen * state)); |
| 1042 (tableData + tableRowLen * state); |
| 1043 |
| 1044 |
| 1045 if (row->fAccepting == -1) { |
| 1046 // Match found, common case. |
| 1047 if (mode != RBBI_START) { |
| 1048 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1049 } |
| 1050 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status
(tag) values. |
| 1051 } |
| 1052 |
| 1053 if (row->fLookAhead != 0) { |
| 1054 if (lookaheadStatus != 0 |
| 1055 && row->fAccepting == lookaheadStatus) { |
| 1056 // Lookahead match is completed. |
| 1057 result = lookaheadResult; |
| 1058 fLastRuleStatusIndex = lookaheadTagIdx; |
| 1059 lookaheadStatus = 0; |
| 1060 // TODO: make a standalone hard break in a rule work. |
| 1061 if (lookAheadHardBreak) { |
| 1062 UTEXT_SETNATIVEINDEX(fText, result); |
| 1063 return result; |
| 1064 } |
| 1065 // Look-ahead completed, but other rules may match further. Con
tinue on |
| 1066 // TODO: junk this feature? I don't think it's used anywhwere
. |
| 1067 goto continueOn; |
| 1068 } |
| 1069 |
| 1070 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1071 lookaheadResult = r; |
| 1072 lookaheadStatus = row->fLookAhead; |
| 1073 lookaheadTagIdx = row->fTagIdx; |
| 1074 goto continueOn; |
| 1075 } |
| 1076 |
| 1077 |
| 1078 if (row->fAccepting != 0) { |
| 1079 // Because this is an accepting state, any in-progress look-ahead ma
tch |
| 1080 // is no longer relavant. Clear out the pending lookahead status. |
| 1081 lookaheadStatus = 0; // clear out any pending look-ahead m
atch. |
| 1082 } |
| 1083 |
| 1084 continueOn: |
| 1085 if (state == STOP_STATE) { |
| 1086 // This is the normal exit from the lookup state machine. |
| 1087 // We have advanced through the string until it is certain that no |
| 1088 // longer match is possible, no matter what characters follow. |
| 1089 break; |
| 1090 } |
| 1091 |
| 1092 // Advance to the next character. |
| 1093 // If this is a beginning-of-input loop iteration, don't advance |
| 1094 // the input position. The next iteration will be processing the |
| 1095 // first real input character. |
| 1096 if (mode == RBBI_RUN) { |
| 1097 c = UTEXT_NEXT32(fText); |
| 1098 } else { |
| 1099 if (mode == RBBI_START) { |
| 1100 mode = RBBI_RUN; |
| 1101 } |
| 1102 } |
| 1103 |
| 1104 |
| 1105 } |
| 1106 |
| 1107 // The state machine is done. Check whether it found a match... |
| 1108 |
| 1109 // If the iterator failed to advance in the match engine, force it ahead by
one. |
| 1110 // (This really indicates a defect in the break rules. They should always
match |
| 1111 // at least one character.) |
| 1112 if (result == initialPosition) { |
| 1113 UTEXT_SETNATIVEINDEX(fText, initialPosition); |
| 1114 UTEXT_NEXT32(fText); |
| 1115 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1116 } |
| 1117 |
| 1118 // Leave the iterator at our result position. |
| 1119 UTEXT_SETNATIVEINDEX(fText, result); |
| 1120 #ifdef RBBI_DEBUG |
| 1121 if (fTrace) { |
| 1122 RBBIDebugPrintf("result = %d\n\n", result); |
| 1123 } |
| 1124 #endif |
| 1125 return result; |
| 1126 } |
| 1127 |
| 1128 |
| 1129 |
| 1130 //------------------------------------------------------------------------------
----- |
| 1131 // |
| 1132 // handlePrevious() |
| 1133 // |
| 1134 // Iterate backwards, according to the logic of the reverse rules. |
| 1135 // This version handles the exact style backwards rules. |
| 1136 // |
| 1137 // The logic of this function is very similar to handleNext(), above. |
| 1138 // |
| 1139 //------------------------------------------------------------------------------
----- |
| 1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
{ |
| 1141 int32_t state; |
| 1142 int16_t category = 0; |
| 1143 RBBIRunMode mode; |
| 1144 RBBIStateTableRow *row; |
| 1145 UChar32 c; |
| 1146 int32_t lookaheadStatus = 0; |
| 1147 int32_t result = 0; |
| 1148 int32_t initialPosition = 0; |
| 1149 int32_t lookaheadResult = 0; |
| 1150 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEA
D_HARD_BREAK) != 0; |
| 1151 |
| 1152 #ifdef RBBI_DEBUG |
| 1153 if (fTrace) { |
| 1154 RBBIDebugPuts("Handle Previous pos char state category"); |
| 1155 } |
| 1156 #endif |
| 1157 |
| 1158 // handlePrevious() never gets the rule status. |
| 1159 // Flag the status as invalid; if the user ever asks for status, we will nee
d |
| 1160 // to back up, then re-find the break position using handleNext(), which doe
s |
| 1161 // get the status value. |
| 1162 fLastStatusIndexValid = FALSE; |
| 1163 fLastRuleStatusIndex = 0; |
| 1164 |
| 1165 // if we're already at the start of the text, return DONE. |
| 1166 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { |
| 1167 return BreakIterator::DONE; |
| 1168 } |
| 1169 |
| 1170 // Set up the starting char. |
| 1171 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1172 result = initialPosition; |
| 1173 c = UTEXT_PREVIOUS32(fText); |
| 1174 |
| 1175 // Set the initial state for the state machine |
| 1176 state = START_STATE; |
| 1177 row = (RBBIStateTableRow *) |
| 1178 (statetable->fTableData + (statetable->fRowLen * state)); |
| 1179 category = 3; |
| 1180 mode = RBBI_RUN; |
| 1181 if (statetable->fFlags & RBBI_BOF_REQUIRED) { |
| 1182 category = 2; |
| 1183 mode = RBBI_START; |
| 1184 } |
| 1185 |
| 1186 |
| 1187 // loop until we reach the start of the text or transition to state 0 |
| 1188 // |
| 1189 for (;;) { |
| 1190 if (c == U_SENTINEL) { |
| 1191 // Reached end of input string. |
| 1192 if (mode == RBBI_END) { |
| 1193 // We have already run the loop one last time with the |
| 1194 // character set to the psueudo {eof} value. Now it is time |
| 1195 // to unconditionally bail out. |
| 1196 if (lookaheadResult < result) { |
| 1197 // We ran off the end of the string with a pending look-ahea
d match. |
| 1198 // Treat this as if the look-ahead condition had been met, a
nd return |
| 1199 // the match at the / position from the look-ahead rule. |
| 1200 result = lookaheadResult; |
| 1201 lookaheadStatus = 0; |
| 1202 } else if (result == initialPosition) { |
| 1203 // Ran off start, no match found. |
| 1204 // move one index one (towards the start, since we are doing
a previous()) |
| 1205 UTEXT_SETNATIVEINDEX(fText, initialPosition); |
| 1206 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary.
We're already at beginning. Check. |
| 1207 } |
| 1208 break; |
| 1209 } |
| 1210 // Run the loop one last time with the fake end-of-input character c
ategory. |
| 1211 mode = RBBI_END; |
| 1212 category = 1; |
| 1213 } |
| 1214 |
| 1215 // |
| 1216 // Get the char category. An incoming category of 1 or 2 means that |
| 1217 // we are preset for doing the beginning or end of input, and |
| 1218 // that we shouldn't get a category from an actual text input chara
cter. |
| 1219 // |
| 1220 if (mode == RBBI_RUN) { |
| 1221 // look up the current character's character category, which tells u
s |
| 1222 // which column in the state table to look at. |
| 1223 // Note: the 16 in UTRIE_GET16 refers to the size of the data being
returned, |
| 1224 // not the size of the character going in, which is a UChar32
. |
| 1225 // |
| 1226 UTRIE_GET16(&fData->fTrie, c, category); |
| 1227 |
| 1228 // Check the dictionary bit in the character's category. |
| 1229 // Counter is only used by dictionary based iterators (subclasses
). |
| 1230 // Chars that need to be handled by a dictionary have a flag bit
set |
| 1231 // in their category values. |
| 1232 // |
| 1233 if ((category & 0x4000) != 0) { |
| 1234 fDictionaryCharCount++; |
| 1235 // And off the dictionary flag bit. |
| 1236 category &= ~0x4000; |
| 1237 } |
| 1238 } |
| 1239 |
| 1240 #ifdef RBBI_DEBUG |
| 1241 if (fTrace) { |
| 1242 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeI
ndex(fText)); |
| 1243 if (0x20<=c && c<0x7f) { |
| 1244 RBBIDebugPrintf("\"%c\" ", c); |
| 1245 } else { |
| 1246 RBBIDebugPrintf("%5x ", c); |
| 1247 } |
| 1248 RBBIDebugPrintf("%3d %3d\n", state, category); |
| 1249 } |
| 1250 #endif |
| 1251 |
| 1252 // State Transition - move machine to its next state |
| 1253 // |
| 1254 state = row->fNextState[category]; |
| 1255 row = (RBBIStateTableRow *) |
| 1256 (statetable->fTableData + (statetable->fRowLen * state)); |
| 1257 |
| 1258 if (row->fAccepting == -1) { |
| 1259 // Match found, common case. |
| 1260 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1261 } |
| 1262 |
| 1263 if (row->fLookAhead != 0) { |
| 1264 if (lookaheadStatus != 0 |
| 1265 && row->fAccepting == lookaheadStatus) { |
| 1266 // Lookahead match is completed. |
| 1267 result = lookaheadResult; |
| 1268 lookaheadStatus = 0; |
| 1269 // TODO: make a standalone hard break in a rule work. |
| 1270 if (lookAheadHardBreak) { |
| 1271 UTEXT_SETNATIVEINDEX(fText, result); |
| 1272 return result; |
| 1273 } |
| 1274 // Look-ahead completed, but other rules may match further. Con
tinue on |
| 1275 // TODO: junk this feature? I don't think it's used anywhwere
. |
| 1276 goto continueOn; |
| 1277 } |
| 1278 |
| 1279 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1280 lookaheadResult = r; |
| 1281 lookaheadStatus = row->fLookAhead; |
| 1282 goto continueOn; |
| 1283 } |
| 1284 |
| 1285 |
| 1286 if (row->fAccepting != 0) { |
| 1287 // Because this is an accepting state, any in-progress look-ahead ma
tch |
| 1288 // is no longer relavant. Clear out the pending lookahead status. |
| 1289 lookaheadStatus = 0; |
| 1290 } |
| 1291 |
| 1292 continueOn: |
| 1293 if (state == STOP_STATE) { |
| 1294 // This is the normal exit from the lookup state machine. |
| 1295 // We have advanced through the string until it is certain that no |
| 1296 // longer match is possible, no matter what characters follow. |
| 1297 break; |
| 1298 } |
| 1299 |
| 1300 // Move (backwards) to the next character to process. |
| 1301 // If this is a beginning-of-input loop iteration, don't advance |
| 1302 // the input position. The next iteration will be processing the |
| 1303 // first real input character. |
| 1304 if (mode == RBBI_RUN) { |
| 1305 c = UTEXT_PREVIOUS32(fText); |
| 1306 } else { |
| 1307 if (mode == RBBI_START) { |
| 1308 mode = RBBI_RUN; |
| 1309 } |
| 1310 } |
| 1311 } |
| 1312 |
| 1313 // The state machine is done. Check whether it found a match... |
| 1314 |
| 1315 // If the iterator failed to advance in the match engine, force it ahead by
one. |
| 1316 // (This really indicates a defect in the break rules. They should always
match |
| 1317 // at least one character.) |
| 1318 if (result == initialPosition) { |
| 1319 UTEXT_SETNATIVEINDEX(fText, initialPosition); |
| 1320 UTEXT_PREVIOUS32(fText); |
| 1321 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1322 } |
| 1323 |
| 1324 // Leave the iterator at our result position. |
| 1325 UTEXT_SETNATIVEINDEX(fText, result); |
| 1326 #ifdef RBBI_DEBUG |
| 1327 if (fTrace) { |
| 1328 RBBIDebugPrintf("result = %d\n\n", result); |
| 1329 } |
| 1330 #endif |
| 1331 return result; |
| 1332 } |
| 1333 |
| 1334 |
| 1335 void |
| 1336 RuleBasedBreakIterator::reset() |
| 1337 { |
| 1338 if (fCachedBreakPositions) { |
| 1339 uprv_free(fCachedBreakPositions); |
| 1340 } |
| 1341 fCachedBreakPositions = NULL; |
| 1342 fNumCachedBreakPositions = 0; |
| 1343 fDictionaryCharCount = 0; |
| 1344 fPositionInCache = 0; |
| 1345 } |
| 1346 |
| 1347 |
| 1348 |
| 1349 //------------------------------------------------------------------------------
- |
| 1350 // |
| 1351 // getRuleStatus() Return the break rule tag associated with the current |
| 1352 // iterator position. If the iterator arrived at its curren
t |
| 1353 // position by iterating forwards, the value will have been |
| 1354 // cached by the handleNext() function. |
| 1355 // |
| 1356 // If no cached status value is available, the status is |
| 1357 // found by doing a previous() followed by a next(), which |
| 1358 // leaves the iterator where it started, and computes the |
| 1359 // status while doing the next(). |
| 1360 // |
| 1361 //------------------------------------------------------------------------------
- |
| 1362 void RuleBasedBreakIterator::makeRuleStatusValid() { |
| 1363 if (fLastStatusIndexValid == FALSE) { |
| 1364 // No cached status is available. |
| 1365 if (fText == NULL || current() == 0) { |
| 1366 // At start of text, or there is no text. Status is always zero. |
| 1367 fLastRuleStatusIndex = 0; |
| 1368 fLastStatusIndexValid = TRUE; |
| 1369 } else { |
| 1370 // Not at start of text. Find status the tedious way. |
| 1371 int32_t pa = current(); |
| 1372 previous(); |
| 1373 if (fNumCachedBreakPositions > 0) { |
| 1374 reset(); // Blow off the dictionary cache |
| 1375 } |
| 1376 int32_t pb = next(); |
| 1377 if (pa != pb) { |
| 1378 // note: the if (pa != pb) test is here only to eliminate warnin
gs for |
| 1379 // unused local variables on gcc. Logically, it isn't nee
ded. |
| 1380 U_ASSERT(pa == pb); |
| 1381 } |
| 1382 } |
| 1383 } |
| 1384 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatu
sMaxIdx); |
| 1385 } |
| 1386 |
| 1387 |
| 1388 int32_t RuleBasedBreakIterator::getRuleStatus() const { |
| 1389 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; |
| 1390 nonConstThis->makeRuleStatusValid(); |
| 1391 |
| 1392 // fLastRuleStatusIndex indexes to the start of the appropriate status recor
d |
| 1393 // (the number of status val
ues.) |
| 1394 // This function returns the last (largest) of the array of status values. |
| 1395 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatu
sIndex]; |
| 1396 int32_t tagVal = fData->fRuleStatusTable[idx]; |
| 1397 |
| 1398 return tagVal; |
| 1399 } |
| 1400 |
| 1401 |
| 1402 |
| 1403 |
| 1404 int32_t RuleBasedBreakIterator::getRuleStatusVec( |
| 1405 int32_t *fillInVec, int32_t capacity, UErrorCode &status) |
| 1406 { |
| 1407 if (U_FAILURE(status)) { |
| 1408 return 0; |
| 1409 } |
| 1410 |
| 1411 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; |
| 1412 nonConstThis->makeRuleStatusValid(); |
| 1413 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; |
| 1414 int32_t numValsToCopy = numVals; |
| 1415 if (numVals > capacity) { |
| 1416 status = U_BUFFER_OVERFLOW_ERROR; |
| 1417 numValsToCopy = capacity; |
| 1418 } |
| 1419 int i; |
| 1420 for (i=0; i<numValsToCopy; i++) { |
| 1421 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; |
| 1422 } |
| 1423 return numVals; |
| 1424 } |
| 1425 |
| 1426 |
| 1427 |
| 1428 //------------------------------------------------------------------------------
- |
| 1429 // |
| 1430 // getBinaryRules Access to the compiled form of the rules, |
| 1431 // for use by build system tools that save the data |
| 1432 // for standard iterator types. |
| 1433 // |
| 1434 //------------------------------------------------------------------------------
- |
| 1435 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { |
| 1436 const uint8_t *retPtr = NULL; |
| 1437 length = 0; |
| 1438 |
| 1439 if (fData != NULL) { |
| 1440 retPtr = (const uint8_t *)fData->fHeader; |
| 1441 length = fData->fHeader->fLength; |
| 1442 } |
| 1443 return retPtr; |
| 1444 } |
| 1445 |
| 1446 |
| 1447 |
| 1448 |
| 1449 //------------------------------------------------------------------------------
- |
| 1450 // |
| 1451 // BufferClone TODO: In my (Andy) opinion, this function should be depre
cated. |
| 1452 // Saving one heap allocation isn't worth the trouble. |
| 1453 // Cloning shouldn't be done in tight loops, and |
| 1454 // making the clone copy involves other heap operations anywa
y. |
| 1455 // And the application code for correctly dealing with buffer |
| 1456 // size problems and the eventual object destruction is ugly. |
| 1457 // |
| 1458 //------------------------------------------------------------------------------
- |
| 1459 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, |
| 1460 int32_t &bufferSize, |
| 1461 UErrorCode &status) |
| 1462 { |
| 1463 if (U_FAILURE(status)){ |
| 1464 return NULL; |
| 1465 } |
| 1466 |
| 1467 // |
| 1468 // If user buffer size is zero this is a preflight operation to |
| 1469 // obtain the needed buffer size, allowing for worst case misalignment. |
| 1470 // |
| 1471 if (bufferSize == 0) { |
| 1472 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); |
| 1473 return NULL; |
| 1474 } |
| 1475 |
| 1476 |
| 1477 // |
| 1478 // Check the alignment and size of the user supplied buffer. |
| 1479 // Allocate heap memory if the user supplied memory is insufficient. |
| 1480 // |
| 1481 char *buf = (char *)stackBuffer; |
| 1482 uint32_t s = bufferSize; |
| 1483 |
| 1484 if (stackBuffer == NULL) { |
| 1485 s = 0; // Ignore size, force allocation if user didn't give us a buffe
r. |
| 1486 } |
| 1487 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { |
| 1488 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); |
| 1489 s -= offsetUp; |
| 1490 buf += offsetUp; |
| 1491 } |
| 1492 if (s < sizeof(RuleBasedBreakIterator)) { |
| 1493 // Not enough room in the caller-supplied buffer. |
| 1494 // Do a plain-vanilla heap based clone and return that, along with |
| 1495 // a warning that the clone was allocated. |
| 1496 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); |
| 1497 if (clonedBI == 0) { |
| 1498 status = U_MEMORY_ALLOCATION_ERROR; |
| 1499 } else { |
| 1500 status = U_SAFECLONE_ALLOCATED_WARNING; |
| 1501 } |
| 1502 return clonedBI; |
| 1503 } |
| 1504 |
| 1505 // |
| 1506 // Clone the source BI into the caller-supplied buffer. |
| 1507 // TODO: using an overloaded operator new to directly initialize the |
| 1508 // copy in the user's buffer would be better, but it doesn't seem |
| 1509 // to get along with namespaces. Investigate why. |
| 1510 // |
| 1511 // The memcpy is only safe with an empty (default constructed) |
| 1512 // break iterator. Use on others can screw up reference counts |
| 1513 // to data. memcpy-ing objects is not really a good idea... |
| 1514 // |
| 1515 RuleBasedBreakIterator localIter; // Empty break iterator, source for
memcpy |
| 1516 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; |
| 1517 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++
gorp, BreakIterator base class part |
| 1518 clone->init(); // Init RuleBasedBreakIterator part, (user def
ault constructor) |
| 1519 *clone = *this; // clone = the real BI we want. |
| 1520 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (
From C code) |
| 1521 |
| 1522 return clone; |
| 1523 } |
| 1524 |
| 1525 |
| 1526 //------------------------------------------------------------------------------
- |
| 1527 // |
| 1528 // isDictionaryChar Return true if the category lookup for this char |
| 1529 // indicates that it is in the set of dictionary lookup |
| 1530 // chars. |
| 1531 // |
| 1532 // This function is intended for use by dictionary based |
| 1533 // break iterators. |
| 1534 // |
| 1535 //------------------------------------------------------------------------------
- |
| 1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { |
| 1537 if (fData == NULL) { |
| 1538 return FALSE; |
| 1539 } |
| 1540 uint16_t category; |
| 1541 UTRIE_GET16(&fData->fTrie, c, category); |
| 1542 return (category & 0x4000) != 0; |
| 1543 }*/ |
| 1544 |
| 1545 |
| 1546 //------------------------------------------------------------------------------
- |
| 1547 // |
| 1548 // checkDictionary This function handles all processing of characters in |
| 1549 // the "dictionary" set. It will determine the appropriat
e |
| 1550 // course of action, and possibly set up a cache in the |
| 1551 // process. |
| 1552 // |
| 1553 //------------------------------------------------------------------------------
- |
| 1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, |
| 1555 int32_t endPos, |
| 1556 UBool reverse) { |
| 1557 // Reset the old break cache first. |
| 1558 uint32_t dictionaryCount = fDictionaryCharCount; |
| 1559 reset(); |
| 1560 |
| 1561 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { |
| 1562 return (reverse ? startPos : endPos); |
| 1563 } |
| 1564 |
| 1565 // Bug 5532. The dictionary code will crash if the input text is UTF-8 |
| 1566 // because native indexes are different from UTF-16 indexes. |
| 1567 // Temporary hack: skip dictionary lookup for UTF-8 encoded text. |
| 1568 // It wont give the right breaks, but it's better than a crash. |
| 1569 // |
| 1570 // Check the type of the UText by checking its pFuncs field, which |
| 1571 // is UText's function dispatch table. It will be the same for all |
| 1572 // UTF-8 UTexts and different for any other UText type. |
| 1573 // |
| 1574 // We have no other type of UText available with non-UTF-16 native inde
xing. |
| 1575 // This whole check will go away once the dictionary code is fixed. |
| 1576 static const void *utext_utf8Funcs; |
| 1577 if (utext_utf8Funcs == NULL) { |
| 1578 // Cache the UTF-8 UText function pointer value. |
| 1579 UErrorCode status = U_ZERO_ERROR; |
| 1580 UText tempUText = UTEXT_INITIALIZER; |
| 1581 utext_openUTF8(&tempUText, NULL, 0, &status); |
| 1582 utext_utf8Funcs = tempUText.pFuncs; |
| 1583 utext_close(&tempUText); |
| 1584 } |
| 1585 if (fText->pFuncs == utext_utf8Funcs) { |
| 1586 return (reverse ? startPos : endPos); |
| 1587 } |
| 1588 |
| 1589 // Starting from the starting point, scan towards the proposed result, |
| 1590 // looking for the first dictionary character (which may be the one |
| 1591 // we're on, if we're starting in the middle of a range). |
| 1592 utext_setNativeIndex(fText, reverse ? endPos : startPos); |
| 1593 if (reverse) { |
| 1594 UTEXT_PREVIOUS32(fText); |
| 1595 } |
| 1596 |
| 1597 int32_t rangeStart = startPos; |
| 1598 int32_t rangeEnd = endPos; |
| 1599 |
| 1600 uint16_t category; |
| 1601 int32_t current; |
| 1602 UErrorCode status = U_ZERO_ERROR; |
| 1603 UStack breaks(status); |
| 1604 int32_t foundBreakCount = 0; |
| 1605 UChar32 c = utext_current32(fText); |
| 1606 |
| 1607 UTRIE_GET16(&fData->fTrie, c, category); |
| 1608 |
| 1609 // Is the character we're starting on a dictionary character? If so, we |
| 1610 // need to back up to include the entire run; otherwise the results of |
| 1611 // the break algorithm will differ depending on where we start. Since |
| 1612 // the result is cached and there is typically a non-dictionary break |
| 1613 // within a small number of words, there should be little performance impact
. |
| 1614 if (category & 0x4000) { |
| 1615 if (reverse) { |
| 1616 do { |
| 1617 utext_next32(fText); // TODO: recast to work directly
with postincrement. |
| 1618 c = utext_current32(fText); |
| 1619 UTRIE_GET16(&fData->fTrie, c, category); |
| 1620 } while (c != U_SENTINEL && (category & 0x4000)); |
| 1621 // Back up to the last dictionary character |
| 1622 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
| 1623 if (c == U_SENTINEL) { |
| 1624 // c = fText->last32(); |
| 1625 // TODO: why was this if needed? |
| 1626 c = UTEXT_PREVIOUS32(fText); |
| 1627 } |
| 1628 else { |
| 1629 c = UTEXT_PREVIOUS32(fText); |
| 1630 } |
| 1631 } |
| 1632 else { |
| 1633 do { |
| 1634 c = UTEXT_PREVIOUS32(fText); |
| 1635 UTRIE_GET16(&fData->fTrie, c, category); |
| 1636 } |
| 1637 while (c != U_SENTINEL && (category & 0x4000)); |
| 1638 // Back up to the last dictionary character |
| 1639 if (c == U_SENTINEL) { |
| 1640 // c = fText->first32(); |
| 1641 c = utext_current32(fText); |
| 1642 } |
| 1643 else { |
| 1644 utext_next32(fText); |
| 1645 c = utext_current32(fText); |
| 1646 } |
| 1647 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; |
| 1648 } |
| 1649 UTRIE_GET16(&fData->fTrie, c, category); |
| 1650 } |
| 1651 |
| 1652 // Loop through the text, looking for ranges of dictionary characters. |
| 1653 // For each span, find the appropriate break engine, and ask it to find |
| 1654 // any breaks within the span. |
| 1655 // Note: we always do this in the forward direction, so that the break |
| 1656 // cache is built in the right order. |
| 1657 if (reverse) { |
| 1658 utext_setNativeIndex(fText, rangeStart); |
| 1659 c = utext_current32(fText); |
| 1660 UTRIE_GET16(&fData->fTrie, c, category); |
| 1661 } |
| 1662 while(U_SUCCESS(status)) { |
| 1663 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (ca
tegory & 0x4000) == 0) { |
| 1664 utext_next32(fText); // TODO: tweak for post-increment op
eration |
| 1665 c = utext_current32(fText); |
| 1666 UTRIE_GET16(&fData->fTrie, c, category); |
| 1667 } |
| 1668 if (current >= rangeEnd) { |
| 1669 break; |
| 1670 } |
| 1671 |
| 1672 // We now have a dictionary character. Get the appropriate language obje
ct |
| 1673 // to deal with it. |
| 1674 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); |
| 1675 |
| 1676 // Ask the language object if there are any breaks. It will leave the te
xt |
| 1677 // pointer on the other side of its range, ready to search for the next
one. |
| 1678 if (lbe != NULL) { |
| 1679 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALS
E, fBreakType, breaks); |
| 1680 } |
| 1681 |
| 1682 // Reload the loop variables for the next go-round |
| 1683 c = utext_current32(fText); |
| 1684 UTRIE_GET16(&fData->fTrie, c, category); |
| 1685 } |
| 1686 |
| 1687 // If we found breaks, build a new break cache. The first and last entries m
ust |
| 1688 // be the original starting and ending position. |
| 1689 if (foundBreakCount > 0) { |
| 1690 int32_t totalBreaks = foundBreakCount; |
| 1691 if (startPos < breaks.elementAti(0)) { |
| 1692 totalBreaks += 1; |
| 1693 } |
| 1694 if (endPos > breaks.peeki()) { |
| 1695 totalBreaks += 1; |
| 1696 } |
| 1697 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int3
2_t)); |
| 1698 if (fCachedBreakPositions != NULL) { |
| 1699 int32_t out = 0; |
| 1700 fNumCachedBreakPositions = totalBreaks; |
| 1701 if (startPos < breaks.elementAti(0)) { |
| 1702 fCachedBreakPositions[out++] = startPos; |
| 1703 } |
| 1704 for (int32_t i = 0; i < foundBreakCount; ++i) { |
| 1705 fCachedBreakPositions[out++] = breaks.elementAti(i); |
| 1706 } |
| 1707 if (endPos > fCachedBreakPositions[out-1]) { |
| 1708 fCachedBreakPositions[out] = endPos; |
| 1709 } |
| 1710 // If there are breaks, then by definition, we are replacing the ori
ginal |
| 1711 // proposed break by one of the breaks we found. Use following() and |
| 1712 // preceding() to do the work. They should never recurse in this cas
e. |
| 1713 if (reverse) { |
| 1714 return preceding(endPos - 1); |
| 1715 } |
| 1716 else { |
| 1717 return following(startPos); |
| 1718 } |
| 1719 } |
| 1720 // If the allocation failed, just fall through to the "no breaks found"
case. |
| 1721 } |
| 1722 |
| 1723 // If we get here, there were no language-based breaks. Set the text pointer |
| 1724 // to the original proposed break. |
| 1725 utext_setNativeIndex(fText, reverse ? startPos : endPos); |
| 1726 return (reverse ? startPos : endPos); |
| 1727 } |
| 1728 |
| 1729 U_NAMESPACE_END |
| 1730 |
| 1731 // defined in ucln_cmn.h |
| 1732 |
| 1733 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL; |
| 1734 |
| 1735 /** |
| 1736 * Release all static memory held by breakiterator. |
| 1737 */ |
| 1738 U_CDECL_BEGIN |
| 1739 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { |
| 1740 if (gLanguageBreakFactories) { |
| 1741 delete gLanguageBreakFactories; |
| 1742 gLanguageBreakFactories = NULL; |
| 1743 } |
| 1744 return TRUE; |
| 1745 } |
| 1746 U_CDECL_END |
| 1747 |
| 1748 U_CDECL_BEGIN |
| 1749 static void U_CALLCONV _deleteFactory(void *obj) { |
| 1750 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj; |
| 1751 } |
| 1752 U_CDECL_END |
| 1753 U_NAMESPACE_BEGIN |
| 1754 |
| 1755 static const LanguageBreakEngine* |
| 1756 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) |
| 1757 { |
| 1758 UBool needsInit; |
| 1759 UErrorCode status = U_ZERO_ERROR; |
| 1760 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); |
| 1761 |
| 1762 if (needsInit) { |
| 1763 UStack *factories = new UStack(_deleteFactory, NULL, status); |
| 1764 if (factories != NULL && U_SUCCESS(status)) { |
| 1765 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(statu
s); |
| 1766 factories->push(builtIn, status); |
| 1767 #ifdef U_LOCAL_SERVICE_HOOK |
| 1768 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook(
"languageBreakFactory", &status); |
| 1769 if (extra != NULL) { |
| 1770 factories->push(extra, status); |
| 1771 } |
| 1772 #endif |
| 1773 } |
| 1774 umtx_lock(NULL); |
| 1775 if (gLanguageBreakFactories == NULL) { |
| 1776 gLanguageBreakFactories = factories; |
| 1777 factories = NULL; |
| 1778 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakite
rator_cleanup_dict); |
| 1779 } |
| 1780 umtx_unlock(NULL); |
| 1781 delete factories; |
| 1782 } |
| 1783 |
| 1784 if (gLanguageBreakFactories == NULL) { |
| 1785 return NULL; |
| 1786 } |
| 1787 |
| 1788 int32_t i = gLanguageBreakFactories->size(); |
| 1789 const LanguageBreakEngine *lbe = NULL; |
| 1790 while (--i >= 0) { |
| 1791 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakF
actories->elementAt(i)); |
| 1792 lbe = factory->getEngineFor(c, breakType); |
| 1793 if (lbe != NULL) { |
| 1794 break; |
| 1795 } |
| 1796 } |
| 1797 return lbe; |
| 1798 } |
| 1799 |
| 1800 |
| 1801 //------------------------------------------------------------------------------
- |
| 1802 // |
| 1803 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for |
| 1804 // the character c. |
| 1805 // |
| 1806 //------------------------------------------------------------------------------
- |
| 1807 const LanguageBreakEngine * |
| 1808 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { |
| 1809 const LanguageBreakEngine *lbe = NULL; |
| 1810 UErrorCode status = U_ZERO_ERROR; |
| 1811 |
| 1812 if (fLanguageBreakEngines == NULL) { |
| 1813 fLanguageBreakEngines = new UStack(status); |
| 1814 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { |
| 1815 delete fLanguageBreakEngines; |
| 1816 fLanguageBreakEngines = 0; |
| 1817 return NULL; |
| 1818 } |
| 1819 } |
| 1820 |
| 1821 int32_t i = fLanguageBreakEngines->size(); |
| 1822 while (--i >= 0) { |
| 1823 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i))
; |
| 1824 if (lbe->handles(c, fBreakType)) { |
| 1825 return lbe; |
| 1826 } |
| 1827 } |
| 1828 |
| 1829 // No existing dictionary took the character. See if a factory wants to |
| 1830 // give us a new LanguageBreakEngine for this character. |
| 1831 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); |
| 1832 |
| 1833 // If we got one, use it and push it on our stack. |
| 1834 if (lbe != NULL) { |
| 1835 fLanguageBreakEngines->push((void *)lbe, status); |
| 1836 // Even if we can't remember it, we can keep looking it up, so |
| 1837 // return it even if the push fails. |
| 1838 return lbe; |
| 1839 } |
| 1840 |
| 1841 // No engine is forthcoming for this character. Add it to the |
| 1842 // reject set. Create the reject break engine if needed. |
| 1843 if (fUnhandledBreakEngine == NULL) { |
| 1844 fUnhandledBreakEngine = new UnhandledEngine(status); |
| 1845 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { |
| 1846 status = U_MEMORY_ALLOCATION_ERROR; |
| 1847 } |
| 1848 // Put it last so that scripts for which we have an engine get tried |
| 1849 // first. |
| 1850 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status)
; |
| 1851 // If we can't insert it, or creation failed, get rid of it |
| 1852 if (U_FAILURE(status)) { |
| 1853 delete fUnhandledBreakEngine; |
| 1854 fUnhandledBreakEngine = 0; |
| 1855 return NULL; |
| 1856 } |
| 1857 } |
| 1858 |
| 1859 // Tell the reject engine about the character; at its discretion, it may |
| 1860 // add more than just the one character. |
| 1861 fUnhandledBreakEngine->handleCharacter(c, fBreakType); |
| 1862 |
| 1863 return fUnhandledBreakEngine; |
| 1864 } |
| 1865 |
| 1866 |
| 1867 |
| 1868 /*int32_t RuleBasedBreakIterator::getBreakType() const { |
| 1869 return fBreakType; |
| 1870 }*/ |
| 1871 |
| 1872 void RuleBasedBreakIterator::setBreakType(int32_t type) { |
| 1873 fBreakType = type; |
| 1874 reset(); |
| 1875 } |
| 1876 |
| 1877 U_NAMESPACE_END |
| 1878 |
| 1879 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |