icu46/source/common/rbbi.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/rbbi.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ***************************************************************************

	3 * Copyright (C) 1999-2010 International Business Machines Corporation

	4 * and others. All rights reserved.

	5 ***************************************************************************

	6 */

	7 //

	8 // file: rbbi.c Contains the implementation of the rule based break iterator

	9 // runtime engine and the API implementation for

	10 // class RuleBasedBreakIterator

	11 //

	12

	13 #include <typeinfo> // for 'typeid' to work

	14

	15 #include "unicode/utypes.h"

	16

	17 #if !UCONFIG_NO_BREAK_ITERATION

	18

	19 #include "unicode/rbbi.h"

	20 #include "unicode/schriter.h"

	21 #include "unicode/uchriter.h"

	22 #include "unicode/udata.h"

	23 #include "unicode/uclean.h"

	24 #include "rbbidata.h"

	25 #include "rbbirb.h"

	26 #include "cmemory.h"

	27 #include "cstring.h"

	28 #include "umutex.h"

	29 #include "ucln_cmn.h"

	30 #include "brkeng.h"

	31

	32 #include "uassert.h"

	33 #include "uvector.h"

	34

	35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.

	36 #if U_LOCAL_SERVICE_HOOK

	37 #include "localsvc.h"

	38 #endif

	39

	40 #ifdef RBBI_DEBUG

	41 static UBool fTrace = FALSE;

	42 #endif

	43

	44 U_NAMESPACE_BEGIN

	45

	46 // The state number of the starting state

	47 #define START_STATE 1

	48

	49 // The state-transition value indicating "stop"

	50 #define STOP_STATE 0

	51

	52

	53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)

	54

	55

	56 //=======================================================================

	57 // constructors

	58 //=======================================================================

	59

	60 /**

	61 * Constructs a RuleBasedBreakIterator that uses the already-created

	62 * tables object that is passed in as a parameter.

	63 */

	64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)

	65 {

	66 init();

	67 fData = new RBBIDataWrapper(data, status); // status checked in constructor

	68 if (U_FAILURE(status)) {return;}

	69 if(fData == 0) {

	70 status = U_MEMORY_ALLOCATION_ERROR;

	71 return;

	72 }

	73 }

	74

	75 /**

	76 * Same as above but does not adopt memory

	77 */

	78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)

	79 {

	80 init();

	81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // s tatus checked in constructor

	82 if (U_FAILURE(status)) {return;}

	83 if(fData == 0) {

	84 status = U_MEMORY_ALLOCATION_ERROR;

	85 return;

	86 }

	87 }

	88

	89 //------------------------------------------------------------------------------ -

	90 //

	91 // Constructor from a UDataMemory handle to precompiled break rules

	92 // stored in an ICU data file.

	93 //

	94 //------------------------------------------------------------------------------ -

	95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta tus)

	96 {

	97 init();

	98 fData = new RBBIDataWrapper(udm, status); // status checked in constructor

	99 if (U_FAILURE(status)) {return;}

	100 if(fData == 0) {

	101 status = U_MEMORY_ALLOCATION_ERROR;

	102 return;

	103 }

	104 }

	105

	106

	107

	108 //------------------------------------------------------------------------------ -

	109 //

	110 // Constructor from a set of rules supplied as a string.

	111 //

	112 //------------------------------------------------------------------------------ -

	113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,

	114 UParseError &parseError ,

	115 UErrorCode &status)

	116 {

	117 init();

	118 if (U_FAILURE(status)) {return;}

	119 RuleBasedBreakIterator bi = (RuleBasedBreakIterator )

	120 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status );

	121 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that

	122 // creates and returns a complete RBBI. From here, in a constructor, we

	123 // can't just return the object created by the builder factory, hence

	124 // the assignment of the factory created object to "this".

	125 if (U_SUCCESS(status)) {

	126 this = bi;

	127 delete bi;

	128 }

	129 }

	130

	131

	132 //------------------------------------------------------------------------------ -

	133 //

	134 // Default Constructor. Create an empty shell that can be set up later.

	135 // Used when creating a RuleBasedBreakIterator from a set

	136 // of rules.

	137 //------------------------------------------------------------------------------ -

	138 RuleBasedBreakIterator::RuleBasedBreakIterator() {

	139 init();

	140 }

	141

	142

	143 //------------------------------------------------------------------------------ -

	144 //

	145 // Copy constructor. Will produce a break iterator with the same behavior,

	146 // and which iterates over the same text, as the one passed in.

	147 //

	148 //------------------------------------------------------------------------------ -

	149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth er)

	150 : BreakIterator(other)

	151 {

	152 this->init();

	153 *this = other;

	154 }

	155

	156

	157 /**

	158 * Destructor

	159 */

	160 RuleBasedBreakIterator::~RuleBasedBreakIterator() {

	161 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

	162 // fCharIter was adopted from the outside.

	163 delete fCharIter;

	164 }

	165 fCharIter = NULL;

	166 delete fSCharIter;

	167 fCharIter = NULL;

	168 delete fDCharIter;

	169 fDCharIter = NULL;

	170

	171 utext_close(fText);

	172

	173 if (fData != NULL) {

	174 fData->removeReference();

	175 fData = NULL;

	176 }

	177 if (fCachedBreakPositions) {

	178 uprv_free(fCachedBreakPositions);

	179 fCachedBreakPositions = NULL;

	180 }

	181 if (fLanguageBreakEngines) {

	182 delete fLanguageBreakEngines;

	183 fLanguageBreakEngines = NULL;

	184 }

	185 if (fUnhandledBreakEngine) {

	186 delete fUnhandledBreakEngine;

	187 fUnhandledBreakEngine = NULL;

	188 }

	189 }

	190

	191 /**

	192 * Assignment operator. Sets this iterator to have the same behavior,

	193 * and iterate over the same text, as the one passed in.

	194 */

	195 RuleBasedBreakIterator&

	196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {

	197 if (this == &that) {

	198 return *this;

	199 }

	200 reset(); // Delete break cache information

	201 fBreakType = that.fBreakType;

	202 if (fLanguageBreakEngines != NULL) {

	203 delete fLanguageBreakEngines;

	204 fLanguageBreakEngines = NULL; // Just rebuild for now

	205 }

	206 // TODO: clone fLanguageBreakEngines from "that"

	207 UErrorCode status = U_ZERO_ERROR;

	208 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);

	209

	210 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

	211 delete fCharIter;

	212 }

	213 fCharIter = NULL;

	214

	215 if (that.fCharIter != NULL ) {

	216 // This is a little bit tricky - it will intially appear that

	217 // this->fCharIter is adopted, even if that->fCharIter was

	218 // not adopted. That's ok.

	219 fCharIter = that.fCharIter->clone();

	220 }

	221

	222 if (fData != NULL) {

	223 fData->removeReference();

	224 fData = NULL;

	225 }

	226 if (that.fData != NULL) {

	227 fData = that.fData->addReference();

	228 }

	229

	230 return *this;

	231 }

	232

	233

	234

	235 //-----------------------------------------------------------------------------

	236 //

	237 // init() Shared initialization routine. Used by all the constructors.

	238 // Initializes all fields, leaving the object in a consistent sta te.

	239 //

	240 //-----------------------------------------------------------------------------

	241 void RuleBasedBreakIterator::init() {

	242 UErrorCode status = U_ZERO_ERROR;

	243 fBufferClone = FALSE;

	244 fText = utext_openUChars(NULL, NULL, 0, &status);

	245 fCharIter = NULL;

	246 fSCharIter = NULL;

	247 fDCharIter = NULL;

	248 fData = NULL;

	249 fLastRuleStatusIndex = 0;

	250 fLastStatusIndexValid = TRUE;

	251 fDictionaryCharCount = 0;

	252 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives re asonable

	253 // dictionary behavior for Break Itera tors that are

	254 // built from rules. Even better woul d be the ability to

	255 // declare the type in the rules.

	256

	257 fCachedBreakPositions = NULL;

	258 fLanguageBreakEngines = NULL;

	259 fUnhandledBreakEngine = NULL;

	260 fNumCachedBreakPositions = 0;

	261 fPositionInCache = 0;

	262

	263 #ifdef RBBI_DEBUG

	264 static UBool debugInitDone = FALSE;

	265 if (debugInitDone == FALSE) {

	266 char *debugEnv = getenv("U_RBBIDEBUG");

	267 if (debugEnv && uprv_strstr(debugEnv, "trace")) {

	268 fTrace = TRUE;

	269 }

	270 debugInitDone = TRUE;

	271 }

	272 #endif

	273 }

	274

	275

	276

	277 //-----------------------------------------------------------------------------

	278 //

	279 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same

	280 // behavior, and iterating over the same text, as this one.

	281 // Virtual function: does the right thing with subclasses.

	282 //

	283 //-----------------------------------------------------------------------------

	284 BreakIterator*

	285 RuleBasedBreakIterator::clone(void) const {

	286 return new RuleBasedBreakIterator(*this);

	287 }

	288

	289 /**

	290 * Equality operator. Returns TRUE if both BreakIterators are of the

	291 * same class, have the same behavior, and iterate over the same text.

	292 */

	293 UBool

	294 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {

	295 if (typeid(*this) != typeid(that)) {

	296 return FALSE;

	297 }

	298

	299 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;

	300

	301 if (!utext_equals(fText, that2.fText)) {

	302 // The two break iterators are operating on different text,

	303 // or have a different interation position.

	304 return FALSE;

	305 };

	306

	307 // TODO: need a check for when in a dictionary region at different offsets.

	308

	309 if (that2.fData == fData \|\|

	310 (fData != NULL && that2.fData != NULL && that2.fData == fData)) {

	311 // The two break iterators are using the same rules.

	312 return TRUE;

	313 }

	314 return FALSE;

	315 }

	316

	317 /**

	318 * Compute a hash code for this BreakIterator

	319 * @return A hash code

	320 */

	321 int32_t

	322 RuleBasedBreakIterator::hashCode(void) const {

	323 int32_t hash = 0;

	324 if (fData != NULL) {

	325 hash = fData->hashCode();

	326 }

	327 return hash;

	328 }

	329

	330

	331 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {

	332 if (U_FAILURE(status)) {

	333 return;

	334 }

	335 reset();

	336 fText = utext_clone(fText, ut, FALSE, TRUE, &status);

	337

	338 // Set up a dummy CharacterIterator to be returned if anyone

	339 // calls getText(). With input from UText, there is no reasonable

	340 // way to return a characterIterator over the actual input text.

	341 // Return one over an empty string instead - this is the closest

	342 // we can come to signaling a failure.

	343 // (GetText() is obsolete, this failure is sort of OK)

	344 if (fDCharIter == NULL) {

	345 static const UChar c = 0;

	346 fDCharIter = new UCharCharacterIterator(&c, 0);

	347 if (fDCharIter == NULL) {

	348 status = U_MEMORY_ALLOCATION_ERROR;

	349 return;

	350 }

	351 }

	352

	353 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

	354 // existing fCharIter was adopted from the outside. Delete it now.

	355 delete fCharIter;

	356 }

	357 fCharIter = fDCharIter;

	358

	359 this->first();

	360 }

	361

	362

	363 UText RuleBasedBreakIterator::getUText(UText fillIn, UErrorCode &status) const {

	364 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);

	365 return result;

	366 }

	367

	368

	369

	370 /**

	371 * Returns the description used to create this iterator

	372 */

	373 const UnicodeString&

	374 RuleBasedBreakIterator::getRules() const {

	375 if (fData != NULL) {

	376 return fData->getRuleSourceString();

	377 } else {

	378 static const UnicodeString *s;

	379 if (s == NULL) {

	380 // TODO: something more elegant here.

	381 // perhaps API should return the string by value.

	382 // Note: thread unsafe init & leak are semi-ok, better than

	383 // what was before. Sould be cleaned up, though.

	384 s = new UnicodeString;

	385 }

	386 return *s;

	387 }

	388 }

	389

	390 //=======================================================================

	391 // BreakIterator overrides

	392 //=======================================================================

	393

	394 /**

	395 * Return a CharacterIterator over the text being analyzed.

	396 */

	397 CharacterIterator&

	398 RuleBasedBreakIterator::getText() const {

	399 return *fCharIter;

	400 }

	401

	402 /**

	403 * Set the iterator to analyze a new piece of text. This function resets

	404 * the current iteration position to the beginning of the text.

	405 * @param newText An iterator over the text to analyze.

	406 */

	407 void

	408 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {

	409 // If we are holding a CharacterIterator adopted from a

	410 // previous call to this function, delete it now.

	411 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

	412 delete fCharIter;

	413 }

	414

	415 fCharIter = newText;

	416 UErrorCode status = U_ZERO_ERROR;

	417 reset();

	418 if (newText==NULL \|\| newText->startIndex() != 0) {

	419 // startIndex !=0 wants to be an error, but there's no way to report it.

	420 // Make the iterator text be an empty string.

	421 fText = utext_openUChars(fText, NULL, 0, &status);

	422 } else {

	423 fText = utext_openCharacterIterator(fText, newText, &status);

	424 }

	425 this->first();

	426 }

	427

	428 /**

	429 * Set the iterator to analyze a new piece of text. This function resets

	430 * the current iteration position to the beginning of the text.

	431 * @param newText An iterator over the text to analyze.

	432 */

	433 void

	434 RuleBasedBreakIterator::setText(const UnicodeString& newText) {

	435 UErrorCode status = U_ZERO_ERROR;

	436 reset();

	437 fText = utext_openConstUnicodeString(fText, &newText, &status);

	438

	439 // Set up a character iterator on the string.

	440 // Needed in case someone calls getText().

	441 // Can not, unfortunately, do this lazily on the (probably never)

	442 // call to getText(), because getText is const.

	443 if (fSCharIter == NULL) {

	444 fSCharIter = new StringCharacterIterator(newText);

	445 } else {

	446 fSCharIter->setText(newText);

	447 }

	448

	449 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

	450 // old fCharIter was adopted from the outside. Delete it.

	451 delete fCharIter;

	452 }

	453 fCharIter = fSCharIter;

	454

	455 this->first();

	456 }

	457

	458

	459

	460 /**

	461 * Sets the current iteration position to the beginning of the text.

	462 * @return The offset of the beginning of the text.

	463 */

	464 int32_t RuleBasedBreakIterator::first(void) {

	465 reset();

	466 fLastRuleStatusIndex = 0;

	467 fLastStatusIndexValid = TRUE;

	468 //if (fText == NULL)

	469 // return BreakIterator::DONE;

	470

	471 utext_setNativeIndex(fText, 0);

	472 return 0;

	473 }

	474

	475 /**

	476 * Sets the current iteration position to the end of the text.

	477 * @return The text's past-the-end offset.

	478 */

	479 int32_t RuleBasedBreakIterator::last(void) {

	480 reset();

	481 if (fText == NULL) {

	482 fLastRuleStatusIndex = 0;

	483 fLastStatusIndexValid = TRUE;

	484 return BreakIterator::DONE;

	485 }

	486

	487 fLastStatusIndexValid = FALSE;

	488 int32_t pos = (int32_t)utext_nativeLength(fText);

	489 utext_setNativeIndex(fText, pos);

	490 return pos;

	491 }

	492

	493 /**

	494 * Advances the iterator either forward or backward the specified number of step s.

	495 * Negative values move backward, and positive values move forward. This is

	496 * equivalent to repeatedly calling next() or previous().

	497 * @param n The number of steps to move. The sign indicates the direction

	498 * (negative is backwards, and positive is forwards).

	499 * @return The character offset of the boundary position n boundaries away from

	500 * the current one.

	501 */

	502 int32_t RuleBasedBreakIterator::next(int32_t n) {

	503 int32_t result = current();

	504 while (n > 0) {

	505 result = next();

	506 --n;

	507 }

	508 while (n < 0) {

	509 result = previous();

	510 ++n;

	511 }

	512 return result;

	513 }

	514

	515 /**

	516 * Advances the iterator to the next boundary position.

	517 * @return The position of the first boundary after this one.

	518 */

	519 int32_t RuleBasedBreakIterator::next(void) {

	520 // if we have cached break positions and we're still in the range

	521 // covered by them, just move one step forward in the cache

	522 if (fCachedBreakPositions != NULL) {

	523 if (fPositionInCache < fNumCachedBreakPositions - 1) {

	524 ++fPositionInCache;

	525 int32_t pos = fCachedBreakPositions[fPositionInCache];

	526 utext_setNativeIndex(fText, pos);

	527 return pos;

	528 }

	529 else {

	530 reset();

	531 }

	532 }

	533

	534 int32_t startPos = current();

	535 int32_t result = handleNext(fData->fForwardTable);

	536 if (fDictionaryCharCount > 0) {

	537 result = checkDictionary(startPos, result, FALSE);

	538 }

	539 return result;

	540 }

	541

	542 /**

	543 * Advances the iterator backwards, to the last boundary preceding this one.

	544 * @return The position of the last boundary position preceding this one.

	545 */

	546 int32_t RuleBasedBreakIterator::previous(void) {

	547 int32_t result;

	548 int32_t startPos;

	549

	550 // if we have cached break positions and we're still in the range

	551 // covered by them, just move one step backward in the cache

	552 if (fCachedBreakPositions != NULL) {

	553 if (fPositionInCache > 0) {

	554 --fPositionInCache;

	555 // If we're at the beginning of the cache, need to reevaluate the

	556 // rule status

	557 if (fPositionInCache <= 0) {

	558 fLastStatusIndexValid = FALSE;

	559 }

	560 int32_t pos = fCachedBreakPositions[fPositionInCache];

	561 utext_setNativeIndex(fText, pos);

	562 return pos;

	563 }

	564 else {

	565 reset();

	566 }

	567 }

	568

	569 // if we're already sitting at the beginning of the text, return DONE

	570 if (fText == NULL \|\| (startPos = current()) == 0) {

	571 fLastRuleStatusIndex = 0;

	572 fLastStatusIndexValid = TRUE;

	573 return BreakIterator::DONE;

	574 }

	575

	576 if (fData->fSafeRevTable != NULL \|\| fData->fSafeFwdTable != NULL) {

	577 result = handlePrevious(fData->fReverseTable);

	578 if (fDictionaryCharCount > 0) {

	579 result = checkDictionary(result, startPos, TRUE);

	580 }

	581 return result;

	582 }

	583

	584 // old rule syntax

	585 // set things up. handlePrevious() will back us up to some valid

	586 // break position before the current position (we back our internal

	587 // iterator up one step to prevent handlePrevious() from returning

	588 // the current position), but not necessarily the last one before

	589

	590 // where we started

	591

	592 int32_t start = current();

	593

	594 UTEXT_PREVIOUS32(fText);

	595 int32_t lastResult = handlePrevious(fData->fReverseTable);

	596 if (lastResult == UBRK_DONE) {

	597 lastResult = 0;

	598 utext_setNativeIndex(fText, 0);

	599 }

	600 result = lastResult;

	601 int32_t lastTag = 0;

	602 UBool breakTagValid = FALSE;

	603

	604 // iterate forward from the known break position until we pass our

	605 // starting point. The last break position before the starting

	606 // point is our return value

	607

	608 for (;;) {

	609 result = next();

	610 if (result == BreakIterator::DONE \|\| result >= start) {

	611 break;

	612 }

	613 lastResult = result;

	614 lastTag = fLastRuleStatusIndex;

	615 breakTagValid = TRUE;

	616 }

	617

	618 // fLastBreakTag wants to have the value for section of text preceding

	619 // the result position that we are to return (in lastResult.) If

	620 // the backwards rules overshot and the above loop had to do two or more

	621 // next()s to move up to the desired return position, we will have a valid

	622 // tag value. But, if handlePrevious() took us to exactly the correct result positon,

	623 // we wont have a tag value for that position, which is only set by handleNe xt().

	624

	625 // set the current iteration position to be the last break position

	626 // before where we started, and then return that value

	627 utext_setNativeIndex(fText, lastResult);

	628 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()

	629 fLastStatusIndexValid = breakTagValid;

	630

	631 // No need to check the dictionary; it will have been handled by

	632 // next()

	633

	634 return lastResult;

	635 }

	636

	637 /**

	638 * Sets the iterator to refer to the first boundary position following

	639 * the specified position.

	640 * @offset The position from which to begin searching for a break position.

	641 * @return The position of the first break after the current position.

	642 */

	643 int32_t RuleBasedBreakIterator::following(int32_t offset) {

	644 // if we have cached break positions and offset is in the range

	645 // covered by them, use them

	646 // TODO: could use binary search

	647 // TODO: what if offset is outside range, but break is not?

	648 if (fCachedBreakPositions != NULL) {

	649 if (offset >= fCachedBreakPositions[0]

	650 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {

	651 fPositionInCache = 0;

	652 // We are guaranteed not to leave the array due to range test above

	653 while (offset >= fCachedBreakPositions[fPositionInCache]) {

	654 ++fPositionInCache;

	655 }

	656 int32_t pos = fCachedBreakPositions[fPositionInCache];

	657 utext_setNativeIndex(fText, pos);

	658 return pos;

	659 }

	660 else {

	661 reset();

	662 }

	663 }

	664

	665 // if the offset passed in is already past the end of the text,

	666 // just return DONE; if it's before the beginning, return the

	667 // text's starting offset

	668 fLastRuleStatusIndex = 0;

	669 fLastStatusIndexValid = TRUE;

	670 if (fText == NULL \|\| offset >= utext_nativeLength(fText)) {

	671 last();

	672 return next();

	673 }

	674 else if (offset < 0) {

	675 return first();

	676 }

	677

	678 // otherwise, set our internal iteration position (temporarily)

	679 // to the position passed in. If this is the _beginning_ position,

	680 // then we can just use next() to get our return value

	681

	682 int32_t result = 0;

	683

	684 if (fData->fSafeRevTable != NULL) {

	685 // new rule syntax

	686 utext_setNativeIndex(fText, offset);

	687 // move forward one codepoint to prepare for moving back to a

	688 // safe point.

	689 // this handles offset being between a supplementary character

	690 UTEXT_NEXT32(fText);

	691 // handlePrevious will move most of the time to < 1 boundary away

	692 handlePrevious(fData->fSafeRevTable);

	693 int32_t result = next();

	694 while (result <= offset) {

	695 result = next();

	696 }

	697 return result;

	698 }

	699 if (fData->fSafeFwdTable != NULL) {

	700 // backup plan if forward safe table is not available

	701 utext_setNativeIndex(fText, offset);

	702 UTEXT_PREVIOUS32(fText);

	703 // handle next will give result >= offset

	704 handleNext(fData->fSafeFwdTable);

	705 // previous will give result 0 or 1 boundary away from offset,

	706 // most of the time

	707 // we have to

	708 int32_t oldresult = previous();

	709 while (oldresult > offset) {

	710 int32_t result = previous();

	711 if (result <= offset) {

	712 return oldresult;

	713 }

	714 oldresult = result;

	715 }

	716 int32_t result = next();

	717 if (result <= offset) {

	718 return next();

	719 }

	720 return result;

	721 }

	722 // otherwise, we have to sync up first. Use handlePrevious() to back

	723 // up to a known break position before the specified position (if

	724 // we can determine that the specified position is a break position,

	725 // we don't back up at all). This may or may not be the last break

	726 // position at or before our starting position. Advance forward

	727 // from here until we've passed the starting position. The position

	728 // we stop on will be the first break position after the specified one.

	729 // old rule syntax

	730

	731 utext_setNativeIndex(fText, offset);

	732 if (offset==0 \|\|

	733 (offset==1 && utext_getNativeIndex(fText)==0)) {

	734 return next();

	735 }

	736 result = previous();

	737

	738 while (result != BreakIterator::DONE && result <= offset) {

	739 result = next();

	740 }

	741

	742 return result;

	743 }

	744

	745 /**

	746 * Sets the iterator to refer to the last boundary position before the

	747 * specified position.

	748 * @offset The position to begin searching for a break from.

	749 * @return The position of the last boundary before the starting position.

	750 */

	751 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {

	752 // if we have cached break positions and offset is in the range

	753 // covered by them, use them

	754 if (fCachedBreakPositions != NULL) {

	755 // TODO: binary search?

	756 // TODO: What if offset is outside range, but break is not?

	757 if (offset > fCachedBreakPositions[0]

	758 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1] ) {

	759 fPositionInCache = 0;

	760 while (fPositionInCache < fNumCachedBreakPositions

	761 && offset > fCachedBreakPositions[fPositionInCache])

	762 ++fPositionInCache;

	763 --fPositionInCache;

	764 // If we're at the beginning of the cache, need to reevaluate the

	765 // rule status

	766 if (fPositionInCache <= 0) {

	767 fLastStatusIndexValid = FALSE;

	768 }

	769 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]) ;

	770 return fCachedBreakPositions[fPositionInCache];

	771 }

	772 else {

	773 reset();

	774 }

	775 }

	776

	777 // if the offset passed in is already past the end of the text,

	778 // just return DONE; if it's before the beginning, return the

	779 // text's starting offset

	780 if (fText == NULL \|\| offset > utext_nativeLength(fText)) {

	781 // return BreakIterator::DONE;

	782 return last();

	783 }

	784 else if (offset < 0) {

	785 return first();

	786 }

	787

	788 // if we start by updating the current iteration position to the

	789 // position specified by the caller, we can just use previous()

	790 // to carry out this operation

	791

	792 if (fData->fSafeFwdTable != NULL) {

	793 // new rule syntax

	794 utext_setNativeIndex(fText, offset);

	795 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	796 if (newOffset != offset) {

	797 // Will come here if specified offset was not a code point boundary AND

	798 // the underlying implmentation is using UText, which snaps any no n-code-point-boundary

	799 // indices to the containing code point.

	800 // For breakitereator::preceding only, these non-code-point indices need to be moved

	801 // up to refer to the following codepoint.

	802 UTEXT_NEXT32(fText);

	803 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	804 }

	805

	806 // TODO: (synwee) would it be better to just check for being in the mid dle of a surrogate pair,

	807 // rather than adjusting the position unconditionally?

	808 // (Change would interact with safe rules.)

	809 // TODO: change RBBI behavior for off-boundary indices to match that of UText?

	810 // affects only preceding(), seems cleaner, but is slightly diffe rent.

	811 UTEXT_PREVIOUS32(fText);

	812 handleNext(fData->fSafeFwdTable);

	813 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	814 while (result >= offset) {

	815 result = previous();

	816 }

	817 return result;

	818 }

	819 if (fData->fSafeRevTable != NULL) {

	820 // backup plan if forward safe table is not available

	821 // TODO: check whether this path can be discarded

	822 // It's probably OK to say that rules must supply both safe tabl es

	823 // if they use safe tables at all. We have certainly never d escribed

	824 // to anyone how to work with just one safe table.

	825 utext_setNativeIndex(fText, offset);

	826 UTEXT_NEXT32(fText);

	827

	828 // handle previous will give result <= offset

	829 handlePrevious(fData->fSafeRevTable);

	830

	831 // next will give result 0 or 1 boundary away from offset,

	832 // most of the time

	833 // we have to

	834 int32_t oldresult = next();

	835 while (oldresult < offset) {

	836 int32_t result = next();

	837 if (result >= offset) {

	838 return oldresult;

	839 }

	840 oldresult = result;

	841 }

	842 int32_t result = previous();

	843 if (result >= offset) {

	844 return previous();

	845 }

	846 return result;

	847 }

	848

	849 // old rule syntax

	850 utext_setNativeIndex(fText, offset);

	851 return previous();

	852 }

	853

	854 /**

	855 * Returns true if the specfied position is a boundary position. As a side

	856 * effect, leaves the iterator pointing to the first boundary position at

	857 * or after "offset".

	858 * @param offset the offset to check.

	859 * @return True if "offset" is a boundary position.

	860 */

	861 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {

	862 // the beginning index of the iterator is always a boundary position by defi nition

	863 if (offset == 0) {

	864 first(); // For side effects on current position, tag values.

	865 return TRUE;

	866 }

	867

	868 if (offset == (int32_t)utext_nativeLength(fText)) {

	869 last(); // For side effects on current position, tag values.

	870 return TRUE;

	871 }

	872

	873 // out-of-range indexes are never boundary positions

	874 if (offset < 0) {

	875 first(); // For side effects on current position, tag values.

	876 return FALSE;

	877 }

	878

	879 if (offset > utext_nativeLength(fText)) {

	880 last(); // For side effects on current position, tag values.

	881 return FALSE;

	882 }

	883

	884 // otherwise, we can use following() on the position before the specified

	885 // one and return true if the position we get back is the one the user

	886 // specified

	887 utext_previous32From(fText, offset);

	888 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	889 UBool result = following(backOne) == offset;

	890 return result;

	891 }

	892

	893 /**

	894 * Returns the current iteration position.

	895 * @return The current iteration position.

	896 */

	897 int32_t RuleBasedBreakIterator::current(void) const {

	898 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	899 return pos;

	900 }

	901

	902 //=======================================================================

	903 // implementation

	904 //=======================================================================

	905

	906 //

	907 // RBBIRunMode - the state machine runs an extra iteration at the beginning an d end

	908 // of user text. A variable with this enum type keeps track of where we

	909 // are. The state machine only fetches user input while in the RUN mode.

	910 //

	911 enum RBBIRunMode {

	912 RBBI_START, // state machine processing is before first char of input

	913 RBBI_RUN, // state machine processing is in the user text

	914 RBBI_END // state machine processing is after end of user text.

	915 };

	916

	917

	918 //------------------------------------------------------------------------------ -----

	919 //

	920 // handleNext(stateTable)

	921 // This method is the actual implementation of the rbbi next() method.

	922 // This method initializes the state machine to state 1

	923 // and advances through the text character by character until we reach the e nd

	924 // of the text or the state machine transitions to state 0. We update our r eturn

	925 // value every time the state machine passes through an accepting state.

	926 //

	927 //------------------------------------------------------------------------------ -----

	928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {

	929 int32_t state;

	930 int16_t category = 0;

	931 RBBIRunMode mode;

	932

	933 RBBIStateTableRow *row;

	934 UChar32 c;

	935 int32_t lookaheadStatus = 0;

	936 int32_t lookaheadTagIdx = 0;

	937 int32_t result = 0;

	938 int32_t initialPosition = 0;

	939 int32_t lookaheadResult = 0;

	940 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEA D_HARD_BREAK) != 0;

	941 const char *tableData = statetable->fTableData;

	942 uint32_t tableRowLen = statetable->fRowLen;

	943

	944 #ifdef RBBI_DEBUG

	945 if (fTrace) {

	946 RBBIDebugPuts("Handle Next pos char state category");

	947 }

	948 #endif

	949

	950 // No matter what, handleNext alway correctly sets the break tag value.

	951 fLastStatusIndexValid = TRUE;

	952 fLastRuleStatusIndex = 0;

	953

	954 // if we're already at the end of the text, return DONE.

	955 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	956 result = initialPosition;

	957 c = UTEXT_NEXT32(fText);

	958 if (fData == NULL \|\| c==U_SENTINEL) {

	959 return BreakIterator::DONE;

	960 }

	961

	962 // Set the initial state for the state machine

	963 state = START_STATE;

	964 row = (RBBIStateTableRow *)

	965 //(statetable->fTableData + (statetable->fRowLen * state));

	966 (tableData + tableRowLen * state);

	967

	968

	969 mode = RBBI_RUN;

	970 if (statetable->fFlags & RBBI_BOF_REQUIRED) {

	971 category = 2;

	972 mode = RBBI_START;

	973 }

	974

	975

	976 // loop until we reach the end of the text or transition to state 0

	977 //

	978 for (;;) {

	979 if (c == U_SENTINEL) {

	980 // Reached end of input string.

	981 if (mode == RBBI_END) {

	982 // We have already run the loop one last time with the

	983 // character set to the psueudo {eof} value. Now it is time

	984 // to unconditionally bail out.

	985 if (lookaheadResult > result) {

	986 // We ran off the end of the string with a pending look-ahea d match.

	987 // Treat this as if the look-ahead condition had been met, a nd return

	988 // the match at the / position from the look-ahead rule.

	989 result = lookaheadResult;

	990 fLastRuleStatusIndex = lookaheadTagIdx;

	991 lookaheadStatus = 0;

	992 }

	993 break;

	994 }

	995 // Run the loop one last time with the fake end-of-input character c ategory.

	996 mode = RBBI_END;

	997 category = 1;

	998 }

	999

	1000 //

	1001 // Get the char category. An incoming category of 1 or 2 means that

	1002 // we are preset for doing the beginning or end of input, and

	1003 // that we shouldn't get a category from an actual text input chara cter.

	1004 //

	1005 if (mode == RBBI_RUN) {

	1006 // look up the current character's character category, which tells u s

	1007 // which column in the state table to look at.

	1008 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,

	1009 // not the size of the character going in, which is a UChar32 .

	1010 //

	1011 UTRIE_GET16(&fData->fTrie, c, category);

	1012

	1013 // Check the dictionary bit in the character's category.

	1014 // Counter is only used by dictionary based iterators (subclasses ).

	1015 // Chars that need to be handled by a dictionary have a flag bit set

	1016 // in their category values.

	1017 //

	1018 if ((category & 0x4000) != 0) {

	1019 fDictionaryCharCount++;

	1020 // And off the dictionary flag bit.

	1021 category &= ~0x4000;

	1022 }

	1023 }

	1024

	1025 #ifdef RBBI_DEBUG

	1026 if (fTrace) {

	1027 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fTe xt));

	1028 if (0x20<=c && c<0x7f) {

	1029 RBBIDebugPrintf("\"%c\" ", c);

	1030 } else {

	1031 RBBIDebugPrintf("%5x ", c);

	1032 }

	1033 RBBIDebugPrintf("%3d %3d\n", state, category);

	1034 }

	1035 #endif

	1036

	1037 // State Transition - move machine to its next state

	1038 //

	1039 state = row->fNextState[category];

	1040 row = (RBBIStateTableRow *)

	1041 // (statetable->fTableData + (statetable->fRowLen * state));

	1042 (tableData + tableRowLen * state);

	1043

	1044

	1045 if (row->fAccepting == -1) {

	1046 // Match found, common case.

	1047 if (mode != RBBI_START) {

	1048 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1049 }

	1050 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.

	1051 }

	1052

	1053 if (row->fLookAhead != 0) {

	1054 if (lookaheadStatus != 0

	1055 && row->fAccepting == lookaheadStatus) {

	1056 // Lookahead match is completed.

	1057 result = lookaheadResult;

	1058 fLastRuleStatusIndex = lookaheadTagIdx;

	1059 lookaheadStatus = 0;

	1060 // TODO: make a standalone hard break in a rule work.

	1061 if (lookAheadHardBreak) {

	1062 UTEXT_SETNATIVEINDEX(fText, result);

	1063 return result;

	1064 }

	1065 // Look-ahead completed, but other rules may match further. Con tinue on

	1066 // TODO: junk this feature? I don't think it's used anywhwere .

	1067 goto continueOn;

	1068 }

	1069

	1070 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1071 lookaheadResult = r;

	1072 lookaheadStatus = row->fLookAhead;

	1073 lookaheadTagIdx = row->fTagIdx;

	1074 goto continueOn;

	1075 }

	1076

	1077

	1078 if (row->fAccepting != 0) {

	1079 // Because this is an accepting state, any in-progress look-ahead ma tch

	1080 // is no longer relavant. Clear out the pending lookahead status.

	1081 lookaheadStatus = 0; // clear out any pending look-ahead m atch.

	1082 }

	1083

	1084 continueOn:

	1085 if (state == STOP_STATE) {

	1086 // This is the normal exit from the lookup state machine.

	1087 // We have advanced through the string until it is certain that no

	1088 // longer match is possible, no matter what characters follow.

	1089 break;

	1090 }

	1091

	1092 // Advance to the next character.

	1093 // If this is a beginning-of-input loop iteration, don't advance

	1094 // the input position. The next iteration will be processing the

	1095 // first real input character.

	1096 if (mode == RBBI_RUN) {

	1097 c = UTEXT_NEXT32(fText);

	1098 } else {

	1099 if (mode == RBBI_START) {

	1100 mode = RBBI_RUN;

	1101 }

	1102 }

	1103

	1104

	1105 }

	1106

	1107 // The state machine is done. Check whether it found a match...

	1108

	1109 // If the iterator failed to advance in the match engine, force it ahead by one.

	1110 // (This really indicates a defect in the break rules. They should always match

	1111 // at least one character.)

	1112 if (result == initialPosition) {

	1113 UTEXT_SETNATIVEINDEX(fText, initialPosition);

	1114 UTEXT_NEXT32(fText);

	1115 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1116 }

	1117

	1118 // Leave the iterator at our result position.

	1119 UTEXT_SETNATIVEINDEX(fText, result);

	1120 #ifdef RBBI_DEBUG

	1121 if (fTrace) {

	1122 RBBIDebugPrintf("result = %d\n\n", result);

	1123 }

	1124 #endif

	1125 return result;

	1126 }

	1127

	1128

	1129

	1130 //------------------------------------------------------------------------------ -----

	1131 //

	1132 // handlePrevious()

	1133 //

	1134 // Iterate backwards, according to the logic of the reverse rules.

	1135 // This version handles the exact style backwards rules.

	1136 //

	1137 // The logic of this function is very similar to handleNext(), above.

	1138 //

	1139 //------------------------------------------------------------------------------ -----

	1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {

	1141 int32_t state;

	1142 int16_t category = 0;

	1143 RBBIRunMode mode;

	1144 RBBIStateTableRow *row;

	1145 UChar32 c;

	1146 int32_t lookaheadStatus = 0;

	1147 int32_t result = 0;

	1148 int32_t initialPosition = 0;

	1149 int32_t lookaheadResult = 0;

	1150 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEA D_HARD_BREAK) != 0;

	1151

	1152 #ifdef RBBI_DEBUG

	1153 if (fTrace) {

	1154 RBBIDebugPuts("Handle Previous pos char state category");

	1155 }

	1156 #endif

	1157

	1158 // handlePrevious() never gets the rule status.

	1159 // Flag the status as invalid; if the user ever asks for status, we will nee d

	1160 // to back up, then re-find the break position using handleNext(), which doe s

	1161 // get the status value.

	1162 fLastStatusIndexValid = FALSE;

	1163 fLastRuleStatusIndex = 0;

	1164

	1165 // if we're already at the start of the text, return DONE.

	1166 if (fText == NULL \|\| fData == NULL \|\| UTEXT_GETNATIVEINDEX(fText)==0) {

	1167 return BreakIterator::DONE;

	1168 }

	1169

	1170 // Set up the starting char.

	1171 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1172 result = initialPosition;

	1173 c = UTEXT_PREVIOUS32(fText);

	1174

	1175 // Set the initial state for the state machine

	1176 state = START_STATE;

	1177 row = (RBBIStateTableRow *)

	1178 (statetable->fTableData + (statetable->fRowLen * state));

	1179 category = 3;

	1180 mode = RBBI_RUN;

	1181 if (statetable->fFlags & RBBI_BOF_REQUIRED) {

	1182 category = 2;

	1183 mode = RBBI_START;

	1184 }

	1185

	1186

	1187 // loop until we reach the start of the text or transition to state 0

	1188 //

	1189 for (;;) {

	1190 if (c == U_SENTINEL) {

	1191 // Reached end of input string.

	1192 if (mode == RBBI_END) {

	1193 // We have already run the loop one last time with the

	1194 // character set to the psueudo {eof} value. Now it is time

	1195 // to unconditionally bail out.

	1196 if (lookaheadResult < result) {

	1197 // We ran off the end of the string with a pending look-ahea d match.

	1198 // Treat this as if the look-ahead condition had been met, a nd return

	1199 // the match at the / position from the look-ahead rule.

	1200 result = lookaheadResult;

	1201 lookaheadStatus = 0;

	1202 } else if (result == initialPosition) {

	1203 // Ran off start, no match found.

	1204 // move one index one (towards the start, since we are doing a previous())

	1205 UTEXT_SETNATIVEINDEX(fText, initialPosition);

	1206 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.

	1207 }

	1208 break;

	1209 }

	1210 // Run the loop one last time with the fake end-of-input character c ategory.

	1211 mode = RBBI_END;

	1212 category = 1;

	1213 }

	1214

	1215 //

	1216 // Get the char category. An incoming category of 1 or 2 means that

	1217 // we are preset for doing the beginning or end of input, and

	1218 // that we shouldn't get a category from an actual text input chara cter.

	1219 //

	1220 if (mode == RBBI_RUN) {

	1221 // look up the current character's character category, which tells u s

	1222 // which column in the state table to look at.

	1223 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,

	1224 // not the size of the character going in, which is a UChar32 .

	1225 //

	1226 UTRIE_GET16(&fData->fTrie, c, category);

	1227

	1228 // Check the dictionary bit in the character's category.

	1229 // Counter is only used by dictionary based iterators (subclasses ).

	1230 // Chars that need to be handled by a dictionary have a flag bit set

	1231 // in their category values.

	1232 //

	1233 if ((category & 0x4000) != 0) {

	1234 fDictionaryCharCount++;

	1235 // And off the dictionary flag bit.

	1236 category &= ~0x4000;

	1237 }

	1238 }

	1239

	1240 #ifdef RBBI_DEBUG

	1241 if (fTrace) {

	1242 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeI ndex(fText));

	1243 if (0x20<=c && c<0x7f) {

	1244 RBBIDebugPrintf("\"%c\" ", c);

	1245 } else {

	1246 RBBIDebugPrintf("%5x ", c);

	1247 }

	1248 RBBIDebugPrintf("%3d %3d\n", state, category);

	1249 }

	1250 #endif

	1251

	1252 // State Transition - move machine to its next state

	1253 //

	1254 state = row->fNextState[category];

	1255 row = (RBBIStateTableRow *)

	1256 (statetable->fTableData + (statetable->fRowLen * state));

	1257

	1258 if (row->fAccepting == -1) {

	1259 // Match found, common case.

	1260 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1261 }

	1262

	1263 if (row->fLookAhead != 0) {

	1264 if (lookaheadStatus != 0

	1265 && row->fAccepting == lookaheadStatus) {

	1266 // Lookahead match is completed.

	1267 result = lookaheadResult;

	1268 lookaheadStatus = 0;

	1269 // TODO: make a standalone hard break in a rule work.

	1270 if (lookAheadHardBreak) {

	1271 UTEXT_SETNATIVEINDEX(fText, result);

	1272 return result;

	1273 }

	1274 // Look-ahead completed, but other rules may match further. Con tinue on

	1275 // TODO: junk this feature? I don't think it's used anywhwere .

	1276 goto continueOn;

	1277 }

	1278

	1279 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1280 lookaheadResult = r;

	1281 lookaheadStatus = row->fLookAhead;

	1282 goto continueOn;

	1283 }

	1284

	1285

	1286 if (row->fAccepting != 0) {

	1287 // Because this is an accepting state, any in-progress look-ahead ma tch

	1288 // is no longer relavant. Clear out the pending lookahead status.

	1289 lookaheadStatus = 0;

	1290 }

	1291

	1292 continueOn:

	1293 if (state == STOP_STATE) {

	1294 // This is the normal exit from the lookup state machine.

	1295 // We have advanced through the string until it is certain that no

	1296 // longer match is possible, no matter what characters follow.

	1297 break;

	1298 }

	1299

	1300 // Move (backwards) to the next character to process.

	1301 // If this is a beginning-of-input loop iteration, don't advance

	1302 // the input position. The next iteration will be processing the

	1303 // first real input character.

	1304 if (mode == RBBI_RUN) {

	1305 c = UTEXT_PREVIOUS32(fText);

	1306 } else {

	1307 if (mode == RBBI_START) {

	1308 mode = RBBI_RUN;

	1309 }

	1310 }

	1311 }

	1312

	1313 // The state machine is done. Check whether it found a match...

	1314

	1315 // If the iterator failed to advance in the match engine, force it ahead by one.

	1316 // (This really indicates a defect in the break rules. They should always match

	1317 // at least one character.)

	1318 if (result == initialPosition) {

	1319 UTEXT_SETNATIVEINDEX(fText, initialPosition);

	1320 UTEXT_PREVIOUS32(fText);

	1321 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1322 }

	1323

	1324 // Leave the iterator at our result position.

	1325 UTEXT_SETNATIVEINDEX(fText, result);

	1326 #ifdef RBBI_DEBUG

	1327 if (fTrace) {

	1328 RBBIDebugPrintf("result = %d\n\n", result);

	1329 }

	1330 #endif

	1331 return result;

	1332 }

	1333

	1334

	1335 void

	1336 RuleBasedBreakIterator::reset()

	1337 {

	1338 if (fCachedBreakPositions) {

	1339 uprv_free(fCachedBreakPositions);

	1340 }

	1341 fCachedBreakPositions = NULL;

	1342 fNumCachedBreakPositions = 0;

	1343 fDictionaryCharCount = 0;

	1344 fPositionInCache = 0;

	1345 }

	1346

	1347

	1348

	1349 //------------------------------------------------------------------------------ -

	1350 //

	1351 // getRuleStatus() Return the break rule tag associated with the current

	1352 // iterator position. If the iterator arrived at its curren t

	1353 // position by iterating forwards, the value will have been

	1354 // cached by the handleNext() function.

	1355 //

	1356 // If no cached status value is available, the status is

	1357 // found by doing a previous() followed by a next(), which

	1358 // leaves the iterator where it started, and computes the

	1359 // status while doing the next().

	1360 //

	1361 //------------------------------------------------------------------------------ -

	1362 void RuleBasedBreakIterator::makeRuleStatusValid() {

	1363 if (fLastStatusIndexValid == FALSE) {

	1364 // No cached status is available.

	1365 if (fText == NULL \|\| current() == 0) {

	1366 // At start of text, or there is no text. Status is always zero.

	1367 fLastRuleStatusIndex = 0;

	1368 fLastStatusIndexValid = TRUE;

	1369 } else {

	1370 // Not at start of text. Find status the tedious way.

	1371 int32_t pa = current();

	1372 previous();

	1373 if (fNumCachedBreakPositions > 0) {

	1374 reset(); // Blow off the dictionary cache

	1375 }

	1376 int32_t pb = next();

	1377 if (pa != pb) {

	1378 // note: the if (pa != pb) test is here only to eliminate warnin gs for

	1379 // unused local variables on gcc. Logically, it isn't nee ded.

	1380 U_ASSERT(pa == pb);

	1381 }

	1382 }

	1383 }

	1384 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatu sMaxIdx);

	1385 }

	1386

	1387

	1388 int32_t RuleBasedBreakIterator::getRuleStatus() const {

	1389 RuleBasedBreakIterator nonConstThis = (RuleBasedBreakIterator )this;

	1390 nonConstThis->makeRuleStatusValid();

	1391

	1392 // fLastRuleStatusIndex indexes to the start of the appropriate status recor d

	1393 // (the number of status val ues.)

	1394 // This function returns the last (largest) of the array of status values.

	1395 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatu sIndex];

	1396 int32_t tagVal = fData->fRuleStatusTable[idx];

	1397

	1398 return tagVal;

	1399 }

	1400

	1401

	1402

	1403

	1404 int32_t RuleBasedBreakIterator::getRuleStatusVec(

	1405 int32_t *fillInVec, int32_t capacity, UErrorCode &status)

	1406 {

	1407 if (U_FAILURE(status)) {

	1408 return 0;

	1409 }

	1410

	1411 RuleBasedBreakIterator nonConstThis = (RuleBasedBreakIterator )this;

	1412 nonConstThis->makeRuleStatusValid();

	1413 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];

	1414 int32_t numValsToCopy = numVals;

	1415 if (numVals > capacity) {

	1416 status = U_BUFFER_OVERFLOW_ERROR;

	1417 numValsToCopy = capacity;

	1418 }

	1419 int i;

	1420 for (i=0; i<numValsToCopy; i++) {

	1421 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];

	1422 }

	1423 return numVals;

	1424 }

	1425

	1426

	1427

	1428 //------------------------------------------------------------------------------ -

	1429 //

	1430 // getBinaryRules Access to the compiled form of the rules,

	1431 // for use by build system tools that save the data

	1432 // for standard iterator types.

	1433 //

	1434 //------------------------------------------------------------------------------ -

	1435 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {

	1436 const uint8_t *retPtr = NULL;

	1437 length = 0;

	1438

	1439 if (fData != NULL) {

	1440 retPtr = (const uint8_t *)fData->fHeader;

	1441 length = fData->fHeader->fLength;

	1442 }

	1443 return retPtr;

	1444 }

	1445

	1446

	1447

	1448

	1449 //------------------------------------------------------------------------------ -

	1450 //

	1451 // BufferClone TODO: In my (Andy) opinion, this function should be depre cated.

	1452 // Saving one heap allocation isn't worth the trouble.

	1453 // Cloning shouldn't be done in tight loops, and

	1454 // making the clone copy involves other heap operations anywa y.

	1455 // And the application code for correctly dealing with buffer

	1456 // size problems and the eventual object destruction is ugly.

	1457 //

	1458 //------------------------------------------------------------------------------ -

	1459 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,

	1460 int32_t &bufferSize,

	1461 UErrorCode &status)

	1462 {

	1463 if (U_FAILURE(status)){

	1464 return NULL;

	1465 }

	1466

	1467 //

	1468 // If user buffer size is zero this is a preflight operation to

	1469 // obtain the needed buffer size, allowing for worst case misalignment.

	1470 //

	1471 if (bufferSize == 0) {

	1472 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);

	1473 return NULL;

	1474 }

	1475

	1476

	1477 //

	1478 // Check the alignment and size of the user supplied buffer.

	1479 // Allocate heap memory if the user supplied memory is insufficient.

	1480 //

	1481 char buf = (char )stackBuffer;

	1482 uint32_t s = bufferSize;

	1483

	1484 if (stackBuffer == NULL) {

	1485 s = 0; // Ignore size, force allocation if user didn't give us a buffe r.

	1486 }

	1487 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {

	1488 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);

	1489 s -= offsetUp;

	1490 buf += offsetUp;

	1491 }

	1492 if (s < sizeof(RuleBasedBreakIterator)) {

	1493 // Not enough room in the caller-supplied buffer.

	1494 // Do a plain-vanilla heap based clone and return that, along with

	1495 // a warning that the clone was allocated.

	1496 RuleBasedBreakIterator clonedBI = new RuleBasedBreakIterator(this);

	1497 if (clonedBI == 0) {

	1498 status = U_MEMORY_ALLOCATION_ERROR;

	1499 } else {

	1500 status = U_SAFECLONE_ALLOCATED_WARNING;

	1501 }

	1502 return clonedBI;

	1503 }

	1504

	1505 //

	1506 // Clone the source BI into the caller-supplied buffer.

	1507 // TODO: using an overloaded operator new to directly initialize the

	1508 // copy in the user's buffer would be better, but it doesn't seem

	1509 // to get along with namespaces. Investigate why.

	1510 //

	1511 // The memcpy is only safe with an empty (default constructed)

	1512 // break iterator. Use on others can screw up reference counts

	1513 // to data. memcpy-ing objects is not really a good idea...

	1514 //

	1515 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy

	1516 RuleBasedBreakIterator clone = (RuleBasedBreakIterator )buf;

	1517 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part

	1518 clone->init(); // Init RuleBasedBreakIterator part, (user def ault constructor)

	1519 clone = this; // clone = the real BI we want.

	1520 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close ( From C code)

	1521

	1522 return clone;

	1523 }

	1524

	1525

	1526 //------------------------------------------------------------------------------ -

	1527 //

	1528 // isDictionaryChar Return true if the category lookup for this char

	1529 // indicates that it is in the set of dictionary lookup

	1530 // chars.

	1531 //

	1532 // This function is intended for use by dictionary based

	1533 // break iterators.

	1534 //

	1535 //------------------------------------------------------------------------------ -

	1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {

	1537 if (fData == NULL) {

	1538 return FALSE;

	1539 }

	1540 uint16_t category;

	1541 UTRIE_GET16(&fData->fTrie, c, category);

	1542 return (category & 0x4000) != 0;

	1543 }*/

	1544

	1545

	1546 //------------------------------------------------------------------------------ -

	1547 //

	1548 // checkDictionary This function handles all processing of characters in

	1549 // the "dictionary" set. It will determine the appropriat e

	1550 // course of action, and possibly set up a cache in the

	1551 // process.

	1552 //

	1553 //------------------------------------------------------------------------------ -

	1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,

	1555 int32_t endPos,

	1556 UBool reverse) {

	1557 // Reset the old break cache first.

	1558 uint32_t dictionaryCount = fDictionaryCharCount;

	1559 reset();

	1560

	1561 if (dictionaryCount <= 1 \|\| (endPos - startPos) <= 1) {

	1562 return (reverse ? startPos : endPos);

	1563 }

	1564

	1565 // Bug 5532. The dictionary code will crash if the input text is UTF-8

	1566 // because native indexes are different from UTF-16 indexes.

	1567 // Temporary hack: skip dictionary lookup for UTF-8 encoded text.

	1568 // It wont give the right breaks, but it's better than a crash.

	1569 //

	1570 // Check the type of the UText by checking its pFuncs field, which

	1571 // is UText's function dispatch table. It will be the same for all

	1572 // UTF-8 UTexts and different for any other UText type.

	1573 //

	1574 // We have no other type of UText available with non-UTF-16 native inde xing.

	1575 // This whole check will go away once the dictionary code is fixed.

	1576 static const void *utext_utf8Funcs;

	1577 if (utext_utf8Funcs == NULL) {

	1578 // Cache the UTF-8 UText function pointer value.

	1579 UErrorCode status = U_ZERO_ERROR;

	1580 UText tempUText = UTEXT_INITIALIZER;

	1581 utext_openUTF8(&tempUText, NULL, 0, &status);

	1582 utext_utf8Funcs = tempUText.pFuncs;

	1583 utext_close(&tempUText);

	1584 }

	1585 if (fText->pFuncs == utext_utf8Funcs) {

	1586 return (reverse ? startPos : endPos);

	1587 }

	1588

	1589 // Starting from the starting point, scan towards the proposed result,

	1590 // looking for the first dictionary character (which may be the one

	1591 // we're on, if we're starting in the middle of a range).

	1592 utext_setNativeIndex(fText, reverse ? endPos : startPos);

	1593 if (reverse) {

	1594 UTEXT_PREVIOUS32(fText);

	1595 }

	1596

	1597 int32_t rangeStart = startPos;

	1598 int32_t rangeEnd = endPos;

	1599

	1600 uint16_t category;

	1601 int32_t current;

	1602 UErrorCode status = U_ZERO_ERROR;

	1603 UStack breaks(status);

	1604 int32_t foundBreakCount = 0;

	1605 UChar32 c = utext_current32(fText);

	1606

	1607 UTRIE_GET16(&fData->fTrie, c, category);

	1608

	1609 // Is the character we're starting on a dictionary character? If so, we

	1610 // need to back up to include the entire run; otherwise the results of

	1611 // the break algorithm will differ depending on where we start. Since

	1612 // the result is cached and there is typically a non-dictionary break

	1613 // within a small number of words, there should be little performance impact .

	1614 if (category & 0x4000) {

	1615 if (reverse) {

	1616 do {

	1617 utext_next32(fText); // TODO: recast to work directly with postincrement.

	1618 c = utext_current32(fText);

	1619 UTRIE_GET16(&fData->fTrie, c, category);

	1620 } while (c != U_SENTINEL && (category & 0x4000));

	1621 // Back up to the last dictionary character

	1622 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);

	1623 if (c == U_SENTINEL) {

	1624 // c = fText->last32();

	1625 // TODO: why was this if needed?

	1626 c = UTEXT_PREVIOUS32(fText);

	1627 }

	1628 else {

	1629 c = UTEXT_PREVIOUS32(fText);

	1630 }

	1631 }

	1632 else {

	1633 do {

	1634 c = UTEXT_PREVIOUS32(fText);

	1635 UTRIE_GET16(&fData->fTrie, c, category);

	1636 }

	1637 while (c != U_SENTINEL && (category & 0x4000));

	1638 // Back up to the last dictionary character

	1639 if (c == U_SENTINEL) {

	1640 // c = fText->first32();

	1641 c = utext_current32(fText);

	1642 }

	1643 else {

	1644 utext_next32(fText);

	1645 c = utext_current32(fText);

	1646 }

	1647 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;

	1648 }

	1649 UTRIE_GET16(&fData->fTrie, c, category);

	1650 }

	1651

	1652 // Loop through the text, looking for ranges of dictionary characters.

	1653 // For each span, find the appropriate break engine, and ask it to find

	1654 // any breaks within the span.

	1655 // Note: we always do this in the forward direction, so that the break

	1656 // cache is built in the right order.

	1657 if (reverse) {

	1658 utext_setNativeIndex(fText, rangeStart);

	1659 c = utext_current32(fText);

	1660 UTRIE_GET16(&fData->fTrie, c, category);

	1661 }

	1662 while(U_SUCCESS(status)) {

	1663 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (ca tegory & 0x4000) == 0) {

	1664 utext_next32(fText); // TODO: tweak for post-increment op eration

	1665 c = utext_current32(fText);

	1666 UTRIE_GET16(&fData->fTrie, c, category);

	1667 }

	1668 if (current >= rangeEnd) {

	1669 break;

	1670 }

	1671

	1672 // We now have a dictionary character. Get the appropriate language obje ct

	1673 // to deal with it.

	1674 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

	1675

	1676 // Ask the language object if there are any breaks. It will leave the te xt

	1677 // pointer on the other side of its range, ready to search for the next one.

	1678 if (lbe != NULL) {

	1679 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALS E, fBreakType, breaks);

	1680 }

	1681

	1682 // Reload the loop variables for the next go-round

	1683 c = utext_current32(fText);

	1684 UTRIE_GET16(&fData->fTrie, c, category);

	1685 }

	1686

	1687 // If we found breaks, build a new break cache. The first and last entries m ust

	1688 // be the original starting and ending position.

	1689 if (foundBreakCount > 0) {

	1690 int32_t totalBreaks = foundBreakCount;

	1691 if (startPos < breaks.elementAti(0)) {

	1692 totalBreaks += 1;

	1693 }

	1694 if (endPos > breaks.peeki()) {

	1695 totalBreaks += 1;

	1696 }

	1697 fCachedBreakPositions = (int32_t )uprv_malloc(totalBreaks sizeof(int3 2_t));

	1698 if (fCachedBreakPositions != NULL) {

	1699 int32_t out = 0;

	1700 fNumCachedBreakPositions = totalBreaks;

	1701 if (startPos < breaks.elementAti(0)) {

	1702 fCachedBreakPositions[out++] = startPos;

	1703 }

	1704 for (int32_t i = 0; i < foundBreakCount; ++i) {

	1705 fCachedBreakPositions[out++] = breaks.elementAti(i);

	1706 }

	1707 if (endPos > fCachedBreakPositions[out-1]) {

	1708 fCachedBreakPositions[out] = endPos;

	1709 }

	1710 // If there are breaks, then by definition, we are replacing the ori ginal

	1711 // proposed break by one of the breaks we found. Use following() and

	1712 // preceding() to do the work. They should never recurse in this cas e.

	1713 if (reverse) {

	1714 return preceding(endPos - 1);

	1715 }

	1716 else {

	1717 return following(startPos);

	1718 }

	1719 }

	1720 // If the allocation failed, just fall through to the "no breaks found" case.

	1721 }

	1722

	1723 // If we get here, there were no language-based breaks. Set the text pointer

	1724 // to the original proposed break.

	1725 utext_setNativeIndex(fText, reverse ? startPos : endPos);

	1726 return (reverse ? startPos : endPos);

	1727 }

	1728

	1729 U_NAMESPACE_END

	1730

	1731 // defined in ucln_cmn.h

	1732

	1733 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;

	1734

	1735 /**

	1736 * Release all static memory held by breakiterator.

	1737 */

	1738 U_CDECL_BEGIN

	1739 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {

	1740 if (gLanguageBreakFactories) {

	1741 delete gLanguageBreakFactories;

	1742 gLanguageBreakFactories = NULL;

	1743 }

	1744 return TRUE;

	1745 }

	1746 U_CDECL_END

	1747

	1748 U_CDECL_BEGIN

	1749 static void U_CALLCONV _deleteFactory(void *obj) {

	1750 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;

	1751 }

	1752 U_CDECL_END

	1753 U_NAMESPACE_BEGIN

	1754

	1755 static const LanguageBreakEngine*

	1756 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)

	1757 {

	1758 UBool needsInit;

	1759 UErrorCode status = U_ZERO_ERROR;

	1760 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);

	1761

	1762 if (needsInit) {

	1763 UStack *factories = new UStack(_deleteFactory, NULL, status);

	1764 if (factories != NULL && U_SUCCESS(status)) {

	1765 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(statu s);

	1766 factories->push(builtIn, status);

	1767 #ifdef U_LOCAL_SERVICE_HOOK

	1768 LanguageBreakFactory extra = (LanguageBreakFactory )uprv_svc_hook( "languageBreakFactory", &status);

	1769 if (extra != NULL) {

	1770 factories->push(extra, status);

	1771 }

	1772 #endif

	1773 }

	1774 umtx_lock(NULL);

	1775 if (gLanguageBreakFactories == NULL) {

	1776 gLanguageBreakFactories = factories;

	1777 factories = NULL;

	1778 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakite rator_cleanup_dict);

	1779 }

	1780 umtx_unlock(NULL);

	1781 delete factories;

	1782 }

	1783

	1784 if (gLanguageBreakFactories == NULL) {

	1785 return NULL;

	1786 }

	1787

	1788 int32_t i = gLanguageBreakFactories->size();

	1789 const LanguageBreakEngine *lbe = NULL;

	1790 while (--i >= 0) {

	1791 LanguageBreakFactory factory = (LanguageBreakFactory )(gLanguageBreakF actories->elementAt(i));

	1792 lbe = factory->getEngineFor(c, breakType);

	1793 if (lbe != NULL) {

	1794 break;

	1795 }

	1796 }

	1797 return lbe;

	1798 }

	1799

	1800

	1801 //------------------------------------------------------------------------------ -

	1802 //

	1803 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the

	1804 // the characer c.

	1805 //

	1806 //------------------------------------------------------------------------------ -

	1807 const LanguageBreakEngine *

	1808 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {

	1809 const LanguageBreakEngine *lbe = NULL;

	1810 UErrorCode status = U_ZERO_ERROR;

	1811

	1812 if (fLanguageBreakEngines == NULL) {

	1813 fLanguageBreakEngines = new UStack(status);

	1814 if (fLanguageBreakEngines == NULL \|\| U_FAILURE(status)) {

	1815 delete fLanguageBreakEngines;

	1816 fLanguageBreakEngines = 0;

	1817 return NULL;

	1818 }

	1819 }

	1820

	1821 int32_t i = fLanguageBreakEngines->size();

	1822 while (--i >= 0) {

	1823 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)) ;

	1824 if (lbe->handles(c, fBreakType)) {

	1825 return lbe;

	1826 }

	1827 }

	1828

	1829 // No existing dictionary took the character. See if a factory wants to

	1830 // give us a new LanguageBreakEngine for this character.

	1831 lbe = getLanguageBreakEngineFromFactory(c, fBreakType);

	1832

	1833 // If we got one, use it and push it on our stack.

	1834 if (lbe != NULL) {

	1835 fLanguageBreakEngines->push((void *)lbe, status);

	1836 // Even if we can't remember it, we can keep looking it up, so

	1837 // return it even if the push fails.

	1838 return lbe;

	1839 }

	1840

	1841 // No engine is forthcoming for this character. Add it to the

	1842 // reject set. Create the reject break engine if needed.

	1843 if (fUnhandledBreakEngine == NULL) {

	1844 fUnhandledBreakEngine = new UnhandledEngine(status);

	1845 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {

	1846 status = U_MEMORY_ALLOCATION_ERROR;

	1847 }

	1848 // Put it last so that scripts for which we have an engine get tried

	1849 // first.

	1850 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status) ;

	1851 // If we can't insert it, or creation failed, get rid of it

	1852 if (U_FAILURE(status)) {

	1853 delete fUnhandledBreakEngine;

	1854 fUnhandledBreakEngine = 0;

	1855 return NULL;

	1856 }

	1857 }

	1858

	1859 // Tell the reject engine about the character; at its discretion, it may

	1860 // add more than just the one character.

	1861 fUnhandledBreakEngine->handleCharacter(c, fBreakType);

	1862

	1863 return fUnhandledBreakEngine;

	1864 }

	1865

	1866

	1867

	1868 /*int32_t RuleBasedBreakIterator::getBreakType() const {

	1869 return fBreakType;

	1870 }*/

	1871

	1872 void RuleBasedBreakIterator::setBreakType(int32_t type) {

	1873 fBreakType = type;

	1874 reset();

	1875 }

	1876

	1877 U_NAMESPACE_END

	1878

	1879 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

OLD	NEW

« no previous file with comments | « icu46/source/common/putilimp.h ('k') | icu46/source/common/rbbicst.pl » ('j') | no next file with comments »