Index: source/common/rbbi.cpp |
diff --git a/source/common/rbbi.cpp b/source/common/rbbi.cpp |
index 6ab57a7c1175d5769e091c4c70edf8b09b364e28..19494af26a564a38909aac4af915be6d459f8b9c 100644 |
--- a/source/common/rbbi.cpp |
+++ b/source/common/rbbi.cpp |
@@ -1,6 +1,6 @@ |
/* |
*************************************************************************** |
-* Copyright (C) 1999-2013 International Business Machines Corporation |
+* Copyright (C) 1999-2014 International Business Machines Corporation |
* and others. All rights reserved. |
*************************************************************************** |
*/ |
@@ -518,8 +518,8 @@ RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, U |
/** |
- * Sets the current iteration position to the beginning of the text. |
- * @return The offset of the beginning of the text. |
+ * Sets the current iteration position to the beginning of the text, position zero. |
+ * @return The new iterator position, which is zero. |
*/ |
int32_t RuleBasedBreakIterator::first(void) { |
reset(); |
@@ -592,6 +592,7 @@ int32_t RuleBasedBreakIterator::next(void) { |
} |
int32_t startPos = current(); |
+ fDictionaryCharCount = 0; |
int32_t result = handleNext(fData->fForwardTable); |
if (fDictionaryCharCount > 0) { |
result = checkDictionary(startPos, result, FALSE); |
@@ -646,7 +647,6 @@ int32_t RuleBasedBreakIterator::previous(void) { |
// break position before the current position (we back our internal |
// iterator up one step to prevent handlePrevious() from returning |
// the current position), but not necessarily the last one before |
- |
// where we started |
int32_t start = current(); |
@@ -679,11 +679,11 @@ int32_t RuleBasedBreakIterator::previous(void) { |
// the result position that we are to return (in lastResult.) If |
// the backwards rules overshot and the above loop had to do two or more |
// next()s to move up to the desired return position, we will have a valid |
- // tag value. But, if handlePrevious() took us to exactly the correct result positon, |
+ // tag value. But, if handlePrevious() took us to exactly the correct result position, |
// we wont have a tag value for that position, which is only set by handleNext(). |
- // set the current iteration position to be the last break position |
- // before where we started, and then return that value |
+ // Set the current iteration position to be the last break position |
+ // before where we started, and then return that value. |
utext_setNativeIndex(fText, lastResult); |
fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() |
fLastStatusIndexValid = breakTagValid; |
@@ -701,6 +701,22 @@ int32_t RuleBasedBreakIterator::previous(void) { |
* @return The position of the first break after the current position. |
*/ |
int32_t RuleBasedBreakIterator::following(int32_t offset) { |
+ // if the offset passed in is already past the end of the text, |
+ // just return DONE; if it's before the beginning, return the |
+ // text's starting offset |
+ if (fText == NULL || offset >= utext_nativeLength(fText)) { |
+ last(); |
+ return next(); |
+ } |
+ else if (offset < 0) { |
+ return first(); |
+ } |
+ |
+ // Move requested offset to a code point start. It might be on a trail surrogate, |
+ // or on a trail byte if the input is UTF-8. |
+ utext_setNativeIndex(fText, offset); |
+ offset = utext_getNativeIndex(fText); |
+ |
// if we have cached break positions and offset is in the range |
// covered by them, use them |
// TODO: could use binary search |
@@ -722,20 +738,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { |
} |
} |
- // if the offset passed in is already past the end of the text, |
- // just return DONE; if it's before the beginning, return the |
- // text's starting offset |
- fLastRuleStatusIndex = 0; |
- fLastStatusIndexValid = TRUE; |
- if (fText == NULL || offset >= utext_nativeLength(fText)) { |
- last(); |
- return next(); |
- } |
- else if (offset < 0) { |
- return first(); |
- } |
- |
- // otherwise, set our internal iteration position (temporarily) |
+ // Set our internal iteration position (temporarily) |
// to the position passed in. If this is the _beginning_ position, |
// then we can just use next() to get our return value |
@@ -747,6 +750,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { |
// move forward one codepoint to prepare for moving back to a |
// safe point. |
// this handles offset being between a supplementary character |
+ // TODO: is this still needed, with move to code point boundary handled above? |
(void)UTEXT_NEXT32(fText); |
// handlePrevious will move most of the time to < 1 boundary away |
handlePrevious(fData->fSafeRevTable); |
@@ -809,6 +813,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { |
* @return The position of the last boundary before the starting position. |
*/ |
int32_t RuleBasedBreakIterator::preceding(int32_t offset) { |
+ // if the offset passed in is past the end of the text, |
+ // return the last boundary (the end of the text); if it's before |
+ // the beginning, return the text's starting offset |
+ if (fText == NULL || offset > utext_nativeLength(fText)) { |
+ return last(); |
+ } |
+ else if (offset < 0) { |
+ return first(); |
+ } |
+ |
+ // Move requested offset to a code point start. It might be on a trail surrogate, |
+ // or on a trail byte if the input is UTF-8. |
+ utext_setNativeIndex(fText, offset); |
+ offset = utext_getNativeIndex(fText); |
+ |
// if we have cached break positions and offset is in the range |
// covered by them, use them |
if (fCachedBreakPositions != NULL) { |
@@ -834,17 +853,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { |
} |
} |
- // if the offset passed in is already past the end of the text, |
- // just return DONE; if it's before the beginning, return the |
- // text's starting offset |
- if (fText == NULL || offset > utext_nativeLength(fText)) { |
- // return BreakIterator::DONE; |
- return last(); |
- } |
- else if (offset < 0) { |
- return first(); |
- } |
- |
// if we start by updating the current iteration position to the |
// position specified by the caller, we can just use previous() |
// to carry out this operation |
@@ -1578,30 +1586,6 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, |
return (reverse ? startPos : endPos); |
} |
- // Bug 5532. The dictionary code will crash if the input text is UTF-8 |
- // because native indexes are different from UTF-16 indexes. |
- // Temporary hack: skip dictionary lookup for UTF-8 encoded text. |
- // It wont give the right breaks, but it's better than a crash. |
- // |
- // Check the type of the UText by checking its pFuncs field, which |
- // is UText's function dispatch table. It will be the same for all |
- // UTF-8 UTexts and different for any other UText type. |
- // |
- // We have no other type of UText available with non-UTF-16 native indexing. |
- // This whole check will go away once the dictionary code is fixed. |
- static const void *utext_utf8Funcs; |
- if (utext_utf8Funcs == NULL) { |
- // Cache the UTF-8 UText function pointer value. |
- UErrorCode status = U_ZERO_ERROR; |
- UText tempUText = UTEXT_INITIALIZER; |
- utext_openUTF8(&tempUText, NULL, 0, &status); |
- utext_utf8Funcs = tempUText.pFuncs; |
- utext_close(&tempUText); |
- } |
- if (fText->pFuncs == utext_utf8Funcs) { |
- return (reverse ? startPos : endPos); |
- } |
- |
// Starting from the starting point, scan towards the proposed result, |
// looking for the first dictionary character (which may be the one |
// we're on, if we're starting in the middle of a range). |
@@ -1703,6 +1687,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, |
// If we found breaks, build a new break cache. The first and last entries must |
// be the original starting and ending position. |
if (foundBreakCount > 0) { |
+ U_ASSERT(foundBreakCount == breaks.size()); |
int32_t totalBreaks = foundBreakCount; |
if (startPos < breaks.elementAti(0)) { |
totalBreaks += 1; |
@@ -1742,8 +1727,6 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, |
return (reverse ? startPos : endPos); |
} |
-// defined in ucln_cmn.h |
- |
U_NAMESPACE_END |