Index: source/common/filteredbrk.cpp |
diff --git a/source/i18n/filteredbrk.cpp b/source/common/filteredbrk.cpp |
similarity index 71% |
rename from source/i18n/filteredbrk.cpp |
rename to source/common/filteredbrk.cpp |
index cc6880a600f4638aeb8a8b94f89e05db08cdd3f2..5a8f0b0873e645647bf9bac071547b66c9e5c9bb 100644 |
--- a/source/i18n/filteredbrk.cpp |
+++ b/source/common/filteredbrk.cpp |
@@ -1,6 +1,6 @@ |
/* |
******************************************************************************* |
-* Copyright (C) 2014, International Business Machines Corporation and |
+* Copyright (C) 2014-2015, International Business Machines Corporation and |
* others. All Rights Reserved. |
******************************************************************************* |
*/ |
@@ -43,6 +43,9 @@ static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, |
#define FB_TRACE(m,s,b,d) |
#endif |
+/** |
+ * Used with sortedInsert() |
+ */ |
static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
const UnicodeString &a = *(const UnicodeString*)t1.pointer; |
const UnicodeString &b = *(const UnicodeString*)t2.pointer; |
@@ -52,7 +55,7 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
/** |
* A UVector which implements a set of strings. |
*/ |
-class U_I18N_API UStringSet : public UVector { |
+class U_COMMON_API UStringSet : public UVector { |
public: |
UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, |
uhash_compareUnicodeString, |
@@ -117,23 +120,46 @@ class U_I18N_API UStringSet : public UVector { |
*/ |
UStringSet::~UStringSet() {} |
+/* ----------------------------------------------------------- */ |
+ |
+/* Filtered Break constants */ |
static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie |
static const int32_t kMATCH = (1<<1); //< exact match - skip this one. |
static const int32_t kSuppressInReverse = (1<<0); |
static const int32_t kAddToForward = (1<<1); |
-static const UChar kFULLSTOP = 0x002E; // '.' |
+static const UChar kFULLSTOP = 0x002E; // '.' |
+ |
+/** |
+ * Reference-counted holder for the two suppression tries, shared by a SimpleFilteredSentenceBreakIterator and its copies. |
+ */ |
+class SimpleFilteredSentenceBreakData : public UMemory { |
+public: |
+ SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) /* wraps both tries in LocalPointers; count starts at 1 */ |
+ : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } |
+ SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } /* add a reference; returns this */ |
+ SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } /* drop a reference, deleting this on the last one; always returns 0 */ |
+ virtual ~SimpleFilteredSentenceBreakData(); |
+ LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." |
+ LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. |
+ int32_t refcount; /* plain int32_t, not atomic -- NOTE(review): confirm copies are never created or destroyed concurrently */ |
+}; |
+ |
+SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} /* empty body: nothing is released explicitly; both tries are held in LocalPointer members */ |
+ |
+/** |
+ * Concrete implementation |
+ */ |
class SimpleFilteredSentenceBreakIterator : public BreakIterator { |
public: |
SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); |
SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); |
virtual ~SimpleFilteredSentenceBreakIterator(); |
private: |
+ SimpleFilteredSentenceBreakData *fData; |
LocalPointer<BreakIterator> fDelegate; |
LocalUTextPointer fText; |
- LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. |
- LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." |
/* -- subclass interface -- */ |
public: |
@@ -160,62 +186,82 @@ public: |
virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } |
/* -- ITERATION -- */ |
- virtual int32_t first(void) { return fDelegate->first(); } |
- virtual int32_t preceding(int32_t /*offset*/) { /* TODO: not implemented */ return UBRK_DONE; } |
- virtual int32_t previous(void) { /* TODO: not implemented */ return UBRK_DONE; } |
- virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); } |
- virtual int32_t current(void) const { return fDelegate->current(); } |
+ virtual int32_t first(void); |
+ virtual int32_t preceding(int32_t offset); |
+ virtual int32_t previous(void); |
+ virtual UBool isBoundary(int32_t offset); |
+ virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. |
virtual int32_t next(void); |
- virtual int32_t next(int32_t /*n*/) { /* TODO: not implemented */ return UBRK_DONE; } |
- virtual int32_t following(int32_t /*offset*/) { /* TODO: not implemented */ return UBRK_DONE; } |
- virtual int32_t last(void) { return fDelegate->last(); } |
+ virtual int32_t next(int32_t n); |
+ virtual int32_t following(int32_t offset); |
+ virtual int32_t last(void); |
+private: |
+ /** |
+ * Given that the fDelegate has already given its "initial" answer, |
+ * find the NEXT actual (non-excepted) break. |
+ * @param n initial position from delegate |
+ * @return new break position or UBRK_DONE |
+ */ |
+ int32_t internalNext(int32_t n); |
+ /** |
+ * Given that the fDelegate has already given its "initial" answer, |
+ * find the PREV actual (non-excepted) break. |
+ * @param n initial position from delegate |
+ * @return new break position or UBRK_DONE |
+ */ |
+ int32_t internalPrev(int32_t n); |
+ /** |
+ * set up the UText with the value of the fDelegate. |
+ * Call this before calling breakExceptionAt. |
+ * May be able to avoid excess calls |
+ */ |
+ void resetState(UErrorCode &status); |
+ /** |
+ * Is there a match (exception) at this spot? |
+ */ |
+ enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; |
+ /** |
+ * Determine if there is an exception at this spot |
+ * @param n spot to check |
+ * @return kNoExceptionHere or kExceptionHere |
+ **/ |
+ enum EFBMatchResult breakExceptionAt(int32_t n); |
}; |
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) |
- : BreakIterator(other), fDelegate(other.fDelegate->clone()) |
+ : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) |
{ |
- /* |
- TODO: not able to clone Tries. Should be a refcounted hidden master instead. |
- if(other.fBackwardsTrie.isValid()) { |
- fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone()); |
- } |
- if(other.fForwardsPartialTrie.isValid()) { |
- fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone()); |
- } |
- */ |
} |
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : |
BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), |
- fDelegate(adopt), |
- fBackwardsTrie(backwards), |
- fForwardsPartialTrie(forwards) |
+ fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), |
+ fDelegate(adopt) |
{ |
// all set.. |
} |
-SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {} |
+SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { /* releases this iterator's reference on the shared trie data */ |
+ if(fData != NULL) fData = fData->decr(); // fData comes from an unchecked new, so it may be NULL; decr() deletes the data on the last release and returns 0 |
+} |
-int32_t SimpleFilteredSentenceBreakIterator::next() { |
- int32_t n = fDelegate->next(); |
- if(n == UBRK_DONE || // at end or |
- fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions |
- return n; |
- } |
- // OK, do we need to break here? |
- UErrorCode status = U_ZERO_ERROR; |
- // refresh text |
+void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { |
fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); |
- //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
- do { // outer loop runs once per underlying break (from fDelegate). |
+} |
+ |
+SimpleFilteredSentenceBreakIterator::EFBMatchResult |
+SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { |
+ int64_t bestPosn = -1; |
+ int32_t bestValue = -1; |
// loops while 'n' points to an exception. |
utext_setNativeIndex(fText.getAlias(), n); // from n.. |
- fBackwardsTrie->reset(); |
+ fData->fBackwardsTrie->reset(); |
UChar32 uch; |
+ |
//if(debug2) u_printf(" n@ %d\n", n); |
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown") |
if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? |
@@ -226,23 +272,21 @@ int32_t SimpleFilteredSentenceBreakIterator::next() { |
uch = utext_next32(fText.getAlias()); |
//if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); |
} |
- UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; |
- int32_t bestPosn = -1; |
- int32_t bestValue = -1; |
+ UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; |
while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. |
- USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie |
+ USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie |
if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far |
bestPosn = utext_getNativeIndex(fText.getAlias()); |
- bestValue = fBackwardsTrie->getValue(); |
+ bestValue = fData->fBackwardsTrie->getValue(); |
} |
//if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); |
} |
if(USTRINGTRIE_MATCHES(r)) { // exact match? |
//if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); |
- bestValue = fBackwardsTrie->getValue(); |
+ bestValue = fData->fBackwardsTrie->getValue(); |
bestPosn = utext_getNativeIndex(fText.getAlias()); |
//if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); |
} |
@@ -256,49 +300,158 @@ int32_t SimpleFilteredSentenceBreakIterator::next() { |
if(bestValue == kMATCH) { // exact match! |
//if(debug2) u_printf(" exact backward match\n"); |
- n = fDelegate->next(); // skip this one. Find the next lowerlevel break. |
- if(n==UBRK_DONE) return n; |
- continue; // See if the next is another exception. |
+ return kExceptionHere; // See if the next is another exception. |
} else if(bestValue == kPARTIAL |
- && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie |
+ && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie |
//if(debug2) u_printf(" partial backward match\n"); |
// We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie |
// to see if it matches something going forward. |
- fForwardsPartialTrie->reset(); |
+ fData->fForwardsPartialTrie->reset(); |
UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; |
utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. |
//if(debug2) u_printf("Retrying at %d\n", bestPosn); |
while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && |
- USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) { |
+ USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { |
//if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); |
} |
if(USTRINGTRIE_MATCHES(rfwd)) { |
//if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); |
// only full matches here, nothing to check |
// skip the next: |
- n = fDelegate->next(); |
- if(n==UBRK_DONE) return n; |
- continue; |
+ return kExceptionHere; |
} else { |
//if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); |
// no match (no exception) -return the 'underlying' break |
- return n; |
+ return kNoExceptionHere; |
} |
} else { |
- return n; // internal error and/or no forwards trie |
+ return kNoExceptionHere; // internal error and/or no forwards trie |
} |
} else { |
//if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match |
- return n; // No match - so exit. Not an exception. |
+ return kNoExceptionHere; // No match - so exit. Not an exception. |
} |
- } while(n != UBRK_DONE); |
+} |
+ |
+// Forward workhorse: n is the break the delegate just reported; returns n, or the first later delegate break that has no exception. |
+int32_t |
+SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { |
+ if(n == UBRK_DONE || // delegate is already done, or |
+ fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions |
+ return n; |
+ } |
+ // OK, do we need to break here? |
+ UErrorCode status = U_ZERO_ERROR; |
+ // refresh fText from the delegate so breakExceptionAt() reads the current text |
+ resetState(status); |
+ if(U_FAILURE(status)) return UBRK_DONE; // bail out: could not get the delegate's text |
+ int64_t utextLen = utext_nativeLength(fText.getAlias()); |
+ |
+ //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
+ while (n != UBRK_DONE && n != utextLen) { // runs once per underlying break (from fDelegate); a break at the end of text is never tested |
+ SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
+ |
+ switch(m) { |
+ case kExceptionHere: |
+ n = fDelegate->next(); // suppressed: skip this one and take the next lower-level break. |
+ continue; |
+ |
+ default: |
+ case kNoExceptionHere: |
+ return n; |
+ } |
+ } |
return n; |
} |
+int32_t /* backward workhorse: n is the break the delegate just reported; returns n, or the first earlier delegate break that has no exception */ |
+SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { |
+ if(n == 0 || n == UBRK_DONE || // at start of text (never suppressed), delegate done, or |
+ fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions |
+ return n; |
+ } |
+ // OK, do we need to break here? |
+ UErrorCode status = U_ZERO_ERROR; |
+ // refresh fText from the delegate so breakExceptionAt() reads the current text |
+ resetState(status); |
+ if(U_FAILURE(status)) return UBRK_DONE; // bail out: could not get the delegate's text |
+ |
+ //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
+ while (n != UBRK_DONE && n != 0) { // runs once per underlying break (from fDelegate), walking backwards; offset 0 is never tested |
+ SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
+ |
+ switch(m) { |
+ case kExceptionHere: |
+ n = fDelegate->previous(); // suppressed: skip this one and take the previous lower-level break. |
+ continue; |
+ |
+ default: |
+ case kNoExceptionHere: |
+ return n; |
+ } |
+ } |
+ return n; |
+} |
+ |
+ |
+int32_t /* next break that is not suppressed by an exception, or whatever internalNext() reports when the delegate is done */ |
+SimpleFilteredSentenceBreakIterator::next() { |
+ return internalNext(fDelegate->next()); /* advance the delegate once, then skip forward past suppressed breaks */ |
+} |
+ |
+// Moves to the start of the text and returns the delegate's first(); never filtered, matching internalPrev() (offset 0) and last(). |
+int32_t SimpleFilteredSentenceBreakIterator::first(void) { |
+ return fDelegate->first(); // Don't suppress a break opportunity at the beginning of text (internalNext could skip ahead). |
+} |
+ |
+int32_t /* filtered counterpart of the delegate's preceding(offset) */ |
+SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { |
+ return internalPrev(fDelegate->preceding(offset)); /* take the delegate's answer, then walk backwards past suppressed breaks */ |
+} |
+ |
+int32_t /* filtered counterpart of the delegate's previous() */ |
+SimpleFilteredSentenceBreakIterator::previous(void) { |
+ return internalPrev(fDelegate->previous()); /* step the delegate back once, then keep going back past suppressed breaks */ |
+} |
+ |
+// Returns true only if the delegate reports a boundary at offset AND no exception suppresses it; false if the text cannot be read. |
+UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { |
+ if(!fDelegate->isBoundary(offset)) return false; // no break to suppress |
+ if(fData->fBackwardsTrie.isNull()) return true; // no backwards table loaded == no exceptions; breakExceptionAt() would dereference the missing trie |
+ UErrorCode status = U_ZERO_ERROR; |
+ resetState(status); // refresh fText from the delegate before breakExceptionAt() reads it |
+ if(U_FAILURE(status)) return false; // bail out like internalNext()/internalPrev(): fText is not usable |
+ SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); |
+ switch(m) { |
+ case kExceptionHere: |
+ return false; |
+ default: |
+ case kNoExceptionHere: |
+ return true; |
+ } |
+} |
+ |
+int32_t /* the argument goes straight to the delegate's next(int32_t); the delegate's result is then filtered by internalNext() */ |
+SimpleFilteredSentenceBreakIterator::next(int32_t offset) { |
+ return internalNext(fDelegate->next(offset)); /* NOTE(review): any counting is done by the delegate on unfiltered breaks, and filtering only moves forward -- confirm for negative values */ |
+} |
+ |
+int32_t /* filtered counterpart of the delegate's following(offset) */ |
+SimpleFilteredSentenceBreakIterator::following(int32_t offset) { |
+ return internalNext(fDelegate->following(offset)); /* take the delegate's answer, then skip forward past suppressed breaks */ |
+} |
+ |
+int32_t /* returns the delegate's last() unchanged */ |
+SimpleFilteredSentenceBreakIterator::last(void) { |
+ // Don't suppress a break opportunity at the end of text: no exception check is made here. |
+ return fDelegate->last(); |
+} |
+ |
+ |
/** |
* Concrete implementation of builder class. |
*/ |
-class U_I18N_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { |
+class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { |
public: |
virtual ~SimpleFilteredBreakIteratorBuilder(); |
SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); |
@@ -374,13 +527,12 @@ BreakIterator * |
SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { |
LocalPointer<BreakIterator> adopt(adoptBreakIterator); |
+ LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); |
+ LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); |
if(U_FAILURE(status)) { |
return NULL; |
} |
- LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status)); |
- LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status)); |
- |
int32_t revCount = 0; |
int32_t fwdCount = 0; |
@@ -503,17 +655,15 @@ FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { |
FilteredBreakIteratorBuilder * |
FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { |
if(U_FAILURE(status)) return NULL; |
- LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status)); |
- if(U_SUCCESS(status) && !ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR; |
- return ret.orphan(); |
+ LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); |
+ return (U_SUCCESS(status))? ret.orphan(): NULL; |
} |
FilteredBreakIteratorBuilder * |
FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { |
if(U_FAILURE(status)) return NULL; |
- LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status)); |
- if(U_SUCCESS(status) && !ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR; |
- return ret.orphan(); |
+ LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); |
+ return (U_SUCCESS(status))? ret.orphan(): NULL; |
} |
U_NAMESPACE_END |