| Index: icu46/source/i18n/bmsearch.cpp
|
| ===================================================================
|
| --- icu46/source/i18n/bmsearch.cpp (revision 0)
|
| +++ icu46/source/i18n/bmsearch.cpp (revision 0)
|
| @@ -0,0 +1,817 @@
|
| +/*
|
| + ******************************************************************************
|
| + * Copyright (C) 1996-2010, International Business Machines *
|
| + * Corporation and others. All Rights Reserved. *
|
| + ******************************************************************************
|
| + */
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
|
| +
|
| +#include "unicode/unistr.h"
|
| +#include "unicode/putil.h"
|
| +#include "unicode/usearch.h"
|
| +
|
| +#include "cmemory.h"
|
| +#include "unicode/coll.h"
|
| +#include "unicode/tblcoll.h"
|
| +#include "unicode/coleitr.h"
|
| +#include "unicode/ucoleitr.h"
|
| +
|
| +#include "unicode/regex.h" // TODO: make conditional on regexp being built.
|
| +
|
| +#include "unicode/uniset.h"
|
| +#include "unicode/uset.h"
|
| +#include "unicode/ustring.h"
|
| +#include "hash.h"
|
| +#include "uhash.h"
|
| +#include "ucol_imp.h"
|
| +#include "normalizer2impl.h"
|
| +
|
| +#include "unicode/colldata.h"
|
| +#include "unicode/bmsearch.h"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
|
| +#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
|
| +#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
| +
|
| +
|
| +struct CEI
|
| +{
|
| + uint32_t order;
|
| + int32_t lowOffset;
|
| + int32_t highOffset;
|
| +};
|
| +
|
| +class Target : public UMemory
|
| +{
|
| +public:
|
| + Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
|
| + ~Target();
|
| +
|
| + void setTargetString(const UnicodeString *target);
|
| +
|
| + const CEI *nextCE(int32_t offset);
|
| + const CEI *prevCE(int32_t offset);
|
| +
|
| + int32_t stringLength();
|
| + UChar charAt(int32_t offset);
|
| +
|
| + UBool isBreakBoundary(int32_t offset);
|
| + int32_t nextBreakBoundary(int32_t offset);
|
| + int32_t nextSafeBoundary(int32_t offset);
|
| +
|
| + UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
|
| +
|
| + void setOffset(int32_t offset);
|
| + void setLast(int32_t last);
|
| + int32_t getOffset();
|
| +
|
| +private:
|
| + CEI *ceb;
|
| + int32_t bufferSize;
|
| + int32_t bufferMin;
|
| + int32_t bufferMax;
|
| +
|
| + uint32_t strengthMask;
|
| + UCollationStrength strength;
|
| + uint32_t variableTop;
|
| + UBool toShift;
|
| + UCollator *coll;
|
| + const Normalizer2 &nfd;
|
| +
|
| + const UnicodeString *targetString;
|
| + const UChar *targetBuffer;
|
| + int32_t targetLength;
|
| +
|
| + UCollationElements *elements;
|
| + UBreakIterator *charBreakIterator;
|
| +};
|
| +
|
| +Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
|
| + : bufferSize(0), bufferMin(0), bufferMax(0),
|
| + strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
|
| + nfd(*Normalizer2Factory::getNFDInstance(status)),
|
| + targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
|
| +{
|
| + strength = ucol_getStrength(coll);
|
| + toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
|
| + variableTop = ucol_getVariableTop(coll, &status);
|
| +
|
| + // find the largest expansion
|
| + uint8_t maxExpansion = 0;
|
| + for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
|
| + if (*expansion > maxExpansion) {
|
| + maxExpansion = *expansion;
|
| + }
|
| + }
|
| +
|
| + // room for an extra character on each end, plus 4 for safety
|
| + bufferSize = patternLength + (2 * maxExpansion) + 4;
|
| +
|
| + ceb = NEW_ARRAY(CEI, bufferSize);
|
| +
|
| + if (ceb == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| +
|
| + if (target != NULL) {
|
| + setTargetString(target);
|
| + }
|
| +
|
| + switch (strength)
|
| + {
|
| + default:
|
| + strengthMask |= UCOL_TERTIARYORDERMASK;
|
| + /* fall through */
|
| +
|
| + case UCOL_SECONDARY:
|
| + strengthMask |= UCOL_SECONDARYORDERMASK;
|
| + /* fall through */
|
| +
|
| + case UCOL_PRIMARY:
|
| + strengthMask |= UCOL_PRIMARYORDERMASK;
|
| + }
|
| +}
|
| +
|
| +Target::~Target()
|
| +{
|
| + ubrk_close(charBreakIterator);
|
| + ucol_closeElements(elements);
|
| +
|
| + DELETE_ARRAY(ceb);
|
| +}
|
| +
|
| +void Target::setTargetString(const UnicodeString *target)
|
| +{
|
| + if (charBreakIterator != NULL) {
|
| + ubrk_close(charBreakIterator);
|
| + ucol_closeElements(elements);
|
| + }
|
| +
|
| + targetString = target;
|
| +
|
| + if (targetString != NULL) {
|
| + UErrorCode status = U_ZERO_ERROR;
|
| +
|
| + targetBuffer = targetString->getBuffer();
|
| + targetLength = targetString->length();
|
| +
|
| + elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
|
| + ucol_forceHanImplicit(elements, &status);
|
| +
|
| + charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
|
| + targetBuffer, targetLength, &status);
|
| + } else {
|
| + targetBuffer = NULL;
|
| + targetLength = 0;
|
| + }
|
| +}
|
| +
|
| +const CEI *Target::nextCE(int32_t offset)
|
| +{
|
| + UErrorCode status = U_ZERO_ERROR;
|
| + int32_t low = -1, high = -1;
|
| + uint32_t order;
|
| + UBool cont = FALSE;
|
| +
|
| + if (offset >= bufferMin && offset < bufferMax) {
|
| + return &ceb[offset];
|
| + }
|
| +
|
| + if (bufferMax >= bufferSize || offset != bufferMax) {
|
| + return NULL;
|
| + }
|
| +
|
| + do {
|
| + low = ucol_getOffset(elements);
|
| + order = ucol_next(elements, &status);
|
| + high = ucol_getOffset(elements);
|
| +
|
| + if (order == (uint32_t)UCOL_NULLORDER) {
|
| + //high = low = -1;
|
| + break;
|
| + }
|
| +
|
| + cont = isContinuation(order);
|
| + order &= strengthMask;
|
| +
|
| + if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
|
| + if (strength >= UCOL_QUATERNARY) {
|
| + order &= UCOL_PRIMARYORDERMASK;
|
| + } else {
|
| + order = UCOL_IGNORABLE;
|
| + }
|
| + }
|
| + } while (order == UCOL_IGNORABLE);
|
| +
|
| + if (cont) {
|
| + order |= UCOL_CONTINUATION_MARKER;
|
| + }
|
| +
|
| + ceb[offset].order = order;
|
| + ceb[offset].lowOffset = low;
|
| + ceb[offset].highOffset = high;
|
| +
|
| + bufferMax += 1;
|
| +
|
| + return &ceb[offset];
|
| +}
|
| +
|
| +const CEI *Target::prevCE(int32_t offset)
|
| +{
|
| + UErrorCode status = U_ZERO_ERROR;
|
| + int32_t low = -1, high = -1;
|
| + uint32_t order;
|
| + UBool cont = FALSE;
|
| +
|
| + if (offset >= bufferMin && offset < bufferMax) {
|
| + return &ceb[offset];
|
| + }
|
| +
|
| + if (bufferMax >= bufferSize || offset != bufferMax) {
|
| + return NULL;
|
| + }
|
| +
|
| + do {
|
| + high = ucol_getOffset(elements);
|
| + order = ucol_previous(elements, &status);
|
| + low = ucol_getOffset(elements);
|
| +
|
| + if (order == (uint32_t)UCOL_NULLORDER) {
|
| + break;
|
| + }
|
| +
|
| + cont = isContinuation(order);
|
| + order &= strengthMask;
|
| +
|
| + if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
|
| + if (strength >= UCOL_QUATERNARY) {
|
| + order &= UCOL_PRIMARYORDERMASK;
|
| + } else {
|
| + order = UCOL_IGNORABLE;
|
| + }
|
| + }
|
| + } while (order == UCOL_IGNORABLE);
|
| +
|
| + bufferMax += 1;
|
| +
|
| + if (cont) {
|
| + order |= UCOL_CONTINUATION_MARKER;
|
| + }
|
| +
|
| + ceb[offset].order = order;
|
| + ceb[offset].lowOffset = low;
|
| + ceb[offset].highOffset = high;
|
| +
|
| + return &ceb[offset];
|
| +}
|
| +
|
| +int32_t Target::stringLength()
|
| +{
|
| + if (targetString != NULL) {
|
| + return targetLength;
|
| + }
|
| +
|
| + return 0;
|
| +}
|
| +
|
| +UChar Target::charAt(int32_t offset)
|
| +{
|
| + if (targetString != NULL) {
|
| + return targetBuffer[offset];
|
| + }
|
| +
|
| + return 0x0000;
|
| +}
|
| +
|
| +void Target::setOffset(int32_t offset)
|
| +{
|
| + UErrorCode status = U_ZERO_ERROR;
|
| +
|
| + bufferMin = 0;
|
| + bufferMax = 0;
|
| +
|
| + ucol_setOffset(elements, offset, &status);
|
| +}
|
| +
|
| +void Target::setLast(int32_t last)
|
| +{
|
| + UErrorCode status = U_ZERO_ERROR;
|
| +
|
| + bufferMin = 0;
|
| + bufferMax = 1;
|
| +
|
| + ceb[0].order = UCOL_NULLORDER;
|
| + ceb[0].lowOffset = last;
|
| + ceb[0].highOffset = last;
|
| +
|
| + ucol_setOffset(elements, last, &status);
|
| +}
|
| +
|
| +int32_t Target::getOffset()
|
| +{
|
| + return ucol_getOffset(elements);
|
| +}
|
| +
|
| +UBool Target::isBreakBoundary(int32_t offset)
|
| +{
|
| + return ubrk_isBoundary(charBreakIterator, offset);
|
| +}
|
| +
|
| +int32_t Target::nextBreakBoundary(int32_t offset)
|
| +{
|
| + return ubrk_following(charBreakIterator, offset);
|
| +}
|
| +
|
| +int32_t Target::nextSafeBoundary(int32_t offset)
|
| +{
|
| + while (offset < targetLength) {
|
| + //UChar ch = charAt(offset);
|
| + UChar ch = targetBuffer[offset];
|
| +
|
| + if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
|
| + return offset;
|
| + }
|
| +
|
| + offset += 1;
|
| + }
|
| +
|
| + return targetLength;
|
| +}
|
| +
|
| +UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
|
| +{
|
| + if (strength < UCOL_IDENTICAL) {
|
| + return TRUE;
|
| + }
|
| +
|
| + // Note: We could use Normalizer::compare() or similar, but for short strings
|
| + // which may not be in FCD it might be faster to just NFD them.
|
| + UErrorCode status = U_ZERO_ERROR;
|
| + UnicodeString t2, p2;
|
| + nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
|
| + nfd.normalize(pattern, p2, status);
|
| + // return FALSE if NFD failed
|
| + return U_SUCCESS(status) && t2 == p2;
|
| +}
|
| +
|
| +#define HASH_TABLE_SIZE 257
|
| +
|
| +class BadCharacterTable : public UMemory
|
| +{
|
| +public:
|
| + BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
|
| + ~BadCharacterTable();
|
| +
|
| + int32_t operator[](uint32_t ce) const;
|
| + int32_t getMaxSkip() const;
|
| + int32_t minLengthInChars(int32_t index);
|
| +
|
| +private:
|
| + static int32_t hash(uint32_t ce);
|
| +
|
| + int32_t maxSkip;
|
| + int32_t badCharacterTable[HASH_TABLE_SIZE];
|
| +
|
| + int32_t *minLengthCache;
|
| +};
|
| +
|
| +BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
|
| + : minLengthCache(NULL)
|
| +{
|
| + int32_t plen = patternCEs.size();
|
| +
|
| + // **** need a better way to deal with this ****
|
| + if (U_FAILURE(status) || plen == 0) {
|
| + return;
|
| + }
|
| +
|
| + int32_t *history = NEW_ARRAY(int32_t, plen);
|
| +
|
| + if (history == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| +
|
| + for (int32_t i = 0; i < plen; i += 1) {
|
| + history[i] = -1;
|
| + }
|
| +
|
| + minLengthCache = NEW_ARRAY(int32_t, plen + 1);
|
| +
|
| + if (minLengthCache == NULL) {
|
| + DELETE_ARRAY(history);
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| +
|
| + maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
|
| +
|
| + for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
|
| + badCharacterTable[j] = maxSkip;
|
| + }
|
| +
|
| + for(int32_t p = 1; p < plen; p += 1) {
|
| + minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
|
| +
|
| + // Make sure this entry is not bigger than the previous one.
|
| + // Otherwise, we might skip too far in some cases.
|
| + if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
|
| + minLengthCache[p] = minLengthCache[p - 1];
|
| + }
|
| + }
|
| +
|
| + minLengthCache[plen] = 0;
|
| +
|
| + for(int32_t p = 0; p < plen - 1; p += 1) {
|
| + badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
|
| + }
|
| +
|
| + DELETE_ARRAY(history);
|
| +}
|
| +
|
| +BadCharacterTable::~BadCharacterTable()
|
| +{
|
| + DELETE_ARRAY(minLengthCache);
|
| +}
|
| +
|
| +int32_t BadCharacterTable::operator[](uint32_t ce) const
|
| +{
|
| + return badCharacterTable[hash(ce)];
|
| +}
|
| +
|
| +int32_t BadCharacterTable::getMaxSkip() const
|
| +{
|
| + return maxSkip;
|
| +}
|
| +
|
| +int32_t BadCharacterTable::minLengthInChars(int32_t index)
|
| +{
|
| + return minLengthCache[index];
|
| +}
|
| +
|
| +int32_t BadCharacterTable::hash(uint32_t ce)
|
| +{
|
| + return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
|
| +}
|
| +
|
| +class GoodSuffixTable : public UMemory
|
| +{
|
| +public:
|
| + GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
|
| + ~GoodSuffixTable();
|
| +
|
| + int32_t operator[](int32_t offset) const;
|
| +
|
| +private:
|
| + int32_t *goodSuffixTable;
|
| +};
|
| +
|
| +GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
|
| + : goodSuffixTable(NULL)
|
| +{
|
| + int32_t patlen = patternCEs.size();
|
| +
|
| + // **** need a better way to deal with this ****
|
| + if (U_FAILURE(status) || patlen <= 0) {
|
| + return;
|
| + }
|
| +
|
| + int32_t *suff = NEW_ARRAY(int32_t, patlen);
|
| + int32_t start = patlen - 1, end = - 1;
|
| + int32_t maxSkip = badCharacterTable.getMaxSkip();
|
| +
|
| + if (suff == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| +
|
| + // initialze suff
|
| + suff[patlen - 1] = patlen;
|
| +
|
| + for (int32_t i = patlen - 2; i >= 0; i -= 1) {
|
| + // (i > start) means we're inside the last suffix match we found
|
| + // ((patlen - 1) - end) is how far the end of that match is from end of pattern
|
| + // (i - start) is how far we are from start of that match
|
| + // (i + (patlen - 1) - end) is index of same character at end of pattern
|
| + // so if any suffix match at that character doesn't extend beyond the last match,
|
| + // it's the suffix for this character as well
|
| + if (i > start && suff[i + patlen - 1 - end] < i - start) {
|
| + suff[i] = suff[i + patlen - 1 - end];
|
| + } else {
|
| + start = end = i;
|
| +
|
| + int32_t s = patlen;
|
| +
|
| + while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
|
| + start -= 1;
|
| + }
|
| +
|
| + suff[i] = end - start;
|
| + }
|
| + }
|
| +
|
| + // now build goodSuffixTable
|
| + goodSuffixTable = NEW_ARRAY(int32_t, patlen);
|
| +
|
| + if (goodSuffixTable == NULL) {
|
| + DELETE_ARRAY(suff);
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| + }
|
| +
|
| +
|
| + // initialize entries to minLengthInChars of the pattern
|
| + for (int32_t i = 0; i < patlen; i += 1) {
|
| + goodSuffixTable[i] = maxSkip;
|
| + }
|
| +
|
| + int32_t prefix = 0;
|
| +
|
| + for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
|
| + if (suff[i] == i + 1) {
|
| + // this matching suffix is a prefix of the pattern
|
| + int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
|
| +
|
| + // for any mis-match before this suffix, we should skip
|
| + // so that the front of the pattern (i.e. the prefix)
|
| + // lines up with the front of the suffix.
|
| + // (patlen - 1 - i) is the start of the suffix
|
| + while (prefix < patlen - 1 - i) {
|
| + // value of maxSkip means never set...
|
| + if (goodSuffixTable[prefix] == maxSkip) {
|
| + goodSuffixTable[prefix] = prefixSkip;
|
| + }
|
| +
|
| + prefix += 1;
|
| + }
|
| + }
|
| + }
|
| +
|
| + for (int32_t i = 0; i < patlen - 1; i += 1) {
|
| + goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
|
| + }
|
| +
|
| + DELETE_ARRAY(suff);
|
| +}
|
| +
|
| +GoodSuffixTable::~GoodSuffixTable()
|
| +{
|
| + DELETE_ARRAY(goodSuffixTable);
|
| +}
|
| +
|
| +int32_t GoodSuffixTable::operator[](int32_t offset) const
|
| +{
|
| + return goodSuffixTable[offset];
|
| +}
|
| +
|
| +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
|
| +
|
| +
|
| +UBool BoyerMooreSearch::empty()
|
| +{
|
| + return patCEs->size() <= 0;
|
| +}
|
| +
|
| +CollData *BoyerMooreSearch::getData()
|
| +{
|
| + return data;
|
| +}
|
| +
|
| +CEList *BoyerMooreSearch::getPatternCEs()
|
| +{
|
| + return patCEs;
|
| +}
|
| +
|
| +BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
|
| +{
|
| + return badCharacterTable;
|
| +}
|
| +
|
| +GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
|
| +{
|
| + return goodSuffixTable;
|
| +}
|
| +
|
| +BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
|
| + UErrorCode &status)
|
| + : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
|
| +{
|
| +
|
| + if (U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| + UCollator *collator = data->getCollator();
|
| +
|
| + patCEs = new CEList(collator, patternString, status);
|
| +
|
| + if (patCEs == NULL || U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| + badCharacterTable = new BadCharacterTable(*patCEs, data, status);
|
| +
|
| + if (badCharacterTable == NULL || U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| + goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);
|
| +
|
| + if (targetString != NULL) {
|
| + target = new Target(collator, targetString, patCEs->size(), status);
|
| + }
|
| +}
|
| +
|
| +BoyerMooreSearch::~BoyerMooreSearch()
|
| +{
|
| + delete target;
|
| + delete goodSuffixTable;
|
| + delete badCharacterTable;
|
| + delete patCEs;
|
| +}
|
| +
|
| +void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
|
| +{
|
| + if (U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| + if (target == NULL) {
|
| + target = new Target(data->getCollator(), targetString, patCEs->size(), status);
|
| + } else {
|
| + target->setTargetString(targetString);
|
| + }
|
| +}
|
| +
|
| +// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
|
| +/*
|
| + * TODO:
|
| + * * deal with trailing (and leading?) ignorables.
|
| + * * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
|
| + */
|
| +UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
|
| +{
|
| + /*UCollator *coll =*/ data->getCollator();
|
| + int32_t plen = patCEs->size();
|
| + int32_t tlen = target->stringLength();
|
| + int32_t maxSkip = badCharacterTable->getMaxSkip();
|
| + int32_t tOffset = offset + maxSkip;
|
| +
|
| + if (plen <= 0) {
|
| + // Searching for a zero length pattern always fails.
|
| + start = end = -1;
|
| + return FALSE;
|
| + }
|
| +
|
| + while (tOffset <= tlen) {
|
| + int32_t pIndex = plen - 1;
|
| + int32_t tIndex = 0;
|
| + int32_t lIndex = 0;
|
| +
|
| + if (tOffset < tlen) {
|
| + // **** we really want to skip ahead enough to ****
|
| + // **** be sure we get at least 1 non-ignorable ****
|
| + // **** CE after the end of the pattern. ****
|
| + int32_t next = target->nextSafeBoundary(tOffset + 1);
|
| +
|
| + target->setOffset(next);
|
| +
|
| + for (lIndex = 0; ; lIndex += 1) {
|
| + const CEI *cei = target->prevCE(lIndex);
|
| + int32_t low = cei->lowOffset;
|
| + int32_t high = cei->highOffset;
|
| +
|
| + if (high == 0 || (low < high && low <= tOffset)) {
|
| + if (low < tOffset) {
|
| + while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
|
| + lIndex -= 1;
|
| + }
|
| +
|
| + if (high > tOffset) {
|
| + tOffset = high;
|
| + }
|
| + }
|
| +
|
| + break;
|
| + }
|
| + }
|
| + } else {
|
| + target->setLast(tOffset);
|
| + lIndex = 0;
|
| + }
|
| +
|
| + tIndex = ++lIndex;
|
| +
|
| + // Iterate backward until we hit the beginning of the pattern
|
| + while (pIndex >= 0) {
|
| + uint32_t pce = (*patCEs)[pIndex];
|
| + const CEI *tcei = target->prevCE(tIndex++);
|
| +
|
| +
|
| + if (tcei->order != pce) {
|
| + // There is a mismatch at this position. Decide how far
|
| + // over to shift the pattern, then try again.
|
| +
|
| + int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
|
| +#ifdef EXTRA_CAUTIOUS
|
| + int32_t old = tOffset;
|
| +#endif
|
| +
|
| + tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
|
| +
|
| + if (gsOffset > tOffset) {
|
| + tOffset = gsOffset;
|
| + }
|
| +
|
| +#ifdef EXTRA_CAUTIOUS
|
| + // Make sure we don't skip backwards...
|
| + if (tOffset <= old) {
|
| + tOffset = old + 1;
|
| + }
|
| +#endif
|
| +
|
| + break;
|
| + }
|
| +
|
| + pIndex -= 1;
|
| + }
|
| +
|
| + if (pIndex < 0) {
|
| + // We made it back to the beginning of the pattern,
|
| + // which means we matched it all. Return the location.
|
| + const CEI firstCEI = *target->prevCE(tIndex - 1);
|
| + const CEI lastCEI = *target->prevCE(lIndex);
|
| + int32_t mStart = firstCEI.lowOffset;
|
| + int32_t minLimit = lastCEI.lowOffset;
|
| + int32_t maxLimit = lastCEI.highOffset;
|
| + int32_t mLimit;
|
| + UBool found = TRUE;
|
| +
|
| + target->setOffset(/*tOffset*/maxLimit);
|
| +
|
| + const CEI nextCEI = *target->nextCE(0);
|
| +
|
| + if (nextCEI.lowOffset > maxLimit) {
|
| + maxLimit = nextCEI.lowOffset;
|
| + }
|
| +
|
| + if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
|
| + found = FALSE;
|
| + }
|
| +
|
| + if (! target->isBreakBoundary(mStart)) {
|
| + found = FALSE;
|
| + }
|
| +
|
| + if (firstCEI.lowOffset == firstCEI.highOffset) {
|
| + found = FALSE;
|
| + }
|
| +
|
| + mLimit = maxLimit;
|
| + if (minLimit < maxLimit) {
|
| + int32_t nbb = target->nextBreakBoundary(minLimit);
|
| +
|
| + if (nbb >= lastCEI.highOffset) {
|
| + mLimit = nbb;
|
| + }
|
| + }
|
| +
|
| + if (mLimit > maxLimit) {
|
| + found = FALSE;
|
| + }
|
| +
|
| + if (! target->isBreakBoundary(mLimit)) {
|
| + found = FALSE;
|
| + }
|
| +
|
| + if (! target->isIdentical(pattern, mStart, mLimit)) {
|
| + found = FALSE;
|
| + }
|
| +
|
| + if (found) {
|
| + start = mStart;
|
| + end = mLimit;
|
| +
|
| + return TRUE;
|
| + }
|
| +
|
| + tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
|
| + }
|
| + // Otherwise, we're here because of a mismatch, so keep going....
|
| + }
|
| +
|
| + // no match
|
| + start = -1;
|
| + end = -1;
|
| + return FALSE;
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +#endif // #if !UCONFIG_NO_COLLATION
|
|
|
| Property changes on: icu46/source/i18n/bmsearch.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|