icu46/source/i18n/bmsearch.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/bmsearch.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/bmsearch.cpp

===================================================================

--- icu46/source/i18n/bmsearch.cpp (revision 0)

+++ icu46/source/i18n/bmsearch.cpp (revision 0)

@@ -0,0 +1,817 @@

+/*

+ ******************************************************************************

+ */

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

+#include "unicode/unistr.h"

+#include "unicode/putil.h"

+#include "unicode/usearch.h"

+#include "cmemory.h"

+#include "unicode/coll.h"

+#include "unicode/tblcoll.h"

+#include "unicode/coleitr.h"

+#include "unicode/ucoleitr.h"

+#include "unicode/regex.h" // TODO: make conditional on regexp being built.

+#include "unicode/uniset.h"

+#include "unicode/uset.h"

+#include "unicode/ustring.h"

+#include "hash.h"

+#include "uhash.h"

+#include "ucol_imp.h"

+#include "normalizer2impl.h"

+#include "unicode/colldata.h"

+#include "unicode/bmsearch.h"

+U_NAMESPACE_BEGIN

+#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))

+#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))

+#define DELETE_ARRAY(array) uprv_free((void *) (array))

+struct CEI

+ uint32_t order;

+ int32_t lowOffset;

+ int32_t highOffset;

+};

+class Target : public UMemory

+public:

+ Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);

+ ~Target();

+ void setTargetString(const UnicodeString *target);

+ const CEI *nextCE(int32_t offset);

+ const CEI *prevCE(int32_t offset);

+ int32_t stringLength();

+ UChar charAt(int32_t offset);

+ UBool isBreakBoundary(int32_t offset);

+ int32_t nextBreakBoundary(int32_t offset);

+ int32_t nextSafeBoundary(int32_t offset);

+ UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);

+ void setOffset(int32_t offset);

+ void setLast(int32_t last);

+ int32_t getOffset();

+private:

+ CEI *ceb;

+ int32_t bufferSize;

+ int32_t bufferMin;

+ int32_t bufferMax;

+ uint32_t strengthMask;

+ UCollationStrength strength;

+ uint32_t variableTop;

+ UBool toShift;

+ UCollator *coll;

+ const Normalizer2 &nfd;

+ const UnicodeString *targetString;

+ const UChar *targetBuffer;

+ int32_t targetLength;

+ UCollationElements *elements;

+ UBreakIterator *charBreakIterator;

+};

+Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)

+ : bufferSize(0), bufferMin(0), bufferMax(0),

+ strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),

+ nfd(*Normalizer2Factory::getNFDInstance(status)),

+ targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)

+ strength = ucol_getStrength(coll);

+ toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;

+ variableTop = ucol_getVariableTop(coll, &status);

+ // find the largest expansion

+ uint8_t maxExpansion = 0;

+ for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {

+ if (*expansion > maxExpansion) {

+ maxExpansion = *expansion;

+ }

+ // room for an extra character on each end, plus 4 for safety

+ bufferSize = patternLength + (2 * maxExpansion) + 4;

+ ceb = NEW_ARRAY(CEI, bufferSize);

+ if (ceb == NULL) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ if (target != NULL) {

+ setTargetString(target);

+ }

+ switch (strength)

+ {

+ default:

+ strengthMask |= UCOL_TERTIARYORDERMASK;

+ /* fall through */

+ case UCOL_SECONDARY:

+ strengthMask |= UCOL_SECONDARYORDERMASK;

+ /* fall through */

+ case UCOL_PRIMARY:

+ strengthMask |= UCOL_PRIMARYORDERMASK;

+ }

+Target::~Target()

+ ubrk_close(charBreakIterator);

+ ucol_closeElements(elements);

+ DELETE_ARRAY(ceb);

+void Target::setTargetString(const UnicodeString *target)

+ if (charBreakIterator != NULL) {

+ ubrk_close(charBreakIterator);

+ ucol_closeElements(elements);

+ }

+ targetString = target;

+ if (targetString != NULL) {

+ UErrorCode status = U_ZERO_ERROR;

+ targetBuffer = targetString->getBuffer();

+ targetLength = targetString->length();

+ elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);

+ ucol_forceHanImplicit(elements, &status);

+ charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),

+ targetBuffer, targetLength, &status);

+ } else {

+ targetBuffer = NULL;

+ targetLength = 0;

+ }

+const CEI *Target::nextCE(int32_t offset)

+ UErrorCode status = U_ZERO_ERROR;

+ int32_t low = -1, high = -1;

+ uint32_t order;

+ UBool cont = FALSE;

+ if (offset >= bufferMin && offset < bufferMax) {

+ return &ceb[offset];

+ }

+ if (bufferMax >= bufferSize || offset != bufferMax) {

+ return NULL;

+ }

+ do {

+ low = ucol_getOffset(elements);

+ order = ucol_next(elements, &status);

+ high = ucol_getOffset(elements);

+ if (order == (uint32_t)UCOL_NULLORDER) {

+ //high = low = -1;

+ break;

+ }

+ cont = isContinuation(order);

+ order &= strengthMask;

+ if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {

+ if (strength >= UCOL_QUATERNARY) {

+ order &= UCOL_PRIMARYORDERMASK;

+ } else {

+ order = UCOL_IGNORABLE;

+ }

+ } while (order == UCOL_IGNORABLE);

+ if (cont) {

+ order |= UCOL_CONTINUATION_MARKER;

+ }

+ ceb[offset].order = order;

+ ceb[offset].lowOffset = low;

+ ceb[offset].highOffset = high;

+ bufferMax += 1;

+ return &ceb[offset];

+const CEI *Target::prevCE(int32_t offset)

+ UErrorCode status = U_ZERO_ERROR;

+ int32_t low = -1, high = -1;

+ uint32_t order;

+ UBool cont = FALSE;

+ if (offset >= bufferMin && offset < bufferMax) {

+ return &ceb[offset];

+ }

+ if (bufferMax >= bufferSize || offset != bufferMax) {

+ return NULL;

+ }

+ do {

+ high = ucol_getOffset(elements);

+ order = ucol_previous(elements, &status);

+ low = ucol_getOffset(elements);

+ if (order == (uint32_t)UCOL_NULLORDER) {

+ break;

+ }

+ cont = isContinuation(order);

+ order &= strengthMask;

+ if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {

+ if (strength >= UCOL_QUATERNARY) {

+ order &= UCOL_PRIMARYORDERMASK;

+ } else {

+ order = UCOL_IGNORABLE;

+ }

+ } while (order == UCOL_IGNORABLE);

+ bufferMax += 1;

+ if (cont) {

+ order |= UCOL_CONTINUATION_MARKER;

+ }

+ ceb[offset].order = order;

+ ceb[offset].lowOffset = low;

+ ceb[offset].highOffset = high;

+ return &ceb[offset];

+int32_t Target::stringLength()

+ if (targetString != NULL) {

+ return targetLength;

+ }

+ return 0;

+UChar Target::charAt(int32_t offset)

+ if (targetString != NULL) {

+ return targetBuffer[offset];

+ }

+ return 0x0000;

+void Target::setOffset(int32_t offset)

+ UErrorCode status = U_ZERO_ERROR;

+ bufferMin = 0;

+ bufferMax = 0;

+ ucol_setOffset(elements, offset, &status);

+void Target::setLast(int32_t last)

+ UErrorCode status = U_ZERO_ERROR;

+ bufferMin = 0;

+ bufferMax = 1;

+ ceb[0].order = UCOL_NULLORDER;

+ ceb[0].lowOffset = last;

+ ceb[0].highOffset = last;

+ ucol_setOffset(elements, last, &status);

+int32_t Target::getOffset()

+ return ucol_getOffset(elements);

+UBool Target::isBreakBoundary(int32_t offset)

+ return ubrk_isBoundary(charBreakIterator, offset);

+int32_t Target::nextBreakBoundary(int32_t offset)

+ return ubrk_following(charBreakIterator, offset);

+int32_t Target::nextSafeBoundary(int32_t offset)

+ while (offset < targetLength) {

+ //UChar ch = charAt(offset);

+ UChar ch = targetBuffer[offset];

+ if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {

+ return offset;

+ }

+ offset += 1;

+ }

+ return targetLength;

+UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)

+ if (strength < UCOL_IDENTICAL) {

+ return TRUE;

+ }

+ // Note: We could use Normalizer::compare() or similar, but for short strings

+ // which may not be in FCD it might be faster to just NFD them.

+ UErrorCode status = U_ZERO_ERROR;

+ UnicodeString t2, p2;

+ nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);

+ nfd.normalize(pattern, p2, status);

+ // return FALSE if NFD failed

+ return U_SUCCESS(status) && t2 == p2;

+#define HASH_TABLE_SIZE 257

+class BadCharacterTable : public UMemory

+public:

+ BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);

+ ~BadCharacterTable();

+ int32_t operator[](uint32_t ce) const;

+ int32_t getMaxSkip() const;

+ int32_t minLengthInChars(int32_t index);

+private:

+ static int32_t hash(uint32_t ce);

+ int32_t maxSkip;

+ int32_t badCharacterTable[HASH_TABLE_SIZE];

+ int32_t *minLengthCache;

+};

+BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)

+ : minLengthCache(NULL)

+ int32_t plen = patternCEs.size();

+ // **** need a better way to deal with this ****

+ if (U_FAILURE(status) || plen == 0) {

+ return;

+ }

+ int32_t *history = NEW_ARRAY(int32_t, plen);

+ if (history == NULL) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ for (int32_t i = 0; i < plen; i += 1) {

+ history[i] = -1;

+ }

+ minLengthCache = NEW_ARRAY(int32_t, plen + 1);

+ if (minLengthCache == NULL) {

+ DELETE_ARRAY(history);

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);

+ for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {

+ badCharacterTable[j] = maxSkip;

+ }

+ for(int32_t p = 1; p < plen; p += 1) {

+ minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);

+ // Make sure this entry is not bigger than the previous one.

+ // Otherwise, we might skip too far in some cases.

+ if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {

+ minLengthCache[p] = minLengthCache[p - 1];

+ }

+ minLengthCache[plen] = 0;

+ for(int32_t p = 0; p < plen - 1; p += 1) {

+ badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];

+ }

+ DELETE_ARRAY(history);

+BadCharacterTable::~BadCharacterTable()

+ DELETE_ARRAY(minLengthCache);

+int32_t BadCharacterTable::operator[](uint32_t ce) const

+ return badCharacterTable[hash(ce)];

+int32_t BadCharacterTable::getMaxSkip() const

+ return maxSkip;

+int32_t BadCharacterTable::minLengthInChars(int32_t index)

+ return minLengthCache[index];

+int32_t BadCharacterTable::hash(uint32_t ce)

+ return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;

+class GoodSuffixTable : public UMemory

+public:

+ GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);

+ ~GoodSuffixTable();

+ int32_t operator[](int32_t offset) const;

+private:

+ int32_t *goodSuffixTable;

+};

+GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)

+ : goodSuffixTable(NULL)

+ int32_t patlen = patternCEs.size();

+ // **** need a better way to deal with this ****

+ if (U_FAILURE(status) || patlen <= 0) {

+ return;

+ }

+ int32_t *suff = NEW_ARRAY(int32_t, patlen);

+ int32_t start = patlen - 1, end = - 1;

+ int32_t maxSkip = badCharacterTable.getMaxSkip();

+ if (suff == NULL) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ // initialze suff

+ suff[patlen - 1] = patlen;

+ for (int32_t i = patlen - 2; i >= 0; i -= 1) {

+ // (i > start) means we're inside the last suffix match we found

+ // ((patlen - 1) - end) is how far the end of that match is from end of pattern

+ // (i - start) is how far we are from start of that match

+ // (i + (patlen - 1) - end) is index of same character at end of pattern

+ // so if any suffix match at that character doesn't extend beyond the last match,

+ // it's the suffix for this character as well

+ if (i > start && suff[i + patlen - 1 - end] < i - start) {

+ suff[i] = suff[i + patlen - 1 - end];

+ } else {

+ start = end = i;

+ int32_t s = patlen;

+ while (start >= 0 && patternCEs[start] == patternCEs[--s]) {

+ start -= 1;

+ }

+ suff[i] = end - start;

+ }

+ // now build goodSuffixTable

+ goodSuffixTable = NEW_ARRAY(int32_t, patlen);

+ if (goodSuffixTable == NULL) {

+ DELETE_ARRAY(suff);

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ // initialize entries to minLengthInChars of the pattern

+ for (int32_t i = 0; i < patlen; i += 1) {

+ goodSuffixTable[i] = maxSkip;

+ }

+ int32_t prefix = 0;

+ for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {

+ if (suff[i] == i + 1) {

+ // this matching suffix is a prefix of the pattern

+ int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);

+ // for any mis-match before this suffix, we should skip

+ // so that the front of the pattern (i.e. the prefix)

+ // lines up with the front of the suffix.

+ // (patlen - 1 - i) is the start of the suffix

+ while (prefix < patlen - 1 - i) {

+ // value of maxSkip means never set...

+ if (goodSuffixTable[prefix] == maxSkip) {

+ goodSuffixTable[prefix] = prefixSkip;

+ }

+ prefix += 1;

+ }

+ for (int32_t i = 0; i < patlen - 1; i += 1) {

+ goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);

+ }

+ DELETE_ARRAY(suff);

+GoodSuffixTable::~GoodSuffixTable()

+ DELETE_ARRAY(goodSuffixTable);

+int32_t GoodSuffixTable::operator[](int32_t offset) const

+ return goodSuffixTable[offset];

+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)

+UBool BoyerMooreSearch::empty()

+ return patCEs->size() <= 0;

+CollData *BoyerMooreSearch::getData()

+ return data;

+CEList *BoyerMooreSearch::getPatternCEs()

+ return patCEs;

+BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()

+ return badCharacterTable;

+GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()

+ return goodSuffixTable;

+BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,

+ UErrorCode &status)

+ : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)

+ if (U_FAILURE(status)) {

+ return;

+ }

+ UCollator *collator = data->getCollator();

+ patCEs = new CEList(collator, patternString, status);

+ if (patCEs == NULL || U_FAILURE(status)) {

+ return;

+ }

+ badCharacterTable = new BadCharacterTable(*patCEs, data, status);

+ if (badCharacterTable == NULL || U_FAILURE(status)) {

+ return;

+ }

+ goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);

+ if (targetString != NULL) {

+ target = new Target(collator, targetString, patCEs->size(), status);

+ }

+BoyerMooreSearch::~BoyerMooreSearch()

+ delete target;

+ delete goodSuffixTable;

+ delete badCharacterTable;

+ delete patCEs;

+void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)

+ if (U_FAILURE(status)) {

+ return;

+ }

+ if (target == NULL) {

+ target = new Target(data->getCollator(), targetString, patCEs->size(), status);

+ } else {

+ target->setTargetString(targetString);

+ }

+// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****

+/*

+ * TODO:

+ * * deal with trailing (and leading?) ignorables.

+ * * Adding BoyerMooreSearch object slowed it down. How can we speed it up?

+ */

+UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)

+ /*UCollator *coll =*/ data->getCollator();

+ int32_t plen = patCEs->size();

+ int32_t tlen = target->stringLength();

+ int32_t maxSkip = badCharacterTable->getMaxSkip();

+ int32_t tOffset = offset + maxSkip;

+ if (plen <= 0) {

+ // Searching for a zero length pattern always fails.

+ start = end = -1;

+ return FALSE;

+ }

+ while (tOffset <= tlen) {

+ int32_t pIndex = plen - 1;

+ int32_t tIndex = 0;

+ int32_t lIndex = 0;

+ if (tOffset < tlen) {

+ // **** we really want to skip ahead enough to ****

+ // **** be sure we get at least 1 non-ignorable ****

+ // **** CE after the end of the pattern. ****

+ int32_t next = target->nextSafeBoundary(tOffset + 1);

+ target->setOffset(next);

+ for (lIndex = 0; ; lIndex += 1) {

+ const CEI *cei = target->prevCE(lIndex);

+ int32_t low = cei->lowOffset;

+ int32_t high = cei->highOffset;

+ if (high == 0 || (low < high && low <= tOffset)) {

+ if (low < tOffset) {

+ while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {

+ lIndex -= 1;

+ }

+ if (high > tOffset) {

+ tOffset = high;

+ }

+ break;

+ }

+ } else {

+ target->setLast(tOffset);

+ lIndex = 0;

+ }

+ tIndex = ++lIndex;

+ // Iterate backward until we hit the beginning of the pattern

+ while (pIndex >= 0) {

+ uint32_t pce = (*patCEs)[pIndex];

+ const CEI *tcei = target->prevCE(tIndex++);

+ if (tcei->order != pce) {

+ // There is a mismatch at this position. Decide how far

+ // over to shift the pattern, then try again.

+ int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];

+#ifdef EXTRA_CAUTIOUS

+ int32_t old = tOffset;

+#endif

+ tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);

+ if (gsOffset > tOffset) {

+ tOffset = gsOffset;

+ }

+#ifdef EXTRA_CAUTIOUS

+ // Make sure we don't skip backwards...

+ if (tOffset <= old) {

+ tOffset = old + 1;

+ }

+#endif

+ break;

+ }

+ pIndex -= 1;

+ }

+ if (pIndex < 0) {

+ // We made it back to the beginning of the pattern,

+ // which means we matched it all. Return the location.

+ const CEI firstCEI = *target->prevCE(tIndex - 1);

+ const CEI lastCEI = *target->prevCE(lIndex);

+ int32_t mStart = firstCEI.lowOffset;

+ int32_t minLimit = lastCEI.lowOffset;

+ int32_t maxLimit = lastCEI.highOffset;

+ int32_t mLimit;

+ UBool found = TRUE;

+ target->setOffset(/*tOffset*/maxLimit);

+ const CEI nextCEI = *target->nextCE(0);

+ if (nextCEI.lowOffset > maxLimit) {

+ maxLimit = nextCEI.lowOffset;

+ }

+ if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {

+ found = FALSE;

+ }

+ if (! target->isBreakBoundary(mStart)) {

+ found = FALSE;

+ }

+ if (firstCEI.lowOffset == firstCEI.highOffset) {

+ found = FALSE;

+ }

+ mLimit = maxLimit;

+ if (minLimit < maxLimit) {

+ int32_t nbb = target->nextBreakBoundary(minLimit);

+ if (nbb >= lastCEI.highOffset) {

+ mLimit = nbb;

+ }

+ if (mLimit > maxLimit) {

+ found = FALSE;

+ }

+ if (! target->isBreakBoundary(mLimit)) {

+ found = FALSE;

+ }

+ if (! target->isIdentical(pattern, mStart, mLimit)) {

+ found = FALSE;

+ }

+ if (found) {

+ start = mStart;

+ end = mLimit;

+ return TRUE;

+ }

+ tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?

+ }

+ // Otherwise, we're here because of a mismatch, so keep going....

+ }

+ // no match

+ start = -1;

+ end = -1;

+ return FALSE;

+U_NAMESPACE_END

+#endif // #if !UCONFIG_NO_COLLATION

Property changes on: icu46/source/i18n/bmsearch.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/bms.cpp ('k') | icu46/source/i18n/bocsu.h » ('j') | no next file with comments »