| Index: source/i18n/usearch.cpp
|
| diff --git a/source/i18n/usearch.cpp b/source/i18n/usearch.cpp
|
| index 96414dbf4d90ab38be5ecad097101c45c79e913f..41639b6947de16c18c4f9d70e5c25f1104a3f1d5 100644
|
| --- a/source/i18n/usearch.cpp
|
| +++ b/source/i18n/usearch.cpp
|
| @@ -1,6 +1,6 @@
|
| /*
|
| **********************************************************************
|
| -* Copyright (C) 2001-2014 IBM and others. All rights reserved.
|
| +* Copyright (C) 2001-2015 IBM and others. All rights reserved.
|
| **********************************************************************
|
| * Date Name Description
|
| * 07/02/2001 synwee Creation.
|
| @@ -75,18 +75,22 @@ inline uint32_t getMask(UCollationStrength strength)
|
| }
|
|
|
| /**
|
| -* This is to squeeze the 21bit ces into a 256 table
|
| -* @param ce collation element
|
| -* @return collapsed version of the collation element
|
| +* @param ce 32-bit collation element
|
| +* @return hash code
|
| */
|
| static
|
| -inline int hash(uint32_t ce)
|
| +inline int hashFromCE32(uint32_t ce)
|
| {
|
| - // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
|
| - // well with the new collation where most of the latin 1 characters
|
| - // are of the value xx000xxx. their hashes will most of the time be 0
|
| - // to be discussed on the hash algo.
|
| - return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
|
| + int hc = (int)(
|
| + ((((((ce >> 24) * 37) +
|
| + (ce >> 16)) * 37) +
|
| + (ce >> 8)) * 37) +
|
| + ce);
|
| + hc %= MAX_TABLE_SIZE_;
|
| + if (hc < 0) {
|
| + hc += MAX_TABLE_SIZE_;
|
| + }
|
| + return hc;
|
| }
|
|
|
| U_CDECL_BEGIN
|
| @@ -492,22 +496,22 @@ inline void setShiftTable(int16_t shift[], int16_t backshift[],
|
| for (count = 0; count < cesize; count ++) {
|
| // number of ces from right of array to the count
|
| int temp = defaultforward - count - 1;
|
| - shift[hash(cetable[count])] = temp > 1 ? temp : 1;
|
| + shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
|
| }
|
| - shift[hash(cetable[cesize])] = 1;
|
| + shift[hashFromCE32(cetable[cesize])] = 1;
|
| // for ignorables we just shift by one. see test examples.
|
| - shift[hash(0)] = 1;
|
| + shift[hashFromCE32(0)] = 1;
|
|
|
| for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
|
| backshift[count] = defaultbackward;
|
| }
|
| for (count = cesize; count > 0; count --) {
|
| // the original value count does not seem to work
|
| - backshift[hash(cetable[count])] = count > expansionsize ?
|
| + backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
|
| (int16_t)(count - expansionsize) : 1;
|
| }
|
| - backshift[hash(cetable[0])] = 1;
|
| - backshift[hash(0)] = 1;
|
| + backshift[hashFromCE32(cetable[0])] = 1;
|
| + backshift[hashFromCE32(0)] = 1;
|
| }
|
|
|
| /**
|
| @@ -730,7 +734,7 @@ inline int32_t shiftForward(UStringSearch *strsrch,
|
| {
|
| UPattern *pattern = &(strsrch->pattern);
|
| if (ce != UCOL_NULLORDER) {
|
| - int32_t shift = pattern->shift[hash(ce)];
|
| + int32_t shift = pattern->shift[hashFromCE32(ce)];
|
| // this is to adjust for characters in the middle of the
|
| // substring for matching that failed.
|
| int32_t adjust = pattern->cesLength - patternceindex;
|
| @@ -1971,7 +1975,7 @@ inline int32_t reverseShift(UStringSearch *strsrch,
|
| }
|
| else {
|
| if (ce != UCOL_NULLORDER) {
|
| - int32_t shift = strsrch->pattern.backShift[hash(ce)];
|
| + int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
|
|
|
| // this is to adjust for characters in the middle of the substring
|
| // for matching that failed.
|
| @@ -3805,6 +3809,28 @@ static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t com
|
| // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
|
| #endif
|
|
|
| +namespace {
|
| +
|
| +UChar32 codePointAt(const USearch &search, int32_t index) {
|
| + if (index < search.textLength) {
|
| + UChar32 c;
|
| + U16_NEXT(search.text, index, search.textLength, c);
|
| + return c;
|
| + }
|
| + return U_SENTINEL;
|
| +}
|
| +
|
| +UChar32 codePointBefore(const USearch &search, int32_t index) {
|
| + if (0 < index) {
|
| + UChar32 c;
|
| + U16_PREV(search.text, 0, index, c);
|
| + return c;
|
| + }
|
| + return U_SENTINEL;
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
| int32_t startIdx,
|
| int32_t *matchStart,
|
| @@ -3998,6 +4024,34 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
| found = FALSE;
|
| }
|
|
|
| + // Allow matches to end in the middle of a grapheme cluster if the following
|
| + // conditions are met; this is needed to make prefix search work properly in
|
| + // Indic, see #11750
|
| + // * the default breakIter is being used
|
| + // * the next collation element after this combining sequence
|
| + // - has non-zero primary weight
|
| + // - corresponds to a separate character following the one at end of the current match
|
| + // (the second of these conditions, and perhaps both, may be redundant given the
|
| + // subsequent check for normalization boundary; however they are likely much faster
|
| + // tests in any case)
|
| + // * the match limit is a normalization boundary
|
| + UBool allowMidclusterMatch = FALSE;
|
| + if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
|
| + allowMidclusterMatch =
|
| + strsrch->search->breakIter == NULL &&
|
| + nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
|
| + maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
|
| + (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
|
| + strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
|
| + }
|
| + // If those conditions are met, then:
|
| + // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
|
| + // the match limit may be backed off to a previous break boundary. This handles
|
| + // cases in which mLimit includes target characters that are ignorable with current
|
| + // settings (such as space) and which extend beyond the pattern match.
|
| + // * do NOT require that end of the combining sequence not extend beyond the match in CE space
|
| + // * do NOT require that match limit be on a breakIter boundary
|
| +
|
| // Advance the match end position to the first acceptable match boundary.
|
| // This advances the index over any combining charcters.
|
| mLimit = maxLimit;
|
| @@ -4012,7 +4066,10 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
| mLimit = minLimit;
|
| } else {
|
| int32_t nba = nextBoundaryAfter(strsrch, minLimit);
|
| - if (nba >= lastCEI->highIndex) {
|
| + // Note that we can have nba < maxLimit && nba >= minLImit, in which
|
| + // case we want to set mLimit to nba regardless of allowMidclusterMatch
|
| + // (i.e. we back off mLimit to the previous breakIterator boundary).
|
| + if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
|
| mLimit = nba;
|
| }
|
| }
|
| @@ -4024,14 +4081,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
| }
|
| #endif
|
|
|
| - // If advancing to the end of a combining sequence in character indexing space
|
| - // advanced us beyond the end of the match in CE space, reject this match.
|
| - if (mLimit > maxLimit) {
|
| - found = FALSE;
|
| - }
|
| + if (!allowMidclusterMatch) {
|
| + // If advancing to the end of a combining sequence in character indexing space
|
| + // advanced us beyond the end of the match in CE space, reject this match.
|
| + if (mLimit > maxLimit) {
|
| + found = FALSE;
|
| + }
|
|
|
| - if (!isBreakBoundary(strsrch, mLimit)) {
|
| - found = FALSE;
|
| + if (!isBreakBoundary(strsrch, mLimit)) {
|
| + found = FALSE;
|
| + }
|
| }
|
|
|
| if (! checkIdentical(strsrch, mStart, mLimit)) {
|
| @@ -4248,25 +4307,57 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
|
|
| mLimit = maxLimit = nextCEI->lowIndex;
|
|
|
| + // Allow matches to end in the middle of a grapheme cluster if the following
|
| + // conditions are met; this is needed to make prefix search work properly in
|
| + // Indic, see #11750
|
| + // * the default breakIter is being used
|
| + // * the next collation element after this combining sequence
|
| + // - has non-zero primary weight
|
| + // - corresponds to a separate character following the one at end of the current match
|
| + // (the second of these conditions, and perhaps both, may be redundant given the
|
| + // subsequent check for normalization boundary; however they are likely much faster
|
| + // tests in any case)
|
| + // * the match limit is a normalization boundary
|
| + UBool allowMidclusterMatch = FALSE;
|
| + if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
|
| + allowMidclusterMatch =
|
| + strsrch->search->breakIter == NULL &&
|
| + nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
|
| + maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
|
| + (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
|
| + strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
|
| + }
|
| + // If those conditions are met, then:
|
| + // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
|
| + // the match limit may be backed off to a previous break boundary. This handles
|
| + // cases in which mLimit includes target characters that are ignorable with current
|
| + // settings (such as space) and which extend beyond the pattern match.
|
| + // * do NOT require that end of the combining sequence not extend beyond the match in CE space
|
| + // * do NOT require that match limit be on a breakIter boundary
|
| +
|
| // Advance the match end position to the first acceptable match boundary.
|
| - // This advances the index over any combining charcters.
|
| + // This advances the index over any combining characters.
|
| if (minLimit < maxLimit) {
|
| int32_t nba = nextBoundaryAfter(strsrch, minLimit);
|
| -
|
| - if (nba >= lastCEI->highIndex) {
|
| + // Note that we can have nba < maxLimit && nba >= minLImit, in which
|
| + // case we want to set mLimit to nba regardless of allowMidclusterMatch
|
| + // (i.e. we back off mLimit to the previous breakIterator boundary).
|
| + if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
|
| mLimit = nba;
|
| }
|
| }
|
|
|
| - // If advancing to the end of a combining sequence in character indexing space
|
| - // advanced us beyond the end of the match in CE space, reject this match.
|
| - if (mLimit > maxLimit) {
|
| - found = FALSE;
|
| - }
|
| + if (!allowMidclusterMatch) {
|
| + // If advancing to the end of a combining sequence in character indexing space
|
| + // advanced us beyond the end of the match in CE space, reject this match.
|
| + if (mLimit > maxLimit) {
|
| + found = FALSE;
|
| + }
|
|
|
| - // Make sure the end of the match is on a break boundary
|
| - if (!isBreakBoundary(strsrch, mLimit)) {
|
| - found = FALSE;
|
| + // Make sure the end of the match is on a break boundary
|
| + if (!isBreakBoundary(strsrch, mLimit)) {
|
| + found = FALSE;
|
| + }
|
| }
|
|
|
| } else {
|
|
|