Index: source/i18n/usearch.cpp |
diff --git a/source/i18n/usearch.cpp b/source/i18n/usearch.cpp |
index 96414dbf4d90ab38be5ecad097101c45c79e913f..41639b6947de16c18c4f9d70e5c25f1104a3f1d5 100644 |
--- a/source/i18n/usearch.cpp |
+++ b/source/i18n/usearch.cpp |
@@ -1,6 +1,6 @@ |
/* |
********************************************************************** |
-* Copyright (C) 2001-2014 IBM and others. All rights reserved. |
+* Copyright (C) 2001-2015 IBM and others. All rights reserved. |
********************************************************************** |
* Date Name Description |
* 07/02/2001 synwee Creation. |
@@ -75,18 +75,22 @@ inline uint32_t getMask(UCollationStrength strength) |
} |
/** |
-* This is to squeeze the 21bit ces into a 256 table |
-* @param ce collation element |
-* @return collapsed version of the collation element |
+* @param ce 32-bit collation element |
+* @return hash code |
*/ |
static |
-inline int hash(uint32_t ce) |
+inline int hashFromCE32(uint32_t ce) |
{ |
- // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work |
- // well with the new collation where most of the latin 1 characters |
- // are of the value xx000xxx. their hashes will most of the time be 0 |
- // to be discussed on the hash algo. |
- return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; |
+ int hc = (int)( |
+ ((((((ce >> 24) * 37) + |
+ (ce >> 16)) * 37) + |
+ (ce >> 8)) * 37) + |
+ ce); |
+ hc %= MAX_TABLE_SIZE_; |
+ if (hc < 0) { |
+ hc += MAX_TABLE_SIZE_; |
+ } |
+ return hc; |
} |
U_CDECL_BEGIN |
@@ -492,22 +496,22 @@ inline void setShiftTable(int16_t shift[], int16_t backshift[], |
for (count = 0; count < cesize; count ++) { |
// number of ces from right of array to the count |
int temp = defaultforward - count - 1; |
- shift[hash(cetable[count])] = temp > 1 ? temp : 1; |
+ shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1; |
} |
- shift[hash(cetable[cesize])] = 1; |
+ shift[hashFromCE32(cetable[cesize])] = 1; |
// for ignorables we just shift by one. see test examples. |
- shift[hash(0)] = 1; |
+ shift[hashFromCE32(0)] = 1; |
for (count = 0; count < MAX_TABLE_SIZE_; count ++) { |
backshift[count] = defaultbackward; |
} |
for (count = cesize; count > 0; count --) { |
// the original value count does not seem to work |
- backshift[hash(cetable[count])] = count > expansionsize ? |
+ backshift[hashFromCE32(cetable[count])] = count > expansionsize ? |
(int16_t)(count - expansionsize) : 1; |
} |
- backshift[hash(cetable[0])] = 1; |
- backshift[hash(0)] = 1; |
+ backshift[hashFromCE32(cetable[0])] = 1; |
+ backshift[hashFromCE32(0)] = 1; |
} |
/** |
@@ -730,7 +734,7 @@ inline int32_t shiftForward(UStringSearch *strsrch, |
{ |
UPattern *pattern = &(strsrch->pattern); |
if (ce != UCOL_NULLORDER) { |
- int32_t shift = pattern->shift[hash(ce)]; |
+ int32_t shift = pattern->shift[hashFromCE32(ce)]; |
// this is to adjust for characters in the middle of the |
// substring for matching that failed. |
int32_t adjust = pattern->cesLength - patternceindex; |
@@ -1971,7 +1975,7 @@ inline int32_t reverseShift(UStringSearch *strsrch, |
} |
else { |
if (ce != UCOL_NULLORDER) { |
- int32_t shift = strsrch->pattern.backShift[hash(ce)]; |
+ int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)]; |
// this is to adjust for characters in the middle of the substring |
// for matching that failed. |
@@ -3805,6 +3809,28 @@ static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t com |
// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s |
#endif |
+namespace { |
+ |
+UChar32 codePointAt(const USearch &search, int32_t index) { |
+ if (index < search.textLength) { |
+ UChar32 c; |
+ U16_NEXT(search.text, index, search.textLength, c); |
+ return c; |
+ } |
+ return U_SENTINEL; |
+} |
+ |
+UChar32 codePointBefore(const USearch &search, int32_t index) { |
+ if (0 < index) { |
+ UChar32 c; |
+ U16_PREV(search.text, 0, index, c); |
+ return c; |
+ } |
+ return U_SENTINEL; |
+} |
+ |
+} // namespace |
+ |
U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, |
int32_t startIdx, |
int32_t *matchStart, |
@@ -3998,6 +4024,34 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, |
found = FALSE; |
} |
+ // Allow matches to end in the middle of a grapheme cluster if the following |
+ // conditions are met; this is needed to make prefix search work properly in |
+ // Indic, see #11750 |
+ // * the default breakIter is being used |
+ // * the next collation element after this combining sequence |
+ // - has non-zero primary weight |
+ // - corresponds to a separate character following the one at end of the current match |
+ // (the second of these conditions, and perhaps both, may be redundant given the |
+ // subsequent check for normalization boundary; however they are likely much faster |
+ // tests in any case) |
+ // * the match limit is a normalization boundary |
+ UBool allowMidclusterMatch = FALSE; |
+ if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { |
+ allowMidclusterMatch = |
+ strsrch->search->breakIter == NULL && |
+ nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && |
+ maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && |
+ (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || |
+ strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); |
+ } |
+ // If those conditions are met, then: |
+ // * do NOT advance the candidate match limit (mLimit) to a break boundary; however |
+ // the match limit may be backed off to a previous break boundary. This handles |
+ // cases in which mLimit includes target characters that are ignorable with current |
+ // settings (such as space) and which extend beyond the pattern match. |
+ // * do NOT require that end of the combining sequence not extend beyond the match in CE space |
+ // * do NOT require that match limit be on a breakIter boundary |
+ |
// Advance the match end position to the first acceptable match boundary. |
// This advances the index over any combining charcters. |
mLimit = maxLimit; |
@@ -4012,7 +4066,10 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, |
mLimit = minLimit; |
} else { |
int32_t nba = nextBoundaryAfter(strsrch, minLimit); |
- if (nba >= lastCEI->highIndex) { |
+ // Note that we can have nba < maxLimit && nba >= minLImit, in which |
+ // case we want to set mLimit to nba regardless of allowMidclusterMatch |
+ // (i.e. we back off mLimit to the previous breakIterator boundary). |
+ if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { |
mLimit = nba; |
} |
} |
@@ -4024,14 +4081,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, |
} |
#endif |
- // If advancing to the end of a combining sequence in character indexing space |
- // advanced us beyond the end of the match in CE space, reject this match. |
- if (mLimit > maxLimit) { |
- found = FALSE; |
- } |
+ if (!allowMidclusterMatch) { |
+ // If advancing to the end of a combining sequence in character indexing space |
+ // advanced us beyond the end of the match in CE space, reject this match. |
+ if (mLimit > maxLimit) { |
+ found = FALSE; |
+ } |
- if (!isBreakBoundary(strsrch, mLimit)) { |
- found = FALSE; |
+ if (!isBreakBoundary(strsrch, mLimit)) { |
+ found = FALSE; |
+ } |
} |
if (! checkIdentical(strsrch, mStart, mLimit)) { |
@@ -4248,25 +4307,57 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, |
mLimit = maxLimit = nextCEI->lowIndex; |
+ // Allow matches to end in the middle of a grapheme cluster if the following |
+ // conditions are met; this is needed to make prefix search work properly in |
+ // Indic, see #11750 |
+ // * the default breakIter is being used |
+ // * the next collation element after this combining sequence |
+ // - has non-zero primary weight |
+ // - corresponds to a separate character following the one at end of the current match |
+ // (the second of these conditions, and perhaps both, may be redundant given the |
+ // subsequent check for normalization boundary; however they are likely much faster |
+ // tests in any case) |
+ // * the match limit is a normalization boundary |
+ UBool allowMidclusterMatch = FALSE; |
+ if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { |
+ allowMidclusterMatch = |
+ strsrch->search->breakIter == NULL && |
+ nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && |
+ maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && |
+ (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || |
+ strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); |
+ } |
+ // If those conditions are met, then: |
+ // * do NOT advance the candidate match limit (mLimit) to a break boundary; however |
+ // the match limit may be backed off to a previous break boundary. This handles |
+ // cases in which mLimit includes target characters that are ignorable with current |
+ // settings (such as space) and which extend beyond the pattern match. |
+ // * do NOT require that end of the combining sequence not extend beyond the match in CE space |
+ // * do NOT require that match limit be on a breakIter boundary |
+ |
// Advance the match end position to the first acceptable match boundary. |
- // This advances the index over any combining charcters. |
+ // This advances the index over any combining characters. |
if (minLimit < maxLimit) { |
int32_t nba = nextBoundaryAfter(strsrch, minLimit); |
- |
- if (nba >= lastCEI->highIndex) { |
+ // Note that we can have nba < maxLimit && nba >= minLImit, in which |
+ // case we want to set mLimit to nba regardless of allowMidclusterMatch |
+ // (i.e. we back off mLimit to the previous breakIterator boundary). |
+ if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { |
mLimit = nba; |
} |
} |
- // If advancing to the end of a combining sequence in character indexing space |
- // advanced us beyond the end of the match in CE space, reject this match. |
- if (mLimit > maxLimit) { |
- found = FALSE; |
- } |
+ if (!allowMidclusterMatch) { |
+ // If advancing to the end of a combining sequence in character indexing space |
+ // advanced us beyond the end of the match in CE space, reject this match. |
+ if (mLimit > maxLimit) { |
+ found = FALSE; |
+ } |
- // Make sure the end of the match is on a break boundary |
- if (!isBreakBoundary(strsrch, mLimit)) { |
- found = FALSE; |
+ // Make sure the end of the match is on a break boundary |
+ if (!isBreakBoundary(strsrch, mLimit)) { |
+ found = FALSE; |
+ } |
} |
} else { |