source/i18n/usearch.cpp - Issue 1621843002: ICU 56 update step 1

Unified Diff: source/i18n/usearch.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: source/i18n/usearch.cpp

diff --git a/source/i18n/usearch.cpp b/source/i18n/usearch.cpp

index 96414dbf4d90ab38be5ecad097101c45c79e913f..41639b6947de16c18c4f9d70e5c25f1104a3f1d5 100644

--- a/source/i18n/usearch.cpp

+++ b/source/i18n/usearch.cpp

@@ -1,6 +1,6 @@

**********************************************************************

* Date Name Description

* 07/02/2001 synwee Creation.

@@ -75,18 +75,22 @@ inline uint32_t getMask(UCollationStrength strength)

}

/**

-* This is to squeeze the 21bit ces into a 256 table

-* @param ce collation element

-* @return collapsed version of the collation element

+* @param ce 32-bit collation element

+* @return hash code

static

-inline int hash(uint32_t ce)

+inline int hashFromCE32(uint32_t ce)

{

- // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work

- // well with the new collation where most of the latin 1 characters

- // are of the value xx000xxx. their hashes will most of the time be 0

- // to be discussed on the hash algo.

- return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;

+ int hc = (int)(

+ ((((((ce >> 24) * 37) +

+ (ce >> 16)) * 37) +

+ (ce >> 8)) * 37) +

+ ce);

+ hc %= MAX_TABLE_SIZE_;

+ if (hc < 0) {

+ hc += MAX_TABLE_SIZE_;

+ }

+ return hc;

}

U_CDECL_BEGIN

@@ -492,22 +496,22 @@ inline void setShiftTable(int16_t shift[], int16_t backshift[],

for (count = 0; count < cesize; count ++) {

// number of ces from right of array to the count

int temp = defaultforward - count - 1;

- shift[hash(cetable[count])] = temp > 1 ? temp : 1;

+ shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;

}

- shift[hash(cetable[cesize])] = 1;

+ shift[hashFromCE32(cetable[cesize])] = 1;

// for ignorables we just shift by one. see test examples.

- shift[hash(0)] = 1;

+ shift[hashFromCE32(0)] = 1;

for (count = 0; count < MAX_TABLE_SIZE_; count ++) {

backshift[count] = defaultbackward;

}

for (count = cesize; count > 0; count --) {

// the original value count does not seem to work

- backshift[hash(cetable[count])] = count > expansionsize ?

+ backshift[hashFromCE32(cetable[count])] = count > expansionsize ?

(int16_t)(count - expansionsize) : 1;

}

- backshift[hash(cetable[0])] = 1;

- backshift[hash(0)] = 1;

+ backshift[hashFromCE32(cetable[0])] = 1;

+ backshift[hashFromCE32(0)] = 1;

}

/**

@@ -730,7 +734,7 @@ inline int32_t shiftForward(UStringSearch *strsrch,

{

UPattern *pattern = &(strsrch->pattern);

if (ce != UCOL_NULLORDER) {

- int32_t shift = pattern->shift[hash(ce)];

+ int32_t shift = pattern->shift[hashFromCE32(ce)];

// this is to adjust for characters in the middle of the

// substring for matching that failed.

int32_t adjust = pattern->cesLength - patternceindex;

@@ -1971,7 +1975,7 @@ inline int32_t reverseShift(UStringSearch *strsrch,

}

else {

if (ce != UCOL_NULLORDER) {

- int32_t shift = strsrch->pattern.backShift[hash(ce)];

+ int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];

// this is to adjust for characters in the middle of the substring

// for matching that failed.

@@ -3805,6 +3809,28 @@ static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t com

// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s

#endif

+namespace {

+UChar32 codePointAt(const USearch &search, int32_t index) {

+ if (index < search.textLength) {

+ UChar32 c;

+ U16_NEXT(search.text, index, search.textLength, c);

+ return c;

+ }

+ return U_SENTINEL;

+UChar32 codePointBefore(const USearch &search, int32_t index) {

+ if (0 < index) {

+ UChar32 c;

+ U16_PREV(search.text, 0, index, c);

+ return c;

+ }

+ return U_SENTINEL;

+} // namespace

U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,

int32_t startIdx,

int32_t *matchStart,

@@ -3998,6 +4024,34 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,

found = FALSE;

}

+ // Allow matches to end in the middle of a grapheme cluster if the following

+ // conditions are met; this is needed to make prefix search work properly in

+ // Indic, see #11750

+ // * the default breakIter is being used

+ // * the next collation element after this combining sequence

+ // - has non-zero primary weight

+ // - corresponds to a separate character following the one at end of the current match

+ // (the second of these conditions, and perhaps both, may be redundant given the

+ // subsequent check for normalization boundary; however they are likely much faster

+ // tests in any case)

+ // * the match limit is a normalization boundary

+ UBool allowMidclusterMatch = FALSE;

+ if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {

+ allowMidclusterMatch =

+ strsrch->search->breakIter == NULL &&

+ nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&

+ maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&

+ (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||

+ strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));

+ }

+ // If those conditions are met, then:

+ // * do NOT advance the candidate match limit (mLimit) to a break boundary; however

+ // the match limit may be backed off to a previous break boundary. This handles

+ // cases in which mLimit includes target characters that are ignorable with current

+ // settings (such as space) and which extend beyond the pattern match.

+ // * do NOT require that end of the combining sequence not extend beyond the match in CE space

+ // * do NOT require that match limit be on a breakIter boundary

// Advance the match end position to the first acceptable match boundary.

// This advances the index over any combining charcters.

mLimit = maxLimit;

@@ -4012,7 +4066,10 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,

mLimit = minLimit;

} else {

int32_t nba = nextBoundaryAfter(strsrch, minLimit);

- if (nba >= lastCEI->highIndex) {

+ // Note that we can have nba < maxLimit && nba >= minLImit, in which

+ // case we want to set mLimit to nba regardless of allowMidclusterMatch

+ // (i.e. we back off mLimit to the previous breakIterator boundary).

+ if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {

mLimit = nba;

}

@@ -4024,14 +4081,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,

}

#endif

- // If advancing to the end of a combining sequence in character indexing space

- // advanced us beyond the end of the match in CE space, reject this match.

- if (mLimit > maxLimit) {

- found = FALSE;

- }

+ if (!allowMidclusterMatch) {

+ // If advancing to the end of a combining sequence in character indexing space

+ // advanced us beyond the end of the match in CE space, reject this match.

+ if (mLimit > maxLimit) {

+ found = FALSE;

+ }

- if (!isBreakBoundary(strsrch, mLimit)) {

- found = FALSE;

+ if (!isBreakBoundary(strsrch, mLimit)) {

+ found = FALSE;

+ }

}

if (! checkIdentical(strsrch, mStart, mLimit)) {

@@ -4248,25 +4307,57 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,

mLimit = maxLimit = nextCEI->lowIndex;

+ // Allow matches to end in the middle of a grapheme cluster if the following

+ // conditions are met; this is needed to make prefix search work properly in

+ // Indic, see #11750

+ // * the default breakIter is being used

+ // * the next collation element after this combining sequence

+ // - has non-zero primary weight

+ // - corresponds to a separate character following the one at end of the current match

+ // (the second of these conditions, and perhaps both, may be redundant given the

+ // subsequent check for normalization boundary; however they are likely much faster

+ // tests in any case)

+ // * the match limit is a normalization boundary

+ UBool allowMidclusterMatch = FALSE;

+ if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {

+ allowMidclusterMatch =

+ strsrch->search->breakIter == NULL &&

+ nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&

+ maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&

+ (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||

+ strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));

+ }

+ // If those conditions are met, then:

+ // * do NOT advance the candidate match limit (mLimit) to a break boundary; however

+ // the match limit may be backed off to a previous break boundary. This handles

+ // cases in which mLimit includes target characters that are ignorable with current

+ // settings (such as space) and which extend beyond the pattern match.

+ // * do NOT require that end of the combining sequence not extend beyond the match in CE space

+ // * do NOT require that match limit be on a breakIter boundary

// Advance the match end position to the first acceptable match boundary.

- // This advances the index over any combining charcters.

+ // This advances the index over any combining characters.

if (minLimit < maxLimit) {

int32_t nba = nextBoundaryAfter(strsrch, minLimit);

- if (nba >= lastCEI->highIndex) {

+ // Note that we can have nba < maxLimit && nba >= minLImit, in which

+ // case we want to set mLimit to nba regardless of allowMidclusterMatch

+ // (i.e. we back off mLimit to the previous breakIterator boundary).

+ if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {

mLimit = nba;

}

- // If advancing to the end of a combining sequence in character indexing space

- // advanced us beyond the end of the match in CE space, reject this match.

- if (mLimit > maxLimit) {

- found = FALSE;

- }

+ if (!allowMidclusterMatch) {

+ // If advancing to the end of a combining sequence in character indexing space

+ // advanced us beyond the end of the match in CE space, reject this match.

+ if (mLimit > maxLimit) {

+ found = FALSE;

+ }

- // Make sure the end of the match is on a break boundary

- if (!isBreakBoundary(strsrch, mLimit)) {

- found = FALSE;

+ // Make sure the end of the match is on a break boundary

+ if (!isBreakBoundary(strsrch, mLimit)) {

+ found = FALSE;

+ }

}

} else {

« no previous file with comments | « source/i18n/uregion.cpp ('k') | source/i18n/uspoof.cpp » ('j') | no next file with comments »