source/i18n/usearch.cpp - Issue 1621843002: ICU 56 update step 1

Side by Side Diff: source/i18n/usearch.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 **********************************************************************	2 **********************************************************************

3 * Copyright (C) 2001-2014 IBM and others. All rights reserved.	3 * Copyright (C) 2001-2015 IBM and others. All rights reserved.

4 **********************************************************************	4 **********************************************************************

5 * Date Name Description	5 * Date Name Description

6 * 07/02/2001 synwee Creation.	6 * 07/02/2001 synwee Creation.

7 **********************************************************************	7 **********************************************************************

8 */	8 */

9	9

10 #include "unicode/utypes.h"	10 #include "unicode/utypes.h"

11	11

12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION	12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

13	13

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
68 return UCOL_PRIMARYORDERMASK;	68 return UCOL_PRIMARYORDERMASK;

69 case UCOL_SECONDARY:	69 case UCOL_SECONDARY:

70 return UCOL_SECONDARYORDERMASK \| UCOL_PRIMARYORDERMASK;	70 return UCOL_SECONDARYORDERMASK \| UCOL_PRIMARYORDERMASK;

71 default:	71 default:

72 return UCOL_TERTIARYORDERMASK \| UCOL_SECONDARYORDERMASK \|	72 return UCOL_TERTIARYORDERMASK \| UCOL_SECONDARYORDERMASK \|

73 UCOL_PRIMARYORDERMASK;	73 UCOL_PRIMARYORDERMASK;

74 }	74 }

75 }	75 }

76	76

77 /**	77 /**

78 * This is to squeeze the 21bit ces into a 256 table	78 * @param ce 32-bit collation element

79 * @param ce collation element	79 * @return hash code

80 * @return collapsed version of the collation element

81 */	80 */

82 static	81 static

83 inline int hash(uint32_t ce)	82 inline int hashFromCE32(uint32_t ce)

84 {	83 {

85 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work	84 int hc = (int)(

86 // well with the new collation where most of the latin 1 characters	85 ((((((ce >> 24) * 37) +

87 // are of the value xx000xxx. their hashes will most of the time be 0	86 (ce >> 16)) * 37) +

88 // to be discussed on the hash algo.	87 (ce >> 8)) * 37) +

89 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;	88 ce);

	89 hc %= MAX_TABLE_SIZE_;

	90 if (hc < 0) {

	91 hc += MAX_TABLE_SIZE_;

	92 }

	93 return hc;

90 }	94 }

91	95

92 U_CDECL_BEGIN	96 U_CDECL_BEGIN

93 static UBool U_CALLCONV	97 static UBool U_CALLCONV

94 usearch_cleanup(void) {	98 usearch_cleanup(void) {

95 g_nfcImpl = NULL;	99 g_nfcImpl = NULL;

96 return TRUE;	100 return TRUE;

97 }	101 }

98 U_CDECL_END	102 U_CDECL_END

99	103

(...skipping 385 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
485 // the number of ces minus their expansion, since expansions can come	489 // the number of ces minus their expansion, since expansions can come

486 // from a character.	490 // from a character.

487 int32_t count;	491 int32_t count;

488 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {	492 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {

489 shift[count] = defaultforward;	493 shift[count] = defaultforward;

490 }	494 }

491 cesize --; // down to the last index	495 cesize --; // down to the last index

492 for (count = 0; count < cesize; count ++) {	496 for (count = 0; count < cesize; count ++) {

493 // number of ces from right of array to the count	497 // number of ces from right of array to the count

494 int temp = defaultforward - count - 1;	498 int temp = defaultforward - count - 1;

495 shift[hash(cetable[count])] = temp > 1 ? temp : 1;	499 shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;

496 }	500 }

497 shift[hash(cetable[cesize])] = 1;	501 shift[hashFromCE32(cetable[cesize])] = 1;

498 // for ignorables we just shift by one. see test examples.	502 // for ignorables we just shift by one. see test examples.

499 shift[hash(0)] = 1;	503 shift[hashFromCE32(0)] = 1;

500	504

501 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {	505 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {

502 backshift[count] = defaultbackward;	506 backshift[count] = defaultbackward;

503 }	507 }

504 for (count = cesize; count > 0; count --) {	508 for (count = cesize; count > 0; count --) {

505 // the original value count does not seem to work	509 // the original value count does not seem to work

506 backshift[hash(cetable[count])] = count > expansionsize ?	510 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?

507 (int16_t)(count - expansionsize) : 1;	511 (int16_t)(count - expansionsize) : 1;

508 }	512 }

509 backshift[hash(cetable[0])] = 1;	513 backshift[hashFromCE32(cetable[0])] = 1;

510 backshift[hash(0)] = 1;	514 backshift[hashFromCE32(0)] = 1;

511 }	515 }

512	516

513 /**	517 /**

514 * Building of the pattern collation element list and the boyer moore strsrch	518 * Building of the pattern collation element list and the boyer moore strsrch

515 * table.	519 * table.

516 * The canonical match will only be performed after the default match fails.	520 * The canonical match will only be performed after the default match fails.

517 * For both cases we need to remember the size of the composed and decomposed	521 * For both cases we need to remember the size of the composed and decomposed

518 * versions of the string. Since the Boyer-Moore shift calculations shifts by	522 * versions of the string. Since the Boyer-Moore shift calculations shifts by

519 * a number of characters in the text and tries to match the pattern from that	523 * a number of characters in the text and tries to match the pattern from that

520 * offset, the shift value can not be too large in case we miss some	524 * offset, the shift value can not be too large in case we miss some

(...skipping 202 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
723 * @return final offset	727 * @return final offset

724 */	728 */

725 static	729 static

726 inline int32_t shiftForward(UStringSearch *strsrch,	730 inline int32_t shiftForward(UStringSearch *strsrch,

727 int32_t textoffset,	731 int32_t textoffset,

728 int32_t ce,	732 int32_t ce,

729 int32_t patternceindex)	733 int32_t patternceindex)

730 {	734 {

731 UPattern *pattern = &(strsrch->pattern);	735 UPattern *pattern = &(strsrch->pattern);

732 if (ce != UCOL_NULLORDER) {	736 if (ce != UCOL_NULLORDER) {

733 int32_t shift = pattern->shift[hash(ce)];	737 int32_t shift = pattern->shift[hashFromCE32(ce)];

734 // this is to adjust for characters in the middle of the	738 // this is to adjust for characters in the middle of the

735 // substring for matching that failed.	739 // substring for matching that failed.

736 int32_t adjust = pattern->cesLength - patternceindex;	740 int32_t adjust = pattern->cesLength - patternceindex;

737 if (adjust > 1 && shift >= adjust) {	741 if (adjust > 1 && shift >= adjust) {

738 shift -= adjust - 1;	742 shift -= adjust - 1;

739 }	743 }

740 textoffset += shift;	744 textoffset += shift;

741 }	745 }

742 else {	746 else {

743 textoffset += pattern->defaultShiftSize;	747 textoffset += pattern->defaultShiftSize;

(...skipping 1220 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1964 if (strsrch->search->isOverlap) {	1968 if (strsrch->search->isOverlap) {

1965 if (textoffset != strsrch->search->textLength) {	1969 if (textoffset != strsrch->search->textLength) {

1966 textoffset --;	1970 textoffset --;

1967 }	1971 }

1968 else {	1972 else {

1969 textoffset -= strsrch->pattern.defaultShiftSize;	1973 textoffset -= strsrch->pattern.defaultShiftSize;

1970 }	1974 }

1971 }	1975 }

1972 else {	1976 else {

1973 if (ce != UCOL_NULLORDER) {	1977 if (ce != UCOL_NULLORDER) {

1974 int32_t shift = strsrch->pattern.backShift[hash(ce)];	1978 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];

1975	1979

1976 // this is to adjust for characters in the middle of the substring	1980 // this is to adjust for characters in the middle of the substring

1977 // for matching that failed.	1981 // for matching that failed.

1978 int32_t adjust = patternceindex;	1982 int32_t adjust = patternceindex;

1979 if (adjust > 1 && shift > adjust) {	1983 if (adjust > 1 && shift > adjust) {

1980 shift -= adjust - 1;	1984 shift -= adjust - 1;

1981 }	1985 }

1982 textoffset -= shift;	1986 textoffset -= shift;

1983 }	1987 }

1984 else {	1988 else {

(...skipping 1813 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3798 U_CE_MATCH: U_CE_NO_MATCH;	3802 U_CE_MATCH: U_CE_NO_MATCH;

3799 }	3803 }

3800	3804

3801 return U_CE_MATCH;	3805 return U_CE_MATCH;

3802 }	3806 }

3803	3807

3804 #if BOYER_MOORE	3808 #if BOYER_MOORE

3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s	3809 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s

3806 #endif	3810 #endif

3807	3811

	3812 namespace {

	3813

	3814 UChar32 codePointAt(const USearch &search, int32_t index) {

	3815 if (index < search.textLength) {

	3816 UChar32 c;

	3817 U16_NEXT(search.text, index, search.textLength, c);

	3818 return c;

	3819 }

	3820 return U_SENTINEL;

	3821 }

	3822

	3823 UChar32 codePointBefore(const USearch &search, int32_t index) {

	3824 if (0 < index) {

	3825 UChar32 c;

	3826 U16_PREV(search.text, 0, index, c);

	3827 return c;

	3828 }

	3829 return U_SENTINEL;

	3830 }

	3831

	3832 } // namespace

	3833

3808 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,	3834 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,

3809 int32_t startIdx,	3835 int32_t startIdx,

3810 int32_t *matchStart,	3836 int32_t *matchStart,

3811 int32_t *matchLimit,	3837 int32_t *matchLimit,

3812 UErrorCode *status)	3838 UErrorCode *status)

3813 {	3839 {

3814 if (U_FAILURE(*status)) {	3840 if (U_FAILURE(*status)) {

3815 return FALSE;	3841 return FALSE;

3816 }	3842 }

3817	3843

(...skipping 173 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3991 // Check for the start of the match being within an Collation Element Ex pansion,	4017 // Check for the start of the match being within an Collation Element Ex pansion,

3992 // meaning that the first char of the match is only partially matched.	4018 // meaning that the first char of the match is only partially matched.

3993 // With exapnsions, the first CE will report the index of the source	4019 // With exapnsions, the first CE will report the index of the source

3994 // character, and all subsequent (expansions) CEs will report the sour ce index of the	4020 // character, and all subsequent (expansions) CEs will report the sour ce index of the

3995 // _following_ character.	4021 // _following_ character.

3996 int32_t secondIx = firstCEI->highIndex;	4022 int32_t secondIx = firstCEI->highIndex;

3997 if (mStart == secondIx) {	4023 if (mStart == secondIx) {

3998 found = FALSE;	4024 found = FALSE;

3999 }	4025 }

4000	4026

	4027 // Allow matches to end in the middle of a grapheme cluster if the follo wing

	4028 // conditions are met; this is needed to make prefix search work properl y in

	4029 // Indic, see #11750

	4030 // * the default breakIter is being used

	4031 // * the next collation element after this combining sequence

	4032 // - has non-zero primary weight

	4033 // - corresponds to a separate character following the one at end of t he current match

	4034 // (the second of these conditions, and perhaps both, may be redundant given the

	4035 // subsequent check for normalization boundary; however they are likel y much faster

	4036 // tests in any case)

	4037 // * the match limit is a normalization boundary

	4038 UBool allowMidclusterMatch = FALSE;

	4039 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLi mit) {

	4040 allowMidclusterMatch =

	4041 strsrch->search->breakIter == NULL &&

	4042 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&

	4043 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLi mit &&

	4044 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->searc h, maxLimit)) \|\|

	4045 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch- >search, maxLimit)));

	4046 }

	4047 // If those conditions are met, then:

	4048 // * do NOT advance the candidate match limit (mLimit) to a break bounda ry; however

	4049 // the match limit may be backed off to a previous break boundary. Thi s handles

	4050 // cases in which mLimit includes target characters that are ignorable with current

	4051 // settings (such as space) and which extend beyond the pattern match.

	4052 // * do NOT require that end of the combining sequence not extend beyond the match in CE space

	4053 // * do NOT require that match limit be on a breakIter boundary

	4054

4001 // Advance the match end position to the first acceptable match boundar y.	4055 // Advance the match end position to the first acceptable match boundar y.

4002 // This advances the index over any combining charcters.	4056 // This advances the index over any combining charcters.

4003 mLimit = maxLimit;	4057 mLimit = maxLimit;

4004 if (minLimit < maxLimit) {	4058 if (minLimit < maxLimit) {

4005 // When the last CE's low index is same with its high index, the CE is likely	4059 // When the last CE's low index is same with its high index, the CE is likely

4006 // a part of expansion. In this case, the index is located just afte r the	4060 // a part of expansion. In this case, the index is located just afte r the

4007 // character corresponding to the CEs compared above. If the index i s right	4061 // character corresponding to the CEs compared above. If the index i s right

4008 // at the break boundary, move the position to the next boundary wil l result	4062 // at the break boundary, move the position to the next boundary wil l result

4009 // incorrect match length when there are ignorable characters exist between	4063 // incorrect match length when there are ignorable characters exist between

4010 // the position and the next character produces CE(s). See ticket#84 82.	4064 // the position and the next character produces CE(s). See ticket#84 82.

4011 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLi mit)) {	4065 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLi mit)) {

4012 mLimit = minLimit;	4066 mLimit = minLimit;

4013 } else {	4067 } else {

4014 int32_t nba = nextBoundaryAfter(strsrch, minLimit);	4068 int32_t nba = nextBoundaryAfter(strsrch, minLimit);

4015 if (nba >= lastCEI->highIndex) {	4069 // Note that we can have nba < maxLimit && nba >= minLImit, in w hich

	4070 // case we want to set mLimit to nba regardless of allowMidclust erMatch

	4071 // (i.e. we back off mLimit to the previous breakIterator bounda ry).

	4072 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch \|\| nba < maxLimit)) {

4016 mLimit = nba;	4073 mLimit = nba;

4017 }	4074 }

4018 }	4075 }

4019 }	4076 }

4020	4077

4021 #ifdef USEARCH_DEBUG	4078 #ifdef USEARCH_DEBUG

4022 if (getenv("USEARCH_DEBUG") != NULL) {	4079 if (getenv("USEARCH_DEBUG") != NULL) {

4023 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLim it, mLimit);	4080 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLim it, mLimit);

4024 }	4081 }

4025 #endif	4082 #endif

4026	4083

4027 // If advancing to the end of a combining sequence in character indexing space	4084 if (!allowMidclusterMatch) {

4028 // advanced us beyond the end of the match in CE space, reject this ma tch.	4085 // If advancing to the end of a combining sequence in character inde xing space

4029 if (mLimit > maxLimit) {	4086 // advanced us beyond the end of the match in CE space, reject thi s match.

4030 found = FALSE;	4087 if (mLimit > maxLimit) {

4031 }	4088 found = FALSE;

	4089 }

4032	4090

4033 if (!isBreakBoundary(strsrch, mLimit)) {	4091 if (!isBreakBoundary(strsrch, mLimit)) {

4034 found = FALSE;	4092 found = FALSE;

	4093 }

4035 }	4094 }

4036	4095

4037 if (! checkIdentical(strsrch, mStart, mLimit)) {	4096 if (! checkIdentical(strsrch, mStart, mLimit)) {

4038 found = FALSE;	4097 found = FALSE;

4039 }	4098 }

4040	4099

4041 if (found) {	4100 if (found) {

4042 break;	4101 break;

4043 }	4102 }

4044 }	4103 }

(...skipping 196 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4241 // 2. The last CE that was part of the match is in an expansion t hat extends	4300 // 2. The last CE that was part of the match is in an expansion t hat extends

4242 // to the first CE after the match. In this case, we reject th e match.	4301 // to the first CE after the match. In this case, we reject th e match.

4243 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);	4302 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);

4244	4303

4245 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_P ROCESSED_NULLORDER) {	4304 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_P ROCESSED_NULLORDER) {

4246 found = FALSE;	4305 found = FALSE;

4247 }	4306 }

4248	4307

4249 mLimit = maxLimit = nextCEI->lowIndex;	4308 mLimit = maxLimit = nextCEI->lowIndex;

4250	4309

	4310 // Allow matches to end in the middle of a grapheme cluster if the f ollowing

	4311 // conditions are met; this is needed to make prefix search work pro perly in

	4312 // Indic, see #11750

	4313 // * the default breakIter is being used

	4314 // * the next collation element after this combining sequence

	4315 // - has non-zero primary weight

	4316 // - corresponds to a separate character following the one at end of the current match

	4317 // (the second of these conditions, and perhaps both, may be redun dant given the

	4318 // subsequent check for normalization boundary; however they are l ikely much faster

	4319 // tests in any case)

	4320 // * the match limit is a normalization boundary

	4321 UBool allowMidclusterMatch = FALSE;

	4322 if (strsrch->search->text != NULL && strsrch->search->textLength > m axLimit) {

	4323 allowMidclusterMatch =

	4324 strsrch->search->breakIter == NULL &&

	4325 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL ) != 0 &&

	4326 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > m axLimit &&

	4327 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->s earch, maxLimit)) \|\|

	4328 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strs rch->search, maxLimit)));

	4329 }

	4330 // If those conditions are met, then:

	4331 // * do NOT advance the candidate match limit (mLimit) to a break bo undary; however

	4332 // the match limit may be backed off to a previous break boundary. This handles

	4333 // cases in which mLimit includes target characters that are ignor able with current

	4334 // settings (such as space) and which extend beyond the pattern ma tch.

	4335 // * do NOT require that end of the combining sequence not extend be yond the match in CE space

	4336 // * do NOT require that match limit be on a breakIter boundary

	4337

4251 // Advance the match end position to the first acceptable match bou ndary.	4338 // Advance the match end position to the first acceptable match bou ndary.

4252 // This advances the index over any combining charcters.	4339 // This advances the index over any combining characters.

4253 if (minLimit < maxLimit) {	4340 if (minLimit < maxLimit) {

4254 int32_t nba = nextBoundaryAfter(strsrch, minLimit);	4341 int32_t nba = nextBoundaryAfter(strsrch, minLimit);

4255	4342 // Note that we can have nba < maxLimit && nba >= minLImit, in w hich

4256 if (nba >= lastCEI->highIndex) {	4343 // case we want to set mLimit to nba regardless of allowMidclust erMatch

	4344 // (i.e. we back off mLimit to the previous breakIterator bounda ry).

	4345 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch \|\| nba < maxLimit)) {

4257 mLimit = nba;	4346 mLimit = nba;

4258 }	4347 }

4259 }	4348 }

4260	4349

4261 // If advancing to the end of a combining sequence in character inde xing space	4350 if (!allowMidclusterMatch) {

4262 // advanced us beyond the end of the match in CE space, reject thi s match.	4351 // If advancing to the end of a combining sequence in character indexing space

4263 if (mLimit > maxLimit) {	4352 // advanced us beyond the end of the match in CE space, reject this match.

4264 found = FALSE;	4353 if (mLimit > maxLimit) {

4265 }	4354 found = FALSE;

	4355 }

4266	4356

4267 // Make sure the end of the match is on a break boundary	4357 // Make sure the end of the match is on a break boundary

4268 if (!isBreakBoundary(strsrch, mLimit)) {	4358 if (!isBreakBoundary(strsrch, mLimit)) {

4269 found = FALSE;	4359 found = FALSE;

	4360 }

4270 }	4361 }

4271	4362

4272 } else {	4363 } else {

4273 // No non-ignorable CEs after this point.	4364 // No non-ignorable CEs after this point.

4274 // The maximum position is detected by boundary after	4365 // The maximum position is detected by boundary after

4275 // the last non-ignorable CE. Combining sequence	4366 // the last non-ignorable CE. Combining sequence

4276 // across the start index will be truncated.	4367 // across the start index will be truncated.

4277 int32_t nba = nextBoundaryAfter(strsrch, minLimit);	4368 int32_t nba = nextBoundaryAfter(strsrch, minLimit);

4278 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;	4369 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;

4279 }	4370 }

(...skipping 566 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4846 strsrch->search->matchedLength = end - start;	4937 strsrch->search->matchedLength = end - start;

4847 return TRUE;	4938 return TRUE;

4848 } else {	4939 } else {

4849 setMatchNotFound(strsrch);	4940 setMatchNotFound(strsrch);

4850 return FALSE;	4941 return FALSE;

4851 }	4942 }

4852 #endif	4943 #endif

4853 }	4944 }

4854	4945

4855 #endif /* #if !UCONFIG_NO_COLLATION */	4946 #endif /* #if !UCONFIG_NO_COLLATION */

OLD	NEW

« no previous file with comments | « source/i18n/uregion.cpp ('k') | source/i18n/uspoof.cpp » ('j') | no next file with comments »