Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(127)

Side by Side Diff: source/i18n/usearch.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/uregion.cpp ('k') | source/i18n/uspoof.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ********************************************************************** 2 **********************************************************************
3 * Copyright (C) 2001-2014 IBM and others. All rights reserved. 3 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
4 ********************************************************************** 4 **********************************************************************
5 * Date Name Description 5 * Date Name Description
6 * 07/02/2001 synwee Creation. 6 * 07/02/2001 synwee Creation.
7 ********************************************************************** 7 **********************************************************************
8 */ 8 */
9 9
10 #include "unicode/utypes.h" 10 #include "unicode/utypes.h"
11 11
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13 13
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
68 return UCOL_PRIMARYORDERMASK; 68 return UCOL_PRIMARYORDERMASK;
69 case UCOL_SECONDARY: 69 case UCOL_SECONDARY:
70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71 default: 71 default:
72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
73 UCOL_PRIMARYORDERMASK; 73 UCOL_PRIMARYORDERMASK;
74 } 74 }
75 } 75 }
76 76
77 /** 77 /**
78 * This is to squeeze the 21bit ces into a 256 table 78 * @param ce 32-bit collation element
79 * @param ce collation element 79 * @return hash code
80 * @return collapsed version of the collation element
81 */ 80 */
82 static 81 static
83 inline int hash(uint32_t ce) 82 inline int hashFromCE32(uint32_t ce)
84 { 83 {
85 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work 84 int hc = (int)(
86 // well with the new collation where most of the latin 1 characters 85 ((((((ce >> 24) * 37) +
87 // are of the value xx000xxx. their hashes will most of the time be 0 86 (ce >> 16)) * 37) +
88 // to be discussed on the hash algo. 87 (ce >> 8)) * 37) +
89 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; 88 ce);
89 hc %= MAX_TABLE_SIZE_;
90 if (hc < 0) {
91 hc += MAX_TABLE_SIZE_;
92 }
93 return hc;
90 } 94 }
91 95
92 U_CDECL_BEGIN 96 U_CDECL_BEGIN
93 static UBool U_CALLCONV 97 static UBool U_CALLCONV
94 usearch_cleanup(void) { 98 usearch_cleanup(void) {
95 g_nfcImpl = NULL; 99 g_nfcImpl = NULL;
96 return TRUE; 100 return TRUE;
97 } 101 }
98 U_CDECL_END 102 U_CDECL_END
99 103
(...skipping 385 matching lines...) Expand 10 before | Expand all | Expand 10 after
485 // the number of ces minus their expansion, since expansions can come 489 // the number of ces minus their expansion, since expansions can come
486 // from a character. 490 // from a character.
487 int32_t count; 491 int32_t count;
488 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 492 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
489 shift[count] = defaultforward; 493 shift[count] = defaultforward;
490 } 494 }
491 cesize --; // down to the last index 495 cesize --; // down to the last index
492 for (count = 0; count < cesize; count ++) { 496 for (count = 0; count < cesize; count ++) {
493 // number of ces from right of array to the count 497 // number of ces from right of array to the count
494 int temp = defaultforward - count - 1; 498 int temp = defaultforward - count - 1;
495 shift[hash(cetable[count])] = temp > 1 ? temp : 1; 499 shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
496 } 500 }
497 shift[hash(cetable[cesize])] = 1; 501 shift[hashFromCE32(cetable[cesize])] = 1;
498 // for ignorables we just shift by one. see test examples. 502 // for ignorables we just shift by one. see test examples.
499 shift[hash(0)] = 1; 503 shift[hashFromCE32(0)] = 1;
500 504
501 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 505 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
502 backshift[count] = defaultbackward; 506 backshift[count] = defaultbackward;
503 } 507 }
504 for (count = cesize; count > 0; count --) { 508 for (count = cesize; count > 0; count --) {
505 // the original value count does not seem to work 509 // the original value count does not seem to work
506 backshift[hash(cetable[count])] = count > expansionsize ? 510 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
507 (int16_t)(count - expansionsize) : 1; 511 (int16_t)(count - expansionsize) : 1;
508 } 512 }
509 backshift[hash(cetable[0])] = 1; 513 backshift[hashFromCE32(cetable[0])] = 1;
510 backshift[hash(0)] = 1; 514 backshift[hashFromCE32(0)] = 1;
511 } 515 }
512 516
513 /** 517 /**
514 * Building of the pattern collation element list and the boyer moore strsrch 518 * Building of the pattern collation element list and the boyer moore strsrch
515 * table. 519 * table.
516 * The canonical match will only be performed after the default match fails. 520 * The canonical match will only be performed after the default match fails.
517 * For both cases we need to remember the size of the composed and decomposed 521 * For both cases we need to remember the size of the composed and decomposed
518 * versions of the string. Since the Boyer-Moore shift calculations shifts by 522 * versions of the string. Since the Boyer-Moore shift calculations shifts by
519 * a number of characters in the text and tries to match the pattern from that 523 * a number of characters in the text and tries to match the pattern from that
520 * offset, the shift value can not be too large in case we miss some 524 * offset, the shift value can not be too large in case we miss some
(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after
723 * @return final offset 727 * @return final offset
724 */ 728 */
725 static 729 static
726 inline int32_t shiftForward(UStringSearch *strsrch, 730 inline int32_t shiftForward(UStringSearch *strsrch,
727 int32_t textoffset, 731 int32_t textoffset,
728 int32_t ce, 732 int32_t ce,
729 int32_t patternceindex) 733 int32_t patternceindex)
730 { 734 {
731 UPattern *pattern = &(strsrch->pattern); 735 UPattern *pattern = &(strsrch->pattern);
732 if (ce != UCOL_NULLORDER) { 736 if (ce != UCOL_NULLORDER) {
733 int32_t shift = pattern->shift[hash(ce)]; 737 int32_t shift = pattern->shift[hashFromCE32(ce)];
734 // this is to adjust for characters in the middle of the 738 // this is to adjust for characters in the middle of the
735 // substring for matching that failed. 739 // substring for matching that failed.
736 int32_t adjust = pattern->cesLength - patternceindex; 740 int32_t adjust = pattern->cesLength - patternceindex;
737 if (adjust > 1 && shift >= adjust) { 741 if (adjust > 1 && shift >= adjust) {
738 shift -= adjust - 1; 742 shift -= adjust - 1;
739 } 743 }
740 textoffset += shift; 744 textoffset += shift;
741 } 745 }
742 else { 746 else {
743 textoffset += pattern->defaultShiftSize; 747 textoffset += pattern->defaultShiftSize;
(...skipping 1220 matching lines...) Expand 10 before | Expand all | Expand 10 after
1964 if (strsrch->search->isOverlap) { 1968 if (strsrch->search->isOverlap) {
1965 if (textoffset != strsrch->search->textLength) { 1969 if (textoffset != strsrch->search->textLength) {
1966 textoffset --; 1970 textoffset --;
1967 } 1971 }
1968 else { 1972 else {
1969 textoffset -= strsrch->pattern.defaultShiftSize; 1973 textoffset -= strsrch->pattern.defaultShiftSize;
1970 } 1974 }
1971 } 1975 }
1972 else { 1976 else {
1973 if (ce != UCOL_NULLORDER) { 1977 if (ce != UCOL_NULLORDER) {
1974 int32_t shift = strsrch->pattern.backShift[hash(ce)]; 1978 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1975 1979
1976 // this is to adjust for characters in the middle of the substring 1980 // this is to adjust for characters in the middle of the substring
1977 // for matching that failed. 1981 // for matching that failed.
1978 int32_t adjust = patternceindex; 1982 int32_t adjust = patternceindex;
1979 if (adjust > 1 && shift > adjust) { 1983 if (adjust > 1 && shift > adjust) {
1980 shift -= adjust - 1; 1984 shift -= adjust - 1;
1981 } 1985 }
1982 textoffset -= shift; 1986 textoffset -= shift;
1983 } 1987 }
1984 else { 1988 else {
(...skipping 1813 matching lines...) Expand 10 before | Expand all | Expand 10 after
3798 U_CE_MATCH: U_CE_NO_MATCH; 3802 U_CE_MATCH: U_CE_NO_MATCH;
3799 } 3803 }
3800 3804
3801 return U_CE_MATCH; 3805 return U_CE_MATCH;
3802 } 3806 }
3803 3807
3804 #if BOYER_MOORE 3808 #if BOYER_MOORE
3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 3809 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3806 #endif 3810 #endif
3807 3811
3812 namespace {
3813
3814 UChar32 codePointAt(const USearch &search, int32_t index) {
3815 if (index < search.textLength) {
3816 UChar32 c;
3817 U16_NEXT(search.text, index, search.textLength, c);
3818 return c;
3819 }
3820 return U_SENTINEL;
3821 }
3822
3823 UChar32 codePointBefore(const USearch &search, int32_t index) {
3824 if (0 < index) {
3825 UChar32 c;
3826 U16_PREV(search.text, 0, index, c);
3827 return c;
3828 }
3829 return U_SENTINEL;
3830 }
3831
3832 } // namespace
3833
3808 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3834 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3809 int32_t startIdx, 3835 int32_t startIdx,
3810 int32_t *matchStart, 3836 int32_t *matchStart,
3811 int32_t *matchLimit, 3837 int32_t *matchLimit,
3812 UErrorCode *status) 3838 UErrorCode *status)
3813 { 3839 {
3814 if (U_FAILURE(*status)) { 3840 if (U_FAILURE(*status)) {
3815 return FALSE; 3841 return FALSE;
3816 } 3842 }
3817 3843
(...skipping 173 matching lines...) Expand 10 before | Expand all | Expand 10 after
3991 // Check for the start of the match being within an Collation Element Ex pansion, 4017 // Check for the start of the match being within an Collation Element Ex pansion,
3992 // meaning that the first char of the match is only partially matched. 4018 // meaning that the first char of the match is only partially matched.
3993 // With exapnsions, the first CE will report the index of the source 4019 // With exapnsions, the first CE will report the index of the source
3994 // character, and all subsequent (expansions) CEs will report the sour ce index of the 4020 // character, and all subsequent (expansions) CEs will report the sour ce index of the
3995 // _following_ character. 4021 // _following_ character.
3996 int32_t secondIx = firstCEI->highIndex; 4022 int32_t secondIx = firstCEI->highIndex;
3997 if (mStart == secondIx) { 4023 if (mStart == secondIx) {
3998 found = FALSE; 4024 found = FALSE;
3999 } 4025 }
4000 4026
4027 // Allow matches to end in the middle of a grapheme cluster if the follo wing
4028 // conditions are met; this is needed to make prefix search work properl y in
4029 // Indic, see #11750
4030 // * the default breakIter is being used
4031 // * the next collation element after this combining sequence
4032 // - has non-zero primary weight
4033 // - corresponds to a separate character following the one at end of t he current match
4034 // (the second of these conditions, and perhaps both, may be redundant given the
4035 // subsequent check for normalization boundary; however they are likel y much faster
4036 // tests in any case)
4037 // * the match limit is a normalization boundary
4038 UBool allowMidclusterMatch = FALSE;
4039 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLi mit) {
4040 allowMidclusterMatch =
4041 strsrch->search->breakIter == NULL &&
4042 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4043 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLi mit &&
4044 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->searc h, maxLimit)) ||
4045 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch- >search, maxLimit)));
4046 }
4047 // If those conditions are met, then:
4048 // * do NOT advance the candidate match limit (mLimit) to a break bounda ry; however
4049 // the match limit may be backed off to a previous break boundary. Thi s handles
4050 // cases in which mLimit includes target characters that are ignorable with current
4051 // settings (such as space) and which extend beyond the pattern match.
4052 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4053 // * do NOT require that match limit be on a breakIter boundary
4054
4001 // Advance the match end position to the first acceptable match boundar y. 4055 // Advance the match end position to the first acceptable match boundar y.
4002 // This advances the index over any combining charcters. 4056 // This advances the index over any combining charcters.
4003 mLimit = maxLimit; 4057 mLimit = maxLimit;
4004 if (minLimit < maxLimit) { 4058 if (minLimit < maxLimit) {
4005 // When the last CE's low index is same with its high index, the CE is likely 4059 // When the last CE's low index is same with its high index, the CE is likely
4006 // a part of expansion. In this case, the index is located just afte r the 4060 // a part of expansion. In this case, the index is located just afte r the
4007 // character corresponding to the CEs compared above. If the index i s right 4061 // character corresponding to the CEs compared above. If the index i s right
4008 // at the break boundary, move the position to the next boundary wil l result 4062 // at the break boundary, move the position to the next boundary wil l result
4009 // incorrect match length when there are ignorable characters exist between 4063 // incorrect match length when there are ignorable characters exist between
4010 // the position and the next character produces CE(s). See ticket#84 82. 4064 // the position and the next character produces CE(s). See ticket#84 82.
4011 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLi mit)) { 4065 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLi mit)) {
4012 mLimit = minLimit; 4066 mLimit = minLimit;
4013 } else { 4067 } else {
4014 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4068 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4015 if (nba >= lastCEI->highIndex) { 4069 // Note that we can have nba < maxLimit && nba >= minLImit, in w hich
4070 // case we want to set mLimit to nba regardless of allowMidclust erMatch
4071 // (i.e. we back off mLimit to the previous breakIterator bounda ry).
4072 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4016 mLimit = nba; 4073 mLimit = nba;
4017 } 4074 }
4018 } 4075 }
4019 } 4076 }
4020 4077
4021 #ifdef USEARCH_DEBUG 4078 #ifdef USEARCH_DEBUG
4022 if (getenv("USEARCH_DEBUG") != NULL) { 4079 if (getenv("USEARCH_DEBUG") != NULL) {
4023 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLim it, mLimit); 4080 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLim it, mLimit);
4024 } 4081 }
4025 #endif 4082 #endif
4026 4083
4027 // If advancing to the end of a combining sequence in character indexing space 4084 if (!allowMidclusterMatch) {
4028 // advanced us beyond the end of the match in CE space, reject this ma tch. 4085 // If advancing to the end of a combining sequence in character inde xing space
4029 if (mLimit > maxLimit) { 4086 // advanced us beyond the end of the match in CE space, reject thi s match.
4030 found = FALSE; 4087 if (mLimit > maxLimit) {
4031 } 4088 found = FALSE;
4089 }
4032 4090
4033 if (!isBreakBoundary(strsrch, mLimit)) { 4091 if (!isBreakBoundary(strsrch, mLimit)) {
4034 found = FALSE; 4092 found = FALSE;
4093 }
4035 } 4094 }
4036 4095
4037 if (! checkIdentical(strsrch, mStart, mLimit)) { 4096 if (! checkIdentical(strsrch, mStart, mLimit)) {
4038 found = FALSE; 4097 found = FALSE;
4039 } 4098 }
4040 4099
4041 if (found) { 4100 if (found) {
4042 break; 4101 break;
4043 } 4102 }
4044 } 4103 }
(...skipping 196 matching lines...) Expand 10 before | Expand all | Expand 10 after
4241 // 2. The last CE that was part of the match is in an expansion t hat extends 4300 // 2. The last CE that was part of the match is in an expansion t hat extends
4242 // to the first CE after the match. In this case, we reject th e match. 4301 // to the first CE after the match. In this case, we reject th e match.
4243 const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 4302 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4244 4303
4245 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_P ROCESSED_NULLORDER) { 4304 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_P ROCESSED_NULLORDER) {
4246 found = FALSE; 4305 found = FALSE;
4247 } 4306 }
4248 4307
4249 mLimit = maxLimit = nextCEI->lowIndex; 4308 mLimit = maxLimit = nextCEI->lowIndex;
4250 4309
4310 // Allow matches to end in the middle of a grapheme cluster if the f ollowing
4311 // conditions are met; this is needed to make prefix search work pro perly in
4312 // Indic, see #11750
4313 // * the default breakIter is being used
4314 // * the next collation element after this combining sequence
4315 // - has non-zero primary weight
4316 // - corresponds to a separate character following the one at end of the current match
4317 // (the second of these conditions, and perhaps both, may be redun dant given the
4318 // subsequent check for normalization boundary; however they are l ikely much faster
4319 // tests in any case)
4320 // * the match limit is a normalization boundary
4321 UBool allowMidclusterMatch = FALSE;
4322 if (strsrch->search->text != NULL && strsrch->search->textLength > m axLimit) {
4323 allowMidclusterMatch =
4324 strsrch->search->breakIter == NULL &&
4325 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL ) != 0 &&
4326 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > m axLimit &&
4327 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->s earch, maxLimit)) ||
4328 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strs rch->search, maxLimit)));
4329 }
4330 // If those conditions are met, then:
4331 // * do NOT advance the candidate match limit (mLimit) to a break bo undary; however
4332 // the match limit may be backed off to a previous break boundary. This handles
4333 // cases in which mLimit includes target characters that are ignor able with current
4334 // settings (such as space) and which extend beyond the pattern ma tch.
4335 // * do NOT require that end of the combining sequence not extend be yond the match in CE space
4336 // * do NOT require that match limit be on a breakIter boundary
4337
4251 // Advance the match end position to the first acceptable match bou ndary. 4338 // Advance the match end position to the first acceptable match bou ndary.
4252 // This advances the index over any combining charcters. 4339 // This advances the index over any combining characters.
4253 if (minLimit < maxLimit) { 4340 if (minLimit < maxLimit) {
4254 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4341 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4255 4342 // Note that we can have nba < maxLimit && nba >= minLImit, in w hich
4256 if (nba >= lastCEI->highIndex) { 4343 // case we want to set mLimit to nba regardless of allowMidclust erMatch
4344 // (i.e. we back off mLimit to the previous breakIterator bounda ry).
4345 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4257 mLimit = nba; 4346 mLimit = nba;
4258 } 4347 }
4259 } 4348 }
4260 4349
4261 // If advancing to the end of a combining sequence in character inde xing space 4350 if (!allowMidclusterMatch) {
4262 // advanced us beyond the end of the match in CE space, reject thi s match. 4351 // If advancing to the end of a combining sequence in character indexing space
4263 if (mLimit > maxLimit) { 4352 // advanced us beyond the end of the match in CE space, reject this match.
4264 found = FALSE; 4353 if (mLimit > maxLimit) {
4265 } 4354 found = FALSE;
4355 }
4266 4356
4267 // Make sure the end of the match is on a break boundary 4357 // Make sure the end of the match is on a break boundary
4268 if (!isBreakBoundary(strsrch, mLimit)) { 4358 if (!isBreakBoundary(strsrch, mLimit)) {
4269 found = FALSE; 4359 found = FALSE;
4360 }
4270 } 4361 }
4271 4362
4272 } else { 4363 } else {
4273 // No non-ignorable CEs after this point. 4364 // No non-ignorable CEs after this point.
4274 // The maximum position is detected by boundary after 4365 // The maximum position is detected by boundary after
4275 // the last non-ignorable CE. Combining sequence 4366 // the last non-ignorable CE. Combining sequence
4276 // across the start index will be truncated. 4367 // across the start index will be truncated.
4277 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4368 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4278 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 4369 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4279 } 4370 }
(...skipping 566 matching lines...) Expand 10 before | Expand all | Expand 10 after
4846 strsrch->search->matchedLength = end - start; 4937 strsrch->search->matchedLength = end - start;
4847 return TRUE; 4938 return TRUE;
4848 } else { 4939 } else {
4849 setMatchNotFound(strsrch); 4940 setMatchNotFound(strsrch);
4850 return FALSE; 4941 return FALSE;
4851 } 4942 }
4852 #endif 4943 #endif
4853 } 4944 }
4854 4945
4855 #endif /* #if !UCONFIG_NO_COLLATION */ 4946 #endif /* #if !UCONFIG_NO_COLLATION */
OLDNEW
« no previous file with comments | « source/i18n/uregion.cpp ('k') | source/i18n/uspoof.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698