icu46/source/i18n/usearch.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/usearch.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2001-2010 IBM and others. All rights reserved.

	4 **********************************************************************

	5 * Date Name Description

	6 * 07/02/2001 synwee Creation.

	7 **********************************************************************

	8 */

	9

	10 #include "unicode/utypes.h"

	11

	12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

	13

	14 #include "unicode/usearch.h"

	15 #include "unicode/ustring.h"

	16 #include "unicode/uchar.h"

	17 #include "normalizer2impl.h"

	18 #include "ucol_imp.h"

	19 #include "usrchimp.h"

	20 #include "cmemory.h"

	21 #include "ucln_in.h"

	22 #include "uassert.h"

	23 #include "ustr_imp.h"

	24

	25 U_NAMESPACE_USE

	26

	27 // don't use Boyer-Moore

	28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)

	29 #define BOYER_MOORE 0

	30

	31 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

	32

	33 // internal definition ---------------------------------------------------

	34

	35 #define LAST_BYTE_MASK_ 0xFF

	36 #define SECOND_LAST_BYTE_SHIFT_ 8

	37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000

	38

	39 static const uint16_t *fcdTrieIndex = NULL;

	40 static UChar32 fcdHighStart = 0;

	41

	42 // internal methods -------------------------------------------------

	43

	44 /**

	45 * Fast collation element iterator setOffset.

	46 * This function does not check for bounds.

	47 * @param coleiter collation element iterator

	48 * @param offset to set

	49 */

	50 static

	51 inline void setColEIterOffset(UCollationElements *elems,

	52 int32_t offset)

	53 {

	54 collIterate *ci = &(elems->iteratordata_);

	55 ci->pos = ci->string + offset;

	56 ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;

	57 if (ci->flags & UCOL_ITER_INNORMBUF) {

	58 ci->flags = ci->origFlags;

	59 }

	60 ci->fcdPosition = NULL;

	61

	62 ci->offsetReturn = NULL;

	63 ci->offsetStore = ci->offsetBuffer;

	64 ci->offsetRepeatCount = ci->offsetRepeatValue = 0;

	65 }

	66

	67 /**

	68 * Getting the mask for collation strength

	69 * @param strength collation strength

	70 * @return collation element mask

	71 */

	72 static

	73 inline uint32_t getMask(UCollationStrength strength)

	74 {

	75 switch (strength)

	76 {

	77 case UCOL_PRIMARY:

	78 return UCOL_PRIMARYORDERMASK;

	79 case UCOL_SECONDARY:

	80 return UCOL_SECONDARYORDERMASK \| UCOL_PRIMARYORDERMASK;

	81 default:

	82 return UCOL_TERTIARYORDERMASK \| UCOL_SECONDARYORDERMASK \|

	83 UCOL_PRIMARYORDERMASK;

	84 }

	85 }

	86

	87 /**

	88 * This is to squeeze the 21bit ces into a 256 table

	89 * @param ce collation element

	90 * @return collapsed version of the collation element

	91 */

	92 static

	93 inline int hash(uint32_t ce)

	94 {

	95 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work

	96 // well with the new collation where most of the latin 1 characters

	97 // are of the value xx000xxx. their hashes will most of the time be 0

	98 // to be discussed on the hash algo.

	99 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;

	100 }

	101

	102 U_CDECL_BEGIN

	103 static UBool U_CALLCONV

	104 usearch_cleanup(void) {

	105 fcdTrieIndex = NULL;

	106 return TRUE;

	107 }

	108 U_CDECL_END

	109

	110 /**

	111 * Initializing the fcd tables.

	112 * Internal method, status assumed to be a success.

	113 * @param status output error if any, caller to check status before calling

	114 * method, status assumed to be success when passed in.

	115 */

	116 static

	117 inline void initializeFCD(UErrorCode *status)

	118 {

	119 if (fcdTrieIndex == NULL) {

	120 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);

	121 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);

	122 }

	123 }

	124

	125 /**

	126 * Gets the fcd value for a character at the argument index.

	127 * This method takes into accounts of the supplementary characters.

	128 * @param str UTF16 string where character for fcd retrieval resides

	129 * @param offset position of the character whose fcd is to be retrieved, to be

	130 * overwritten with the next character position, taking

	131 * surrogate characters into consideration.

	132 * @param strlength length of the argument string

	133 * @return fcd value

	134 */

	135 static

	136 uint16_t getFCD(const UChar str, int32_t offset,

	137 int32_t strlength)

	138 {

	139 const UChar temp = str + offset;

	140 uint16_t result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength);

	141 *offset = (int32_t)(temp - str);

	142 return result;

	143 }

	144

	145 /**

	146 * Getting the modified collation elements taking into account the collation

	147 * attributes

	148 * @param strsrch string search data

	149 * @param sourcece

	150 * @return the modified collation element

	151 */

	152 static

	153 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)

	154 {

	155 // note for tertiary we can't use the collator->tertiaryMask, that

	156 // is a preprocessed mask that takes into account case options. since

	157 // we are only concerned with exact matches, we don't need that.

	158 sourcece &= strsrch->ceMask;

	159

	160 if (strsrch->toShift) {

	161 // alternate handling here, since only the 16 most significant digits

	162 // is only used, we can safely do a compare without masking

	163 // if the ce is a variable, we mask and get only the primary values

	164 // no shifting to quartenary is required since all primary values

	165 // less than variabletop will need to be masked off anyway.

	166 if (strsrch->variableTop > sourcece) {

	167 if (strsrch->strength >= UCOL_QUATERNARY) {

	168 sourcece &= UCOL_PRIMARYORDERMASK;

	169 }

	170 else {

	171 sourcece = UCOL_IGNORABLE;

	172 }

	173 }

	174 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABL E) {

	175 sourcece = 0xFFFF;

	176 }

	177

	178 return sourcece;

	179 }

	180

	181 /**

	182 * Allocate a memory and returns NULL if it failed.

	183 * Internal method, status assumed to be a success.

	184 * @param size to allocate

	185 * @param status output error if any, caller to check status before calling

	186 * method, status assumed to be success when passed in.

	187 * @return newly allocated array, NULL otherwise

	188 */

	189 static

	190 inline void * allocateMemory(uint32_t size, UErrorCode *status)

	191 {

	192 uint32_t result = (uint32_t )uprv_malloc(size);

	193 if (result == NULL) {

	194 *status = U_MEMORY_ALLOCATION_ERROR;

	195 }

	196 return result;

	197 }

	198

	199 /**

	200 * Adds a uint32_t value to a destination array.

	201 * Creates a new array if we run out of space. The caller will have to

	202 * manually deallocate the newly allocated array.

	203 * Internal method, status assumed to be success, caller has to check status

	204 * before calling this method. destination not to be NULL and has at least

	205 * size destinationlength.

	206 * @param destination target array

	207 * @param offset destination offset to add value

	208 * @param destinationlength target array size, return value for the new size

	209 * @param value to be added

	210 * @param increments incremental size expected

	211 * @param status output error if any, caller to check status before calling

	212 * method, status assumed to be success when passed in.

	213 * @return new destination array, destination if there was no new allocation

	214 */

	215 static

	216 inline int32_t * addTouint32_tArray(int32_t *destination,

	217 uint32_t offset,

	218 uint32_t *destinationlength,

	219 uint32_t value,

	220 uint32_t increments,

	221 UErrorCode *status)

	222 {

	223 uint32_t newlength = *destinationlength;

	224 if (offset + 1 == newlength) {

	225 newlength += increments;

	226 int32_t temp = (int32_t )allocateMemory(

	227 sizeof(int32_t) * newlength, status);

	228 if (U_FAILURE(*status)) {

	229 return NULL;

	230 }

	231 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);

	232 *destinationlength = newlength;

	233 destination = temp;

	234 }

	235 destination[offset] = value;

	236 return destination;

	237 }

	238

	239 /**

	240 * Adds a uint64_t value to a destination array.

	241 * Creates a new array if we run out of space. The caller will have to

	242 * manually deallocate the newly allocated array.

	243 * Internal method, status assumed to be success, caller has to check status

	244 * before calling this method. destination not to be NULL and has at least

	245 * size destinationlength.

	246 * @param destination target array

	247 * @param offset destination offset to add value

	248 * @param destinationlength target array size, return value for the new size

	249 * @param value to be added

	250 * @param increments incremental size expected

	251 * @param status output error if any, caller to check status before calling

	252 * method, status assumed to be success when passed in.

	253 * @return new destination array, destination if there was no new allocation

	254 */

	255 static

	256 inline int64_t * addTouint64_tArray(int64_t *destination,

	257 uint32_t offset,

	258 uint32_t *destinationlength,

	259 uint64_t value,

	260 uint32_t increments,

	261 UErrorCode *status)

	262 {

	263 uint32_t newlength = *destinationlength;

	264 if (offset + 1 == newlength) {

	265 newlength += increments;

	266 int64_t temp = (int64_t )allocateMemory(

	267 sizeof(int64_t) * newlength, status);

	268

	269 if (U_FAILURE(*status)) {

	270 return NULL;

	271 }

	272

	273 uprv_memcpy(temp, destination, sizeof(int64_t) * offset);

	274 *destinationlength = newlength;

	275 destination = temp;

	276 }

	277

	278 destination[offset] = value;

	279

	280 return destination;

	281 }

	282

	283 /**

	284 * Initializing the ce table for a pattern.

	285 * Stores non-ignorable collation keys.

	286 * Table size will be estimated by the size of the pattern text. Table

	287 * expansion will be perform as we go along. Adding 1 to ensure that the table

	288 * size definitely increases.

	289 * Internal method, status assumed to be a success.

	290 * @param strsrch string search data

	291 * @param status output error if any, caller to check status before calling

	292 * method, status assumed to be success when passed in.

	293 * @return total number of expansions

	294 */

	295 static

	296 inline uint16_t initializePatternCETable(UStringSearch *strsrch,

	297 UErrorCode *status)

	298 {

	299 UPattern *pattern = &(strsrch->pattern);

	300 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;

	301 int32_t *cetable = pattern->CEBuffer;

	302 uint32_t patternlength = pattern->textLength;

	303 UCollationElements *coleiter = strsrch->utilIter;

	304

	305 if (coleiter == NULL) {

	306 coleiter = ucol_openElements(strsrch->collator, pattern->text,

	307 patternlength, status);

	308 // status will be checked in ucol_next(..) later and if it is an

	309 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be

	310 // returned.

	311 strsrch->utilIter = coleiter;

	312 }

	313 else {

	314 uprv_init_collIterate(strsrch->collator, pattern->text,

	315 pattern->textLength,

	316 &coleiter->iteratordata_,

	317 status);

	318 }

	319 if(U_FAILURE(*status)) {

	320 return 0;

	321 }

	322

	323 if (pattern->CE != cetable && pattern->CE) {

	324 uprv_free(pattern->CE);

	325 }

	326

	327 uint16_t offset = 0;

	328 uint16_t result = 0;

	329 int32_t ce;

	330

	331 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&

	332 U_SUCCESS(*status)) {

	333 uint32_t newce = getCE(strsrch, ce);

	334 if (newce) {

	335 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,

	336 newce,

	337 patternlength - ucol_getOffset(coleiter) + 1,

	338 status);

	339 if (U_FAILURE(*status)) {

	340 return 0;

	341 }

	342 offset ++;

	343 if (cetable != temp && cetable != pattern->CEBuffer) {

	344 uprv_free(cetable);

	345 }

	346 cetable = temp;

	347 }

	348 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);

	349 }

	350

	351 cetable[offset] = 0;

	352 pattern->CE = cetable;

	353 pattern->CELength = offset;

	354

	355 return result;

	356 }

	357

	358 /**

	359 * Initializing the pce table for a pattern.

	360 * Stores non-ignorable collation keys.

	361 * Table size will be estimated by the size of the pattern text. Table

	362 * expansion will be perform as we go along. Adding 1 to ensure that the table

	363 * size definitely increases.

	364 * Internal method, status assumed to be a success.

	365 * @param strsrch string search data

	366 * @param status output error if any, caller to check status before calling

	367 * method, status assumed to be success when passed in.

	368 * @return total number of expansions

	369 */

	370 static

	371 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,

	372 UErrorCode *status)

	373 {

	374 UPattern *pattern = &(strsrch->pattern);

	375 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;

	376 int64_t *pcetable = pattern->PCEBuffer;

	377 uint32_t patternlength = pattern->textLength;

	378 UCollationElements *coleiter = strsrch->utilIter;

	379

	380 if (coleiter == NULL) {

	381 coleiter = ucol_openElements(strsrch->collator, pattern->text,

	382 patternlength, status);

	383 // status will be checked in ucol_next(..) later and if it is an

	384 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be

	385 // returned.

	386 strsrch->utilIter = coleiter;

	387 } else {

	388 uprv_init_collIterate(strsrch->collator, pattern->text,

	389 pattern->textLength,

	390 &coleiter->iteratordata_,

	391 status);

	392 }

	393 if(U_FAILURE(*status)) {

	394 return 0;

	395 }

	396

	397 if (pattern->PCE != pcetable && pattern->PCE != NULL) {

	398 uprv_free(pattern->PCE);

	399 }

	400

	401 uint16_t offset = 0;

	402 uint16_t result = 0;

	403 int64_t pce;

	404

	405 uprv_init_pce(coleiter);

	406

	407 // ** Should processed CEs be signed or unsigned?

	408 // ** (the rest of the code in this file seems to play fast-and-loose with

	409 // ** whether a CE is signed or unsigned. For example, look at routine abov e this one.)

	410 while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROC ESSED_NULLORDER &&

	411 U_SUCCESS(*status)) {

	412 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,

	413 pce,

	414 patternlength - ucol_getOffset(coleiter) + 1,

	415 status);

	416

	417 if (U_FAILURE(*status)) {

	418 return 0;

	419 }

	420

	421 offset += 1;

	422

	423 if (pcetable != temp && pcetable != pattern->PCEBuffer) {

	424 uprv_free(pcetable);

	425 }

	426

	427 pcetable = temp;

	428 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);

	429 }

	430

	431 pcetable[offset] = 0;

	432 pattern->PCE = pcetable;

	433 pattern->PCELength = offset;

	434

	435 return result;

	436 }

	437

	438 /**

	439 * Initializes the pattern struct.

	440 * Internal method, status assumed to be success.

	441 * @param strsrch UStringSearch data storage

	442 * @param status output error if any, caller to check status before calling

	443 * method, status assumed to be success when passed in.

	444 * @return expansionsize the total expansion size of the pattern

	445 */

	446 static

	447 inline int16_t initializePattern(UStringSearch strsrch, UErrorCode status)

	448 {

	449 UPattern *pattern = &(strsrch->pattern);

	450 const UChar *patterntext = pattern->text;

	451 int32_t length = pattern->textLength;

	452 int32_t index = 0;

	453

	454 // Since the strength is primary, accents are ignored in the pattern.

	455 if (strsrch->strength == UCOL_PRIMARY) {

	456 pattern->hasPrefixAccents = 0;

	457 pattern->hasSuffixAccents = 0;

	458 } else {

	459 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>

	460 SECOND_LAST_BYTE_SH IFT_;

	461 index = length;

	462 UTF_BACK_1(patterntext, 0, index);

	463 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &

	464 LAST_BYTE_M ASK_;

	465 }

	466

	467 // HACK

	468 if (strsrch->pattern.PCE != NULL) {

	469 if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {

	470 uprv_free(strsrch->pattern.PCE);

	471 }

	472

	473 strsrch->pattern.PCE = NULL;

	474 }

	475

	476 // since intializePattern is an internal method status is a success.

	477 return initializePatternCETable(strsrch, status);

	478 }

	479

	480 /**

	481 * Initializing shift tables, with the default values.

	482 * If a corresponding default value is 0, the shift table is not set.

	483 * @param shift table for forwards shift

	484 * @param backshift table for backwards shift

	485 * @param cetable table containing pattern ce

	486 * @param cesize size of the pattern ces

	487 * @param expansionsize total size of the expansions

	488 * @param defaultforward the default forward value

	489 * @param defaultbackward the default backward value

	490 */

	491 static

	492 inline void setShiftTable(int16_t shift[], int16_t backshift[],

	493 int32_t *cetable, int32_t cesize,

	494 int16_t expansionsize,

	495 int16_t defaultforward,

	496 int16_t defaultbackward)

	497 {

	498 // estimate the value to shift. to do that we estimate the smallest

	499 // number of characters to give the relevant ces, ie approximately

	500 // the number of ces minus their expansion, since expansions can come

	501 // from a character.

	502 int32_t count;

	503 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {

	504 shift[count] = defaultforward;

	505 }

	506 cesize --; // down to the last index

	507 for (count = 0; count < cesize; count ++) {

	508 // number of ces from right of array to the count

	509 int temp = defaultforward - count - 1;

	510 shift[hash(cetable[count])] = temp > 1 ? temp : 1;

	511 }

	512 shift[hash(cetable[cesize])] = 1;

	513 // for ignorables we just shift by one. see test examples.

	514 shift[hash(0)] = 1;

	515

	516 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {

	517 backshift[count] = defaultbackward;

	518 }

	519 for (count = cesize; count > 0; count --) {

	520 // the original value count does not seem to work

	521 backshift[hash(cetable[count])] = count > expansionsize ?

	522 (int16_t)(count - expansionsize) : 1;

	523 }

	524 backshift[hash(cetable[0])] = 1;

	525 backshift[hash(0)] = 1;

	526 }

	527

	528 /**

	529 * Building of the pattern collation element list and the boyer moore strsrch

	530 * table.

	531 * The canonical match will only be performed after the default match fails.

	532 * For both cases we need to remember the size of the composed and decomposed

	533 * versions of the string. Since the Boyer-Moore shift calculations shifts by

	534 * a number of characters in the text and tries to match the pattern from that

	535 * offset, the shift value can not be too large in case we miss some

	536 * characters. To choose a right shift size, we estimate the NFC form of the

	537 * and use its size as a shift guide. The NFC form should be the small

	538 * possible representation of the pattern. Anyways, we'll err on the smaller

	539 * shift size. Hence the calculation for minlength.

	540 * Canonical match will be performed slightly differently. We'll split the

	541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by

	542 * the first and last base character (MS), the ending accents (EA). Matches

	543 * will be done on MS first, and only when we match MS then some processing

	544 * will be required for the prefix and end accents in order to determine if

	545 * they match PA and EA. Hence the default shift values

	546 * for the canonical match will take the size of either end's accent into

	547 * consideration. Forwards search will take the end accents into consideration

	548 * for the default shift values and the backwards search will take the prefix

	549 * accents into consideration.

	550 * If pattern has no non-ignorable ce, we return a illegal argument error.

	551 * Internal method, status assumed to be success.

	552 * @param strsrch UStringSearch data storage

	553 * @param status for output errors if it occurs, status is assumed to be a

	554 * success when it is passed in.

	555 */

	556 static

	557 inline void initialize(UStringSearch strsrch, UErrorCode status)

	558 {

	559 int16_t expandlength = initializePattern(strsrch, status);

	560 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {

	561 UPattern *pattern = &strsrch->pattern;

	562 int32_t cesize = pattern->CELength;

	563

	564 int16_t minlength = cesize > expandlength

	565 ? (int16_t)cesize - expandlength : 1;

	566 pattern->defaultShiftSize = minlength;

	567 setShiftTable(pattern->shift, pattern->backShift, pattern->CE,

	568 cesize, expandlength, minlength, minlength);

	569 return;

	570 }

	571 strsrch->pattern.defaultShiftSize = 0;

	572 }

	573

	574 #if BOYER_MOORE

	575 /**

	576 * Check to make sure that the match length is at the end of the character by

	577 * using the breakiterator.

	578 * @param strsrch string search data

	579 * @param start target text start offset

	580 * @param end target text end offset

	581 */

	582 static

	583 void checkBreakBoundary(const UStringSearch strsrch, int32_t /start/,

	584 int32_t *end)

	585 {

	586 #if !UCONFIG_NO_BREAK_ITERATION

	587 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;

	588 if (breakiterator) {

	589 int32_t matchend = *end;

	590 //int32_t matchstart = *start;

	591

	592 if (!ubrk_isBoundary(breakiterator, matchend)) {

	593 *end = ubrk_following(breakiterator, matchend);

	594 }

	595

	596 /* Check the start of the matched text to make sure it doesn't have any accents

	597 * before it. This code may not be necessary and so it is commented out */

	598 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundar y(breakiterator, matchstart-1)) {

	599 *start = ubrk_preceding(breakiterator, matchstart);

	600 }*/

	601 }

	602 #endif

	603 }

	604

	605 /**

	606 * Determine whether the target text in UStringSearch bounded by the offset

	607 * start and end is one or more whole units of text as

	608 * determined by the breakiterator in UStringSearch.

	609 * @param strsrch string search data

	610 * @param start target text start offset

	611 * @param end target text end offset

	612 */

	613 static

	614 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,

	615 int32_t end)

	616 {

	617 #if !UCONFIG_NO_BREAK_ITERATION

	618 UBreakIterator *breakiterator = strsrch->search->breakIter;

	619 //TODO: Add here.

	620 if (breakiterator) {

	621 int32_t startindex = ubrk_first(breakiterator);

	622 int32_t endindex = ubrk_last(breakiterator);

	623

	624 // out-of-range indexes are never boundary positions

	625 if (start < startindex \|\| start > endindex \|\|

	626 end < startindex \|\| end > endindex) {

	627 return FALSE;

	628 }

	629 // otherwise, we can use following() on the position before the

	630 // specified one and return true of the position we get back is the

	631 // one the user specified

	632 UBool result = (start == startindex \|\|

	633 ubrk_following(breakiterator, start - 1) == start) &&

	634 (end == endindex \|\|

	635 ubrk_following(breakiterator, end - 1) == end);

	636 if (result) {

	637 // iterates the individual ces

	638 UCollationElements *coleiter = strsrch->utilIter;

	639 const UChar *text = strsrch->search->text +

	640 start;

	641 UErrorCode status = U_ZERO_ERROR;

	642 ucol_setText(coleiter, text, end - start, &status);

	643 for (int32_t count = 0; count < strsrch->pattern.CELength;

	644 count ++) {

	645 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));

	646 if (ce == UCOL_IGNORABLE) {

	647 count --;

	648 continue;

	649 }

	650 if (U_FAILURE(status) \|\| ce != strsrch->pattern.CE[count]) {

	651 return FALSE;

	652 }

	653 }

	654 int32_t nextce = ucol_next(coleiter, &status);

	655 while (ucol_getOffset(coleiter) == (end - start)

	656 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {

	657 nextce = ucol_next(coleiter, &status);

	658 }

	659 if (ucol_getOffset(coleiter) == (end - start)

	660 && nextce != UCOL_NULLORDER) {

	661 // extra collation elements at the end of the match

	662 return FALSE;

	663 }

	664 }

	665 return result;

	666 }

	667 #endif

	668 return TRUE;

	669 }

	670

	671 /**

	672 * Getting the next base character offset if current offset is an accent,

	673 * or the current offset if the current character contains a base character.

	674 * accents the following base character will be returned

	675 * @param text string

	676 * @param textoffset current offset

	677 * @param textlength length of text string

	678 * @return the next base character or the current offset

	679 * if the current character is contains a base character.

	680 */

	681 static

	682 inline int32_t getNextBaseOffset(const UChar *text,

	683 int32_t textoffset,

	684 int32_t textlength)

	685 {

	686 if (textoffset < textlength) {

	687 int32_t temp = textoffset;

	688 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {

	689 while (temp < textlength) {

	690 int32_t result = temp;

	691 if ((getFCD(text, &temp, textlength) >>

	692 SECOND_LAST_BYTE_SHIFT_) == 0) {

	693 return result;

	694 }

	695 }

	696 return textlength;

	697 }

	698 }

	699 return textoffset;

	700 }

	701

	702 /**

	703 * Gets the next base character offset depending on the string search pattern

	704 * data

	705 * @param strsrch string search data

	706 * @param textoffset current offset, one offset away from the last character

	707 * to search for.

	708 * @return start index of the next base character or the current offset

	709 * if the current character is contains a base character.

	710 */

	711 static

	712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,

	713 int32_t textoffset)

	714 {

	715 int32_t textlength = strsrch->search->textLength;

	716 if (strsrch->pattern.hasSuffixAccents &&

	717 textoffset < textlength) {

	718 int32_t temp = textoffset;

	719 const UChar *text = strsrch->search->text;

	720 UTF_BACK_1(text, 0, temp);

	721 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {

	722 return getNextBaseOffset(text, textoffset, textlength);

	723 }

	724 }

	725 return textoffset;

	726 }

	727

	728 /**

	729 * Shifting the collation element iterator position forward to prepare for

	730 * a following match. If the last character is a unsafe character, we'll only

	731 * shift by 1 to capture contractions, normalization etc.

	732 * Internal method, status assumed to be success.

	733 * @param text strsrch string search data

	734 * @param textoffset start text position to do search

	735 * @param ce the text ce which failed the match.

	736 * @param patternceindex index of the ce within the pattern ce buffer which

	737 * failed the match

	738 * @return final offset

	739 */

	740 static

	741 inline int32_t shiftForward(UStringSearch *strsrch,

	742 int32_t textoffset,

	743 int32_t ce,

	744 int32_t patternceindex)

	745 {

	746 UPattern *pattern = &(strsrch->pattern);

	747 if (ce != UCOL_NULLORDER) {

	748 int32_t shift = pattern->shift[hash(ce)];

	749 // this is to adjust for characters in the middle of the

	750 // substring for matching that failed.

	751 int32_t adjust = pattern->CELength - patternceindex;

	752 if (adjust > 1 && shift >= adjust) {

	753 shift -= adjust - 1;

	754 }

	755 textoffset += shift;

	756 }

	757 else {

	758 textoffset += pattern->defaultShiftSize;

	759 }

	760

	761 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);

	762 // check for unsafe characters

	763 // * if it is the start or middle of a contraction: to be done after

	764 // a initial match is found

	765 // * thai or lao base consonant character: similar to contraction

	766 // * high surrogate character: similar to contraction

	767 // * next character is a accent: shift to the next base character

	768 return textoffset;

	769 }

	770 #endif // #if BOYER_MOORE

	771

	772 /**

	773 * sets match not found

	774 * @param strsrch string search data

	775 */

	776 static

	777 inline void setMatchNotFound(UStringSearch *strsrch)

	778 {

	779 // this method resets the match result regardless of the error status.

	780 strsrch->search->matchedIndex = USEARCH_DONE;

	781 strsrch->search->matchedLength = 0;

	782 if (strsrch->search->isForwardSearching) {

	783 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);

	784 }

	785 else {

	786 setColEIterOffset(strsrch->textIter, 0);

	787 }

	788 }

	789

	790 #if BOYER_MOORE

	791 /**

	792 * Gets the offset to the next safe point in text.

	793 * ie. not the middle of a contraction, swappable characters or supplementary

	794 * characters.

	795 * @param collator collation sata

	796 * @param text string to work with

	797 * @param textoffset offset in string

	798 * @param textlength length of text string

	799 * @return offset to the next safe character

	800 */

	801 static

	802 inline int32_t getNextSafeOffset(const UCollator *collator,

	803 const UChar *text,

	804 int32_t textoffset,

	805 int32_t textlength)

	806 {

	807 int32_t result = textoffset; // first contraction character

	808 while (result != textlength && ucol_unsafeCP(text[result], collator)) {

	809 result ++;

	810 }

	811 return result;

	812 }

	813

	814 /**

	815 * This checks for accents in the potential match started with a .

	816 * composite character.

	817 * This is really painful... we have to check that composite character do not

	818 * have any extra accents. We have to normalize the potential match and find

	819 * the immediate decomposed character before the match.

	820 * The first composite character would have been taken care of by the fcd

	821 * checks in checkForwardExactMatch.

	822 * This is the slow path after the fcd of the first character and

	823 * the last character has been checked by checkForwardExactMatch and we

	824 * determine that the potential match has extra non-ignorable preceding

	825 * ces.

	826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,

	827 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA

	828 * Note here that accents checking are slow and cautioned in the API docs.

	829 * Internal method, status assumed to be a success, caller should check status

	830 * before calling this method

	831 * @param strsrch string search data

	832 * @param start index of the potential unfriendly composite character

	833 * @param end index of the potential unfriendly composite character

	834 * @param status output error status if any.

	835 * @return TRUE if there is non-ignorable accents before at the beginning

	836 * of the match, FALSE otherwise.

	837 */

	838

	839 static

	840 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,

	841 int32_t end,

	842 UErrorCode *status)

	843 {

	844 UBool result = FALSE;

	845 if (strsrch->pattern.hasPrefixAccents) {

	846 int32_t length = end - start;

	847 int32_t offset = 0;

	848 const UChar *text = strsrch->search->text + start;

	849

	850 UTF_FWD_1(text, offset, length);

	851 // we are only concerned with the first composite character

	852 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {

	853 int32_t safeoffset = getNextSafeOffset(strsrch->collator,

	854 text, 0, length);

	855 if (safeoffset != length) {

	856 safeoffset ++;

	857 }

	858 UChar *norm = NULL;

	859 UChar buffer[INITIAL_ARRAY_SIZE_];

	860 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,

	861 buffer, INITIAL_ARRAY_SIZE_,

	862 status);

	863 if (U_FAILURE(*status)) {

	864 return FALSE;

	865 }

	866 if (size >= INITIAL_ARRAY_SIZE_) {

	867 norm = (UChar )allocateMemory((size + 1) sizeof(UChar),

	868 status);

	869 // if allocation failed, status will be set to

	870 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally

	871 // checks for it.

	872 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,

	873 size, status);

	874 if (U_FAILURE(*status) && norm != NULL) {

	875 uprv_free(norm);

	876 return FALSE;

	877 }

	878 }

	879 else {

	880 norm = buffer;

	881 }

	882

	883 UCollationElements *coleiter = strsrch->utilIter;

	884 ucol_setText(coleiter, norm, size, status);

	885 uint32_t firstce = strsrch->pattern.CE[0];

	886 UBool ignorable = TRUE;

	887 uint32_t ce = UCOL_IGNORABLE;

	888 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_N ULLORDER) {

	889 offset = ucol_getOffset(coleiter);

	890 if (ce != firstce && ce != UCOL_IGNORABLE) {

	891 ignorable = FALSE;

	892 }

	893 ce = ucol_next(coleiter, status);

	894 }

	895 UChar32 codepoint;

	896 UTF_PREV_CHAR(norm, 0, offset, codepoint);

	897 result = !ignorable && (u_getCombiningClass(codepoint) != 0);

	898

	899 if (norm != buffer) {

	900 uprv_free(norm);

	901 }

	902 }

	903 }

	904

	905 return result;

	906 }

	907

	908 /**

	909 * Used by exact matches, checks if there are accents before the match.

	910 * This is really painful... we have to check that composite characters at

	911 * the start of the matches have to not have any extra accents.

	912 * We check the FCD of the character first, if it starts with an accent and

	913 * the first pattern ce does not match the first ce of the character, we bail.

	914 * Otherwise we try normalizing the first composite

	915 * character and find the immediate decomposed character before the match to

	916 * see if it is an non-ignorable accent.

	917 * Now normalizing the first composite character is enough because we ensure

	918 * that when the match is passed in here with extra beginning ces, the

	919 * first or last ce that match has to occur within the first character.

	920 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,

	921 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA

	922 * Note here that accents checking are slow and cautioned in the API docs.

	923 * @param strsrch string search data

	924 * @param start offset

	925 * @param end offset

	926 * @return TRUE if there are accents on either side of the match,

	927 * FALSE otherwise

	928 */

	929 static

	930 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,

	931 int32_t end)

	932 {

	933 if (strsrch->pattern.hasPrefixAccents) {

	934 UCollationElements *coleiter = strsrch->textIter;

	935 UErrorCode status = U_ZERO_ERROR;

	936 // we have been iterating forwards previously

	937 uint32_t ignorable = TRUE;

	938 int32_t firstce = strsrch->pattern.CE[0];

	939

	940 setColEIterOffset(coleiter, start);

	941 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));

	942 if (U_FAILURE(status)) {

	943 return TRUE;

	944 }

	945 while (ce != firstce) {

	946 if (ce != UCOL_IGNORABLE) {

	947 ignorable = FALSE;

	948 }

	949 ce = getCE(strsrch, ucol_next(coleiter, &status));

	950 if (U_FAILURE(status) \|\| ce == UCOL_NULLORDER) {

	951 return TRUE;

	952 }

	953 }

	954 if (!ignorable && inNormBuf(coleiter)) {

	955 // within normalization buffer, discontiguous handled here

	956 return TRUE;

	957 }

	958

	959 // within text

	960 int32_t temp = start;

	961 // original code

	962 // accent = (getFCD(strsrch->search->text, &temp,

	963 // strsrch->search->textLength)

	964 // >> SECOND_LAST_BYTE_SHIFT_);

	965 // however this code does not work well with VC7 .net in release mode.

	966 // maybe the inlines for getFCD combined with shifting has bugs in

	967 // VC7. anyways this is a work around.

	968 UBool accent = getFCD(strsrch->search->text, &temp,

	969 strsrch->search->textLength) > 0xFF;

	970 if (!accent) {

	971 return checkExtraMatchAccents(strsrch, start, end, &status);

	972 }

	973 if (!ignorable) {

	974 return TRUE;

	975 }

	976 if (start > 0) {

	977 temp = start;

	978 UTF_BACK_1(strsrch->search->text, 0, temp);

	979 if (getFCD(strsrch->search->text, &temp,

	980 strsrch->search->textLength) & LAST_BYTE_MASK_) {

	981 setColEIterOffset(coleiter, start);

	982 ce = ucol_previous(coleiter, &status);

	983 if (U_FAILURE(status) \|\|

	984 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {

	985 return TRUE;

	986 }

	987 }

	988 }

	989 }

	990

	991 return FALSE;

	992 }

	993

	994 /**

	995 * Used by exact matches, checks if there are accents bounding the match.

	996 * Note this is the initial boundary check. If the potential match

	997 * starts or ends with composite characters, the accents in those

	998 * characters will be determined later.

	999 * Not doing backwards iteration here, since discontiguos contraction for

	1000 * backwards collation element iterator, use up too many characters.

	1001 * E.g. looking for \u030A ring in \u01FA A ring above and acute,

	1002 * should fail since there is a acute at the end of \u01FA

	1003 * Note here that accents checking are slow and cautioned in the API docs.

	1004 * @param strsrch string search data

	1005 * @param start offset of match

	1006 * @param end end offset of the match

	1007 * @return TRUE if there are accents on either side of the match,

	1008 * FALSE otherwise

	1009 */

	1010 static

	1011 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,

	1012 int32_t end)

	1013 {

	1014 if (strsrch->pattern.hasSuffixAccents) {

	1015 const UChar *text = strsrch->search->text;

	1016 int32_t temp = end;

	1017 int32_t textlength = strsrch->search->textLength;

	1018 UTF_BACK_1(text, 0, temp);

	1019 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {

	1020 int32_t firstce = strsrch->pattern.CE[0];

	1021 UCollationElements *coleiter = strsrch->textIter;

	1022 UErrorCode status = U_ZERO_ERROR;

	1023 int32_t ce;

	1024 setColEIterOffset(coleiter, start);

	1025 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstc e) {

	1026 if (U_FAILURE(status) \|\| ce == UCOL_NULLORDER) {

	1027 return TRUE;

	1028 }

	1029 }

	1030 int32_t count = 1;

	1031 while (count < strsrch->pattern.CELength) {

	1032 if (getCE(strsrch, ucol_next(coleiter, &status))

	1033 == UCOL_IGNORABLE) {

	1034 // Thai can give an ignorable here.

	1035 count --;

	1036 }

	1037 if (U_FAILURE(status)) {

	1038 return TRUE;

	1039 }

	1040 count ++;

	1041 }

	1042

	1043 ce = ucol_next(coleiter, &status);

	1044 if (U_FAILURE(status)) {

	1045 return TRUE;

	1046 }

	1047 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {

	1048 ce = getCE(strsrch, ce);

	1049 }

	1050 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {

	1051 if (ucol_getOffset(coleiter) <= end) {

	1052 return TRUE;

	1053 }

	1054 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {

	1055 return TRUE;

	1056 }

	1057 }

	1058 }

	1059 }

	1060 return FALSE;

	1061 }

	1062 #endif // #if BOYER_MOORE

	1063

	1064 /**

	1065 * Checks if the offset runs out of the text string

	1066 * @param offset

	1067 * @param textlength of the text string

	1068 * @return TRUE if offset is out of bounds, FALSE otherwise

	1069 */

	1070 static

	1071 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)

	1072 {

	1073 return offset < 0 \|\| offset > textlength;

	1074 }

	1075

	1076 /**

	1077 * Checks for identical match

	1078 * @param strsrch string search data

	1079 * @param start offset of possible match

	1080 * @param end offset of possible match

	1081 * @return TRUE if identical match is found

	1082 */

	1083 static

	1084 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,

	1085 int32_t end)

	1086 {

	1087 if (strsrch->strength != UCOL_IDENTICAL) {

	1088 return TRUE;

	1089 }

	1090

	1091 // Note: We could use Normalizer::compare() or similar, but for short string s

	1092 // which may not be in FCD it might be faster to just NFD them.

	1093 UErrorCode status = U_ZERO_ERROR;

	1094 UnicodeString t2, p2;

	1095 strsrch->nfd->normalize(

	1096 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, st atus);

	1097 strsrch->nfd->normalize(

	1098 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength) , p2, status);

	1099 // return FALSE if NFD failed

	1100 return U_SUCCESS(status) && t2 == p2;

	1101 }

	1102

	1103 #if BOYER_MOORE

	1104 /**

	1105 * Checks to see if the match is repeated

	1106 * @param strsrch string search data

	1107 * @param start new match start index

	1108 * @param end new match end index

	1109 * @return TRUE if the the match is repeated, FALSE otherwise

	1110 */

	1111 static

	1112 inline UBool checkRepeatedMatch(UStringSearch *strsrch,

	1113 int32_t start,

	1114 int32_t end)

	1115 {

	1116 int32_t lastmatchindex = strsrch->search->matchedIndex;

	1117 UBool result;

	1118 if (lastmatchindex == USEARCH_DONE) {

	1119 return FALSE;

	1120 }

	1121 if (strsrch->search->isForwardSearching) {

	1122 result = start <= lastmatchindex;

	1123 }

	1124 else {

	1125 result = start >= lastmatchindex;

	1126 }

	1127 if (!result && !strsrch->search->isOverlap) {

	1128 if (strsrch->search->isForwardSearching) {

	1129 result = start < lastmatchindex + strsrch->search->matchedLength;

	1130 }

	1131 else {

	1132 result = end > lastmatchindex;

	1133 }

	1134 }

	1135 return result;

	1136 }

	1137

	1138 /**

	1139 * Gets the collation element iterator's current offset.

	1140 * @param coleiter collation element iterator

	1141 * @param forwards flag TRUE if we are moving in th forwards direction

	1142 * @return current offset

	1143 */

	1144 static

	1145 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,

	1146 UBool forwards)

	1147 {

	1148 int32_t result = ucol_getOffset(coleiter);

	1149 // intricacies of the the backwards collation element iterator

	1150 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter) ) {

	1151 result ++;

	1152 }

	1153 return result;

	1154 }

	1155

	1156 /**

	1157 * Checks match for contraction.

	1158 * If the match ends with a partial contraction we fail.

	1159 * If the match starts too far off (because of backwards iteration) we try to

	1160 * chip off the extra characters depending on whether a breakiterator has

	1161 * been used.

	1162 * Internal method, error assumed to be success, caller has to check status

	1163 * before calling this method.

	1164 * @param strsrch string search data

	1165 * @param start offset of potential match, to be modified if necessary

	1166 * @param end offset of potential match, to be modified if necessary

	1167 * @param status output error status if any

	1168 * @return TRUE if match passes the contraction test, FALSE otherwise

	1169 */

	1170

	1171 static

	1172 UBool checkNextExactContractionMatch(UStringSearch *strsrch,

	1173 int32_t *start,

	1174 int32_t end, UErrorCode status)

	1175 {

	1176 UCollationElements *coleiter = strsrch->textIter;

	1177 int32_t textlength = strsrch->search->textLength;

	1178 int32_t temp = *start;

	1179 const UCollator *collator = strsrch->collator;

	1180 const UChar *text = strsrch->search->text;

	1181 // This part checks if either ends of the match contains potential

	1182 // contraction. If so we'll have to iterate through them

	1183 // The start contraction needs to be checked since ucol_previous dumps

	1184 // all characters till the first safe character into the buffer.

	1185 // start + 1 is used to test for the unsafe characters instead of start

	1186 // because ucol_prev takes all unsafe characters till the first safe

	1187 // character ie start. so by testing start + 1, we can estimate if

	1188 // excess prefix characters has been included in the potential search

	1189 // results.

	1190 if ((end < textlength && ucol_unsafeCP(text[end], collator)) \|\|

	1191 (*start + 1 < textlength

	1192 && ucol_unsafeCP(text[*start + 1], collator))) {

	1193 int32_t expansion = getExpansionPrefix(coleiter);

	1194 UBool expandflag = expansion > 0;

	1195 setColEIterOffset(coleiter, *start);

	1196 while (expansion > 0) {

	1197 // getting rid of the redundant ce, caused by setOffset.

	1198 // since backward contraction/expansion may have extra ces if we

	1199 // are in the normalization buffer, hasAccentsBeforeMatch would

	1200 // have taken care of it.

	1201 // E.g. the character \u01FA will have an expansion of 3, but if

	1202 // we are only looking for acute and ring \u030A and \u0301, we'll

	1203 // have to skip the first ce in the expansion buffer.

	1204 ucol_next(coleiter, status);

	1205 if (U_FAILURE(*status)) {

	1206 return FALSE;

	1207 }

	1208 if (ucol_getOffset(coleiter) != temp) {

	1209 *start = temp;

	1210 temp = ucol_getOffset(coleiter);

	1211 }

	1212 expansion --;

	1213 }

	1214

	1215 int32_t *patternce = strsrch->pattern.CE;

	1216 int32_t patterncelength = strsrch->pattern.CELength;

	1217 int32_t count = 0;

	1218 while (count < patterncelength) {

	1219 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));

	1220 if (ce == UCOL_IGNORABLE) {

	1221 continue;

	1222 }

	1223 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {

	1224 *start = temp;

	1225 temp = ucol_getOffset(coleiter);

	1226 }

	1227 if (U_FAILURE(*status) \|\| ce != patternce[count]) {

	1228 (*end) ++;

	1229 end = getNextUStringSearchBaseOffset(strsrch, end);

	1230 return FALSE;

	1231 }

	1232 count ++;

	1233 }

	1234 }

	1235 return TRUE;

	1236 }

	1237

	1238 /**

	1239 * Checks and sets the match information if found.

	1240 * Checks

	1241 * <ul>

	1242 * <li> the potential match does not repeat the previous match

	1243 * <li> boundaries are correct

	1244 * <li> exact matches has no extra accents

	1245 * <li> identical matchesb

	1246 * <li> potential match does not end in the middle of a contraction

	1247 * <\ul>

	1248 * Otherwise the offset will be shifted to the next character.

	1249 * Internal method, status assumed to be success, caller has to check status

	1250 * before calling this method.

	1251 * @param strsrch string search data

	1252 * @param textoffset offset in the collation element text. the returned value

	1253 * will be the truncated end offset of the match or the new start

	1254 * search offset.

	1255 * @param status output error status if any

	1256 * @return TRUE if the match is valid, FALSE otherwise

	1257 */

	1258 static

	1259 inline UBool checkNextExactMatch(UStringSearch *strsrch,

	1260 int32_t textoffset, UErrorCode status)

	1261 {

	1262 UCollationElements *coleiter = strsrch->textIter;

	1263 int32_t start = getColElemIterOffset(coleiter, FALSE);

	1264

	1265 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {

	1266 return FALSE;

	1267 }

	1268

	1269 // this totally matches, however we need to check if it is repeating

	1270 if (!isBreakUnit(strsrch, start, *textoffset) \|\|

	1271 checkRepeatedMatch(strsrch, start, *textoffset) \|\|

	1272 hasAccentsBeforeMatch(strsrch, start, *textoffset) \|\|

	1273 !checkIdentical(strsrch, start, *textoffset) \|\|

	1274 hasAccentsAfterMatch(strsrch, start, *textoffset)) {

	1275

	1276 (*textoffset) ++;

	1277 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);

	1278 return FALSE;

	1279 }

	1280

	1281 //Add breakiterator boundary check for primary strength search.

	1282 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {

	1283 checkBreakBoundary(strsrch, &start, textoffset);

	1284 }

	1285

	1286 // totally match, we will get rid of the ending ignorables.

	1287 strsrch->search->matchedIndex = start;

	1288 strsrch->search->matchedLength = *textoffset - start;

	1289 return TRUE;

	1290 }

	1291

	1292 /**

	1293 * Getting the previous base character offset, or the current offset if the

	1294 * current character is a base character

	1295 * @param text string

	1296 * @param textoffset one offset after the current character

	1297 * @return the offset of the next character after the base character or the first

	1298 * composed character with accents

	1299 */

	1300 static

	1301 inline int32_t getPreviousBaseOffset(const UChar *text,

	1302 int32_t textoffset)

	1303 {

	1304 if (textoffset > 0) {

	1305 for (;;) {

	1306 int32_t result = textoffset;

	1307 UTF_BACK_1(text, 0, textoffset);

	1308 int32_t temp = textoffset;

	1309 uint16_t fcd = getFCD(text, &temp, result);

	1310 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {

	1311 if (fcd & LAST_BYTE_MASK_) {

	1312 return textoffset;

	1313 }

	1314 return result;

	1315 }

	1316 if (textoffset == 0) {

	1317 return 0;

	1318 }

	1319 }

	1320 }

	1321 return textoffset;

	1322 }

	1323

	1324 /**

	1325 * Getting the indexes of the accents that are not blocked in the argument

	1326 * accent array

	1327 * @param accents array of accents in nfd terminated by a 0.

	1328 * @param accentsindex array of indexes of the accents that are not blocked

	1329 */

	1330 static

	1331 inline int getUnblockedAccentIndex(UChar accents, int32_t accentsindex)

	1332 {

	1333 int32_t index = 0;

	1334 int32_t length = u_strlen(accents);

	1335 UChar32 codepoint = 0;

	1336 int cclass = 0;

	1337 int result = 0;

	1338 int32_t temp;

	1339 while (index < length) {

	1340 temp = index;

	1341 UTF_NEXT_CHAR(accents, index, length, codepoint);

	1342 if (u_getCombiningClass(codepoint) != cclass) {

	1343 cclass = u_getCombiningClass(codepoint);

	1344 accentsindex[result] = temp;

	1345 result ++;

	1346 }

	1347 }

	1348 accentsindex[result] = length;

	1349 return result;

	1350 }

	1351

	1352 /**

	1353 * Appends 3 UChar arrays to a destination array.

	1354 * Creates a new array if we run out of space. The caller will have to

	1355 * manually deallocate the newly allocated array.

	1356 * Internal method, status assumed to be success, caller has to check status

	1357 * before calling this method. destination not to be NULL and has at least

	1358 * size destinationlength.

	1359 * @param destination target array

	1360 * @param destinationlength target array size, returning the appended length

	1361 * @param source1 null-terminated first array

	1362 * @param source2 second array

	1363 * @param source2length length of seond array

	1364 * @param source3 null-terminated third array

	1365 * @param status error status if any

	1366 * @return new destination array, destination if there was no new allocation

	1367 */

	1368 static

	1369 inline UChar * addToUCharArray( UChar *destination,

	1370 int32_t *destinationlength,

	1371 const UChar *source1,

	1372 const UChar *source2,

	1373 int32_t source2length,

	1374 const UChar *source3,

	1375 UErrorCode *status)

	1376 {

	1377 int32_t source1length = source1 ? u_strlen(source1) : 0;

	1378 int32_t source3length = source3 ? u_strlen(source3) : 0;

	1379 if (*destinationlength < source1length + source2length + source3length +

	1380 1)

	1381 {

	1382 destination = (UChar *)allocateMemory(

	1383 (source1length + source2length + source3length + 1) * sizeof(UChar),

	1384 status);

	1385 // if error allocating memory, status will be

	1386 // U_MEMORY_ALLOCATION_ERROR

	1387 if (U_FAILURE(*status)) {

	1388 *destinationlength = 0;

	1389 return NULL;

	1390 }

	1391 }

	1392 if (source1length != 0) {

	1393 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);

	1394 }

	1395 if (source2length != 0) {

	1396 uprv_memcpy(destination + source1length, source2,

	1397 sizeof(UChar) * source2length);

	1398 }

	1399 if (source3length != 0) {

	1400 uprv_memcpy(destination + source1length + source2length, source3,

	1401 sizeof(UChar) * source3length);

	1402 }

	1403 *destinationlength = source1length + source2length + source3length;

	1404 return destination;

	1405 }

	1406

	1407 /**

	1408 * Running through a collation element iterator to see if the contents matches

	1409 * pattern in string search data

	1410 * @param strsrch string search data

	1411 * @param coleiter collation element iterator

	1412 * @return TRUE if a match if found, FALSE otherwise

	1413 */

	1414 static

	1415 inline UBool checkCollationMatch(const UStringSearch *strsrch,

	1416 UCollationElements *coleiter)

	1417 {

	1418 int patternceindex = strsrch->pattern.CELength;

	1419 int32_t *patternce = strsrch->pattern.CE;

	1420 UErrorCode status = U_ZERO_ERROR;

	1421 while (patternceindex > 0) {

	1422 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));

	1423 if (ce == UCOL_IGNORABLE) {

	1424 continue;

	1425 }

	1426 if (U_FAILURE(status) \|\| ce != *patternce) {

	1427 return FALSE;

	1428 }

	1429 patternce ++;

	1430 patternceindex --;

	1431 }

	1432 return TRUE;

	1433 }

	1434

	1435 /**

	1436 * Rearranges the front accents to try matching.

	1437 * Prefix accents in the text will be grouped according to their combining

	1438 * class and the groups will be mixed and matched to try find the perfect

	1439 * match with the pattern.

	1440 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

	1441 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

	1442 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

	1443 * "\u0301\u0325".

	1444 * step 2: check if any of the generated substrings matches the pattern.

	1445 * Internal method, status is assumed to be success, caller has to check status

	1446 * before calling this method.

	1447 * @param strsrch string search match

	1448 * @param start first offset of the accents to start searching

	1449 * @param end start of the last accent set

	1450 * @param status output error status if any

	1451 * @return USEARCH_DONE if a match is not found, otherwise return the starting

	1452 * offset of the match. Note this start includes all preceding accents.

	1453 */

	1454 static

	1455 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,

	1456 int32_t start,

	1457 int32_t end,

	1458 UErrorCode *status)

	1459 {

	1460 const UChar *text = strsrch->search->text;

	1461 int32_t textlength = strsrch->search->textLength;

	1462 int32_t tempstart = start;

	1463

	1464 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {

	1465 // die... failed at a base character

	1466 return USEARCH_DONE;

	1467 }

	1468

	1469 int32_t offset = getNextBaseOffset(text, tempstart, textlength);

	1470 start = getPreviousBaseOffset(text, tempstart);

	1471

	1472 UChar accents[INITIAL_ARRAY_SIZE_];

	1473 // normalizing the offensive string

	1474 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,

	1475 INITIAL_ARRAY_SIZE_, status);

	1476 if (U_FAILURE(*status)) {

	1477 return USEARCH_DONE;

	1478 }

	1479

	1480 int32_t accentsindex[INITIAL_ARRAY_SIZE_];

	1481 int32_t accentsize = getUnblockedAccentIndex(accents,

	1482 accentsindex);

	1483 int32_t count = (2 << (accentsize - 1)) - 1;

	1484 UChar buffer[INITIAL_ARRAY_SIZE_];

	1485 UCollationElements *coleiter = strsrch->utilIter;

	1486 while (U_SUCCESS(*status) && count > 0) {

	1487 UChar *rearrange = strsrch->canonicalPrefixAccents;

	1488 // copy the base characters

	1489 for (int k = 0; k < accentsindex[0]; k ++) {

	1490 *rearrange ++ = accents[k];

	1491 }

	1492 // forming all possible canonical rearrangement by dropping

	1493 // sets of accents

	1494 for (int i = 0; i <= accentsize - 1; i ++) {

	1495 int32_t mask = 1 << (accentsize - i - 1);

	1496 if (count & mask) {

	1497 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

	1498 *rearrange ++ = accents[j];

	1499 }

	1500 }

	1501 }

	1502 *rearrange = 0;

	1503 int32_t matchsize = INITIAL_ARRAY_SIZE_;

	1504 UChar *match = addToUCharArray(buffer, &matchsize,

	1505 strsrch->canonicalPrefixAccents,

	1506 strsrch->search->text + offset,

	1507 end - offset,

	1508 strsrch->canonicalSuffixAccents,

	1509 status);

	1510

	1511 // if status is a failure, ucol_setText does nothing.

	1512 // run the collator iterator through this match

	1513 ucol_setText(coleiter, match, matchsize, status);

	1514 if (U_SUCCESS(*status)) {

	1515 if (checkCollationMatch(strsrch, coleiter)) {

	1516 if (match != buffer) {

	1517 uprv_free(match);

	1518 }

	1519 return start;

	1520 }

	1521 }

	1522 count --;

	1523 }

	1524 return USEARCH_DONE;

	1525 }

	1526

	1527 /**

	1528 * Gets the offset to the safe point in text before textoffset.

	1529 * ie. not the middle of a contraction, swappable characters or supplementary

	1530 * characters.

	1531 * @param collator collation sata

	1532 * @param text string to work with

	1533 * @param textoffset offset in string

	1534 * @param textlength length of text string

	1535 * @return offset to the previous safe character

	1536 */

	1537 static

	1538 inline uint32_t getPreviousSafeOffset(const UCollator *collator,

	1539 const UChar *text,

	1540 int32_t textoffset)

	1541 {

	1542 int32_t result = textoffset; // first contraction character

	1543 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {

	1544 result --;

	1545 }

	1546 if (result != 0) {

	1547 // the first contraction character is consider unsafe here

	1548 result --;

	1549 }

	1550 return result;

	1551 }

	1552

	1553 /**

	1554 * Cleaning up after we passed the safe zone

	1555 * @param strsrch string search data

	1556 * @param safetext safe text array

	1557 * @param safebuffer safe text buffer

	1558 * @param coleiter collation element iterator for safe text

	1559 */

	1560 static

	1561 inline void cleanUpSafeText(const UStringSearch strsrch, UChar safetext,

	1562 UChar *safebuffer)

	1563 {

	1564 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)

	1565 {

	1566 uprv_free(safetext);

	1567 }

	1568 }

	1569

	1570 /**

	1571 * Take the rearranged end accents and tries matching. If match failed at

	1572 * a seperate preceding set of accents (seperated from the rearranged on by

	1573 * at least a base character) then we rearrange the preceding accents and

	1574 * tries matching again.

	1575 * We allow skipping of the ends of the accent set if the ces do not match.

	1576 * However if the failure is found before the accent set, it fails.

	1577 * Internal method, status assumed to be success, caller has to check status

	1578 * before calling this method.

	1579 * @param strsrch string search data

	1580 * @param textoffset of the start of the rearranged accent

	1581 * @param status output error status if any

	1582 * @return USEARCH_DONE if a match is not found, otherwise return the starting

	1583 * offset of the match. Note this start includes all preceding accents.

	1584 */

	1585 static

	1586 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,

	1587 int32_t textoffset,

	1588 UErrorCode *status)

	1589 {

	1590 const UChar *text = strsrch->search->text;

	1591 const UCollator *collator = strsrch->collator;

	1592 int32_t safelength = 0;

	1593 UChar *safetext;

	1594 int32_t safetextlength;

	1595 UChar safebuffer[INITIAL_ARRAY_SIZE_];

	1596 UCollationElements *coleiter = strsrch->utilIter;

	1597 int32_t safeoffset = textoffset;

	1598

	1599 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],

	1600 collator)) {

	1601 safeoffset = getPreviousSafeOffset(collator, text, textoffset);

	1602 safelength = textoffset - safeoffset;

	1603 safetextlength = INITIAL_ARRAY_SIZE_;

	1604 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,

	1605 text + safeoffset, safelength,

	1606 strsrch->canonicalSuffixAccents,

	1607 status);

	1608 }

	1609 else {

	1610 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);

	1611 safetext = strsrch->canonicalSuffixAccents;

	1612 }

	1613

	1614 // if status is a failure, ucol_setText does nothing

	1615 ucol_setText(coleiter, safetext, safetextlength, status);

	1616 // status checked in loop below

	1617

	1618 int32_t *ce = strsrch->pattern.CE;

	1619 int32_t celength = strsrch->pattern.CELength;

	1620 int ceindex = celength - 1;

	1621 UBool isSafe = TRUE; // indication flag for position in safe zone

	1622

	1623 while (ceindex >= 0) {

	1624 int32_t textce = ucol_previous(coleiter, status);

	1625 if (U_FAILURE(*status)) {

	1626 if (isSafe) {

	1627 cleanUpSafeText(strsrch, safetext, safebuffer);

	1628 }

	1629 return USEARCH_DONE;

	1630 }

	1631 if (textce == UCOL_NULLORDER) {

	1632 // check if we have passed the safe buffer

	1633 if (coleiter == strsrch->textIter) {

	1634 cleanUpSafeText(strsrch, safetext, safebuffer);

	1635 return USEARCH_DONE;

	1636 }

	1637 cleanUpSafeText(strsrch, safetext, safebuffer);

	1638 safetext = safebuffer;

	1639 coleiter = strsrch->textIter;

	1640 setColEIterOffset(coleiter, safeoffset);

	1641 // status checked at the start of the loop

	1642 isSafe = FALSE;

	1643 continue;

	1644 }

	1645 textce = getCE(strsrch, textce);

	1646 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {

	1647 // do the beginning stuff

	1648 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);

	1649 if (isSafe && failedoffset >= safelength) {

	1650 // alas... no hope. failed at rearranged accent set

	1651 cleanUpSafeText(strsrch, safetext, safebuffer);

	1652 return USEARCH_DONE;

	1653 }

	1654 else {

	1655 if (isSafe) {

	1656 failedoffset += safeoffset;

	1657 cleanUpSafeText(strsrch, safetext, safebuffer);

	1658 }

	1659

	1660 // try rearranging the front accents

	1661 int32_t result = doNextCanonicalPrefixMatch(strsrch,

	1662 failedoffset, textoffset, status);

	1663 if (result != USEARCH_DONE) {

	1664 // if status is a failure, ucol_setOffset does nothing

	1665 setColEIterOffset(strsrch->textIter, result);

	1666 }

	1667 if (U_FAILURE(*status)) {

	1668 return USEARCH_DONE;

	1669 }

	1670 return result;

	1671 }

	1672 }

	1673 if (textce == ce[ceindex]) {

	1674 ceindex --;

	1675 }

	1676 }

	1677 // set offset here

	1678 if (isSafe) {

	1679 int32_t result = getColElemIterOffset(coleiter, FALSE);

	1680 // sets the text iterator here with the correct expansion and offset

	1681 int32_t leftoverces = getExpansionPrefix(coleiter);

	1682 cleanUpSafeText(strsrch, safetext, safebuffer);

	1683 if (result >= safelength) {

	1684 result = textoffset;

	1685 }

	1686 else {

	1687 result += safeoffset;

	1688 }

	1689 setColEIterOffset(strsrch->textIter, result);

	1690 strsrch->textIter->iteratordata_.toReturn =

	1691 setExpansionPrefix(strsrch->textIter, leftoverces);

	1692 return result;

	1693 }

	1694

	1695 return ucol_getOffset(coleiter);

	1696 }

	1697

	1698 /**

	1699 * Trying out the substring and sees if it can be a canonical match.

	1700 * This will try normalizing the end accents and arranging them into canonical

	1701 * equivalents and check their corresponding ces with the pattern ce.

	1702 * Suffix accents in the text will be grouped according to their combining

	1703 * class and the groups will be mixed and matched to try find the perfect

	1704 * match with the pattern.

	1705 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

	1706 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

	1707 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

	1708 * "\u0301\u0325".

	1709 * step 2: check if any of the generated substrings matches the pattern.

	1710 * Internal method, status assumed to be success, caller has to check status

	1711 * before calling this method.

	1712 * @param strsrch string search data

	1713 * @param textoffset end offset in the collation element text that ends with

	1714 * the accents to be rearranged

	1715 * @param status error status if any

	1716 * @return TRUE if the match is valid, FALSE otherwise

	1717 */

	1718 static

	1719 UBool doNextCanonicalMatch(UStringSearch *strsrch,

	1720 int32_t textoffset,

	1721 UErrorCode *status)

	1722 {

	1723 const UChar *text = strsrch->search->text;

	1724 int32_t temp = textoffset;

	1725 UTF_BACK_1(text, 0, temp);

	1726 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {

	1727 UCollationElements *coleiter = strsrch->textIter;

	1728 int32_t offset = getColElemIterOffset(coleiter, FALSE);

	1729 if (strsrch->pattern.hasPrefixAccents) {

	1730 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,

	1731 status);

	1732 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {

	1733 setColEIterOffset(coleiter, offset);

	1734 return TRUE;

	1735 }

	1736 }

	1737 return FALSE;

	1738 }

	1739

	1740 if (!strsrch->pattern.hasSuffixAccents) {

	1741 return FALSE;

	1742 }

	1743

	1744 UChar accents[INITIAL_ARRAY_SIZE_];

	1745 // offset to the last base character in substring to search

	1746 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);

	1747 // normalizing the offensive string

	1748 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,

	1749 0, accents, INITIAL_ARRAY_SIZE_, status);

	1750 // status checked in loop below

	1751

	1752 int32_t accentsindex[INITIAL_ARRAY_SIZE_];

	1753 int32_t size = getUnblockedAccentIndex(accents, accentsindex);

	1754

	1755 // 2 power n - 1 plus the full set of accents

	1756 int32_t count = (2 << (size - 1)) - 1;

	1757 while (U_SUCCESS(*status) && count > 0) {

	1758 UChar *rearrange = strsrch->canonicalSuffixAccents;

	1759 // copy the base characters

	1760 for (int k = 0; k < accentsindex[0]; k ++) {

	1761 *rearrange ++ = accents[k];

	1762 }

	1763 // forming all possible canonical rearrangement by dropping

	1764 // sets of accents

	1765 for (int i = 0; i <= size - 1; i ++) {

	1766 int32_t mask = 1 << (size - i - 1);

	1767 if (count & mask) {

	1768 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

	1769 *rearrange ++ = accents[j];

	1770 }

	1771 }

	1772 }

	1773 *rearrange = 0;

	1774 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,

	1775 status);

	1776 if (offset != USEARCH_DONE) {

	1777 return TRUE; // match found

	1778 }

	1779 count --;

	1780 }

	1781 return FALSE;

	1782 }

	1783

	1784 /**

	1785 * Gets the previous base character offset depending on the string search

	1786 * pattern data

	1787 * @param strsrch string search data

	1788 * @param textoffset current offset, current character

	1789 * @return the offset of the next character after this base character or itself

	1790 * if it is a composed character with accents

	1791 */

	1792 static

	1793 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,

	1794 int32_t textoffset)

	1795 {

	1796 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {

	1797 const UChar *text = strsrch->search->text;

	1798 int32_t offset = textoffset;

	1799 if (getFCD(text, &offset, strsrch->search->textLength) >>

	1800 SECOND_LAST_BYTE_SHIFT_) {

	1801 return getPreviousBaseOffset(text, textoffset);

	1802 }

	1803 }

	1804 return textoffset;

	1805 }

	1806

	1807 /**

	1808 * Checks match for contraction.

	1809 * If the match ends with a partial contraction we fail.

	1810 * If the match starts too far off (because of backwards iteration) we try to

	1811 * chip off the extra characters

	1812 * Internal method, status assumed to be success, caller has to check status

	1813 * before calling this method.

	1814 * @param strsrch string search data

	1815 * @param start offset of potential match, to be modified if necessary

	1816 * @param end offset of potential match, to be modified if necessary

	1817 * @param status output error status if any

	1818 * @return TRUE if match passes the contraction test, FALSE otherwise

	1819 */

	1820 static

	1821 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,

	1822 int32_t *start,

	1823 int32_t *end,

	1824 UErrorCode *status)

	1825 {

	1826 UCollationElements *coleiter = strsrch->textIter;

	1827 int32_t textlength = strsrch->search->textLength;

	1828 int32_t temp = *start;

	1829 const UCollator *collator = strsrch->collator;

	1830 const UChar *text = strsrch->search->text;

	1831 // This part checks if either ends of the match contains potential

	1832 // contraction. If so we'll have to iterate through them

	1833 if ((end < textlength && ucol_unsafeCP(text[end], collator)) \|\|

	1834 (*start + 1 < textlength

	1835 && ucol_unsafeCP(text[*start + 1], collator))) {

	1836 int32_t expansion = getExpansionPrefix(coleiter);

	1837 UBool expandflag = expansion > 0;

	1838 setColEIterOffset(coleiter, *start);

	1839 while (expansion > 0) {

	1840 // getting rid of the redundant ce, caused by setOffset.

	1841 // since backward contraction/expansion may have extra ces if we

	1842 // are in the normalization buffer, hasAccentsBeforeMatch would

	1843 // have taken care of it.

	1844 // E.g. the character \u01FA will have an expansion of 3, but if

	1845 // we are only looking for acute and ring \u030A and \u0301, we'll

	1846 // have to skip the first ce in the expansion buffer.

	1847 ucol_next(coleiter, status);

	1848 if (U_FAILURE(*status)) {

	1849 return FALSE;

	1850 }

	1851 if (ucol_getOffset(coleiter) != temp) {

	1852 *start = temp;

	1853 temp = ucol_getOffset(coleiter);

	1854 }

	1855 expansion --;

	1856 }

	1857

	1858 int32_t *patternce = strsrch->pattern.CE;

	1859 int32_t patterncelength = strsrch->pattern.CELength;

	1860 int32_t count = 0;

	1861 int32_t textlength = strsrch->search->textLength;

	1862 while (count < patterncelength) {

	1863 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));

	1864 // status checked below, note that if status is a failure

	1865 // ucol_next returns UCOL_NULLORDER

	1866 if (ce == UCOL_IGNORABLE) {

	1867 continue;

	1868 }

	1869 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {

	1870 *start = temp;

	1871 temp = ucol_getOffset(coleiter);

	1872 }

	1873

	1874 if (count == 0 && ce != patternce[0]) {

	1875 // accents may have extra starting ces, this occurs when a

	1876 // pure accent pattern is matched without rearrangement

	1877 // text \u0325\u0300 and looking for \u0300

	1878 int32_t expected = patternce[0];

	1879 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {

	1880 ce = getCE(strsrch, ucol_next(coleiter, status));

	1881 while (U_SUCCESS(*status) && ce != expected &&

	1882 ce != UCOL_NULLORDER &&

	1883 ucol_getOffset(coleiter) <= *end) {

	1884 ce = getCE(strsrch, ucol_next(coleiter, status));

	1885 }

	1886 }

	1887 }

	1888 if (U_FAILURE(*status) \|\| ce != patternce[count]) {

	1889 (*end) ++;

	1890 end = getNextUStringSearchBaseOffset(strsrch, end);

	1891 return FALSE;

	1892 }

	1893 count ++;

	1894 }

	1895 }

	1896 return TRUE;

	1897 }

	1898

	1899 /**

	1900 * Checks and sets the match information if found.

	1901 * Checks

	1902 * <ul>

	1903 * <li> the potential match does not repeat the previous match

	1904 * <li> boundaries are correct

	1905 * <li> potential match does not end in the middle of a contraction

	1906 * <li> identical matches

	1907 * <\ul>

	1908 * Otherwise the offset will be shifted to the next character.

	1909 * Internal method, status assumed to be success, caller has to check the

	1910 * status before calling this method.

	1911 * @param strsrch string search data

	1912 * @param textoffset offset in the collation element text. the returned value

	1913 * will be the truncated end offset of the match or the new start

	1914 * search offset.

	1915 * @param status output error status if any

	1916 * @return TRUE if the match is valid, FALSE otherwise

	1917 */

	1918 static

	1919 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,

	1920 int32_t *textoffset,

	1921 UErrorCode *status)

	1922 {

	1923 // to ensure that the start and ends are not composite characters

	1924 UCollationElements *coleiter = strsrch->textIter;

	1925 // if we have a canonical accent match

	1926 if ((strsrch->pattern.hasSuffixAccents &&

	1927 strsrch->canonicalSuffixAccents[0]) \|\|

	1928 (strsrch->pattern.hasPrefixAccents &&

	1929 strsrch->canonicalPrefixAccents[0])) {

	1930 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(

	1931 strsrch,

	1932 ucol_getOffset(coleiter));

	1933 strsrch->search->matchedLength = *textoffset -

	1934 strsrch->search->matchedIndex;

	1935 return TRUE;

	1936 }

	1937

	1938 int32_t start = getColElemIterOffset(coleiter, FALSE);

	1939 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,

	1940 status) \|\| U_FAILURE(*status)) {

	1941 return FALSE;

	1942 }

	1943

	1944 start = getPreviousUStringSearchBaseOffset(strsrch, start);

	1945 // this totally matches, however we need to check if it is repeating

	1946 if (checkRepeatedMatch(strsrch, start, *textoffset) \|\|

	1947 !isBreakUnit(strsrch, start, *textoffset) \|\|

	1948 !checkIdentical(strsrch, start, *textoffset)) {

	1949 (*textoffset) ++;

	1950 textoffset = getNextBaseOffset(strsrch->search->text, textoffset,

	1951 strsrch->search->textLength);

	1952 return FALSE;

	1953 }

	1954

	1955 strsrch->search->matchedIndex = start;

	1956 strsrch->search->matchedLength = *textoffset - start;

	1957 return TRUE;

	1958 }

	1959

	1960 /**

	1961 * Shifting the collation element iterator position forward to prepare for

	1962 * a preceding match. If the first character is a unsafe character, we'll only

	1963 * shift by 1 to capture contractions, normalization etc.

	1964 * Internal method, status assumed to be success, caller has to check status

	1965 * before calling this method.

	1966 * @param text strsrch string search data

	1967 * @param textoffset start text position to do search

	1968 * @param ce the text ce which failed the match.

	1969 * @param patternceindex index of the ce within the pattern ce buffer which

	1970 * failed the match

	1971 * @return final offset

	1972 */

	1973 static

	1974 inline int32_t reverseShift(UStringSearch *strsrch,

	1975 int32_t textoffset,

	1976 int32_t ce,

	1977 int32_t patternceindex)

	1978 {

	1979 if (strsrch->search->isOverlap) {

	1980 if (textoffset != strsrch->search->textLength) {

	1981 textoffset --;

	1982 }

	1983 else {

	1984 textoffset -= strsrch->pattern.defaultShiftSize;

	1985 }

	1986 }

	1987 else {

	1988 if (ce != UCOL_NULLORDER) {

	1989 int32_t shift = strsrch->pattern.backShift[hash(ce)];

	1990

	1991 // this is to adjust for characters in the middle of the substring

	1992 // for matching that failed.

	1993 int32_t adjust = patternceindex;

	1994 if (adjust > 1 && shift > adjust) {

	1995 shift -= adjust - 1;

	1996 }

	1997 textoffset -= shift;

	1998 }

	1999 else {

	2000 textoffset -= strsrch->pattern.defaultShiftSize;

	2001 }

	2002 }

	2003 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);

	2004 return textoffset;

	2005 }

	2006

	2007 /**

	2008 * Checks match for contraction.

	2009 * If the match starts with a partial contraction we fail.

	2010 * Internal method, status assumed to be success, caller has to check status

	2011 * before calling this method.

	2012 * @param strsrch string search data

	2013 * @param start offset of potential match, to be modified if necessary

	2014 * @param end offset of potential match, to be modified if necessary

	2015 * @param status output error status if any

	2016 * @return TRUE if match passes the contraction test, FALSE otherwise

	2017 */

	2018 static

	2019 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,

	2020 int32_t *start,

	2021 int32_t end, UErrorCode status)

	2022 {

	2023 UCollationElements *coleiter = strsrch->textIter;

	2024 int32_t textlength = strsrch->search->textLength;

	2025 int32_t temp = *end;

	2026 const UCollator *collator = strsrch->collator;

	2027 const UChar *text = strsrch->search->text;

	2028 // This part checks if either if the start of the match contains potential

	2029 // contraction. If so we'll have to iterate through them

	2030 // Since we used ucol_next while previously looking for the potential

	2031 // match, this guarantees that our end will not be a partial contraction,

	2032 // or a partial supplementary character.

	2033 if (start < textlength && ucol_unsafeCP(text[start], collator)) {

	2034 int32_t expansion = getExpansionSuffix(coleiter);

	2035 UBool expandflag = expansion > 0;

	2036 setColEIterOffset(coleiter, *end);

	2037 while (U_SUCCESS(*status) && expansion > 0) {

	2038 // getting rid of the redundant ce

	2039 // since forward contraction/expansion may have extra ces

	2040 // if we are in the normalization buffer, hasAccentsBeforeMatch

	2041 // would have taken care of it.

	2042 // E.g. the character \u01FA will have an expansion of 3, but if

	2043 // we are only looking for A ring A\u030A, we'll have to skip the

	2044 // last ce in the expansion buffer

	2045 ucol_previous(coleiter, status);

	2046 if (U_FAILURE(*status)) {

	2047 return FALSE;

	2048 }

	2049 if (ucol_getOffset(coleiter) != temp) {

	2050 *end = temp;

	2051 temp = ucol_getOffset(coleiter);

	2052 }

	2053 expansion --;

	2054 }

	2055

	2056 int32_t *patternce = strsrch->pattern.CE;

	2057 int32_t patterncelength = strsrch->pattern.CELength;

	2058 int32_t count = patterncelength;

	2059 while (count > 0) {

	2060 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));

	2061 // status checked below, note that if status is a failure

	2062 // ucol_previous returns UCOL_NULLORDER

	2063 if (ce == UCOL_IGNORABLE) {

	2064 continue;

	2065 }

	2066 if (expandflag && count == 0 &&

	2067 getColElemIterOffset(coleiter, FALSE) != temp) {

	2068 *end = temp;

	2069 temp = ucol_getOffset(coleiter);

	2070 }

	2071 if (U_FAILURE(*status) \|\| ce != patternce[count - 1]) {

	2072 (*start) --;

	2073 start = getPreviousBaseOffset(text, start);

	2074 return FALSE;

	2075 }

	2076 count --;

	2077 }

	2078 }

	2079 return TRUE;

	2080 }

	2081

	2082 /**

	2083 * Checks and sets the match information if found.

	2084 * Checks

	2085 * <ul>

	2086 * <li> the current match does not repeat the last match

	2087 * <li> boundaries are correct

	2088 * <li> exact matches has no extra accents

	2089 * <li> identical matches

	2090 * <\ul>

	2091 * Otherwise the offset will be shifted to the preceding character.

	2092 * Internal method, status assumed to be success, caller has to check status

	2093 * before calling this method.

	2094 * @param strsrch string search data

	2095 * @param collator

	2096 * @param coleiter collation element iterator

	2097 * @param text string

	2098 * @param textoffset offset in the collation element text. the returned value

	2099 * will be the truncated start offset of the match or the new start

	2100 * search offset.

	2101 * @param status output error status if any

	2102 * @return TRUE if the match is valid, FALSE otherwise

	2103 */

	2104 static

	2105 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,

	2106 int32_t *textoffset,

	2107 UErrorCode *status)

	2108 {

	2109 // to ensure that the start and ends are not composite characters

	2110 int32_t end = ucol_getOffset(strsrch->textIter);

	2111 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)

	2112 \|\| U_FAILURE(*status)) {

	2113 return FALSE;

	2114 }

	2115

	2116 // this totally matches, however we need to check if it is repeating

	2117 // the old match

	2118 if (checkRepeatedMatch(strsrch, *textoffset, end) \|\|

	2119 !isBreakUnit(strsrch, *textoffset, end) \|\|

	2120 hasAccentsBeforeMatch(strsrch, *textoffset, end) \|\|

	2121 !checkIdentical(strsrch, *textoffset, end) \|\|

	2122 hasAccentsAfterMatch(strsrch, *textoffset, end)) {

	2123 (*textoffset) --;

	2124 *textoffset = getPreviousBaseOffset(strsrch->search->text,

	2125 *textoffset);

	2126 return FALSE;

	2127 }

	2128

	2129 //Add breakiterator boundary check for primary strength search.

	2130 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {

	2131 checkBreakBoundary(strsrch, textoffset, &end);

	2132 }

	2133

	2134 strsrch->search->matchedIndex = *textoffset;

	2135 strsrch->search->matchedLength = end - *textoffset;

	2136 return TRUE;

	2137 }

	2138

	2139 /**

	2140 * Rearranges the end accents to try matching.

	2141 * Suffix accents in the text will be grouped according to their combining

	2142 * class and the groups will be mixed and matched to try find the perfect

	2143 * match with the pattern.

	2144 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

	2145 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

	2146 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

	2147 * "\u0301\u0325".

	2148 * step 2: check if any of the generated substrings matches the pattern.

	2149 * Internal method, status assumed to be success, user has to check status

	2150 * before calling this method.

	2151 * @param strsrch string search match

	2152 * @param start offset of the first base character

	2153 * @param end start of the last accent set

	2154 * @param status only error status if any

	2155 * @return USEARCH_DONE if a match is not found, otherwise return the ending

	2156 * offset of the match. Note this start includes all following accents.

	2157 */

	2158 static

	2159 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,

	2160 int32_t start,

	2161 int32_t end,

	2162 UErrorCode *status)

	2163 {

	2164 const UChar *text = strsrch->search->text;

	2165 int32_t tempend = end;

	2166

	2167 UTF_BACK_1(text, 0, tempend);

	2168 if (!(getFCD(text, &tempend, strsrch->search->textLength) &

	2169 LAST_BYTE_MASK_)) {

	2170 // die... failed at a base character

	2171 return USEARCH_DONE;

	2172 }

	2173 end = getNextBaseOffset(text, end, strsrch->search->textLength);

	2174

	2175 if (U_SUCCESS(*status)) {

	2176 UChar accents[INITIAL_ARRAY_SIZE_];

	2177 int32_t offset = getPreviousBaseOffset(text, end);

	2178 // normalizing the offensive string

	2179 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,

	2180 INITIAL_ARRAY_SIZE_, status);

	2181

	2182 int32_t accentsindex[INITIAL_ARRAY_SIZE_];

	2183 int32_t accentsize = getUnblockedAccentIndex(accents,

	2184 accentsindex);

	2185 int32_t count = (2 << (accentsize - 1)) - 1;

	2186 UChar buffer[INITIAL_ARRAY_SIZE_];

	2187 UCollationElements *coleiter = strsrch->utilIter;

	2188 while (U_SUCCESS(*status) && count > 0) {

	2189 UChar *rearrange = strsrch->canonicalSuffixAccents;

	2190 // copy the base characters

	2191 for (int k = 0; k < accentsindex[0]; k ++) {

	2192 *rearrange ++ = accents[k];

	2193 }

	2194 // forming all possible canonical rearrangement by dropping

	2195 // sets of accents

	2196 for (int i = 0; i <= accentsize - 1; i ++) {

	2197 int32_t mask = 1 << (accentsize - i - 1);

	2198 if (count & mask) {

	2199 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

	2200 *rearrange ++ = accents[j];

	2201 }

	2202 }

	2203 }

	2204 *rearrange = 0;

	2205 int32_t matchsize = INITIAL_ARRAY_SIZE_;

	2206 UChar *match = addToUCharArray(buffer, &matchsize,

	2207 strsrch->canonicalPrefixAccents,

	2208 strsrch->search->text + start,

	2209 offset - start,

	2210 strsrch->canonicalSuffixAccents,

	2211 status);

	2212

	2213 // run the collator iterator through this match

	2214 // if status is a failure ucol_setText does nothing

	2215 ucol_setText(coleiter, match, matchsize, status);

	2216 if (U_SUCCESS(*status)) {

	2217 if (checkCollationMatch(strsrch, coleiter)) {

	2218 if (match != buffer) {

	2219 uprv_free(match);

	2220 }

	2221 return end;

	2222 }

	2223 }

	2224 count --;

	2225 }

	2226 }

	2227 return USEARCH_DONE;

	2228 }

	2229

	2230 /**

	2231 * Take the rearranged start accents and tries matching. If match failed at

	2232 * a seperate following set of accents (seperated from the rearranged on by

	2233 * at least a base character) then we rearrange the preceding accents and

	2234 * tries matching again.

	2235 * We allow skipping of the ends of the accent set if the ces do not match.

	2236 * However if the failure is found before the accent set, it fails.

	2237 * Internal method, status assumed to be success, caller has to check status

	2238 * before calling this method.

	2239 * @param strsrch string search data

	2240 * @param textoffset of the ends of the rearranged accent

	2241 * @param status output error status if any

	2242 * @return USEARCH_DONE if a match is not found, otherwise return the ending

	2243 * offset of the match. Note this start includes all following accents.

	2244 */

	2245 static

	2246 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,

	2247 int32_t textoffset,

	2248 UErrorCode *status)

	2249 {

	2250 const UChar *text = strsrch->search->text;

	2251 const UCollator *collator = strsrch->collator;

	2252 int32_t safelength = 0;

	2253 UChar *safetext;

	2254 int32_t safetextlength;

	2255 UChar safebuffer[INITIAL_ARRAY_SIZE_];

	2256 int32_t safeoffset = textoffset;

	2257

	2258 if (textoffset &&

	2259 ucol_unsafeCP(strsrch->canonicalPrefixAccents[

	2260 u_strlen(strsrch->canonicalPrefixAccents) - 1

	2261 ], collator)) {

	2262 safeoffset = getNextSafeOffset(collator, text, textoffset,

	2263 strsrch->search->textLength);

	2264 safelength = safeoffset - textoffset;

	2265 safetextlength = INITIAL_ARRAY_SIZE_;

	2266 safetext = addToUCharArray(safebuffer, &safetextlength,

	2267 strsrch->canonicalPrefixAccents,

	2268 text + textoffset, safelength,

	2269 NULL, status);

	2270 }

	2271 else {

	2272 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);

	2273 safetext = strsrch->canonicalPrefixAccents;

	2274 }

	2275

	2276 UCollationElements *coleiter = strsrch->utilIter;

	2277 // if status is a failure, ucol_setText does nothing

	2278 ucol_setText(coleiter, safetext, safetextlength, status);

	2279 // status checked in loop below

	2280

	2281 int32_t *ce = strsrch->pattern.CE;

	2282 int32_t celength = strsrch->pattern.CELength;

	2283 int ceindex = 0;

	2284 UBool isSafe = TRUE; // safe zone indication flag for position

	2285 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);

	2286

	2287 while (ceindex < celength) {

	2288 int32_t textce = ucol_next(coleiter, status);

	2289 if (U_FAILURE(*status)) {

	2290 if (isSafe) {

	2291 cleanUpSafeText(strsrch, safetext, safebuffer);

	2292 }

	2293 return USEARCH_DONE;

	2294 }

	2295 if (textce == UCOL_NULLORDER) {

	2296 // check if we have passed the safe buffer

	2297 if (coleiter == strsrch->textIter) {

	2298 cleanUpSafeText(strsrch, safetext, safebuffer);

	2299 return USEARCH_DONE;

	2300 }

	2301 cleanUpSafeText(strsrch, safetext, safebuffer);

	2302 safetext = safebuffer;

	2303 coleiter = strsrch->textIter;

	2304 setColEIterOffset(coleiter, safeoffset);

	2305 // status checked at the start of the loop

	2306 isSafe = FALSE;

	2307 continue;

	2308 }

	2309 textce = getCE(strsrch, textce);

	2310 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {

	2311 // do the beginning stuff

	2312 int32_t failedoffset = ucol_getOffset(coleiter);

	2313 if (isSafe && failedoffset <= prefixlength) {

	2314 // alas... no hope. failed at rearranged accent set

	2315 cleanUpSafeText(strsrch, safetext, safebuffer);

	2316 return USEARCH_DONE;

	2317 }

	2318 else {

	2319 if (isSafe) {

	2320 failedoffset = safeoffset - failedoffset;

	2321 cleanUpSafeText(strsrch, safetext, safebuffer);

	2322 }

	2323

	2324 // try rearranging the end accents

	2325 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,

	2326 textoffset, failedoffset, status);

	2327 if (result != USEARCH_DONE) {

	2328 // if status is a failure, ucol_setOffset does nothing

	2329 setColEIterOffset(strsrch->textIter, result);

	2330 }

	2331 if (U_FAILURE(*status)) {

	2332 return USEARCH_DONE;

	2333 }

	2334 return result;

	2335 }

	2336 }

	2337 if (textce == ce[ceindex]) {

	2338 ceindex ++;

	2339 }

	2340 }

	2341 // set offset here

	2342 if (isSafe) {

	2343 int32_t result = ucol_getOffset(coleiter);

	2344 // sets the text iterator here with the correct expansion and offset

	2345 int32_t leftoverces = getExpansionSuffix(coleiter);

	2346 cleanUpSafeText(strsrch, safetext, safebuffer);

	2347 if (result <= prefixlength) {

	2348 result = textoffset;

	2349 }

	2350 else {

	2351 result = textoffset + (safeoffset - result);

	2352 }

	2353 setColEIterOffset(strsrch->textIter, result);

	2354 setExpansionSuffix(strsrch->textIter, leftoverces);

	2355 return result;

	2356 }

	2357

	2358 return ucol_getOffset(coleiter);

	2359 }

	2360

	2361 /**

	2362 * Trying out the substring and sees if it can be a canonical match.

	2363 * This will try normalizing the starting accents and arranging them into

	2364 * canonical equivalents and check their corresponding ces with the pattern ce.

	2365 * Prefix accents in the text will be grouped according to their combining

	2366 * class and the groups will be mixed and matched to try find the perfect

	2367 * match with the pattern.

	2368 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

	2369 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

	2370 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

	2371 * "\u0301\u0325".

	2372 * step 2: check if any of the generated substrings matches the pattern.

	2373 * Internal method, status assumed to be success, caller has to check status

	2374 * before calling this method.

	2375 * @param strsrch string search data

	2376 * @param textoffset start offset in the collation element text that starts

	2377 * with the accents to be rearranged

	2378 * @param status output error status if any

	2379 * @return TRUE if the match is valid, FALSE otherwise

	2380 */

	2381 static

	2382 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,

	2383 int32_t textoffset,

	2384 UErrorCode *status)

	2385 {

	2386 const UChar *text = strsrch->search->text;

	2387 int32_t temp = textoffset;

	2388 int32_t textlength = strsrch->search->textLength;

	2389 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {

	2390 UCollationElements *coleiter = strsrch->textIter;

	2391 int32_t offset = ucol_getOffset(coleiter);

	2392 if (strsrch->pattern.hasSuffixAccents) {

	2393 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,

	2394 offset, status);

	2395 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {

	2396 setColEIterOffset(coleiter, offset);

	2397 return TRUE;

	2398 }

	2399 }

	2400 return FALSE;

	2401 }

	2402

	2403 if (!strsrch->pattern.hasPrefixAccents) {

	2404 return FALSE;

	2405 }

	2406

	2407 UChar accents[INITIAL_ARRAY_SIZE_];

	2408 // offset to the last base character in substring to search

	2409 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);

	2410 // normalizing the offensive string

	2411 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,

	2412 0, accents, INITIAL_ARRAY_SIZE_, status);

	2413 // status checked in loop

	2414

	2415 int32_t accentsindex[INITIAL_ARRAY_SIZE_];

	2416 int32_t size = getUnblockedAccentIndex(accents, accentsindex);

	2417

	2418 // 2 power n - 1 plus the full set of accents

	2419 int32_t count = (2 << (size - 1)) - 1;

	2420 while (U_SUCCESS(*status) && count > 0) {

	2421 UChar *rearrange = strsrch->canonicalPrefixAccents;

	2422 // copy the base characters

	2423 for (int k = 0; k < accentsindex[0]; k ++) {

	2424 *rearrange ++ = accents[k];

	2425 }

	2426 // forming all possible canonical rearrangement by dropping

	2427 // sets of accents

	2428 for (int i = 0; i <= size - 1; i ++) {

	2429 int32_t mask = 1 << (size - i - 1);

	2430 if (count & mask) {

	2431 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

	2432 *rearrange ++ = accents[j];

	2433 }

	2434 }

	2435 }

	2436 *rearrange = 0;

	2437 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,

	2438 baseoffset, status);

	2439 if (offset != USEARCH_DONE) {

	2440 return TRUE; // match found

	2441 }

	2442 count --;

	2443 }

	2444 return FALSE;

	2445 }

	2446

	2447 /**

	2448 * Checks match for contraction.

	2449 * If the match starts with a partial contraction we fail.

	2450 * Internal method, status assumed to be success, caller has to check status

	2451 * before calling this method.

	2452 * @param strsrch string search data

	2453 * @param start offset of potential match, to be modified if necessary

	2454 * @param end offset of potential match, to be modified if necessary

	2455 * @param status only error status if any

	2456 * @return TRUE if match passes the contraction test, FALSE otherwise

	2457 */

	2458 static

	2459 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,

	2460 int32_t *start,

	2461 int32_t end, UErrorCode status)

	2462 {

	2463 UCollationElements *coleiter = strsrch->textIter;

	2464 int32_t textlength = strsrch->search->textLength;

	2465 int32_t temp = *end;

	2466 const UCollator *collator = strsrch->collator;

	2467 const UChar *text = strsrch->search->text;

	2468 // This part checks if either if the start of the match contains potential

	2469 // contraction. If so we'll have to iterate through them

	2470 // Since we used ucol_next while previously looking for the potential

	2471 // match, this guarantees that our end will not be a partial contraction,

	2472 // or a partial supplementary character.

	2473 if (start < textlength && ucol_unsafeCP(text[start], collator)) {

	2474 int32_t expansion = getExpansionSuffix(coleiter);

	2475 UBool expandflag = expansion > 0;

	2476 setColEIterOffset(coleiter, *end);

	2477 while (expansion > 0) {

	2478 // getting rid of the redundant ce

	2479 // since forward contraction/expansion may have extra ces

	2480 // if we are in the normalization buffer, hasAccentsBeforeMatch

	2481 // would have taken care of it.

	2482 // E.g. the character \u01FA will have an expansion of 3, but if

	2483 // we are only looking for A ring A\u030A, we'll have to skip the

	2484 // last ce in the expansion buffer

	2485 ucol_previous(coleiter, status);

	2486 if (U_FAILURE(*status)) {

	2487 return FALSE;

	2488 }

	2489 if (ucol_getOffset(coleiter) != temp) {

	2490 *end = temp;

	2491 temp = ucol_getOffset(coleiter);

	2492 }

	2493 expansion --;

	2494 }

	2495

	2496 int32_t *patternce = strsrch->pattern.CE;

	2497 int32_t patterncelength = strsrch->pattern.CELength;

	2498 int32_t count = patterncelength;

	2499 while (count > 0) {

	2500 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));

	2501 // status checked below, note that if status is a failure

	2502 // ucol_previous returns UCOL_NULLORDER

	2503 if (ce == UCOL_IGNORABLE) {

	2504 continue;

	2505 }

	2506 if (expandflag && count == 0 &&

	2507 getColElemIterOffset(coleiter, FALSE) != temp) {

	2508 *end = temp;

	2509 temp = ucol_getOffset(coleiter);

	2510 }

	2511 if (count == patterncelength &&

	2512 ce != patternce[patterncelength - 1]) {

	2513 // accents may have extra starting ces, this occurs when a

	2514 // pure accent pattern is matched without rearrangement

	2515 int32_t expected = patternce[patterncelength - 1];

	2516 UTF_BACK_1(text, 0, *end);

	2517 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {

	2518 ce = getCE(strsrch, ucol_previous(coleiter, status));

	2519 while (U_SUCCESS(*status) && ce != expected &&

	2520 ce != UCOL_NULLORDER &&

	2521 ucol_getOffset(coleiter) <= *start) {

	2522 ce = getCE(strsrch, ucol_previous(coleiter, status));

	2523 }

	2524 }

	2525 }

	2526 if (U_FAILURE(*status) \|\| ce != patternce[count - 1]) {

	2527 (*start) --;

	2528 start = getPreviousBaseOffset(text, start);

	2529 return FALSE;

	2530 }

	2531 count --;

	2532 }

	2533 }

	2534 return TRUE;

	2535 }

	2536

	2537 /**

	2538 * Checks and sets the match information if found.

	2539 * Checks

	2540 * <ul>

	2541 * <li> the potential match does not repeat the previous match

	2542 * <li> boundaries are correct

	2543 * <li> potential match does not end in the middle of a contraction

	2544 * <li> identical matches

	2545 * <\ul>

	2546 * Otherwise the offset will be shifted to the next character.

	2547 * Internal method, status assumed to be success, caller has to check status

	2548 * before calling this method.

	2549 * @param strsrch string search data

	2550 * @param textoffset offset in the collation element text. the returned value

	2551 * will be the truncated start offset of the match or the new start

	2552 * search offset.

	2553 * @param status only error status if any

	2554 * @return TRUE if the match is valid, FALSE otherwise

	2555 */

	2556 static

	2557 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,

	2558 int32_t *textoffset,

	2559 UErrorCode *status)

	2560 {

	2561 // to ensure that the start and ends are not composite characters

	2562 UCollationElements *coleiter = strsrch->textIter;

	2563 // if we have a canonical accent match

	2564 if ((strsrch->pattern.hasSuffixAccents &&

	2565 strsrch->canonicalSuffixAccents[0]) \|\|

	2566 (strsrch->pattern.hasPrefixAccents &&

	2567 strsrch->canonicalPrefixAccents[0])) {

	2568 strsrch->search->matchedIndex = *textoffset;

	2569 strsrch->search->matchedLength =

	2570 getNextUStringSearchBaseOffset(strsrch,

	2571 getColElemIterOffset(coleiter, FALSE))

	2572 - *textoffset;

	2573 return TRUE;

	2574 }

	2575

	2576 int32_t end = ucol_getOffset(coleiter);

	2577 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,

	2578 status) \|\|

	2579 U_FAILURE(*status)) {

	2580 return FALSE;

	2581 }

	2582

	2583 end = getNextUStringSearchBaseOffset(strsrch, end);

	2584 // this totally matches, however we need to check if it is repeating

	2585 if (checkRepeatedMatch(strsrch, *textoffset, end) \|\|

	2586 !isBreakUnit(strsrch, *textoffset, end) \|\|

	2587 !checkIdentical(strsrch, *textoffset, end)) {

	2588 (*textoffset) --;

	2589 *textoffset = getPreviousBaseOffset(strsrch->search->text,

	2590 *textoffset);

	2591 return FALSE;

	2592 }

	2593

	2594 strsrch->search->matchedIndex = *textoffset;

	2595 strsrch->search->matchedLength = end - *textoffset;

	2596 return TRUE;

	2597 }

	2598 #endif // #if BOYER_MOORE

	2599

	2600 // constructors and destructor -------------------------------------------

	2601

	2602 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,

	2603 int32_t patternlength,

	2604 const UChar *text,

	2605 int32_t textlength,

	2606 const char *locale,

	2607 UBreakIterator *breakiter,

	2608 UErrorCode *status)

	2609 {

	2610 if (U_FAILURE(*status)) {

	2611 return NULL;

	2612 }

	2613 #if UCONFIG_NO_BREAK_ITERATION

	2614 if (breakiter != NULL) {

	2615 *status = U_UNSUPPORTED_ERROR;

	2616 return NULL;

	2617 }

	2618 #endif

	2619 if (locale) {

	2620 // ucol_open internally checks for status

	2621 UCollator *collator = ucol_open(locale, status);

	2622 // pattern, text checks are done in usearch_openFromCollator

	2623 UStringSearch *result = usearch_openFromCollator(pattern,

	2624 patternlength, text, textlength,

	2625 collator, breakiter, status);

	2626

	2627 if (result == NULL \|\| U_FAILURE(*status)) {

	2628 if (collator) {

	2629 ucol_close(collator);

	2630 }

	2631 return NULL;

	2632 }

	2633 else {

	2634 result->ownCollator = TRUE;

	2635 }

	2636 return result;

	2637 }

	2638 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2639 return NULL;

	2640 }

	2641

	2642 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(

	2643 const UChar *pattern,

	2644 int32_t patternlength,

	2645 const UChar *text,

	2646 int32_t textlength,

	2647 const UCollator *collator,

	2648 UBreakIterator *breakiter,

	2649 UErrorCode *status)

	2650 {

	2651 if (U_FAILURE(*status)) {

	2652 return NULL;

	2653 }

	2654 #if UCONFIG_NO_BREAK_ITERATION

	2655 if (breakiter != NULL) {

	2656 *status = U_UNSUPPORTED_ERROR;

	2657 return NULL;

	2658 }

	2659 #endif

	2660 if (pattern == NULL \|\| text == NULL \|\| collator == NULL) {

	2661 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2662 return NULL;

	2663 }

	2664

	2665 // string search does not really work when numeric collation is turned on

	2666 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {

	2667 *status = U_UNSUPPORTED_ERROR;

	2668 return NULL;

	2669 }

	2670

	2671 if (U_SUCCESS(*status)) {

	2672 initializeFCD(status);

	2673 if (U_FAILURE(*status)) {

	2674 return NULL;

	2675 }

	2676

	2677 UStringSearch *result;

	2678 if (textlength == -1) {

	2679 textlength = u_strlen(text);

	2680 }

	2681 if (patternlength == -1) {

	2682 patternlength = u_strlen(pattern);

	2683 }

	2684 if (textlength <= 0 \|\| patternlength <= 0) {

	2685 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2686 return NULL;

	2687 }

	2688

	2689 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));

	2690 if (result == NULL) {

	2691 *status = U_MEMORY_ALLOCATION_ERROR;

	2692 return NULL;

	2693 }

	2694

	2695 result->collator = collator;

	2696 result->strength = ucol_getStrength(collator);

	2697 result->ceMask = getMask(result->strength);

	2698 result->toShift =

	2699 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==

	2700 UCOL_SHIFTED;

	2701 result->variableTop = ucol_getVariableTop(collator, status);

	2702

	2703 result->nfd = Normalizer2Factory::getNFDInstance(*status);

	2704

	2705 if (U_FAILURE(*status)) {

	2706 uprv_free(result);

	2707 return NULL;

	2708 }

	2709

	2710 result->search = (USearch *)uprv_malloc(sizeof(USearch));

	2711 if (result->search == NULL) {

	2712 *status = U_MEMORY_ALLOCATION_ERROR;

	2713 uprv_free(result);

	2714 return NULL;

	2715 }

	2716

	2717 result->search->text = text;

	2718 result->search->textLength = textlength;

	2719

	2720 result->pattern.text = pattern;

	2721 result->pattern.textLength = patternlength;

	2722 result->pattern.CE = NULL;

	2723 result->pattern.PCE = NULL;

	2724

	2725 result->search->breakIter = breakiter;

	2726 #if !UCONFIG_NO_BREAK_ITERATION

	2727 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLo caleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, statu s);

	2728 if (breakiter) {

	2729 ubrk_setText(breakiter, text, textlength, status);

	2730 }

	2731 #endif

	2732

	2733 result->ownCollator = FALSE;

	2734 result->search->matchedLength = 0;

	2735 result->search->matchedIndex = USEARCH_DONE;

	2736 result->utilIter = NULL;

	2737 result->textIter = ucol_openElements(collator, text,

	2738 textlength, status);

	2739 if (U_FAILURE(*status)) {

	2740 usearch_close(result);

	2741 return NULL;

	2742 }

	2743

	2744 result->search->isOverlap = FALSE;

	2745 result->search->isCanonicalMatch = FALSE;

	2746 result->search->elementComparisonType = 0;

	2747 result->search->isForwardSearching = TRUE;

	2748 result->search->reset = TRUE;

	2749

	2750 initialize(result, status);

	2751

	2752 if (U_FAILURE(*status)) {

	2753 usearch_close(result);

	2754 return NULL;

	2755 }

	2756

	2757 return result;

	2758 }

	2759 return NULL;

	2760 }

	2761

	2762 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)

	2763 {

	2764 if (strsrch) {

	2765 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&

	2766 strsrch->pattern.CE) {

	2767 uprv_free(strsrch->pattern.CE);

	2768 }

	2769

	2770 if (strsrch->pattern.PCE != NULL &&

	2771 strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {

	2772 uprv_free(strsrch->pattern.PCE);

	2773 }

	2774

	2775 ucol_closeElements(strsrch->textIter);

	2776 ucol_closeElements(strsrch->utilIter);

	2777

	2778 if (strsrch->ownCollator && strsrch->collator) {

	2779 ucol_close((UCollator *)strsrch->collator);

	2780 }

	2781

	2782 #if !UCONFIG_NO_BREAK_ITERATION

	2783 if (strsrch->search->internalBreakIter) {

	2784 ubrk_close(strsrch->search->internalBreakIter);

	2785 }

	2786 #endif

	2787

	2788 uprv_free(strsrch->search);

	2789 uprv_free(strsrch);

	2790 }

	2791 }

	2792

	2793 // set and get methods --------------------------------------------------

	2794

	2795 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,

	2796 int32_t position,

	2797 UErrorCode *status)

	2798 {

	2799 if (U_SUCCESS(*status) && strsrch) {

	2800 if (isOutOfBounds(strsrch->search->textLength, position)) {

	2801 *status = U_INDEX_OUTOFBOUNDS_ERROR;

	2802 }

	2803 else {

	2804 setColEIterOffset(strsrch->textIter, position);

	2805 }

	2806 strsrch->search->matchedIndex = USEARCH_DONE;

	2807 strsrch->search->matchedLength = 0;

	2808 strsrch->search->reset = FALSE;

	2809 }

	2810 }

	2811

	2812 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)

	2813 {

	2814 if (strsrch) {

	2815 int32_t result = ucol_getOffset(strsrch->textIter);

	2816 if (isOutOfBounds(strsrch->search->textLength, result)) {

	2817 return USEARCH_DONE;

	2818 }

	2819 return result;

	2820 }

	2821 return USEARCH_DONE;

	2822 }

	2823

	2824 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,

	2825 USearchAttribute attribute,

	2826 USearchAttributeValue value,

	2827 UErrorCode *status)

	2828 {

	2829 if (U_SUCCESS(*status) && strsrch) {

	2830 switch (attribute)

	2831 {

	2832 case USEARCH_OVERLAP :

	2833 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);

	2834 break;

	2835 case USEARCH_CANONICAL_MATCH :

	2836 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :

	2837 FALSE);

	2838 break;

	2839 case USEARCH_ELEMENT_COMPARISON :

	2840 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD \|\| value == USE ARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {

	2841 strsrch->search->elementComparisonType = (int16_t)value;

	2842 } else {

	2843 strsrch->search->elementComparisonType = 0;

	2844 }

	2845 break;

	2846 case USEARCH_ATTRIBUTE_COUNT :

	2847 default:

	2848 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2849 }

	2850 }

	2851 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {

	2852 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2853 }

	2854 }

	2855

	2856 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(

	2857 const UStringSearch *strsrch,

	2858 USearchAttribute attribute)

	2859 {

	2860 if (strsrch) {

	2861 switch (attribute) {

	2862 case USEARCH_OVERLAP :

	2863 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :

	2864 USEARCH_OFF);

	2865 case USEARCH_CANONICAL_MATCH :

	2866 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :

	2867 USEARCH_OFF);

	2868 case USEARCH_ELEMENT_COMPARISON :

	2869 {

	2870 int16_t value = strsrch->search->elementComparisonType;

	2871 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD \|\| value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {

	2872 return (USearchAttributeValue)value;

	2873 } else {

	2874 return USEARCH_STANDARD_ELEMENT_COMPARISON;

	2875 }

	2876 }

	2877 case USEARCH_ATTRIBUTE_COUNT :

	2878 return USEARCH_DEFAULT;

	2879 }

	2880 }

	2881 return USEARCH_DEFAULT;

	2882 }

	2883

	2884 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(

	2885 const UStringSearch *strsrch)

	2886 {

	2887 if (strsrch == NULL) {

	2888 return USEARCH_DONE;

	2889 }

	2890 return strsrch->search->matchedIndex;

	2891 }

	2892

	2893

	2894 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,

	2895 UChar *result,

	2896 int32_t resultCapacity,

	2897 UErrorCode *status)

	2898 {

	2899 if (U_FAILURE(*status)) {

	2900 return USEARCH_DONE;

	2901 }

	2902 if (strsrch == NULL \|\| resultCapacity < 0 \|\| (resultCapacity > 0 &&

	2903 result == NULL)) {

	2904 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2905 return USEARCH_DONE;

	2906 }

	2907

	2908 int32_t copylength = strsrch->search->matchedLength;

	2909 int32_t copyindex = strsrch->search->matchedIndex;

	2910 if (copyindex == USEARCH_DONE) {

	2911 u_terminateUChars(result, resultCapacity, 0, status);

	2912 return USEARCH_DONE;

	2913 }

	2914

	2915 if (resultCapacity < copylength) {

	2916 copylength = resultCapacity;

	2917 }

	2918 if (copylength > 0) {

	2919 uprv_memcpy(result, strsrch->search->text + copyindex,

	2920 copylength * sizeof(UChar));

	2921 }

	2922 return u_terminateUChars(result, resultCapacity,

	2923 strsrch->search->matchedLength, status);

	2924 }

	2925

	2926 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(

	2927 const UStringSearch *strsrch)

	2928 {

	2929 if (strsrch) {

	2930 return strsrch->search->matchedLength;

	2931 }

	2932 return USEARCH_DONE;

	2933 }

	2934

	2935 #if !UCONFIG_NO_BREAK_ITERATION

	2936

	2937 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,

	2938 UBreakIterator *breakiter,

	2939 UErrorCode *status)

	2940 {

	2941 if (U_SUCCESS(*status) && strsrch) {

	2942 strsrch->search->breakIter = breakiter;

	2943 if (breakiter) {

	2944 ubrk_setText(breakiter, strsrch->search->text,

	2945 strsrch->search->textLength, status);

	2946 }

	2947 }

	2948 }

	2949

	2950 U_CAPI const UBreakIterator* U_EXPORT2

	2951 usearch_getBreakIterator(const UStringSearch *strsrch)

	2952 {

	2953 if (strsrch) {

	2954 return strsrch->search->breakIter;

	2955 }

	2956 return NULL;

	2957 }

	2958

	2959 #endif

	2960

	2961 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,

	2962 const UChar *text,

	2963 int32_t textlength,

	2964 UErrorCode *status)

	2965 {

	2966 if (U_SUCCESS(*status)) {

	2967 if (strsrch == NULL \|\| text == NULL \|\| textlength < -1 \|\|

	2968 textlength == 0) {

	2969 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2970 }

	2971 else {

	2972 if (textlength == -1) {

	2973 textlength = u_strlen(text);

	2974 }

	2975 strsrch->search->text = text;

	2976 strsrch->search->textLength = textlength;

	2977 ucol_setText(strsrch->textIter, text, textlength, status);

	2978 strsrch->search->matchedIndex = USEARCH_DONE;

	2979 strsrch->search->matchedLength = 0;

	2980 strsrch->search->reset = TRUE;

	2981 #if !UCONFIG_NO_BREAK_ITERATION

	2982 if (strsrch->search->breakIter != NULL) {

	2983 ubrk_setText(strsrch->search->breakIter, text,

	2984 textlength, status);

	2985 }

	2986 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, s tatus);

	2987 #endif

	2988 }

	2989 }

	2990 }

	2991

	2992 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,

	2993 int32_t *length)

	2994 {

	2995 if (strsrch) {

	2996 *length = strsrch->search->textLength;

	2997 return strsrch->search->text;

	2998 }

	2999 return NULL;

	3000 }

	3001

	3002 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,

	3003 const UCollator *collator,

	3004 UErrorCode *status)

	3005 {

	3006 if (U_SUCCESS(*status)) {

	3007 if (collator == NULL) {

	3008 *status = U_ILLEGAL_ARGUMENT_ERROR;

	3009 return;

	3010 }

	3011

	3012 if (strsrch) {

	3013 if (strsrch->ownCollator && (strsrch->collator != collator)) {

	3014 ucol_close((UCollator *)strsrch->collator);

	3015 strsrch->ownCollator = FALSE;

	3016 }

	3017 strsrch->collator = collator;

	3018 strsrch->strength = ucol_getStrength(collator);

	3019 strsrch->ceMask = getMask(strsrch->strength);

	3020 #if !UCONFIG_NO_BREAK_ITERATION

	3021 ubrk_close(strsrch->search->internalBreakIter);

	3022 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, u col_getLocaleByType(collator, ULOC_VALID_LOCALE, status),

	3023 strsrch->search->text, strsrch->search->textLength, status);

	3024 #endif

	3025 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT

	3026 strsrch->toShift =

	3027 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==

	3028 UCOL_SHIFTED;

	3029 // if status is a failure, ucol_getVariableTop returns 0

	3030 strsrch->variableTop = ucol_getVariableTop(collator, status);

	3031 if (U_SUCCESS(*status)) {

	3032 initialize(strsrch, status);

	3033 if (U_SUCCESS(*status)) {

	3034 /* free offset buffer to avoid memory leak before initializi ng. */

	3035 ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));

	3036 uprv_init_collIterate(collator, strsrch->search->text,

	3037 strsrch->search->textLength,

	3038 &(strsrch->textIter->iteratordata_),

	3039 status);

	3040 strsrch->utilIter->iteratordata_.coll = collator;

	3041 }

	3042 }

	3043 }

	3044

	3045 // **** are these calls needed?

	3046 // **** we call uprv_init_pce in initializePatternPCETable

	3047 // **** and the CEBuffer constructor...

	3048 #if 0

	3049 uprv_init_pce(strsrch->textIter);

	3050 uprv_init_pce(strsrch->utilIter);

	3051 #endif

	3052 }

	3053 }

	3054

	3055 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)

	3056 {

	3057 if (strsrch) {

	3058 return (UCollator *)strsrch->collator;

	3059 }

	3060 return NULL;

	3061 }

	3062

	3063 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,

	3064 const UChar *pattern,

	3065 int32_t patternlength,

	3066 UErrorCode *status)

	3067 {

	3068 if (U_SUCCESS(*status)) {

	3069 if (strsrch == NULL \|\| pattern == NULL) {

	3070 *status = U_ILLEGAL_ARGUMENT_ERROR;

	3071 }

	3072 else {

	3073 if (patternlength == -1) {

	3074 patternlength = u_strlen(pattern);

	3075 }

	3076 if (patternlength == 0) {

	3077 *status = U_ILLEGAL_ARGUMENT_ERROR;

	3078 return;

	3079 }

	3080 strsrch->pattern.text = pattern;

	3081 strsrch->pattern.textLength = patternlength;

	3082 initialize(strsrch, status);

	3083 }

	3084 }

	3085 }

	3086

	3087 U_CAPI const UChar* U_EXPORT2

	3088 usearch_getPattern(const UStringSearch *strsrch,

	3089 int32_t *length)

	3090 {

	3091 if (strsrch) {

	3092 *length = strsrch->pattern.textLength;

	3093 return strsrch->pattern.text;

	3094 }

	3095 return NULL;

	3096 }

	3097

	3098 // miscellanous methods --------------------------------------------------

	3099

	3100 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,

	3101 UErrorCode *status)

	3102 {

	3103 if (strsrch && U_SUCCESS(*status)) {

	3104 strsrch->search->isForwardSearching = TRUE;

	3105 usearch_setOffset(strsrch, 0, status);

	3106 if (U_SUCCESS(*status)) {

	3107 return usearch_next(strsrch, status);

	3108 }

	3109 }

	3110 return USEARCH_DONE;

	3111 }

	3112

	3113 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,

	3114 int32_t position,

	3115 UErrorCode *status)

	3116 {

	3117 if (strsrch && U_SUCCESS(*status)) {

	3118 strsrch->search->isForwardSearching = TRUE;

	3119 // position checked in usearch_setOffset

	3120 usearch_setOffset(strsrch, position, status);

	3121 if (U_SUCCESS(*status)) {

	3122 return usearch_next(strsrch, status);

	3123 }

	3124 }

	3125 return USEARCH_DONE;

	3126 }

	3127

	3128 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,

	3129 UErrorCode *status)

	3130 {

	3131 if (strsrch && U_SUCCESS(*status)) {

	3132 strsrch->search->isForwardSearching = FALSE;

	3133 usearch_setOffset(strsrch, strsrch->search->textLength, status);

	3134 if (U_SUCCESS(*status)) {

	3135 return usearch_previous(strsrch, status);

	3136 }

	3137 }

	3138 return USEARCH_DONE;

	3139 }

	3140

	3141 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,

	3142 int32_t position,

	3143 UErrorCode *status)

	3144 {

	3145 if (strsrch && U_SUCCESS(*status)) {

	3146 strsrch->search->isForwardSearching = FALSE;

	3147 // position checked in usearch_setOffset

	3148 usearch_setOffset(strsrch, position, status);

	3149 if (U_SUCCESS(*status)) {

	3150 return usearch_previous(strsrch, status);

	3151 }

	3152 }

	3153 return USEARCH_DONE;

	3154 }

	3155

	3156 /**

	3157 * If a direction switch is required, we'll count the number of ces till the

	3158 * beginning of the collation element iterator and iterate forwards that

	3159 * number of times. This is so that we get to the correct point within the

	3160 * string to continue the search in. Imagine when we are in the middle of the

	3161 * normalization buffer when the change in direction is request. arrrgghh....

	3162 * After searching the offset within the collation element iterator will be

	3163 * shifted to the start of the match. If a match is not found, the offset would

	3164 * have been set to the end of the text string in the collation element

	3165 * iterator.

	3166 * Okay, here's my take on normalization buffer. The only time when there can

	3167 * be 2 matches within the same normalization is when the pattern is consists

	3168 * of all accents. But since the offset returned is from the text string, we

	3169 * should not confuse the caller by returning the second match within the

	3170 * same normalization buffer. If we do, the 2 results will have the same match

	3171 * offsets, and that'll be confusing. I'll return the next match that doesn't

	3172 * fall within the same normalization buffer. Note this does not affect the

	3173 * results of matches spanning the text and the normalization buffer.

	3174 * The position to start searching is taken from the collation element

	3175 * iterator. Callers of this API would have to set the offset in the collation

	3176 * element iterator before using this method.

	3177 */

	3178 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,

	3179 UErrorCode *status)

	3180 {

	3181 if (U_SUCCESS(*status) && strsrch) {

	3182 // note offset is either equivalent to the start of the previous match

	3183 // or is set by the user

	3184 int32_t offset = usearch_getOffset(strsrch);

	3185 USearch *search = strsrch->search;

	3186 search->reset = FALSE;

	3187 int32_t textlength = search->textLength;

	3188 if (search->isForwardSearching) {

	3189 #if BOYER_MOORE

	3190 if (offset == textlength

	3191 \|\| (!search->isOverlap &&

	3192 (offset + strsrch->pattern.defaultShiftSize > textlength \|\|

	3193 (search->matchedIndex != USEARCH_DONE &&

	3194 offset + search->matchedLength >= textlength)))) {

	3195 // not enough characters to match

	3196 setMatchNotFound(strsrch);

	3197 return USEARCH_DONE;

	3198 }

	3199 #else

	3200 if (offset == textlength \|\|

	3201 (! search->isOverlap &&

	3202 (search->matchedIndex != USEARCH_DONE &&

	3203 offset + search->matchedLength > textlength))) {

	3204 // not enough characters to match

	3205 setMatchNotFound(strsrch);

	3206 return USEARCH_DONE;

	3207 }

	3208 #endif

	3209 }

	3210 else {

	3211 // switching direction.

	3212 // if matchedIndex == USEARCH_DONE, it means that either a

	3213 // setOffset has been called or that previous ran off the text

	3214 // string. the iterator would have been set to offset 0 if a

	3215 // match is not found.

	3216 search->isForwardSearching = TRUE;

	3217 if (search->matchedIndex != USEARCH_DONE) {

	3218 // there's no need to set the collation element iterator

	3219 // the next call to next will set the offset.

	3220 return search->matchedIndex;

	3221 }

	3222 }

	3223

	3224 if (U_SUCCESS(*status)) {

	3225 if (strsrch->pattern.CELength == 0) {

	3226 if (search->matchedIndex == USEARCH_DONE) {

	3227 search->matchedIndex = offset;

	3228 }

	3229 else { // moves by codepoints

	3230 UTF_FWD_1(search->text, search->matchedIndex, textlength);

	3231 }

	3232

	3233 search->matchedLength = 0;

	3234 setColEIterOffset(strsrch->textIter, search->matchedIndex);

	3235 // status checked below

	3236 if (search->matchedIndex == textlength) {

	3237 search->matchedIndex = USEARCH_DONE;

	3238 }

	3239 }

	3240 else {

	3241 if (search->matchedLength > 0) {

	3242 // if matchlength is 0 we are at the start of the iteration

	3243 if (search->isOverlap) {

	3244 ucol_setOffset(strsrch->textIter, offset + 1, status);

	3245 }

	3246 else {

	3247 ucol_setOffset(strsrch->textIter,

	3248 offset + search->matchedLength, status);

	3249 }

	3250 }

	3251 else {

	3252 // for boundary check purposes. this will ensure that the

	3253 // next match will not preceed the current offset

	3254 // note search->matchedIndex will always be set to something

	3255 // in the code

	3256 search->matchedIndex = offset - 1;

	3257 }

	3258

	3259 if (search->isCanonicalMatch) {

	3260 // can't use exact here since extra accents are allowed.

	3261 usearch_handleNextCanonical(strsrch, status);

	3262 }

	3263 else {

	3264 usearch_handleNextExact(strsrch, status);

	3265 }

	3266 }

	3267

	3268 if (U_FAILURE(*status)) {

	3269 return USEARCH_DONE;

	3270 }

	3271

	3272 #if !BOYER_MOORE

	3273 if (search->matchedIndex == USEARCH_DONE) {

	3274 ucol_setOffset(strsrch->textIter, search->textLength, status);

	3275 } else {

	3276 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);

	3277 }

	3278 #endif

	3279

	3280 return search->matchedIndex;

	3281 }

	3282 }

	3283 return USEARCH_DONE;

	3284 }

	3285

	3286 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,

	3287 UErrorCode *status)

	3288 {

	3289 if (U_SUCCESS(*status) && strsrch) {

	3290 int32_t offset;

	3291 USearch *search = strsrch->search;

	3292 if (search->reset) {

	3293 offset = search->textLength;

	3294 search->isForwardSearching = FALSE;

	3295 search->reset = FALSE;

	3296 setColEIterOffset(strsrch->textIter, offset);

	3297 }

	3298 else {

	3299 offset = usearch_getOffset(strsrch);

	3300 }

	3301

	3302 int32_t matchedindex = search->matchedIndex;

	3303 if (search->isForwardSearching == TRUE) {

	3304 // switching direction.

	3305 // if matchedIndex == USEARCH_DONE, it means that either a

	3306 // setOffset has been called or that next ran off the text

	3307 // string. the iterator would have been set to offset textLength if

	3308 // a match is not found.

	3309 search->isForwardSearching = FALSE;

	3310 if (matchedindex != USEARCH_DONE) {

	3311 return matchedindex;

	3312 }

	3313 }

	3314 else {

	3315 #if BOYER_MOORE

	3316 if (offset == 0 \|\| matchedindex == 0 \|\|

	3317 (!search->isOverlap &&

	3318 (offset < strsrch->pattern.defaultShiftSize \|\|

	3319 (matchedindex != USEARCH_DONE &&

	3320 matchedindex < strsrch->pattern.defaultShiftSize)))) {

	3321 // not enough characters to match

	3322 setMatchNotFound(strsrch);

	3323 return USEARCH_DONE;

	3324 }

	3325 #else

	3326 // Could check pattern length, but the

	3327 // linear search will do the right thing

	3328 if (offset == 0 \|\| matchedindex == 0) {

	3329 setMatchNotFound(strsrch);

	3330 return USEARCH_DONE;

	3331 }

	3332 #endif

	3333 }

	3334

	3335 if (U_SUCCESS(*status)) {

	3336 if (strsrch->pattern.CELength == 0) {

	3337 search->matchedIndex =

	3338 (matchedindex == USEARCH_DONE ? offset : matchedindex);

	3339 if (search->matchedIndex == 0) {

	3340 setMatchNotFound(strsrch);

	3341 // status checked below

	3342 }

	3343 else { // move by codepoints

	3344 UTF_BACK_1(search->text, 0, search->matchedIndex);

	3345 setColEIterOffset(strsrch->textIter, search->matchedIndex);

	3346 // status checked below

	3347 search->matchedLength = 0;

	3348 }

	3349 }

	3350 else {

	3351 #if !BOYER_MOORE

	3352 if (search->matchedIndex != USEARCH_DONE) {

	3353 if (search->isOverlap) {

	3354 ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status);

	3355 }

	3356 }

	3357 #endif

	3358

	3359 if (strsrch->search->isCanonicalMatch) {

	3360 // can't use exact here since extra accents are allowed.

	3361 usearch_handlePreviousCanonical(strsrch, status);

	3362 // status checked below

	3363 }

	3364 else {

	3365 usearch_handlePreviousExact(strsrch, status);

	3366 // status checked below

	3367 }

	3368 }

	3369

	3370 if (U_FAILURE(*status)) {

	3371 return USEARCH_DONE;

	3372 }

	3373

	3374 return search->matchedIndex;

	3375 }

	3376 }

	3377 return USEARCH_DONE;

	3378 }

	3379

	3380

	3381

	3382 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)

	3383 {

	3384 /*

	3385 reset is setting the attributes that are already in

	3386 string search, hence all attributes in the collator should

	3387 be retrieved without any problems

	3388 */

	3389 if (strsrch) {

	3390 UErrorCode status = U_ZERO_ERROR;

	3391 UBool sameCollAttribute = TRUE;

	3392 uint32_t ceMask;

	3393 UBool shift;

	3394 uint32_t varTop;

	3395

	3396 // ** hack to deal w/ how processed CEs encode quaternary **

	3397 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);

	3398 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNA RY) \|\|

	3399 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNA RY)) {

	3400 sameCollAttribute = FALSE;

	3401 }

	3402

	3403 strsrch->strength = ucol_getStrength(strsrch->collator);

	3404 ceMask = getMask(strsrch->strength);

	3405 if (strsrch->ceMask != ceMask) {

	3406 strsrch->ceMask = ceMask;

	3407 sameCollAttribute = FALSE;

	3408 }

	3409

	3410 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT

	3411 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,

	3412 &status) == UCOL_SHIFTED;

	3413 if (strsrch->toShift != shift) {

	3414 strsrch->toShift = shift;

	3415 sameCollAttribute = FALSE;

	3416 }

	3417

	3418 // if status is a failure, ucol_getVariableTop returns 0

	3419 varTop = ucol_getVariableTop(strsrch->collator, &status);

	3420 if (strsrch->variableTop != varTop) {

	3421 strsrch->variableTop = varTop;

	3422 sameCollAttribute = FALSE;

	3423 }

	3424 if (!sameCollAttribute) {

	3425 initialize(strsrch, &status);

	3426 }

	3427 /* free offset buffer to avoid memory leak before initializing. */

	3428 ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));

	3429 uprv_init_collIterate(strsrch->collator, strsrch->search->text,

	3430 strsrch->search->textLength,

	3431 &(strsrch->textIter->iteratordata_),

	3432 &status);

	3433 strsrch->search->matchedLength = 0;

	3434 strsrch->search->matchedIndex = USEARCH_DONE;

	3435 strsrch->search->isOverlap = FALSE;

	3436 strsrch->search->isCanonicalMatch = FALSE;

	3437 strsrch->search->elementComparisonType = 0;

	3438 strsrch->search->isForwardSearching = TRUE;

	3439 strsrch->search->reset = TRUE;

	3440 }

	3441 }

	3442

	3443 //

	3444 // CEI Collation Element + source text index.

	3445 // These structs are kept in the circular buffer.

	3446 //

	3447 struct CEI {

	3448 int64_t ce;

	3449 int32_t lowIndex;

	3450 int32_t highIndex;

	3451 };

	3452

	3453 U_NAMESPACE_BEGIN

	3454

	3455

	3456 //

	3457 // CEBuffer A circular buffer of CEs from the text being searched.

	3458 //

	3459 #define DEFAULT_CEBUFFER_SIZE 50

	3460 struct CEBuffer {

	3461 CEI defBuf[DEFAULT_CEBUFFER_SIZE];

	3462 CEI *buf;

	3463 int32_t bufSize;

	3464 int32_t firstIx;

	3465 int32_t limitIx;

	3466 UCollationElements *ceIter;

	3467 UStringSearch *strSearch;

	3468

	3469

	3470

	3471 CEBuffer(UStringSearch ss, UErrorCode status);

	3472 ~CEBuffer();

	3473 const CEI *get(int32_t index);

	3474 const CEI *getPrevious(int32_t index);

	3475 };

	3476

	3477

	3478 CEBuffer::CEBuffer(UStringSearch ss, UErrorCode status) {

	3479 buf = defBuf;

	3480 strSearch = ss;

	3481 bufSize = ss->pattern.CELength+10;

	3482 ceIter = ss->textIter;

	3483 firstIx = 0;

	3484 limitIx = 0;

	3485

	3486 uprv_init_pce(ceIter);

	3487

	3488 if (bufSize>DEFAULT_CEBUFFER_SIZE) {

	3489 buf = (CEI )uprv_malloc(bufSize sizeof(CEI));

	3490 if (buf == NULL) {

	3491 *status = U_MEMORY_ALLOCATION_ERROR;

	3492 }

	3493 }

	3494 }

	3495

	3496 // TODO: add a reset or init function so that allocated

	3497 // buffers can be retained & reused.

	3498

	3499 CEBuffer::~CEBuffer() {

	3500 if (buf != defBuf) {

	3501 uprv_free(buf);

	3502 }

	3503 }

	3504

	3505

	3506 // Get the CE with the specified index.

	3507 // Index must be in the range

	3508 // n-history_size < index < n+1

	3509 // where n is the largest index to have been fetched by some previous call to this function.

	3510 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.

	3511 //

	3512 const CEI *CEBuffer::get(int32_t index) {

	3513 int i = index % bufSize;

	3514

	3515 if (index>=firstIx && index<limitIx) {

	3516 // The request was for an entry already in our buffer.

	3517 // Just return it.

	3518 return &buf[i];

	3519 }

	3520

	3521 // Caller is requesting a new, never accessed before, CE.

	3522 // Verify that it is the next one in sequence, which is all

	3523 // that is allowed.

	3524 if (index != limitIx) {

	3525 U_ASSERT(FALSE);

	3526

	3527 return NULL;

	3528 }

	3529

	3530 // Manage the circular CE buffer indexing

	3531 limitIx++;

	3532

	3533 if (limitIx - firstIx >= bufSize) {

	3534 // The buffer is full, knock out the lowest-indexed entry.

	3535 firstIx++;

	3536 }

	3537

	3538 UErrorCode status = U_ZERO_ERROR;

	3539

	3540 buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);

	3541

	3542 return &buf[i];

	3543 }

	3544

	3545 // Get the CE with the specified index.

	3546 // Index must be in the range

	3547 // n-history_size < index < n+1

	3548 // where n is the largest index to have been fetched by some previous call to this function.

	3549 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.

	3550 //

	3551 const CEI *CEBuffer::getPrevious(int32_t index) {

	3552 int i = index % bufSize;

	3553

	3554 if (index>=firstIx && index<limitIx) {

	3555 // The request was for an entry already in our buffer.

	3556 // Just return it.

	3557 return &buf[i];

	3558 }

	3559

	3560 // Caller is requesting a new, never accessed before, CE.

	3561 // Verify that it is the next one in sequence, which is all

	3562 // that is allowed.

	3563 if (index != limitIx) {

	3564 U_ASSERT(FALSE);

	3565

	3566 return NULL;

	3567 }

	3568

	3569 // Manage the circular CE buffer indexing

	3570 limitIx++;

	3571

	3572 if (limitIx - firstIx >= bufSize) {

	3573 // The buffer is full, knock out the lowest-indexed entry.

	3574 firstIx++;

	3575 }

	3576

	3577 UErrorCode status = U_ZERO_ERROR;

	3578

	3579 buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highInd ex, &status);

	3580

	3581 return &buf[i];

	3582 }

	3583

	3584 U_NAMESPACE_END

	3585

	3586

	3587 // #define USEARCH_DEBUG

	3588

	3589 #ifdef USEARCH_DEBUG

	3590 #include <stdio.h>

	3591 #include <stdlib.h>

	3592 #endif

	3593

	3594 /*

	3595 * Find the next break boundary after startIndex. If the UStringSearch object

	3596 * has an external break iterator, use that. Otherwise use the internal characte r

	3597 * break iterator.

	3598 */

	3599 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {

	3600 #if 0

	3601 const UChar *text = strsrch->search->text;

	3602 int32_t textLen = strsrch->search->textLength;

	3603

	3604 U_ASSERT(startIndex>=0);

	3605 U_ASSERT(startIndex<=textLen);

	3606

	3607 if (startIndex >= textLen) {

	3608 return startIndex;

	3609 }

	3610

	3611 UChar32 c;

	3612 int32_t i = startIndex;

	3613 U16_NEXT(text, i, textLen, c);

	3614

	3615 // If we are on a control character, stop without looking for combining mark s.

	3616 // Control characters do not combine.

	3617 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

	3618 if (gcProperty==U_GCB_CONTROL \|\| gcProperty==U_GCB_LF \|\| gcProperty==U_GCB_C R) {

	3619 return i;

	3620 }

	3621

	3622 // The initial character was not a control, and can thus accept trailing

	3623 // combining characters. Advance over however many of them there are.

	3624 int32_t indexOfLastCharChecked;

	3625 for (;;) {

	3626 indexOfLastCharChecked = i;

	3627 if (i>=textLen) {

	3628 break;

	3629 }

	3630 U16_NEXT(text, i, textLen, c);

	3631 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

	3632 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {

	3633 break;

	3634 }

	3635 }

	3636 return indexOfLastCharChecked;

	3637 #elif !UCONFIG_NO_BREAK_ITERATION

	3638 UBreakIterator *breakiterator = strsrch->search->breakIter;

	3639

	3640 if (breakiterator == NULL) {

	3641 breakiterator = strsrch->search->internalBreakIter;

	3642 }

	3643

	3644 if (breakiterator != NULL) {

	3645 return ubrk_following(breakiterator, startIndex);

	3646 }

	3647

	3648 return startIndex;

	3649 #else

	3650 // ** or should we use the original code? **

	3651 return startIndex;

	3652 #endif

	3653

	3654 }

	3655

	3656 /*

	3657 * Returns TRUE if index is on a break boundary. If the UStringSearch

	3658 * has an external break iterator, test using that, otherwise test

	3659 * using the internal character break iterator.

	3660 */

	3661 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {

	3662 #if 0

	3663 const UChar *text = strsrch->search->text;

	3664 int32_t textLen = strsrch->search->textLength;

	3665

	3666 U_ASSERT(index>=0);

	3667 U_ASSERT(index<=textLen);

	3668

	3669 if (index>=textLen \|\| index<=0) {

	3670 return FALSE;

	3671 }

	3672

	3673 // If the character at the current index is not a GRAPHEME_EXTEND

	3674 // then we can not be within a combining sequence.

	3675 UChar32 c;

	3676 U16_GET(text, 0, index, textLen, c);

	3677 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

	3678 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {

	3679 return FALSE;

	3680 }

	3681

	3682 // We are at a combining mark. If the preceding character is anything

	3683 // except a CONTROL, CR or LF, we are in a combining sequence.

	3684 U16_PREV(text, 0, index, c);

	3685 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

	3686 UBool combining = !(gcProperty==U_GCB_CONTROL \|\| gcProperty==U_GCB_LF \|\| gc Property==U_GCB_CR);

	3687 return combining;

	3688 #elif !UCONFIG_NO_BREAK_ITERATION

	3689 UBreakIterator *breakiterator = strsrch->search->breakIter;

	3690

	3691 if (breakiterator == NULL) {

	3692 breakiterator = strsrch->search->internalBreakIter;

	3693 }

	3694

	3695 return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));

	3696 #else

	3697 // ** or use the original code? **

	3698 return FALSE;

	3699 #endif

	3700 }

	3701

	3702 #if 0

	3703 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int3 2_t end)

	3704 {

	3705 #if !UCONFIG_NO_BREAK_ITERATION

	3706 UBreakIterator *breakiterator = strsrch->search->breakIter;

	3707

	3708 if (breakiterator != NULL) {

	3709 int32_t startindex = ubrk_first(breakiterator);

	3710 int32_t endindex = ubrk_last(breakiterator);

	3711

	3712 // out-of-range indexes are never boundary positions

	3713 if (start < startindex \|\| start > endindex \|\|

	3714 end < startindex \|\| end > endindex) {

	3715 return FALSE;

	3716 }

	3717

	3718 return ubrk_isBoundary(breakiterator, start) &&

	3719 ubrk_isBoundary(breakiterator, end);

	3720 }

	3721 #endif

	3722

	3723 return TRUE;

	3724 }

	3725 #endif

	3726

	3727 typedef enum {

	3728 U_CE_MATCH = -1,

	3729 U_CE_NO_MATCH = 0,

	3730 U_CE_SKIP_TARG,

	3731 U_CE_SKIP_PATN

	3732 } UCompareCEsResult;

	3733 #define U_CE_LEVEL2_BASE 0x00000005

	3734 #define U_CE_LEVEL3_BASE 0x00050000

	3735

	3736 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t com pareType) {

	3737 if (targCE == patCE) {

	3738 return U_CE_MATCH;

	3739 }

	3740 if (compareType == 0) {

	3741 return U_CE_NO_MATCH;

	3742 }

	3743

	3744 int64_t targCEshifted = targCE >> 32;

	3745 int64_t patCEshifted = patCE >> 32;

	3746 int64_t mask;

	3747

	3748 mask = 0xFFFF0000;

	3749 int32_t targLev1 = (int32_t)(targCEshifted & mask);

	3750 int32_t patLev1 = (int32_t)(patCEshifted & mask);

	3751 if ( targLev1 != patLev1 ) {

	3752 if ( targLev1 == 0 ) {

	3753 return U_CE_SKIP_TARG;

	3754 }

	3755 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {

	3756 return U_CE_SKIP_PATN;

	3757 }

	3758 return U_CE_NO_MATCH;

	3759 }

	3760

	3761 mask = 0x0000FFFF;

	3762 int32_t targLev2 = (int32_t)(targCEshifted & mask);

	3763 int32_t patLev2 = (int32_t)(patCEshifted & mask);

	3764 if ( targLev2 != patLev2 ) {

	3765 if ( targLev2 == 0 ) {

	3766 return U_CE_SKIP_TARG;

	3767 }

	3768 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {

	3769 return U_CE_SKIP_PATN;

	3770 }

	3771 return (patLev2 == U_CE_LEVEL2_BASE \|\| (compareType == USEARCH_ANY_BASE_ WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?

	3772 U_CE_MATCH: U_CE_NO_MATCH;

	3773 }

	3774

	3775 mask = 0xFFFF0000;

	3776 int32_t targLev3 = (int32_t)(targCE & mask);

	3777 int32_t patLev3 = (int32_t)(patCE & mask);

	3778 if ( targLev3 != patLev3 ) {

	3779 return (patLev3 == U_CE_LEVEL3_BASE \|\| (compareType == USEARCH_ANY_BASE_ WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?

	3780 U_CE_MATCH: U_CE_NO_MATCH;

	3781 }

	3782

	3783 return U_CE_MATCH;

	3784 }

	3785

	3786 #if BOYER_MOORE

	3787 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s

	3788 #endif

	3789

	3790 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,

	3791 int32_t startIdx,

	3792 int32_t *matchStart,

	3793 int32_t *matchLimit,

	3794 UErrorCode *status)

	3795 {

	3796 if (U_FAILURE(*status)) {

	3797 return FALSE;

	3798 }

	3799

	3800 // TODO: reject search patterns beginning with a combining char.

	3801

	3802 #ifdef USEARCH_DEBUG

	3803 if (getenv("USEARCH_DEBUG") != NULL) {

	3804 printf("Pattern CEs\n");

	3805 for (int ii=0; ii<strsrch->pattern.CELength; ii++) {

	3806 printf(" %8x", strsrch->pattern.CE[ii]);

	3807 }

	3808 printf("\n");

	3809 }

	3810

	3811 #endif

	3812 // Input parameter sanity check.

	3813 // TODO: should input indicies clip to the text length

	3814 // in the same way that UText does.

	3815 if(strsrch->pattern.CELength == 0 \|\|

	3816 startIdx < 0 \|\|

	3817 startIdx > strsrch->search->textLength \|\|

	3818 strsrch->pattern.CE == NULL) {

	3819 *status = U_ILLEGAL_ARGUMENT_ERROR;

	3820 return FALSE;

	3821 }

	3822

	3823 if (strsrch->pattern.PCE == NULL) {

	3824 initializePatternPCETable(strsrch, status);

	3825 }

	3826

	3827 ucol_setOffset(strsrch->textIter, startIdx, status);

	3828 CEBuffer ceb(strsrch, status);

	3829

	3830

	3831 int32_t targetIx = 0;

	3832 const CEI *targetCEI = NULL;

	3833 int32_t patIx;

	3834 UBool found;

	3835

	3836 int32_t mStart = -1;

	3837 int32_t mLimit = -1;

	3838 int32_t minLimit;

	3839 int32_t maxLimit;

	3840

	3841

	3842

	3843 // Outer loop moves over match starting positions in the

	3844 // target CE space.

	3845 // Here we see the target as a sequence of collation elements, resulting fro m the following:

	3846 // 1. Target characters were decomposed, and (if appropriate) other compress ions and expansions are applied

	3847 // (for example, digraphs such as IJ may be broken into two characters).

	3848 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bi ts are primary strength, next

	3849 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit hal f) are tertiary. Any of these

	3850 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t

	3851 // CE weight 0 (as for a combining diacritic with secondary weight when t he collator strentgh is primary),

	3852 // then the CE is deleted, so the following code sees only CEs that are r elevant.

	3853 // For each CE, the lowIndex and highIndex correspond to where this CE begin s and ends in the original text.

	3854 // If lowIndex==highIndex, either the CE resulted from an expansion/decompos ition of one of the original text

	3855 // characters, or the CE marks the limit of the target text (in which case t he CE weight is UCOL_PROCESSED_NULLORDER).

	3856 //

	3857 for(targetIx=0; ; targetIx++)

	3858 {

	3859 found = TRUE;

	3860 // Inner loop checks for a match beginning at each

	3861 // position from the outer loop.

	3862 int32_t targetIxOffset = 0;

	3863 int64_t patCE = 0;

	3864 for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {

	3865 patCE = strsrch->pattern.PCE[patIx];

	3866 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);

	3867 // Compare CE from target string with CE from the pattern.

	3868 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,

	3869 // which will fail the compare, below.

	3870 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsr ch->search->elementComparisonType);

	3871 if ( ceMatch == U_CE_NO_MATCH ) {

	3872 found = FALSE;

	3873 break;

	3874 } else if ( ceMatch > U_CE_NO_MATCH ) {

	3875 if ( ceMatch == U_CE_SKIP_TARG ) {

	3876 // redo with same patCE, next targCE

	3877 patIx--;

	3878 targetIxOffset++;

	3879 } else { // ceMatch == U_CE_SKIP_PATN

	3880 // redo with same targCE, next patCE

	3881 targetIxOffset--;

	3882 }

	3883 }

	3884 }

	3885 targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far

	3886

	3887 if (!found && ((targetCEI == NULL) \|\| (targetCEI->ce != UCOL_PROCESSED_N ULLORDER))) {

	3888 // No match at this targetIx. Try again at the next.

	3889 continue;

	3890 }

	3891

	3892 if (!found) {

	3893 // No match at all, we have run off the end of the target text.

	3894 break;

	3895 }

	3896

	3897

	3898 // We have found a match in CE space.

	3899 // Now determine the bounds in string index space.

	3900 // There still is a chance of match failure if the CE range not corresp ond to

	3901 // an acceptable character range.

	3902 //

	3903 const CEI *firstCEI = ceb.get(targetIx);

	3904 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);

	3905

	3906 mStart = firstCEI->lowIndex;

	3907 minLimit = lastCEI->lowIndex;

	3908

	3909 // Look at the CE following the match. If it is UCOL_NULLORDER the matc h

	3910 // extended to the end of input, and the match is good.

	3911

	3912 // Look at the high and low indices of the CE following the match. If

	3913 // they are the same it means one of two things:

	3914 // 1. The match extended to the last CE from the target text, which i s OK, or

	3915 // 2. The last CE that was part of the match is in an expansion that extends

	3916 // to the first CE after the match. In this case, we reject the ma tch.

	3917 if (strsrch->search->elementComparisonType == 0) {

	3918 const CEI *nextCEI = ceb.get(targetIx + targetIxOffset);

	3919 maxLimit = nextCEI->lowIndex;

	3920 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_P ROCESSED_NULLORDER) {

	3921 found = FALSE;

	3922 }

	3923 } else {

	3924 const CEI *nextCEI;

	3925 for ( ; ; ++targetIxOffset ) {

	3926 nextCEI = ceb.get(targetIx + targetIxOffset);

	3927 maxLimit = nextCEI->lowIndex;

	3928 // If we are at the end of the target too, match succeeds

	3929 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {

	3930 break;

	3931 }

	3932 // As long as the next CE has primary weight of 0,

	3933 // it is part of the last target element matched by the pattern;

	3934 // make sure it can be part of a match with the last patCE

	3935 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {

	3936 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, pa tCE, strsrch->search->elementComparisonType);

	3937 if ( ceMatch == U_CE_NO_MATCH \|\| ceMatch == U_CE_SKIP_PA TN ) {

	3938 found = FALSE;

	3939 break;

	3940 }

	3941 // If lowIndex == highIndex, this target CE is part of an expans ion of the last matched

	3942 // target element, but it has non-zero primary weight => match f ails

	3943 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {

	3944 found = false;

	3945 break;

	3946 // Else the target CE is not part of an expansion of the last ma tched element, match succeeds

	3947 } else {

	3948 break;

	3949 }

	3950 }

	3951 }

	3952

	3953

	3954 // Check for the start of the match being within a combining sequence.

	3955 // This can happen if the pattern itself begins with a combining char, and

	3956 // the match found combining marks in the target text that were attach ed

	3957 // to something else.

	3958 // This type of match should be rejected for not completely consuming a

	3959 // combining sequence.

	3960 if (isBreakBoundary(strsrch, mStart)) {

	3961 found = FALSE;

	3962 }

	3963

	3964 // Check for the start of the match being within an Collation Element Ex pansion,

	3965 // meaning that the first char of the match is only partially matched.

	3966 // With exapnsions, the first CE will report the index of the source

	3967 // character, and all subsequent (expansions) CEs will report the sour ce index of the

	3968 // _following_ character.

	3969 int32_t secondIx = firstCEI->highIndex;

	3970 if (mStart == secondIx) {

	3971 found = FALSE;

	3972 }

	3973

	3974 // Advance the match end position to the first acceptable match boundar y.

	3975 // This advances the index over any combining charcters.

	3976 mLimit = maxLimit;

	3977 if (minLimit < maxLimit) {

	3978 int32_t nba = nextBoundaryAfter(strsrch, minLimit);

	3979

	3980 if (nba >= lastCEI->highIndex) {

	3981 mLimit = nba;

	3982 }

	3983 }

	3984

	3985 #ifdef USEARCH_DEBUG

	3986 if (getenv("USEARCH_DEBUG") != NULL) {

	3987 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLim it, mLimit);

	3988 }

	3989 #endif

	3990

	3991 // If advancing to the end of a combining sequence in character indexing space

	3992 // advanced us beyond the end of the match in CE space, reject this ma tch.

	3993 if (mLimit > maxLimit) {

	3994 found = FALSE;

	3995 }

	3996

	3997 if (isBreakBoundary(strsrch, mLimit)) {

	3998 found = FALSE;

	3999 }

	4000

	4001 if (! checkIdentical(strsrch, mStart, mLimit)) {

	4002 found = FALSE;

	4003 }

	4004

	4005 if (found) {

	4006 break;

	4007 }

	4008 }

	4009

	4010 #ifdef USEARCH_DEBUG

	4011 if (getenv("USEARCH_DEBUG") != NULL) {

	4012 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);

	4013 int32_t lastToPrint = ceb.limitIx+2;

	4014 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {

	4015 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);

	4016 }

	4017 printf("\n%s\n", found? "match found" : "no match");

	4018 }

	4019 #endif

	4020

	4021 // All Done. Store back the match bounds to the caller.

	4022 //

	4023 if (found==FALSE) {

	4024 mLimit = -1;

	4025 mStart = -1;

	4026 }

	4027

	4028 if (matchStart != NULL) {

	4029 *matchStart= mStart;

	4030 }

	4031

	4032 if (matchLimit != NULL) {

	4033 *matchLimit = mLimit;

	4034 }

	4035

	4036 return found;

	4037 }

	4038

	4039

	4040 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,

	4041 int32_t startIdx,

	4042 int32_t *matchStart,

	4043 int32_t *matchLimit,

	4044 UErrorCode *status)

	4045 {

	4046 if (U_FAILURE(*status)) {

	4047 return FALSE;

	4048 }

	4049

	4050 // TODO: reject search patterns beginning with a combining char.

	4051

	4052 #ifdef USEARCH_DEBUG

	4053 if (getenv("USEARCH_DEBUG") != NULL) {

	4054 printf("Pattern CEs\n");

	4055 for (int ii=0; ii<strsrch->pattern.CELength; ii++) {

	4056 printf(" %8x", strsrch->pattern.CE[ii]);

	4057 }

	4058 printf("\n");

	4059 }

	4060

	4061 #endif

	4062 // Input parameter sanity check.

	4063 // TODO: should input indicies clip to the text length

	4064 // in the same way that UText does.

	4065 if(strsrch->pattern.CELength == 0 \|\|

	4066 startIdx < 0 \|\|

	4067 startIdx > strsrch->search->textLength \|\|

	4068 strsrch->pattern.CE == NULL) {

	4069 *status = U_ILLEGAL_ARGUMENT_ERROR;

	4070 return FALSE;

	4071 }

	4072

	4073 if (strsrch->pattern.PCE == NULL) {

	4074 initializePatternPCETable(strsrch, status);

	4075 }

	4076

	4077 CEBuffer ceb(strsrch, status);

	4078 int32_t targetIx = 0;

	4079

	4080 /*

	4081 * Pre-load the buffer with the CE's for the grapheme

	4082 * after our starting position so that we're sure that

	4083 * we can look at the CE following the match when we

	4084 * check the match boundaries.

	4085 *

	4086 * This will also pre-fetch the first CE that we'll

	4087 * consider for the match.

	4088 */

	4089 if (startIdx < strsrch->search->textLength) {

	4090 UBreakIterator *bi = strsrch->search->internalBreakIter;

	4091 int32_t next = ubrk_following(bi, startIdx);

	4092

	4093 ucol_setOffset(strsrch->textIter, next, status);

	4094

	4095 for (targetIx = 0; ; targetIx += 1) {

	4096 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {

	4097 break;

	4098 }

	4099 }

	4100 } else {

	4101 ucol_setOffset(strsrch->textIter, startIdx, status);

	4102 }

	4103

	4104

	4105 const CEI *targetCEI = NULL;

	4106 int32_t patIx;

	4107 UBool found;

	4108

	4109 int32_t limitIx = targetIx;

	4110 int32_t mStart = -1;

	4111 int32_t mLimit = -1;

	4112 int32_t minLimit;

	4113 int32_t maxLimit;

	4114

	4115

	4116

	4117 // Outer loop moves over match starting positions in the

	4118 // target CE space.

	4119 // Here, targetIx values increase toward the beginning of the base text (i.e . we get the text CEs in reverse order).

	4120 // But patIx is 0 at the beginning of the pattern and increases toward the end.

	4121 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern

	4122 // and the beginning of the base text.

	4123 for(targetIx = limitIx; ; targetIx += 1)

	4124 {

	4125 found = TRUE;

	4126 // Inner loop checks for a match beginning at each

	4127 // position from the outer loop.

	4128 int32_t targetIxOffset = 0;

	4129 for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {

	4130 int64_t patCE = strsrch->pattern.PCE[patIx];

	4131

	4132 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);

	4133 // Compare CE from target string with CE from the pattern.

	4134 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,

	4135 // which will fail the compare, below.

	4136 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsr ch->search->elementComparisonType);

	4137 if ( ceMatch == U_CE_NO_MATCH ) {

	4138 found = FALSE;

	4139 break;

	4140 } else if ( ceMatch > U_CE_NO_MATCH ) {

	4141 if ( ceMatch == U_CE_SKIP_TARG ) {

	4142 // redo with same patCE, next targCE

	4143 patIx++;

	4144 targetIxOffset++;

	4145 } else { // ceMatch == U_CE_SKIP_PATN

	4146 // redo with same targCE, next patCE

	4147 targetIxOffset--;

	4148 }

	4149 }

	4150 }

	4151

	4152 if (!found && ((targetCEI == NULL) \|\| (targetCEI->ce != UCOL_PROCESSED_N ULLORDER))) {

	4153 // No match at this targetIx. Try again at the next.

	4154 continue;

	4155 }

	4156

	4157 if (!found) {

	4158 // No match at all, we have run off the end of the target text.

	4159 break;

	4160 }

	4161

	4162

	4163 // We have found a match in CE space.

	4164 // Now determine the bounds in string index space.

	4165 // There still is a chance of match failure if the CE range not corresp ond to

	4166 // an acceptable character range.

	4167 //

	4168 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELen gth - 1 + targetIxOffset);

	4169 const CEI *lastCEI = ceb.getPrevious(targetIx);

	4170 const CEI *nextCEI = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL ;

	4171

	4172 mStart = firstCEI->lowIndex;

	4173 minLimit = lastCEI->lowIndex;

	4174 maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex;

	4175

	4176 // Look at the CE following the match. If it is UCOL_NULLORDER the matc h

	4177 // extended to the end of input, and the match is good.

	4178

	4179 // Look at the high and low indices of the CE following the match. If

	4180 // they are the same it means one of two things:

	4181 // 1. The match extended to the last CE from the target text, which i s OK, or

	4182 // 2. The last CE that was part of the match is in an expansion that extends

	4183 // to the first CE after the match. In this case, we reject the ma tch.

	4184 if (targetIx >= 1) {

	4185 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_P ROCESSED_NULLORDER) {

	4186 found = FALSE;

	4187 }

	4188 }

	4189

	4190

	4191 // Check for the start of the match being within a combining sequence.

	4192 // This can happen if the pattern itself begins with a combining char, and

	4193 // the match found combining marks in the target text that were attach ed

	4194 // to something else.

	4195 // This type of match should be rejected for not completely consuming a

	4196 // combining sequence.

	4197 if (isBreakBoundary(strsrch, mStart)) {

	4198 found = FALSE;

	4199 }

	4200

	4201 // Look at the high index of the first CE in the match. If it's the same as the

	4202 // low index, the first CE in the match is in the middle of an expansion .

	4203 if (mStart == firstCEI->highIndex) {

	4204 found = FALSE;

	4205 }

	4206

	4207 // Advance the match end position to the first acceptable match boundar y.

	4208 // This advances the index over any combining charcters.

	4209 mLimit = maxLimit;

	4210 if (/targetIx > 0 &&/ minLimit < maxLimit) {

	4211 int32_t nba = nextBoundaryAfter(strsrch, minLimit);

	4212

	4213 if (nba >= lastCEI->highIndex) {

	4214 mLimit = nba;

	4215 }

	4216 }

	4217

	4218 #ifdef USEARCH_DEBUG

	4219 if (getenv("USEARCH_DEBUG") != NULL) {

	4220 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLim it, mLimit);

	4221 }

	4222 #endif

	4223

	4224 // If advancing to the end of a combining sequence in character indexing space

	4225 // advanced us beyond the end of the match in CE space, reject this ma tch.

	4226 if (mLimit > maxLimit) {

	4227 found = FALSE;

	4228 }

	4229

	4230 // Make sure the end of the match is on a break boundary

	4231 if (isBreakBoundary(strsrch, mLimit)) {

	4232 found = FALSE;

	4233 }

	4234

	4235 if (! checkIdentical(strsrch, mStart, mLimit)) {

	4236 found = FALSE;

	4237 }

	4238

	4239 if (found) {

	4240 break;

	4241 }

	4242 }

	4243

	4244 #ifdef USEARCH_DEBUG

	4245 if (getenv("USEARCH_DEBUG") != NULL) {

	4246 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);

	4247 int32_t lastToPrint = ceb.limitIx+2;

	4248 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {

	4249 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);

	4250 }

	4251 printf("\n%s\n", found? "match found" : "no match");

	4252 }

	4253 #endif

	4254

	4255 // All Done. Store back the match bounds to the caller.

	4256 //

	4257 if (found==FALSE) {

	4258 mLimit = -1;

	4259 mStart = -1;

	4260 }

	4261

	4262 if (matchStart != NULL) {

	4263 *matchStart= mStart;

	4264 }

	4265

	4266 if (matchLimit != NULL) {

	4267 *matchLimit = mLimit;

	4268 }

	4269

	4270 return found;

	4271 }

	4272

	4273

	4274

	4275

	4276 // internal use methods declared in usrchimp.h -----------------------------

	4277

	4278 UBool usearch_handleNextExact(UStringSearch strsrch, UErrorCode status)

	4279 {

	4280 if (U_FAILURE(*status)) {

	4281 setMatchNotFound(strsrch);

	4282 return FALSE;

	4283 }

	4284

	4285 #if BOYER_MOORE

	4286 UCollationElements *coleiter = strsrch->textIter;

	4287 int32_t textlength = strsrch->search->textLength;

	4288 int32_t *patternce = strsrch->pattern.CE;

	4289 int32_t patterncelength = strsrch->pattern.CELength;

	4290 int32_t textoffset = ucol_getOffset(coleiter);

	4291

	4292 // status used in setting coleiter offset, since offset is checked in

	4293 // shiftForward before setting the coleiter offset, status never

	4294 // a failure

	4295 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,

	4296 patterncelength);

	4297 while (textoffset <= textlength)

	4298 {

	4299 uint32_t patternceindex = patterncelength - 1;

	4300 int32_t targetce;

	4301 UBool found = FALSE;

	4302 int32_t lastce = UCOL_NULLORDER;

	4303

	4304 setColEIterOffset(coleiter, textoffset);

	4305

	4306 for (;;) {

	4307 // finding the last pattern ce match, imagine composite characters

	4308 // for example: search for pattern A in text \u00C0

	4309 // we'll have to skip \u0300 the grave first before we get to A

	4310 targetce = ucol_previous(coleiter, status);

	4311 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4312 found = FALSE;

	4313 break;

	4314 }

	4315 targetce = getCE(strsrch, targetce);

	4316 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {

	4317 // this is for the text \u0315\u0300 that requires

	4318 // normalization and pattern \u0300, where \u0315 is ignorable

	4319 continue;

	4320 }

	4321 if (lastce == UCOL_NULLORDER \|\| lastce == UCOL_IGNORABLE) {

	4322 lastce = targetce;

	4323 }

	4324 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4325 if (targetce == patternce[patternceindex]) {

	4326 // the first ce can be a contraction

	4327 found = TRUE;

	4328 break;

	4329 }

	4330 if (!hasExpansion(coleiter)) {

	4331 found = FALSE;

	4332 break;

	4333 }

	4334 }

	4335

	4336 //targetce = lastce;

	4337

	4338 while (found && patternceindex > 0) {

	4339 lastce = targetce;

	4340 targetce = ucol_previous(coleiter, status);

	4341 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4342 found = FALSE;

	4343 break;

	4344 }

	4345 targetce = getCE(strsrch, targetce);

	4346 if (targetce == UCOL_IGNORABLE) {

	4347 continue;

	4348 }

	4349

	4350 patternceindex --;

	4351 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4352 found = found && targetce == patternce[patternceindex];

	4353 }

	4354

	4355 targetce = lastce;

	4356

	4357 if (!found) {

	4358 if (U_FAILURE(*status)) {

	4359 break;

	4360 }

	4361 textoffset = shiftForward(strsrch, textoffset, lastce,

	4362 patternceindex);

	4363 // status checked at loop.

	4364 patternceindex = patterncelength;

	4365 continue;

	4366 }

	4367

	4368 if (checkNextExactMatch(strsrch, &textoffset, status)) {

	4369 // status checked in ucol_setOffset

	4370 setColEIterOffset(coleiter, strsrch->search->matchedIndex);

	4371 return TRUE;

	4372 }

	4373 }

	4374 setMatchNotFound(strsrch);

	4375 return FALSE;

	4376 #else

	4377 int32_t textOffset = ucol_getOffset(strsrch->textIter);

	4378 int32_t start = -1;

	4379 int32_t end = -1;

	4380

	4381 if (usearch_search(strsrch, textOffset, &start, &end, status)) {

	4382 strsrch->search->matchedIndex = start;

	4383 strsrch->search->matchedLength = end - start;

	4384 return TRUE;

	4385 } else {

	4386 setMatchNotFound(strsrch);

	4387 return FALSE;

	4388 }

	4389 #endif

	4390 }

	4391

	4392 UBool usearch_handleNextCanonical(UStringSearch strsrch, UErrorCode status)

	4393 {

	4394 if (U_FAILURE(*status)) {

	4395 setMatchNotFound(strsrch);

	4396 return FALSE;

	4397 }

	4398

	4399 #if BOYER_MOORE

	4400 UCollationElements *coleiter = strsrch->textIter;

	4401 int32_t textlength = strsrch->search->textLength;

	4402 int32_t *patternce = strsrch->pattern.CE;

	4403 int32_t patterncelength = strsrch->pattern.CELength;

	4404 int32_t textoffset = ucol_getOffset(coleiter);

	4405 UBool hasPatternAccents =

	4406 strsrch->pattern.hasSuffixAccents \|\| strsrch->pattern.hasPrefixAccents;

	4407

	4408 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,

	4409 patterncelength);

	4410 strsrch->canonicalPrefixAccents[0] = 0;

	4411 strsrch->canonicalSuffixAccents[0] = 0;

	4412

	4413 while (textoffset <= textlength)

	4414 {

	4415 int32_t patternceindex = patterncelength - 1;

	4416 int32_t targetce;

	4417 UBool found = FALSE;

	4418 int32_t lastce = UCOL_NULLORDER;

	4419

	4420 setColEIterOffset(coleiter, textoffset);

	4421

	4422 for (;;) {

	4423 // finding the last pattern ce match, imagine composite characters

	4424 // for example: search for pattern A in text \u00C0

	4425 // we'll have to skip \u0300 the grave first before we get to A

	4426 targetce = ucol_previous(coleiter, status);

	4427 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4428 found = FALSE;

	4429 break;

	4430 }

	4431 targetce = getCE(strsrch, targetce);

	4432 if (lastce == UCOL_NULLORDER \|\| lastce == UCOL_IGNORABLE) {

	4433 lastce = targetce;

	4434 }

	4435 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4436 if (targetce == patternce[patternceindex]) {

	4437 // the first ce can be a contraction

	4438 found = TRUE;

	4439 break;

	4440 }

	4441 if (!hasExpansion(coleiter)) {

	4442 found = FALSE;

	4443 break;

	4444 }

	4445 }

	4446

	4447 while (found && patternceindex > 0) {

	4448 targetce = ucol_previous(coleiter, status);

	4449 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4450 found = FALSE;

	4451 break;

	4452 }

	4453 targetce = getCE(strsrch, targetce);

	4454 if (targetce == UCOL_IGNORABLE) {

	4455 continue;

	4456 }

	4457

	4458 patternceindex --;

	4459 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4460 found = found && targetce == patternce[patternceindex];

	4461 }

	4462

	4463 // initializing the rearranged accent array

	4464 if (hasPatternAccents && !found) {

	4465 strsrch->canonicalPrefixAccents[0] = 0;

	4466 strsrch->canonicalSuffixAccents[0] = 0;

	4467 if (U_FAILURE(*status)) {

	4468 break;

	4469 }

	4470 found = doNextCanonicalMatch(strsrch, textoffset, status);

	4471 }

	4472

	4473 if (!found) {

	4474 if (U_FAILURE(*status)) {

	4475 break;

	4476 }

	4477 textoffset = shiftForward(strsrch, textoffset, lastce,

	4478 patternceindex);

	4479 // status checked at loop

	4480 patternceindex = patterncelength;

	4481 continue;

	4482 }

	4483

	4484 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {

	4485 setColEIterOffset(coleiter, strsrch->search->matchedIndex);

	4486 return TRUE;

	4487 }

	4488 }

	4489 setMatchNotFound(strsrch);

	4490 return FALSE;

	4491 #else

	4492 int32_t textOffset = ucol_getOffset(strsrch->textIter);

	4493 int32_t start = -1;

	4494 int32_t end = -1;

	4495

	4496 if (usearch_search(strsrch, textOffset, &start, &end, status)) {

	4497 strsrch->search->matchedIndex = start;

	4498 strsrch->search->matchedLength = end - start;

	4499 return TRUE;

	4500 } else {

	4501 setMatchNotFound(strsrch);

	4502 return FALSE;

	4503 }

	4504 #endif

	4505 }

	4506

	4507 UBool usearch_handlePreviousExact(UStringSearch strsrch, UErrorCode status)

	4508 {

	4509 if (U_FAILURE(*status)) {

	4510 setMatchNotFound(strsrch);

	4511 return FALSE;

	4512 }

	4513

	4514 #if BOYER_MOORE

	4515 UCollationElements *coleiter = strsrch->textIter;

	4516 int32_t *patternce = strsrch->pattern.CE;

	4517 int32_t patterncelength = strsrch->pattern.CELength;

	4518 int32_t textoffset = ucol_getOffset(coleiter);

	4519

	4520 // shifting it check for setting offset

	4521 // if setOffset is called previously or there was no previous match, we

	4522 // leave the offset as it is.

	4523 if (strsrch->search->matchedIndex != USEARCH_DONE) {

	4524 textoffset = strsrch->search->matchedIndex;

	4525 }

	4526

	4527 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,

	4528 patterncelength);

	4529

	4530 while (textoffset >= 0)

	4531 {

	4532 int32_t patternceindex = 1;

	4533 int32_t targetce;

	4534 UBool found = FALSE;

	4535 int32_t firstce = UCOL_NULLORDER;

	4536

	4537 // if status is a failure, ucol_setOffset does nothing

	4538 setColEIterOffset(coleiter, textoffset);

	4539

	4540 for (;;) {

	4541 // finding the first pattern ce match, imagine composite

	4542 // characters. for example: search for pattern \u0300 in text

	4543 // \u00C0, we'll have to skip A first before we get to

	4544 // \u0300 the grave accent

	4545 targetce = ucol_next(coleiter, status);

	4546 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4547 found = FALSE;

	4548 break;

	4549 }

	4550 targetce = getCE(strsrch, targetce);

	4551 if (firstce == UCOL_NULLORDER \|\| firstce == UCOL_IGNORABLE) {

	4552 firstce = targetce;

	4553 }

	4554 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {

	4555 continue;

	4556 }

	4557 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4558 if (targetce == patternce[0]) {

	4559 found = TRUE;

	4560 break;

	4561 }

	4562 if (!hasExpansion(coleiter)) {

	4563 // checking for accents in composite character

	4564 found = FALSE;

	4565 break;

	4566 }

	4567 }

	4568

	4569 //targetce = firstce;

	4570

	4571 while (found && (patternceindex < patterncelength)) {

	4572 firstce = targetce;

	4573 targetce = ucol_next(coleiter, status);

	4574 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4575 found = FALSE;

	4576 break;

	4577 }

	4578 targetce = getCE(strsrch, targetce);

	4579 if (targetce == UCOL_IGNORABLE) {

	4580 continue;

	4581 }

	4582

	4583 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4584 found = found && targetce == patternce[patternceindex];

	4585 patternceindex ++;

	4586 }

	4587

	4588 targetce = firstce;

	4589

	4590 if (!found) {

	4591 if (U_FAILURE(*status)) {

	4592 break;

	4593 }

	4594

	4595 textoffset = reverseShift(strsrch, textoffset, targetce,

	4596 patternceindex);

	4597 patternceindex = 0;

	4598 continue;

	4599 }

	4600

	4601 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {

	4602 setColEIterOffset(coleiter, textoffset);

	4603 return TRUE;

	4604 }

	4605 }

	4606 setMatchNotFound(strsrch);

	4607 return FALSE;

	4608 #else

	4609 int32_t textOffset = ucol_getOffset(strsrch->textIter);

	4610 int32_t start = -1;

	4611 int32_t end = -1;

	4612

	4613 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {

	4614 strsrch->search->matchedIndex = start;

	4615 strsrch->search->matchedLength = end - start;

	4616 return TRUE;

	4617 } else {

	4618 setMatchNotFound(strsrch);

	4619 return FALSE;

	4620 }

	4621 #endif

	4622 }

	4623

	4624 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,

	4625 UErrorCode *status)

	4626 {

	4627 if (U_FAILURE(*status)) {

	4628 setMatchNotFound(strsrch);

	4629 return FALSE;

	4630 }

	4631

	4632 #if BOYER_MOORE

	4633 UCollationElements *coleiter = strsrch->textIter;

	4634 int32_t *patternce = strsrch->pattern.CE;

	4635 int32_t patterncelength = strsrch->pattern.CELength;

	4636 int32_t textoffset = ucol_getOffset(coleiter);

	4637 UBool hasPatternAccents =

	4638 strsrch->pattern.hasSuffixAccents \|\| strsrch->pattern.hasPrefixAccents;

	4639

	4640 // shifting it check for setting offset

	4641 // if setOffset is called previously or there was no previous match, we

	4642 // leave the offset as it is.

	4643 if (strsrch->search->matchedIndex != USEARCH_DONE) {

	4644 textoffset = strsrch->search->matchedIndex;

	4645 }

	4646

	4647 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,

	4648 patterncelength);

	4649 strsrch->canonicalPrefixAccents[0] = 0;

	4650 strsrch->canonicalSuffixAccents[0] = 0;

	4651

	4652 while (textoffset >= 0)

	4653 {

	4654 int32_t patternceindex = 1;

	4655 int32_t targetce;

	4656 UBool found = FALSE;

	4657 int32_t firstce = UCOL_NULLORDER;

	4658

	4659 setColEIterOffset(coleiter, textoffset);

	4660 for (;;) {

	4661 // finding the first pattern ce match, imagine composite

	4662 // characters. for example: search for pattern \u0300 in text

	4663 // \u00C0, we'll have to skip A first before we get to

	4664 // \u0300 the grave accent

	4665 targetce = ucol_next(coleiter, status);

	4666 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4667 found = FALSE;

	4668 break;

	4669 }

	4670 targetce = getCE(strsrch, targetce);

	4671 if (firstce == UCOL_NULLORDER \|\| firstce == UCOL_IGNORABLE) {

	4672 firstce = targetce;

	4673 }

	4674

	4675 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4676 if (targetce == patternce[0]) {

	4677 // the first ce can be a contraction

	4678 found = TRUE;

	4679 break;

	4680 }

	4681 if (!hasExpansion(coleiter)) {

	4682 // checking for accents in composite character

	4683 found = FALSE;

	4684 break;

	4685 }

	4686 }

	4687

	4688 targetce = firstce;

	4689

	4690 while (found && patternceindex < patterncelength) {

	4691 targetce = ucol_next(coleiter, status);

	4692 if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {

	4693 found = FALSE;

	4694 break;

	4695 }

	4696 targetce = getCE(strsrch, targetce);

	4697 if (targetce == UCOL_IGNORABLE) {

	4698 continue;

	4699 }

	4700

	4701 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

	4702 found = found && targetce == patternce[patternceindex];

	4703 patternceindex ++;

	4704 }

	4705

	4706 // initializing the rearranged accent array

	4707 if (hasPatternAccents && !found) {

	4708 strsrch->canonicalPrefixAccents[0] = 0;

	4709 strsrch->canonicalSuffixAccents[0] = 0;

	4710 if (U_FAILURE(*status)) {

	4711 break;

	4712 }

	4713 found = doPreviousCanonicalMatch(strsrch, textoffset, status);

	4714 }

	4715

	4716 if (!found) {

	4717 if (U_FAILURE(*status)) {

	4718 break;

	4719 }

	4720 textoffset = reverseShift(strsrch, textoffset, targetce,

	4721 patternceindex);

	4722 patternceindex = 0;

	4723 continue;

	4724 }

	4725

	4726 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {

	4727 setColEIterOffset(coleiter, textoffset);

	4728 return TRUE;

	4729 }

	4730 }

	4731 setMatchNotFound(strsrch);

	4732 return FALSE;

	4733 #else

	4734 int32_t textOffset = ucol_getOffset(strsrch->textIter);

	4735 int32_t start = -1;

	4736 int32_t end = -1;

	4737

	4738 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {

	4739 strsrch->search->matchedIndex = start;

	4740 strsrch->search->matchedLength = end - start;

	4741 return TRUE;

	4742 } else {

	4743 setMatchNotFound(strsrch);

	4744 return FALSE;

	4745 }

	4746 #endif

	4747 }

	4748

	4749 #endif /* #if !UCONFIG_NO_COLLATION */

OLD	NEW

« no previous file with comments | « icu46/source/i18n/uregexc.cpp ('k') | icu46/source/i18n/uspoof.cpp » ('j') | no next file with comments »