icu46/source/common/uniset.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/uniset.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 1999-2009, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * Date Name Description

	7 * 10/20/99 alan Creation.

	8 **********************************************************************

	9 */

	10

	11 #include "unicode/utypes.h"

	12 #include "unicode/uniset.h"

	13 #include "unicode/parsepos.h"

	14 #include "unicode/symtable.h"

	15 #include "ruleiter.h"

	16 #include "cmemory.h"

	17 #include "cstring.h"

	18 #include "uhash.h"

	19 #include "util.h"

	20 #include "uvector.h"

	21 #include "charstr.h"

	22 #include "ustrfmt.h"

	23 #include "uassert.h"

	24 #include "hash.h"

	25 #include "bmpset.h"

	26 #include "unisetspan.h"

	27

	28 // Define UChar constants using hex for EBCDIC compatibility

	29 // Used #define to reduce private static exports and memory access time.

	30 #define SET_OPEN ((UChar)0x005B) /[/

	31 #define SET_CLOSE ((UChar)0x005D) /]/

	32 #define HYPHEN ((UChar)0x002D) /-/

	33 #define COMPLEMENT ((UChar)0x005E) /^/

	34 #define COLON ((UChar)0x003A) /:/

	35 #define BACKSLASH ((UChar)0x005C) /\/

	36 #define INTERSECTION ((UChar)0x0026) /&/

	37 #define UPPER_U ((UChar)0x0055) /U/

	38 #define LOWER_U ((UChar)0x0075) /u/

	39 #define OPEN_BRACE ((UChar)123) /{/

	40 #define CLOSE_BRACE ((UChar)125) /}/

	41 #define UPPER_P ((UChar)0x0050) /P/

	42 #define LOWER_P ((UChar)0x0070) /p/

	43 #define UPPER_N ((UChar)78) /N/

	44 #define EQUALS ((UChar)0x003D) /=/

	45

	46 // HIGH_VALUE > all valid values. 110000 for codepoints

	47 #define UNICODESET_HIGH 0x0110000

	48

	49 // LOW <= all valid values. ZERO for codepoints

	50 #define UNICODESET_LOW 0x000000

	51

	52 // initial storage. Must be >= 0

	53 #define START_EXTRA 16

	54

	55 // extra amount for growth. Must be >= 0

	56 #define GROW_EXTRA START_EXTRA

	57

	58 U_NAMESPACE_BEGIN

	59

	60 SymbolTable::~SymbolTable() {}

	61

	62 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)

	63

	64 /**

	65 * Modify the given UChar32 variable so that it is in range, by

	66 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and

	67 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.

	68 * It modifies its argument in-place and also returns it.

	69 */

	70 static inline UChar32 pinCodePoint(UChar32& c) {

	71 if (c < UNICODESET_LOW) {

	72 c = UNICODESET_LOW;

	73 } else if (c > (UNICODESET_HIGH-1)) {

	74 c = (UNICODESET_HIGH-1);

	75 }

	76 return c;

	77 }

	78

	79 //----------------------------------------------------------------

	80 // Debugging

	81 //----------------------------------------------------------------

	82

	83 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.

	84 // To enable the debugging, define the symbol DEBUG_MEM in the line

	85 // below. This will result in text being sent to stdout that looks

	86 // like this:

	87 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-

	88 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-

	89 // Each line lists a construction (ct) or destruction (dt) event, the

	90 // object address, the number of outstanding objects after the event,

	91 // and the pattern of the object in question.

	92

	93 // #define DEBUG_MEM

	94

	95 #ifdef DEBUG_MEM

	96 #include <stdio.h>

	97 static int32_t _dbgCount = 0;

	98

	99 static inline void _dbgct(UnicodeSet* set) {

	100 UnicodeString str;

	101 set->toPattern(str, TRUE);

	102 char buf[40];

	103 str.extract(0, 39, buf, "");

	104 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);

	105 }

	106

	107 static inline void _dbgdt(UnicodeSet* set) {

	108 UnicodeString str;

	109 set->toPattern(str, TRUE);

	110 char buf[40];

	111 str.extract(0, 39, buf, "");

	112 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);

	113 }

	114

	115 #else

	116

	117 #define _dbgct(set)

	118 #define _dbgdt(set)

	119

	120 #endif

	121

	122 //----------------------------------------------------------------

	123 // UnicodeString in UVector support

	124 //----------------------------------------------------------------

	125

	126 static void U_CALLCONV cloneUnicodeString(UHashTok dst, UHashTok src) {

	127 dst->pointer = new UnicodeString((UnicodeString)src->pointer);

	128 }

	129

	130 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {

	131 const UnicodeString &a = (const UnicodeString)t1.pointer;

	132 const UnicodeString &b = (const UnicodeString)t2.pointer;

	133 return a.compare(b);

	134 }

	135

	136 //----------------------------------------------------------------

	137 // Constructors &c

	138 //----------------------------------------------------------------

	139

	140 /**

	141 * Constructs an empty set.

	142 */

	143 UnicodeSet::UnicodeSet() :

	144 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),

	145 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

	146 fFlags(0)

	147 {

	148 UErrorCode status = U_ZERO_ERROR;

	149 allocateStrings(status);

	150 if (U_FAILURE(status)) {

	151 return;

	152 }

	153 list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);

	154 if(list!=NULL){

	155 list[0] = UNICODESET_HIGH;

	156 } else { // If memory allocation failed, set to bogus state.

	157 setToBogus();

	158 return;

	159 }

	160 _dbgct(this);

	161 }

	162

	163 /**

	164 * Constructs a set containing the given range. If <code>end >

	165 * start</code> then an empty set is created.

	166 *

	167 * @param start first character, inclusive, of range

	168 * @param end last character, inclusive, of range

	169 */

	170 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :

	171 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),

	172 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

	173 fFlags(0)

	174 {

	175 UErrorCode status = U_ZERO_ERROR;

	176 allocateStrings(status);

	177 if (U_FAILURE(status)) {

	178 return;

	179 }

	180 list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);

	181 if(list!=NULL){

	182 list[0] = UNICODESET_HIGH;

	183 complement(start, end);

	184 } else { // If memory allocation failed, set to bogus state.

	185 setToBogus();

	186 return;

	187 }

	188 _dbgct(this);

	189 }

	190

	191 /**

	192 * Constructs a set that is identical to the given UnicodeSet.

	193 */

	194 UnicodeSet::UnicodeSet(const UnicodeSet& o) :

	195 UnicodeFilter(o),

	196 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),

	197 bmpSet(0),

	198 buffer(0), bufferCapacity(0),

	199 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

	200 fFlags(0)

	201 {

	202 UErrorCode status = U_ZERO_ERROR;

	203 allocateStrings(status);

	204 if (U_FAILURE(status)) {

	205 return;

	206 }

	207 list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);

	208 if(list!=NULL){

	209 *this = o;

	210 } else { // If memory allocation failed, set to bogus state.

	211 setToBogus();

	212 return;

	213 }

	214 _dbgct(this);

	215 }

	216

	217 // Copy-construct as thawed.

	218 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :

	219 UnicodeFilter(o),

	220 len(0), capacity(o.len + GROW_EXTRA), list(0),

	221 bmpSet(0),

	222 buffer(0), bufferCapacity(0),

	223 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

	224 fFlags(0)

	225 {

	226 UErrorCode status = U_ZERO_ERROR;

	227 allocateStrings(status);

	228 if (U_FAILURE(status)) {

	229 return;

	230 }

	231 list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);

	232 if(list!=NULL){

	233 // *this = o except for bmpSet and stringSpan

	234 len = o.len;

	235 uprv_memcpy(list, o.list, len*sizeof(UChar32));

	236 if (strings != NULL && o.strings != NULL) {

	237 strings->assign(*o.strings, cloneUnicodeString, status);

	238 } else { // Invalid strings.

	239 setToBogus();

	240 return;

	241 }

	242 if (o.pat) {

	243 setPattern(UnicodeString(o.pat, o.patLen));

	244 }

	245 } else { // If memory allocation failed, set to bogus state.

	246 setToBogus();

	247 return;

	248 }

	249 _dbgct(this);

	250 }

	251

	252 /**

	253 * Destructs the set.

	254 */

	255 UnicodeSet::~UnicodeSet() {

	256 _dbgdt(this); // first!

	257 uprv_free(list);

	258 delete bmpSet;

	259 if (buffer) {

	260 uprv_free(buffer);

	261 }

	262 delete strings;

	263 delete stringSpan;

	264 releasePattern();

	265 }

	266

	267 /**

	268 * Assigns this object to be a copy of another.

	269 */

	270 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {

	271 if (this == &o) {

	272 return *this;

	273 }

	274 if (isFrozen()) {

	275 return *this;

	276 }

	277 if (o.isBogus()) {

	278 setToBogus();

	279 return *this;

	280 }

	281 UErrorCode ec = U_ZERO_ERROR;

	282 ensureCapacity(o.len, ec);

	283 if (U_FAILURE(ec)) {

	284 return *this; // There is no way to report this error :-(

	285 }

	286 len = o.len;

	287 uprv_memcpy(list, o.list, len*sizeof(UChar32));

	288 if (o.bmpSet == NULL) {

	289 bmpSet = NULL;

	290 } else {

	291 bmpSet = new BMPSet(*o.bmpSet, list, len);

	292 if (bmpSet == NULL) { // Check for memory allocation error.

	293 setToBogus();

	294 return *this;

	295 }

	296 }

	297 if (strings != NULL && o.strings != NULL) {

	298 strings->assign(*o.strings, cloneUnicodeString, ec);

	299 } else { // Invalid strings.

	300 setToBogus();

	301 return *this;

	302 }

	303 if (o.stringSpan == NULL) {

	304 stringSpan = NULL;

	305 } else {

	306 stringSpan = new UnicodeSetStringSpan(o.stringSpan, strings);

	307 if (stringSpan == NULL) { // Check for memory allocation error.

	308 setToBogus();

	309 return *this;

	310 }

	311 }

	312 releasePattern();

	313 if (o.pat) {

	314 setPattern(UnicodeString(o.pat, o.patLen));

	315 }

	316 return *this;

	317 }

	318

	319 /**

	320 * Returns a copy of this object. All UnicodeMatcher objects have

	321 * to support cloning in order to allow classes using

	322 * UnicodeMatchers, such as Transliterator, to implement cloning.

	323 */

	324 UnicodeFunctor* UnicodeSet::clone() const {

	325 return new UnicodeSet(*this);

	326 }

	327

	328 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {

	329 return new UnicodeSet(*this, TRUE);

	330 }

	331

	332 /**

	333 * Compares the specified object with this set for equality. Returns

	334 * <tt>true</tt> if the two sets

	335 * have the same size, and every member of the specified set is

	336 * contained in this set (or equivalently, every member of this set is

	337 * contained in the specified set).

	338 *

	339 * @param o set to be compared for equality with this set.

	340 * @return <tt>true</tt> if the specified set is equal to this set.

	341 */

	342 UBool UnicodeSet::operator==(const UnicodeSet& o) const {

	343 if (len != o.len) return FALSE;

	344 for (int32_t i = 0; i < len; ++i) {

	345 if (list[i] != o.list[i]) return FALSE;

	346 }

	347 if (strings != o.strings) return FALSE;

	348 return TRUE;

	349 }

	350

	351 /**

	352 * Returns the hash code value for this set.

	353 *

	354 * @return the hash code value for this set.

	355 * @see Object#hashCode()

	356 */

	357 int32_t UnicodeSet::hashCode(void) const {

	358 int32_t result = len;

	359 for (int32_t i = 0; i < len; ++i) {

	360 result *= 1000003;

	361 result += list[i];

	362 }

	363 return result;

	364 }

	365

	366 //----------------------------------------------------------------

	367 // Public API

	368 //----------------------------------------------------------------

	369

	370 /**

	371 * Returns the number of elements in this set (its cardinality),

	372 * Note than the elements of a set may include both individual

	373 * codepoints and strings.

	374 *

	375 * @return the number of elements in this set (its cardinality).

	376 */

	377 int32_t UnicodeSet::size(void) const {

	378 int32_t n = 0;

	379 int32_t count = getRangeCount();

	380 for (int32_t i = 0; i < count; ++i) {

	381 n += getRangeEnd(i) - getRangeStart(i) + 1;

	382 }

	383 return n + strings->size();

	384 }

	385

	386 /**

	387 * Returns <tt>true</tt> if this set contains no elements.

	388 *

	389 * @return <tt>true</tt> if this set contains no elements.

	390 */

	391 UBool UnicodeSet::isEmpty(void) const {

	392 return len == 1 && strings->size() == 0;

	393 }

	394

	395 /**

	396 * Returns true if this set contains the given character.

	397 * @param c character to be checked for containment

	398 * @return true if the test condition is met

	399 */

	400 UBool UnicodeSet::contains(UChar32 c) const {

	401 // Set i to the index of the start item greater than ch

	402 // We know we will terminate without length test!

	403 // LATER: for large sets, add binary search

	404 //int32_t i = -1;

	405 //for (;;) {

	406 // if (c < list[++i]) break;

	407 //}

	408 if (bmpSet != NULL) {

	409 return bmpSet->contains(c);

	410 }

	411 if (stringSpan != NULL) {

	412 return stringSpan->contains(c);

	413 }

	414 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound

	415 return FALSE;

	416 }

	417 int32_t i = findCodePoint(c);

	418 return (UBool)(i & 1); // return true if odd

	419 }

	420

	421 /**

	422 * Returns the smallest value i such that c < list[i]. Caller

	423 * must ensure that c is a legal value or this method will enter

	424 * an infinite loop. This method performs a binary search.

	425 * @param c a character in the range MIN_VALUE..MAX_VALUE

	426 * inclusive

	427 * @return the smallest integer i in the range 0..len-1,

	428 * inclusive, such that c < list[i]

	429 */

	430 int32_t UnicodeSet::findCodePoint(UChar32 c) const {

	431 /* Examples:

	432 findCodePoint(c)

	433 set list[] c=0 1 3 4 7 8

	434 === ============== ===========

	435 [] [110000] 0 0 0 0 0 0

	436 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2

	437 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2

	438 [:Any:] [0, 110000] 1 1 1 1 1 1

	439 */

	440

	441 // Return the smallest i such that c < list[i]. Assume

	442 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).

	443 if (c < list[0])

	444 return 0;

	445 // High runner test. c is often after the last range, so an

	446 // initial check for this condition pays off.

	447 int32_t lo = 0;

	448 int32_t hi = len - 1;

	449 if (lo >= hi \|\| c >= list[hi-1])

	450 return hi;

	451 // invariant: c >= list[lo]

	452 // invariant: c < list[hi]

	453 for (;;) {

	454 int32_t i = (lo + hi) >> 1;

	455 if (i == lo) {

	456 break; // Found!

	457 } else if (c < list[i]) {

	458 hi = i;

	459 } else {

	460 lo = i;

	461 }

	462 }

	463 return hi;

	464 }

	465

	466 /**

	467 * Returns true if this set contains every character

	468 * of the given range.

	469 * @param start first character, inclusive, of the range

	470 * @param end last character, inclusive, of the range

	471 * @return true if the test condition is met

	472 */

	473 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {

	474 //int32_t i = -1;

	475 //for (;;) {

	476 // if (start < list[++i]) break;

	477 //}

	478 int32_t i = findCodePoint(start);

	479 return ((i & 1) != 0 && end < list[i]);

	480 }

	481

	482 /**

	483 * Returns <tt>true</tt> if this set contains the given

	484 * multicharacter string.

	485 * @param s string to be checked for containment

	486 * @return <tt>true</tt> if this set contains the specified string

	487 */

	488 UBool UnicodeSet::contains(const UnicodeString& s) const {

	489 if (s.length() == 0) return FALSE;

	490 int32_t cp = getSingleCP(s);

	491 if (cp < 0) {

	492 return strings->contains((void*) &s);

	493 } else {

	494 return contains((UChar32) cp);

	495 }

	496 }

	497

	498 /**

	499 * Returns true if this set contains all the characters and strings

	500 * of the given set.

	501 * @param c set to be checked for containment

	502 * @return true if the test condition is met

	503 */

	504 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {

	505 // The specified set is a subset if all of its pairs are contained in

	506 // this set. It's possible to code this more efficiently in terms of

	507 // direct manipulation of the inversion lists if the need arises.

	508 int32_t n = c.getRangeCount();

	509 for (int i=0; i<n; ++i) {

	510 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {

	511 return FALSE;

	512 }

	513 }

	514 if (!strings->containsAll(*c.strings)) return FALSE;

	515 return TRUE;

	516 }

	517

	518 /**

	519 * Returns true if this set contains all the characters

	520 * of the given string.

	521 * @param s string containing characters to be checked for containment

	522 * @return true if the test condition is met

	523 */

	524 UBool UnicodeSet::containsAll(const UnicodeString& s) const {

	525 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==

	526 s.length());

	527 }

	528

	529 /**

	530 * Returns true if this set contains none of the characters

	531 * of the given range.

	532 * @param start first character, inclusive, of the range

	533 * @param end last character, inclusive, of the range

	534 * @return true if the test condition is met

	535 */

	536 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {

	537 //int32_t i = -1;

	538 //for (;;) {

	539 // if (start < list[++i]) break;

	540 //}

	541 int32_t i = findCodePoint(start);

	542 return ((i & 1) == 0 && end < list[i]);

	543 }

	544

	545 /**

	546 * Returns true if this set contains none of the characters and strings

	547 * of the given set.

	548 * @param c set to be checked for containment

	549 * @return true if the test condition is met

	550 */

	551 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {

	552 // The specified set is a subset if all of its pairs are contained in

	553 // this set. It's possible to code this more efficiently in terms of

	554 // direct manipulation of the inversion lists if the need arises.

	555 int32_t n = c.getRangeCount();

	556 for (int32_t i=0; i<n; ++i) {

	557 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {

	558 return FALSE;

	559 }

	560 }

	561 if (!strings->containsNone(*c.strings)) return FALSE;

	562 return TRUE;

	563 }

	564

	565 /**

	566 * Returns true if this set contains none of the characters

	567 * of the given string.

	568 * @param s string containing characters to be checked for containment

	569 * @return true if the test condition is met

	570 */

	571 UBool UnicodeSet::containsNone(const UnicodeString& s) const {

	572 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==

	573 s.length());

	574 }

	575

	576 /**

	577 * Returns <tt>true</tt> if this set contains any character whose low byte

	578 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for

	579 * indexing.

	580 */

	581 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {

	582 /* The index value v, in the range [0,255], is contained in this set if

	583 * it is contained in any pair of this set. Pairs either have the high

	584 * bytes equal, or unequal. If the high bytes are equal, then we have

	585 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=

	586 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.

	587 * Then v is contained if xx <= v \|\| v <= yy. (This is identical to the

	588 * time zone month containment logic.)

	589 */

	590 int32_t i;

	591 int32_t rangeCount=getRangeCount();

	592 for (i=0; i<rangeCount; ++i) {

	593 UChar32 low = getRangeStart(i);

	594 UChar32 high = getRangeEnd(i);

	595 if ((low & ~0xFF) == (high & ~0xFF)) {

	596 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {

	597 return TRUE;

	598 }

	599 } else if ((low & 0xFF) <= v \|\| v <= (high & 0xFF)) {

	600 return TRUE;

	601 }

	602 }

	603 if (strings->size() != 0) {

	604 for (i=0; i<strings->size(); ++i) {

	605 const UnicodeString& s = (const UnicodeString)strings->elementAt(i );

	606 //if (s.length() == 0) {

	607 // // Empty strings match everything

	608 // return TRUE;

	609 //}

	610 // assert(s.length() != 0); // We enforce this elsewhere

	611 UChar32 c = s.char32At(0);

	612 if ((c & 0xFF) == v) {

	613 return TRUE;

	614 }

	615 }

	616 }

	617 return FALSE;

	618 }

	619

	620 /**

	621 * Implementation of UnicodeMatcher::matches(). Always matches the

	622 * longest possible multichar string.

	623 */

	624 UMatchDegree UnicodeSet::matches(const Replaceable& text,

	625 int32_t& offset,

	626 int32_t limit,

	627 UBool incremental) {

	628 if (offset == limit) {

	629 // Strings, if any, have length != 0, so we don't worry

	630 // about them here. If we ever allow zero-length strings

	631 // we much check for them here.

	632 if (contains(U_ETHER)) {

	633 return incremental ? U_PARTIAL_MATCH : U_MATCH;

	634 } else {

	635 return U_MISMATCH;

	636 }

	637 } else {

	638 if (strings->size() != 0) { // try strings first

	639

	640 // might separate forward and backward loops later

	641 // for now they are combined

	642

	643 // TODO Improve efficiency of this, at least in the forward

	644 // direction, if not in both. In the forward direction we

	645 // can assume the strings are sorted.

	646

	647 int32_t i;

	648 UBool forward = offset < limit;

	649

	650 // firstChar is the leftmost char to match in the

	651 // forward direction or the rightmost char to match in

	652 // the reverse direction.

	653 UChar firstChar = text.charAt(offset);

	654

	655 // If there are multiple strings that can match we

	656 // return the longest match.

	657 int32_t highWaterLength = 0;

	658

	659 for (i=0; i<strings->size(); ++i) {

	660 const UnicodeString& trial = (const UnicodeString)strings->ele mentAt(i);

	661

	662 //if (trial.length() == 0) {

	663 // return U_MATCH; // null-string always matches

	664 //}

	665 // assert(trial.length() != 0); // We ensure this elsewhere

	666

	667 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);

	668

	669 // Strings are sorted, so we can optimize in the

	670 // forward direction.

	671 if (forward && c > firstChar) break;

	672 if (c != firstChar) continue;

	673

	674 int32_t matchLen = matchRest(text, offset, limit, trial);

	675

	676 if (incremental) {

	677 int32_t maxLen = forward ? limit-offset : offset-limit;

	678 if (matchLen == maxLen) {

	679 // We have successfully matched but only up to limit.

	680 return U_PARTIAL_MATCH;

	681 }

	682 }

	683

	684 if (matchLen == trial.length()) {

	685 // We have successfully matched the whole string.

	686 if (matchLen > highWaterLength) {

	687 highWaterLength = matchLen;

	688 }

	689 // In the forward direction we know strings

	690 // are sorted so we can bail early.

	691 if (forward && matchLen < highWaterLength) {

	692 break;

	693 }

	694 continue;

	695 }

	696 }

	697

	698 // We've checked all strings without a partial match.

	699 // If we have full matches, return the longest one.

	700 if (highWaterLength != 0) {

	701 offset += forward ? highWaterLength : -highWaterLength;

	702 return U_MATCH;

	703 }

	704 }

	705 return UnicodeFilter::matches(text, offset, limit, incremental);

	706 }

	707 }

	708

	709 /**

	710 * Returns the longest match for s in text at the given position.

	711 * If limit > start then match forward from start+1 to limit

	712 * matching all characters except s.charAt(0). If limit < start,

	713 * go backward starting from start-1 matching all characters

	714 * except s.charAt(s.length()-1). This method assumes that the

	715 * first character, text.charAt(start), matches s, so it does not

	716 * check it.

	717 * @param text the text to match

	718 * @param start the first character to match. In the forward

	719 * direction, text.charAt(start) is matched against s.charAt(0).

	720 * In the reverse direction, it is matched against

	721 * s.charAt(s.length()-1).

	722 * @param limit the limit offset for matching, either last+1 in

	723 * the forward direction, or last-1 in the reverse direction,

	724 * where last is the index of the last character to match.

	725 * @return If part of s matches up to the limit, return \|limit -

	726 * start\|. If all of s matches before reaching the limit, return

	727 * s.length(). If there is a mismatch between s and text, return

	728 * 0

	729 */

	730 int32_t UnicodeSet::matchRest(const Replaceable& text,

	731 int32_t start, int32_t limit,

	732 const UnicodeString& s) {

	733 int32_t i;

	734 int32_t maxLen;

	735 int32_t slen = s.length();

	736 if (start < limit) {

	737 maxLen = limit - start;

	738 if (maxLen > slen) maxLen = slen;

	739 for (i = 1; i < maxLen; ++i) {

	740 if (text.charAt(start + i) != s.charAt(i)) return 0;

	741 }

	742 } else {

	743 maxLen = start - limit;

	744 if (maxLen > slen) maxLen = slen;

	745 --slen; // <=> slen = s.length() - 1;

	746 for (i = 1; i < maxLen; ++i) {

	747 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;

	748 }

	749 }

	750 return maxLen;

	751 }

	752

	753 /**

	754 * Implement of UnicodeMatcher

	755 */

	756 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {

	757 toUnionTo.addAll(*this);

	758 }

	759

	760 /**

	761 * Returns the index of the given character within this set, where

	762 * the set is ordered by ascending code point. If the character

	763 * is not in this set, return -1. The inverse of this method is

	764 * <code>charAt()</code>.

	765 * @return an index from 0..size()-1, or -1

	766 */

	767 int32_t UnicodeSet::indexOf(UChar32 c) const {

	768 if (c < MIN_VALUE \|\| c > MAX_VALUE) {

	769 return -1;

	770 }

	771 int32_t i = 0;

	772 int32_t n = 0;

	773 for (;;) {

	774 UChar32 start = list[i++];

	775 if (c < start) {

	776 return -1;

	777 }

	778 UChar32 limit = list[i++];

	779 if (c < limit) {

	780 return n + c - start;

	781 }

	782 n += limit - start;

	783 }

	784 }

	785

	786 /**

	787 * Returns the character at the given index within this set, where

	788 * the set is ordered by ascending code point. If the index is

	789 * out of range, return (UChar32)-1. The inverse of this method is

	790 * <code>indexOf()</code>.

	791 * @param index an index from 0..size()-1

	792 * @return the character at the given index, or (UChar32)-1.

	793 */

	794 UChar32 UnicodeSet::charAt(int32_t index) const {

	795 if (index >= 0) {

	796 // len2 is the largest even integer <= len, that is, it is len

	797 // for even values and len-1 for odd values. With odd values

	798 // the last entry is UNICODESET_HIGH.

	799 int32_t len2 = len & ~1;

	800 for (int32_t i=0; i < len2;) {

	801 UChar32 start = list[i++];

	802 int32_t count = list[i++] - start;

	803 if (index < count) {

	804 return (UChar32)(start + index);

	805 }

	806 index -= count;

	807 }

	808 }

	809 return (UChar32)-1;

	810 }

	811

	812 /**

	813 * Make this object represent the range <code>start - end</code>.

	814 * If <code>end > start</code> then this object is set to an

	815 * an empty range.

	816 *

	817 * @param start first character in the set, inclusive

	818 * @rparam end last character in the set, inclusive

	819 */

	820 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {

	821 clear();

	822 complement(start, end);

	823 return *this;

	824 }

	825

	826 /**

	827 * Adds the specified range to this set if it is not already

	828 * present. If this set already contains the specified range,

	829 * the call leaves this set unchanged. If <code>end > start</code>

	830 * then an empty range is added, leaving the set unchanged.

	831 *

	832 * @param start first character, inclusive, of range to be added

	833 * to this set.

	834 * @param end last character, inclusive, of range to be added

	835 * to this set.

	836 */

	837 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {

	838 if (pinCodePoint(start) < pinCodePoint(end)) {

	839 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };

	840 add(range, 2, 0);

	841 } else if (start == end) {

	842 add(start);

	843 }

	844 return *this;

	845 }

	846

	847 // #define DEBUG_US_ADD

	848

	849 #ifdef DEBUG_US_ADD

	850 #include <stdio.h>

	851 void dump(UChar32 c) {

	852 if (c <= 0xFF) {

	853 printf("%c", (char)c);

	854 } else {

	855 printf("U+%04X", c);

	856 }

	857 }

	858 void dump(const UChar32* list, int32_t len) {

	859 printf("[");

	860 for (int32_t i=0; i<len; ++i) {

	861 if (i != 0) printf(", ");

	862 dump(list[i]);

	863 }

	864 printf("]");

	865 }

	866 #endif

	867

	868 /**

	869 * Adds the specified character to this set if it is not already

	870 * present. If this set already contains the specified character,

	871 * the call leaves this set unchanged.

	872 */

	873 UnicodeSet& UnicodeSet::add(UChar32 c) {

	874 // find smallest i such that c < list[i]

	875 // if odd, then it is IN the set

	876 // if even, then it is OUT of the set

	877 int32_t i = findCodePoint(pinCodePoint(c));

	878

	879 // already in set?

	880 if ((i & 1) != 0 \|\| isFrozen() \|\| isBogus()) return *this;

	881

	882 // HIGH is 0x110000

	883 // assert(list[len-1] == HIGH);

	884

	885 // empty = [HIGH]

	886 // [start_0, limit_0, start_1, limit_1, HIGH]

	887

	888 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]

	889 // ^

	890 // list[i]

	891

	892 // i == 0 means c is before the first range

	893

	894 #ifdef DEBUG_US_ADD

	895 printf("Add of ");

	896 dump(c);

	897 printf(" found at %d", i);

	898 printf(": ");

	899 dump(list, len);

	900 printf(" => ");

	901 #endif

	902

	903 if (c == list[i]-1) {

	904 // c is before start of next range

	905 list[i] = c;

	906 // if we touched the HIGH mark, then add a new one

	907 if (c == (UNICODESET_HIGH - 1)) {

	908 UErrorCode status = U_ZERO_ERROR;

	909 ensureCapacity(len+1, status);

	910 if (U_FAILURE(status)) {

	911 return *this; // There is no way to report this error :-(

	912 }

	913 list[len++] = UNICODESET_HIGH;

	914 }

	915 if (i > 0 && c == list[i-1]) {

	916 // collapse adjacent ranges

	917

	918 // [..., start_k-1, c, c, limit_k, ..., HIGH]

	919 // ^

	920 // list[i]

	921

	922 //for (int32_t k=i-1; k<len-2; ++k) {

	923 // list[k] = list[k+2];

	924 //}

	925 UChar32* dst = list + i - 1;

	926 UChar32* src = dst + 2;

	927 UChar32* srclimit = list + len;

	928 while (src < srclimit) (dst++) = (src++);

	929

	930 len -= 2;

	931 }

	932 }

	933

	934 else if (i > 0 && c == list[i-1]) {

	935 // c is after end of prior range

	936 list[i-1]++;

	937 // no need to check for collapse here

	938 }

	939

	940 else {

	941 // At this point we know the new char is not adjacent to

	942 // any existing ranges, and it is not 10FFFF.

	943

	944

	945 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]

	946 // ^

	947 // list[i]

	948

	949 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]

	950 // ^

	951 // list[i]

	952

	953 UErrorCode status = U_ZERO_ERROR;

	954 ensureCapacity(len+2, status);

	955 if (U_FAILURE(status)) {

	956 return *this; // There is no way to report this error :-(

	957 }

	958

	959 //for (int32_t k=len-1; k>=i; --k) {

	960 // list[k+2] = list[k];

	961 //}

	962 UChar32* src = list + len;

	963 UChar32* dst = src + 2;

	964 UChar32* srclimit = list + i;

	965 while (src > srclimit) (--dst) = (--src);

	966

	967 list[i] = c;

	968 list[i+1] = c+1;

	969 len += 2;

	970 }

	971

	972 #ifdef DEBUG_US_ADD

	973 dump(list, len);

	974 printf("\n");

	975

	976 for (i=1; i<len; ++i) {

	977 if (list[i] <= list[i-1]) {

	978 // Corrupt array!

	979 printf("ERROR: list has been corrupted\n");

	980 exit(1);

	981 }

	982 }

	983 #endif

	984

	985 releasePattern();

	986 return *this;

	987 }

	988

	989 /**

	990 * Adds the specified multicharacter to this set if it is not already

	991 * present. If this set already contains the multicharacter,

	992 * the call leaves this set unchanged.

	993 * Thus "ch" => {"ch"}

	994 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>

	995 * @param s the source string

	996 * @return the modified set, for chaining

	997 */

	998 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {

	999 if (s.length() == 0 \|\| isFrozen() \|\| isBogus()) return *this;

	1000 int32_t cp = getSingleCP(s);

	1001 if (cp < 0) {

	1002 if (!strings->contains((void*) &s)) {

	1003 _add(s);

	1004 releasePattern();

	1005 }

	1006 } else {

	1007 add((UChar32)cp);

	1008 }

	1009 return *this;

	1010 }

	1011

	1012 /**

	1013 * Adds the given string, in order, to 'strings'. The given string

	1014 * must have been checked by the caller to not be empty and to not

	1015 * already be in 'strings'.

	1016 */

	1017 void UnicodeSet::_add(const UnicodeString& s) {

	1018 if (isFrozen() \|\| isBogus()) {

	1019 return;

	1020 }

	1021 UnicodeString* t = new UnicodeString(s);

	1022 if (t == NULL) { // Check for memory allocation error.

	1023 setToBogus();

	1024 return;

	1025 }

	1026 UErrorCode ec = U_ZERO_ERROR;

	1027 strings->sortedInsert(t, compareUnicodeString, ec);

	1028 if (U_FAILURE(ec)) {

	1029 setToBogus();

	1030 delete t;

	1031 }

	1032 }

	1033

	1034 /**

	1035 * @return a code point IF the string consists of a single one.

	1036 * otherwise returns -1.

	1037 * @param string to test

	1038 */

	1039 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {

	1040 //if (s.length() < 1) {

	1041 // throw new IllegalArgumentException("Can't use zero-length strings in U nicodeSet");

	1042 //}

	1043 if (s.length() > 2) return -1;

	1044 if (s.length() == 1) return s.charAt(0);

	1045

	1046 // at this point, len = 2

	1047 UChar32 cp = s.char32At(0);

	1048 if (cp > 0xFFFF) { // is surrogate pair

	1049 return cp;

	1050 }

	1051 return -1;

	1052 }

	1053

	1054 /**

	1055 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h" }

	1056 * If this set already any particular character, it has no effect on that charac ter.

	1057 * @param the source string

	1058 * @return the modified set, for chaining

	1059 */

	1060 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {

	1061 UChar32 cp;

	1062 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {

	1063 cp = s.char32At(i);

	1064 add(cp);

	1065 }

	1066 return *this;

	1067 }

	1068

	1069 /**

	1070 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}

	1071 * If this set already any particular character, it has no effect on that charac ter.

	1072 * @param the source string

	1073 * @return the modified set, for chaining

	1074 */

	1075 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {

	1076 UnicodeSet set;

	1077 set.addAll(s);

	1078 retainAll(set);

	1079 return *this;

	1080 }

	1081

	1082 /**

	1083 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}

	1084 * If this set already any particular character, it has no effect on that charac ter.

	1085 * @param the source string

	1086 * @return the modified set, for chaining

	1087 */

	1088 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {

	1089 UnicodeSet set;

	1090 set.addAll(s);

	1091 complementAll(set);

	1092 return *this;

	1093 }

	1094

	1095 /**

	1096 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}

	1097 * If this set already any particular character, it has no effect on that charac ter.

	1098 * @param the source string

	1099 * @return the modified set, for chaining

	1100 */

	1101 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {

	1102 UnicodeSet set;

	1103 set.addAll(s);

	1104 removeAll(set);

	1105 return *this;

	1106 }

	1107

	1108 UnicodeSet& UnicodeSet::removeAllStrings() {

	1109 strings->removeAllElements();

	1110 return *this;

	1111 }

	1112

	1113

	1114 /**

	1115 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}

	1116 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>

	1117 * @param the source string

	1118 * @return a newly created set containing the given string

	1119 */

	1120 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {

	1121 UnicodeSet *set = new UnicodeSet();

	1122 if (set != NULL) { // Check for memory allocation error.

	1123 set->add(s);

	1124 }

	1125 return set;

	1126 }

	1127

	1128

	1129 /**

	1130 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h" }

	1131 * @param the source string

	1132 * @return a newly created set containing the given characters

	1133 */

	1134 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {

	1135 UnicodeSet *set = new UnicodeSet();

	1136 if (set != NULL) { // Check for memory allocation error.

	1137 set->addAll(s);

	1138 }

	1139 return set;

	1140 }

	1141

	1142 /**

	1143 * Retain only the elements in this set that are contained in the

	1144 * specified range. If <code>end > start</code> then an empty range is

	1145 * retained, leaving the set empty.

	1146 *

	1147 * @param start first character, inclusive, of range to be retained

	1148 * to this set.

	1149 * @param end last character, inclusive, of range to be retained

	1150 * to this set.

	1151 */

	1152 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {

	1153 if (pinCodePoint(start) <= pinCodePoint(end)) {

	1154 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };

	1155 retain(range, 2, 0);

	1156 } else {

	1157 clear();

	1158 }

	1159 return *this;

	1160 }

	1161

	1162 UnicodeSet& UnicodeSet::retain(UChar32 c) {

	1163 return retain(c, c);

	1164 }

	1165

	1166 /**

	1167 * Removes the specified range from this set if it is present.

	1168 * The set will not contain the specified range once the call

	1169 * returns. If <code>end > start</code> then an empty range is

	1170 * removed, leaving the set unchanged.

	1171 *

	1172 * @param start first character, inclusive, of range to be removed

	1173 * from this set.

	1174 * @param end last character, inclusive, of range to be removed

	1175 * from this set.

	1176 */

	1177 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {

	1178 if (pinCodePoint(start) <= pinCodePoint(end)) {

	1179 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };

	1180 retain(range, 2, 2);

	1181 }

	1182 return *this;

	1183 }

	1184

	1185 /**

	1186 * Removes the specified character from this set if it is present.

	1187 * The set will not contain the specified range once the call

	1188 * returns.

	1189 */

	1190 UnicodeSet& UnicodeSet::remove(UChar32 c) {

	1191 return remove(c, c);

	1192 }

	1193

	1194 /**

	1195 * Removes the specified string from this set if it is present.

	1196 * The set will not contain the specified character once the call

	1197 * returns.

	1198 * @param the source string

	1199 * @return the modified set, for chaining

	1200 */

	1201 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {

	1202 if (s.length() == 0 \|\| isFrozen() \|\| isBogus()) return *this;

	1203 int32_t cp = getSingleCP(s);

	1204 if (cp < 0) {

	1205 strings->removeElement((void*) &s);

	1206 releasePattern();

	1207 } else {

	1208 remove((UChar32)cp, (UChar32)cp);

	1209 }

	1210 return *this;

	1211 }

	1212

	1213 /**

	1214 * Complements the specified range in this set. Any character in

	1215 * the range will be removed if it is in this set, or will be

	1216 * added if it is not in this set. If <code>end > start</code>

	1217 * then an empty range is xor'ed, leaving the set unchanged.

	1218 *

	1219 * @param start first character, inclusive, of range to be removed

	1220 * from this set.

	1221 * @param end last character, inclusive, of range to be removed

	1222 * from this set.

	1223 */

	1224 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {

	1225 if (isFrozen() \|\| isBogus()) {

	1226 return *this;

	1227 }

	1228 if (pinCodePoint(start) <= pinCodePoint(end)) {

	1229 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };

	1230 exclusiveOr(range, 2, 0);

	1231 }

	1232 releasePattern();

	1233 return *this;

	1234 }

	1235

	1236 UnicodeSet& UnicodeSet::complement(UChar32 c) {

	1237 return complement(c, c);

	1238 }

	1239

	1240 /**

	1241 * This is equivalent to

	1242 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.

	1243 */

	1244 UnicodeSet& UnicodeSet::complement(void) {

	1245 if (isFrozen() \|\| isBogus()) {

	1246 return *this;

	1247 }

	1248 UErrorCode status = U_ZERO_ERROR;

	1249 if (list[0] == UNICODESET_LOW) {

	1250 ensureBufferCapacity(len-1, status);

	1251 if (U_FAILURE(status)) {

	1252 return *this;

	1253 }

	1254 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));

	1255 --len;

	1256 } else {

	1257 ensureBufferCapacity(len+1, status);

	1258 if (U_FAILURE(status)) {

	1259 return *this;

	1260 }

	1261 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));

	1262 buffer[0] = UNICODESET_LOW;

	1263 ++len;

	1264 }

	1265 swapBuffers();

	1266 releasePattern();

	1267 return *this;

	1268 }

	1269

	1270 /**

	1271 * Complement the specified string in this set.

	1272 * The set will not contain the specified string once the call

	1273 * returns.

	1274 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>

	1275 * @param s the string to complement

	1276 * @return this object, for chaining

	1277 */

	1278 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {

	1279 if (s.length() == 0 \|\| isFrozen() \|\| isBogus()) return *this;

	1280 int32_t cp = getSingleCP(s);

	1281 if (cp < 0) {

	1282 if (strings->contains((void*) &s)) {

	1283 strings->removeElement((void*) &s);

	1284 } else {

	1285 _add(s);

	1286 }

	1287 releasePattern();

	1288 } else {

	1289 complement((UChar32)cp, (UChar32)cp);

	1290 }

	1291 return *this;

	1292 }

	1293

	1294 /**

	1295 * Adds all of the elements in the specified set to this set if

	1296 * they're not already present. This operation effectively

	1297 * modifies this set so that its value is the <i>union</i> of the two

	1298 * sets. The behavior of this operation is unspecified if the specified

	1299 * collection is modified while the operation is in progress.

	1300 *

	1301 * @param c set whose elements are to be added to this set.

	1302 * @see #add(char, char)

	1303 */

	1304 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {

	1305 if ( c.len>0 && c.list!=NULL ) {

	1306 add(c.list, c.len, 0);

	1307 }

	1308

	1309 // Add strings in order

	1310 if ( c.strings!=NULL ) {

	1311 for (int32_t i=0; i<c.strings->size(); ++i) {

	1312 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt( i);

	1313 if (!strings->contains((void*) s)) {

	1314 _add(*s);

	1315 }

	1316 }

	1317 }

	1318 return *this;

	1319 }

	1320

	1321 /**

	1322 * Retains only the elements in this set that are contained in the

	1323 * specified set. In other words, removes from this set all of

	1324 * its elements that are not contained in the specified set. This

	1325 * operation effectively modifies this set so that its value is

	1326 * the <i>intersection</i> of the two sets.

	1327 *

	1328 * @param c set that defines which elements this set will retain.

	1329 */

	1330 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {

	1331 if (isFrozen() \|\| isBogus()) {

	1332 return *this;

	1333 }

	1334 retain(c.list, c.len, 0);

	1335 strings->retainAll(*c.strings);

	1336 return *this;

	1337 }

	1338

	1339 /**

	1340 * Removes from this set all of its elements that are contained in the

	1341 * specified set. This operation effectively modifies this

	1342 * set so that its value is the <i>asymmetric set difference</i> of

	1343 * the two sets.

	1344 *

	1345 * @param c set that defines which elements will be removed from

	1346 * this set.

	1347 */

	1348 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {

	1349 if (isFrozen() \|\| isBogus()) {

	1350 return *this;

	1351 }

	1352 retain(c.list, c.len, 2);

	1353 strings->removeAll(*c.strings);

	1354 return *this;

	1355 }

	1356

	1357 /**

	1358 * Complements in this set all elements contained in the specified

	1359 * set. Any character in the other set will be removed if it is

	1360 * in this set, or will be added if it is not in this set.

	1361 *

	1362 * @param c set that defines which elements will be xor'ed from

	1363 * this set.

	1364 */

	1365 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {

	1366 if (isFrozen() \|\| isBogus()) {

	1367 return *this;

	1368 }

	1369 exclusiveOr(c.list, c.len, 0);

	1370

	1371 for (int32_t i=0; i<c.strings->size(); ++i) {

	1372 void* e = c.strings->elementAt(i);

	1373 if (!strings->removeElement(e)) {

	1374 _add((const UnicodeString)e);

	1375 }

	1376 }

	1377 return *this;

	1378 }

	1379

	1380 /**

	1381 * Removes all of the elements from this set. This set will be

	1382 * empty after this call returns.

	1383 */

	1384 UnicodeSet& UnicodeSet::clear(void) {

	1385 if (isFrozen()) {

	1386 return *this;

	1387 }

	1388 if (list != NULL) {

	1389 list[0] = UNICODESET_HIGH;

	1390 }

	1391 len = 1;

	1392 releasePattern();

	1393 if (strings != NULL) {

	1394 strings->removeAllElements();

	1395 }

	1396 if (list != NULL && strings != NULL) {

	1397 // Remove bogus

	1398 fFlags = 0;

	1399 }

	1400 return *this;

	1401 }

	1402

	1403 /**

	1404 * Iteration method that returns the number of ranges contained in

	1405 * this set.

	1406 * @see #getRangeStart

	1407 * @see #getRangeEnd

	1408 */

	1409 int32_t UnicodeSet::getRangeCount() const {

	1410 return len/2;

	1411 }

	1412

	1413 /**

	1414 * Iteration method that returns the first character in the

	1415 * specified range of this set.

	1416 * @see #getRangeCount

	1417 * @see #getRangeEnd

	1418 */

	1419 UChar32 UnicodeSet::getRangeStart(int32_t index) const {

	1420 return list[index*2];

	1421 }

	1422

	1423 /**

	1424 * Iteration method that returns the last character in the

	1425 * specified range of this set.

	1426 * @see #getRangeStart

	1427 * @see #getRangeEnd

	1428 */

	1429 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {

	1430 return list[index*2 + 1] - 1;

	1431 }

	1432

	1433 int32_t UnicodeSet::getStringCount() const {

	1434 return strings->size();

	1435 }

	1436

	1437 const UnicodeString* UnicodeSet::getString(int32_t index) const {

	1438 return (const UnicodeString*) strings->elementAt(index);

	1439 }

	1440

	1441 /**

	1442 * Reallocate this objects internal structures to take up the least

	1443 * possible space, without changing this object's value.

	1444 */

	1445 UnicodeSet& UnicodeSet::compact() {

	1446 if (isFrozen() \|\| isBogus()) {

	1447 return *this;

	1448 }

	1449 // Delete buffer first to defragment memory less.

	1450 if (buffer != NULL) {

	1451 uprv_free(buffer);

	1452 buffer = NULL;

	1453 }

	1454 if (len < capacity) {

	1455 // Make the capacity equal to len or 1.

	1456 // We don't want to realloc of 0 size.

	1457 int32_t newCapacity = len + (len == 0);

	1458 UChar32* temp = (UChar32) uprv_realloc(list, sizeof(UChar32) newCapac ity);

	1459 if (temp) {

	1460 list = temp;

	1461 capacity = newCapacity;

	1462 }

	1463 // else what the heck happened?! We allocated less memory!

	1464 // Oh well. We'll keep our original array.

	1465 }

	1466 return *this;

	1467 }

	1468

	1469 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {

	1470 int32_t bmpLength, length, destLength;

	1471

	1472 if (U_FAILURE(ec)) {

	1473 return 0;

	1474 }

	1475

	1476 if (destCapacity<0 \|\| (destCapacity>0 && dest==NULL)) {

	1477 ec=U_ILLEGAL_ARGUMENT_ERROR;

	1478 return 0;

	1479 }

	1480

	1481 /* count necessary 16-bit units */

	1482 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH

	1483 // assert(length>=0);

	1484 if (length==0) {

	1485 /* empty set */

	1486 if (destCapacity>0) {

	1487 *dest=0;

	1488 } else {

	1489 ec=U_BUFFER_OVERFLOW_ERROR;

	1490 }

	1491 return 1;

	1492 }

	1493 /* now length>0 */

	1494

	1495 if (this->list[length-1]<=0xffff) {

	1496 /* all BMP */

	1497 bmpLength=length;

	1498 } else if (this->list[0]>=0x10000) {

	1499 /* all supplementary */

	1500 bmpLength=0;

	1501 length*=2;

	1502 } else {

	1503 /* some BMP, some supplementary */

	1504 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++b mpLength) {}

	1505 length=bmpLength+2*(length-bmpLength);

	1506 }

	1507

	1508 /* length: number of 16-bit array units */

	1509 if (length>0x7fff) {

	1510 /* there are only 15 bits for the length in the first serialized word */

	1511 ec=U_INDEX_OUTOFBOUNDS_ERROR;

	1512 return 0;

	1513 }

	1514

	1515 /*

	1516 * total serialized length:

	1517 * number of 16-bit array units (length) +

	1518 * 1 length unit (always) +

	1519 * 1 bmpLength unit (if there are supplementary values)

	1520 */

	1521 destLength=length+((length>bmpLength)?2:1);

	1522 if (destLength<=destCapacity) {

	1523 const UChar32 *p;

	1524 int32_t i;

	1525

	1526 *dest=(uint16_t)length;

	1527 if (length>bmpLength) {

	1528 *dest\|=0x8000;

	1529 *++dest=(uint16_t)bmpLength;

	1530 }

	1531 ++dest;

	1532

	1533 /* write the BMP part of the array */

	1534 p=this->list;

	1535 for (i=0; i<bmpLength; ++i) {

	1536 dest++=(uint16_t)p++;

	1537 }

	1538

	1539 /* write the supplementary part of the array */

	1540 for (; i<length; i+=2) {

	1541 dest++=(uint16_t)(p>>16);

	1542 dest++=(uint16_t)p++;

	1543 }

	1544 } else {

	1545 ec=U_BUFFER_OVERFLOW_ERROR;

	1546 }

	1547 return destLength;

	1548 }

	1549

	1550 //----------------------------------------------------------------

	1551 // Implementation: Utility methods

	1552 //----------------------------------------------------------------

	1553

	1554 /**

	1555 * Allocate our strings vector and return TRUE if successful.

	1556 */

	1557 UBool UnicodeSet::allocateStrings(UErrorCode &status) {

	1558 if (U_FAILURE(status)) {

	1559 return FALSE;

	1560 }

	1561 strings = new UVector(uhash_deleteUnicodeString,

	1562 uhash_compareUnicodeString, 1, status);

	1563 if (strings == NULL) { // Check for memory allocation error.

	1564 status = U_MEMORY_ALLOCATION_ERROR;

	1565 return FALSE;

	1566 }

	1567 if (U_FAILURE(status)) {

	1568 delete strings;

	1569 strings = NULL;

	1570 return FALSE;

	1571 }

	1572 return TRUE;

	1573 }

	1574

	1575 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {

	1576 if (newLen <= capacity)

	1577 return;

	1578 UChar32* temp = (UChar32) uprv_realloc(list, sizeof(UChar32) (newLen + GR OW_EXTRA));

	1579 if (temp == NULL) {

	1580 ec = U_MEMORY_ALLOCATION_ERROR;

	1581 setToBogus();

	1582 return;

	1583 }

	1584 list = temp;

	1585 capacity = newLen + GROW_EXTRA;

	1586 // else we keep the original contents on the memory failure.

	1587 }

	1588

	1589 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {

	1590 if (buffer != NULL && newLen <= bufferCapacity)

	1591 return;

	1592 UChar32* temp = (UChar32) uprv_realloc(buffer, sizeof(UChar32) (newLen + GROW_EXTRA));

	1593 if (temp == NULL) {

	1594 ec = U_MEMORY_ALLOCATION_ERROR;

	1595 setToBogus();

	1596 return;

	1597 }

	1598 buffer = temp;

	1599 bufferCapacity = newLen + GROW_EXTRA;

	1600 // else we keep the original contents on the memory failure.

	1601 }

	1602

	1603 /**

	1604 * Swap list and buffer.

	1605 */

	1606 void UnicodeSet::swapBuffers(void) {

	1607 // swap list and buffer

	1608 UChar32* temp = list;

	1609 list = buffer;

	1610 buffer = temp;

	1611

	1612 int32_t c = capacity;

	1613 capacity = bufferCapacity;

	1614 bufferCapacity = c;

	1615 }

	1616

	1617 void UnicodeSet::setToBogus() {

	1618 clear(); // Remove everything in the set.

	1619 fFlags = kIsBogus;

	1620 }

	1621

	1622 //----------------------------------------------------------------

	1623 // Implementation: Fundamental operators

	1624 //----------------------------------------------------------------

	1625

	1626 static inline UChar32 max(UChar32 a, UChar32 b) {

	1627 return (a > b) ? a : b;

	1628 }

	1629

	1630 // polarity = 0, 3 is normal: x xor y

	1631 // polarity = 1, 2: x xor ~y == x === y

	1632

	1633 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola rity) {

	1634 if (isFrozen() \|\| isBogus()) {

	1635 return;

	1636 }

	1637 UErrorCode status = U_ZERO_ERROR;

	1638 ensureBufferCapacity(len + otherLen, status);

	1639 if (U_FAILURE(status)) {

	1640 return;

	1641 }

	1642

	1643 int32_t i = 0, j = 0, k = 0;

	1644 UChar32 a = list[i++];

	1645 UChar32 b;

	1646 if (polarity == 1 \|\| polarity == 2) {

	1647 b = UNICODESET_LOW;

	1648 if (other[j] == UNICODESET_LOW) { // skip base if already LOW

	1649 ++j;

	1650 b = other[j];

	1651 }

	1652 } else {

	1653 b = other[j++];

	1654 }

	1655 // simplest of all the routines

	1656 // sort the values, discarding identicals!

	1657 for (;;) {

	1658 if (a < b) {

	1659 buffer[k++] = a;

	1660 a = list[i++];

	1661 } else if (b < a) {

	1662 buffer[k++] = b;

	1663 b = other[j++];

	1664 } else if (a != UNICODESET_HIGH) { // at this point, a == b

	1665 // discard both values!

	1666 a = list[i++];

	1667 b = other[j++];

	1668 } else { // DONE!

	1669 buffer[k++] = UNICODESET_HIGH;

	1670 len = k;

	1671 break;

	1672 }

	1673 }

	1674 swapBuffers();

	1675 releasePattern();

	1676 }

	1677

	1678 // polarity = 0 is normal: x union y

	1679 // polarity = 2: x union ~y

	1680 // polarity = 1: ~x union y

	1681 // polarity = 3: ~x union ~y

	1682

	1683 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {

	1684 if (isFrozen() \|\| isBogus() \|\| other==NULL) {

	1685 return;

	1686 }

	1687 UErrorCode status = U_ZERO_ERROR;

	1688 ensureBufferCapacity(len + otherLen, status);

	1689 if (U_FAILURE(status)) {

	1690 return;

	1691 }

	1692

	1693 int32_t i = 0, j = 0, k = 0;

	1694 UChar32 a = list[i++];

	1695 UChar32 b = other[j++];

	1696 // change from xor is that we have to check overlapping pairs

	1697 // polarity bit 1 means a is second, bit 2 means b is.

	1698 for (;;) {

	1699 switch (polarity) {

	1700 case 0: // both first; take lower if unequal

	1701 if (a < b) { // take a

	1702 // Back up over overlapping ranges in buffer[]

	1703 if (k > 0 && a <= buffer[k-1]) {

	1704 // Pick latter end value in buffer[] vs. list[]

	1705 a = max(list[i], buffer[--k]);

	1706 } else {

	1707 // No overlap

	1708 buffer[k++] = a;

	1709 a = list[i];

	1710 }

	1711 i++; // Common if/else code factored out

	1712 polarity ^= 1;

	1713 } else if (b < a) { // take b

	1714 if (k > 0 && b <= buffer[k-1]) {

	1715 b = max(other[j], buffer[--k]);

	1716 } else {

	1717 buffer[k++] = b;

	1718 b = other[j];

	1719 }

	1720 j++;

	1721 polarity ^= 2;

	1722 } else { // a == b, take a, drop b

	1723 if (a == UNICODESET_HIGH) goto loop_end;

	1724 // This is symmetrical; it doesn't matter if

	1725 // we backtrack with a or b. - liu

	1726 if (k > 0 && a <= buffer[k-1]) {

	1727 a = max(list[i], buffer[--k]);

	1728 } else {

	1729 // No overlap

	1730 buffer[k++] = a;

	1731 a = list[i];

	1732 }

	1733 i++;

	1734 polarity ^= 1;

	1735 b = other[j++];

	1736 polarity ^= 2;

	1737 }

	1738 break;

	1739 case 3: // both second; take higher if unequal, and drop other

	1740 if (b <= a) { // take a

	1741 if (a == UNICODESET_HIGH) goto loop_end;

	1742 buffer[k++] = a;

	1743 } else { // take b

	1744 if (b == UNICODESET_HIGH) goto loop_end;

	1745 buffer[k++] = b;

	1746 }

	1747 a = list[i++];

	1748 polarity ^= 1; // factored common code

	1749 b = other[j++];

	1750 polarity ^= 2;

	1751 break;

	1752 case 1: // a second, b first; if b < a, overlap

	1753 if (a < b) { // no overlap, take a

	1754 buffer[k++] = a; a = list[i++]; polarity ^= 1;

	1755 } else if (b < a) { // OVERLAP, drop b

	1756 b = other[j++];

	1757 polarity ^= 2;

	1758 } else { // a == b, drop both!

	1759 if (a == UNICODESET_HIGH) goto loop_end;

	1760 a = list[i++];

	1761 polarity ^= 1;

	1762 b = other[j++];

	1763 polarity ^= 2;

	1764 }

	1765 break;

	1766 case 2: // a first, b second; if a < b, overlap

	1767 if (b < a) { // no overlap, take b

	1768 buffer[k++] = b;

	1769 b = other[j++];

	1770 polarity ^= 2;

	1771 } else if (a < b) { // OVERLAP, drop a

	1772 a = list[i++];

	1773 polarity ^= 1;

	1774 } else { // a == b, drop both!

	1775 if (a == UNICODESET_HIGH) goto loop_end;

	1776 a = list[i++];

	1777 polarity ^= 1;

	1778 b = other[j++];

	1779 polarity ^= 2;

	1780 }

	1781 break;

	1782 }

	1783 }

	1784 loop_end:

	1785 buffer[k++] = UNICODESET_HIGH; // terminate

	1786 len = k;

	1787 swapBuffers();

	1788 releasePattern();

	1789 }

	1790

	1791 // polarity = 0 is normal: x intersect y

	1792 // polarity = 2: x intersect ~y == set-minus

	1793 // polarity = 1: ~x intersect y

	1794 // polarity = 3: ~x intersect ~y

	1795

	1796 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {

	1797 if (isFrozen() \|\| isBogus()) {

	1798 return;

	1799 }

	1800 UErrorCode status = U_ZERO_ERROR;

	1801 ensureBufferCapacity(len + otherLen, status);

	1802 if (U_FAILURE(status)) {

	1803 return;

	1804 }

	1805

	1806 int32_t i = 0, j = 0, k = 0;

	1807 UChar32 a = list[i++];

	1808 UChar32 b = other[j++];

	1809 // change from xor is that we have to check overlapping pairs

	1810 // polarity bit 1 means a is second, bit 2 means b is.

	1811 for (;;) {

	1812 switch (polarity) {

	1813 case 0: // both first; drop the smaller

	1814 if (a < b) { // drop a

	1815 a = list[i++];

	1816 polarity ^= 1;

	1817 } else if (b < a) { // drop b

	1818 b = other[j++];

	1819 polarity ^= 2;

	1820 } else { // a == b, take one, drop other

	1821 if (a == UNICODESET_HIGH) goto loop_end;

	1822 buffer[k++] = a;

	1823 a = list[i++];

	1824 polarity ^= 1;

	1825 b = other[j++];

	1826 polarity ^= 2;

	1827 }

	1828 break;

	1829 case 3: // both second; take lower if unequal

	1830 if (a < b) { // take a

	1831 buffer[k++] = a;

	1832 a = list[i++];

	1833 polarity ^= 1;

	1834 } else if (b < a) { // take b

	1835 buffer[k++] = b;

	1836 b = other[j++];

	1837 polarity ^= 2;

	1838 } else { // a == b, take one, drop other

	1839 if (a == UNICODESET_HIGH) goto loop_end;

	1840 buffer[k++] = a;

	1841 a = list[i++];

	1842 polarity ^= 1;

	1843 b = other[j++];

	1844 polarity ^= 2;

	1845 }

	1846 break;

	1847 case 1: // a second, b first;

	1848 if (a < b) { // NO OVERLAP, drop a

	1849 a = list[i++];

	1850 polarity ^= 1;

	1851 } else if (b < a) { // OVERLAP, take b

	1852 buffer[k++] = b;

	1853 b = other[j++];

	1854 polarity ^= 2;

	1855 } else { // a == b, drop both!

	1856 if (a == UNICODESET_HIGH) goto loop_end;

	1857 a = list[i++];

	1858 polarity ^= 1;

	1859 b = other[j++];

	1860 polarity ^= 2;

	1861 }

	1862 break;

	1863 case 2: // a first, b second; if a < b, overlap

	1864 if (b < a) { // no overlap, drop b

	1865 b = other[j++];

	1866 polarity ^= 2;

	1867 } else if (a < b) { // OVERLAP, take a

	1868 buffer[k++] = a;

	1869 a = list[i++];

	1870 polarity ^= 1;

	1871 } else { // a == b, drop both!

	1872 if (a == UNICODESET_HIGH) goto loop_end;

	1873 a = list[i++];

	1874 polarity ^= 1;

	1875 b = other[j++];

	1876 polarity ^= 2;

	1877 }

	1878 break;

	1879 }

	1880 }

	1881 loop_end:

	1882 buffer[k++] = UNICODESET_HIGH; // terminate

	1883 len = k;

	1884 swapBuffers();

	1885 releasePattern();

	1886 }

	1887

	1888 /**

	1889 * Append the <code>toPattern()</code> representation of a

	1890 * string to the given <code>StringBuffer</code>.

	1891 */

	1892 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool

	1893 escapeUnprintable) {

	1894 UChar32 cp;

	1895 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {

	1896 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);

	1897 }

	1898 }

	1899

	1900 /**

	1901 * Append the <code>toPattern()</code> representation of a

	1902 * character to the given <code>StringBuffer</code>.

	1903 */

	1904 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool

	1905 escapeUnprintable) {

	1906 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {

	1907 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything

	1908 // unprintable

	1909 if (ICU_Utility::escapeUnprintable(buf, c)) {

	1910 return;

	1911 }

	1912 }

	1913 // Okay to let ':' pass through

	1914 switch (c) {

	1915 case SET_OPEN:

	1916 case SET_CLOSE:

	1917 case HYPHEN:

	1918 case COMPLEMENT:

	1919 case INTERSECTION:

	1920 case BACKSLASH:

	1921 case OPEN_BRACE:

	1922 case CLOSE_BRACE:

	1923 case COLON:

	1924 case SymbolTable::SYMBOL_REF:

	1925 buf.append(BACKSLASH);

	1926 break;

	1927 default:

	1928 // Escape whitespace

	1929 if (uprv_isRuleWhiteSpace(c)) {

	1930 buf.append(BACKSLASH);

	1931 }

	1932 break;

	1933 }

	1934 buf.append(c);

	1935 }

	1936

	1937 /**

	1938 * Append a string representation of this set to result. This will be

	1939 * a cleaned version of the string passed to applyPattern(), if there

	1940 * is one. Otherwise it will be generated.

	1941 */

	1942 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,

	1943 UBool escapeUnprintable) const

	1944 {

	1945 if (pat != NULL) {

	1946 int32_t i;

	1947 int32_t backslashCount = 0;

	1948 for (i=0; i<patLen; ) {

	1949 UChar32 c;

	1950 U16_NEXT(pat, i, patLen, c);

	1951 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {

	1952 // If the unprintable character is preceded by an odd

	1953 // number of backslashes, then it has been escaped.

	1954 // Before unescaping it, we delete the final

	1955 // backslash.

	1956 if ((backslashCount % 2) == 1) {

	1957 result.truncate(result.length() - 1);

	1958 }

	1959 ICU_Utility::escapeUnprintable(result, c);

	1960 backslashCount = 0;

	1961 } else {

	1962 result.append(c);

	1963 if (c == BACKSLASH) {

	1964 ++backslashCount;

	1965 } else {

	1966 backslashCount = 0;

	1967 }

	1968 }

	1969 }

	1970 return result;

	1971 }

	1972

	1973 return _generatePattern(result, escapeUnprintable);

	1974 }

	1975

	1976 /**

	1977 * Returns a string representation of this set. If the result of

	1978 * calling this function is passed to a UnicodeSet constructor, it

	1979 * will produce another set that is equal to this one.

	1980 */

	1981 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,

	1982 UBool escapeUnprintable) const

	1983 {

	1984 result.truncate(0);

	1985 return _toPattern(result, escapeUnprintable);

	1986 }

	1987

	1988 /**

	1989 * Generate and append a string representation of this set to result.

	1990 * This does not use this.pat, the cleaned up copy of the string

	1991 * passed to applyPattern().

	1992 */

	1993 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,

	1994 UBool escapeUnprintable) const

	1995 {

	1996 result.append(SET_OPEN);

	1997

	1998 // // Check against the predefined categories. We implicitly build

	1999 // // up ALL category sets the first time toPattern() is called.

	2000 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {

	2001 // if (*this == getCategorySet(cat)) {

	2002 // result.append(COLON);

	2003 // result.append(CATEGORY_NAMES, cat*2, 2);

	2004 // return result.append(CATEGORY_CLOSE);

	2005 // }

	2006 // }

	2007

	2008 int32_t count = getRangeCount();

	2009

	2010 // If the set contains at least 2 intervals and includes both

	2011 // MIN_VALUE and MAX_VALUE, then the inverse representation will

	2012 // be more economical.

	2013 if (count > 1 &&

	2014 getRangeStart(0) == MIN_VALUE &&

	2015 getRangeEnd(count-1) == MAX_VALUE) {

	2016

	2017 // Emit the inverse

	2018 result.append(COMPLEMENT);

	2019

	2020 for (int32_t i = 1; i < count; ++i) {

	2021 UChar32 start = getRangeEnd(i-1)+1;

	2022 UChar32 end = getRangeStart(i)-1;

	2023 _appendToPat(result, start, escapeUnprintable);

	2024 if (start != end) {

	2025 if ((start+1) != end) {

	2026 result.append(HYPHEN);

	2027 }

	2028 _appendToPat(result, end, escapeUnprintable);

	2029 }

	2030 }

	2031 }

	2032

	2033 // Default; emit the ranges as pairs

	2034 else {

	2035 for (int32_t i = 0; i < count; ++i) {

	2036 UChar32 start = getRangeStart(i);

	2037 UChar32 end = getRangeEnd(i);

	2038 _appendToPat(result, start, escapeUnprintable);

	2039 if (start != end) {

	2040 if ((start+1) != end) {

	2041 result.append(HYPHEN);

	2042 }

	2043 _appendToPat(result, end, escapeUnprintable);

	2044 }

	2045 }

	2046 }

	2047

	2048 for (int32_t i = 0; i<strings->size(); ++i) {

	2049 result.append(OPEN_BRACE);

	2050 _appendToPat(result,

	2051 (const UnicodeString) strings->elementAt(i),

	2052 escapeUnprintable);

	2053 result.append(CLOSE_BRACE);

	2054 }

	2055 return result.append(SET_CLOSE);

	2056 }

	2057

	2058 /**

	2059 * Release existing cached pattern

	2060 */

	2061 void UnicodeSet::releasePattern() {

	2062 if (pat) {

	2063 uprv_free(pat);

	2064 pat = NULL;

	2065 patLen = 0;

	2066 }

	2067 }

	2068

	2069 /**

	2070 * Set the new pattern to cache.

	2071 */

	2072 void UnicodeSet::setPattern(const UnicodeString& newPat) {

	2073 releasePattern();

	2074 int32_t newPatLen = newPat.length();

	2075 pat = (UChar )uprv_malloc((newPatLen + 1) sizeof(UChar));

	2076 if (pat) {

	2077 patLen = newPatLen;

	2078 newPat.extractBetween(0, patLen, pat);

	2079 pat[patLen] = 0;

	2080 }

	2081 // else we don't care if malloc failed. This was just a nice cache.

	2082 // We can regenerate an equivalent pattern later when requested.

	2083 }

	2084

	2085 UnicodeFunctor *UnicodeSet::freeze() {

	2086 if(!isFrozen() && !isBogus()) {

	2087 // Do most of what compact() does before freezing because

	2088 // compact() will not work when the set is frozen.

	2089 // Small modification: Don't shrink if the savings would be tiny (<=GROW _EXTRA).

	2090

	2091 // Delete buffer first to defragment memory less.

	2092 if (buffer != NULL) {

	2093 uprv_free(buffer);

	2094 buffer = NULL;

	2095 }

	2096 if (capacity > (len + GROW_EXTRA)) {

	2097 // Make the capacity equal to len or 1.

	2098 // We don't want to realloc of 0 size.

	2099 capacity = len + (len == 0);

	2100 list = (UChar32) uprv_realloc(list, sizeof(UChar32) capacity);

	2101 if (list == NULL) { // Check for memory allocation error.

	2102 setToBogus();

	2103 return this;

	2104 }

	2105 }

	2106

	2107 // Optimize contains() and span() and similar functions.

	2108 if (!strings->isEmpty()) {

	2109 stringSpan = new UnicodeSetStringSpan(this, strings, UnicodeSetStr ingSpan::ALL);

	2110 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {

	2111 // All strings are irrelevant for span() etc. because

	2112 // all of each string's code points are contained in this set.

	2113 // Do not check needsStringSpanUTF8() because UTF-8 has at most as

	2114 // many relevant strings as UTF-16.

	2115 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)

	2116 delete stringSpan;

	2117 stringSpan = NULL;

	2118 }

	2119 }

	2120 if (stringSpan == NULL) {

	2121 // No span-relevant strings: Optimize for code point spans.

	2122 bmpSet=new BMPSet(list, len);

	2123 if (bmpSet == NULL) { // Check for memory allocation error.

	2124 setToBogus();

	2125 }

	2126 }

	2127 }

	2128 return this;

	2129 }

	2130

	2131 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC ondition) const {

	2132 if(length>0 && bmpSet!=NULL) {

	2133 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);

	2134 }

	2135 if(length<0) {

	2136 length=u_strlen(s);

	2137 }

	2138 if(length==0) {

	2139 return 0;

	2140 }

	2141 if(stringSpan!=NULL) {

	2142 return stringSpan->span(s, length, spanCondition);

	2143 } else if(!strings->isEmpty()) {

	2144 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?

	2145 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :

	2146 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;

	2147 UnicodeSetStringSpan strSpan(this, strings, which);

	2148 if(strSpan.needsStringSpanUTF16()) {

	2149 return strSpan.span(s, length, spanCondition);

	2150 }

	2151 }

	2152

	2153 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2154 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2155 }

	2156

	2157 UChar32 c;

	2158 int32_t start=0, prev=0;

	2159 do {

	2160 U16_NEXT(s, start, length, c);

	2161 if(spanCondition!=contains(c)) {

	2162 break;

	2163 }

	2164 } while((prev=start)<length);

	2165 return prev;

	2166 }

	2167

	2168 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s panCondition) const {

	2169 if(length>0 && bmpSet!=NULL) {

	2170 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);

	2171 }

	2172 if(length<0) {

	2173 length=u_strlen(s);

	2174 }

	2175 if(length==0) {

	2176 return 0;

	2177 }

	2178 if(stringSpan!=NULL) {

	2179 return stringSpan->spanBack(s, length, spanCondition);

	2180 } else if(!strings->isEmpty()) {

	2181 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?

	2182 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :

	2183 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;

	2184 UnicodeSetStringSpan strSpan(this, strings, which);

	2185 if(strSpan.needsStringSpanUTF16()) {

	2186 return strSpan.spanBack(s, length, spanCondition);

	2187 }

	2188 }

	2189

	2190 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2191 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2192 }

	2193

	2194 UChar32 c;

	2195 int32_t prev=length;

	2196 do {

	2197 U16_PREV(s, 0, length, c);

	2198 if(spanCondition!=contains(c)) {

	2199 break;

	2200 }

	2201 } while((prev=length)>0);

	2202 return prev;

	2203 }

	2204

	2205 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp anCondition) const {

	2206 if(length>0 && bmpSet!=NULL) {

	2207 const uint8_t s0=(const uint8_t )s;

	2208 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);

	2209 }

	2210 if(length<0) {

	2211 length=(int32_t)uprv_strlen(s);

	2212 }

	2213 if(length==0) {

	2214 return 0;

	2215 }

	2216 if(stringSpan!=NULL) {

	2217 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);

	2218 } else if(!strings->isEmpty()) {

	2219 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?

	2220 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :

	2221 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;

	2222 UnicodeSetStringSpan strSpan(this, strings, which);

	2223 if(strSpan.needsStringSpanUTF8()) {

	2224 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);

	2225 }

	2226 }

	2227

	2228 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2229 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2230 }

	2231

	2232 UChar32 c;

	2233 int32_t start=0, prev=0;

	2234 do {

	2235 U8_NEXT(s, start, length, c);

	2236 if(c<0) {

	2237 c=0xfffd;

	2238 }

	2239 if(spanCondition!=contains(c)) {

	2240 break;

	2241 }

	2242 } while((prev=start)<length);

	2243 return prev;

	2244 }

	2245

	2246 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio n spanCondition) const {

	2247 if(length>0 && bmpSet!=NULL) {

	2248 const uint8_t s0=(const uint8_t )s;

	2249 return bmpSet->spanBackUTF8(s0, length, spanCondition);

	2250 }

	2251 if(length<0) {

	2252 length=(int32_t)uprv_strlen(s);

	2253 }

	2254 if(length==0) {

	2255 return 0;

	2256 }

	2257 if(stringSpan!=NULL) {

	2258 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanConditio n);

	2259 } else if(!strings->isEmpty()) {

	2260 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?

	2261 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :

	2262 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;

	2263 UnicodeSetStringSpan strSpan(this, strings, which);

	2264 if(strSpan.needsStringSpanUTF8()) {

	2265 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanConditio n);

	2266 }

	2267 }

	2268

	2269 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2270 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2271 }

	2272

	2273 UChar32 c;

	2274 int32_t prev=length;

	2275 do {

	2276 U8_PREV(s, 0, length, c);

	2277 if(c<0) {

	2278 c=0xfffd;

	2279 }

	2280 if(spanCondition!=contains(c)) {

	2281 break;

	2282 }

	2283 } while((prev=length)>0);

	2284 return prev;

	2285 }

	2286

	2287 U_NAMESPACE_END

OLD	NEW

« no previous file with comments | « icu46/source/common/unifunct.cpp ('k') | icu46/source/common/uniset_props.cpp » ('j') | no next file with comments »