icu46/source/common/uniset_props.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/uniset_props.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 *

	4 * Copyright (C) 1999-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 *******************************************************************************

	8 * file name: uniset_props.cpp

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2004aug25

	14 * created by: Markus W. Scherer

	15 *

	16 * Character property dependent functions moved here from uniset.cpp

	17 */

	18

	19 #include "unicode/utypes.h"

	20 #include "unicode/uniset.h"

	21 #include "unicode/parsepos.h"

	22 #include "unicode/uchar.h"

	23 #include "unicode/uscript.h"

	24 #include "unicode/symtable.h"

	25 #include "unicode/uset.h"

	26 #include "unicode/locid.h"

	27 #include "unicode/brkiter.h"

	28 #include "uset_imp.h"

	29 #include "ruleiter.h"

	30 #include "cmemory.h"

	31 #include "ucln_cmn.h"

	32 #include "util.h"

	33 #include "uvector.h"

	34 #include "uprops.h"

	35 #include "propname.h"

	36 #include "normalizer2impl.h"

	37 #include "ucase.h"

	38 #include "ubidi_props.h"

	39 #include "uinvchar.h"

	40 #include "uprops.h"

	41 #include "charstr.h"

	42 #include "cstring.h"

	43 #include "mutex.h"

	44 #include "umutex.h"

	45 #include "uassert.h"

	46 #include "hash.h"

	47

	48 U_NAMESPACE_USE

	49

	50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

	51

	52 // initial storage. Must be >= 0

	53 // * same as in uniset.cpp ! *

	54 #define START_EXTRA 16

	55

	56 // Define UChar constants using hex for EBCDIC compatibility

	57 // Used #define to reduce private static exports and memory access time.

	58 #define SET_OPEN ((UChar)0x005B) /[/

	59 #define SET_CLOSE ((UChar)0x005D) /]/

	60 #define HYPHEN ((UChar)0x002D) /-/

	61 #define COMPLEMENT ((UChar)0x005E) /^/

	62 #define COLON ((UChar)0x003A) /:/

	63 #define BACKSLASH ((UChar)0x005C) /\/

	64 #define INTERSECTION ((UChar)0x0026) /&/

	65 #define UPPER_U ((UChar)0x0055) /U/

	66 #define LOWER_U ((UChar)0x0075) /u/

	67 #define OPEN_BRACE ((UChar)123) /{/

	68 #define CLOSE_BRACE ((UChar)125) /}/

	69 #define UPPER_P ((UChar)0x0050) /P/

	70 #define LOWER_P ((UChar)0x0070) /p/

	71 #define UPPER_N ((UChar)78) /N/

	72 #define EQUALS ((UChar)0x003D) /=/

	73

	74 //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"

	75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"

	76 //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"

	77 static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"

	78 //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"

	79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /-]/

	80

	81 // Special property set IDs

	82 static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]

	83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]

	84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

	85

	86 // Unicode name property alias

	87 #define NAME_PROP "na"

	88 #define NAME_PROP_LENGTH 2

	89

	90 /**

	91 * Delimiter string used in patterns to close a category reference:

	92 * ":]". Example: "[:Lu:]".

	93 */

	94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */

	95

	96 // Cached sets ------------------------------------------------------------- ***

	97

	98 U_CDECL_BEGIN

	99 static UBool U_CALLCONV uset_cleanup();

	100 U_CDECL_END

	101

	102 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor

	103 // can only fail with an out-of-memory error

	104 // if we have a correct pattern and the properties data is hardcoded and always available.

	105 class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {

	106 public:

	107 UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :

	108 SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}

	109 UnicodeSet *getInstance(UErrorCode &errorCode) {

	110 return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, f Pattern, errorCode);

	111 }

	112 private:

	113 static void createInstance(const void context, UErrorCode &errorCode) {

	114 UnicodeString pattern((const char *)context, -1, US_INV);

	115 UnicodeSet *set=new UnicodeSet(pattern, errorCode);

	116 if(set==NULL) {

	117 errorCode=U_MEMORY_ALLOCATION_ERROR;

	118 }

	119 set->freeze();

	120 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);

	121 return set;

	122 }

	123

	124 const char *fPattern;

	125 };

	126

	127 U_CDECL_BEGIN

	128

	129 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusi ons()

	130

	131 STATIC_SIMPLE_SINGLETON(uni32Singleton);

	132

	133 //----------------------------------------------------------------

	134 // Inclusions list

	135 //----------------------------------------------------------------

	136

	137 // USetAdder implementation

	138 // Does not use uset.h to reduce code dependencies

	139 static void U_CALLCONV

	140 _set_add(USet *set, UChar32 c) {

	141 ((UnicodeSet *)set)->add(c);

	142 }

	143

	144 static void U_CALLCONV

	145 _set_addRange(USet *set, UChar32 start, UChar32 end) {

	146 ((UnicodeSet *)set)->add(start, end);

	147 }

	148

	149 static void U_CALLCONV

	150 _set_addString(USet set, const UChar str, int32_t length) {

	151 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));

	152 }

	153

	154 /**

	155 * Cleanup function for UnicodeSet

	156 */

	157 static UBool U_CALLCONV uset_cleanup(void) {

	158 int32_t i;

	159

	160 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {

	161 if (INCLUSIONS[i] != NULL) {

	162 delete INCLUSIONS[i];

	163 INCLUSIONS[i] = NULL;

	164 }

	165 }

	166 UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();

	167 return TRUE;

	168 }

	169

	170 U_CDECL_END

	171

	172 U_NAMESPACE_BEGIN

	173

	174 /*

	175 Reduce excessive reallocation, and make it easier to detect initialization

	176 problems.

	177 Usually you don't see smaller sets than this for Unicode 5.0.

	178 */

	179 #define DEFAULT_INCLUSION_CAPACITY 3072

	180

	181 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {

	182 UBool needInit;

	183 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);

	184 if (needInit) {

	185 UnicodeSet* incl = new UnicodeSet();

	186 USetAdder sa = {

	187 (USet *)incl,

	188 _set_add,

	189 _set_addRange,

	190 _set_addString,

	191 NULL, // don't need remove()

	192 NULL // don't need removeRange()

	193 };

	194 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);

	195 if (incl != NULL) {

	196 switch(src) {

	197 case UPROPS_SRC_CHAR:

	198 uchar_addPropertyStarts(&sa, &status);

	199 break;

	200 case UPROPS_SRC_PROPSVEC:

	201 upropsvec_addPropertyStarts(&sa, &status);

	202 break;

	203 case UPROPS_SRC_CHAR_AND_PROPSVEC:

	204 uchar_addPropertyStarts(&sa, &status);

	205 upropsvec_addPropertyStarts(&sa, &status);

	206 break;

	207 #if !UCONFIG_NO_NORMALIZATION

	208 case UPROPS_SRC_CASE_AND_NORM: {

	209 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(statu s);

	210 if(U_SUCCESS(status)) {

	211 impl->addPropertyStarts(&sa, status);

	212 }

	213 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);

	214 break;

	215 }

	216 case UPROPS_SRC_NFC: {

	217 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(statu s);

	218 if(U_SUCCESS(status)) {

	219 impl->addPropertyStarts(&sa, status);

	220 }

	221 break;

	222 }

	223 case UPROPS_SRC_NFKC: {

	224 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(stat us);

	225 if(U_SUCCESS(status)) {

	226 impl->addPropertyStarts(&sa, status);

	227 }

	228 break;

	229 }

	230 case UPROPS_SRC_NFKC_CF: {

	231 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(s tatus);

	232 if(U_SUCCESS(status)) {

	233 impl->addPropertyStarts(&sa, status);

	234 }

	235 break;

	236 }

	237 case UPROPS_SRC_NFC_CANON_ITER: {

	238 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(statu s);

	239 if(U_SUCCESS(status)) {

	240 impl->addCanonIterPropertyStarts(&sa, status);

	241 }

	242 break;

	243 }

	244 #endif

	245 case UPROPS_SRC_CASE:

	246 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);

	247 break;

	248 case UPROPS_SRC_BIDI:

	249 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);

	250 break;

	251 default:

	252 status = U_INTERNAL_PROGRAM_ERROR;

	253 break;

	254 }

	255 if (U_SUCCESS(status)) {

	256 // Compact for caching

	257 incl->compact();

	258 umtx_lock(NULL);

	259 if (INCLUSIONS[src] == NULL) {

	260 INCLUSIONS[src] = incl;

	261 incl = NULL;

	262 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);

	263 }

	264 umtx_unlock(NULL);

	265 }

	266 delete incl;

	267 } else {

	268 status = U_MEMORY_ALLOCATION_ERROR;

	269 }

	270 }

	271 return INCLUSIONS[src];

	272 }

	273

	274 // Cache some sets for other services -------------------------------------- ***

	275

	276 U_CFUNC UnicodeSet *

	277 uniset_getUnicode32Instance(UErrorCode &errorCode) {

	278 return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorC ode);

	279 }

	280

	281 // helper functions for matching of pattern syntax pieces ------------------ ***

	282 // these functions are parallel to the PERL_OPEN etc. strings above

	283

	284 // using these functions is not only faster than UnicodeString::compare() and

	285 // caseCompare(), but they also make UnicodeSet work for simple patterns when

	286 // no Unicode properties data is available - when caseCompare() fails

	287

	288 static inline UBool

	289 isPerlOpen(const UnicodeString &pattern, int32_t pos) {

	290 UChar c;

	291 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P \|\| c==UPPER_P);

	292 }

	293

	294 /*static inline UBool

	295 isPerlClose(const UnicodeString &pattern, int32_t pos) {

	296 return pattern.charAt(pos)==CLOSE_BRACE;

	297 }*/

	298

	299 static inline UBool

	300 isNameOpen(const UnicodeString &pattern, int32_t pos) {

	301 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;

	302 }

	303

	304 static inline UBool

	305 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {

	306 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;

	307 }

	308

	309 /*static inline UBool

	310 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {

	311 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;

	312 }*/

	313

	314 // TODO memory debugging provided inside uniset.cpp

	315 // could be made available here but probably obsolete with use of modern

	316 // memory leak checker tools

	317 #define _dbgct(me)

	318

	319 //----------------------------------------------------------------

	320 // Constructors &c

	321 //----------------------------------------------------------------

	322

	323 /**

	324 * Constructs a set from the given pattern, optionally ignoring

	325 * white space. See the class description for the syntax of the

	326 * pattern language.

	327 * @param pattern a string specifying what characters are in the set

	328 */

	329 UnicodeSet::UnicodeSet(const UnicodeString& pattern,

	330 UErrorCode& status) :

	331 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),

	332 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

	333 fFlags(0)

	334 {

	335 if(U_SUCCESS(status)){

	336 list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);

	337 /* test for NULL */

	338 if(list == NULL) {

	339 status = U_MEMORY_ALLOCATION_ERROR;

	340 }else{

	341 allocateStrings(status);

	342 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);

	343 }

	344 }

	345 _dbgct(this);

	346 }

	347

	348 /**

	349 * Constructs a set from the given pattern, optionally ignoring

	350 * white space. See the class description for the syntax of the

	351 * pattern language.

	352 * @param pattern a string specifying what characters are in the set

	353 * @param options bitmask for options to apply to the pattern.

	354 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.

	355 */

	356 UnicodeSet::UnicodeSet(const UnicodeString& pattern,

	357 uint32_t options,

	358 const SymbolTable* symbols,

	359 UErrorCode& status) :

	360 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),

	361 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

	362 fFlags(0)

	363 {

	364 if(U_SUCCESS(status)){

	365 list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);

	366 /* test for NULL */

	367 if(list == NULL) {

	368 status = U_MEMORY_ALLOCATION_ERROR;

	369 }else{

	370 allocateStrings(status);

	371 applyPattern(pattern, options, symbols, status);

	372 }

	373 }

	374 _dbgct(this);

	375 }

	376

	377 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,

	378 uint32_t options,

	379 const SymbolTable* symbols,

	380 UErrorCode& status) :

	381 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),

	382 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

	383 fFlags(0)

	384 {

	385 if(U_SUCCESS(status)){

	386 list = (UChar32) uprv_malloc(sizeof(UChar32) capacity);

	387 /* test for NULL */

	388 if(list == NULL) {

	389 status = U_MEMORY_ALLOCATION_ERROR;

	390 }else{

	391 allocateStrings(status);

	392 applyPattern(pattern, pos, options, symbols, status);

	393 }

	394 }

	395 _dbgct(this);

	396 }

	397

	398 //----------------------------------------------------------------

	399 // Public API

	400 //----------------------------------------------------------------

	401

	402 /**

	403 * Modifies this set to represent the set specified by the given

	404 * pattern, optionally ignoring white space. See the class

	405 * description for the syntax of the pattern language.

	406 * @param pattern a string specifying what characters are in the set

	407 * @param ignoreSpaces if <code>true</code>, all spaces in the

	408 * pattern are ignored. Spaces are those characters for which

	409 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.

	410 * Characters preceded by '\\' are escaped, losing any special

	411 * meaning they otherwise have. Spaces may be included by

	412 * escaping them.

	413 * @exception <code>IllegalArgumentException</code> if the pattern

	414 * contains a syntax error.

	415 */

	416 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,

	417 UErrorCode& status) {

	418 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);

	419 }

	420

	421

	422 /**

	423 * Modifies this set to represent the set specified by the given

	424 * pattern, optionally ignoring white space. See the class

	425 * description for the syntax of the pattern language.

	426 * @param pattern a string specifying what characters are in the set

	427 * @param options bitmask for options to apply to the pattern.

	428 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.

	429 */

	430 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,

	431 uint32_t options,

	432 const SymbolTable* symbols,

	433 UErrorCode& status) {

	434 if (U_FAILURE(status) \|\| isFrozen()) {

	435 return *this;

	436 }

	437

	438 ParsePosition pos(0);

	439 applyPattern(pattern, pos, options, symbols, status);

	440 if (U_FAILURE(status)) return *this;

	441

	442 int32_t i = pos.getIndex();

	443

	444 if (options & USET_IGNORE_SPACE) {

	445 // Skip over trailing whitespace

	446 ICU_Utility::skipWhitespace(pattern, i, TRUE);

	447 }

	448

	449 if (i != pattern.length()) {

	450 status = U_ILLEGAL_ARGUMENT_ERROR;

	451 }

	452 return *this;

	453 }

	454

	455 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,

	456 ParsePosition& pos,

	457 uint32_t options,

	458 const SymbolTable* symbols,

	459 UErrorCode& status) {

	460 if (U_FAILURE(status) \|\| isFrozen()) {

	461 return *this;

	462 }

	463 // Need to build the pattern in a temporary string because

	464 // _applyPattern calls add() etc., which set pat to empty.

	465 UnicodeString rebuiltPat;

	466 RuleCharacterIterator chars(pattern, symbols, pos);

	467 applyPattern(chars, symbols, rebuiltPat, options, status);

	468 if (U_FAILURE(status)) return *this;

	469 if (chars.inVariable()) {

	470 // syntaxError(chars, "Extra chars in variable value");

	471 status = U_MALFORMED_SET;

	472 return *this;

	473 }

	474 setPattern(rebuiltPat);

	475 return *this;

	476 }

	477

	478 /**

	479 * Return true if the given position, in the given pattern, appears

	480 * to be the start of a UnicodeSet pattern.

	481 */

	482 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {

	483 return ((pos+1) < pattern.length() &&

	484 pattern.charAt(pos) == (UChar)91/[/) \|\|

	485 resemblesPropertyPattern(pattern, pos);

	486 }

	487

	488 //----------------------------------------------------------------

	489 // Implementation: Pattern parsing

	490 //----------------------------------------------------------------

	491

	492 /**

	493 * A small all-inline class to manage a UnicodeSet pointer. Add

	494 * operator->() etc. as needed.

	495 */

	496 class UnicodeSetPointer {

	497 UnicodeSet* p;

	498 public:

	499 inline UnicodeSetPointer() : p(0) {}

	500 inline ~UnicodeSetPointer() { delete p; }

	501 inline UnicodeSet* pointer() { return p; }

	502 inline UBool allocate() {

	503 if (p == 0) {

	504 p = new UnicodeSet();

	505 }

	506 return p != 0;

	507 }

	508 };

	509

	510 /**

	511 * Parse the pattern from the given RuleCharacterIterator. The

	512 * iterator is advanced over the parsed pattern.

	513 * @param chars iterator over the pattern characters. Upon return

	514 * it will be advanced to the first character after the parsed

	515 * pattern, or the end of the iteration if all characters are

	516 * parsed.

	517 * @param symbols symbol table to use to parse and dereference

	518 * variables, or null if none.

	519 * @param rebuiltPat the pattern that was parsed, rebuilt or

	520 * copied from the input pattern, as appropriate.

	521 * @param options a bit mask of zero or more of the following:

	522 * IGNORE_SPACE, CASE.

	523 */

	524 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,

	525 const SymbolTable* symbols,

	526 UnicodeString& rebuiltPat,

	527 uint32_t options,

	528 UErrorCode& ec) {

	529 if (U_FAILURE(ec)) return;

	530

	531 // Syntax characters: [ ] ^ - & { }

	532

	533 // Recognized special forms for chars, sets: c-c s-s s&s

	534

	535 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES \|

	536 RuleCharacterIterator::PARSE_ESCAPES;

	537 if ((options & USET_IGNORE_SPACE) != 0) {

	538 opts \|= RuleCharacterIterator::SKIP_WHITESPACE;

	539 }

	540

	541 UnicodeString patLocal, buf;

	542 UBool usePat = FALSE;

	543 UnicodeSetPointer scratch;

	544 RuleCharacterIterator::Pos backup;

	545

	546 // mode: 0=before [, 1=between [...], 2=after ]

	547 // lastItem: 0=none, 1=char, 2=set

	548 int8_t lastItem = 0, mode = 0;

	549 UChar32 lastChar = 0;

	550 UChar op = 0;

	551

	552 UBool invert = FALSE;

	553

	554 clear();

	555

	556 while (mode != 2 && !chars.atEnd()) {

	557 U_ASSERT((lastItem == 0 && op == 0) \|\|

	558 (lastItem == 1 && (op == 0 \|\| op == HYPHEN /'-'/)) \|\|

	559 (lastItem == 2 && (op == 0 \|\| op == HYPHEN /'-'/ \|\|

	560 op == INTERSECTION /'&'/)));

	561

	562 UChar32 c = 0;

	563 UBool literal = FALSE;

	564 UnicodeSet* nested = 0; // alias - do not delete

	565

	566 // -------- Check for property pattern

	567

	568 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed

	569 int8_t setMode = 0;

	570 if (resemblesPropertyPattern(chars, opts)) {

	571 setMode = 2;

	572 }

	573

	574 // -------- Parse '[' of opening delimiter OR nested set.

	575 // If there is a nested set, use `setMode' to define how

	576 // the set should be parsed. If the '[' is part of the

	577 // opening delimiter for this pattern, parse special

	578 // strings "[", "[^", "[-", and "[^-". Check for stand-in

	579 // characters representing a nested set in the symbol

	580 // table.

	581

	582 else {

	583 // Prepare to backup if necessary

	584 chars.getPos(backup);

	585 c = chars.next(opts, literal, ec);

	586 if (U_FAILURE(ec)) return;

	587

	588 if (c == 0x5B /'['/ && !literal) {

	589 if (mode == 1) {

	590 chars.setPos(backup); // backup

	591 setMode = 1;

	592 } else {

	593 // Handle opening '[' delimiter

	594 mode = 1;

	595 patLocal.append((UChar) 0x5B /'['/);

	596 chars.getPos(backup); // prepare to backup

	597 c = chars.next(opts, literal, ec);

	598 if (U_FAILURE(ec)) return;

	599 if (c == 0x5E /'^'/ && !literal) {

	600 invert = TRUE;

	601 patLocal.append((UChar) 0x5E /'^'/);

	602 chars.getPos(backup); // prepare to backup

	603 c = chars.next(opts, literal, ec);

	604 if (U_FAILURE(ec)) return;

	605 }

	606 // Fall through to handle special leading '-';

	607 // otherwise restart loop for nested [], \p{}, etc.

	608 if (c == HYPHEN /'-'/) {

	609 literal = TRUE;

	610 // Fall through to handle literal '-' below

	611 } else {

	612 chars.setPos(backup); // backup

	613 continue;

	614 }

	615 }

	616 } else if (symbols != 0) {

	617 const UnicodeFunctor *m = symbols->lookupMatcher(c);

	618 if (m != 0) {

	619 const UnicodeSet ms = dynamic_cast<const UnicodeSet >(m);

	620 if (ms == NULL) {

	621 ec = U_MALFORMED_SET;

	622 return;

	623 }

	624 // casting away const, but `nested' won't be modified

	625 // (important not to modify stored set)

	626 nested = const_cast<UnicodeSet*>(ms);

	627 setMode = 3;

	628 }

	629 }

	630 }

	631

	632 // -------- Handle a nested set. This either is inline in

	633 // the pattern or represented by a stand-in that has

	634 // previously been parsed and was looked up in the symbol

	635 // table.

	636

	637 if (setMode != 0) {

	638 if (lastItem == 1) {

	639 if (op != 0) {

	640 // syntaxError(chars, "Char expected after operator");

	641 ec = U_MALFORMED_SET;

	642 return;

	643 }

	644 add(lastChar, lastChar);

	645 _appendToPat(patLocal, lastChar, FALSE);

	646 lastItem = 0;

	647 op = 0;

	648 }

	649

	650 if (op == HYPHEN /'-'/ \|\| op == INTERSECTION /'&'/) {

	651 patLocal.append(op);

	652 }

	653

	654 if (nested == 0) {

	655 // lazy allocation

	656 if (!scratch.allocate()) {

	657 ec = U_MEMORY_ALLOCATION_ERROR;

	658 return;

	659 }

	660 nested = scratch.pointer();

	661 }

	662 switch (setMode) {

	663 case 1:

	664 nested->applyPattern(chars, symbols, patLocal, options, ec);

	665 break;

	666 case 2:

	667 chars.skipIgnored(opts);

	668 nested->applyPropertyPattern(chars, patLocal, ec);

	669 if (U_FAILURE(ec)) return;

	670 break;

	671 case 3: // `nested' already parsed

	672 nested->_toPattern(patLocal, FALSE);

	673 break;

	674 }

	675

	676 usePat = TRUE;

	677

	678 if (mode == 0) {

	679 // Entire pattern is a category; leave parse loop

	680 this = nested;

	681 mode = 2;

	682 break;

	683 }

	684

	685 switch (op) {

	686 case HYPHEN: /'-'/

	687 removeAll(*nested);

	688 break;

	689 case INTERSECTION: /'&'/

	690 retainAll(*nested);

	691 break;

	692 case 0:

	693 addAll(*nested);

	694 break;

	695 }

	696

	697 op = 0;

	698 lastItem = 2;

	699

	700 continue;

	701 }

	702

	703 if (mode == 0) {

	704 // syntaxError(chars, "Missing '['");

	705 ec = U_MALFORMED_SET;

	706 return;

	707 }

	708

	709 // -------- Parse special (syntax) characters. If the

	710 // current character is not special, or if it is escaped,

	711 // then fall through and handle it below.

	712

	713 if (!literal) {

	714 switch (c) {

	715 case 0x5D /']'/:

	716 if (lastItem == 1) {

	717 add(lastChar, lastChar);

	718 _appendToPat(patLocal, lastChar, FALSE);

	719 }

	720 // Treat final trailing '-' as a literal

	721 if (op == HYPHEN /'-'/) {

	722 add(op, op);

	723 patLocal.append(op);

	724 } else if (op == INTERSECTION /'&'/) {

	725 // syntaxError(chars, "Trailing '&'");

	726 ec = U_MALFORMED_SET;

	727 return;

	728 }

	729 patLocal.append((UChar) 0x5D /']'/);

	730 mode = 2;

	731 continue;

	732 case HYPHEN /'-'/:

	733 if (op == 0) {

	734 if (lastItem != 0) {

	735 op = (UChar) c;

	736 continue;

	737 } else {

	738 // Treat final trailing '-' as a literal

	739 add(c, c);

	740 c = chars.next(opts, literal, ec);

	741 if (U_FAILURE(ec)) return;

	742 if (c == 0x5D /']'/ && !literal) {

	743 patLocal.append(HYPHEN_RIGHT_BRACE);

	744 mode = 2;

	745 continue;

	746 }

	747 }

	748 }

	749 // syntaxError(chars, "'-' not after char or set");

	750 ec = U_MALFORMED_SET;

	751 return;

	752 case INTERSECTION /'&'/:

	753 if (lastItem == 2 && op == 0) {

	754 op = (UChar) c;

	755 continue;

	756 }

	757 // syntaxError(chars, "'&' not after set");

	758 ec = U_MALFORMED_SET;

	759 return;

	760 case 0x5E /'^'/:

	761 // syntaxError(chars, "'^' not after '['");

	762 ec = U_MALFORMED_SET;

	763 return;

	764 case 0x7B /'{'/:

	765 if (op != 0) {

	766 // syntaxError(chars, "Missing operand after operator");

	767 ec = U_MALFORMED_SET;

	768 return;

	769 }

	770 if (lastItem == 1) {

	771 add(lastChar, lastChar);

	772 _appendToPat(patLocal, lastChar, FALSE);

	773 }

	774 lastItem = 0;

	775 buf.truncate(0);

	776 {

	777 UBool ok = FALSE;

	778 while (!chars.atEnd()) {

	779 c = chars.next(opts, literal, ec);

	780 if (U_FAILURE(ec)) return;

	781 if (c == 0x7D /'}'/ && !literal) {

	782 ok = TRUE;

	783 break;

	784 }

	785 buf.append(c);

	786 }

	787 if (buf.length() < 1 \|\| !ok) {

	788 // syntaxError(chars, "Invalid multicharacter string");

	789 ec = U_MALFORMED_SET;

	790 return;

	791 }

	792 }

	793 // We have new string. Add it to set and continue;

	794 // we don't need to drop through to the further

	795 // processing

	796 add(buf);

	797 patLocal.append((UChar) 0x7B /'{'/);

	798 _appendToPat(patLocal, buf, FALSE);

	799 patLocal.append((UChar) 0x7D /'}'/);

	800 continue;

	801 case SymbolTable::SYMBOL_REF:

	802 // symbols nosymbols

	803 // [a-$] error error (ambiguous)

	804 // [a$] anchor anchor

	805 // [a-$x] var "x"* literal '$'

	806 // [a-$.] error literal '$'

	807 // *We won't get here in the case of var "x"

	808 {

	809 chars.getPos(backup);

	810 c = chars.next(opts, literal, ec);

	811 if (U_FAILURE(ec)) return;

	812 UBool anchor = (c == 0x5D /']'/ && !literal);

	813 if (symbols == 0 && !anchor) {

	814 c = SymbolTable::SYMBOL_REF;

	815 chars.setPos(backup);

	816 break; // literal '$'

	817 }

	818 if (anchor && op == 0) {

	819 if (lastItem == 1) {

	820 add(lastChar, lastChar);

	821 _appendToPat(patLocal, lastChar, FALSE);

	822 }

	823 add(U_ETHER);

	824 usePat = TRUE;

	825 patLocal.append((UChar) SymbolTable::SYMBOL_REF);

	826 patLocal.append((UChar) 0x5D /']'/);

	827 mode = 2;

	828 continue;

	829 }

	830 // syntaxError(chars, "Unquoted '$'");

	831 ec = U_MALFORMED_SET;

	832 return;

	833 }

	834 default:

	835 break;

	836 }

	837 }

	838

	839 // -------- Parse literal characters. This includes both

	840 // escaped chars ("\u4E01") and non-syntax characters

	841 // ("a").

	842

	843 switch (lastItem) {

	844 case 0:

	845 lastItem = 1;

	846 lastChar = c;

	847 break;

	848 case 1:

	849 if (op == HYPHEN /'-'/) {

	850 if (lastChar >= c) {

	851 // Don't allow redundant (a-a) or empty (b-a) ranges;

	852 // these are most likely typos.

	853 // syntaxError(chars, "Invalid range");

	854 ec = U_MALFORMED_SET;

	855 return;

	856 }

	857 add(lastChar, c);

	858 _appendToPat(patLocal, lastChar, FALSE);

	859 patLocal.append(op);

	860 _appendToPat(patLocal, c, FALSE);

	861 lastItem = 0;

	862 op = 0;

	863 } else {

	864 add(lastChar, lastChar);

	865 _appendToPat(patLocal, lastChar, FALSE);

	866 lastChar = c;

	867 }

	868 break;

	869 case 2:

	870 if (op != 0) {

	871 // syntaxError(chars, "Set expected after operator");

	872 ec = U_MALFORMED_SET;

	873 return;

	874 }

	875 lastChar = c;

	876 lastItem = 1;

	877 break;

	878 }

	879 }

	880

	881 if (mode != 2) {

	882 // syntaxError(chars, "Missing ']'");

	883 ec = U_MALFORMED_SET;

	884 return;

	885 }

	886

	887 chars.skipIgnored(opts);

	888

	889 /**

	890 * Handle global flags (invert, case insensitivity). If this

	891 * pattern should be compiled case-insensitive, then we need

	892 * to close over case BEFORE COMPLEMENTING. This makes

	893 * patterns like /[^abc]/i work.

	894 */

	895 if ((options & USET_CASE_INSENSITIVE) != 0) {

	896 closeOver(USET_CASE_INSENSITIVE);

	897 }

	898 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {

	899 closeOver(USET_ADD_CASE_MAPPINGS);

	900 }

	901 if (invert) {

	902 complement();

	903 }

	904

	905 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the

	906 // generated pattern.

	907 if (usePat) {

	908 rebuiltPat.append(patLocal);

	909 } else {

	910 _generatePattern(rebuiltPat, FALSE);

	911 }

	912 if (isBogus() && U_SUCCESS(ec)) {

	913 // We likely ran out of memory. AHHH!

	914 ec = U_MEMORY_ALLOCATION_ERROR;

	915 }

	916 }

	917

	918 //----------------------------------------------------------------

	919 // Property set implementation

	920 //----------------------------------------------------------------

	921

	922 static UBool numericValueFilter(UChar32 ch, void* context) {

	923 return u_getNumericValue(ch) == (double)context;

	924 }

	925

	926 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {

	927 int32_t value = (int32_t)context;

	928 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;

	929 }

	930

	931 static UBool versionFilter(UChar32 ch, void* context) {

	932 static const UVersionInfo none = { 0, 0, 0, 0 };

	933 UVersionInfo v;

	934 u_charAge(ch, v);

	935 UVersionInfo* version = (UVersionInfo*)context;

	936 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, siz eof(v)) <= 0;

	937 }

	938

	939 typedef struct {

	940 UProperty prop;

	941 int32_t value;

	942 } IntPropertyContext;

	943

	944 static UBool intPropertyFilter(UChar32 ch, void* context) {

	945 IntPropertyContext* c = (IntPropertyContext*)context;

	946 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;

	947 }

	948

	949 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {

	950 return uscript_hasScript(ch, (UScriptCode)context);

	951 }

	952

	953 /**

	954 * Generic filter-based scanning code for UCD property UnicodeSets.

	955 */

	956 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,

	957 void* context,

	958 int32_t src,

	959 UErrorCode &status) {

	960 if (U_FAILURE(status)) return;

	961

	962 // Logically, walk through all Unicode characters, noting the start

	963 // and end of each range for which filter.contain(c) is

	964 // true. Add each range to a set.

	965 //

	966 // To improve performance, use an inclusions set which

	967 // encodes information about character ranges that are known

	968 // to have identical properties.

	969 // getInclusions(src) contains exactly the first characters of

	970 // same-value ranges for the given properties "source".

	971 const UnicodeSet* inclusions = getInclusions(src, status);

	972 if (U_FAILURE(status)) {

	973 return;

	974 }

	975

	976 clear();

	977

	978 UChar32 startHasProperty = -1;

	979 int32_t limitRange = inclusions->getRangeCount();

	980

	981 for (int j=0; j<limitRange; ++j) {

	982 // get current range

	983 UChar32 start = inclusions->getRangeStart(j);

	984 UChar32 end = inclusions->getRangeEnd(j);

	985

	986 // for all the code points in the range, process

	987 for (UChar32 ch = start; ch <= end; ++ch) {

	988 // only add to this UnicodeSet on inflection points --

	989 // where the hasProperty value changes to false

	990 if ((*filter)(ch, context)) {

	991 if (startHasProperty < 0) {

	992 startHasProperty = ch;

	993 }

	994 } else if (startHasProperty >= 0) {

	995 add(startHasProperty, ch-1);

	996 startHasProperty = -1;

	997 }

	998 }

	999 }

	1000 if (startHasProperty >= 0) {

	1001 add((UChar32)startHasProperty, (UChar32)0x10FFFF);

	1002 }

	1003 if (isBogus() && U_SUCCESS(status)) {

	1004 // We likely ran out of memory. AHHH!

	1005 status = U_MEMORY_ALLOCATION_ERROR;

	1006 }

	1007 }

	1008

	1009 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {

	1010 /* Note: we use ' ' in compiler code page */

	1011 int32_t j = 0;

	1012 char ch;

	1013 --dstCapacity; /* make room for term. zero */

	1014 while ((ch = *src++) != 0) {

	1015 if (ch == ' ' && (j==0 \|\| (j>0 && dst[j-1]==' '))) {

	1016 continue;

	1017 }

	1018 if (j >= dstCapacity) return FALSE;

	1019 dst[j++] = ch;

	1020 }

	1021 if (j > 0 && dst[j-1] == ' ') --j;

	1022 dst[j] = 0;

	1023 return TRUE;

	1024 }

	1025

	1026 //----------------------------------------------------------------

	1027 // Property set API

	1028 //----------------------------------------------------------------

	1029

	// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
	// member function. Wrapped in do/while(0) so that "FAIL(ec);" is a single
	// statement and stays correct inside an unbraced if/else.
	#define FAIL(ec) do { ec=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (0)

	1031

	1032 UnicodeSet&

	1033 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {

	1034 if (U_FAILURE(ec) \|\| isFrozen()) return *this;

	1035

	1036 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {

	1037 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);

	1038 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {

	1039 UScriptCode script = (UScriptCode)value;

	1040 applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);

	1041 } else {

	1042 IntPropertyContext c = {prop, value};

	1043 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);

	1044 }

	1045 return *this;

	1046 }

	1047

	1048 UnicodeSet&

	1049 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,

	1050 const UnicodeString& value,

	1051 UErrorCode& ec) {

	1052 if (U_FAILURE(ec) \|\| isFrozen()) return *this;

	1053

	1054 // prop and value used to be converted to char * using the default

	1055 // converter instead of the invariant conversion.

	1056 // This should not be necessary because all Unicode property and value

	1057 // names use only invariant characters.

	1058 // If there are any variant characters, then we won't find them anyway.

	1059 // Checking first avoids assertion failures in the conversion.

	1060 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) \|\|

	1061 !uprv_isInvariantUString(value.getBuffer(), value.length())

	1062 ) {

	1063 FAIL(ec);

	1064 }

	1065 CharString pname, vname;

	1066 pname.appendInvariantChars(prop, ec);

	1067 vname.appendInvariantChars(value, ec);

	1068 if (U_FAILURE(ec)) return *this;

	1069

	1070 UProperty p;

	1071 int32_t v;

	1072 UBool mustNotBeEmpty = FALSE, invert = FALSE;

	1073

	1074 if (value.length() > 0) {

	1075 p = u_getPropertyEnum(pname.data());

	1076 if (p == UCHAR_INVALID_CODE) FAIL(ec);

	1077

	1078 // Treat gc as gcm

	1079 if (p == UCHAR_GENERAL_CATEGORY) {

	1080 p = UCHAR_GENERAL_CATEGORY_MASK;

	1081 }

	1082

	1083 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) \|\|

	1084 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) \|\|

	1085 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {

	1086 v = u_getPropertyValueEnum(p, vname.data());

	1087 if (v == UCHAR_INVALID_CODE) {

	1088 // Handle numeric CCC

	1089 if (p == UCHAR_CANONICAL_COMBINING_CLASS \|\|

	1090 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS \|\|

	1091 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {

	1092 char* end;

	1093 double value = uprv_strtod(vname.data(), &end);

	1094 v = (int32_t) value;

	1095 if (v != value \|\| v < 0 \|\| *end != 0) {

	1096 // non-integral or negative value, or trailing junk

	1097 FAIL(ec);

	1098 }

	1099 // If the resultant set is empty then the numeric value

	1100 // was invalid.

	1101 mustNotBeEmpty = TRUE;

	1102 } else {

	1103 FAIL(ec);

	1104 }

	1105 }

	1106 }

	1107

	1108 else {

	1109

	1110 switch (p) {

	1111 case UCHAR_NUMERIC_VALUE:

	1112 {

	1113 char* end;

	1114 double value = uprv_strtod(vname.data(), &end);

	1115 if (*end != 0) {

	1116 FAIL(ec);

	1117 }

	1118 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec) ;

	1119 return *this;

	1120 }

	1121 break;

	1122 case UCHAR_NAME:

	1123 case UCHAR_UNICODE_1_NAME:

	1124 {

	1125 // Must munge name, since u_charFromName() does not do

	1126 // 'loose' matching.

	1127 char buf[128]; // it suffices that this be > uprv_getMaxChar NameLength

	1128 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec) ;

	1129 UCharNameChoice choice = (p == UCHAR_NAME) ?

	1130 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;

	1131 UChar32 ch = u_charFromName(choice, buf, &ec);

	1132 if (U_SUCCESS(ec)) {

	1133 clear();

	1134 add(ch);

	1135 return *this;

	1136 } else {

	1137 FAIL(ec);

	1138 }

	1139 }

	1140 break;

	1141 case UCHAR_AGE:

	1142 {

	1143 // Must munge name, since u_versionFromString() does not do

	1144 // 'loose' matching.

	1145 char buf[128];

	1146 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec) ;

	1147 UVersionInfo version;

	1148 u_versionFromString(version, buf);

	1149 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec );

	1150 return *this;

	1151 }

	1152 break;

	1153 case UCHAR_SCRIPT_EXTENSIONS:

	1154 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());

	1155 if (v == UCHAR_INVALID_CODE) {

	1156 FAIL(ec);

	1157 }

	1158 // fall through to calling applyIntPropertyValue()

	1159 break;

	1160 default:

	1161 // p is a non-binary, non-enumerated property that we

	1162 // don't support (yet).

	1163 FAIL(ec);

	1164 }

	1165 }

	1166 }

	1167

	1168 else {

	1169 // value is empty. Interpret as General Category, Script, or

	1170 // Binary property.

	1171 p = UCHAR_GENERAL_CATEGORY_MASK;

	1172 v = u_getPropertyValueEnum(p, pname.data());

	1173 if (v == UCHAR_INVALID_CODE) {

	1174 p = UCHAR_SCRIPT;

	1175 v = u_getPropertyValueEnum(p, pname.data());

	1176 if (v == UCHAR_INVALID_CODE) {

	1177 p = u_getPropertyEnum(pname.data());

	1178 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {

	1179 v = 1;

	1180 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {

	1181 set(MIN_VALUE, MAX_VALUE);

	1182 return *this;

	1183 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {

	1184 set(0, 0x7F);

	1185 return *this;

	1186 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data() )) {

	1187 // [:Assigned:]=[:^Cn:]

	1188 p = UCHAR_GENERAL_CATEGORY_MASK;

	1189 v = U_GC_CN_MASK;

	1190 invert = TRUE;

	1191 } else {

	1192 FAIL(ec);

	1193 }

	1194 }

	1195 }

	1196 }

	1197

	1198 applyIntPropertyValue(p, v, ec);

	1199 if(invert) {

	1200 complement();

	1201 }

	1202

	1203 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {

	1204 // mustNotBeEmpty is set to true if an empty set indicates

	1205 // invalid input.

	1206 ec = U_ILLEGAL_ARGUMENT_ERROR;

	1207 }

	1208

	1209 if (isBogus() && U_SUCCESS(ec)) {

	1210 // We likely ran out of memory. AHHH!

	1211 ec = U_MEMORY_ALLOCATION_ERROR;

	1212 }

	1213 return *this;

	1214 }

	1215

	1216 //----------------------------------------------------------------

	1217 // Property set patterns

	1218 //----------------------------------------------------------------

	1219

	1220 /**

	1221 * Return true if the given position, in the given pattern, appears

	1222 * to be the start of a property set pattern.

	1223 */

	1224 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,

	1225 int32_t pos) {

	1226 // Patterns are at least 5 characters long

	1227 if ((pos+5) > pattern.length()) {

	1228 return FALSE;

	1229 }

	1230

	1231 // Look for an opening [:, [:^, \p, or \P

	1232 return isPOSIXOpen(pattern, pos) \|\| isPerlOpen(pattern, pos) \|\| isNameOpen(p attern, pos);

	1233 }

	1234

	1235 /**

	1236 * Return true if the given iterator appears to point at a

	1237 * property pattern. Regardless of the result, return with the

	1238 * iterator unchanged.

	1239 * @param chars iterator over the pattern characters. Upon return

	1240 * it will be unchanged.

	1241 * @param iterOpts RuleCharacterIterator options

	1242 */

	1243 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,

	1244 int32_t iterOpts) {

	1245 // NOTE: literal will always be FALSE, because we don't parse escapes.

	1246 UBool result = FALSE, literal;

	1247 UErrorCode ec = U_ZERO_ERROR;

	1248 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;

	1249 RuleCharacterIterator::Pos pos;

	1250 chars.getPos(pos);

	1251 UChar32 c = chars.next(iterOpts, literal, ec);

	1252 if (c == 0x5B /'['/ \|\| c == 0x5C /'\\'/) {

	1253 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPAC E,

	1254 literal, ec);

	1255 result = (c == 0x5B /'['/) ? (d == 0x3A /':'/) :

	1256 (d == 0x4E /'N'/ \|\| d == 0x70 /'p'/ \|\| d == 0x50 /'P'/);

	1257 }

	1258 chars.setPos(pos);

	1259 return result && U_SUCCESS(ec);

	1260 }

	1261

	1262 /**

	1263 * Parse the given property pattern at the given parse position.

	1264 */

	1265 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,

	1266 ParsePosition& ppos,

	1267 UErrorCode &ec) {

	1268 int32_t pos = ppos.getIndex();

	1269

	1270 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}

	1271 UBool isName = FALSE; // true for \N{pat}, o/w false

	1272 UBool invert = FALSE;

	1273

	1274 if (U_FAILURE(ec)) return *this;

	1275

	1276 // Minimum length is 5 characters, e.g. \p{L}

	1277 if ((pos+5) > pattern.length()) {

	1278 FAIL(ec);

	1279 }

	1280

	1281 // On entry, ppos should point to one of the following locations:

	1282 // Look for an opening [:, [:^, \p, or \P

	1283 if (isPOSIXOpen(pattern, pos)) {

	1284 posix = TRUE;

	1285 pos += 2;

	1286 pos = ICU_Utility::skipWhitespace(pattern, pos);

	1287 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {

	1288 ++pos;

	1289 invert = TRUE;

	1290 }

	1291 } else if (isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos)) {

	1292 UChar c = pattern.charAt(pos+1);

	1293 invert = (c == UPPER_P);

	1294 isName = (c == UPPER_N);

	1295 pos += 2;

	1296 pos = ICU_Utility::skipWhitespace(pattern, pos);

	1297 if (pos == pattern.length() \|\| pattern.charAt(pos++) != OPEN_BRACE) {

	1298 // Syntax error; "\p" or "\P" not followed by "{"

	1299 FAIL(ec);

	1300 }

	1301 } else {

	1302 // Open delimiter not seen

	1303 FAIL(ec);

	1304 }

	1305

	1306 // Look for the matching close delimiter, either :] or }

	1307 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);

	1308 if (close < 0) {

	1309 // Syntax error; close delimiter missing

	1310 FAIL(ec);

	1311 }

	1312

	1313 // Look for an '=' sign. If this is present, we will parse a

	1314 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}

	1315 // pattern.

	1316 int32_t equals = pattern.indexOf(EQUALS, pos);

	1317 UnicodeString propName, valueName;

	1318 if (equals >= 0 && equals < close && !isName) {

	1319 // Equals seen; parse medium/long pattern

	1320 pattern.extractBetween(pos, equals, propName);

	1321 pattern.extractBetween(equals+1, close, valueName);

	1322 }

	1323

	1324 else {

	1325 // Handle case where no '=' is seen, and \N{}

	1326 pattern.extractBetween(pos, close, propName);

	1327

	1328 // Handle \N{name}

	1329 if (isName) {

	1330 // This is a little inefficient since it means we have to

	1331 // parse NAME_PROP back to UCHAR_NAME even though we already

	1332 // know it's UCHAR_NAME. If we refactor the API to

	1333 // support args of (UProperty, char*) then we can remove

	1334 // NAME_PROP and make this a little more efficient.

	1335 valueName = propName;

	1336 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);

	1337 }

	1338 }

	1339

	1340 applyPropertyAlias(propName, valueName, ec);

	1341

	1342 if (U_SUCCESS(ec)) {

	1343 if (invert) {

	1344 complement();

	1345 }

	1346

	1347 // Move to the limit position after the close delimiter if the

	1348 // parse succeeded.

	1349 ppos.setIndex(close + (posix ? 2 : 1));

	1350 }

	1351

	1352 return *this;

	1353 }

	1354

	1355 /**

	1356 * Parse a property pattern.

	1357 * @param chars iterator over the pattern characters. Upon return

	1358 * it will be advanced to the first character after the parsed

	1359 * pattern, or the end of the iteration if all characters are

	1360 * parsed.

	1361 * @param rebuiltPat the pattern that was parsed, rebuilt or

	1362 * copied from the input pattern, as appropriate.

	1363 */

	1364 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,

	1365 UnicodeString& rebuiltPat,

	1366 UErrorCode& ec) {

	1367 if (U_FAILURE(ec)) return;

	1368 UnicodeString pattern;

	1369 chars.lookahead(pattern);

	1370 ParsePosition pos(0);

	1371 applyPropertyPattern(pattern, pos, ec);

	1372 if (U_FAILURE(ec)) return;

	1373 if (pos.getIndex() == 0) {

	1374 // syntaxError(chars, "Invalid property pattern");

	1375 ec = U_MALFORMED_SET;

	1376 return;

	1377 }

	1378 chars.jumpahead(pos.getIndex());

	1379 rebuiltPat.append(pattern, 0, pos.getIndex());

	1380 }

	1381

	1382 //----------------------------------------------------------------

	1383 // Case folding API

	1384 //----------------------------------------------------------------

	1385

	1386 // add the result of a full case mapping to the set

	1387 // use str as a temporary string to avoid constructing one

	1388 static inline void

	1389 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {

	1390 if(result >= 0) {

	1391 if(result > UCASE_MAX_STRING_LENGTH) {

	1392 // add a single-code point case mapping

	1393 set.add(result);

	1394 } else {

	1395 // add a string case mapping from full with length result

	1396 str.setTo((UBool)FALSE, full, result);

	1397 set.add(str);

	1398 }

	1399 }

	1400 // result < 0: the code point mapped to itself, no need to add it

	1401 // see ucase.h

	1402 }

	1403

	1404 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {

	1405 if (isFrozen() \|\| isBogus()) {

	1406 return *this;

	1407 }

	1408 if (attribute & (USET_CASE_INSENSITIVE \| USET_ADD_CASE_MAPPINGS)) {

	1409 const UCaseProps *csp = ucase_getSingleton();

	1410 {

	1411 UnicodeSet foldSet(*this);

	1412 UnicodeString str;

	1413 USetAdder sa = {

	1414 foldSet.toUSet(),

	1415 _set_add,

	1416 _set_addRange,

	1417 _set_addString,

	1418 NULL, // don't need remove()

	1419 NULL // don't need removeRange()

	1420 };

	1421

	1422 // start with input set to guarantee inclusion

	1423 // USET_CASE: remove strings because the strings will actually be re duced (folded);

	1424 // therefore, start with no strings and add only those ne eded

	1425 if (attribute & USET_CASE_INSENSITIVE) {

	1426 foldSet.strings->removeAllElements();

	1427 }

	1428

	1429 int32_t n = getRangeCount();

	1430 UChar32 result;

	1431 const UChar *full;

	1432 int32_t locCache = 0;

	1433

	1434 for (int32_t i=0; i<n; ++i) {

	1435 UChar32 start = getRangeStart(i);

	1436 UChar32 end = getRangeEnd(i);

	1437

	1438 if (attribute & USET_CASE_INSENSITIVE) {

	1439 // full case closure

	1440 for (UChar32 cp=start; cp<=end; ++cp) {

	1441 ucase_addCaseClosure(csp, cp, &sa);

	1442 }

	1443 } else {

	1444 // add case mappings

	1445 // (does not add long s for regular s, or Kelvin for k, for example)

	1446 for (UChar32 cp=start; cp<=end; ++cp) {

	1447 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, " ", &locCache);

	1448 addCaseMapping(foldSet, result, full, str);

	1449

	1450 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, " ", &locCache);

	1451 addCaseMapping(foldSet, result, full, str);

	1452

	1453 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, " ", &locCache);

	1454 addCaseMapping(foldSet, result, full, str);

	1455

	1456 result = ucase_toFullFolding(csp, cp, &full, 0);

	1457 addCaseMapping(foldSet, result, full, str);

	1458 }

	1459 }

	1460 }

	1461 if (strings != NULL && strings->size() > 0) {

	1462 if (attribute & USET_CASE_INSENSITIVE) {

	1463 for (int32_t j=0; j<strings->size(); ++j) {

	1464 str = (const UnicodeString ) strings->elementAt(j);

	1465 str.foldCase();

	1466 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str .length(), &sa)) {

	1467 foldSet.add(str); // does not map to code points: ad d the folded string itself

	1468 }

	1469 }

	1470 } else {

	1471 Locale root("");

	1472 #if !UCONFIG_NO_BREAK_ITERATION

	1473 UErrorCode status = U_ZERO_ERROR;

	1474 BreakIterator *bi = BreakIterator::createWordInstance(root, status);

	1475 if (U_SUCCESS(status)) {

	1476 #endif

	1477 const UnicodeString *pStr;

	1478

	1479 for (int32_t j=0; j<strings->size(); ++j) {

	1480 pStr = (const UnicodeString *) strings->elementAt(j) ;

	1481 (str = *pStr).toLower(root);

	1482 foldSet.add(str);

	1483 #if !UCONFIG_NO_BREAK_ITERATION

	1484 (str = *pStr).toTitle(bi, root);

	1485 foldSet.add(str);

	1486 #endif

	1487 (str = *pStr).toUpper(root);

	1488 foldSet.add(str);

	1489 (str = *pStr).foldCase();

	1490 foldSet.add(str);

	1491 }

	1492 #if !UCONFIG_NO_BREAK_ITERATION

	1493 }

	1494 delete bi;

	1495 #endif

	1496 }

	1497 }

	1498 *this = foldSet;

	1499 }

	1500 }

	1501 return *this;

	1502 }

	1503

	1504 U_NAMESPACE_END

OLD	NEW

« no previous file with comments | « icu46/source/common/uniset.cpp ('k') | icu46/source/common/unisetspan.h » ('j') | no next file with comments »