OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 1999-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: uniset_props.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2004aug25 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * Character property dependent functions moved here from uniset.cpp |
| 17 */ |
| 18 |
| 19 #include "unicode/utypes.h" |
| 20 #include "unicode/uniset.h" |
| 21 #include "unicode/parsepos.h" |
| 22 #include "unicode/uchar.h" |
| 23 #include "unicode/uscript.h" |
| 24 #include "unicode/symtable.h" |
| 25 #include "unicode/uset.h" |
| 26 #include "unicode/locid.h" |
| 27 #include "unicode/brkiter.h" |
| 28 #include "uset_imp.h" |
| 29 #include "ruleiter.h" |
| 30 #include "cmemory.h" |
| 31 #include "ucln_cmn.h" |
| 32 #include "util.h" |
| 33 #include "uvector.h" |
| 34 #include "uprops.h" |
| 35 #include "propname.h" |
| 36 #include "normalizer2impl.h" |
| 37 #include "ucase.h" |
| 38 #include "ubidi_props.h" |
| 39 #include "uinvchar.h" |
| 40 #include "uprops.h" |
| 41 #include "charstr.h" |
| 42 #include "cstring.h" |
| 43 #include "mutex.h" |
| 44 #include "umutex.h" |
| 45 #include "uassert.h" |
| 46 #include "hash.h" |
| 47 |
| 48 U_NAMESPACE_USE |
| 49 |
| 50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
| 51 |
| 52 // initial storage. Must be >= 0 |
| 53 // *** same as in uniset.cpp ! *** |
| 54 #define START_EXTRA 16 |
| 55 |
| 56 // Define UChar constants using hex for EBCDIC compatibility |
| 57 // Used #define to reduce private static exports and memory access time. |
| 58 #define SET_OPEN ((UChar)0x005B) /*[*/ |
| 59 #define SET_CLOSE ((UChar)0x005D) /*]*/ |
| 60 #define HYPHEN ((UChar)0x002D) /*-*/ |
| 61 #define COMPLEMENT ((UChar)0x005E) /*^*/ |
| 62 #define COLON ((UChar)0x003A) /*:*/ |
| 63 #define BACKSLASH ((UChar)0x005C) /*\*/ |
| 64 #define INTERSECTION ((UChar)0x0026) /*&*/ |
| 65 #define UPPER_U ((UChar)0x0055) /*U*/ |
| 66 #define LOWER_U ((UChar)0x0075) /*u*/ |
| 67 #define OPEN_BRACE ((UChar)123) /*{*/ |
| 68 #define CLOSE_BRACE ((UChar)125) /*}*/ |
| 69 #define UPPER_P ((UChar)0x0050) /*P*/ |
| 70 #define LOWER_P ((UChar)0x0070) /*p*/ |
| 71 #define UPPER_N ((UChar)78) /*N*/ |
| 72 #define EQUALS ((UChar)0x003D) /*=*/ |
| 73 |
| 74 //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" |
| 75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" |
| 76 //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" |
| 77 static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" |
| 78 //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" |
| 79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ |
| 80 |
| 81 // Special property set IDs |
| 82 static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] |
| 83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F] |
| 84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:] |
| 85 |
| 86 // Unicode name property alias |
| 87 #define NAME_PROP "na" |
| 88 #define NAME_PROP_LENGTH 2 |
| 89 |
| 90 /** |
| 91 * Delimiter string used in patterns to close a category reference: |
| 92 * ":]". Example: "[:Lu:]". |
| 93 */ |
| 94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ |
| 95 |
| 96 // Cached sets ------------------------------------------------------------- *** |
| 97 |
| 98 U_CDECL_BEGIN |
| 99 static UBool U_CALLCONV uset_cleanup(); |
| 100 U_CDECL_END |
| 101 |
| 102 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor |
| 103 // can only fail with an out-of-memory error |
| 104 // if we have a correct pattern and the properties data is hardcoded and always
available. |
| 105 class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> { |
| 106 public: |
| 107 UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) : |
| 108 SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {} |
| 109 UnicodeSet *getInstance(UErrorCode &errorCode) { |
| 110 return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, f
Pattern, errorCode); |
| 111 } |
| 112 private: |
| 113 static void *createInstance(const void *context, UErrorCode &errorCode) { |
| 114 UnicodeString pattern((const char *)context, -1, US_INV); |
| 115 UnicodeSet *set=new UnicodeSet(pattern, errorCode); |
| 116 if(set==NULL) { |
| 117 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 118 } |
| 119 set->freeze(); |
| 120 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); |
| 121 return set; |
| 122 } |
| 123 |
| 124 const char *fPattern; |
| 125 }; |
| 126 |
| 127 U_CDECL_BEGIN |
| 128 |
| 129 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusi
ons() |
| 130 |
| 131 STATIC_SIMPLE_SINGLETON(uni32Singleton); |
| 132 |
| 133 //---------------------------------------------------------------- |
| 134 // Inclusions list |
| 135 //---------------------------------------------------------------- |
| 136 |
| 137 // USetAdder implementation |
| 138 // Does not use uset.h to reduce code dependencies |
| 139 static void U_CALLCONV |
| 140 _set_add(USet *set, UChar32 c) { |
| 141 ((UnicodeSet *)set)->add(c); |
| 142 } |
| 143 |
| 144 static void U_CALLCONV |
| 145 _set_addRange(USet *set, UChar32 start, UChar32 end) { |
| 146 ((UnicodeSet *)set)->add(start, end); |
| 147 } |
| 148 |
| 149 static void U_CALLCONV |
| 150 _set_addString(USet *set, const UChar *str, int32_t length) { |
| 151 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); |
| 152 } |
| 153 |
| 154 /** |
| 155 * Cleanup function for UnicodeSet |
| 156 */ |
| 157 static UBool U_CALLCONV uset_cleanup(void) { |
| 158 int32_t i; |
| 159 |
| 160 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { |
| 161 if (INCLUSIONS[i] != NULL) { |
| 162 delete INCLUSIONS[i]; |
| 163 INCLUSIONS[i] = NULL; |
| 164 } |
| 165 } |
| 166 UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance(); |
| 167 return TRUE; |
| 168 } |
| 169 |
| 170 U_CDECL_END |
| 171 |
| 172 U_NAMESPACE_BEGIN |
| 173 |
| 174 /* |
| 175 Reduce excessive reallocation, and make it easier to detect initialization |
| 176 problems. |
| 177 Usually you don't see smaller sets than this for Unicode 5.0. |
| 178 */ |
| 179 #define DEFAULT_INCLUSION_CAPACITY 3072 |
| 180 |
| 181 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { |
| 182 UBool needInit; |
| 183 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); |
| 184 if (needInit) { |
| 185 UnicodeSet* incl = new UnicodeSet(); |
| 186 USetAdder sa = { |
| 187 (USet *)incl, |
| 188 _set_add, |
| 189 _set_addRange, |
| 190 _set_addString, |
| 191 NULL, // don't need remove() |
| 192 NULL // don't need removeRange() |
| 193 }; |
| 194 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); |
| 195 if (incl != NULL) { |
| 196 switch(src) { |
| 197 case UPROPS_SRC_CHAR: |
| 198 uchar_addPropertyStarts(&sa, &status); |
| 199 break; |
| 200 case UPROPS_SRC_PROPSVEC: |
| 201 upropsvec_addPropertyStarts(&sa, &status); |
| 202 break; |
| 203 case UPROPS_SRC_CHAR_AND_PROPSVEC: |
| 204 uchar_addPropertyStarts(&sa, &status); |
| 205 upropsvec_addPropertyStarts(&sa, &status); |
| 206 break; |
| 207 #if !UCONFIG_NO_NORMALIZATION |
| 208 case UPROPS_SRC_CASE_AND_NORM: { |
| 209 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(statu
s); |
| 210 if(U_SUCCESS(status)) { |
| 211 impl->addPropertyStarts(&sa, status); |
| 212 } |
| 213 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); |
| 214 break; |
| 215 } |
| 216 case UPROPS_SRC_NFC: { |
| 217 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(statu
s); |
| 218 if(U_SUCCESS(status)) { |
| 219 impl->addPropertyStarts(&sa, status); |
| 220 } |
| 221 break; |
| 222 } |
| 223 case UPROPS_SRC_NFKC: { |
| 224 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(stat
us); |
| 225 if(U_SUCCESS(status)) { |
| 226 impl->addPropertyStarts(&sa, status); |
| 227 } |
| 228 break; |
| 229 } |
| 230 case UPROPS_SRC_NFKC_CF: { |
| 231 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(s
tatus); |
| 232 if(U_SUCCESS(status)) { |
| 233 impl->addPropertyStarts(&sa, status); |
| 234 } |
| 235 break; |
| 236 } |
| 237 case UPROPS_SRC_NFC_CANON_ITER: { |
| 238 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(statu
s); |
| 239 if(U_SUCCESS(status)) { |
| 240 impl->addCanonIterPropertyStarts(&sa, status); |
| 241 } |
| 242 break; |
| 243 } |
| 244 #endif |
| 245 case UPROPS_SRC_CASE: |
| 246 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); |
| 247 break; |
| 248 case UPROPS_SRC_BIDI: |
| 249 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); |
| 250 break; |
| 251 default: |
| 252 status = U_INTERNAL_PROGRAM_ERROR; |
| 253 break; |
| 254 } |
| 255 if (U_SUCCESS(status)) { |
| 256 // Compact for caching |
| 257 incl->compact(); |
| 258 umtx_lock(NULL); |
| 259 if (INCLUSIONS[src] == NULL) { |
| 260 INCLUSIONS[src] = incl; |
| 261 incl = NULL; |
| 262 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); |
| 263 } |
| 264 umtx_unlock(NULL); |
| 265 } |
| 266 delete incl; |
| 267 } else { |
| 268 status = U_MEMORY_ALLOCATION_ERROR; |
| 269 } |
| 270 } |
| 271 return INCLUSIONS[src]; |
| 272 } |
| 273 |
| 274 // Cache some sets for other services -------------------------------------- *** |
| 275 |
| 276 U_CFUNC UnicodeSet * |
| 277 uniset_getUnicode32Instance(UErrorCode &errorCode) { |
| 278 return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorC
ode); |
| 279 } |
| 280 |
| 281 // helper functions for matching of pattern syntax pieces ------------------ *** |
| 282 // these functions are parallel to the PERL_OPEN etc. strings above |
| 283 |
| 284 // using these functions is not only faster than UnicodeString::compare() and |
| 285 // caseCompare(), but they also make UnicodeSet work for simple patterns when |
| 286 // no Unicode properties data is available - when caseCompare() fails |
| 287 |
| 288 static inline UBool |
| 289 isPerlOpen(const UnicodeString &pattern, int32_t pos) { |
| 290 UChar c; |
| 291 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P
|| c==UPPER_P); |
| 292 } |
| 293 |
| 294 /*static inline UBool |
| 295 isPerlClose(const UnicodeString &pattern, int32_t pos) { |
| 296 return pattern.charAt(pos)==CLOSE_BRACE; |
| 297 }*/ |
| 298 |
| 299 static inline UBool |
| 300 isNameOpen(const UnicodeString &pattern, int32_t pos) { |
| 301 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; |
| 302 } |
| 303 |
| 304 static inline UBool |
| 305 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { |
| 306 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; |
| 307 } |
| 308 |
| 309 /*static inline UBool |
| 310 isPOSIXClose(const UnicodeString &pattern, int32_t pos) { |
| 311 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; |
| 312 }*/ |
| 313 |
| 314 // TODO memory debugging provided inside uniset.cpp |
| 315 // could be made available here but probably obsolete with use of modern |
| 316 // memory leak checker tools |
| 317 #define _dbgct(me) |
| 318 |
| 319 //---------------------------------------------------------------- |
| 320 // Constructors &c |
| 321 //---------------------------------------------------------------- |
| 322 |
| 323 /** |
| 324 * Constructs a set from the given pattern, optionally ignoring |
| 325 * white space. See the class description for the syntax of the |
| 326 * pattern language. |
| 327 * @param pattern a string specifying what characters are in the set |
| 328 */ |
| 329 UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
| 330 UErrorCode& status) : |
| 331 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
| 332 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| 333 fFlags(0) |
| 334 { |
| 335 if(U_SUCCESS(status)){ |
| 336 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| 337 /* test for NULL */ |
| 338 if(list == NULL) { |
| 339 status = U_MEMORY_ALLOCATION_ERROR; |
| 340 }else{ |
| 341 allocateStrings(status); |
| 342 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); |
| 343 } |
| 344 } |
| 345 _dbgct(this); |
| 346 } |
| 347 |
| 348 /** |
| 349 * Constructs a set from the given pattern, optionally ignoring |
| 350 * white space. See the class description for the syntax of the |
| 351 * pattern language. |
| 352 * @param pattern a string specifying what characters are in the set |
| 353 * @param options bitmask for options to apply to the pattern. |
| 354 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. |
| 355 */ |
| 356 UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
| 357 uint32_t options, |
| 358 const SymbolTable* symbols, |
| 359 UErrorCode& status) : |
| 360 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
| 361 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| 362 fFlags(0) |
| 363 { |
| 364 if(U_SUCCESS(status)){ |
| 365 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| 366 /* test for NULL */ |
| 367 if(list == NULL) { |
| 368 status = U_MEMORY_ALLOCATION_ERROR; |
| 369 }else{ |
| 370 allocateStrings(status); |
| 371 applyPattern(pattern, options, symbols, status); |
| 372 } |
| 373 } |
| 374 _dbgct(this); |
| 375 } |
| 376 |
| 377 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, |
| 378 uint32_t options, |
| 379 const SymbolTable* symbols, |
| 380 UErrorCode& status) : |
| 381 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
| 382 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| 383 fFlags(0) |
| 384 { |
| 385 if(U_SUCCESS(status)){ |
| 386 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| 387 /* test for NULL */ |
| 388 if(list == NULL) { |
| 389 status = U_MEMORY_ALLOCATION_ERROR; |
| 390 }else{ |
| 391 allocateStrings(status); |
| 392 applyPattern(pattern, pos, options, symbols, status); |
| 393 } |
| 394 } |
| 395 _dbgct(this); |
| 396 } |
| 397 |
| 398 //---------------------------------------------------------------- |
| 399 // Public API |
| 400 //---------------------------------------------------------------- |
| 401 |
| 402 /** |
| 403 * Modifies this set to represent the set specified by the given |
| 404 * pattern, optionally ignoring white space. See the class |
| 405 * description for the syntax of the pattern language. |
| 406 * @param pattern a string specifying what characters are in the set |
| 407 * @param ignoreSpaces if <code>true</code>, all spaces in the |
| 408 * pattern are ignored. Spaces are those characters for which |
| 409 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>. |
| 410 * Characters preceded by '\\' are escaped, losing any special |
| 411 * meaning they otherwise have. Spaces may be included by |
| 412 * escaping them. |
| 413 * @exception <code>IllegalArgumentException</code> if the pattern |
| 414 * contains a syntax error. |
| 415 */ |
| 416 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
| 417 UErrorCode& status) { |
| 418 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); |
| 419 } |
| 420 |
| 421 |
| 422 /** |
| 423 * Modifies this set to represent the set specified by the given |
| 424 * pattern, optionally ignoring white space. See the class |
| 425 * description for the syntax of the pattern language. |
| 426 * @param pattern a string specifying what characters are in the set |
| 427 * @param options bitmask for options to apply to the pattern. |
| 428 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. |
| 429 */ |
| 430 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
| 431 uint32_t options, |
| 432 const SymbolTable* symbols, |
| 433 UErrorCode& status) { |
| 434 if (U_FAILURE(status) || isFrozen()) { |
| 435 return *this; |
| 436 } |
| 437 |
| 438 ParsePosition pos(0); |
| 439 applyPattern(pattern, pos, options, symbols, status); |
| 440 if (U_FAILURE(status)) return *this; |
| 441 |
| 442 int32_t i = pos.getIndex(); |
| 443 |
| 444 if (options & USET_IGNORE_SPACE) { |
| 445 // Skip over trailing whitespace |
| 446 ICU_Utility::skipWhitespace(pattern, i, TRUE); |
| 447 } |
| 448 |
| 449 if (i != pattern.length()) { |
| 450 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 451 } |
| 452 return *this; |
| 453 } |
| 454 |
| 455 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
| 456 ParsePosition& pos, |
| 457 uint32_t options, |
| 458 const SymbolTable* symbols, |
| 459 UErrorCode& status) { |
| 460 if (U_FAILURE(status) || isFrozen()) { |
| 461 return *this; |
| 462 } |
| 463 // Need to build the pattern in a temporary string because |
| 464 // _applyPattern calls add() etc., which set pat to empty. |
| 465 UnicodeString rebuiltPat; |
| 466 RuleCharacterIterator chars(pattern, symbols, pos); |
| 467 applyPattern(chars, symbols, rebuiltPat, options, status); |
| 468 if (U_FAILURE(status)) return *this; |
| 469 if (chars.inVariable()) { |
| 470 // syntaxError(chars, "Extra chars in variable value"); |
| 471 status = U_MALFORMED_SET; |
| 472 return *this; |
| 473 } |
| 474 setPattern(rebuiltPat); |
| 475 return *this; |
| 476 } |
| 477 |
| 478 /** |
| 479 * Return true if the given position, in the given pattern, appears |
| 480 * to be the start of a UnicodeSet pattern. |
| 481 */ |
| 482 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { |
| 483 return ((pos+1) < pattern.length() && |
| 484 pattern.charAt(pos) == (UChar)91/*[*/) || |
| 485 resemblesPropertyPattern(pattern, pos); |
| 486 } |
| 487 |
| 488 //---------------------------------------------------------------- |
| 489 // Implementation: Pattern parsing |
| 490 //---------------------------------------------------------------- |
| 491 |
| 492 /** |
| 493 * A small all-inline class to manage a UnicodeSet pointer. Add |
| 494 * operator->() etc. as needed. |
| 495 */ |
| 496 class UnicodeSetPointer { |
| 497 UnicodeSet* p; |
| 498 public: |
| 499 inline UnicodeSetPointer() : p(0) {} |
| 500 inline ~UnicodeSetPointer() { delete p; } |
| 501 inline UnicodeSet* pointer() { return p; } |
| 502 inline UBool allocate() { |
| 503 if (p == 0) { |
| 504 p = new UnicodeSet(); |
| 505 } |
| 506 return p != 0; |
| 507 } |
| 508 }; |
| 509 |
| 510 /** |
| 511 * Parse the pattern from the given RuleCharacterIterator. The |
| 512 * iterator is advanced over the parsed pattern. |
| * This set is cleared before parsing starts; an early error return |
| * leaves whatever had been added up to that point. |
| * Nested sets and property patterns are parsed into a scratch set |
| * and combined into this set with add, '-' (remove) or '&' (retain). |
| 513 * @param chars iterator over the pattern characters. Upon return |
| 514 * it will be advanced to the first character after the parsed |
| 515 * pattern, or the end of the iteration if all characters are |
| 516 * parsed. |
| 517 * @param symbols symbol table to use to parse and dereference |
| 518 * variables, or null if none. |
| 519 * @param rebuiltPat the pattern that was parsed, rebuilt or |
| 520 * copied from the input pattern, as appropriate. |
| 521 * @param options a bit mask of zero or more of the following: |
| 522 * IGNORE_SPACE, CASE. |
| * The code tests USET_IGNORE_SPACE, USET_CASE_INSENSITIVE and |
| * USET_ADD_CASE_MAPPINGS. |
| * @param ec in/out error code. The function returns immediately if it |
| * already indicates failure. Set to U_MALFORMED_SET on syntax errors |
| * and to U_MEMORY_ALLOCATION_ERROR if the scratch set cannot be |
| * allocated or this set is bogus after parsing. |
| 523 */ |
| 524 void UnicodeSet::applyPattern(RuleCharacterIterator& chars, |
| 525 const SymbolTable* symbols, |
| 526 UnicodeString& rebuiltPat, |
| 527 uint32_t options, |
| 528 UErrorCode& ec) { |
| 529 if (U_FAILURE(ec)) return; |
| 530 |
| 531 // Syntax characters: [ ] ^ - & { } |
| 532 |
| 533 // Recognized special forms for chars, sets: c-c s-s s&s |
| 534 |
| 535 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | |
| 536 RuleCharacterIterator::PARSE_ESCAPES; |
| 537 if ((options & USET_IGNORE_SPACE) != 0) { |
| 538 opts |= RuleCharacterIterator::SKIP_WHITESPACE; |
| 539 } |
| 540 |
| 541 UnicodeString patLocal, buf; |
| 542 UBool usePat = FALSE; |
| 543 UnicodeSetPointer scratch; /* owns the lazily allocated set used for nested patterns */ |
| 544 RuleCharacterIterator::Pos backup; |
| 545 |
| 546 // mode: 0=before [, 1=between [...], 2=after ] |
| 547 // lastItem: 0=none, 1=char, 2=set |
| 548 int8_t lastItem = 0, mode = 0; |
| 549 UChar32 lastChar = 0; |
| 550 UChar op = 0; |
| 551 |
| 552 UBool invert = FALSE; |
| 553 |
| 554 clear(); /* discard the previous contents of this set before parsing */ |
| 555 |
| 556 while (mode != 2 && !chars.atEnd()) { |
| 557 U_ASSERT((lastItem == 0 && op == 0) || |
| 558 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || |
| 559 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || |
| 560 op == INTERSECTION /*'&'*/))); |
| 561 |
| 562 UChar32 c = 0; |
| 563 UBool literal = FALSE; |
| 564 UnicodeSet* nested = 0; // alias - do not delete |
| 565 |
| 566 // -------- Check for property pattern |
| 567 |
| 568 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed |
| 569 int8_t setMode = 0; |
| 570 if (resemblesPropertyPattern(chars, opts)) { |
| 571 setMode = 2; |
| 572 } |
| 573 |
| 574 // -------- Parse '[' of opening delimiter OR nested set. |
| 575 // If there is a nested set, use `setMode' to define how |
| 576 // the set should be parsed. If the '[' is part of the |
| 577 // opening delimiter for this pattern, parse special |
| 578 // strings "[", "[^", "[-", and "[^-". Check for stand-in |
| 579 // characters representing a nested set in the symbol |
| 580 // table. |
| 581 |
| 582 else { |
| 583 // Prepare to backup if necessary |
| 584 chars.getPos(backup); |
| 585 c = chars.next(opts, literal, ec); |
| 586 if (U_FAILURE(ec)) return; |
| 587 |
| 588 if (c == 0x5B /*'['*/ && !literal) { |
| 589 if (mode == 1) { |
| 590 chars.setPos(backup); // backup |
| 591 setMode = 1; |
| 592 } else { |
| 593 // Handle opening '[' delimiter |
| 594 mode = 1; |
| 595 patLocal.append((UChar) 0x5B /*'['*/); |
| 596 chars.getPos(backup); // prepare to backup |
| 597 c = chars.next(opts, literal, ec); |
| 598 if (U_FAILURE(ec)) return; |
| 599 if (c == 0x5E /*'^'*/ && !literal) { |
| 600 invert = TRUE; |
| 601 patLocal.append((UChar) 0x5E /*'^'*/); |
| 602 chars.getPos(backup); // prepare to backup |
| 603 c = chars.next(opts, literal, ec); |
| 604 if (U_FAILURE(ec)) return; |
| 605 } |
| 606 // Fall through to handle special leading '-'; |
| 607 // otherwise restart loop for nested [], \p{}, etc. |
| 608 if (c == HYPHEN /*'-'*/) { |
| 609 literal = TRUE; |
| 610 // Fall through to handle literal '-' below |
| 611 } else { |
| 612 chars.setPos(backup); // backup |
| 613 continue; |
| 614 } |
| 615 } |
| 616 } else if (symbols != 0) { |
| 617 const UnicodeFunctor *m = symbols->lookupMatcher(c); |
| 618 if (m != 0) { |
| 619 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); |
| 620 if (ms == NULL) { |
| 621 ec = U_MALFORMED_SET; |
| 622 return; |
| 623 } |
| 624 // casting away const, but `nested' won't be modified |
| 625 // (important not to modify stored set) |
| 626 nested = const_cast<UnicodeSet*>(ms); |
| 627 setMode = 3; |
| 628 } |
| 629 } |
| 630 } |
| 631 |
| 632 // -------- Handle a nested set. This either is inline in |
| 633 // the pattern or represented by a stand-in that has |
| 634 // previously been parsed and was looked up in the symbol |
| 635 // table. |
| 636 |
| 637 if (setMode != 0) { |
| 638 if (lastItem == 1) { |
| 639 if (op != 0) { |
| 640 // syntaxError(chars, "Char expected after operator"); |
| 641 ec = U_MALFORMED_SET; |
| 642 return; |
| 643 } |
| 644 add(lastChar, lastChar); |
| 645 _appendToPat(patLocal, lastChar, FALSE); |
| 646 lastItem = 0; |
| 647 op = 0; |
| 648 } |
| 649 |
| 650 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { |
| 651 patLocal.append(op); |
| 652 } |
| 653 |
| 654 if (nested == 0) { |
| 655 // lazy allocation |
| 656 if (!scratch.allocate()) { |
| 657 ec = U_MEMORY_ALLOCATION_ERROR; |
| 658 return; |
| 659 } |
| 660 nested = scratch.pointer(); |
| 661 } |
| 662 switch (setMode) { |
| 663 case 1: |
| 664 nested->applyPattern(chars, symbols, patLocal, options, ec); |
| 665 break; |
| 666 case 2: |
| 667 chars.skipIgnored(opts); |
| 668 nested->applyPropertyPattern(chars, patLocal, ec); |
| 669 if (U_FAILURE(ec)) return; |
| 670 break; |
| 671 case 3: // `nested' already parsed |
| 672 nested->_toPattern(patLocal, FALSE); |
| 673 break; |
| 674 } |
| 675 |
| 676 usePat = TRUE; |
| 677 |
| 678 if (mode == 0) { |
| 679 // Entire pattern is a category; leave parse loop |
| 680 *this = *nested; |
| 681 mode = 2; |
| 682 break; |
| 683 } |
| 684 |
| 685 switch (op) { |
| 686 case HYPHEN: /*'-'*/ |
| 687 removeAll(*nested); |
| 688 break; |
| 689 case INTERSECTION: /*'&'*/ |
| 690 retainAll(*nested); |
| 691 break; |
| 692 case 0: |
| 693 addAll(*nested); |
| 694 break; |
| 695 } |
| 696 |
| 697 op = 0; |
| 698 lastItem = 2; |
| 699 |
| 700 continue; |
| 701 } |
| 702 |
| 703 if (mode == 0) { |
| 704 // syntaxError(chars, "Missing '['"); |
| 705 ec = U_MALFORMED_SET; |
| 706 return; |
| 707 } |
| 708 |
| 709 // -------- Parse special (syntax) characters. If the |
| 710 // current character is not special, or if it is escaped, |
| 711 // then fall through and handle it below. |
| 712 |
| 713 if (!literal) { |
| 714 switch (c) { |
| 715 case 0x5D /*']'*/: |
| 716 if (lastItem == 1) { |
| 717 add(lastChar, lastChar); |
| 718 _appendToPat(patLocal, lastChar, FALSE); |
| 719 } |
| 720 // Treat final trailing '-' as a literal |
| 721 if (op == HYPHEN /*'-'*/) { |
| 722 add(op, op); |
| 723 patLocal.append(op); |
| 724 } else if (op == INTERSECTION /*'&'*/) { |
| 725 // syntaxError(chars, "Trailing '&'"); |
| 726 ec = U_MALFORMED_SET; |
| 727 return; |
| 728 } |
| 729 patLocal.append((UChar) 0x5D /*']'*/); |
| 730 mode = 2; |
| 731 continue; |
| 732 case HYPHEN /*'-'*/: |
| 733 if (op == 0) { |
| 734 if (lastItem != 0) { |
| 735 op = (UChar) c; |
| 736 continue; |
| 737 } else { |
| 738 // Treat final trailing '-' as a literal |
| 739 add(c, c); |
| 740 c = chars.next(opts, literal, ec); |
| 741 if (U_FAILURE(ec)) return; |
| 742 if (c == 0x5D /*']'*/ && !literal) { |
| 743 patLocal.append(HYPHEN_RIGHT_BRACE); |
| 744 mode = 2; |
| 745 continue; |
| 746 } |
| 747 } |
| 748 } |
| 749 // syntaxError(chars, "'-' not after char or set"); |
| 750 ec = U_MALFORMED_SET; |
| 751 return; |
| 752 case INTERSECTION /*'&'*/: |
| 753 if (lastItem == 2 && op == 0) { |
| 754 op = (UChar) c; |
| 755 continue; |
| 756 } |
| 757 // syntaxError(chars, "'&' not after set"); |
| 758 ec = U_MALFORMED_SET; |
| 759 return; |
| 760 case 0x5E /*'^'*/: |
| 761 // syntaxError(chars, "'^' not after '['"); |
| 762 ec = U_MALFORMED_SET; |
| 763 return; |
| 764 case 0x7B /*'{'*/: |
| 765 if (op != 0) { |
| 766 // syntaxError(chars, "Missing operand after operator"); |
| 767 ec = U_MALFORMED_SET; |
| 768 return; |
| 769 } |
| 770 if (lastItem == 1) { |
| 771 add(lastChar, lastChar); |
| 772 _appendToPat(patLocal, lastChar, FALSE); |
| 773 } |
| 774 lastItem = 0; |
| 775 buf.truncate(0); |
| 776 { |
| 777 UBool ok = FALSE; |
| 778 while (!chars.atEnd()) { |
| 779 c = chars.next(opts, literal, ec); |
| 780 if (U_FAILURE(ec)) return; |
| 781 if (c == 0x7D /*'}'*/ && !literal) { |
| 782 ok = TRUE; |
| 783 break; |
| 784 } |
| 785 buf.append(c); |
| 786 } |
| 787 if (buf.length() < 1 || !ok) { |
| 788 // syntaxError(chars, "Invalid multicharacter string"); |
| 789 ec = U_MALFORMED_SET; |
| 790 return; |
| 791 } |
| 792 } |
| 793 // We have new string. Add it to set and continue; |
| 794 // we don't need to drop through to the further |
| 795 // processing |
| 796 add(buf); |
| 797 patLocal.append((UChar) 0x7B /*'{'*/); |
| 798 _appendToPat(patLocal, buf, FALSE); |
| 799 patLocal.append((UChar) 0x7D /*'}'*/); |
| 800 continue; |
| 801 case SymbolTable::SYMBOL_REF: |
| 802 // symbols nosymbols |
| 803 // [a-$] error error (ambiguous) |
| 804 // [a$] anchor anchor |
| 805 // [a-$x] var "x"* literal '$' |
| 806 // [a-$.] error literal '$' |
| 807 // *We won't get here in the case of var "x" |
| 808 { |
| 809 chars.getPos(backup); |
| 810 c = chars.next(opts, literal, ec); |
| 811 if (U_FAILURE(ec)) return; |
| 812 UBool anchor = (c == 0x5D /*']'*/ && !literal); |
| 813 if (symbols == 0 && !anchor) { |
| 814 c = SymbolTable::SYMBOL_REF; |
| 815 chars.setPos(backup); |
| 816 break; // literal '$' |
| 817 } |
| 818 if (anchor && op == 0) { |
| 819 if (lastItem == 1) { |
| 820 add(lastChar, lastChar); |
| 821 _appendToPat(patLocal, lastChar, FALSE); |
| 822 } |
| 823 add(U_ETHER); |
| 824 usePat = TRUE; |
| 825 patLocal.append((UChar) SymbolTable::SYMBOL_REF); |
| 826 patLocal.append((UChar) 0x5D /*']'*/); |
| 827 mode = 2; |
| 828 continue; |
| 829 } |
| 830 // syntaxError(chars, "Unquoted '$'"); |
| 831 ec = U_MALFORMED_SET; |
| 832 return; |
| 833 } |
| 834 default: |
| 835 break; |
| 836 } |
| 837 } |
| 838 |
| 839 // -------- Parse literal characters. This includes both |
| 840 // escaped chars ("\u4E01") and non-syntax characters |
| 841 // ("a"). |
| 842 |
| 843 switch (lastItem) { |
| 844 case 0: |
| 845 lastItem = 1; |
| 846 lastChar = c; |
| 847 break; |
| 848 case 1: |
| 849 if (op == HYPHEN /*'-'*/) { |
| 850 if (lastChar >= c) { |
| 851 // Don't allow redundant (a-a) or empty (b-a) ranges; |
| 852 // these are most likely typos. |
| 853 // syntaxError(chars, "Invalid range"); |
| 854 ec = U_MALFORMED_SET; |
| 855 return; |
| 856 } |
| 857 add(lastChar, c); |
| 858 _appendToPat(patLocal, lastChar, FALSE); |
| 859 patLocal.append(op); |
| 860 _appendToPat(patLocal, c, FALSE); |
| 861 lastItem = 0; |
| 862 op = 0; |
| 863 } else { |
| 864 add(lastChar, lastChar); |
| 865 _appendToPat(patLocal, lastChar, FALSE); |
| 866 lastChar = c; |
| 867 } |
| 868 break; |
| 869 case 2: |
| 870 if (op != 0) { |
| 871 // syntaxError(chars, "Set expected after operator"); |
| 872 ec = U_MALFORMED_SET; |
| 873 return; |
| 874 } |
| 875 lastChar = c; |
| 876 lastItem = 1; |
| 877 break; |
| 878 } |
| 879 } |
| 880 |
| 881 if (mode != 2) { |
| 882 // syntaxError(chars, "Missing ']'"); |
| 883 ec = U_MALFORMED_SET; |
| 884 return; |
| 885 } |
| 886 |
| 887 chars.skipIgnored(opts); |
| 888 |
| 889 /** |
| 890 * Handle global flags (invert, case insensitivity). If this |
| 891 * pattern should be compiled case-insensitive, then we need |
| 892 * to close over case BEFORE COMPLEMENTING. This makes |
| 893 * patterns like /[^abc]/i work. |
| 894 */ |
| 895 if ((options & USET_CASE_INSENSITIVE) != 0) { |
| 896 closeOver(USET_CASE_INSENSITIVE); |
| 897 } |
| 898 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { /* reached only when USET_CASE_INSENSITIVE is not set */ |
| 899 closeOver(USET_ADD_CASE_MAPPINGS); |
| 900 } |
| 901 if (invert) { |
| 902 complement(); |
| 903 } |
| 904 |
| 905 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the |
| 906 // generated pattern. |
| 907 if (usePat) { |
| 908 rebuiltPat.append(patLocal); |
| 909 } else { |
| 910 _generatePattern(rebuiltPat, FALSE); |
| 911 } |
| 912 if (isBogus() && U_SUCCESS(ec)) { |
| 913 // We likely ran out of memory. AHHH! |
| 914 ec = U_MEMORY_ALLOCATION_ERROR; |
| 915 } |
| 916 } |
| 917 |
| 918 //---------------------------------------------------------------- |
| 919 // Property set implementation |
| 920 //---------------------------------------------------------------- |
| 921 |
| 922 static UBool numericValueFilter(UChar32 ch, void* context) { |
| 923 return u_getNumericValue(ch) == *(double*)context; |
| 924 } |
| 925 |
| 926 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { |
| 927 int32_t value = *(int32_t*)context; |
| 928 return (U_GET_GC_MASK((UChar32) ch) & value) != 0; |
| 929 } |
| 930 |
| 931 static UBool versionFilter(UChar32 ch, void* context) { |
| 932 static const UVersionInfo none = { 0, 0, 0, 0 }; |
| 933 UVersionInfo v; |
| 934 u_charAge(ch, v); |
| 935 UVersionInfo* version = (UVersionInfo*)context; |
| 936 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, siz
eof(v)) <= 0; |
| 937 } |
| 938 |
| 939 typedef struct { |
| 940 UProperty prop; |
| 941 int32_t value; |
| 942 } IntPropertyContext; |
| 943 |
| 944 static UBool intPropertyFilter(UChar32 ch, void* context) { |
| 945 IntPropertyContext* c = (IntPropertyContext*)context; |
| 946 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; |
| 947 } |
| 948 |
| 949 static UBool scriptExtensionsFilter(UChar32 ch, void* context) { |
| 950 return uscript_hasScript(ch, *(UScriptCode*)context); |
| 951 } |
| 952 |
| 953 /** |
| 954 * Generic filter-based scanning code for UCD property UnicodeSets. |
| 955 */ |
| 956 void UnicodeSet::applyFilter(UnicodeSet::Filter filter, |
| 957 void* context, |
| 958 int32_t src, |
| 959 UErrorCode &status) { |
| 960 if (U_FAILURE(status)) return; |
| 961 |
| 962 // Logically, walk through all Unicode characters, noting the start |
| 963 // and end of each range for which filter.contain(c) is |
| 964 // true. Add each range to a set. |
| 965 // |
| 966 // To improve performance, use an inclusions set which |
| 967 // encodes information about character ranges that are known |
| 968 // to have identical properties. |
| 969 // getInclusions(src) contains exactly the first characters of |
| 970 // same-value ranges for the given properties "source". |
| 971 const UnicodeSet* inclusions = getInclusions(src, status); |
| 972 if (U_FAILURE(status)) { |
| 973 return; |
| 974 } |
| 975 |
| 976 clear(); |
| 977 |
| 978 UChar32 startHasProperty = -1; |
| 979 int32_t limitRange = inclusions->getRangeCount(); |
| 980 |
| 981 for (int j=0; j<limitRange; ++j) { |
| 982 // get current range |
| 983 UChar32 start = inclusions->getRangeStart(j); |
| 984 UChar32 end = inclusions->getRangeEnd(j); |
| 985 |
| 986 // for all the code points in the range, process |
| 987 for (UChar32 ch = start; ch <= end; ++ch) { |
| 988 // only add to this UnicodeSet on inflection points -- |
| 989 // where the hasProperty value changes to false |
| 990 if ((*filter)(ch, context)) { |
| 991 if (startHasProperty < 0) { |
| 992 startHasProperty = ch; |
| 993 } |
| 994 } else if (startHasProperty >= 0) { |
| 995 add(startHasProperty, ch-1); |
| 996 startHasProperty = -1; |
| 997 } |
| 998 } |
| 999 } |
| 1000 if (startHasProperty >= 0) { |
| 1001 add((UChar32)startHasProperty, (UChar32)0x10FFFF); |
| 1002 } |
| 1003 if (isBogus() && U_SUCCESS(status)) { |
| 1004 // We likely ran out of memory. AHHH! |
| 1005 status = U_MEMORY_ALLOCATION_ERROR; |
| 1006 } |
| 1007 } |
| 1008 |
| 1009 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { |
| 1010 /* Note: we use ' ' in compiler code page */ |
| 1011 int32_t j = 0; |
| 1012 char ch; |
| 1013 --dstCapacity; /* make room for term. zero */ |
| 1014 while ((ch = *src++) != 0) { |
| 1015 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { |
| 1016 continue; |
| 1017 } |
| 1018 if (j >= dstCapacity) return FALSE; |
| 1019 dst[j++] = ch; |
| 1020 } |
| 1021 if (j > 0 && dst[j-1] == ' ') --j; |
| 1022 dst[j] = 0; |
| 1023 return TRUE; |
| 1024 } |
| 1025 |
| 1026 //---------------------------------------------------------------- |
| 1027 // Property set API |
| 1028 //---------------------------------------------------------------- |
| 1029 |
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// member function. Wrapped in do/while(0) so that "FAIL(ec);" expands to
// exactly one statement and stays correct inside an unbraced if/else.
#define FAIL(ec) do { (ec)=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (0)
| 1031 |
| 1032 UnicodeSet& |
| 1033 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
{ |
| 1034 if (U_FAILURE(ec) || isFrozen()) return *this; |
| 1035 |
| 1036 if (prop == UCHAR_GENERAL_CATEGORY_MASK) { |
| 1037 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); |
| 1038 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { |
| 1039 UScriptCode script = (UScriptCode)value; |
| 1040 applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); |
| 1041 } else { |
| 1042 IntPropertyContext c = {prop, value}; |
| 1043 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); |
| 1044 } |
| 1045 return *this; |
| 1046 } |
| 1047 |
| 1048 UnicodeSet& |
| 1049 UnicodeSet::applyPropertyAlias(const UnicodeString& prop, |
| 1050 const UnicodeString& value, |
| 1051 UErrorCode& ec) { |
| 1052 if (U_FAILURE(ec) || isFrozen()) return *this; |
| 1053 |
| 1054 // prop and value used to be converted to char * using the default |
| 1055 // converter instead of the invariant conversion. |
| 1056 // This should not be necessary because all Unicode property and value |
| 1057 // names use only invariant characters. |
| 1058 // If there are any variant characters, then we won't find them anyway. |
| 1059 // Checking first avoids assertion failures in the conversion. |
| 1060 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || |
| 1061 !uprv_isInvariantUString(value.getBuffer(), value.length()) |
| 1062 ) { |
| 1063 FAIL(ec); |
| 1064 } |
| 1065 CharString pname, vname; |
| 1066 pname.appendInvariantChars(prop, ec); |
| 1067 vname.appendInvariantChars(value, ec); |
| 1068 if (U_FAILURE(ec)) return *this; |
| 1069 |
| 1070 UProperty p; |
| 1071 int32_t v; |
| 1072 UBool mustNotBeEmpty = FALSE, invert = FALSE; |
| 1073 |
| 1074 if (value.length() > 0) { |
| 1075 p = u_getPropertyEnum(pname.data()); |
| 1076 if (p == UCHAR_INVALID_CODE) FAIL(ec); |
| 1077 |
| 1078 // Treat gc as gcm |
| 1079 if (p == UCHAR_GENERAL_CATEGORY) { |
| 1080 p = UCHAR_GENERAL_CATEGORY_MASK; |
| 1081 } |
| 1082 |
| 1083 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || |
| 1084 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || |
| 1085 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { |
| 1086 v = u_getPropertyValueEnum(p, vname.data()); |
| 1087 if (v == UCHAR_INVALID_CODE) { |
| 1088 // Handle numeric CCC |
| 1089 if (p == UCHAR_CANONICAL_COMBINING_CLASS || |
| 1090 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || |
| 1091 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { |
| 1092 char* end; |
| 1093 double value = uprv_strtod(vname.data(), &end); |
| 1094 v = (int32_t) value; |
| 1095 if (v != value || v < 0 || *end != 0) { |
| 1096 // non-integral or negative value, or trailing junk |
| 1097 FAIL(ec); |
| 1098 } |
| 1099 // If the resultant set is empty then the numeric value |
| 1100 // was invalid. |
| 1101 mustNotBeEmpty = TRUE; |
| 1102 } else { |
| 1103 FAIL(ec); |
| 1104 } |
| 1105 } |
| 1106 } |
| 1107 |
| 1108 else { |
| 1109 |
| 1110 switch (p) { |
| 1111 case UCHAR_NUMERIC_VALUE: |
| 1112 { |
| 1113 char* end; |
| 1114 double value = uprv_strtod(vname.data(), &end); |
| 1115 if (*end != 0) { |
| 1116 FAIL(ec); |
| 1117 } |
| 1118 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec)
; |
| 1119 return *this; |
| 1120 } |
| 1121 break; |
| 1122 case UCHAR_NAME: |
| 1123 case UCHAR_UNICODE_1_NAME: |
| 1124 { |
| 1125 // Must munge name, since u_charFromName() does not do |
| 1126 // 'loose' matching. |
| 1127 char buf[128]; // it suffices that this be > uprv_getMaxChar
NameLength |
| 1128 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec)
; |
| 1129 UCharNameChoice choice = (p == UCHAR_NAME) ? |
| 1130 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME; |
| 1131 UChar32 ch = u_charFromName(choice, buf, &ec); |
| 1132 if (U_SUCCESS(ec)) { |
| 1133 clear(); |
| 1134 add(ch); |
| 1135 return *this; |
| 1136 } else { |
| 1137 FAIL(ec); |
| 1138 } |
| 1139 } |
| 1140 break; |
| 1141 case UCHAR_AGE: |
| 1142 { |
| 1143 // Must munge name, since u_versionFromString() does not do |
| 1144 // 'loose' matching. |
| 1145 char buf[128]; |
| 1146 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec)
; |
| 1147 UVersionInfo version; |
| 1148 u_versionFromString(version, buf); |
| 1149 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec
); |
| 1150 return *this; |
| 1151 } |
| 1152 break; |
| 1153 case UCHAR_SCRIPT_EXTENSIONS: |
| 1154 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); |
| 1155 if (v == UCHAR_INVALID_CODE) { |
| 1156 FAIL(ec); |
| 1157 } |
| 1158 // fall through to calling applyIntPropertyValue() |
| 1159 break; |
| 1160 default: |
| 1161 // p is a non-binary, non-enumerated property that we |
| 1162 // don't support (yet). |
| 1163 FAIL(ec); |
| 1164 } |
| 1165 } |
| 1166 } |
| 1167 |
| 1168 else { |
| 1169 // value is empty. Interpret as General Category, Script, or |
| 1170 // Binary property. |
| 1171 p = UCHAR_GENERAL_CATEGORY_MASK; |
| 1172 v = u_getPropertyValueEnum(p, pname.data()); |
| 1173 if (v == UCHAR_INVALID_CODE) { |
| 1174 p = UCHAR_SCRIPT; |
| 1175 v = u_getPropertyValueEnum(p, pname.data()); |
| 1176 if (v == UCHAR_INVALID_CODE) { |
| 1177 p = u_getPropertyEnum(pname.data()); |
| 1178 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { |
| 1179 v = 1; |
| 1180 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { |
| 1181 set(MIN_VALUE, MAX_VALUE); |
| 1182 return *this; |
| 1183 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data()))
{ |
| 1184 set(0, 0x7F); |
| 1185 return *this; |
| 1186 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data()
)) { |
| 1187 // [:Assigned:]=[:^Cn:] |
| 1188 p = UCHAR_GENERAL_CATEGORY_MASK; |
| 1189 v = U_GC_CN_MASK; |
| 1190 invert = TRUE; |
| 1191 } else { |
| 1192 FAIL(ec); |
| 1193 } |
| 1194 } |
| 1195 } |
| 1196 } |
| 1197 |
| 1198 applyIntPropertyValue(p, v, ec); |
| 1199 if(invert) { |
| 1200 complement(); |
| 1201 } |
| 1202 |
| 1203 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { |
| 1204 // mustNotBeEmpty is set to true if an empty set indicates |
| 1205 // invalid input. |
| 1206 ec = U_ILLEGAL_ARGUMENT_ERROR; |
| 1207 } |
| 1208 |
| 1209 if (isBogus() && U_SUCCESS(ec)) { |
| 1210 // We likely ran out of memory. AHHH! |
| 1211 ec = U_MEMORY_ALLOCATION_ERROR; |
| 1212 } |
| 1213 return *this; |
| 1214 } |
| 1215 |
| 1216 //---------------------------------------------------------------- |
| 1217 // Property set patterns |
| 1218 //---------------------------------------------------------------- |
| 1219 |
| 1220 /** |
| 1221 * Return true if the given position, in the given pattern, appears |
| 1222 * to be the start of a property set pattern. |
| 1223 */ |
| 1224 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, |
| 1225 int32_t pos) { |
| 1226 // Patterns are at least 5 characters long |
| 1227 if ((pos+5) > pattern.length()) { |
| 1228 return FALSE; |
| 1229 } |
| 1230 |
| 1231 // Look for an opening [:, [:^, \p, or \P |
| 1232 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(p
attern, pos); |
| 1233 } |
| 1234 |
| 1235 /** |
| 1236 * Return true if the given iterator appears to point at a |
| 1237 * property pattern. Regardless of the result, return with the |
| 1238 * iterator unchanged. |
| 1239 * @param chars iterator over the pattern characters. Upon return |
| 1240 * it will be unchanged. |
| 1241 * @param iterOpts RuleCharacterIterator options |
| 1242 */ |
| 1243 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, |
| 1244 int32_t iterOpts) { |
| 1245 // NOTE: literal will always be FALSE, because we don't parse escapes. |
| 1246 UBool result = FALSE, literal; |
| 1247 UErrorCode ec = U_ZERO_ERROR; |
| 1248 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; |
| 1249 RuleCharacterIterator::Pos pos; |
| 1250 chars.getPos(pos); |
| 1251 UChar32 c = chars.next(iterOpts, literal, ec); |
| 1252 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { |
| 1253 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPAC
E, |
| 1254 literal, ec); |
| 1255 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : |
| 1256 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); |
| 1257 } |
| 1258 chars.setPos(pos); |
| 1259 return result && U_SUCCESS(ec); |
| 1260 } |
| 1261 |
| 1262 /** |
| 1263 * Parse the given property pattern at the given parse position. |
| 1264 */ |
| 1265 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, |
| 1266 ParsePosition& ppos, |
| 1267 UErrorCode &ec) { |
| 1268 int32_t pos = ppos.getIndex(); |
| 1269 |
| 1270 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} |
| 1271 UBool isName = FALSE; // true for \N{pat}, o/w false |
| 1272 UBool invert = FALSE; |
| 1273 |
| 1274 if (U_FAILURE(ec)) return *this; |
| 1275 |
| 1276 // Minimum length is 5 characters, e.g. \p{L} |
| 1277 if ((pos+5) > pattern.length()) { |
| 1278 FAIL(ec); |
| 1279 } |
| 1280 |
| 1281 // On entry, ppos should point to one of the following locations: |
| 1282 // Look for an opening [:, [:^, \p, or \P |
| 1283 if (isPOSIXOpen(pattern, pos)) { |
| 1284 posix = TRUE; |
| 1285 pos += 2; |
| 1286 pos = ICU_Utility::skipWhitespace(pattern, pos); |
| 1287 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { |
| 1288 ++pos; |
| 1289 invert = TRUE; |
| 1290 } |
| 1291 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { |
| 1292 UChar c = pattern.charAt(pos+1); |
| 1293 invert = (c == UPPER_P); |
| 1294 isName = (c == UPPER_N); |
| 1295 pos += 2; |
| 1296 pos = ICU_Utility::skipWhitespace(pattern, pos); |
| 1297 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { |
| 1298 // Syntax error; "\p" or "\P" not followed by "{" |
| 1299 FAIL(ec); |
| 1300 } |
| 1301 } else { |
| 1302 // Open delimiter not seen |
| 1303 FAIL(ec); |
| 1304 } |
| 1305 |
| 1306 // Look for the matching close delimiter, either :] or } |
| 1307 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos); |
| 1308 if (close < 0) { |
| 1309 // Syntax error; close delimiter missing |
| 1310 FAIL(ec); |
| 1311 } |
| 1312 |
| 1313 // Look for an '=' sign. If this is present, we will parse a |
| 1314 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} |
| 1315 // pattern. |
| 1316 int32_t equals = pattern.indexOf(EQUALS, pos); |
| 1317 UnicodeString propName, valueName; |
| 1318 if (equals >= 0 && equals < close && !isName) { |
| 1319 // Equals seen; parse medium/long pattern |
| 1320 pattern.extractBetween(pos, equals, propName); |
| 1321 pattern.extractBetween(equals+1, close, valueName); |
| 1322 } |
| 1323 |
| 1324 else { |
| 1325 // Handle case where no '=' is seen, and \N{} |
| 1326 pattern.extractBetween(pos, close, propName); |
| 1327 |
| 1328 // Handle \N{name} |
| 1329 if (isName) { |
| 1330 // This is a little inefficient since it means we have to |
| 1331 // parse NAME_PROP back to UCHAR_NAME even though we already |
| 1332 // know it's UCHAR_NAME. If we refactor the API to |
| 1333 // support args of (UProperty, char*) then we can remove |
| 1334 // NAME_PROP and make this a little more efficient. |
| 1335 valueName = propName; |
| 1336 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); |
| 1337 } |
| 1338 } |
| 1339 |
| 1340 applyPropertyAlias(propName, valueName, ec); |
| 1341 |
| 1342 if (U_SUCCESS(ec)) { |
| 1343 if (invert) { |
| 1344 complement(); |
| 1345 } |
| 1346 |
| 1347 // Move to the limit position after the close delimiter if the |
| 1348 // parse succeeded. |
| 1349 ppos.setIndex(close + (posix ? 2 : 1)); |
| 1350 } |
| 1351 |
| 1352 return *this; |
| 1353 } |
| 1354 |
| 1355 /** |
| 1356 * Parse a property pattern. |
| 1357 * @param chars iterator over the pattern characters. Upon return |
| 1358 * it will be advanced to the first character after the parsed |
| 1359 * pattern, or the end of the iteration if all characters are |
| 1360 * parsed. |
| 1361 * @param rebuiltPat the pattern that was parsed, rebuilt or |
| 1362 * copied from the input pattern, as appropriate. |
| 1363 */ |
| 1364 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, |
| 1365 UnicodeString& rebuiltPat, |
| 1366 UErrorCode& ec) { |
| 1367 if (U_FAILURE(ec)) return; |
| 1368 UnicodeString pattern; |
| 1369 chars.lookahead(pattern); |
| 1370 ParsePosition pos(0); |
| 1371 applyPropertyPattern(pattern, pos, ec); |
| 1372 if (U_FAILURE(ec)) return; |
| 1373 if (pos.getIndex() == 0) { |
| 1374 // syntaxError(chars, "Invalid property pattern"); |
| 1375 ec = U_MALFORMED_SET; |
| 1376 return; |
| 1377 } |
| 1378 chars.jumpahead(pos.getIndex()); |
| 1379 rebuiltPat.append(pattern, 0, pos.getIndex()); |
| 1380 } |
| 1381 |
| 1382 //---------------------------------------------------------------- |
| 1383 // Case folding API |
| 1384 //---------------------------------------------------------------- |
| 1385 |
| 1386 // add the result of a full case mapping to the set |
| 1387 // use str as a temporary string to avoid constructing one |
| 1388 static inline void |
| 1389 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString
&str) { |
| 1390 if(result >= 0) { |
| 1391 if(result > UCASE_MAX_STRING_LENGTH) { |
| 1392 // add a single-code point case mapping |
| 1393 set.add(result); |
| 1394 } else { |
| 1395 // add a string case mapping from full with length result |
| 1396 str.setTo((UBool)FALSE, full, result); |
| 1397 set.add(str); |
| 1398 } |
| 1399 } |
| 1400 // result < 0: the code point mapped to itself, no need to add it |
| 1401 // see ucase.h |
| 1402 } |
| 1403 |
| 1404 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { |
| 1405 if (isFrozen() || isBogus()) { |
| 1406 return *this; |
| 1407 } |
| 1408 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { |
| 1409 const UCaseProps *csp = ucase_getSingleton(); |
| 1410 { |
| 1411 UnicodeSet foldSet(*this); |
| 1412 UnicodeString str; |
| 1413 USetAdder sa = { |
| 1414 foldSet.toUSet(), |
| 1415 _set_add, |
| 1416 _set_addRange, |
| 1417 _set_addString, |
| 1418 NULL, // don't need remove() |
| 1419 NULL // don't need removeRange() |
| 1420 }; |
| 1421 |
| 1422 // start with input set to guarantee inclusion |
| 1423 // USET_CASE: remove strings because the strings will actually be re
duced (folded); |
| 1424 // therefore, start with no strings and add only those ne
eded |
| 1425 if (attribute & USET_CASE_INSENSITIVE) { |
| 1426 foldSet.strings->removeAllElements(); |
| 1427 } |
| 1428 |
| 1429 int32_t n = getRangeCount(); |
| 1430 UChar32 result; |
| 1431 const UChar *full; |
| 1432 int32_t locCache = 0; |
| 1433 |
| 1434 for (int32_t i=0; i<n; ++i) { |
| 1435 UChar32 start = getRangeStart(i); |
| 1436 UChar32 end = getRangeEnd(i); |
| 1437 |
| 1438 if (attribute & USET_CASE_INSENSITIVE) { |
| 1439 // full case closure |
| 1440 for (UChar32 cp=start; cp<=end; ++cp) { |
| 1441 ucase_addCaseClosure(csp, cp, &sa); |
| 1442 } |
| 1443 } else { |
| 1444 // add case mappings |
| 1445 // (does not add long s for regular s, or Kelvin for k, for
example) |
| 1446 for (UChar32 cp=start; cp<=end; ++cp) { |
| 1447 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "
", &locCache); |
| 1448 addCaseMapping(foldSet, result, full, str); |
| 1449 |
| 1450 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "
", &locCache); |
| 1451 addCaseMapping(foldSet, result, full, str); |
| 1452 |
| 1453 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "
", &locCache); |
| 1454 addCaseMapping(foldSet, result, full, str); |
| 1455 |
| 1456 result = ucase_toFullFolding(csp, cp, &full, 0); |
| 1457 addCaseMapping(foldSet, result, full, str); |
| 1458 } |
| 1459 } |
| 1460 } |
| 1461 if (strings != NULL && strings->size() > 0) { |
| 1462 if (attribute & USET_CASE_INSENSITIVE) { |
| 1463 for (int32_t j=0; j<strings->size(); ++j) { |
| 1464 str = *(const UnicodeString *) strings->elementAt(j); |
| 1465 str.foldCase(); |
| 1466 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str
.length(), &sa)) { |
| 1467 foldSet.add(str); // does not map to code points: ad
d the folded string itself |
| 1468 } |
| 1469 } |
| 1470 } else { |
| 1471 Locale root(""); |
| 1472 #if !UCONFIG_NO_BREAK_ITERATION |
| 1473 UErrorCode status = U_ZERO_ERROR; |
| 1474 BreakIterator *bi = BreakIterator::createWordInstance(root,
status); |
| 1475 if (U_SUCCESS(status)) { |
| 1476 #endif |
| 1477 const UnicodeString *pStr; |
| 1478 |
| 1479 for (int32_t j=0; j<strings->size(); ++j) { |
| 1480 pStr = (const UnicodeString *) strings->elementAt(j)
; |
| 1481 (str = *pStr).toLower(root); |
| 1482 foldSet.add(str); |
| 1483 #if !UCONFIG_NO_BREAK_ITERATION |
| 1484 (str = *pStr).toTitle(bi, root); |
| 1485 foldSet.add(str); |
| 1486 #endif |
| 1487 (str = *pStr).toUpper(root); |
| 1488 foldSet.add(str); |
| 1489 (str = *pStr).foldCase(); |
| 1490 foldSet.add(str); |
| 1491 } |
| 1492 #if !UCONFIG_NO_BREAK_ITERATION |
| 1493 } |
| 1494 delete bi; |
| 1495 #endif |
| 1496 } |
| 1497 } |
| 1498 *this = foldSet; |
| 1499 } |
| 1500 } |
| 1501 return *this; |
| 1502 } |
| 1503 |
| 1504 U_NAMESPACE_END |
OLD | NEW |