OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2013-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationruleparser.cpp |
| 7 * |
| 8 * (replaced the former ucol_tok.cpp) |
| 9 * |
| 10 * created on: 2013apr10 |
| 11 * created by: Markus W. Scherer |
| 12 */ |
| 13 |
| 14 #include "unicode/utypes.h" |
| 15 |
| 16 #if !UCONFIG_NO_COLLATION |
| 17 |
| 18 #include "unicode/normalizer2.h" |
| 19 #include "unicode/parseerr.h" |
| 20 #include "unicode/uchar.h" |
| 21 #include "unicode/ucol.h" |
| 22 #include "unicode/uloc.h" |
| 23 #include "unicode/unistr.h" |
| 24 #include "unicode/utf16.h" |
| 25 #include "charstr.h" |
| 26 #include "cmemory.h" |
| 27 #include "collation.h" |
| 28 #include "collationdata.h" |
| 29 #include "collationruleparser.h" |
| 30 #include "collationsettings.h" |
| 31 #include "collationtailoring.h" |
| 32 #include "cstring.h" |
| 33 #include "patternprops.h" |
| 34 #include "uassert.h" |
| 35 #include "uvectr32.h" |
| 36 |
| 37 U_NAMESPACE_BEGIN |
| 38 |
| 39 namespace { |
| 40 |
| 41 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };
// "[before" |
| 42 const int32_t BEFORE_LENGTH = 7; |
| 43 |
| 44 } // namespace |
| 45 |
| 46 CollationRuleParser::Sink::~Sink() {} |
| 47 |
| 48 void |
| 49 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *
&, UErrorCode &) {} |
| 50 |
| 51 void |
| 52 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCod
e &) {} |
| 53 |
| 54 CollationRuleParser::Importer::~Importer() {} |
| 55 |
| 56 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &
errorCode) |
| 57 : nfd(*Normalizer2::getNFDInstance(errorCode)), |
| 58 nfc(*Normalizer2::getNFCInstance(errorCode)), |
| 59 rules(NULL), baseData(base), settings(NULL), |
| 60 parseError(NULL), errorReason(NULL), |
| 61 sink(NULL), importer(NULL), |
| 62 ruleIndex(0) { |
| 63 } |
| 64 |
| 65 CollationRuleParser::~CollationRuleParser() { |
| 66 } |
| 67 |
| 68 void |
| 69 CollationRuleParser::parse(const UnicodeString &ruleString, |
| 70 CollationSettings &outSettings, |
| 71 UParseError *outParseError, |
| 72 UErrorCode &errorCode) { |
| 73 if(U_FAILURE(errorCode)) { return; } |
| 74 settings = &outSettings; |
| 75 parseError = outParseError; |
| 76 if(parseError != NULL) { |
| 77 parseError->line = 0; |
| 78 parseError->offset = -1; |
| 79 parseError->preContext[0] = 0; |
| 80 parseError->postContext[0] = 0; |
| 81 } |
| 82 errorReason = NULL; |
| 83 parse(ruleString, errorCode); |
| 84 } |
| 85 |
| 86 void |
| 87 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCod
e) { |
| 88 if(U_FAILURE(errorCode)) { return; } |
| 89 rules = &ruleString; |
| 90 ruleIndex = 0; |
| 91 |
| 92 while(ruleIndex < rules->length()) { |
| 93 UChar c = rules->charAt(ruleIndex); |
| 94 if(PatternProps::isWhiteSpace(c)) { |
| 95 ++ruleIndex; |
| 96 continue; |
| 97 } |
| 98 switch(c) { |
| 99 case 0x26: // '&' |
| 100 parseRuleChain(errorCode); |
| 101 break; |
| 102 case 0x5b: // '[' |
| 103 parseSetting(errorCode); |
| 104 break; |
| 105 case 0x23: // '#' starts a comment, until the end of the line |
| 106 ruleIndex = skipComment(ruleIndex + 1); |
| 107 break; |
| 108 case 0x40: // '@' is equivalent to [backwards 2] |
| 109 settings->setFlag(CollationSettings::BACKWARD_SECONDARY, |
| 110 UCOL_ON, 0, errorCode); |
| 111 ++ruleIndex; |
| 112 break; |
| 113 case 0x21: // '!' used to turn on Thai/Lao character reversal |
| 114 // Accept but ignore. The root collator has contractions |
| 115 // that are equivalent to the character reversal, where appropriate. |
| 116 ++ruleIndex; |
| 117 break; |
| 118 default: |
| 119 setParseError("expected a reset or setting or comment", errorCode); |
| 120 break; |
| 121 } |
| 122 if(U_FAILURE(errorCode)) { return; } |
| 123 } |
| 124 } |
| 125 |
| 126 void |
| 127 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) { |
| 128 int32_t resetStrength = parseResetAndPosition(errorCode); |
| 129 UBool isFirstRelation = TRUE; |
| 130 for(;;) { |
| 131 int32_t result = parseRelationOperator(errorCode); |
| 132 if(U_FAILURE(errorCode)) { return; } |
| 133 if(result < 0) { |
| 134 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23)
{ |
| 135 // '#' starts a comment, until the end of the line |
| 136 ruleIndex = skipComment(ruleIndex + 1); |
| 137 continue; |
| 138 } |
| 139 if(isFirstRelation) { |
| 140 setParseError("reset not followed by a relation", errorCode); |
| 141 } |
| 142 return; |
| 143 } |
| 144 int32_t strength = result & STRENGTH_MASK; |
| 145 if(resetStrength < UCOL_IDENTICAL) { |
| 146 // reset-before rule chain |
| 147 if(isFirstRelation) { |
| 148 if(strength != resetStrength) { |
| 149 setParseError("reset-before strength differs from its first
relation", errorCode); |
| 150 return; |
| 151 } |
| 152 } else { |
| 153 if(strength < resetStrength) { |
| 154 setParseError("reset-before strength followed by a stronger
relation", errorCode); |
| 155 return; |
| 156 } |
| 157 } |
| 158 } |
| 159 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the rela
tion operator |
| 160 if((result & STARRED_FLAG) == 0) { |
| 161 parseRelationStrings(strength, i, errorCode); |
| 162 } else { |
| 163 parseStarredCharacters(strength, i, errorCode); |
| 164 } |
| 165 if(U_FAILURE(errorCode)) { return; } |
| 166 isFirstRelation = FALSE; |
| 167 } |
| 168 } |
| 169 |
| 170 int32_t |
| 171 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) { |
| 172 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } |
| 173 int32_t i = skipWhiteSpace(ruleIndex + 1); |
| 174 int32_t j; |
| 175 UChar c; |
| 176 int32_t resetStrength; |
| 177 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 && |
| 178 (j = i + BEFORE_LENGTH) < rules->length() && |
| 179 PatternProps::isWhiteSpace(rules->charAt(j)) && |
| 180 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() && |
| 181 0x31 <= (c = rules->charAt(j)) && c <= 0x33 && |
| 182 rules->charAt(j + 1) == 0x5d) { |
| 183 // &[before n] with n=1 or 2 or 3 |
| 184 resetStrength = UCOL_PRIMARY + (c - 0x31); |
| 185 i = skipWhiteSpace(j + 2); |
| 186 } else { |
| 187 resetStrength = UCOL_IDENTICAL; |
| 188 } |
| 189 if(i >= rules->length()) { |
| 190 setParseError("reset without position", errorCode); |
| 191 return UCOL_DEFAULT; |
| 192 } |
| 193 UnicodeString str; |
| 194 if(rules->charAt(i) == 0x5b) { // '[' |
| 195 i = parseSpecialPosition(i, str, errorCode); |
| 196 } else { |
| 197 i = parseTailoringString(i, str, errorCode); |
| 198 } |
| 199 sink->addReset(resetStrength, str, errorReason, errorCode); |
| 200 if(U_FAILURE(errorCode)) { setErrorContext(); } |
| 201 ruleIndex = i; |
| 202 return resetStrength; |
| 203 } |
| 204 |
| 205 int32_t |
| 206 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) { |
| 207 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } |
| 208 ruleIndex = skipWhiteSpace(ruleIndex); |
| 209 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; } |
| 210 int32_t strength; |
| 211 int32_t i = ruleIndex; |
| 212 UChar c = rules->charAt(i++); |
| 213 switch(c) { |
| 214 case 0x3c: // '<' |
| 215 if(i < rules->length() && rules->charAt(i) == 0x3c) { // << |
| 216 ++i; |
| 217 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<< |
| 218 ++i; |
| 219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<< |
| 220 ++i; |
| 221 strength = UCOL_QUATERNARY; |
| 222 } else { |
| 223 strength = UCOL_TERTIARY; |
| 224 } |
| 225 } else { |
| 226 strength = UCOL_SECONDARY; |
| 227 } |
| 228 } else { |
| 229 strength = UCOL_PRIMARY; |
| 230 } |
| 231 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' |
| 232 ++i; |
| 233 strength |= STARRED_FLAG; |
| 234 } |
| 235 break; |
| 236 case 0x3b: // ';' same as << |
| 237 strength = UCOL_SECONDARY; |
| 238 break; |
| 239 case 0x2c: // ',' same as <<< |
| 240 strength = UCOL_TERTIARY; |
| 241 break; |
| 242 case 0x3d: // '=' |
| 243 strength = UCOL_IDENTICAL; |
| 244 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' |
| 245 ++i; |
| 246 strength |= STARRED_FLAG; |
| 247 } |
| 248 break; |
| 249 default: |
| 250 return UCOL_DEFAULT; |
| 251 } |
| 252 return ((i - ruleIndex) << OFFSET_SHIFT) | strength; |
| 253 } |
| 254 |
| 255 void |
| 256 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCod
e &errorCode) { |
| 257 // Parse |
| 258 // prefix | str / extension |
| 259 // where prefix and extension are optional. |
| 260 UnicodeString prefix, str, extension; |
| 261 i = parseTailoringString(i, str, errorCode); |
| 262 if(U_FAILURE(errorCode)) { return; } |
| 263 UChar next = (i < rules->length()) ? rules->charAt(i) : 0; |
| 264 if(next == 0x7c) { // '|' separates the context prefix from the string. |
| 265 prefix = str; |
| 266 i = parseTailoringString(i + 1, str, errorCode); |
| 267 if(U_FAILURE(errorCode)) { return; } |
| 268 next = (i < rules->length()) ? rules->charAt(i) : 0; |
| 269 } |
| 270 if(next == 0x2f) { // '/' separates the string from the extension. |
| 271 i = parseTailoringString(i + 1, extension, errorCode); |
| 272 } |
| 273 if(!prefix.isEmpty()) { |
| 274 UChar32 prefix0 = prefix.char32At(0); |
| 275 UChar32 c = str.char32At(0); |
| 276 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) { |
| 277 setParseError("in 'prefix|str', prefix and str must each start with
an NFC boundary", |
| 278 errorCode); |
| 279 return; |
| 280 } |
| 281 } |
| 282 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode); |
| 283 if(U_FAILURE(errorCode)) { setErrorContext(); } |
| 284 ruleIndex = i; |
| 285 } |
| 286 |
| 287 void |
| 288 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorC
ode &errorCode) { |
| 289 UnicodeString empty, raw; |
| 290 i = parseString(skipWhiteSpace(i), raw, errorCode); |
| 291 if(U_FAILURE(errorCode)) { return; } |
| 292 if(raw.isEmpty()) { |
| 293 setParseError("missing starred-relation string", errorCode); |
| 294 return; |
| 295 } |
| 296 UChar32 prev = -1; |
| 297 int32_t j = 0; |
| 298 for(;;) { |
| 299 while(j < raw.length()) { |
| 300 UChar32 c = raw.char32At(j); |
| 301 if(!nfd.isInert(c)) { |
| 302 setParseError("starred-relation string is not all NFD-inert", er
rorCode); |
| 303 return; |
| 304 } |
| 305 sink->addRelation(strength, empty, UnicodeString(c), empty, errorRea
son, errorCode); |
| 306 if(U_FAILURE(errorCode)) { |
| 307 setErrorContext(); |
| 308 return; |
| 309 } |
| 310 j += U16_LENGTH(c); |
| 311 prev = c; |
| 312 } |
| 313 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-' |
| 314 break; |
| 315 } |
| 316 if(prev < 0) { |
| 317 setParseError("range without start in starred-relation string", erro
rCode); |
| 318 return; |
| 319 } |
| 320 i = parseString(i + 1, raw, errorCode); |
| 321 if(U_FAILURE(errorCode)) { return; } |
| 322 if(raw.isEmpty()) { |
| 323 setParseError("range without end in starred-relation string", errorC
ode); |
| 324 return; |
| 325 } |
| 326 UChar32 c = raw.char32At(0); |
| 327 if(c < prev) { |
| 328 setParseError("range start greater than end in starred-relation stri
ng", errorCode); |
| 329 return; |
| 330 } |
| 331 // range prev-c |
| 332 UnicodeString s; |
| 333 while(++prev <= c) { |
| 334 if(!nfd.isInert(prev)) { |
| 335 setParseError("starred-relation string range is not all NFD-iner
t", errorCode); |
| 336 return; |
| 337 } |
| 338 if(U_IS_SURROGATE(prev)) { |
| 339 setParseError("starred-relation string range contains a surrogat
e", errorCode); |
| 340 return; |
| 341 } |
| 342 if(0xfffd <= prev && prev <= 0xffff) { |
| 343 setParseError("starred-relation string range contains U+FFFD, U+
FFFE or U+FFFF", errorCode); |
| 344 return; |
| 345 } |
| 346 s.setTo(prev); |
| 347 sink->addRelation(strength, empty, s, empty, errorReason, errorCode)
; |
| 348 if(U_FAILURE(errorCode)) { |
| 349 setErrorContext(); |
| 350 return; |
| 351 } |
| 352 } |
| 353 prev = -1; |
| 354 j = U16_LENGTH(c); |
| 355 } |
| 356 ruleIndex = skipWhiteSpace(i); |
| 357 } |
| 358 |
| 359 int32_t |
| 360 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorC
ode &errorCode) { |
| 361 i = parseString(skipWhiteSpace(i), raw, errorCode); |
| 362 if(U_SUCCESS(errorCode) && raw.isEmpty()) { |
| 363 setParseError("missing relation string", errorCode); |
| 364 } |
| 365 return skipWhiteSpace(i); |
| 366 } |
| 367 |
| 368 int32_t |
| 369 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &erro
rCode) { |
| 370 if(U_FAILURE(errorCode)) { return i; } |
| 371 raw.remove(); |
| 372 while(i < rules->length()) { |
| 373 UChar32 c = rules->charAt(i++); |
| 374 if(isSyntaxChar(c)) { |
| 375 if(c == 0x27) { // apostrophe |
| 376 if(i < rules->length() && rules->charAt(i) == 0x27) { |
| 377 // Double apostrophe, encodes a single one. |
| 378 raw.append((UChar)0x27); |
| 379 ++i; |
| 380 continue; |
| 381 } |
| 382 // Quote literal text until the next single apostrophe. |
| 383 for(;;) { |
| 384 if(i == rules->length()) { |
| 385 setParseError("quoted literal text missing terminating a
postrophe", errorCode); |
| 386 return i; |
| 387 } |
| 388 c = rules->charAt(i++); |
| 389 if(c == 0x27) { |
| 390 if(i < rules->length() && rules->charAt(i) == 0x27) { |
| 391 // Double apostrophe inside quoted literal text, |
| 392 // still encodes a single apostrophe. |
| 393 ++i; |
| 394 } else { |
| 395 break; |
| 396 } |
| 397 } |
| 398 raw.append((UChar)c); |
| 399 } |
| 400 } else if(c == 0x5c) { // backslash |
| 401 if(i == rules->length()) { |
| 402 setParseError("backslash escape at the end of the rule strin
g", errorCode); |
| 403 return i; |
| 404 } |
| 405 c = rules->char32At(i); |
| 406 raw.append(c); |
| 407 i += U16_LENGTH(c); |
| 408 } else { |
| 409 // Any other syntax character terminates a string. |
| 410 --i; |
| 411 break; |
| 412 } |
| 413 } else if(PatternProps::isWhiteSpace(c)) { |
| 414 // Unquoted white space terminates a string. |
| 415 --i; |
| 416 break; |
| 417 } else { |
| 418 raw.append((UChar)c); |
| 419 } |
| 420 } |
| 421 for(int32_t j = 0; j < raw.length();) { |
| 422 UChar32 c = raw.char32At(j); |
| 423 if(U_IS_SURROGATE(c)) { |
| 424 setParseError("string contains an unpaired surrogate", errorCode); |
| 425 return i; |
| 426 } |
| 427 if(0xfffd <= c && c <= 0xffff) { |
| 428 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode)
; |
| 429 return i; |
| 430 } |
| 431 j += U16_LENGTH(c); |
| 432 } |
| 433 return i; |
| 434 } |
| 435 |
namespace {

// Names of the special reset positions accepted inside "[...]".
// parseSpecialPosition() encodes a match as POS_BASE plus the array index,
// so the order of the entries is significant.
static const char *const positions[] = {
    "first tertiary ignorable",   // 0
    "last tertiary ignorable",    // 1
    "first secondary ignorable",  // 2
    "last secondary ignorable",   // 3
    "first primary ignorable",    // 4
    "last primary ignorable",     // 5
    "first variable",             // 6
    "last variable",              // 7
    "first regular",              // 8
    "last regular",               // 9
    "first implicit",             // 10
    "last implicit",              // 11
    "first trailing",             // 12
    "last trailing"               // 13
};

}  // namespace
| 456 |
| 457 int32_t |
| 458 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorC
ode &errorCode) { |
| 459 if(U_FAILURE(errorCode)) { return 0; } |
| 460 UnicodeString raw; |
| 461 int32_t j = readWords(i + 1, raw); |
| 462 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with
] |
| 463 ++j; |
| 464 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) { |
| 465 if(raw == UnicodeString(positions[pos], -1, US_INV)) { |
| 466 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos)); |
| 467 return j; |
| 468 } |
| 469 } |
| 470 if(raw == UNICODE_STRING_SIMPLE("top")) { |
| 471 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR)); |
| 472 return j; |
| 473 } |
| 474 if(raw == UNICODE_STRING_SIMPLE("variable top")) { |
| 475 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE))
; |
| 476 return j; |
| 477 } |
| 478 } |
| 479 setParseError("not a valid special reset position", errorCode); |
| 480 return i; |
| 481 } |
| 482 |
| 483 void |
| 484 CollationRuleParser::parseSetting(UErrorCode &errorCode) { |
| 485 if(U_FAILURE(errorCode)) { return; } |
| 486 UnicodeString raw; |
| 487 int32_t i = ruleIndex + 1; |
| 488 int32_t j = readWords(i, raw); |
| 489 if(j <= i || raw.isEmpty()) { |
| 490 setParseError("expected a setting/option at '['", errorCode); |
| 491 } |
| 492 if(rules->charAt(j) == 0x5d) { // words end with ] |
| 493 ++j; |
| 494 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) && |
| 495 (raw.length() == 7 || raw.charAt(7) == 0x20)) { |
| 496 parseReordering(raw, errorCode); |
| 497 ruleIndex = j; |
| 498 return; |
| 499 } |
| 500 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) { |
| 501 settings->setFlag(CollationSettings::BACKWARD_SECONDARY, |
| 502 UCOL_ON, 0, errorCode); |
| 503 ruleIndex = j; |
| 504 return; |
| 505 } |
| 506 UnicodeString v; |
| 507 int32_t valueIndex = raw.lastIndexOf((UChar)0x20); |
| 508 if(valueIndex >= 0) { |
| 509 v.setTo(raw, valueIndex + 1); |
| 510 raw.truncate(valueIndex); |
| 511 } |
| 512 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) { |
| 513 int32_t value = UCOL_DEFAULT; |
| 514 UChar c = v.charAt(0); |
| 515 if(0x31 <= c && c <= 0x34) { // 1..4 |
| 516 value = UCOL_PRIMARY + (c - 0x31); |
| 517 } else if(c == 0x49) { // 'I' |
| 518 value = UCOL_IDENTICAL; |
| 519 } |
| 520 if(value != UCOL_DEFAULT) { |
| 521 settings->setStrength(value, 0, errorCode); |
| 522 ruleIndex = j; |
| 523 return; |
| 524 } |
| 525 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) { |
| 526 UColAttributeValue value = UCOL_DEFAULT; |
| 527 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) { |
| 528 value = UCOL_NON_IGNORABLE; |
| 529 } else if(v == UNICODE_STRING_SIMPLE("shifted")) { |
| 530 value = UCOL_SHIFTED; |
| 531 } |
| 532 if(value != UCOL_DEFAULT) { |
| 533 settings->setAlternateHandling(value, 0, errorCode); |
| 534 ruleIndex = j; |
| 535 return; |
| 536 } |
| 537 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) { |
| 538 int32_t value = UCOL_DEFAULT; |
| 539 if(v == UNICODE_STRING_SIMPLE("space")) { |
| 540 value = CollationSettings::MAX_VAR_SPACE; |
| 541 } else if(v == UNICODE_STRING_SIMPLE("punct")) { |
| 542 value = CollationSettings::MAX_VAR_PUNCT; |
| 543 } else if(v == UNICODE_STRING_SIMPLE("symbol")) { |
| 544 value = CollationSettings::MAX_VAR_SYMBOL; |
| 545 } else if(v == UNICODE_STRING_SIMPLE("currency")) { |
| 546 value = CollationSettings::MAX_VAR_CURRENCY; |
| 547 } |
| 548 if(value != UCOL_DEFAULT) { |
| 549 settings->setMaxVariable(value, 0, errorCode); |
| 550 settings->variableTop = baseData->getLastPrimaryForGroup( |
| 551 UCOL_REORDER_CODE_FIRST + value); |
| 552 U_ASSERT(settings->variableTop != 0); |
| 553 ruleIndex = j; |
| 554 return; |
| 555 } |
| 556 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) { |
| 557 UColAttributeValue value = UCOL_DEFAULT; |
| 558 if(v == UNICODE_STRING_SIMPLE("off")) { |
| 559 value = UCOL_OFF; |
| 560 } else if(v == UNICODE_STRING_SIMPLE("lower")) { |
| 561 value = UCOL_LOWER_FIRST; |
| 562 } else if(v == UNICODE_STRING_SIMPLE("upper")) { |
| 563 value = UCOL_UPPER_FIRST; |
| 564 } |
| 565 if(value != UCOL_DEFAULT) { |
| 566 settings->setCaseFirst(value, 0, errorCode); |
| 567 ruleIndex = j; |
| 568 return; |
| 569 } |
| 570 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) { |
| 571 UColAttributeValue value = getOnOffValue(v); |
| 572 if(value != UCOL_DEFAULT) { |
| 573 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, error
Code); |
| 574 ruleIndex = j; |
| 575 return; |
| 576 } |
| 577 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) { |
| 578 UColAttributeValue value = getOnOffValue(v); |
| 579 if(value != UCOL_DEFAULT) { |
| 580 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorC
ode); |
| 581 ruleIndex = j; |
| 582 return; |
| 583 } |
| 584 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) { |
| 585 UColAttributeValue value = getOnOffValue(v); |
| 586 if(value != UCOL_DEFAULT) { |
| 587 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCod
e); |
| 588 ruleIndex = j; |
| 589 return; |
| 590 } |
| 591 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) { |
| 592 UColAttributeValue value = getOnOffValue(v); |
| 593 if(value != UCOL_DEFAULT) { |
| 594 if(value == UCOL_ON) { |
| 595 setParseError("[hiraganaQ on] is not supported", errorCode); |
| 596 } |
| 597 ruleIndex = j; |
| 598 return; |
| 599 } |
| 600 } else if(raw == UNICODE_STRING_SIMPLE("import")) { |
| 601 CharString lang; |
| 602 lang.appendInvariantChars(v, errorCode); |
| 603 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; } |
| 604 // BCP 47 language tag -> ICU locale ID |
| 605 char localeID[ULOC_FULLNAME_CAPACITY]; |
| 606 int32_t parsedLength; |
| 607 int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FUL
LNAME_CAPACITY, |
| 608 &parsedLength, &errorCode); |
| 609 if(U_FAILURE(errorCode) || |
| 610 parsedLength != lang.length() || length >= ULOC_FULLNAME_CAP
ACITY) { |
| 611 errorCode = U_ZERO_ERROR; |
| 612 setParseError("expected language tag in [import langTag]", error
Code); |
| 613 return; |
| 614 } |
| 615 // localeID minus all keywords |
| 616 char baseID[ULOC_FULLNAME_CAPACITY]; |
| 617 length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY,
&errorCode); |
| 618 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) { |
| 619 errorCode = U_ZERO_ERROR; |
| 620 setParseError("expected language tag in [import langTag]", error
Code); |
| 621 return; |
| 622 } |
| 623 if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) { |
| 624 uprv_strcpy(baseID, "root"); |
| 625 } |
| 626 // @collation=type, or length=0 if not specified |
| 627 char collationType[ULOC_KEYWORDS_CAPACITY]; |
| 628 length = uloc_getKeywordValue(localeID, "collation", |
| 629 collationType, ULOC_KEYWORDS_CAPACITY, |
| 630 &errorCode); |
| 631 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) { |
| 632 errorCode = U_ZERO_ERROR; |
| 633 setParseError("expected language tag in [import langTag]", error
Code); |
| 634 return; |
| 635 } |
| 636 if(importer == NULL) { |
| 637 setParseError("[import langTag] is not supported", errorCode); |
| 638 } else { |
| 639 UnicodeString importedRules; |
| 640 importer->getRules(baseID, length > 0 ? collationType : "standar
d", |
| 641 importedRules, errorReason, errorCode); |
| 642 if(U_FAILURE(errorCode)) { |
| 643 if(errorReason == NULL) { |
| 644 errorReason = "[import langTag] failed"; |
| 645 } |
| 646 setErrorContext(); |
| 647 return; |
| 648 } |
| 649 const UnicodeString *outerRules = rules; |
| 650 int32_t outerRuleIndex = ruleIndex; |
| 651 parse(importedRules, errorCode); |
| 652 if(U_FAILURE(errorCode)) { |
| 653 if(parseError != NULL) { |
| 654 parseError->offset = outerRuleIndex; |
| 655 } |
| 656 } |
| 657 rules = outerRules; |
| 658 ruleIndex = j; |
| 659 } |
| 660 return; |
| 661 } |
| 662 } else if(rules->charAt(j) == 0x5b) { // words end with [ |
| 663 UnicodeSet set; |
| 664 j = parseUnicodeSet(j, set, errorCode); |
| 665 if(U_FAILURE(errorCode)) { return; } |
| 666 if(raw == UNICODE_STRING_SIMPLE("optimize")) { |
| 667 sink->optimize(set, errorReason, errorCode); |
| 668 if(U_FAILURE(errorCode)) { setErrorContext(); } |
| 669 ruleIndex = j; |
| 670 return; |
| 671 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) { |
| 672 sink->suppressContractions(set, errorReason, errorCode); |
| 673 if(U_FAILURE(errorCode)) { setErrorContext(); } |
| 674 ruleIndex = j; |
| 675 return; |
| 676 } |
| 677 } |
| 678 setParseError("not a valid setting/option", errorCode); |
| 679 } |
| 680 |
| 681 void |
| 682 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &error
Code) { |
| 683 if(U_FAILURE(errorCode)) { return; } |
| 684 int32_t i = 7; // after "reorder" |
| 685 if(i == raw.length()) { |
| 686 // empty [reorder] with no codes |
| 687 settings->resetReordering(); |
| 688 return; |
| 689 } |
| 690 // Parse the codes in [reorder aa bb cc]. |
| 691 UVector32 reorderCodes(errorCode); |
| 692 if(U_FAILURE(errorCode)) { return; } |
| 693 CharString word; |
| 694 while(i < raw.length()) { |
| 695 ++i; // skip the word-separating space |
| 696 int32_t limit = raw.indexOf((UChar)0x20, i); |
| 697 if(limit < 0) { limit = raw.length(); } |
| 698 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), er
rorCode); |
| 699 if(U_FAILURE(errorCode)) { return; } |
| 700 int32_t code = getReorderCode(word.data()); |
| 701 if(code < 0) { |
| 702 setParseError("unknown script or reorder code", errorCode); |
| 703 return; |
| 704 } |
| 705 reorderCodes.addElement(code, errorCode); |
| 706 if(U_FAILURE(errorCode)) { return; } |
| 707 i = limit; |
| 708 } |
| 709 int32_t length = reorderCodes.size(); |
| 710 if(length == 1 && reorderCodes.elementAti(0) == UCOL_REORDER_CODE_NONE) { |
| 711 settings->resetReordering(); |
| 712 return; |
| 713 } |
| 714 uint8_t table[256]; |
| 715 baseData->makeReorderTable(reorderCodes.getBuffer(), length, table, errorCod
e); |
| 716 if(U_FAILURE(errorCode)) { return; } |
| 717 if(!settings->setReordering(reorderCodes.getBuffer(), length, table)) { |
| 718 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 719 } |
| 720 } |
| 721 |
// Names of the special (non-script) reorder groups.
// getReorderCode() maps entry k to UCOL_REORDER_CODE_FIRST + k,
// so the order of the entries is significant.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
| 725 |
| 726 int32_t |
| 727 CollationRuleParser::getReorderCode(const char *word) { |
| 728 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) { |
| 729 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) { |
| 730 return UCOL_REORDER_CODE_FIRST + i; |
| 731 } |
| 732 } |
| 733 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word); |
| 734 if(script >= 0) { |
| 735 return script; |
| 736 } |
| 737 if(uprv_stricmp(word, "others") == 0) { |
| 738 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN |
| 739 } |
| 740 return -1; |
| 741 } |
| 742 |
| 743 UColAttributeValue |
| 744 CollationRuleParser::getOnOffValue(const UnicodeString &s) { |
| 745 if(s == UNICODE_STRING_SIMPLE("on")) { |
| 746 return UCOL_ON; |
| 747 } else if(s == UNICODE_STRING_SIMPLE("off")) { |
| 748 return UCOL_OFF; |
| 749 } else { |
| 750 return UCOL_DEFAULT; |
| 751 } |
| 752 } |
| 753 |
| 754 int32_t |
| 755 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &err
orCode) { |
| 756 // Collect a UnicodeSet pattern between a balanced pair of [brackets]. |
| 757 int32_t level = 0; |
| 758 int32_t j = i; |
| 759 for(;;) { |
| 760 if(j == rules->length()) { |
| 761 setParseError("unbalanced UnicodeSet pattern brackets", errorCode); |
| 762 return j; |
| 763 } |
| 764 UChar c = rules->charAt(j++); |
| 765 if(c == 0x5b) { // '[' |
| 766 ++level; |
| 767 } else if(c == 0x5d) { // ']' |
| 768 if(--level == 0) { break; } |
| 769 } |
| 770 } |
| 771 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode); |
| 772 if(U_FAILURE(errorCode)) { |
| 773 errorCode = U_ZERO_ERROR; |
| 774 setParseError("not a valid UnicodeSet pattern", errorCode); |
| 775 return j; |
| 776 } |
| 777 j = skipWhiteSpace(j); |
| 778 if(j == rules->length() || rules->charAt(j) != 0x5d) { |
| 779 setParseError("missing option-terminating ']' after UnicodeSet pattern",
errorCode); |
| 780 return j; |
| 781 } |
| 782 return ++j; |
| 783 } |
| 784 |
| 785 int32_t |
| 786 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const { |
| 787 static const UChar sp = 0x20; |
| 788 raw.remove(); |
| 789 i = skipWhiteSpace(i); |
| 790 for(;;) { |
| 791 if(i >= rules->length()) { return 0; } |
| 792 UChar c = rules->charAt(i); |
| 793 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_ |
| 794 if(raw.isEmpty()) { return i; } |
| 795 if(raw.endsWith(&sp, 1)) { // remove trailing space |
| 796 raw.truncate(raw.length() - 1); |
| 797 } |
| 798 return i; |
| 799 } |
| 800 if(PatternProps::isWhiteSpace(c)) { |
| 801 raw.append(0x20); |
| 802 i = skipWhiteSpace(i + 1); |
| 803 } else { |
| 804 raw.append(c); |
| 805 ++i; |
| 806 } |
| 807 } |
| 808 } |
| 809 |
| 810 int32_t |
| 811 CollationRuleParser::skipComment(int32_t i) const { |
| 812 // skip to past the newline |
| 813 while(i < rules->length()) { |
| 814 UChar c = rules->charAt(i++); |
| 815 // LF or FF or CR or NEL or LS or PS |
| 816 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c ==
0x2029) { |
| 817 // Unicode Newline Guidelines: "A readline function should stop at N
LF, LS, FF, or PS." |
| 818 // NLF (new line function) = CR or LF or CR+LF or NEL. |
| 819 // No need to collect all of CR+LF because a following LF will be ig
nored anyway. |
| 820 break; |
| 821 } |
| 822 } |
| 823 return i; |
| 824 } |
| 825 |
| 826 void |
| 827 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) { |
| 828 if(U_FAILURE(errorCode)) { return; } |
| 829 // Error code consistent with the old parser (from ca. 2001), |
| 830 // rather than U_PARSE_ERROR; |
| 831 errorCode = U_INVALID_FORMAT_ERROR; |
| 832 errorReason = reason; |
| 833 if(parseError != NULL) { setErrorContext(); } |
| 834 } |
| 835 |
| 836 void |
| 837 CollationRuleParser::setErrorContext() { |
| 838 if(parseError == NULL) { return; } |
| 839 |
| 840 // Note: This relies on the calling code maintaining the ruleIndex |
| 841 // at a position that is useful for debugging. |
| 842 // For example, at the beginning of a reset or relation etc. |
| 843 parseError->offset = ruleIndex; |
| 844 parseError->line = 0; // We are not counting line numbers. |
| 845 |
| 846 // before ruleIndex |
| 847 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1); |
| 848 if(start < 0) { |
| 849 start = 0; |
| 850 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) { |
| 851 ++start; |
| 852 } |
| 853 int32_t length = ruleIndex - start; |
| 854 rules->extract(start, length, parseError->preContext); |
| 855 parseError->preContext[length] = 0; |
| 856 |
| 857 // starting from ruleIndex |
| 858 length = rules->length() - ruleIndex; |
| 859 if(length >= U_PARSE_CONTEXT_LEN) { |
| 860 length = U_PARSE_CONTEXT_LEN - 1; |
| 861 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) { |
| 862 --length; |
| 863 } |
| 864 } |
| 865 rules->extract(ruleIndex, length, parseError->postContext); |
| 866 parseError->postContext[length] = 0; |
| 867 } |
| 868 |
| 869 UBool |
| 870 CollationRuleParser::isSyntaxChar(UChar32 c) { |
| 871 return 0x21 <= c && c <= 0x7e && |
| 872 (c <= 0x2f || (0x3a <= c && c <= 0x40) || |
| 873 (0x5b <= c && c <= 0x60) || (0x7b <= c)); |
| 874 } |
| 875 |
| 876 int32_t |
| 877 CollationRuleParser::skipWhiteSpace(int32_t i) const { |
| 878 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) { |
| 879 ++i; |
| 880 } |
| 881 return i; |
| 882 } |
| 883 |
| 884 U_NAMESPACE_END |
| 885 |
| 886 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |