source/i18n/collationruleparser.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/collationruleparser.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 * Copyright (C) 2013-2014, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 *******************************************************************************

	6 * collationruleparser.cpp

	7 *

	8 * (replaced the former ucol_tok.cpp)

	9 *

	10 * created on: 2013apr10

	11 * created by: Markus W. Scherer

	12 */

	13

	14 #include "unicode/utypes.h"

	15

	16 #if !UCONFIG_NO_COLLATION

	17

	18 #include "unicode/normalizer2.h"

	19 #include "unicode/parseerr.h"

	20 #include "unicode/uchar.h"

	21 #include "unicode/ucol.h"

	22 #include "unicode/uloc.h"

	23 #include "unicode/unistr.h"

	24 #include "unicode/utf16.h"

	25 #include "charstr.h"

	26 #include "cmemory.h"

	27 #include "collation.h"

	28 #include "collationdata.h"

	29 #include "collationruleparser.h"

	30 #include "collationsettings.h"

	31 #include "collationtailoring.h"

	32 #include "cstring.h"

	33 #include "patternprops.h"

	34 #include "uassert.h"

	35 #include "uvectr32.h"

	36

	37 U_NAMESPACE_BEGIN

	38

	39 namespace {

	40

	41 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"

	42 const int32_t BEFORE_LENGTH = 7;

	43

	44 } // namespace

	45

	46 CollationRuleParser::Sink::~Sink() {}

	47

	48 void

	49 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char * &, UErrorCode &) {}

	50

	51 void

	52 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCod e &) {}

	53

	54 CollationRuleParser::Importer::~Importer() {}

	55

	56 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode & errorCode)

	57 : nfd(*Normalizer2::getNFDInstance(errorCode)),

	58 nfc(*Normalizer2::getNFCInstance(errorCode)),

	59 rules(NULL), baseData(base), settings(NULL),

	60 parseError(NULL), errorReason(NULL),

	61 sink(NULL), importer(NULL),

	62 ruleIndex(0) {

	63 }

	64

	65 CollationRuleParser::~CollationRuleParser() {

	66 }

	67

	68 void

	69 CollationRuleParser::parse(const UnicodeString &ruleString,

	70 CollationSettings &outSettings,

	71 UParseError *outParseError,

	72 UErrorCode &errorCode) {

	73 if(U_FAILURE(errorCode)) { return; }

	74 settings = &outSettings;

	75 parseError = outParseError;

	76 if(parseError != NULL) {

	77 parseError->line = 0;

	78 parseError->offset = -1;

	79 parseError->preContext[0] = 0;

	80 parseError->postContext[0] = 0;

	81 }

	82 errorReason = NULL;

	83 parse(ruleString, errorCode);

	84 }

	85

	86 void

	87 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCod e) {

	88 if(U_FAILURE(errorCode)) { return; }

	89 rules = &ruleString;

	90 ruleIndex = 0;

	91

	92 while(ruleIndex < rules->length()) {

	93 UChar c = rules->charAt(ruleIndex);

	94 if(PatternProps::isWhiteSpace(c)) {

	95 ++ruleIndex;

	96 continue;

	97 }

	98 switch(c) {

	99 case 0x26: // '&'

	100 parseRuleChain(errorCode);

	101 break;

	102 case 0x5b: // '['

	103 parseSetting(errorCode);

	104 break;

	105 case 0x23: // '#' starts a comment, until the end of the line

	106 ruleIndex = skipComment(ruleIndex + 1);

	107 break;

	108 case 0x40: // '@' is equivalent to [backwards 2]

	109 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,

	110 UCOL_ON, 0, errorCode);

	111 ++ruleIndex;

	112 break;

	113 case 0x21: // '!' used to turn on Thai/Lao character reversal

	114 // Accept but ignore. The root collator has contractions

	115 // that are equivalent to the character reversal, where appropriate.

	116 ++ruleIndex;

	117 break;

	118 default:

	119 setParseError("expected a reset or setting or comment", errorCode);

	120 break;

	121 }

	122 if(U_FAILURE(errorCode)) { return; }

	123 }

	124 }

	125

	126 void

	127 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {

	128 int32_t resetStrength = parseResetAndPosition(errorCode);

	129 UBool isFirstRelation = TRUE;

	130 for(;;) {

	131 int32_t result = parseRelationOperator(errorCode);

	132 if(U_FAILURE(errorCode)) { return; }

	133 if(result < 0) {

	134 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {

	135 // '#' starts a comment, until the end of the line

	136 ruleIndex = skipComment(ruleIndex + 1);

	137 continue;

	138 }

	139 if(isFirstRelation) {

	140 setParseError("reset not followed by a relation", errorCode);

	141 }

	142 return;

	143 }

	144 int32_t strength = result & STRENGTH_MASK;

	145 if(resetStrength < UCOL_IDENTICAL) {

	146 // reset-before rule chain

	147 if(isFirstRelation) {

	148 if(strength != resetStrength) {

	149 setParseError("reset-before strength differs from its first relation", errorCode);

	150 return;

	151 }

	152 } else {

	153 if(strength < resetStrength) {

	154 setParseError("reset-before strength followed by a stronger relation", errorCode);

	155 return;

	156 }

	157 }

	158 }

	159 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the rela tion operator

	160 if((result & STARRED_FLAG) == 0) {

	161 parseRelationStrings(strength, i, errorCode);

	162 } else {

	163 parseStarredCharacters(strength, i, errorCode);

	164 }

	165 if(U_FAILURE(errorCode)) { return; }

	166 isFirstRelation = FALSE;

	167 }

	168 }

	169

	170 int32_t

	171 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {

	172 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }

	173 int32_t i = skipWhiteSpace(ruleIndex + 1);

	174 int32_t j;

	175 UChar c;

	176 int32_t resetStrength;

	177 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&

	178 (j = i + BEFORE_LENGTH) < rules->length() &&

	179 PatternProps::isWhiteSpace(rules->charAt(j)) &&

	180 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&

	181 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&

	182 rules->charAt(j + 1) == 0x5d) {

	183 // &[before n] with n=1 or 2 or 3

	184 resetStrength = UCOL_PRIMARY + (c - 0x31);

	185 i = skipWhiteSpace(j + 2);

	186 } else {

	187 resetStrength = UCOL_IDENTICAL;

	188 }

	189 if(i >= rules->length()) {

	190 setParseError("reset without position", errorCode);

	191 return UCOL_DEFAULT;

	192 }

	193 UnicodeString str;

	194 if(rules->charAt(i) == 0x5b) { // '['

	195 i = parseSpecialPosition(i, str, errorCode);

	196 } else {

	197 i = parseTailoringString(i, str, errorCode);

	198 }

	199 sink->addReset(resetStrength, str, errorReason, errorCode);

	200 if(U_FAILURE(errorCode)) { setErrorContext(); }

	201 ruleIndex = i;

	202 return resetStrength;

	203 }

	204

	205 int32_t

	206 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {

	207 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }

	208 ruleIndex = skipWhiteSpace(ruleIndex);

	209 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }

	210 int32_t strength;

	211 int32_t i = ruleIndex;

	212 UChar c = rules->charAt(i++);

	213 switch(c) {

	214 case 0x3c: // '<'

	215 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<

	216 ++i;

	217 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<

	218 ++i;

	219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<

	220 ++i;

	221 strength = UCOL_QUATERNARY;

	222 } else {

	223 strength = UCOL_TERTIARY;

	224 }

	225 } else {

	226 strength = UCOL_SECONDARY;

	227 }

	228 } else {

	229 strength = UCOL_PRIMARY;

	230 }

	231 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'

	232 ++i;

	233 strength \|= STARRED_FLAG;

	234 }

	235 break;

	236 case 0x3b: // ';' same as <<

	237 strength = UCOL_SECONDARY;

	238 break;

	239 case 0x2c: // ',' same as <<<

	240 strength = UCOL_TERTIARY;

	241 break;

	242 case 0x3d: // '='

	243 strength = UCOL_IDENTICAL;

	244 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'

	245 ++i;

	246 strength \|= STARRED_FLAG;

	247 }

	248 break;

	249 default:

	250 return UCOL_DEFAULT;

	251 }

	252 return ((i - ruleIndex) << OFFSET_SHIFT) \| strength;

	253 }

	254

	255 void

	256 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCod e &errorCode) {

	257 // Parse

	258 // prefix \| str / extension

	259 // where prefix and extension are optional.

	260 UnicodeString prefix, str, extension;

	261 i = parseTailoringString(i, str, errorCode);

	262 if(U_FAILURE(errorCode)) { return; }

	263 UChar next = (i < rules->length()) ? rules->charAt(i) : 0;

	264 if(next == 0x7c) { // '\|' separates the context prefix from the string.

	265 prefix = str;

	266 i = parseTailoringString(i + 1, str, errorCode);

	267 if(U_FAILURE(errorCode)) { return; }

	268 next = (i < rules->length()) ? rules->charAt(i) : 0;

	269 }

	270 if(next == 0x2f) { // '/' separates the string from the extension.

	271 i = parseTailoringString(i + 1, extension, errorCode);

	272 }

	273 if(!prefix.isEmpty()) {

	274 UChar32 prefix0 = prefix.char32At(0);

	275 UChar32 c = str.char32At(0);

	276 if(!nfc.hasBoundaryBefore(prefix0) \|\| !nfc.hasBoundaryBefore(c)) {

	277 setParseError("in 'prefix\|str', prefix and str must each start with an NFC boundary",

	278 errorCode);

	279 return;

	280 }

	281 }

	282 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);

	283 if(U_FAILURE(errorCode)) { setErrorContext(); }

	284 ruleIndex = i;

	285 }

	286

	287 void

	288 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorC ode &errorCode) {

	289 UnicodeString empty, raw;

	290 i = parseString(skipWhiteSpace(i), raw, errorCode);

	291 if(U_FAILURE(errorCode)) { return; }

	292 if(raw.isEmpty()) {

	293 setParseError("missing starred-relation string", errorCode);

	294 return;

	295 }

	296 UChar32 prev = -1;

	297 int32_t j = 0;

	298 for(;;) {

	299 while(j < raw.length()) {

	300 UChar32 c = raw.char32At(j);

	301 if(!nfd.isInert(c)) {

	302 setParseError("starred-relation string is not all NFD-inert", er rorCode);

	303 return;

	304 }

	305 sink->addRelation(strength, empty, UnicodeString(c), empty, errorRea son, errorCode);

	306 if(U_FAILURE(errorCode)) {

	307 setErrorContext();

	308 return;

	309 }

	310 j += U16_LENGTH(c);

	311 prev = c;

	312 }

	313 if(i >= rules->length() \|\| rules->charAt(i) != 0x2d) { // '-'

	314 break;

	315 }

	316 if(prev < 0) {

	317 setParseError("range without start in starred-relation string", erro rCode);

	318 return;

	319 }

	320 i = parseString(i + 1, raw, errorCode);

	321 if(U_FAILURE(errorCode)) { return; }

	322 if(raw.isEmpty()) {

	323 setParseError("range without end in starred-relation string", errorC ode);

	324 return;

	325 }

	326 UChar32 c = raw.char32At(0);

	327 if(c < prev) {

	328 setParseError("range start greater than end in starred-relation stri ng", errorCode);

	329 return;

	330 }

	331 // range prev-c

	332 UnicodeString s;

	333 while(++prev <= c) {

	334 if(!nfd.isInert(prev)) {

	335 setParseError("starred-relation string range is not all NFD-iner t", errorCode);

	336 return;

	337 }

	338 if(U_IS_SURROGATE(prev)) {

	339 setParseError("starred-relation string range contains a surrogat e", errorCode);

	340 return;

	341 }

	342 if(0xfffd <= prev && prev <= 0xffff) {

	343 setParseError("starred-relation string range contains U+FFFD, U+ FFFE or U+FFFF", errorCode);

	344 return;

	345 }

	346 s.setTo(prev);

	347 sink->addRelation(strength, empty, s, empty, errorReason, errorCode) ;

	348 if(U_FAILURE(errorCode)) {

	349 setErrorContext();

	350 return;

	351 }

	352 }

	353 prev = -1;

	354 j = U16_LENGTH(c);

	355 }

	356 ruleIndex = skipWhiteSpace(i);

	357 }

	358

	359 int32_t

	360 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorC ode &errorCode) {

	361 i = parseString(skipWhiteSpace(i), raw, errorCode);

	362 if(U_SUCCESS(errorCode) && raw.isEmpty()) {

	363 setParseError("missing relation string", errorCode);

	364 }

	365 return skipWhiteSpace(i);

	366 }

	367

	368 int32_t

	369 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &erro rCode) {

	370 if(U_FAILURE(errorCode)) { return i; }

	371 raw.remove();

	372 while(i < rules->length()) {

	373 UChar32 c = rules->charAt(i++);

	374 if(isSyntaxChar(c)) {

	375 if(c == 0x27) { // apostrophe

	376 if(i < rules->length() && rules->charAt(i) == 0x27) {

	377 // Double apostrophe, encodes a single one.

	378 raw.append((UChar)0x27);

	379 ++i;

	380 continue;

	381 }

	382 // Quote literal text until the next single apostrophe.

	383 for(;;) {

	384 if(i == rules->length()) {

	385 setParseError("quoted literal text missing terminating a postrophe", errorCode);

	386 return i;

	387 }

	388 c = rules->charAt(i++);

	389 if(c == 0x27) {

	390 if(i < rules->length() && rules->charAt(i) == 0x27) {

	391 // Double apostrophe inside quoted literal text,

	392 // still encodes a single apostrophe.

	393 ++i;

	394 } else {

	395 break;

	396 }

	397 }

	398 raw.append((UChar)c);

	399 }

	400 } else if(c == 0x5c) { // backslash

	401 if(i == rules->length()) {

	402 setParseError("backslash escape at the end of the rule strin g", errorCode);

	403 return i;

	404 }

	405 c = rules->char32At(i);

	406 raw.append(c);

	407 i += U16_LENGTH(c);

	408 } else {

	409 // Any other syntax character terminates a string.

	410 --i;

	411 break;

	412 }

	413 } else if(PatternProps::isWhiteSpace(c)) {

	414 // Unquoted white space terminates a string.

	415 --i;

	416 break;

	417 } else {

	418 raw.append((UChar)c);

	419 }

	420 }

	421 for(int32_t j = 0; j < raw.length();) {

	422 UChar32 c = raw.char32At(j);

	423 if(U_IS_SURROGATE(c)) {

	424 setParseError("string contains an unpaired surrogate", errorCode);

	425 return i;

	426 }

	427 if(0xfffd <= c && c <= 0xffff) {

	428 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode) ;

	429 return i;

	430 }

	431 j += U16_LENGTH(c);

	432 }

	433 return i;

	434 }

	435

	namespace {

	// Names of the special reset positions, as written inside [brackets].
	// parseSpecialPosition() encodes a match as POS_BASE + array index,
	// so the order of the entries matters.
	const char *const positions[] = {
	    "first tertiary ignorable",  "last tertiary ignorable",
	    "first secondary ignorable", "last secondary ignorable",
	    "first primary ignorable",   "last primary ignorable",
	    "first variable",            "last variable",
	    "first regular",             "last regular",
	    "first implicit",            "last implicit",
	    "first trailing",            "last trailing"
	};

	}  // namespace

	456

	457 int32_t

	458 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorC ode &errorCode) {

	459 if(U_FAILURE(errorCode)) { return 0; }

	460 UnicodeString raw;

	461 int32_t j = readWords(i + 1, raw);

	462 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]

	463 ++j;

	464 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {

	465 if(raw == UnicodeString(positions[pos], -1, US_INV)) {

	466 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));

	467 return j;

	468 }

	469 }

	470 if(raw == UNICODE_STRING_SIMPLE("top")) {

	471 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));

	472 return j;

	473 }

	474 if(raw == UNICODE_STRING_SIMPLE("variable top")) {

	475 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE)) ;

	476 return j;

	477 }

	478 }

	479 setParseError("not a valid special reset position", errorCode);

	480 return i;

	481 }

	482

	483 void

	484 CollationRuleParser::parseSetting(UErrorCode &errorCode) {

	485 if(U_FAILURE(errorCode)) { return; }

	486 UnicodeString raw;

	487 int32_t i = ruleIndex + 1;

	488 int32_t j = readWords(i, raw);

	489 if(j <= i \|\| raw.isEmpty()) {

	490 setParseError("expected a setting/option at '['", errorCode);

	491 }

	492 if(rules->charAt(j) == 0x5d) { // words end with ]

	493 ++j;

	494 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&

	495 (raw.length() == 7 \|\| raw.charAt(7) == 0x20)) {

	496 parseReordering(raw, errorCode);

	497 ruleIndex = j;

	498 return;

	499 }

	500 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {

	501 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,

	502 UCOL_ON, 0, errorCode);

	503 ruleIndex = j;

	504 return;

	505 }

	506 UnicodeString v;

	507 int32_t valueIndex = raw.lastIndexOf((UChar)0x20);

	508 if(valueIndex >= 0) {

	509 v.setTo(raw, valueIndex + 1);

	510 raw.truncate(valueIndex);

	511 }

	512 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {

	513 int32_t value = UCOL_DEFAULT;

	514 UChar c = v.charAt(0);

	515 if(0x31 <= c && c <= 0x34) { // 1..4

	516 value = UCOL_PRIMARY + (c - 0x31);

	517 } else if(c == 0x49) { // 'I'

	518 value = UCOL_IDENTICAL;

	519 }

	520 if(value != UCOL_DEFAULT) {

	521 settings->setStrength(value, 0, errorCode);

	522 ruleIndex = j;

	523 return;

	524 }

	525 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {

	526 UColAttributeValue value = UCOL_DEFAULT;

	527 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {

	528 value = UCOL_NON_IGNORABLE;

	529 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {

	530 value = UCOL_SHIFTED;

	531 }

	532 if(value != UCOL_DEFAULT) {

	533 settings->setAlternateHandling(value, 0, errorCode);

	534 ruleIndex = j;

	535 return;

	536 }

	537 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {

	538 int32_t value = UCOL_DEFAULT;

	539 if(v == UNICODE_STRING_SIMPLE("space")) {

	540 value = CollationSettings::MAX_VAR_SPACE;

	541 } else if(v == UNICODE_STRING_SIMPLE("punct")) {

	542 value = CollationSettings::MAX_VAR_PUNCT;

	543 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {

	544 value = CollationSettings::MAX_VAR_SYMBOL;

	545 } else if(v == UNICODE_STRING_SIMPLE("currency")) {

	546 value = CollationSettings::MAX_VAR_CURRENCY;

	547 }

	548 if(value != UCOL_DEFAULT) {

	549 settings->setMaxVariable(value, 0, errorCode);

	550 settings->variableTop = baseData->getLastPrimaryForGroup(

	551 UCOL_REORDER_CODE_FIRST + value);

	552 U_ASSERT(settings->variableTop != 0);

	553 ruleIndex = j;

	554 return;

	555 }

	556 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {

	557 UColAttributeValue value = UCOL_DEFAULT;

	558 if(v == UNICODE_STRING_SIMPLE("off")) {

	559 value = UCOL_OFF;

	560 } else if(v == UNICODE_STRING_SIMPLE("lower")) {

	561 value = UCOL_LOWER_FIRST;

	562 } else if(v == UNICODE_STRING_SIMPLE("upper")) {

	563 value = UCOL_UPPER_FIRST;

	564 }

	565 if(value != UCOL_DEFAULT) {

	566 settings->setCaseFirst(value, 0, errorCode);

	567 ruleIndex = j;

	568 return;

	569 }

	570 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {

	571 UColAttributeValue value = getOnOffValue(v);

	572 if(value != UCOL_DEFAULT) {

	573 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, error Code);

	574 ruleIndex = j;

	575 return;

	576 }

	577 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {

	578 UColAttributeValue value = getOnOffValue(v);

	579 if(value != UCOL_DEFAULT) {

	580 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorC ode);

	581 ruleIndex = j;

	582 return;

	583 }

	584 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {

	585 UColAttributeValue value = getOnOffValue(v);

	586 if(value != UCOL_DEFAULT) {

	587 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCod e);

	588 ruleIndex = j;

	589 return;

	590 }

	591 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {

	592 UColAttributeValue value = getOnOffValue(v);

	593 if(value != UCOL_DEFAULT) {

	594 if(value == UCOL_ON) {

	595 setParseError("[hiraganaQ on] is not supported", errorCode);

	596 }

	597 ruleIndex = j;

	598 return;

	599 }

	600 } else if(raw == UNICODE_STRING_SIMPLE("import")) {

	601 CharString lang;

	602 lang.appendInvariantChars(v, errorCode);

	603 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }

	604 // BCP 47 language tag -> ICU locale ID

	605 char localeID[ULOC_FULLNAME_CAPACITY];

	606 int32_t parsedLength;

	607 int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FUL LNAME_CAPACITY,

	608 &parsedLength, &errorCode);

	609 if(U_FAILURE(errorCode) \|\|

	610 parsedLength != lang.length() \|\| length >= ULOC_FULLNAME_CAP ACITY) {

	611 errorCode = U_ZERO_ERROR;

	612 setParseError("expected language tag in [import langTag]", error Code);

	613 return;

	614 }

	615 // localeID minus all keywords

	616 char baseID[ULOC_FULLNAME_CAPACITY];

	617 length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);

	618 if(U_FAILURE(errorCode) \|\| length >= ULOC_KEYWORDS_CAPACITY) {

	619 errorCode = U_ZERO_ERROR;

	620 setParseError("expected language tag in [import langTag]", error Code);

	621 return;

	622 }

	623 if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {

	624 uprv_strcpy(baseID, "root");

	625 }

	626 // @collation=type, or length=0 if not specified

	627 char collationType[ULOC_KEYWORDS_CAPACITY];

	628 length = uloc_getKeywordValue(localeID, "collation",

	629 collationType, ULOC_KEYWORDS_CAPACITY,

	630 &errorCode);

	631 if(U_FAILURE(errorCode) \|\| length >= ULOC_KEYWORDS_CAPACITY) {

	632 errorCode = U_ZERO_ERROR;

	633 setParseError("expected language tag in [import langTag]", error Code);

	634 return;

	635 }

	636 if(importer == NULL) {

	637 setParseError("[import langTag] is not supported", errorCode);

	638 } else {

	639 UnicodeString importedRules;

	640 importer->getRules(baseID, length > 0 ? collationType : "standar d",

	641 importedRules, errorReason, errorCode);

	642 if(U_FAILURE(errorCode)) {

	643 if(errorReason == NULL) {

	644 errorReason = "[import langTag] failed";

	645 }

	646 setErrorContext();

	647 return;

	648 }

	649 const UnicodeString *outerRules = rules;

	650 int32_t outerRuleIndex = ruleIndex;

	651 parse(importedRules, errorCode);

	652 if(U_FAILURE(errorCode)) {

	653 if(parseError != NULL) {

	654 parseError->offset = outerRuleIndex;

	655 }

	656 }

	657 rules = outerRules;

	658 ruleIndex = j;

	659 }

	660 return;

	661 }

	662 } else if(rules->charAt(j) == 0x5b) { // words end with [

	663 UnicodeSet set;

	664 j = parseUnicodeSet(j, set, errorCode);

	665 if(U_FAILURE(errorCode)) { return; }

	666 if(raw == UNICODE_STRING_SIMPLE("optimize")) {

	667 sink->optimize(set, errorReason, errorCode);

	668 if(U_FAILURE(errorCode)) { setErrorContext(); }

	669 ruleIndex = j;

	670 return;

	671 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {

	672 sink->suppressContractions(set, errorReason, errorCode);

	673 if(U_FAILURE(errorCode)) { setErrorContext(); }

	674 ruleIndex = j;

	675 return;

	676 }

	677 }

	678 setParseError("not a valid setting/option", errorCode);

	679 }

	680

	681 void

	682 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &error Code) {

	683 if(U_FAILURE(errorCode)) { return; }

	684 int32_t i = 7; // after "reorder"

	685 if(i == raw.length()) {

	686 // empty [reorder] with no codes

	687 settings->resetReordering();

	688 return;

	689 }

	690 // Parse the codes in [reorder aa bb cc].

	691 UVector32 reorderCodes(errorCode);

	692 if(U_FAILURE(errorCode)) { return; }

	693 CharString word;

	694 while(i < raw.length()) {

	695 ++i; // skip the word-separating space

	696 int32_t limit = raw.indexOf((UChar)0x20, i);

	697 if(limit < 0) { limit = raw.length(); }

	698 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), er rorCode);

	699 if(U_FAILURE(errorCode)) { return; }

	700 int32_t code = getReorderCode(word.data());

	701 if(code < 0) {

	702 setParseError("unknown script or reorder code", errorCode);

	703 return;

	704 }

	705 reorderCodes.addElement(code, errorCode);

	706 if(U_FAILURE(errorCode)) { return; }

	707 i = limit;

	708 }

	709 int32_t length = reorderCodes.size();

	710 if(length == 1 && reorderCodes.elementAti(0) == UCOL_REORDER_CODE_NONE) {

	711 settings->resetReordering();

	712 return;

	713 }

	714 uint8_t table[256];

	715 baseData->makeReorderTable(reorderCodes.getBuffer(), length, table, errorCod e);

	716 if(U_FAILURE(errorCode)) { return; }

	717 if(!settings->setReordering(reorderCodes.getBuffer(), length, table)) {

	718 errorCode = U_MEMORY_ALLOCATION_ERROR;

	719 }

	720 }

	721

	// Names of the special reorder groups. getReorderCode() maps the name at
	// index i to UCOL_REORDER_CODE_FIRST + i, so the order matters.
	static const char *const gSpecialReorderCodes[] = {
	    "space",
	    "punct",
	    "symbol",
	    "currency",
	    "digit"
	};

	725

	726 int32_t

	727 CollationRuleParser::getReorderCode(const char *word) {

	728 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {

	729 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {

	730 return UCOL_REORDER_CODE_FIRST + i;

	731 }

	732 }

	733 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);

	734 if(script >= 0) {

	735 return script;

	736 }

	737 if(uprv_stricmp(word, "others") == 0) {

	738 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN

	739 }

	740 return -1;

	741 }

	742

	743 UColAttributeValue

	744 CollationRuleParser::getOnOffValue(const UnicodeString &s) {

	745 if(s == UNICODE_STRING_SIMPLE("on")) {

	746 return UCOL_ON;

	747 } else if(s == UNICODE_STRING_SIMPLE("off")) {

	748 return UCOL_OFF;

	749 } else {

	750 return UCOL_DEFAULT;

	751 }

	752 }

	753

	754 int32_t

	755 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &err orCode) {

	756 // Collect a UnicodeSet pattern between a balanced pair of [brackets].

	757 int32_t level = 0;

	758 int32_t j = i;

	759 for(;;) {

	760 if(j == rules->length()) {

	761 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);

	762 return j;

	763 }

	764 UChar c = rules->charAt(j++);

	765 if(c == 0x5b) { // '['

	766 ++level;

	767 } else if(c == 0x5d) { // ']'

	768 if(--level == 0) { break; }

	769 }

	770 }

	771 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);

	772 if(U_FAILURE(errorCode)) {

	773 errorCode = U_ZERO_ERROR;

	774 setParseError("not a valid UnicodeSet pattern", errorCode);

	775 return j;

	776 }

	777 j = skipWhiteSpace(j);

	778 if(j == rules->length() \|\| rules->charAt(j) != 0x5d) {

	779 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);

	780 return j;

	781 }

	782 return ++j;

	783 }

	784

	785 int32_t

	786 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {

	787 static const UChar sp = 0x20;

	788 raw.remove();

	789 i = skipWhiteSpace(i);

	790 for(;;) {

	791 if(i >= rules->length()) { return 0; }

	792 UChar c = rules->charAt(i);

	793 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_

	794 if(raw.isEmpty()) { return i; }

	795 if(raw.endsWith(&sp, 1)) { // remove trailing space

	796 raw.truncate(raw.length() - 1);

	797 }

	798 return i;

	799 }

	800 if(PatternProps::isWhiteSpace(c)) {

	801 raw.append(0x20);

	802 i = skipWhiteSpace(i + 1);

	803 } else {

	804 raw.append(c);

	805 ++i;

	806 }

	807 }

	808 }

	809

	810 int32_t

	811 CollationRuleParser::skipComment(int32_t i) const {

	812 // skip to past the newline

	813 while(i < rules->length()) {

	814 UChar c = rules->charAt(i++);

	815 // LF or FF or CR or NEL or LS or PS

	816 if(c == 0xa \|\| c == 0xc \|\| c == 0xd \|\| c == 0x85 \|\| c == 0x2028 \|\| c == 0x2029) {

	817 // Unicode Newline Guidelines: "A readline function should stop at N LF, LS, FF, or PS."

	818 // NLF (new line function) = CR or LF or CR+LF or NEL.

	819 // No need to collect all of CR+LF because a following LF will be ig nored anyway.

	820 break;

	821 }

	822 }

	823 return i;

	824 }

	825

	826 void

	827 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {

	828 if(U_FAILURE(errorCode)) { return; }

	829 // Error code consistent with the old parser (from ca. 2001),

	830 // rather than U_PARSE_ERROR;

	831 errorCode = U_INVALID_FORMAT_ERROR;

	832 errorReason = reason;

	833 if(parseError != NULL) { setErrorContext(); }

	834 }

	835

	836 void

	837 CollationRuleParser::setErrorContext() {

	838 if(parseError == NULL) { return; }

	839

	840 // Note: This relies on the calling code maintaining the ruleIndex

	841 // at a position that is useful for debugging.

	842 // For example, at the beginning of a reset or relation etc.

	843 parseError->offset = ruleIndex;

	844 parseError->line = 0; // We are not counting line numbers.

	845

	846 // before ruleIndex

	847 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);

	848 if(start < 0) {

	849 start = 0;

	850 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {

	851 ++start;

	852 }

	853 int32_t length = ruleIndex - start;

	854 rules->extract(start, length, parseError->preContext);

	855 parseError->preContext[length] = 0;

	856

	857 // starting from ruleIndex

	858 length = rules->length() - ruleIndex;

	859 if(length >= U_PARSE_CONTEXT_LEN) {

	860 length = U_PARSE_CONTEXT_LEN - 1;

	861 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {

	862 --length;

	863 }

	864 }

	865 rules->extract(ruleIndex, length, parseError->postContext);

	866 parseError->postContext[length] = 0;

	867 }

	868

	869 UBool

	870 CollationRuleParser::isSyntaxChar(UChar32 c) {

	871 return 0x21 <= c && c <= 0x7e &&

	872 (c <= 0x2f \|\| (0x3a <= c && c <= 0x40) \|\|

	873 (0x5b <= c && c <= 0x60) \|\| (0x7b <= c));

	874 }

	875

	876 int32_t

	877 CollationRuleParser::skipWhiteSpace(int32_t i) const {

	878 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {

	879 ++i;

	880 }

	881 return i;

	882 }

	883

	884 U_NAMESPACE_END

	885

	886 #endif // !UCONFIG_NO_COLLATION

OLD	NEW

« no previous file with comments | « source/i18n/collationruleparser.h ('k') | source/i18n/collationsets.h » ('j') | no next file with comments »