source/i18n/collationruleparser.cpp - Issue 845603002: Update ICU to 54.1 step 1

Unified Diff: source/i18n/collationruleparser.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: source/i18n/collationruleparser.cpp

diff --git a/source/i18n/collationruleparser.cpp b/source/i18n/collationruleparser.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..ac413a2a64cd0145bde39ebb37fe05ca8a285dd2

--- /dev/null

+++ b/source/i18n/collationruleparser.cpp

@@ -0,0 +1,886 @@

+/*

+*******************************************************************************

+* collationruleparser.cpp

+* (replaced the former ucol_tok.cpp)

+* created on: 2013apr10

+* created by: Markus W. Scherer

+*/

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_COLLATION

+#include "unicode/normalizer2.h"

+#include "unicode/parseerr.h"

+#include "unicode/uchar.h"

+#include "unicode/ucol.h"

+#include "unicode/uloc.h"

+#include "unicode/unistr.h"

+#include "unicode/utf16.h"

+#include "charstr.h"

+#include "cmemory.h"

+#include "collation.h"

+#include "collationdata.h"

+#include "collationruleparser.h"

+#include "collationsettings.h"

+#include "collationtailoring.h"

+#include "cstring.h"

+#include "patternprops.h"

+#include "uassert.h"

+#include "uvectr32.h"

+U_NAMESPACE_BEGIN

+namespace {  // file-local constants used when recognizing "&[before n]" resets

+static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before" as UTF-16 code units, followed by a terminating 0

+const int32_t BEFORE_LENGTH = 7;  // number of code units in BEFORE, not counting the terminating 0

+} // namespace

+CollationRuleParser::Sink::~Sink() {}  // out-of-line destructor with an empty body

+void  // Default implementation: all three parameters are unnamed and the body is empty, so nothing happens.

+CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}

+void  // Default implementation: all three parameters are unnamed and the body is empty, so nothing happens.

+CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}

+CollationRuleParser::Importer::~Importer() {}  // out-of-line destructor with an empty body

+CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)  // Constructor: keeps `base` as baseData and looks up the NFD/NFC normalizers.

+ : nfd(*Normalizer2::getNFDInstance(errorCode)),  // NOTE(review): the returned pointer is dereferenced without a NULL check — confirm this is safe when errorCode reports a failure

+ nfc(*Normalizer2::getNFCInstance(errorCode)),  // same unchecked dereference as for nfd

+ rules(NULL), baseData(base), settings(NULL),  // rules and settings are assigned later by the parse() functions

+ parseError(NULL), errorReason(NULL),

+ sink(NULL), importer(NULL),  // NOTE(review): nothing in this listing assigns sink or importer; presumably set via the class header — verify

+ ruleIndex(0) {

+CollationRuleParser::~CollationRuleParser() {  // destructor; no statements are visible in its body in this listing

+void  // Entry point: records the output objects, clears old error information, then runs the two-argument parse().

+CollationRuleParser::parse(const UnicodeString &ruleString,

+ CollationSettings &outSettings,  // its address is stored in `settings`; the setting handlers write through that pointer

+ UParseError *outParseError,  // may be NULL; it is checked before every write below

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ settings = &outSettings;

+ parseError = outParseError;

+ if(parseError != NULL) {  // start from "no error location" values

+ parseError->line = 0;

+ parseError->offset = -1;

+ parseError->preContext[0] = 0;

+ parseError->postContext[0] = 0;

+ }

+ errorReason = NULL;

+ parse(ruleString, errorCode);

+void  // Main loop: scans ruleString from index 0 and dispatches on each non-white-space character until the end or the first error.

+CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ rules = &ruleString;  // stores a pointer to the caller's string, not a copy

+ ruleIndex = 0;

+ while(ruleIndex < rules->length()) {

+ UChar c = rules->charAt(ruleIndex);

+ if(PatternProps::isWhiteSpace(c)) {  // white space between top-level items is skipped

+ ++ruleIndex;

+ continue;

+ }

+ switch(c) {

+ case 0x26: // '&'

+ parseRuleChain(errorCode);  // called with ruleIndex still on the '&'

+ break;

+ case 0x5b: // '['

+ parseSetting(errorCode);  // called with ruleIndex still on the '['

+ break;

+ case 0x23: // '#' starts a comment, until the end of the line

+ ruleIndex = skipComment(ruleIndex + 1);

+ break;

+ case 0x40: // '@' is equivalent to [backwards 2]

+ settings->setFlag(CollationSettings::BACKWARD_SECONDARY,

+ UCOL_ON, 0, errorCode);

+ ++ruleIndex;

+ break;

+ case 0x21: // '!' used to turn on Thai/Lao character reversal

+ // Accept but ignore. The root collator has contractions

+ // that are equivalent to the character reversal, where appropriate.

+ ++ruleIndex;

+ break;

+ default:

+ setParseError("expected a reset or setting or comment", errorCode);

+ break;

+ }

+ if(U_FAILURE(errorCode)) { return; }  // stop at the first error reported by any handler

+ }

+void  // Parses one rule chain: the reset at ruleIndex followed by relations, until no further relation operator is found.

+CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {

+ int32_t resetStrength = parseResetAndPosition(errorCode);  // UCOL_PRIMARY + (n - 1) for &[before n], otherwise UCOL_IDENTICAL; UCOL_DEFAULT on error

+ UBool isFirstRelation = TRUE;

+ for(;;) {

+ int32_t result = parseRelationOperator(errorCode);  // packed value: operator length, strength and starred flag

+ if(U_FAILURE(errorCode)) { return; }

+ if(result < 0) {  // presumably the UCOL_DEFAULT "no operator here" return of parseRelationOperator

+ if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {

+ // '#' starts a comment, until the end of the line

+ ruleIndex = skipComment(ruleIndex + 1);

+ continue;

+ }

+ if(isFirstRelation) {  // a reset must be followed by at least one relation

+ setParseError("reset not followed by a relation", errorCode);

+ }

+ return;

+ }

+ int32_t strength = result & STRENGTH_MASK;  // strip the offset and flag bits

+ if(resetStrength < UCOL_IDENTICAL) {

+ // reset-before rule chain

+ if(isFirstRelation) {

+ if(strength != resetStrength) {

+ setParseError("reset-before strength differs from its first relation", errorCode);

+ return;

+ }

+ } else {

+ if(strength < resetStrength) {

+ setParseError("reset-before strength followed by a stronger relation", errorCode);

+ return;

+ }

+ int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator

+ if((result & STARRED_FLAG) == 0) {

+ parseRelationStrings(strength, i, errorCode);  // ordinary relation: prefix | str / extension

+ } else {

+ parseStarredCharacters(strength, i, errorCode);  // starred relation: a list of characters and ranges

+ }

+ if(U_FAILURE(errorCode)) { return; }

+ isFirstRelation = FALSE;

+ }

+int32_t  // Parses the reset after '&', with optional "[before n]", reports it to the sink and returns the reset strength (UCOL_DEFAULT on error).

+CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }

+ int32_t i = skipWhiteSpace(ruleIndex + 1);  // + 1 steps over the character at ruleIndex (the '&' when called via parse())

+ int32_t j;

+ UChar c;

+ int32_t resetStrength;

+ if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&  // literal "[before"

+ (j = i + BEFORE_LENGTH) < rules->length() &&

+ PatternProps::isWhiteSpace(rules->charAt(j)) &&  // at least one white space must follow "[before"

+ ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&  // room for the digit and the closing ']'

+ 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&  // digit '1'..'3'

+ rules->charAt(j + 1) == 0x5d) {  // ']' must follow the digit directly

+ // &[before n] with n=1 or 2 or 3

+ resetStrength = UCOL_PRIMARY + (c - 0x31);

+ i = skipWhiteSpace(j + 2);

+ } else {

+ resetStrength = UCOL_IDENTICAL;  // plain reset; parseRuleChain treats only smaller values as reset-before

+ }

+ if(i >= rules->length()) {

+ setParseError("reset without position", errorCode);

+ return UCOL_DEFAULT;

+ }

+ UnicodeString str;

+ if(rules->charAt(i) == 0x5b) { // '['

+ i = parseSpecialPosition(i, str, errorCode);

+ } else {

+ i = parseTailoringString(i, str, errorCode);

+ }

+ sink->addReset(resetStrength, str, errorReason, errorCode);  // NOTE(review): called without re-checking errorCode here; presumably addReset returns early on failure — confirm

+ if(U_FAILURE(errorCode)) { setErrorContext(); }

+ ruleIndex = i;

+ return resetStrength;

+int32_t  // Recognizes a relation operator at ruleIndex (after white space). Returns (operator length << OFFSET_SHIFT) | strength [| STARRED_FLAG], or UCOL_DEFAULT if there is none.

+CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }

+ ruleIndex = skipWhiteSpace(ruleIndex);  // ruleIndex itself moves past leading white space only, never past the operator

+ if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }

+ int32_t strength;

+ int32_t i = ruleIndex;  // i runs ahead of ruleIndex to measure the operator length

+ UChar c = rules->charAt(i++);

+ switch(c) {

+ case 0x3c: // '<'

+ if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<

+ ++i;

+ if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<

+ ++i;

+ if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<

+ ++i;

+ strength = UCOL_QUATERNARY;

+ } else {

+ strength = UCOL_TERTIARY;

+ }

+ } else {

+ strength = UCOL_SECONDARY;

+ }

+ } else {

+ strength = UCOL_PRIMARY;

+ }

+ if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'

+ ++i;

+ strength |= STARRED_FLAG;

+ }

+ break;

+ case 0x3b: // ';' same as <<

+ strength = UCOL_SECONDARY;  // no '*' check in this branch

+ break;

+ case 0x2c: // ',' same as <<<

+ strength = UCOL_TERTIARY;  // no '*' check in this branch

+ break;

+ case 0x3d: // '='

+ strength = UCOL_IDENTICAL;

+ if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'

+ ++i;

+ strength |= STARRED_FLAG;

+ }

+ break;

+ default:

+ return UCOL_DEFAULT;  // not a relation operator

+ }

+ return ((i - ruleIndex) << OFFSET_SHIFT) | strength;  // the caller shifts the length back out with >> OFFSET_SHIFT

+void  // Parses the operand of a non-starred relation starting at index i, passes it to the sink and advances ruleIndex.

+CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {

+ // Parse

+ // prefix | str / extension

+ // where prefix and extension are optional.

+ UnicodeString prefix, str, extension;

+ i = parseTailoringString(i, str, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ UChar next = (i < rules->length()) ? rules->charAt(i) : 0;  // 0 stands for "end of rules"

+ if(next == 0x7c) { // '|' separates the context prefix from the string.

+ prefix = str;  // what was parsed first turns out to be the prefix

+ i = parseTailoringString(i + 1, str, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ next = (i < rules->length()) ? rules->charAt(i) : 0;

+ }

+ if(next == 0x2f) { // '/' separates the string from the extension.

+ i = parseTailoringString(i + 1, extension, errorCode);

+ }

+ if(!prefix.isEmpty()) {

+ UChar32 prefix0 = prefix.char32At(0);

+ UChar32 c = str.char32At(0);

+ if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {

+ setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",

+ errorCode);

+ return;

+ }

+ sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);  // NOTE(review): reached even if parsing the extension failed; presumably addRelation returns early on failure — confirm

+ if(U_FAILURE(errorCode)) { setErrorContext(); }

+ ruleIndex = i;

+void  // Parses the operand of a starred relation starting at index i: adds one relation per code point and expands '-' ranges.

+CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {

+ UnicodeString empty, raw;  // `empty` is passed as both prefix and extension of every relation added here

+ i = parseString(skipWhiteSpace(i), raw, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ if(raw.isEmpty()) {

+ setParseError("missing starred-relation string", errorCode);

+ return;

+ }

+ UChar32 prev = -1;  // last code point added; -1 means there is no usable range start

+ int32_t j = 0;  // index into raw of the next code point to add

+ for(;;) {

+ while(j < raw.length()) {  // add each remaining code point of raw as its own relation

+ UChar32 c = raw.char32At(j);

+ if(!nfd.isInert(c)) {

+ setParseError("starred-relation string is not all NFD-inert", errorCode);

+ return;

+ }

+ sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);

+ if(U_FAILURE(errorCode)) {

+ setErrorContext();

+ return;

+ }

+ j += U16_LENGTH(c);

+ prev = c;

+ }

+ if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'

+ break;  // no range follows: the starred operand is finished

+ }

+ if(prev < 0) {

+ setParseError("range without start in starred-relation string", errorCode);

+ return;

+ }

+ i = parseString(i + 1, raw, errorCode);  // the text after '-' starts with the range end

+ if(U_FAILURE(errorCode)) { return; }

+ if(raw.isEmpty()) {

+ setParseError("range without end in starred-relation string", errorCode);

+ return;

+ }

+ UChar32 c = raw.char32At(0);  // range end

+ if(c < prev) {

+ setParseError("range start greater than end in starred-relation string", errorCode);

+ return;

+ }

+ // range prev-c

+ UnicodeString s;

+ while(++prev <= c) {  // begins at prev + 1 because prev itself was already added above

+ if(!nfd.isInert(prev)) {

+ setParseError("starred-relation string range is not all NFD-inert", errorCode);

+ return;

+ }

+ if(U_IS_SURROGATE(prev)) {

+ setParseError("starred-relation string range contains a surrogate", errorCode);

+ return;

+ }

+ if(0xfffd <= prev && prev <= 0xffff) {

+ setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);

+ return;

+ }

+ s.setTo(prev);

+ sink->addRelation(strength, empty, s, empty, errorReason, errorCode);

+ if(U_FAILURE(errorCode)) {

+ setErrorContext();

+ return;

+ }

+ prev = -1;  // a range end may not serve as the start of another range

+ j = U16_LENGTH(c);  // the first code point of raw (the range end) has been handled; resume after it

+ }

+ ruleIndex = skipWhiteSpace(i);

+int32_t  // Skips white space, parses one string into `raw`, reports an error if it is empty, and returns the index after trailing white space.

+CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {

+ i = parseString(skipWhiteSpace(i), raw, errorCode);

+ if(U_SUCCESS(errorCode) && raw.isEmpty()) {  // only report "missing" when parseString itself did not fail

+ setParseError("missing relation string", errorCode);

+ }

+ return skipWhiteSpace(i);

+int32_t  // Reads one string starting at index i into `raw`, resolving apostrophe quoting and backslash escapes; returns the index where reading stopped.

+CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return i; }

+ raw.remove();  // start from an empty result

+ while(i < rules->length()) {

+ UChar32 c = rules->charAt(i++);  // a single UTF-16 code unit, read with charAt

+ if(isSyntaxChar(c)) {

+ if(c == 0x27) { // apostrophe

+ if(i < rules->length() && rules->charAt(i) == 0x27) {

+ // Double apostrophe, encodes a single one.

+ raw.append((UChar)0x27);

+ ++i;

+ continue;

+ }

+ // Quote literal text until the next single apostrophe.

+ for(;;) {

+ if(i == rules->length()) {

+ setParseError("quoted literal text missing terminating apostrophe", errorCode);

+ return i;

+ }

+ c = rules->charAt(i++);

+ if(c == 0x27) {

+ if(i < rules->length() && rules->charAt(i) == 0x27) {

+ // Double apostrophe inside quoted literal text,

+ // still encodes a single apostrophe.

+ ++i;

+ } else {

+ break;  // lone apostrophe: end of the quoted text

+ }

+ raw.append((UChar)c);

+ }

+ } else if(c == 0x5c) { // backslash

+ if(i == rules->length()) {

+ setParseError("backslash escape at the end of the rule string", errorCode);

+ return i;

+ }

+ c = rules->char32At(i);  // the escaped character is taken as a whole code point

+ raw.append(c);

+ i += U16_LENGTH(c);

+ } else {

+ // Any other syntax character terminates a string.

+ --i;  // leave the terminator for the caller to see

+ break;

+ }

+ } else if(PatternProps::isWhiteSpace(c)) {

+ // Unquoted white space terminates a string.

+ --i;  // leave the white space for the caller to see

+ break;

+ } else {

+ raw.append((UChar)c);

+ }

+ for(int32_t j = 0; j < raw.length();) {  // validate the collected string, one code point at a time

+ UChar32 c = raw.char32At(j);

+ if(U_IS_SURROGATE(c)) {

+ setParseError("string contains an unpaired surrogate", errorCode);

+ return i;

+ }

+ if(0xfffd <= c && c <= 0xffff) {

+ setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);

+ return i;

+ }

+ j += U16_LENGTH(c);

+ }

+ return i;

+namespace {  // file-local table used by parseSpecialPosition

+static const char *const positions[] = {  // the array index of a matching name is added to POS_BASE in parseSpecialPosition

+ "first tertiary ignorable",

+ "last tertiary ignorable",

+ "first secondary ignorable",

+ "last secondary ignorable",

+ "first primary ignorable",

+ "last primary ignorable",

+ "first variable",

+ "last variable",

+ "first regular",

+ "last regular",

+ "first implicit",

+ "last implicit",

+ "first trailing",

+ "last trailing"

+};

+} // namespace

+int32_t  // Parses a bracketed special reset position at index i and encodes it into `str` as POS_LEAD followed by POS_BASE + position; returns the index after ']'.

+CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return 0; }

+ UnicodeString raw;

+ int32_t j = readWords(i + 1, raw);  // i + 1 skips the opening '['; readWords returns 0 if it hits the end of the rules

+ if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]

+ ++j;

+ for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {

+ if(raw == UnicodeString(positions[pos], -1, US_INV)) {

+ str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));

+ return j;

+ }

+ if(raw == UNICODE_STRING_SIMPLE("top")) {  // "top" is encoded exactly like "last regular"

+ str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));

+ return j;

+ }

+ if(raw == UNICODE_STRING_SIMPLE("variable top")) {  // "variable top" is encoded exactly like "last variable"

+ str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));

+ return j;

+ }

+ setParseError("not a valid special reset position", errorCode);

+ return i;  // on error the index is left at the opening '['

+void  // Parses one bracketed setting/option at ruleIndex, applies it, and moves ruleIndex past it; reports a parse error for anything unrecognized.

+CollationRuleParser::parseSetting(UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ UnicodeString raw;

+ int32_t i = ruleIndex + 1;  // first index after the '['

+ int32_t j = readWords(i, raw);  // raw receives the words with runs of white space collapsed to single spaces

+ if(j <= i || raw.isEmpty()) {

+ setParseError("expected a setting/option at '['", errorCode);  // NOTE(review): no return follows, so the code below still runs with this j (readWords returns 0 at end of input) — confirm that is intended

+ }

+ if(rules->charAt(j) == 0x5d) { // words end with ]

+ ++j;

+ if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&

+ (raw.length() == 7 || raw.charAt(7) == 0x20)) {  // exactly "reorder", or "reorder" followed by a space

+ parseReordering(raw, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {

+ settings->setFlag(CollationSettings::BACKWARD_SECONDARY,

+ UCOL_ON, 0, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ UnicodeString v;  // value part: the text after the last space, if there is one

+ int32_t valueIndex = raw.lastIndexOf((UChar)0x20);

+ if(valueIndex >= 0) {

+ v.setTo(raw, valueIndex + 1);

+ raw.truncate(valueIndex);  // raw now holds only the setting name

+ }

+ if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {

+ int32_t value = UCOL_DEFAULT;  // UCOL_DEFAULT doubles as "value not recognized" in the branches below

+ UChar c = v.charAt(0);

+ if(0x31 <= c && c <= 0x34) { // 1..4

+ value = UCOL_PRIMARY + (c - 0x31);

+ } else if(c == 0x49) { // 'I'

+ value = UCOL_IDENTICAL;

+ }

+ if(value != UCOL_DEFAULT) {

+ settings->setStrength(value, 0, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {

+ UColAttributeValue value = UCOL_DEFAULT;

+ if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {

+ value = UCOL_NON_IGNORABLE;

+ } else if(v == UNICODE_STRING_SIMPLE("shifted")) {

+ value = UCOL_SHIFTED;

+ }

+ if(value != UCOL_DEFAULT) {

+ settings->setAlternateHandling(value, 0, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {

+ int32_t value = UCOL_DEFAULT;

+ if(v == UNICODE_STRING_SIMPLE("space")) {

+ value = CollationSettings::MAX_VAR_SPACE;

+ } else if(v == UNICODE_STRING_SIMPLE("punct")) {

+ value = CollationSettings::MAX_VAR_PUNCT;

+ } else if(v == UNICODE_STRING_SIMPLE("symbol")) {

+ value = CollationSettings::MAX_VAR_SYMBOL;

+ } else if(v == UNICODE_STRING_SIMPLE("currency")) {

+ value = CollationSettings::MAX_VAR_CURRENCY;

+ }

+ if(value != UCOL_DEFAULT) {

+ settings->setMaxVariable(value, 0, errorCode);

+ settings->variableTop = baseData->getLastPrimaryForGroup(  // variableTop is recomputed from the base data for the chosen group

+ UCOL_REORDER_CODE_FIRST + value);

+ U_ASSERT(settings->variableTop != 0);

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {

+ UColAttributeValue value = UCOL_DEFAULT;

+ if(v == UNICODE_STRING_SIMPLE("off")) {

+ value = UCOL_OFF;

+ } else if(v == UNICODE_STRING_SIMPLE("lower")) {

+ value = UCOL_LOWER_FIRST;

+ } else if(v == UNICODE_STRING_SIMPLE("upper")) {

+ value = UCOL_UPPER_FIRST;

+ }

+ if(value != UCOL_DEFAULT) {

+ settings->setCaseFirst(value, 0, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {

+ UColAttributeValue value = getOnOffValue(v);  // UCOL_DEFAULT unless v is "on" or "off"

+ if(value != UCOL_DEFAULT) {

+ settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {

+ UColAttributeValue value = getOnOffValue(v);

+ if(value != UCOL_DEFAULT) {

+ settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {

+ UColAttributeValue value = getOnOffValue(v);

+ if(value != UCOL_DEFAULT) {

+ settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {

+ UColAttributeValue value = getOnOffValue(v);

+ if(value != UCOL_DEFAULT) {

+ if(value == UCOL_ON) {  // "off" is accepted without changing any setting; "on" is rejected

+ setParseError("[hiraganaQ on] is not supported", errorCode);

+ }

+ ruleIndex = j;

+ return;

+ }

+ } else if(raw == UNICODE_STRING_SIMPLE("import")) {

+ CharString lang;

+ lang.appendInvariantChars(v, errorCode);

+ if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }  // other failures are picked up by the U_FAILURE check after uloc_forLanguageTag

+ // BCP 47 language tag -> ICU locale ID

+ char localeID[ULOC_FULLNAME_CAPACITY];

+ int32_t parsedLength;

+ int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,

+ &parsedLength, &errorCode);

+ if(U_FAILURE(errorCode) ||

+ parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {  // the whole tag must parse and the result must fit with room to spare

+ errorCode = U_ZERO_ERROR;  // cleared so that setParseError, which does nothing on failure, can record this error

+ setParseError("expected language tag in [import langTag]", errorCode);

+ return;

+ }

+ // localeID minus all keywords

+ char baseID[ULOC_FULLNAME_CAPACITY];

+ length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);

+ if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {  // NOTE(review): compares against ULOC_KEYWORDS_CAPACITY although baseID has ULOC_FULLNAME_CAPACITY — confirm which limit is meant

+ errorCode = U_ZERO_ERROR;

+ setParseError("expected language tag in [import langTag]", errorCode);

+ return;

+ }

+ if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {  // a base name of exactly "und" is replaced by "root"

+ uprv_strcpy(baseID, "root");

+ }

+ // @collation=type, or length=0 if not specified

+ char collationType[ULOC_KEYWORDS_CAPACITY];

+ length = uloc_getKeywordValue(localeID, "collation",

+ collationType, ULOC_KEYWORDS_CAPACITY,

+ &errorCode);

+ if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {

+ errorCode = U_ZERO_ERROR;

+ setParseError("expected language tag in [import langTag]", errorCode);

+ return;

+ }

+ if(importer == NULL) {

+ setParseError("[import langTag] is not supported", errorCode);

+ } else {

+ UnicodeString importedRules;

+ importer->getRules(baseID, length > 0 ? collationType : "standard",  // "standard" is used when the tag has no collation keyword

+ importedRules, errorReason, errorCode);

+ if(U_FAILURE(errorCode)) {

+ if(errorReason == NULL) {

+ errorReason = "[import langTag] failed";

+ }

+ setErrorContext();

+ return;

+ }

+ const UnicodeString *outerRules = rules;  // saved because the nested parse() overwrites rules and ruleIndex

+ int32_t outerRuleIndex = ruleIndex;

+ parse(importedRules, errorCode);  // recursive parse of the imported rule string

+ if(U_FAILURE(errorCode)) {

+ if(parseError != NULL) {

+ parseError->offset = outerRuleIndex;  // report the position of the [import] in the outer rules

+ }

+ rules = outerRules;  // NOTE(review): closing-brace lines seem to be missing from this listing, so it is unclear from here whether this restore runs only on failure — check the real file

+ ruleIndex = j;

+ }

+ return;

+ }

+ } else if(rules->charAt(j) == 0x5b) { // words end with [

+ UnicodeSet set;

+ j = parseUnicodeSet(j, set, errorCode);  // returns the index after the option-terminating ']'

+ if(U_FAILURE(errorCode)) { return; }

+ if(raw == UNICODE_STRING_SIMPLE("optimize")) {

+ sink->optimize(set, errorReason, errorCode);

+ if(U_FAILURE(errorCode)) { setErrorContext(); }

+ ruleIndex = j;

+ return;

+ } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {

+ sink->suppressContractions(set, errorReason, errorCode);

+ if(U_FAILURE(errorCode)) { setErrorContext(); }

+ ruleIndex = j;

+ return;

+ }

+ setParseError("not a valid setting/option", errorCode);  // reached when no branch above returned

+void  // Handles the collected words of a reorder option: converts each word to a reorder code and stores the codes and reorder table in settings.

+CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ int32_t i = 7; // after "reorder"

+ if(i == raw.length()) {

+ // empty [reorder] with no codes

+ settings->resetReordering();

+ return;

+ }

+ // Parse the codes in [reorder aa bb cc].

+ UVector32 reorderCodes(errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ CharString word;  // reused for every code word

+ while(i < raw.length()) {

+ ++i; // skip the word-separating space

+ int32_t limit = raw.indexOf((UChar)0x20, i);

+ if(limit < 0) { limit = raw.length(); }  // last word: it runs to the end of raw

+ word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ int32_t code = getReorderCode(word.data());  // -1 if the word is not recognized

+ if(code < 0) {

+ setParseError("unknown script or reorder code", errorCode);

+ return;

+ }

+ reorderCodes.addElement(code, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ i = limit;

+ }

+ int32_t length = reorderCodes.size();

+ if(length == 1 && reorderCodes.elementAti(0) == UCOL_REORDER_CODE_NONE) {  // a single NONE code is treated like an empty [reorder]

+ settings->resetReordering();

+ return;

+ }

+ uint8_t table[256];  // filled in by makeReorderTable and then passed to setReordering

+ baseData->makeReorderTable(reorderCodes.getBuffer(), length, table, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ if(!settings->setReordering(reorderCodes.getBuffer(), length, table)) {  // a false return is reported as an allocation failure

+ errorCode = U_MEMORY_ALLOCATION_ERROR;

+ }

+static const char *const gSpecialReorderCodes[] = {  // order matters: getReorderCode returns UCOL_REORDER_CODE_FIRST + array index

+ "space", "punct", "symbol", "currency", "digit"

+};

+int32_t  // Maps a word to a reorder code: special group name, then script property value name, then "others"; returns -1 if nothing matches.

+CollationRuleParser::getReorderCode(const char *word) {

+ for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {

+ if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {  // a zero result from uprv_stricmp counts as a match

+ return UCOL_REORDER_CODE_FIRST + i;

+ }

+ int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);  // look the word up as a UCHAR_SCRIPT property value name

+ if(script >= 0) {

+ return script;

+ }

+ if(uprv_stricmp(word, "others") == 0) {

+ return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN

+ }

+ return -1;

+UColAttributeValue  // Maps exactly "on" to UCOL_ON and exactly "off" to UCOL_OFF; any other string gives UCOL_DEFAULT.

+CollationRuleParser::getOnOffValue(const UnicodeString &s) {

+ if(s == UNICODE_STRING_SIMPLE("on")) {

+ return UCOL_ON;

+ } else if(s == UNICODE_STRING_SIMPLE("off")) {

+ return UCOL_OFF;

+ } else {

+ return UCOL_DEFAULT;  // callers in parseSetting treat this as "value not recognized"

+ }

+int32_t  // Extracts a bracket-balanced UnicodeSet pattern starting at index i, applies it to `set`, and requires a following ']'; returns the index after that ']'.

+CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {

+ // Collect a UnicodeSet pattern between a balanced pair of [brackets].

+ int32_t level = 0;  // current bracket nesting depth

+ int32_t j = i;

+ for(;;) {

+ if(j == rules->length()) {

+ setParseError("unbalanced UnicodeSet pattern brackets", errorCode);

+ return j;

+ }

+ UChar c = rules->charAt(j++);

+ if(c == 0x5b) { // '['

+ ++level;

+ } else if(c == 0x5d) { // ']'

+ if(--level == 0) { break; }  // the bracket opened at index i is now closed

+ }

+ set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);  // the substring includes the outer brackets

+ if(U_FAILURE(errorCode)) {

+ errorCode = U_ZERO_ERROR;  // cleared so that setParseError, which does nothing on failure, can record this error

+ setParseError("not a valid UnicodeSet pattern", errorCode);

+ return j;

+ }

+ j = skipWhiteSpace(j);

+ if(j == rules->length() || rules->charAt(j) != 0x5d) {

+ setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);

+ return j;

+ }

+ return ++j;  // index just past the option-terminating ']'

+int32_t  // Collects words from index i into `raw` with single-space separators, stopping at a syntax character other than '-' or '_'; returns that character's index, or 0 at end of input.

+CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {

+ static const UChar sp = 0x20;

+ raw.remove();

+ i = skipWhiteSpace(i);  // leading white space adds nothing to raw

+ for(;;) {

+ if(i >= rules->length()) { return 0; }  // ran off the end without finding a terminating syntax character

+ UChar c = rules->charAt(i);

+ if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_

+ if(raw.isEmpty()) { return i; }

+ if(raw.endsWith(&sp, 1)) { // remove trailing space

+ raw.truncate(raw.length() - 1);

+ }

+ return i;

+ }

+ if(PatternProps::isWhiteSpace(c)) {

+ raw.append(0x20);  // a whole run of white space becomes one U+0020

+ i = skipWhiteSpace(i + 1);

+ } else {

+ raw.append(c);

+ ++i;

+ }

+int32_t  // Returns the index just after the first line terminator found at or after i, or the rules length if there is none.

+CollationRuleParser::skipComment(int32_t i) const {

+ // skip to past the newline

+ while(i < rules->length()) {

+ UChar c = rules->charAt(i++);  // i already points past c when the loop breaks

+ // LF or FF or CR or NEL or LS or PS

+ if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {

+ // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."

+ // NLF (new line function) = CR or LF or CR+LF or NEL.

+ // No need to collect all of CR+LF because a following LF will be ignored anyway.

+ break;

+ }

+ return i;

+void  // Records a parse error: sets errorCode and errorReason and fills in the error context. Does nothing if errorCode already indicates failure.

+CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }  // the first error wins; later calls leave code and reason unchanged

+ // Error code consistent with the old parser (from ca. 2001),

+ // rather than U_PARSE_ERROR;

+ errorCode = U_INVALID_FORMAT_ERROR;

+ errorReason = reason;  // the pointer is stored as-is, not copied

+ if(parseError != NULL) { setErrorContext(); }

+void  // Fills *parseError with the offset (ruleIndex) and with snippets of the rule text before and after that offset.

+CollationRuleParser::setErrorContext() {

+ if(parseError == NULL) { return; }

+ // Note: This relies on the calling code maintaining the ruleIndex

+ // at a position that is useful for debugging.

+ // For example, at the beginning of a reset or relation etc.

+ parseError->offset = ruleIndex;

+ parseError->line = 0; // We are not counting line numbers.

+ // before ruleIndex

+ int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);  // at most U_PARSE_CONTEXT_LEN - 1 code units, leaving room for the terminating 0

+ if(start < 0) {

+ start = 0;

+ } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {

+ ++start;  // do not begin the snippet with a trail surrogate

+ }

+ int32_t length = ruleIndex - start;

+ rules->extract(start, length, parseError->preContext);

+ parseError->preContext[length] = 0;

+ // starting from ruleIndex

+ length = rules->length() - ruleIndex;

+ if(length >= U_PARSE_CONTEXT_LEN) {

+ length = U_PARSE_CONTEXT_LEN - 1;

+ if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {

+ --length;  // do not end the snippet with a lead surrogate

+ }

+ rules->extract(ruleIndex, length, parseError->postContext);

+ parseError->postContext[length] = 0;

+UBool  // True for code points 0x21..0x7e that lie outside 0x30..0x39 (ASCII digits), 0x41..0x5a (A-Z) and 0x61..0x7a (a-z).

+CollationRuleParser::isSyntaxChar(UChar32 c) {

+ return 0x21 <= c && c <= 0x7e &&

+ (c <= 0x2f || (0x3a <= c && c <= 0x40) ||  // 0x21..0x2f and 0x3a..0x40

+ (0x5b <= c && c <= 0x60) || (0x7b <= c));  // 0x5b..0x60 and 0x7b..0x7e

+int32_t  // Returns the first index at or after i whose character is not PatternProps white space, or the rules length if there is none.

+CollationRuleParser::skipWhiteSpace(int32_t i) const {

+ while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {

+ ++i;

+ }

+ return i;

+U_NAMESPACE_END

+#endif // !UCONFIG_NO_COLLATION

« no previous file with comments | « source/i18n/collationruleparser.h ('k') | source/i18n/collationsets.h » ('j') | no next file with comments »