icu46/source/i18n/rbt_pars.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/rbt_pars.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 1999-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * Date Name Description

	7 * 11/17/99 aliu Creation.

	8 **********************************************************************

	9 */

	10

	11 #include "unicode/utypes.h"

	12

	13 #if !UCONFIG_NO_TRANSLITERATION

	14

	15 #include "unicode/uobject.h"

	16 #include "unicode/parseerr.h"

	17 #include "unicode/parsepos.h"

	18 #include "unicode/putil.h"

	19 #include "unicode/uchar.h"

	20 #include "unicode/ustring.h"

	21 #include "unicode/uniset.h"

	22 #include "cstring.h"

	23 #include "funcrepl.h"

	24 #include "hash.h"

	25 #include "quant.h"

	26 #include "rbt.h"

	27 #include "rbt_data.h"

	28 #include "rbt_pars.h"

	29 #include "rbt_rule.h"

	30 #include "strmatch.h"

	31 #include "strrepl.h"

	32 #include "unicode/symtable.h"

	33 #include "tridpars.h"

	34 #include "uvector.h"

	35 #include "hash.h"

	36 #include "util.h"

	37 #include "cmemory.h"

	38 #include "uprops.h"

	39 #include "putilimp.h"

	40

	41 // Operators

	42 #define VARIABLE_DEF_OP ((UChar)0x003D) /=/

	43 #define FORWARD_RULE_OP ((UChar)0x003E) />/

	44 #define REVERSE_RULE_OP ((UChar)0x003C) /</

	45 #define FWDREV_RULE_OP ((UChar)0x007E) /~/ // internal rep of <> op

	46

	47 // Other special characters

	48 #define QUOTE ((UChar)0x0027) /'/

	49 #define ESCAPE ((UChar)0x005C) /\/

	50 #define END_OF_RULE ((UChar)0x003B) /;/

	51 #define RULE_COMMENT_CHAR ((UChar)0x0023) /#/

	52

	53 #define SEGMENT_OPEN ((UChar)0x0028) /(/

	54 #define SEGMENT_CLOSE ((UChar)0x0029) /)/

	55 #define CONTEXT_ANTE ((UChar)0x007B) /{/

	56 #define CONTEXT_POST ((UChar)0x007D) /}/

	57 #define CURSOR_POS ((UChar)0x007C) /\|/

	58 #define CURSOR_OFFSET ((UChar)0x0040) /@/

	59 #define ANCHOR_START ((UChar)0x005E) /^/

	60 #define KLEENE_STAR ((UChar)0x002A) /***/

	61 #define ONE_OR_MORE ((UChar)0x002B) /+/

	62 #define ZERO_OR_ONE ((UChar)0x003F) /?/

	63

	64 #define DOT ((UChar)46) /./

	65

	66 static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";

	67 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,

	68 108, 58, 93, 92, 114, 92, 110, 36, 93, 0

	69 };

	70

	71 // A function is denoted &Source-Target/Variant(text)

	72 #define FUNCTION ((UChar)38) /&/

	73

	74 // Aliases for some of the syntax characters. These are provided so

	75 // transliteration rules can be expressed in XML without clashing with

	76 // XML syntax characters '<', '>', and '&'.

	77 #define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow

	78 #define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow

	79 #define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow

	80 #define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta)

	81

	82 // Special characters disallowed at the top level

	83 static const UChar ILLEGAL_TOP[] = {41,0}; // ")"

	84

	85 // Special characters disallowed within a segment

	86 static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}\|@"

	87

	88 // Special characters disallowed within a function argument

	89 static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^( .*+?{}\|@"

	90

	91 // By definition, the ANCHOR_END special character is a

	92 // trailing SymbolTable.SYMBOL_REF character.

	93 // private static final char ANCHOR_END = '$';

	94

	95 static const UChar gOPERATORS[] = { // "=><"

	96 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,

	97 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,

	98 0

	99 };

	100

	101 static const UChar HALF_ENDERS[] = { // "=><;"

	102 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,

	103 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,

	104 END_OF_RULE,

	105 0

	106 };

	107

	108 // These are also used in Transliterator::toRules()

	109 static const int32_t ID_TOKEN_LEN = 2;

	110 static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'

	111

	112 /*

	113 commented out until we do real ::BEGIN/::END functionality

	114 static const int32_t BEGIN_TOKEN_LEN = 5;

	115 static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'

	116

	117 static const int32_t END_TOKEN_LEN = 3;

	118 static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'

	119 */

	120

	121 U_NAMESPACE_BEGIN

	122

	123 //----------------------------------------------------------------------

	124 // BEGIN ParseData

	125 //----------------------------------------------------------------------

	126

	127 /**

	128 * This class implements the SymbolTable interface. It is used

	129 * during parsing to give UnicodeSet access to variables that

	130 * have been defined so far. Note that it uses variablesVector,

	131 * _not_ data.setVariables.

	132 */

	133 class ParseData : public UMemory, public SymbolTable {

	134 public:

	135 const TransliterationRuleData* data; // alias

	136

	137 const UVector* variablesVector; // alias

	138

	139 const Hashtable* variableNames; // alias

	140

	141 ParseData(const TransliterationRuleData* data = 0,

	142 const UVector* variablesVector = 0,

	143 const Hashtable* variableNames = 0);

	144

	145 virtual const UnicodeString* lookup(const UnicodeString& s) const;

	146

	147 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;

	148

	149 virtual UnicodeString parseReference(const UnicodeString& text,

	150 ParsePosition& pos, int32_t limit) cons t;

	151 /**

	152 * Return true if the given character is a matcher standin or a plain

	153 * character (non standin).

	154 */

	155 UBool isMatcher(UChar32 ch);

	156

	157 /**

	158 * Return true if the given character is a replacer standin or a plain

	159 * character (non standin).

	160 */

	161 UBool isReplacer(UChar32 ch);

	162

	163 private:

	164 ParseData(const ParseData &other); // forbid copying of this class

	165 ParseData &operator=(const ParseData &other); // forbid copying of this clas s

	166 };

	167

	168 ParseData::ParseData(const TransliterationRuleData* d,

	169 const UVector* sets,

	170 const Hashtable* vNames) :

	171 data(d), variablesVector(sets), variableNames(vNames) {}

	172

	173 /**

	174 * Implement SymbolTable API.

	175 */

	176 const UnicodeString* ParseData::lookup(const UnicodeString& name) const {

	177 return (const UnicodeString*) variableNames->get(name);

	178 }

	179

	180 /**

	181 * Implement SymbolTable API.

	182 */

	183 const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const {

	184 // Note that we cannot use data.lookupSet() because the

	185 // set array has not been constructed yet.

	186 const UnicodeFunctor* set = NULL;

	187 int32_t i = ch - data->variablesBase;

	188 if (i >= 0 && i < variablesVector->size()) {

	189 int32_t i = ch - data->variablesBase;

	190 set = (i < variablesVector->size()) ?

	191 (UnicodeFunctor*) variablesVector->elementAt(i) : 0;

	192 }

	193 return set;

	194 }

	195

	196 /**

	197 * Implement SymbolTable API. Parse out a symbol reference

	198 * name.

	199 */

	200 UnicodeString ParseData::parseReference(const UnicodeString& text,

	201 ParsePosition& pos, int32_t limit) const {

	202 int32_t start = pos.getIndex();

	203 int32_t i = start;

	204 UnicodeString result;

	205 while (i < limit) {

	206 UChar c = text.charAt(i);

	207 if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {

	208 break;

	209 }

	210 ++i;

	211 }

	212 if (i == start) { // No valid name chars

	213 return result; // Indicate failure with empty string

	214 }

	215 pos.setIndex(i);

	216 text.extractBetween(start, i, result);

	217 return result;

	218 }

	219

	220 UBool ParseData::isMatcher(UChar32 ch) {

	221 // Note that we cannot use data.lookup() because the

	222 // set array has not been constructed yet.

	223 int32_t i = ch - data->variablesBase;

	224 if (i >= 0 && i < variablesVector->size()) {

	225 UnicodeFunctor f = (UnicodeFunctor) variablesVector->elementAt(i);

	226 return f != NULL && f->toMatcher() != NULL;

	227 }

	228 return TRUE;

	229 }

	230

	231 /**

	232 * Return true if the given character is a replacer standin or a plain

	233 * character (non standin).

	234 */

	235 UBool ParseData::isReplacer(UChar32 ch) {

	236 // Note that we cannot use data.lookup() because the

	237 // set array has not been constructed yet.

	238 int i = ch - data->variablesBase;

	239 if (i >= 0 && i < variablesVector->size()) {

	240 UnicodeFunctor f = (UnicodeFunctor) variablesVector->elementAt(i);

	241 return f != NULL && f->toReplacer() != NULL;

	242 }

	243 return TRUE;

	244 }

	245

	246 //----------------------------------------------------------------------

	247 // BEGIN RuleHalf

	248 //----------------------------------------------------------------------

	249

	250 /**

	251 * A class representing one side of a rule. This class knows how to

	252 * parse half of a rule. It is tightly coupled to the method

	253 * RuleBasedTransliterator.Parser.parseRule().

	254 */

	255 class RuleHalf : public UMemory {

	256

	257 public:

	258

	259 UnicodeString text;

	260

	261 int32_t cursor; // position of cursor in text

	262 int32_t ante; // position of ante context marker '{' in text

	263 int32_t post; // position of post context marker '}' in text

	264

	265 // Record the offset to the cursor either to the left or to the

	266 // right of the key. This is indicated by characters on the output

	267 // side that allow the cursor to be positioned arbitrarily within

	268 // the matching text. For example, abc{def} > \| @@@ xyz; changes

	269 // def to xyz and moves the cursor to before abc. Offset characters

	270 // must be at the start or end, and they cannot move the cursor past

	271 // the ante- or postcontext text. Placeholders are only valid in

	272 // output text. The length of the ante and post context is

	273 // determined at runtime, because of supplementals and quantifiers.

	274 int32_t cursorOffset; // only nonzero on output side

	275

	276 // Position of first CURSOR_OFFSET on _right_. This will be -1

	277 // for \|@, -2 for \|@@, etc., and 1 for @\|, 2 for @@\|, etc.

	278 int32_t cursorOffsetPos;

	279

	280 UBool anchorStart;

	281 UBool anchorEnd;

	282

	283 /**

	284 * The segment number from 1..n of the next '(' we see

	285 * during parsing; 1-based.

	286 */

	287 int32_t nextSegmentNumber;

	288

	289 TransliteratorParser& parser;

	290

	291 //--------------------------------------------------

	292 // Methods

	293

	294 RuleHalf(TransliteratorParser& parser);

	295 ~RuleHalf();

	296

	297 int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorC ode& status);

	298

	299 int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,

	300 UnicodeString& buf,

	301 const UnicodeString& illegal,

	302 UBool isSegment,

	303 UErrorCode& status);

	304

	305 /**

	306 * Remove context.

	307 */

	308 void removeContext();

	309

	310 /**

	311 * Return true if this half looks like valid output, that is, does not

	312 * contain quantifiers or other special input-only elements.

	313 */

	314 UBool isValidOutput(TransliteratorParser& parser);

	315

	316 /**

	317 * Return true if this half looks like valid input, that is, does not

	318 * contain functions or other special output-only elements.

	319 */

	320 UBool isValidInput(TransliteratorParser& parser);

	321

	322 int syntaxError(UErrorCode code,

	323 const UnicodeString& rule,

	324 int32_t start,

	325 UErrorCode& status) {

	326 return parser.syntaxError(code, rule, start, status);

	327 }

	328

	329 private:

	330 // Disallowed methods; no impl.

	331 RuleHalf(const RuleHalf&);

	332 RuleHalf& operator=(const RuleHalf&);

	333 };

	334

	335 RuleHalf::RuleHalf(TransliteratorParser& p) :

	336 parser(p)

	337 {

	338 cursor = -1;

	339 ante = -1;

	340 post = -1;

	341 cursorOffset = 0;

	342 cursorOffsetPos = 0;

	343 anchorStart = anchorEnd = FALSE;

	344 nextSegmentNumber = 1;

	345 }

	346

	347 RuleHalf::~RuleHalf() {

	348 }

	349

	350 /**

	351 * Parse one side of a rule, stopping at either the limit,

	352 * the END_OF_RULE character, or an operator.

	353 * @return the index after the terminating character, or

	354 * if limit was reached, limit

	355 */

	356 int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, U ErrorCode& status) {

	357 int32_t start = pos;

	358 text.truncate(0);

	359 pos = parseSection(rule, pos, limit, text, ILLEGAL_TOP, FALSE, status);

	360

	361 if (cursorOffset > 0 && cursor != cursorOffsetPos) {

	362 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);

	363 }

	364

	365 return pos;

	366 }

	367

	368 /**

	369 * Parse a section of one side of a rule, stopping at either

	370 * the limit, the END_OF_RULE character, an operator, or a

	371 * segment close character. This method parses both a

	372 * top-level rule half and a segment within such a rule half.

	373 * It calls itself recursively to parse segments and nested

	374 * segments.

	375 * @param buf buffer into which to accumulate the rule pattern

	376 * characters, either literal characters from the rule or

	377 * standins for UnicodeMatcher objects including segments.

	378 * @param illegal the set of special characters that is illegal during

	379 * this parse.

	380 * @param isSegment if true, then we've already seen a '(' and

	381 * pos on entry points right after it. Accumulate everything

	382 * up to the closing ')', put it in a segment matcher object,

	383 * generate a standin for it, and add the standin to buf. As

	384 * a side effect, update the segments vector with a reference

	385 * to the segment matcher. This works recursively for nested

	386 * segments. If isSegment is false, just accumulate

	387 * characters into buf.

	388 * @return the index after the terminating character, or

	389 * if limit was reached, limit

	390 */

	391 int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l imit,

	392 UnicodeString& buf,

	393 const UnicodeString& illegal,

	394 UBool isSegment, UErrorCode& status) {

	395 int32_t start = pos;

	396 ParsePosition pp;

	397 UnicodeString scratch;

	398 UBool done = FALSE;

	399 int32_t quoteStart = -1; // Most recent 'single quoted string'

	400 int32_t quoteLimit = -1;

	401 int32_t varStart = -1; // Most recent $variableReference

	402 int32_t varLimit = -1;

	403 int32_t bufStart = buf.length();

	404

	405 while (pos < limit && !done) {

	406 // Since all syntax characters are in the BMP, fetching

	407 // 16-bit code units suffices here.

	408 UChar c = rule.charAt(pos++);

	409 if (uprv_isRuleWhiteSpace(c)) {

	410 // Ignore whitespace. Note that this is not Unicode

	411 // spaces, but Java spaces -- a subset, representing

	412 // whitespace likely to be seen in code.

	413 continue;

	414 }

	415 if (u_strchr(HALF_ENDERS, c) != NULL) {

	416 if (isSegment) {

	417 // Unclosed segment

	418 return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status);

	419 }

	420 break;

	421 }

	422 if (anchorEnd) {

	423 // Text after a presumed end anchor is a syntax err

	424 return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, stat us);

	425 }

	426 if (UnicodeSet::resemblesPattern(rule, pos-1)) {

	427 pp.setIndex(pos-1); // Backup to opening '['

	428 buf.append(parser.parseSet(rule, pp, status));

	429 if (U_FAILURE(status)) {

	430 return syntaxError(U_MALFORMED_SET, rule, start, status);

	431 }

	432 pos = pp.getIndex();

	433 continue;

	434 }

	435 // Handle escapes

	436 if (c == ESCAPE) {

	437 if (pos == limit) {

	438 return syntaxError(U_TRAILING_BACKSLASH, rule, start, status);

	439 }

	440 UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\'

	441 if (escaped == (UChar32) -1) {

	442 return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, stat us);

	443 }

	444 if (!parser.checkVariableRange(escaped)) {

	445 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status );

	446 }

	447 buf.append(escaped);

	448 continue;

	449 }

	450 // Handle quoted matter

	451 if (c == QUOTE) {

	452 int32_t iq = rule.indexOf(QUOTE, pos);

	453 if (iq == pos) {

	454 buf.append(c); // Parse [''] outside quotes as [']

	455 ++pos;

	456 } else {

	457 /* This loop picks up a run of quoted text of the

	458 * form 'aaaa' each time through. If this run

	459 * hasn't really ended ('aaaa''bbbb') then it keeps

	460 * looping, each time adding on a new run. When it

	461 * reaches the final quote it breaks.

	462 */

	463 quoteStart = buf.length();

	464 for (;;) {

	465 if (iq < 0) {

	466 return syntaxError(U_UNTERMINATED_QUOTE, rule, start, st atus);

	467 }

	468 scratch.truncate(0);

	469 rule.extractBetween(pos, iq, scratch);

	470 buf.append(scratch);

	471 pos = iq+1;

	472 if (pos < limit && rule.charAt(pos) == QUOTE) {

	473 // Parse [''] inside quotes as [']

	474 iq = rule.indexOf(QUOTE, pos+1);

	475 // Continue looping

	476 } else {

	477 break;

	478 }

	479 }

	480 quoteLimit = buf.length();

	481

	482 for (iq=quoteStart; iq<quoteLimit; ++iq) {

	483 if (!parser.checkVariableRange(buf.charAt(iq))) {

	484 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start , status);

	485 }

	486 }

	487 }

	488 continue;

	489 }

	490

	491 if (!parser.checkVariableRange(c)) {

	492 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);

	493 }

	494

	495 if (illegal.indexOf(c) >= 0) {

	496 syntaxError(U_ILLEGAL_CHARACTER, rule, start, status);

	497 }

	498

	499 switch (c) {

	500

	501 //------------------------------------------------------

	502 // Elements allowed within and out of segments

	503 //------------------------------------------------------

	504 case ANCHOR_START:

	505 if (buf.length() == 0 && !anchorStart) {

	506 anchorStart = TRUE;

	507 } else {

	508 return syntaxError(U_MISPLACED_ANCHOR_START,

	509 rule, start, status);

	510 }

	511 break;

	512 case SEGMENT_OPEN:

	513 {

	514 // bufSegStart is the offset in buf to the first

	515 // character of the segment we are parsing.

	516 int32_t bufSegStart = buf.length();

	517

	518 // Record segment number now, since nextSegmentNumber

	519 // will be incremented during the call to parseSection

	520 // if there are nested segments.

	521 int32_t segmentNumber = nextSegmentNumber++; // 1-based

	522

	523 // Parse the segment

	524 pos = parseSection(rule, pos, limit, buf, ILLEGAL_SEG, TRUE, sta tus);

	525

	526 // After parsing a segment, the relevant characters are

	527 // in buf, starting at offset bufSegStart. Extract them

	528 // into a string matcher, and replace them with a

	529 // standin for that matcher.

	530 StringMatcher* m =

	531 new StringMatcher(buf, bufSegStart, buf.length(),

	532 segmentNumber, *parser.curData);

	533 if (m == NULL) {

	534 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, s tatus);

	535 }

	536

	537 // Record and associate object and segment number

	538 parser.setSegmentObject(segmentNumber, m, status);

	539 buf.truncate(bufSegStart);

	540 buf.append(parser.getSegmentStandin(segmentNumber, status));

	541 }

	542 break;

	543 case FUNCTION:

	544 case ALT_FUNCTION:

	545 {

	546 int32_t iref = pos;

	547 TransliteratorIDParser::SingleID* single =

	548 TransliteratorIDParser::parseFilterID(rule, iref);

	549 // The next character MUST be a segment open

	550 if (single == NULL \|\|

	551 !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) {

	552 return syntaxError(U_INVALID_FUNCTION, rule, start, status);

	553 }

	554

	555 Transliterator *t = single->createInstance();

	556 delete single;

	557 if (t == NULL) {

	558 return syntaxError(U_INVALID_FUNCTION, rule, start, status);

	559 }

	560

	561 // bufSegStart is the offset in buf to the first

	562 // character of the segment we are parsing.

	563 int32_t bufSegStart = buf.length();

	564

	565 // Parse the segment

	566 pos = parseSection(rule, iref, limit, buf, ILLEGAL_FUNC, TRUE, s tatus);

	567

	568 // After parsing a segment, the relevant characters are

	569 // in buf, starting at offset bufSegStart.

	570 UnicodeString output;

	571 buf.extractBetween(bufSegStart, buf.length(), output);

	572 FunctionReplacer *r =

	573 new FunctionReplacer(t, new StringReplacer(output, parser.cu rData));

	574 if (r == NULL) {

	575 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, s tatus);

	576 }

	577

	578 // Replace the buffer contents with a stand-in

	579 buf.truncate(bufSegStart);

	580 buf.append(parser.generateStandInFor(r, status));

	581 }

	582 break;

	583 case SymbolTable::SYMBOL_REF:

	584 // Handle variable references and segment references "$1" .. "$9"

	585 {

	586 // A variable reference must be followed immediately

	587 // by a Unicode identifier start and zero or more

	588 // Unicode identifier part characters, or by a digit

	589 // 1..9 if it is a segment reference.

	590 if (pos == limit) {

	591 // A variable ref character at the end acts as

	592 // an anchor to the context limit, as in perl.

	593 anchorEnd = TRUE;

	594 break;

	595 }

	596 // Parse "$1" "$2" .. "$9" .. (no upper limit)

	597 c = rule.charAt(pos);

	598 int32_t r = u_digit(c, 10);

	599 if (r >= 1 && r <= 9) {

	600 r = ICU_Utility::parseNumber(rule, pos, 10);

	601 if (r < 0) {

	602 return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE,

	603 rule, start, status);

	604 }

	605 buf.append(parser.getSegmentStandin(r, status));

	606 } else {

	607 pp.setIndex(pos);

	608 UnicodeString name = parser.parseData->

	609 parseReference(rule, pp, limit);

	610 if (name.length() == 0) {

	611 // This means the '$' was not followed by a

	612 // valid name. Try to interpret it as an

	613 // end anchor then. If this also doesn't work

	614 // (if we see a following character) then signal

	615 // an error.

	616 anchorEnd = TRUE;

	617 break;

	618 }

	619 pos = pp.getIndex();

	620 // If this is a variable definition statement,

	621 // then the LHS variable will be undefined. In

	622 // that case appendVariableDef() will append the

	623 // special placeholder char variableLimit-1.

	624 varStart = buf.length();

	625 parser.appendVariableDef(name, buf, status);

	626 varLimit = buf.length();

	627 }

	628 }

	629 break;

	630 case DOT:

	631 buf.append(parser.getDotStandIn(status));

	632 break;

	633 case KLEENE_STAR:

	634 case ONE_OR_MORE:

	635 case ZERO_OR_ONE:

	636 // Quantifiers. We handle single characters, quoted strings,

	637 // variable references, and segments.

	638 // a+ matches aaa

	639 // 'foo'+ matches foofoofoo

	640 // $v+ matches xyxyxy if $v == xy

	641 // (seg)+ matches segsegseg

	642 {

	643 if (isSegment && buf.length() == bufStart) {

	644 // The */+ immediately follows '('

	645 return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, stat us);

	646 }

	647

	648 int32_t qstart, qlimit;

	649 // The */+ follows an isolated character or quote

	650 // or variable reference

	651 if (buf.length() == quoteLimit) {

	652 // The */+ follows a 'quoted string'

	653 qstart = quoteStart;

	654 qlimit = quoteLimit;

	655 } else if (buf.length() == varLimit) {

	656 // The */+ follows a $variableReference

	657 qstart = varStart;

	658 qlimit = varLimit;

	659 } else {

	660 // The */+ follows a single character, possibly

	661 // a segment standin

	662 qstart = buf.length() - 1;

	663 qlimit = qstart + 1;

	664 }

	665

	666 UnicodeFunctor *m =

	667 new StringMatcher(buf, qstart, qlimit, 0, *parser.curData);

	668 if (m == NULL) {

	669 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, s tatus);

	670 }

	671 int32_t min = 0;

	672 int32_t max = Quantifier::MAX;

	673 switch (c) {

	674 case ONE_OR_MORE:

	675 min = 1;

	676 break;

	677 case ZERO_OR_ONE:

	678 min = 0;

	679 max = 1;

	680 break;

	681 // case KLEENE_STAR:

	682 // do nothing -- min, max already set

	683 }

	684 m = new Quantifier(m, min, max);

	685 if (m == NULL) {

	686 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, s tatus);

	687 }

	688 buf.truncate(qstart);

	689 buf.append(parser.generateStandInFor(m, status));

	690 }

	691 break;

	692

	693 //------------------------------------------------------

	694 // Elements allowed ONLY WITHIN segments

	695 //------------------------------------------------------

	696 case SEGMENT_CLOSE:

	697 // assert(isSegment);

	698 // We're done parsing a segment.

	699 done = TRUE;

	700 break;

	701

	702 //------------------------------------------------------

	703 // Elements allowed ONLY OUTSIDE segments

	704 //------------------------------------------------------

	705 case CONTEXT_ANTE:

	706 if (ante >= 0) {

	707 return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status );

	708 }

	709 ante = buf.length();

	710 break;

	711 case CONTEXT_POST:

	712 if (post >= 0) {

	713 return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status );

	714 }

	715 post = buf.length();

	716 break;

	717 case CURSOR_POS:

	718 if (cursor >= 0) {

	719 return syntaxError(U_MULTIPLE_CURSORS, rule, start, status);

	720 }

	721 cursor = buf.length();

	722 break;

	723 case CURSOR_OFFSET:

	724 if (cursorOffset < 0) {

	725 if (buf.length() > 0) {

	726 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, s tatus);

	727 }

	728 --cursorOffset;

	729 } else if (cursorOffset > 0) {

	730 if (buf.length() != cursorOffsetPos \|\| cursor >= 0) {

	731 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, s tatus);

	732 }

	733 ++cursorOffset;

	734 } else {

	735 if (cursor == 0 && buf.length() == 0) {

	736 cursorOffset = -1;

	737 } else if (cursor < 0) {

	738 cursorOffsetPos = buf.length();

	739 cursorOffset = 1;

	740 } else {

	741 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, s tatus);

	742 }

	743 }

	744 break;

	745

	746

	747 //------------------------------------------------------

	748 // Non-special characters

	749 //------------------------------------------------------

	750 default:

	751 // Disallow unquoted characters other than [0-9A-Za-z]

	752 // in the printable ASCII range. These characters are

	753 // reserved for possible future use.

	754 if (c >= 0x0021 && c <= 0x007E &&

	755 !((c >= 0x0030/'0'/ && c <= 0x0039/'9'/) \|\|

	756 (c >= 0x0041/'A'/ && c <= 0x005A/'Z'/) \|\|

	757 (c >= 0x0061/'a'/ && c <= 0x007A/'z'/))) {

	758 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);

	759 }

	760 buf.append(c);

	761 break;

	762 }

	763 }

	764

	765 return pos;

	766 }

	767

	768 /**

	769 * Remove context.

	770 */

	771 void RuleHalf::removeContext() {

	772 //text = text.substring(ante < 0 ? 0 : ante,

	773 // post < 0 ? text.length() : post);

	774 if (post >= 0) {

	775 text.remove(post);

	776 }

	777 if (ante >= 0) {

	778 text.removeBetween(0, ante);

	779 }

	780 ante = post = -1;

	781 anchorStart = anchorEnd = FALSE;

	782 }

	783

	784 /**

	785 * Return true if this half looks like valid output, that is, does not

	786 * contain quantifiers or other special input-only elements.

	787 */

	788 UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {

	789 for (int32_t i=0; i<text.length(); ) {

	790 UChar32 c = text.char32At(i);

	791 i += UTF_CHAR_LENGTH(c);

	792 if (!transParser.parseData->isReplacer(c)) {

	793 return FALSE;

	794 }

	795 }

	796 return TRUE;

	797 }

	798

	799 /**

	800 * Return true if this half looks like valid input, that is, does not

	801 * contain functions or other special output-only elements.

	802 */

	803 UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {

	804 for (int32_t i=0; i<text.length(); ) {

	805 UChar32 c = text.char32At(i);

	806 i += UTF_CHAR_LENGTH(c);

	807 if (!transParser.parseData->isMatcher(c)) {

	808 return FALSE;

	809 }

	810 }

	811 return TRUE;

	812 }

	813

	814 //----------------------------------------------------------------------

	815 // PUBLIC API

	816 //----------------------------------------------------------------------

	817

	818 /**

	819 * Constructor.

	820 */

	821 TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) :

	822 dataVector(statusReturn),

	823 idBlockVector(statusReturn),

	824 variablesVector(statusReturn),

	825 segmentObjects(statusReturn)

	826 {

	827 idBlockVector.setDeleter(uhash_deleteUnicodeString);

	828 curData = NULL;

	829 compoundFilter = NULL;

	830 parseData = NULL;

	831 variableNames.setValueDeleter(uhash_deleteUnicodeString);

	832 }

	833

	834 /**

	835 * Destructor.

	836 */

	837 TransliteratorParser::~TransliteratorParser() {

	838 while (!dataVector.isEmpty())

	839 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));

	840 delete compoundFilter;

	841 delete parseData;

	842 while (!variablesVector.isEmpty())

	843 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);

	844 }

	845

	846 void

	847 TransliteratorParser::parse(const UnicodeString& rules,

	848 UTransDirection transDirection,

	849 UParseError& pe,

	850 UErrorCode& ec) {

	851 if (U_SUCCESS(ec)) {

	852 parseRules(rules, transDirection, ec);

	853 pe = parseError;

	854 }

	855 }

	856

	857 /**

	858 * Return the compound filter parsed by parse(). Caller owns result.

	859 */

	860 UnicodeSet* TransliteratorParser::orphanCompoundFilter() {

	861 UnicodeSet* f = compoundFilter;

	862 compoundFilter = NULL;

	863 return f;

	864 }

	865

	866 //----------------------------------------------------------------------

	867 // Private implementation

	868 //----------------------------------------------------------------------

	869

	870 /**

	871 * Parse the given string as a sequence of rules, separated by newline

	872 * characters ('\n'), and cause this object to implement those rules. Any

	873 * previous rules are discarded. Typically this method is called exactly

	874 * once, during construction.

	875 * @exception IllegalArgumentException if there is a syntax error in the

	876 * rules

	877 */

	878 void TransliteratorParser::parseRules(const UnicodeString& rule,

	879 UTransDirection theDirection,

	880 UErrorCode& status)

	881 {

	882 // Clear error struct

	883 uprv_memset(&parseError, 0, sizeof(parseError));

	884 parseError.line = parseError.offset = -1;

	885

	886 UBool parsingIDs = TRUE;

	887 int32_t ruleCount = 0;

	888

	889 while (!dataVector.isEmpty()) {

	890 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));

	891 }

	892 if (U_FAILURE(status)) {

	893 return;

	894 }

	895

	896 idBlockVector.removeAllElements();

	897 curData = NULL;

	898 direction = theDirection;

	899 ruleCount = 0;

	900

	901 delete compoundFilter;

	902 compoundFilter = NULL;

	903

	904 while (!variablesVector.isEmpty()) {

	905 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);

	906 }

	907 variableNames.removeAll();

	908 parseData = new ParseData(0, &variablesVector, &variableNames);

	909 if (parseData == NULL) {

	910 status = U_MEMORY_ALLOCATION_ERROR;

	911 return;

	912 }

	913

	914 dotStandIn = (UChar) -1;

	915

	916 UnicodeString *tempstr = NULL; // used for memory allocation error checking

	917 UnicodeString str; // scratch

	918 UnicodeString idBlockResult;

	919 int32_t pos = 0;

	920 int32_t limit = rule.length();

	921

	922 // The compound filter offset is an index into idBlockResult.

	923 // If it is 0, then the compound filter occurred at the start,

	924 // and it is the offset to the _start_ of the compound filter

	925 // pattern. Otherwise it is the offset to the _limit_ of the

	926 // compound filter pattern within idBlockResult.

	927 compoundFilter = NULL;

	928 int32_t compoundFilterOffset = -1;

	929

	930 while (pos < limit && U_SUCCESS(status)) {

	931 UChar c = rule.charAt(pos++);

	932 if (uprv_isRuleWhiteSpace(c)) {

	933 // Ignore leading whitespace.

	934 continue;

	935 }

	936 // Skip lines starting with the comment character

	937 if (c == RULE_COMMENT_CHAR) {

	938 pos = rule.indexOf((UChar)0x000A /\n/, pos) + 1;

	939 if (pos == 0) {

	940 break; // No "\n" found; rest of rule is a commnet

	941 }

	942 continue; // Either fall out or restart with next line

	943 }

	944

	945 // skip empty rules

	946 if (c == END_OF_RULE)

	947 continue;

	948

	949 // keep track of how many rules we've seen

	950 ++ruleCount;

	951

	952 // We've found the start of a rule or ID. c is its first

	953 // character, and pos points past c.

	954 --pos;

	955 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1

	956 // chars left.

	957 if ((pos + ID_TOKEN_LEN + 1) <= limit &&

	958 rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {

	959 pos += ID_TOKEN_LEN;

	960 c = rule.charAt(pos);

	961 while (uprv_isRuleWhiteSpace(c) && pos < limit) {

	962 ++pos;

	963 c = rule.charAt(pos);

	964 }

	965

	966 int32_t p = pos;

	967

	968 if (!parsingIDs) {

	969 if (curData != NULL) {

	970 if (direction == UTRANS_FORWARD)

	971 dataVector.addElement(curData, status);

	972 else

	973 dataVector.insertElementAt(curData, 0, status);

	974 curData = NULL;

	975 }

	976 parsingIDs = TRUE;

	977 }

	978

	979 TransliteratorIDParser::SingleID* id =

	980 TransliteratorIDParser::parseSingleID(rule, p, direction, status );

	981 if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) {

	982 // Successful ::ID parse.

	983

	984 if (direction == UTRANS_FORWARD) {

	985 idBlockResult.append(id->canonID).append(END_OF_RULE);

	986 } else {

	987 idBlockResult.insert(0, END_OF_RULE);

	988 idBlockResult.insert(0, id->canonID);

	989 }

	990

	991 } else {

	992 // Couldn't parse an ID. Try to parse a global filter

	993 int32_t withParens = -1;

	994 UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL);

	995 if (f != NULL) {

	996 if (ICU_Utility::parseChar(rule, p, END_OF_RULE)

	997 && (direction == UTRANS_FORWARD) == (withParens == 0))

	998 {

	999 if (compoundFilter != NULL) {

	1000 // Multiple compound filters

	1001 syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status);

	1002 delete f;

	1003 } else {

	1004 compoundFilter = f;

	1005 compoundFilterOffset = ruleCount;

	1006 }

	1007 } else {

	1008 delete f;

	1009 }

	1010 } else {

	1011 // Invalid ::id

	1012 // Can be parsed as neither an ID nor a global filter

	1013 syntaxError(U_INVALID_ID, rule, pos, status);

	1014 }

	1015 }

	1016 delete id;

	1017 pos = p;

	1018 } else {

	1019 if (parsingIDs) {

	1020 tempstr = new UnicodeString(idBlockResult);

	1021 // NULL pointer check

	1022 if (tempstr == NULL) {

	1023 status = U_MEMORY_ALLOCATION_ERROR;

	1024 return;

	1025 }

	1026 if (direction == UTRANS_FORWARD)

	1027 idBlockVector.addElement(tempstr, status);

	1028 else

	1029 idBlockVector.insertElementAt(tempstr, 0, status);

	1030 idBlockResult.remove();

	1031 parsingIDs = FALSE;

	1032 curData = new TransliterationRuleData(status);

	1033 // NULL pointer check

	1034 if (curData == NULL) {

	1035 status = U_MEMORY_ALLOCATION_ERROR;

	1036 return;

	1037 }

	1038 parseData->data = curData;

	1039

	1040 // By default, rules use part of the private use area

	1041 // E000..F8FF for variables and other stand-ins. Currently

	1042 // the range F000..F8FF is typically sufficient. The 'use

	1043 // variable range' pragma allows rule sets to modify this.

	1044 setVariableRange(0xF000, 0xF8FF, status);

	1045 }

	1046

	1047 if (resemblesPragma(rule, pos, limit)) {

	1048 int32_t ppp = parsePragma(rule, pos, limit, status);

	1049 if (ppp < 0) {

	1050 syntaxError(U_MALFORMED_PRAGMA, rule, pos, status);

	1051 }

	1052 pos = ppp;

	1053 // Parse a rule

	1054 } else {

	1055 pos = parseRule(rule, pos, limit, status);

	1056 }

	1057 }

	1058 }

	1059

	1060 if (parsingIDs && idBlockResult.length() > 0) {

	1061 tempstr = new UnicodeString(idBlockResult);

	1062 // NULL pointer check

	1063 if (tempstr == NULL) {

	1064 status = U_MEMORY_ALLOCATION_ERROR;

	1065 return;

	1066 }

	1067 if (direction == UTRANS_FORWARD)

	1068 idBlockVector.addElement(tempstr, status);

	1069 else

	1070 idBlockVector.insertElementAt(tempstr, 0, status);

	1071 }

	1072 else if (!parsingIDs && curData != NULL) {

	1073 if (direction == UTRANS_FORWARD)

	1074 dataVector.addElement(curData, status);

	1075 else

	1076 dataVector.insertElementAt(curData, 0, status);

	1077 }

	1078

	1079 if (U_SUCCESS(status)) {

	1080 // Convert the set vector to an array

	1081 int32_t i, dataVectorSize = dataVector.size();

	1082 for (i = 0; i < dataVectorSize; i++) {

	1083 TransliterationRuleData* data = (TransliterationRuleData*)dataVector .elementAt(i);

	1084 data->variablesLength = variablesVector.size();

	1085 if (data->variablesLength == 0) {

	1086 data->variables = 0;

	1087 } else {

	1088 data->variables = (UnicodeFunctor*)uprv_malloc(data->variablesL ength sizeof(UnicodeFunctor*));

	1089 // NULL pointer check

	1090 if (data->variables == NULL) {

	1091 status = U_MEMORY_ALLOCATION_ERROR;

	1092 return;

	1093 }

	1094 data->variablesAreOwned = (i == 0);

	1095 }

	1096

	1097 for (int32_t j = 0; j < data->variablesLength; j++) {

	1098 data->variables[j] =

	1099 ((UnicodeSet*)variablesVector.elementAt(j));

	1100 }

	1101

	1102 data->variableNames.removeAll();

	1103 int32_t pos = -1;

	1104 const UHashElement* he = variableNames.nextElement(pos);

	1105 while (he != NULL) {

	1106 UnicodeString* tempus = (UnicodeString)(((UnicodeString)(he->v alue.pointer))->clone());

	1107 if (tempus == NULL) {

	1108 status = U_MEMORY_ALLOCATION_ERROR;

	1109 return;

	1110 }

	1111 data->variableNames.put(((UnicodeString)(he->key.pointer)),

	1112 tempus, status);

	1113 he = variableNames.nextElement(pos);

	1114 }

	1115 }

	1116 variablesVector.removeAllElements(); // keeps them from getting delete d when we succeed

	1117

	1118 // Index the rules

	1119 if (compoundFilter != NULL) {

	1120 if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) \|\|

	1121 (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCoun t)) {

	1122 status = U_MISPLACED_COMPOUND_FILTER;

	1123 }

	1124 }

	1125

	1126 for (i = 0; i < dataVectorSize; i++) {

	1127 TransliterationRuleData* data = (TransliterationRuleData*)dataVector .elementAt(i);

	1128 data->ruleSet.freeze(parseError, status);

	1129 }

	1130 if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementA t(0))->isEmpty()) {

	1131 idBlockVector.removeElementAt(0);

	1132 }

	1133 }

	1134 }

	1135

	1136 /**

	1137 * Set the variable range to [start, end] (inclusive).

	1138 */

	1139 void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCo de& status) {

	1140 if (start > end \|\| start < 0 \|\| end > 0xFFFF) {

	1141 status = U_MALFORMED_PRAGMA;

	1142 return;

	1143 }

	1144

	1145 curData->variablesBase = (UChar) start;

	1146 if (dataVector.size() == 0) {

	1147 variableNext = (UChar) start;

	1148 variableLimit = (UChar) (end + 1);

	1149 }

	1150 }

	1151

	1152 /**

	1153 * Assert that the given character is NOT within the variable range.

	1154 * If it is, return FALSE. This is neccesary to ensure that the

	1155 * variable range does not overlap characters used in a rule.

	1156 */

	1157 UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {

	1158 return !(ch >= curData->variablesBase && ch < variableLimit);

	1159 }

	1160

	1161 /**

	1162 * Set the maximum backup to 'backup', in response to a pragma

	1163 * statement.

	1164 */

	1165 void TransliteratorParser::pragmaMaximumBackup(int32_t /backup/) {

	1166 //TODO Finish

	1167 }

	1168

	1169 /**

	1170 * Begin normalizing all rules using the given mode, in response

	1171 * to a pragma statement.

	1172 */

	1173 void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /mode/) {

	1174 //TODO Finish

	1175 }

	1176

	1177 static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use "

	1178

	1179 static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62 ,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~ variable range # #~;"

	1180

	1181 static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75 ,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum ba ckup #~;"

	1182

	1183 static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C ,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;"

	1184

	1185 static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C ,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;"

	1186

	1187 /**

	1188 * Return true if the given rule looks like a pragma.

	1189 * @param pos offset to the first non-whitespace character

	1190 * of the rule.

	1191 * @param limit pointer past the last character of the rule.

	1192 */

	1193 UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t p os, int32_t limit) {

	1194 // Must start with /use\s/i

	1195 return ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_USE, NULL) >= 0;

	1196 }

	1197

	1198 /**

	1199 * Parse a pragma. This method assumes resemblesPragma() has

	1200 * already returned true.

	1201 * @param pos offset to the first non-whitespace character

	1202 * of the rule.

	1203 * @param limit pointer past the last character of the rule.

	1204 * @return the position index after the final ';' of the pragma,

	1205 * or -1 on failure.

	1206 */

	1207 int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos , int32_t limit, UErrorCode& status) {

	1208 int32_t array[2];

	1209

	1210 // resemblesPragma() has already returned true, so we

	1211 // know that pos points to /use\s/i; we can skip 4 characters

	1212 // immediately

	1213 pos += 4;

	1214

	1215 // Here are the pragmas we recognize:

	1216 // use variable range 0xE000 0xEFFF;

	1217 // use maximum backup 16;

	1218 // use nfd rules;

	1219 // use nfc rules;

	1220 int p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_VARIABLE_RANGE, a rray);

	1221 if (p >= 0) {

	1222 setVariableRange(array[0], array[1], status);

	1223 return p;

	1224 }

	1225

	1226 p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_MAXIMUM_BACKUP, array );

	1227 if (p >= 0) {

	1228 pragmaMaximumBackup(array[0]);

	1229 return p;

	1230 }

	1231

	1232 p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFD_RULES, NULL);

	1233 if (p >= 0) {

	1234 pragmaNormalizeRules(UNORM_NFD);

	1235 return p;

	1236 }

	1237

	1238 p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFC_RULES, NULL);

	1239 if (p >= 0) {

	1240 pragmaNormalizeRules(UNORM_NFC);

	1241 return p;

	1242 }

	1243

	1244 // Syntax error: unable to parse pragma

	1245 return -1;

	1246 }

	1247

	1248 /**

	1249 * MAIN PARSER. Parse the next rule in the given rule string, starting

	1250 * at pos. Return the index after the last character parsed. Do not

	1251 * parse characters at or after limit.

	1252 *

	1253 * Important: The character at pos must be a non-whitespace character

	1254 * that is not the comment character.

	1255 *

	1256 * This method handles quoting, escaping, and whitespace removal. It

	1257 * parses the end-of-rule character. It recognizes context and cursor

	1258 * indicators. Once it does a lexical breakdown of the rule at pos, it

	1259 * creates a rule object and adds it to our rule list.

	1260 */

	1261 int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {

	1262 // Locate the left side, operator, and right side

	1263 int32_t start = pos;

	1264 UChar op = 0;

	1265 int32_t i;

	1266

	1267 // Set up segments data

	1268 segmentStandins.truncate(0);

	1269 segmentObjects.removeAllElements();

	1270

	1271 // Use pointers to automatics to make swapping possible.

	1272 RuleHalf _left(this), _right(this);

	1273 RuleHalf* left = &_left;

	1274 RuleHalf* right = &_right;

	1275

	1276 undefinedVariableName.remove();

	1277 pos = left->parse(rule, pos, limit, status);

	1278 if (U_FAILURE(status)) {

	1279 return start;

	1280 }

	1281

	1282 if (pos == limit \|\| u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {

	1283 return syntaxError(U_MISSING_OPERATOR, rule, start, status);

	1284 }

	1285 ++pos;

	1286

	1287 // Found an operator char. Check for forward-reverse operator.

	1288 if (op == REVERSE_RULE_OP &&

	1289 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {

	1290 ++pos;

	1291 op = FWDREV_RULE_OP;

	1292 }

	1293

	1294 // Translate alternate op characters.

	1295 switch (op) {

	1296 case ALT_FORWARD_RULE_OP:

	1297 op = FORWARD_RULE_OP;

	1298 break;

	1299 case ALT_REVERSE_RULE_OP:

	1300 op = REVERSE_RULE_OP;

	1301 break;

	1302 case ALT_FWDREV_RULE_OP:

	1303 op = FWDREV_RULE_OP;

	1304 break;

	1305 }

	1306

	1307 pos = right->parse(rule, pos, limit, status);

	1308 if (U_FAILURE(status)) {

	1309 return start;

	1310 }

	1311

	1312 if (pos < limit) {

	1313 if (rule.charAt(--pos) == END_OF_RULE) {

	1314 ++pos;

	1315 } else {

	1316 // RuleHalf parser must have terminated at an operator

	1317 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);

	1318 }

	1319 }

	1320

	1321 if (op == VARIABLE_DEF_OP) {

	1322 // LHS is the name. RHS is a single character, either a literal

	1323 // or a set (already parsed). If RHS is longer than one

	1324 // character, it is either a multi-character string, or multiple

	1325 // sets, or a mixture of chars and sets -- syntax error.

	1326

	1327 // We expect to see a single undefined variable (the one being

	1328 // defined).

	1329 if (undefinedVariableName.length() == 0) {

	1330 // "Missing '$' or duplicate definition"

	1331 return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status);

	1332 }

	1333 if (left->text.length() != 1 \|\| left->text.charAt(0) != variableLimit) {

	1334 // "Malformed LHS"

	1335 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, sta tus);

	1336 }

	1337 if (left->anchorStart \|\| left->anchorEnd \|\|

	1338 right->anchorStart \|\| right->anchorEnd) {

	1339 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, sta tus);

	1340 }

	1341 // We allow anything on the right, including an empty string.

	1342 UnicodeString* value = new UnicodeString(right->text);

	1343 // NULL pointer check

	1344 if (value == NULL) {

	1345 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);

	1346 }

	1347 variableNames.put(undefinedVariableName, value, status);

	1348 ++variableLimit;

	1349 return pos;

	1350 }

	1351

	1352 // If this is not a variable definition rule, we shouldn't have

	1353 // any undefined variable names.

	1354 if (undefinedVariableName.length() != 0) {

	1355 return syntaxError(// "Undefined variable $" + undefinedVariableName,

	1356 U_UNDEFINED_VARIABLE,

	1357 rule, start, status);

	1358 }

	1359

	1360 // Verify segments

	1361 if (segmentStandins.length() > segmentObjects.size()) {

	1362 syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status);

	1363 }

	1364 for (i=0; i<segmentStandins.length(); ++i) {

	1365 if (segmentStandins.charAt(i) == 0) {

	1366 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); / / will never happen

	1367 }

	1368 }

	1369 for (i=0; i<segmentObjects.size(); ++i) {

	1370 if (segmentObjects.elementAt(i) == NULL) {

	1371 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); / / will never happen

	1372 }

	1373 }

	1374

	1375 // If the direction we want doesn't match the rule

	1376 // direction, do nothing.

	1377 if (op != FWDREV_RULE_OP &&

	1378 ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) {

	1379 return pos;

	1380 }

	1381

	1382 // Transform the rule into a forward rule by swapping the

	1383 // sides if necessary.

	1384 if (direction == UTRANS_REVERSE) {

	1385 left = &_right;

	1386 right = &_left;

	1387 }

	1388

	1389 // Remove non-applicable elements in forward-reverse

	1390 // rules. Bidirectional rules ignore elements that do not

	1391 // apply.

	1392 if (op == FWDREV_RULE_OP) {

	1393 right->removeContext();

	1394 left->cursor = -1;

	1395 left->cursorOffset = 0;

	1396 }

	1397

	1398 // Normalize context

	1399 if (left->ante < 0) {

	1400 left->ante = 0;

	1401 }

	1402 if (left->post < 0) {

	1403 left->post = left->text.length();

	1404 }

	1405

	1406 // Context is only allowed on the input side. Cursors are only

	1407 // allowed on the output side. Segment delimiters can only appear

	1408 // on the left, and references on the right. Cursor offset

	1409 // cannot appear without an explicit cursor. Cursor offset

	1410 // cannot place the cursor outside the limits of the context.

	1411 // Anchors are only allowed on the input side.

	1412 if (right->ante >= 0 \|\| right->post >= 0 \|\| left->cursor >= 0 \|\|

	1413 (right->cursorOffset != 0 && right->cursor < 0) \|\|

	1414 // - The following two checks were used to ensure that the

	1415 // - the cursor offset stayed within the ante- or postcontext.

	1416 // - However, with the addition of quantifiers, we have to

	1417 // - allow arbitrary cursor offsets and do runtime checking.

	1418 //(right->cursorOffset > (left->text.length() - left->post)) \|\|

	1419 //(-right->cursorOffset > left->ante) \|\|

	1420 right->anchorStart \|\| right->anchorEnd \|\|

	1421 !left->isValidInput(this) \|\| !right->isValidOutput(this) \|\|

	1422 left->ante > left->post) {

	1423

	1424 return syntaxError(U_MALFORMED_RULE, rule, start, status);

	1425 }

	1426

	1427 // Flatten segment objects vector to an array

	1428 UnicodeFunctor** segmentsArray = NULL;

	1429 if (segmentObjects.size() > 0) {

	1430 segmentsArray = (UnicodeFunctor *)uprv_malloc(segmentObjects.size() s izeof(UnicodeFunctor *));

	1431 // Null pointer check

	1432 if (segmentsArray == NULL) {

	1433 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);

	1434 }

	1435 segmentObjects.toArray((void**) segmentsArray);

	1436 }

	1437 TransliterationRule* temptr = new TransliterationRule(

	1438 left->text, left->ante, left->post,

	1439 right->text, right->cursor, right->cursorOffset,

	1440 segmentsArray,

	1441 segmentObjects.size(),

	1442 left->anchorStart, left->anchorEnd,

	1443 curData,

	1444 status);

	1445 //Null pointer check

	1446 if (temptr == NULL) {

	1447 uprv_free(segmentsArray);

	1448 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);

	1449 }

	1450

	1451 curData->ruleSet.addRule(temptr, status);

	1452

	1453 return pos;

	1454 }

	1455

	1456 /**

	1457 * Called by main parser upon syntax error. Search the rule string

	1458 * for the probable end of the rule. Of course, if the error is that

	1459 * the end of rule marker is missing, then the rule end will not be found.

	1460 * In any case the rule start will be correctly reported.

	1461 * @param msg error description

	1462 * @param rule pattern string

	1463 * @param start position of first character of current rule

	1464 */

	1465 int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,

	1466 const UnicodeString& rule,

	1467 int32_t pos,

	1468 UErrorCode& status)

	1469 {

	1470 parseError.offset = pos;

	1471 parseError.line = 0 ; /* we are not using line numbers */

	1472

	1473 // for pre-context

	1474 const int32_t LEN = U_PARSE_CONTEXT_LEN - 1;

	1475 int32_t start = uprv_max(pos - LEN, 0);

	1476 int32_t stop = pos;

	1477

	1478 rule.extract(start,stop-start,parseError.preContext);

	1479 //null terminate the buffer

	1480 parseError.preContext[stop-start] = 0;

	1481

	1482 //for post-context

	1483 start = pos;

	1484 stop = uprv_min(pos + LEN, rule.length());

	1485

	1486 rule.extract(start,stop-start,parseError.postContext);

	1487 //null terminate the buffer

	1488 parseError.postContext[stop-start]= 0;

	1489

	1490 status = (UErrorCode)parseErrorCode;

	1491 return pos;

	1492

	1493 }

	1494

	1495 /**

	1496 * Parse a UnicodeSet out, store it, and return the stand-in character

	1497 * used to represent it.

	1498 */

	1499 UChar TransliteratorParser::parseSet(const UnicodeString& rule,

	1500 ParsePosition& pos,

	1501 UErrorCode& status) {

	1502 UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, st atus);

	1503 // Null pointer check

	1504 if (set == NULL) {

	1505 status = U_MEMORY_ALLOCATION_ERROR;

	1506 return (UChar)0x0000; // Return empty character with error.

	1507 }

	1508 set->compact();

	1509 return generateStandInFor(set, status);

	1510 }

	1511

	1512 /**

	1513 * Generate and return a stand-in for a new UnicodeFunctor. Store

	1514 * the matcher (adopt it).

	1515 */

	1516 UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCo de& status) {

	1517 // assert(obj != null);

	1518

	1519 // Look up previous stand-in, if any. This is a short list

	1520 // (typical n is 0, 1, or 2); linear search is optimal.

	1521 for (int32_t i=0; i<variablesVector.size(); ++i) {

	1522 if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparis on

	1523 return (UChar) (curData->variablesBase + i);

	1524 }

	1525 }

	1526

	1527 if (variableNext >= variableLimit) {

	1528 delete adopted;

	1529 status = U_VARIABLE_RANGE_EXHAUSTED;

	1530 return 0;

	1531 }

	1532 variablesVector.addElement(adopted, status);

	1533 return variableNext++;

	1534 }

	1535

	1536 /**

	1537 * Return the standin for segment seg (1-based).

	1538 */

	1539 UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) {

	1540 // Special character used to indicate an empty spot

	1541 UChar empty = curData->variablesBase - 1;

	1542 while (segmentStandins.length() < seg) {

	1543 segmentStandins.append(empty);

	1544 }

	1545 UChar c = segmentStandins.charAt(seg-1);

	1546 if (c == empty) {

	1547 if (variableNext >= variableLimit) {

	1548 status = U_VARIABLE_RANGE_EXHAUSTED;

	1549 return 0;

	1550 }

	1551 c = variableNext++;

	1552 // Set a placeholder in the master variables vector that will be

	1553 // filled in later by setSegmentObject(). We know that we will get

	1554 // called first because setSegmentObject() will call us.

	1555 variablesVector.addElement((void*) NULL, status);

	1556 segmentStandins.setCharAt(seg-1, c);

	1557 }

	1558 return c;

	1559 }

	1560

	1561 /**

	1562 * Set the object for segment seg (1-based).

	1563 */

	1564 void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) {

	1565 // Since we call parseSection() recursively, nested

	1566 // segments will result in segment i+1 getting parsed

	1567 // and stored before segment i; be careful with the

	1568 // vector handling here.

	1569 if (segmentObjects.size() < seg) {

	1570 segmentObjects.setSize(seg, status);

	1571 }

	1572 int32_t index = getSegmentStandin(seg, status) - curData->variablesBase;

	1573 if (segmentObjects.elementAt(seg-1) != NULL \|\|

	1574 variablesVector.elementAt(index) != NULL) {

	1575 // should never happen

	1576 status = U_INTERNAL_TRANSLITERATOR_ERROR;

	1577 return;

	1578 }

	1579 segmentObjects.setElementAt(adopted, seg-1);

	1580 variablesVector.setElementAt(adopted, index);

	1581 }

	1582

	1583 /**

	1584 * Return the stand-in for the dot set. It is allocated the first

	1585 * time and reused thereafter.

	1586 */

	1587 UChar TransliteratorParser::getDotStandIn(UErrorCode& status) {

	1588 if (dotStandIn == (UChar) -1) {

	1589 UnicodeSet* tempus = new UnicodeSet(DOT_SET, status);

	1590 // Null pointer check.

	1591 if (tempus == NULL) {

	1592 status = U_MEMORY_ALLOCATION_ERROR;

	1593 return (UChar)0x0000;

	1594 }

	1595 dotStandIn = generateStandInFor(tempus, status);

	1596 }

	1597 return dotStandIn;

	1598 }

	1599

	1600 /**

	1601 * Append the value of the given variable name to the given

	1602 * UnicodeString.

	1603 */

	1604 void TransliteratorParser::appendVariableDef(const UnicodeString& name,

	1605 UnicodeString& buf,

	1606 UErrorCode& status) {

	1607 const UnicodeString* s = (const UnicodeString*) variableNames.get(name);

	1608 if (s == NULL) {

	1609 // We allow one undefined variable so that variable definition

	1610 // statements work. For the first undefined variable we return

	1611 // the special placeholder variableLimit-1, and save the variable

	1612 // name.

	1613 if (undefinedVariableName.length() == 0) {

	1614 undefinedVariableName = name;

	1615 if (variableNext >= variableLimit) {

	1616 // throw new RuntimeException("Private use variables exhausted") ;

	1617 status = U_ILLEGAL_ARGUMENT_ERROR;

	1618 return;

	1619 }

	1620 buf.append((UChar) --variableLimit);

	1621 } else {

	1622 //throw new IllegalArgumentException("Undefined variable $"

	1623 // + name);

	1624 status = U_ILLEGAL_ARGUMENT_ERROR;

	1625 return;

	1626 }

	1627 } else {

	1628 buf.append(*s);

	1629 }

	1630 }

	1631

	1632 /**

	1633 * Glue method to get around access restrictions in C++.

	1634 */

	/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
	    return Transliterator::createBasicInstance(id, canonID);
	}*/

	1638

	1639 U_NAMESPACE_END

	1640

	1641 U_CAPI int32_t

	1642 utrans_stripRules(const UChar source, int32_t sourceLen, UChar target, UErrorC ode *status) {

	1643 U_NAMESPACE_USE

	1644

	1645 //const UChar *sourceStart = source;

	1646 const UChar *targetStart = target;

	1647 const UChar *sourceLimit = source+sourceLen;

	1648 UChar *targetLimit = target+sourceLen;

	1649 UChar32 c = 0;

	1650 UBool quoted = FALSE;

	1651 int32_t index;

	1652

	1653 uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR);

	1654

	1655 /* read the rules into the buffer */

	1656 while (source < sourceLimit)

	1657 {

	1658 index=0;

	1659 U16_NEXT_UNSAFE(source, index, c);

	1660 source+=index;

	1661 if(c == QUOTE) {

	1662 quoted = (UBool)!quoted;

	1663 }

	1664 else if (!quoted) {

	1665 if (c == RULE_COMMENT_CHAR) {

	1666 /* skip comments and all preceding spaces */

	1667 while (targetStart < target && *(target - 1) == 0x0020) {

	1668 target--;

	1669 }

	1670 do {

	1671 c = *(source++);

	1672 }

	1673 while (c != CR && c != LF);

	1674 }

	1675 else if (c == ESCAPE) {

	1676 UChar32 c2 = *source;

	1677 if (c2 == CR \|\| c2 == LF) {

	1678 /* A backslash at the end of a line. */

	1679 /* Since we're stripping lines, ignore the backslash. */

	1680 source++;

	1681 continue;

	1682 }

	1683 if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn 't unescaped. */

	1684 int32_t escapeOffset = 0;

	1685 UnicodeString escapedStr(source, 5);

	1686 c2 = escapedStr.unescapeAt(escapeOffset);

	1687

	1688 if (c2 == (UChar32)0xFFFFFFFF \|\| escapeOffset == 0)

	1689 {

	1690 *status = U_PARSE_ERROR;

	1691 return 0;

	1692 }

	1693 if (!uprv_isRuleWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispun ct(c2)) {

	1694 /* It was escaped for a reason. Write what it was suppos e to be. */

	1695 source+=5;

	1696 c = c2;

	1697 }

	1698 }

	1699 else if (c2 == QUOTE) {

	1700 /* \' seen. Make sure we don't do anything when we see it ag ain. */

	1701 quoted = (UBool)!quoted;

	1702 }

	1703 }

	1704 }

	1705 if (c == CR \|\| c == LF)

	1706 {

	1707 /* ignore spaces carriage returns, and all leading spaces on the nex t line.

	1708 * and line feed unless in the form \uXXXX

	1709 */

	1710 quoted = FALSE;

	1711 while (source < sourceLimit) {

	1712 c = *(source);

	1713 if (c != CR && c != LF && c != 0x0020) {

	1714 break;

	1715 }

	1716 source++;

	1717 }

	1718 continue;

	1719 }

	1720

	1721 /* Append UChar * after dissembling if c > 0xffff*/

	1722 index=0;

	1723 U16_APPEND_UNSAFE(target, index, c);

	1724 target+=index;

	1725 }

	1726 if (target < targetLimit) {

	1727 *target = 0;

	1728 }

	1729 return (int32_t)(target-targetStart);

	1730 }

	1731

	1732 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

OLD	NEW

« no previous file with comments | « icu46/source/i18n/rbt_pars.h ('k') | icu46/source/i18n/rbt_rule.h » ('j') | no next file with comments »