icu46/source/common/rbbiscan.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/rbbiscan.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1

	2 //

	3 // file: rbbiscan.cpp

	4 //

	5 // Copyright (C) 2002-2010, International Business Machines Corporation and others.

	6 // All Rights Reserved.

	7 //

	8 // This file contains the Rule Based Break Iterator Rule Builder functions for

	9 // scanning the rules and assembling a parse tree. This is the first phase

	10 // of compiling the rules.

	11 //

	12 // The overall processing of the rules is managed by class RBBIRuleBuilder, which will

	13 // create and use an instance of this class as part of the process.

	14 //

	15

	16 #include "unicode/utypes.h"

	17

	18 #if !UCONFIG_NO_BREAK_ITERATION

	19

	20 #include "unicode/unistr.h"

	21 #include "unicode/uniset.h"

	22 #include "unicode/uchar.h"

	23 #include "unicode/uchriter.h"

	24 #include "unicode/parsepos.h"

	25 #include "unicode/parseerr.h"

	26 #include "util.h"

	27 #include "cmemory.h"

	28 #include "cstring.h"

	29

	30 #include "rbbirpt.h" // Contains state table for the rbbi rules parser.

	31 // generated by a Perl script.

	32 #include "rbbirb.h"

	33 #include "rbbinode.h"

	34 #include "rbbiscan.h"

	35 #include "rbbitblb.h"

	36

	37 #include "uassert.h"

	38

	39

	40 //------------------------------------------------------------------------------

	41 //

	42 // Unicode Set init strings for each of the character classes needed for parsing a rule file.

	43 // (Initialized with hex values for portability to EBCDIC based machines.

	44 // Really ugly, but there's no good way to avoid it.)

	45 //

	46 // The sets are referred to by name in the rbbirpt.txt, which is the

	47 // source form of the state transition table for the RBBI rule parser.

	48 //

	49 //------------------------------------------------------------------------------

	// UnicodeSet pattern "[^[\p{Z}\u0020-\u007f]-[\p{L}]-[\p{N}]]", spelled as
	// UTF-16 code units and NUL terminated. The RBBIRuleScanner constructor
	// builds fRuleSets[kRuleSet_rule_char-128] from it.
	50 static const UChar gRuleSet_rule_char_pattern[] = {

	51 // [ ^ [ \ p { Z } \ u 0 0 2 0

	52 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32 , 0x30,

	53 // - \ u 0 0 7 f ] - [ \ p

	54 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,

	55 // { L } ] - [ \ p { N } ] ]

	56 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d , 0};

	57

	// UnicodeSet pattern "[_\p{L}\p{N}]"; the constructor builds
	// fRuleSets[kRuleSet_name_char-128] from it.
	58 static const UChar gRuleSet_name_char_pattern[] = {

	59 // [ _ \ p { L } \ p { N } ]

	60 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d , 0};

	61

	// UnicodeSet pattern "[0-9]"; the constructor builds
	// fRuleSets[kRuleSet_digit_char-128] from it.
	62 static const UChar gRuleSet_digit_char_pattern[] = {

	63 // [ 0 - 9 ]

	64 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};

	65

	// UnicodeSet pattern "[_\p{L}]"; the constructor builds
	// fRuleSets[kRuleSet_name_start_char-128] from it.
	66 static const UChar gRuleSet_name_start_char_pattern[] = {

	67 // [ _ \ p { L } ]

	68 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };

	69

	// Key string that findSetFor() compares against to select the set of all
	// code points (0x000000 - 0x10ffff); passed for a '.' in a rule (doDotAny).
	70 static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"

	71

	72

	73 U_CDECL_BEGIN

	74 static void U_CALLCONV RBBISetTable_deleter(void *p) {

	75 U_NAMESPACE_QUALIFIER RBBISetTableEl px = (U_NAMESPACE_QUALIFIER RBBISetTab leEl )p;

	76 delete px->key;

	77 // Note: px->val is owned by the linked list "fSetsListHead" in scanner.

	78 // Don't delete the value nodes here.

	79 uprv_free(px);

	80 }

	81 U_CDECL_END

	82

	83 U_NAMESPACE_BEGIN

	84

	85 //------------------------------------------------------------------------------

	86 //

	87 // Constructor.

	88 //

	89 //------------------------------------------------------------------------------

	90 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)

	91 {

	92 fRB = rb;

	93 fStackPtr = 0;

	94 fStack[fStackPtr] = 0;

	95 fNodeStackPtr = 0;

	96 fRuleNum = 0;

	97 fNodeStack[0] = NULL;

	98

	99 fSymbolTable = NULL;

	100 fSetTable = NULL;

	101

	102 fScanIndex = 0;

	103 fNextIndex = 0;

	104

	105 fReverseRule = FALSE;

	106 fLookAheadRule = FALSE;

	107

	108 fLineNum = 1;

	109 fCharNum = 0;

	110 fQuoteMode = FALSE;

	111

	112 // Do not check status until after all critical fields are sufficiently init ialized

	113 // that the destructor can run cleanly.

	114 if (U_FAILURE(*rb->fStatus)) {

	115 return;

	116 }

	117

	118 //

	119 // Set up the constant Unicode Sets.

	120 // Note: These could be made static, lazily initialized, and shared amo ng

	121 // all instances of RBBIRuleScanners. BUT this is quite a bit si mpler,

	122 // and the time to build these few sets should be small compared to a

	123 // full break iterator build.

	124 fRuleSets[kRuleSet_rule_char-128] = UnicodeSet(gRuleSet_rule_char_patt ern, *rb->fStatus);

	125 UnicodeSet *whitespaceSet = uprv_openRuleWhiteSpaceSet(rb->fStatus);

	126 if (U_FAILURE(*rb->fStatus)) {

	127 return;

	128 }

	129 fRuleSets[kRuleSet_white_space-128] = *whitespaceSet;

	130 delete whitespaceSet;

	131 fRuleSets[kRuleSet_name_char-128] = UnicodeSet(gRuleSet_name_char_patt ern, *rb->fStatus);

	132 fRuleSets[kRuleSet_name_start_char-128] = UnicodeSet(gRuleSet_name_start_cha r_pattern, *rb->fStatus);

	133 fRuleSets[kRuleSet_digit_char-128] = UnicodeSet(gRuleSet_digit_char_pat tern, *rb->fStatus);

	134 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {

	135 // This case happens if ICU's data is missing. UnicodeSet tries to look up property

	136 // names from the init string, can't find them, and claims an illegal arguement.

	137 // Change the error so that the actual problem will be clearer to user s.

	138 *rb->fStatus = U_BRK_INIT_ERROR;

	139 }

	140 if (U_FAILURE(*rb->fStatus)) {

	141 return;

	142 }

	143

	144 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);

	145 if (fSymbolTable == NULL) {

	146 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;

	147 return;

	148 }

	149 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeStrin g, NULL, rb->fStatus);

	150 if (U_FAILURE(*rb->fStatus)) {

	151 return;

	152 }

	153 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);

	154 }

	155

	156

	157

	158 //------------------------------------------------------------------------------

	159 //

	160 // Destructor

	161 //

	162 //------------------------------------------------------------------------------

	163 RBBIRuleScanner::~RBBIRuleScanner() {

	164 delete fSymbolTable;

	165 if (fSetTable != NULL) {

	166 uhash_close(fSetTable);

	167 fSetTable = NULL;

	168

	169 }

	170

	171

	172 // Node Stack.

	173 // Normally has one entry, which is the entire parse tree for the rules.

	174 // If errors occured, there may be additional subtrees left on the stack.

	175 while (fNodeStackPtr > 0) {

	176 delete fNodeStack[fNodeStackPtr];

	177 fNodeStackPtr--;

	178 }

	179

	180 }

	181

	182 //------------------------------------------------------------------------------

	183 //

	184 // doParseAction Do some action during rule parsing.

	185 // Called by the parse state machine.

	186 // Actions build the parse tree and Unicode Sets,

	187 // and maintain the parse stack for nested expressions.

	188 //

	189 // TODO: unify EParseAction and RBBI_RuleParseAction enum types.

	190 // They represent exactly the same thing. They're separate

	191 // only to work around enum forward declaration restrictions

	192 // in some compilers, while at the same time avoiding multiple

	193 // definitions problems. I'm sure that there's a better way.

	194 //

	195 //------------------------------------------------------------------------------

	196 UBool RBBIRuleScanner::doParseActions(int32_t action)

	197 {

	198 RBBINode *n = NULL;

	199

	200 UBool returnVal = TRUE;

	201

	202 switch (action) {

	203

	204 case doExprStart:

	205 pushNewNode(RBBINode::opStart);

	206 fRuleNum++;

	207 break;

	208

	209

	210 case doExprOrOperator:

	211 {

	212 fixOpStack(RBBINode::precOpCat);

	213 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

	214 RBBINode *orNode = pushNewNode(RBBINode::opOr);

	215 orNode->fLeftChild = operandNode;

	216 operandNode->fParent = orNode;

	217 }

	218 break;

	219

	220 case doExprCatOperator:

	221 // concatenation operator.

	222 // For the implicit concatenation of adjacent terms in an expression tha t are

	223 // not separated by any other operator. Action is invoked between the

	224 // actions for the two terms.

	225 {

	226 fixOpStack(RBBINode::precOpCat);

	227 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

	228 RBBINode *catNode = pushNewNode(RBBINode::opCat);

	229 catNode->fLeftChild = operandNode;

	230 operandNode->fParent = catNode;

	231 }

	232 break;

	233

	234 case doLParen:

	235 // Open Paren.

	236 // The openParen node is a dummy operation type with a low precedence,

	237 // which has the affect of ensuring that any real binary op that

	238 // follows within the parens binds more tightly to the operands than

	239 // stuff outside of the parens.

	240 pushNewNode(RBBINode::opLParen);

	241 break;

	242

	243 case doExprRParen:

	244 fixOpStack(RBBINode::precLParen);

	245 break;

	246

	247 case doNOP:

	248 break;

	249

	250 case doStartAssign:

	251 // We've just scanned "$variable = "

	252 // The top of the node stack has the $variable ref node.

	253

	254 // Save the start position of the RHS text in the StartExpression node

	255 // that precedes the $variableReference node on the stack.

	256 // This will eventually be used when saving the full $variable replace ment

	257 // text as a string.

	258 n = fNodeStack[fNodeStackPtr-1];

	259 n->fFirstPos = fNextIndex; // move past the '='

	260

	261 // Push a new start-of-expression node; needed to keep parse of the

	262 // RHS expression happy.

	263 pushNewNode(RBBINode::opStart);

	264 break;

	265

	266

	267

	268

	269 case doEndAssign:

	270 {

	271 // We have reached the end of an assignement statement.

	272 // Current scan char is the ';' that terminates the assignment.

	273

	274 // Terminate expression, leaves expression parse tree rooted in TOS node.

	275 fixOpStack(RBBINode::precStart);

	276

	277 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];

	278 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];

	279 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];

	280

	281 // Save original text of right side of assignment, excluding the ter minating ';'

	282 // in the root of the node for the right-hand-side expression.

	283 RHSExprNode->fFirstPos = startExprNode->fFirstPos;

	284 RHSExprNode->fLastPos = fScanIndex;

	285 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLas tPos, RHSExprNode->fText);

	286

	287 // Expression parse tree becomes l. child of the $variable reference node.

	288 varRefNode->fLeftChild = RHSExprNode;

	289 RHSExprNode->fParent = varRefNode;

	290

	291 // Make a symbol table entry for the $variableRef node.

	292 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus) ;

	293 if (U_FAILURE(*fRB->fStatus)) {

	294 // This is a round-about way to get the parse position set

	295 // so that duplicate symbols error messages include a line numb er.

	296 UErrorCode t = *fRB->fStatus;

	297 *fRB->fStatus = U_ZERO_ERROR;

	298 error(t);

	299 }

	300

	301 // Clean up the stack.

	302 delete startExprNode;

	303 fNodeStackPtr-=3;

	304 break;

	305 }

	306

	307 case doEndOfRule:

	308 {

	309 fixOpStack(RBBINode::precStart); // Terminate expression, leaves ex pression

	310 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node .

	311 break;

	312 }

	313 #ifdef RBBI_DEBUG

	314 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeSt ack("end of rule");}

	315 #endif

	316 U_ASSERT(fNodeStackPtr == 1);

	317

	318 // If this rule includes a look-ahead '/', add a endMark node to the

	319 // expression tree.

	320 if (fLookAheadRule) {

	321 RBBINode *thisRule = fNodeStack[fNodeStackPtr];

	322 RBBINode *endNode = pushNewNode(RBBINode::endMark);

	323 RBBINode *catNode = pushNewNode(RBBINode::opCat);

	324 fNodeStackPtr -= 2;

	325 catNode->fLeftChild = thisRule;

	326 catNode->fRightChild = endNode;

	327 fNodeStack[fNodeStackPtr] = catNode;

	328 endNode->fVal = fRuleNum;

	329 endNode->fLookAheadEnd = TRUE;

	330 }

	331

	332 // All rule expressions are ORed together.

	333 // The ';' that terminates an expression really just functions as a '\|' with

	334 // a low operator prededence.

	335 //

	336 // Each of the four sets of rules are collected separately.

	337 // (forward, reverse, safe_forward, safe_reverse)

	338 // OR this rule into the appropriate group of them.

	339 //

	340 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefault Tree);

	341

	342 if (*destRules != NULL) {

	343 // This is not the first rule encounted.

	344 // OR previous stuff (from *destRules)

	345 // with the current rule expression (on the Node Stack)

	346 // with the resulting OR expression going to *destRules

	347 //

	348 RBBINode *thisRule = fNodeStack[fNodeStackPtr];

	349 RBBINode prevRules = destRules;

	350 RBBINode *orNode = pushNewNode(RBBINode::opOr);

	351 orNode->fLeftChild = prevRules;

	352 prevRules->fParent = orNode;

	353 orNode->fRightChild = thisRule;

	354 thisRule->fParent = orNode;

	355 *destRules = orNode;

	356 }

	357 else

	358 {

	359 // This is the first rule encountered (for this direction).

	360 // Just move its parse tree from the stack to *destRules.

	361 *destRules = fNodeStack[fNodeStackPtr];

	362 }

	363 fReverseRule = FALSE; // in preparation for the next rule.

	364 fLookAheadRule = FALSE;

	365 fNodeStackPtr = 0;

	366 }

	367 break;

	368

	369

	370 case doRuleError:

	371 error(U_BRK_RULE_SYNTAX);

	372 returnVal = FALSE;

	373 break;

	374

	375

	376 case doVariableNameExpectedErr:

	377 error(U_BRK_RULE_SYNTAX);

	378 break;

	379

	380

	381 //

	382 // Unary operands + ? *

	383 // These all appear after the operand to which they apply.

	384 // When we hit one, the operand (may be a whole sub expression)

	385 // will be on the top of the stack.

	386 // Unary Operator becomes TOS, with the old TOS as its one child.

	387 case doUnaryOpPlus:

	388 {

	389 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

	390 RBBINode *plusNode = pushNewNode(RBBINode::opPlus);

	391 plusNode->fLeftChild = operandNode;

	392 operandNode->fParent = plusNode;

	393 }

	394 break;

	395

	396 case doUnaryOpQuestion:

	397 {

	398 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

	399 RBBINode *qNode = pushNewNode(RBBINode::opQuestion);

	400 qNode->fLeftChild = operandNode;

	401 operandNode->fParent = qNode;

	402 }

	403 break;

	404

	405 case doUnaryOpStar:

	406 {

	407 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

	408 RBBINode *starNode = pushNewNode(RBBINode::opStar);

	409 starNode->fLeftChild = operandNode;

	410 operandNode->fParent = starNode;

	411 }

	412 break;

	413

	414 case doRuleChar:

	415 // A "Rule Character" is any single character that is a literal part

	416 // of the regular expression. Like a, b and c in the expression "(abc*) \| [:L:]"

	417 // These are pretty uncommon in break rules; the terms are more commonly

	418 // sets. To keep things uniform, treat these characters like as

	419 // sets that just happen to contain only one character.

	420 {

	421 n = pushNewNode(RBBINode::setRef);

	422 findSetFor(fC.fChar, n);

	423 n->fFirstPos = fScanIndex;

	424 n->fLastPos = fNextIndex;

	425 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

	426 break;

	427 }

	428

	429 case doDotAny:

	430 // scanned a ".", meaning match any single character.

	431 {

	432 n = pushNewNode(RBBINode::setRef);

	433 findSetFor(kAny, n);

	434 n->fFirstPos = fScanIndex;

	435 n->fLastPos = fNextIndex;

	436 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

	437 break;

	438 }

	439

	440 case doSlash:

	441 // Scanned a '/', which identifies a look-ahead break position in a rule .

	442 n = pushNewNode(RBBINode::lookAhead);

	443 n->fVal = fRuleNum;

	444 n->fFirstPos = fScanIndex;

	445 n->fLastPos = fNextIndex;

	446 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

	447 fLookAheadRule = TRUE;

	448 break;

	449

	450

	451 case doStartTagValue:

	452 // Scanned a '{', the opening delimiter for a tag value within a rule.

	453 n = pushNewNode(RBBINode::tag);

	454 n->fVal = 0;

	455 n->fFirstPos = fScanIndex;

	456 n->fLastPos = fNextIndex;

	457 break;

	458

	459 case doTagDigit:

	460 // Just scanned a decimal digit that's part of a tag value

	461 {

	462 n = fNodeStack[fNodeStackPtr];

	463 uint32_t v = u_charDigitValue(fC.fChar);

	464 U_ASSERT(v < 10);

	465 n->fVal = n->fVal*10 + v;

	466 break;

	467 }

	468

	469 case doTagValue:

	470 n = fNodeStack[fNodeStackPtr];

	471 n->fLastPos = fNextIndex;

	472 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

	473 break;

	474

	475 case doTagExpectedError:

	476 error(U_BRK_MALFORMED_RULE_TAG);

	477 returnVal = FALSE;

	478 break;

	479

	480 case doOptionStart:

	481 // Scanning a !!option. At the start of string.

	482 fOptionStart = fScanIndex;

	483 break;

	484

	485 case doOptionEnd:

	486 {

	487 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart );

	488 if (opt == UNICODE_STRING("chain", 5)) {

	489 fRB->fChainRules = TRUE;

	490 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {

	491 fRB->fLBCMNoChain = TRUE;

	492 } else if (opt == UNICODE_STRING("forward", 7)) {

	493 fRB->fDefaultTree = &fRB->fForwardTree;

	494 } else if (opt == UNICODE_STRING("reverse", 7)) {

	495 fRB->fDefaultTree = &fRB->fReverseTree;

	496 } else if (opt == UNICODE_STRING("safe_forward", 12)) {

	497 fRB->fDefaultTree = &fRB->fSafeFwdTree;

	498 } else if (opt == UNICODE_STRING("safe_reverse", 12)) {

	499 fRB->fDefaultTree = &fRB->fSafeRevTree;

	500 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {

	501 fRB->fLookAheadHardBreak = TRUE;

	502 } else {

	503 error(U_BRK_UNRECOGNIZED_OPTION);

	504 }

	505 }

	506 break;

	507

	508 case doReverseDir:

	509 fReverseRule = TRUE;

	510 break;

	511

	512 case doStartVariableName:

	513 n = pushNewNode(RBBINode::varRef);

	514 if (U_FAILURE(*fRB->fStatus)) {

	515 break;

	516 }

	517 n->fFirstPos = fScanIndex;

	518 break;

	519

	520 case doEndVariableName:

	521 n = fNodeStack[fNodeStackPtr];

	522 if (n==NULL \|\| n->fType != RBBINode::varRef) {

	523 error(U_BRK_INTERNAL_ERROR);

	524 break;

	525 }

	526 n->fLastPos = fScanIndex;

	527 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);

	528 // Look the newly scanned name up in the symbol table

	529 // If there's an entry, set the l. child of the var ref to the replace ment expression.

	530 // (We also pass through here when scanning assignments, but no harm i s done, other

	531 // than a slight wasted effort that seems hard to avoid. Lookup will be null)

	532 n->fLeftChild = fSymbolTable->lookupNode(n->fText);

	533 break;

	534

	535 case doCheckVarDef:

	536 n = fNodeStack[fNodeStackPtr];

	537 if (n->fLeftChild == NULL) {

	538 error(U_BRK_UNDEFINED_VARIABLE);

	539 returnVal = FALSE;

	540 }

	541 break;

	542

	543 case doExprFinished:

	544 break;

	545

	546 case doRuleErrorAssignExpr:

	547 error(U_BRK_ASSIGN_ERROR);

	548 returnVal = FALSE;

	549 break;

	550

	551 case doExit:

	552 returnVal = FALSE;

	553 break;

	554

	555 case doScanUnicodeSet:

	556 scanSet();

	557 break;

	558

	559 default:

	560 error(U_BRK_INTERNAL_ERROR);

	561 returnVal = FALSE;

	562 break;

	563 }

	564 return returnVal;

	565 }

	566

	567

	568

	569

	570 //------------------------------------------------------------------------------

	571 //

	572 // Error Report a rule parse error.

	573 // Only report it if no previous error has been recorded.

	574 //

	575 //------------------------------------------------------------------------------

	576 void RBBIRuleScanner::error(UErrorCode e) {

	577 if (U_SUCCESS(*fRB->fStatus)) {

	578 *fRB->fStatus = e;

	579 if (fRB->fParseError) {

	580 fRB->fParseError->line = fLineNum;

	581 fRB->fParseError->offset = fCharNum;

	582 fRB->fParseError->preContext[0] = 0;

	583 fRB->fParseError->preContext[0] = 0;

	584 }

	585 }

	586 }

	587

	588

	589

	590

	591 //------------------------------------------------------------------------------

	592 //

	593 // fixOpStack The parse stack holds partially assembled chunks of the parse tree.

	594 // An entry on the stack may be as small as a single setRef node,

	595 // or as large as the parse tree

	596 // for an entire expression (this will be the one item left on the stack

	597 // when the parsing of an RBBI rule completes).

	598 //

	599 // This function is called when a binary operator is encountered.

	600 // It looks back up the stack for operators that are not yet associated

	601 // with a right operand, and if the precedence of the stacked operator >=

	602 // the precedence of the current operator, binds the operand left,

	603 // to the previously encountered operator.

	604 //

	605 //------------------------------------------------------------------------------

	606 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {

	607 RBBINode *n;

	608 // printNodeStack("entering fixOpStack()");

	609 for (;;) {

	610 n = fNodeStack[fNodeStackPtr-1]; // an operator node

	611 if (n->fPrecedence == 0) {

	612 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");

	613 error(U_BRK_INTERNAL_ERROR);

	614 return;

	615 }

	616

	617 if (n->fPrecedence < p \|\| n->fPrecedence <= RBBINode::precLParen) {

	618 // The most recent operand goes with the current operator,

	619 // not with the previously stacked one.

	620 break;

	621 }

	622 // Stack operator is a binary op ( '\|' or concatenation)

	623 // TOS operand becomes right child of this operator.

	624 // Resulting subexpression becomes the TOS operand.

	625 n->fRightChild = fNodeStack[fNodeStackPtr];

	626 fNodeStack[fNodeStackPtr]->fParent = n;

	627 fNodeStackPtr--;

	628 // printNodeStack("looping in fixOpStack() ");

	629 }

	630

	631 if (p <= RBBINode::precLParen) {

	632 // Scan is at a right paren or end of expression.

	633 // The scanned item must match the stack, or else there was an error.

	634 // Discard the left paren (or start expr) node from the stack,

	635 // leaving the completed (sub)expression as TOS.

	636 if (n->fPrecedence != p) {

	637 // Right paren encountered matched start of expression node, or

	638 // end of expression matched with a left paren node.

	639 error(U_BRK_MISMATCHED_PAREN);

	640 }

	641 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];

	642 fNodeStackPtr--;

	643 // Delete the now-discarded LParen or Start node.

	644 delete n;

	645 }

	646 // printNodeStack("leaving fixOpStack()");

	647 }

	648

	649

	650

	651

	652 //------------------------------------------------------------------------------

	653 //

	654 // findSetFor given a UnicodeString,

	655 // - find the corresponding Unicode Set (uset node)

	656 // (create one if necessary)

	657 // - Set fLeftChild of the caller's node (should be a setRef node)

	658 // to the uset node

	659 // Maintain a hash table of uset nodes, so the same one is always used

	660 // for the same string.

	661 // If a "to adopt" set is provided and we haven't seen this key before,

	662 // add the provided set to the hash table.

	663 // If the string is one (32 bit) char in length, the set contains

	664 // just one element which is the char in question.

	665 // If the string is "any", return a set containing all chars.

	666 //

	667 //------------------------------------------------------------------------------

	668 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode node, Unicode Set setToAdopt) {

	669

	670 RBBISetTableEl *el;

	671

	672 // First check whether we've already cached a set for this string.

	673 // If so, just use the cached set in the new node.

	674 // delete any set provided by the caller, since we own it.

	675 el = (RBBISetTableEl *)uhash_get(fSetTable, &s);

	676 if (el != NULL) {

	677 delete setToAdopt;

	678 node->fLeftChild = el->val;

	679 U_ASSERT(node->fLeftChild->fType == RBBINode::uset);

	680 return;

	681 }

	682

	683 // Haven't seen this set before.

	684 // If the caller didn't provide us with a prebuilt set,

	685 // create a new UnicodeSet now.

	686 if (setToAdopt == NULL) {

	687 if (s.compare(kAny, -1) == 0) {

	688 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);

	689 } else {

	690 UChar32 c;

	691 c = s.char32At(0);

	692 setToAdopt = new UnicodeSet(c, c);

	693 }

	694 }

	695

	696 //

	697 // Make a new uset node to refer to this UnicodeSet

	698 // This new uset node becomes the child of the caller's setReference node.

	699 //

	700 RBBINode *usetNode = new RBBINode(RBBINode::uset);

	701 if (usetNode == NULL) {

	702 error(U_MEMORY_ALLOCATION_ERROR);

	703 return;

	704 }

	705 usetNode->fInputSet = setToAdopt;

	706 usetNode->fParent = node;

	707 node->fLeftChild = usetNode;

	708 usetNode->fText = s;

	709

	710

	711 //

	712 // Add the new uset node to the list of all uset nodes.

	713 //

	714 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);

	715

	716

	717 //

	718 // Add the new set to the set hash table.

	719 //

	720 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));

	721 UnicodeString *tkey = new UnicodeString(s);

	722 if (tkey == NULL \|\| el == NULL \|\| setToAdopt == NULL) {

	723 // Delete to avoid memory leak

	724 delete tkey;

	725 tkey = NULL;

	726 uprv_free(el);

	727 el = NULL;

	728 delete setToAdopt;

	729 setToAdopt = NULL;

	730

	731 error(U_MEMORY_ALLOCATION_ERROR);

	732 return;

	733 }

	734 el->key = tkey;

	735 el->val = usetNode;

	736 uhash_put(fSetTable, el->key, el, fRB->fStatus);

	737

	738 return;

	739 }

	740

	741

	742

	743 //

	744 // Assorted Unicode character constants.

	745 // Numeric because there is no portable way to enter them as literals.

	746 // (Think EBCDIC).

	747 //

	748 static const UChar chCR = 0x0d; // New lines, for terminating comments.

	749 static const UChar chLF = 0x0a;

	750 static const UChar chNEL = 0x85; // NEL newline variant

	751 static const UChar chLS = 0x2028; // Unicode Line Separator

	752 static const UChar chApos = 0x27; // single quote, for quoted chars.

	753 static const UChar chPound = 0x23; // '#', introduces a comment.

	754 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape

	755 static const UChar chLParen = 0x28; // '(' returned by nextChar() when a quote opens

	756 static const UChar chRParen = 0x29; // ')' returned by nextChar() when a quote closes

	757

	758

	759 //------------------------------------------------------------------------------

	760 //

	761 // stripRules Return a rules string without unnecessary

	762 // characters.

	763 //

	764 //------------------------------------------------------------------------------

	765 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {

	766 UnicodeString strippedRules;

	767 int rulesLength = rules.length();

	768 for (int idx = 0; idx < rulesLength; ) {

	769 UChar ch = rules[idx++];

	770 if (ch == chPound) {

	771 while (idx < rulesLength

	772 && ch != chCR && ch != chLF && ch != chNEL)

	773 {

	774 ch = rules[idx++];

	775 }

	776 }

	777 if (!u_isISOControl(ch)) {

	778 strippedRules.append(ch);

	779 }

	780 }

	781 // strippedRules = strippedRules.unescape();

	782 return strippedRules;

	783 }

	784

	785

	786 //------------------------------------------------------------------------------

	787 //

	788 // nextCharLL Low Level Next Char from rule input source.

	789 // Get a char from the input character iterator,

	790 // keep track of input position for error reporting.

	791 //

	792 //------------------------------------------------------------------------------

	793 UChar32 RBBIRuleScanner::nextCharLL() {

	794 UChar32 ch;

	795

	796 if (fNextIndex >= fRB->fRules.length()) {

	797 return (UChar32)-1;

	798 }

	799 ch = fRB->fRules.char32At(fNextIndex);

	800 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);

	801

	802 if (ch == chCR \|\|

	803 ch == chNEL \|\|

	804 ch == chLS \|\|

	805 (ch == chLF && fLastChar != chCR)) {

	806 // Character is starting a new line. Bump up the line number, and

	807 // reset the column to 0.

	808 fLineNum++;

	809 fCharNum=0;

	810 if (fQuoteMode) {

	811 error(U_BRK_NEW_LINE_IN_QUOTED_STRING);

	812 fQuoteMode = FALSE;

	813 }

	814 }

	815 else {

	816 // Character is not starting a new line. Except in the case of a

	817 // LF following a CR, increment the column position.

	818 if (ch != chLF) {

	819 fCharNum++;

	820 }

	821 }

	822 fLastChar = ch;

	823 return ch;

	824 }

	825

	826

	827 //------------------------------------------------------------------------------

	828 //

	829 // nextChar for rules scanning. At this level, we handle stripping

	830 // out comments and processing backslash character escapes.

	831 // The rest of the rules grammar is handled at the next level up.

	832 //

	833 //------------------------------------------------------------------------------

	834 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {

	835

	836 // Unicode Character constants needed for the processing done by nextChar(),

	837 // in hex because literals wont work on EBCDIC machines.

	838

	839 fScanIndex = fNextIndex;

	840 c.fChar = nextCharLL();

	841 c.fEscaped = FALSE;

	842

	843 //

	844 // check for '' sequence.

	845 // These are recognized in all contexts, whether in quoted text or not.

	846 //

	847 if (c.fChar == chApos) {

	848 if (fRB->fRules.char32At(fNextIndex) == chApos) {

	849 c.fChar = nextCharLL(); // get nextChar officially so char acter counts

	850 c.fEscaped = TRUE; // stay correct.

	851 }

	852 else

	853 {

	854 // Single quote, by itself.

	855 // Toggle quoting mode.

	856 // Return either '(' or ')', because quotes cause a grouping of t he quoted text.

	857 fQuoteMode = !fQuoteMode;

	858 if (fQuoteMode == TRUE) {

	859 c.fChar = chLParen;

	860 } else {

	861 c.fChar = chRParen;

	862 }

	863 c.fEscaped = FALSE; // The paren that we return is not escaped.

	864 return;

	865 }

	866 }

	867

	868 if (fQuoteMode) {

	869 c.fEscaped = TRUE;

	870 }

	871 else

	872 {

	873 // We are not in a 'quoted region' of the source.

	874 //

	875 if (c.fChar == chPound) {

	876 // Start of a comment. Consume the rest of it.

	877 // The new-line char that terminates the comment is always returned .

	878 // It will be treated as white-space, and serves to break up anythi ng

	879 // that might otherwise incorrectly clump together with a comment in

	880 // the middle (a variable name, for example.)

	881 for (;;) {

	882 c.fChar = nextCharLL();

	883 if (c.fChar == (UChar32)-1 \|\| // EOF

	884 c.fChar == chCR \|\|

	885 c.fChar == chLF \|\|

	886 c.fChar == chNEL \|\|

	887 c.fChar == chLS) {break;}

	888 }

	889 }

	890 if (c.fChar == (UChar32)-1) {

	891 return;

	892 }

	893

	894 //

	895 // check for backslash escaped characters.

	896 // Use UnicodeString::unescapeAt() to handle them.

	897 //

	898 if (c.fChar == chBackSlash) {

	899 c.fEscaped = TRUE;

	900 int32_t startX = fNextIndex;

	901 c.fChar = fRB->fRules.unescapeAt(fNextIndex);

	902 if (fNextIndex == startX) {

	903 error(U_BRK_HEX_DIGITS_EXPECTED);

	904 }

	905 fCharNum += fNextIndex-startX;

	906 }

	907 }

	908 // putc(c.fChar, stdout);

	909 }

	910

	911 //------------------------------------------------------------------------------

	912 //

	913 // Parse RBBI rules. The state machine for rules parsing is here.

	914 // The state tables are hand-written in the file rbbirpt.txt,

	915 // and converted to the form used here by a perl

	916 // script rbbicst.pl

	917 //

	918 //------------------------------------------------------------------------------

	919 void RBBIRuleScanner::parse() {

	920 uint16_t state;

	921 const RBBIRuleTableEl *tableEl;

	922

	923 if (U_FAILURE(*fRB->fStatus)) {

	924 return;

	925 }

	926

	927 state = 1;

	928 nextChar(fC);

	929 //

	930 // Main loop for the rule parsing state machine.

	931 // Runs once per state transition.

	932 // Each time through optionally performs, depending on the state table,

	933 // - an advance to the the next input char

	934 // - an action to be performed.

	935 // - pushing or popping a state to/from the local state return stack.

	936 //

	937 for (;;) {

	938 // Bail out if anything has gone wrong.

	939 // RBBI rule file parsing stops on the first error encountered.

	940 if (U_FAILURE(*fRB->fStatus)) {

	941 break;

	942 }

	943

	944 // Quit if state == 0. This is the normal way to exit the state machine .

	945 //

	946 if (state == 0) {

	947 break;

	948 }

	949

	950 // Find the state table element that matches the input char from the rul e, or the

	951 // class of the input character. Start with the first table row for this

	952 // state, then linearly scan forward until we find a row that matches the

	953 // character. The last row for each state always matches all charact ers, so

	954 // the search will stop there, if not before.

	955 //

	956 tableEl = &gRuleParseStateTable[state];

	957 #ifdef RBBI_DEBUG

	958 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {

	959 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",

	960 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);

	961 }

	962 #endif

	963

	964 for (;;) {

	965 #ifdef RBBI_DEBUG

	966 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBB IDebugPrintf(".");}

	967 #endif

	968 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl-> fCharClass == fC.fChar) {

	969 // Table row specified an individual character, not a set, and

	970 // the input character is not escaped, and

	971 // the input character matched it.

	972 break;

	973 }

	974 if (tableEl->fCharClass == 255) {

	975 // Table row specified default, match anything character class.

	976 break;

	977 }

	978 if (tableEl->fCharClass == 254 && fC.fEscaped) {

	979 // Table row specified "escaped" and the char was escaped.

	980 break;

	981 }

	982 if (tableEl->fCharClass == 253 && fC.fEscaped &&

	983 (fC.fChar == 0x50 \|\| fC.fChar == 0x70 )) {

	984 // Table row specified "escaped P" and the char is either 'p' or 'P'.

	985 break;

	986 }

	987 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {

	988 // Table row specified eof and we hit eof on the input.

	989 break;

	990 }

	991

	992 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&

	993 fC.fEscaped == FALSE && // char is not escaped &&

	994 fC.fChar != (UChar32)-1) { // char is not EOF

	995 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {

	996 // Table row specified a character class, or set of characte rs,

	997 // and the current char matches it.

	998 break;

	999 }

	1000 }

	1001

	1002 // No match on this row, advance to the next row for this state,

	1003 tableEl++;

	1004 }

	1005 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPu ts("");}

	1006

	1007 //

	1008 // We've found the row of the state table that matches the current input

	1009 // character from the rules string.

	1010 // Perform any action specified by this row in the state table.

	1011 if (doParseActions((int32_t)tableEl->fAction) == FALSE) {

	1012 // Break out of the state machine loop if the

	1013 // the action signalled some kind of error, or

	1014 // the action was to exit, occurs on normal end-of-rules-input.

	1015 break;

	1016 }

	1017

	1018 if (tableEl->fPushState != 0) {

	1019 fStackPtr++;

	1020 if (fStackPtr >= kStackSize) {

	1021 error(U_BRK_INTERNAL_ERROR);

	1022 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow." );

	1023 fStackPtr--;

	1024 }

	1025 fStack[fStackPtr] = tableEl->fPushState;

	1026 }

	1027

	1028 if (tableEl->fNextChar) {

	1029 nextChar(fC);

	1030 }

	1031

	1032 // Get the next state from the table entry, or from the

	1033 // state stack if the next state was specified as "pop".

	1034 if (tableEl->fNextState != 255) {

	1035 state = tableEl->fNextState;

	1036 } else {

	1037 state = fStack[fStackPtr];

	1038 fStackPtr--;

	1039 if (fStackPtr < 0) {

	1040 error(U_BRK_INTERNAL_ERROR);

	1041 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow. ");

	1042 fStackPtr++;

	1043 }

	1044 }

	1045

	1046 }

	1047

	1048 //

	1049 // If there were NO user specified reverse rules, set up the equivalent of " .*;"

	1050 //

	1051 if (fRB->fReverseTree == NULL) {

	1052 fRB->fReverseTree = pushNewNode(RBBINode::opStar);

	1053 RBBINode *operand = pushNewNode(RBBINode::setRef);

	1054 findSetFor(kAny, operand);

	1055 fRB->fReverseTree->fLeftChild = operand;

	1056 operand->fParent = fRB->fReverseTree;

	1057 fNodeStackPtr -= 2;

	1058 }

	1059

	1060

	1061 //

	1062 // Parsing of the input RBBI rules is complete.

	1063 // We now have a parse tree for the rule expressions

	1064 // and a list of all UnicodeSets that are referenced.

	1065 //

	1066 #ifdef RBBI_DEBUG

	1067 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable- >rbbiSymtablePrint();}

	1068 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))

	1069 {

	1070 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");

	1071 fRB->fForwardTree->printTree(TRUE);

	1072 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");

	1073 fRB->fReverseTree->printTree(TRUE);

	1074 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");

	1075 fRB->fSafeFwdTree->printTree(TRUE);

	1076 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");

	1077 fRB->fSafeRevTree->printTree(TRUE);

	1078 }

	1079 #endif

	1080 }

	1081

	1082

	1083 //------------------------------------------------------------------------------

	1084 //

	1085 // printNodeStack for debugging...

	1086 //

	1087 //------------------------------------------------------------------------------

	1088 #ifdef RBBI_DEBUG

	1089 void RBBIRuleScanner::printNodeStack(const char *title) {

	1090 int i;

	1091 RBBIDebugPrintf("%s. Dumping node stack...\n", title);

	1092 for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}

	1093 }

	1094 #endif

	1095

	1096

	1097

	1098

	1099 //------------------------------------------------------------------------------

	1100 //

	1101 // pushNewNode create a new RBBINode of the specified type and push it

	1102 // onto the stack of nodes.

	1103 //

	1104 //------------------------------------------------------------------------------

	1105 RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {

	1106 fNodeStackPtr++;

	1107 if (fNodeStackPtr >= kStackSize) {

	1108 error(U_BRK_INTERNAL_ERROR);

	1109 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");

	1110 *fRB->fStatus = U_BRK_INTERNAL_ERROR;

	1111 return NULL;

	1112 }

	1113 fNodeStack[fNodeStackPtr] = new RBBINode(t);

	1114 if (fNodeStack[fNodeStackPtr] == NULL) {

	1115 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;

	1116 }

	1117 return fNodeStack[fNodeStackPtr];

	1118 }

	1119

	1120

	1121

	1122 //------------------------------------------------------------------------------

	1123 //

	1124 // scanSet Construct a UnicodeSet from the text at the current scan

	1125 // position. Advance the scan position to the first character

	1126 // after the set.

	1127 //

	1128 // A new RBBI setref node referring to the set is pushed onto the node

	1129 // stack.

	1130 //

	1131 // The scan position is normally under the control of the state machine

	1132 // that controls rule parsing. UnicodeSets, however, are parsed by

	1133 // the UnicodeSet constructor, not by the RBBI rule parser.

	1134 //

	1135 //------------------------------------------------------------------------------

	1136 void RBBIRuleScanner::scanSet() {

	1137 UnicodeSet *uset;

	1138 ParsePosition pos;

	1139 int startPos;

	1140 int i;

	1141

	1142 if (U_FAILURE(*fRB->fStatus)) {

	1143 return;

	1144 }

	1145

	1146 pos.setIndex(fScanIndex);

	1147 startPos = fScanIndex;

	1148 UErrorCode localStatus = U_ZERO_ERROR;

	1149 uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE,

	1150 fSymbolTable,

	1151 localStatus);

	1152 if (uset == NULL) {

	1153 localStatus = U_MEMORY_ALLOCATION_ERROR;

	1154 }

	1155 if (U_FAILURE(localStatus)) {

	1156 // TODO: Get more accurate position of the error from UnicodeSet's ret urn info.

	1157 // UnicodeSet appears to not be reporting correctly at this time .

	1158 #ifdef RBBI_DEBUG

	1159 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.ge tIndex());

	1160 #endif

	1161 error(localStatus);

	1162 delete uset;

	1163 return;

	1164 }

	1165

	1166 // Verify that the set contains at least one code point.

	1167 //

	1168 if (uset->isEmpty()) {

	1169 // This set is empty.

	1170 // Make it an error, because it almost certainly is not what the user w anted.

	1171 // Also, avoids having to think about corner cases in the tree manipula tion code

	1172 // that occurs later on.

	1173 error(U_BRK_RULE_EMPTY_SET);

	1174 delete uset;

	1175 return;

	1176 }

	1177

	1178

	1179 // Advance the RBBI parse postion over the UnicodeSet pattern.

	1180 // Don't just set fScanIndex because the line/char positions maintained

	1181 // for error reporting would be thrown off.

	1182 i = pos.getIndex();

	1183 for (;;) {

	1184 if (fNextIndex >= i) {

	1185 break;

	1186 }

	1187 nextCharLL();

	1188 }

	1189

	1190 if (U_SUCCESS(*fRB->fStatus)) {

	1191 RBBINode *n;

	1192

	1193 n = pushNewNode(RBBINode::setRef);

	1194 n->fFirstPos = startPos;

	1195 n->fLastPos = fNextIndex;

	1196 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

	1197 // findSetFor() serves several purposes here:

	1198 // - Adopts storage for the UnicodeSet, will be responsible for dele ting.

	1199 // - Mantains collection of all sets in use, needed later for establ ishing

	1200 // character categories for run time engine.

	1201 // - Eliminates mulitiple instances of the same set.

	1202 // - Creates a new uset node if necessary (if this isn't a duplicate .)

	1203 findSetFor(n->fText, n, uset);

	1204 }

	1205

	1206 }

	1207

	1208 U_NAMESPACE_END

	1209

	1210 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

OLD	NEW

« no previous file with comments | « icu46/source/common/rbbiscan.h ('k') | icu46/source/common/rbbisetb.h » ('j') | no next file with comments »