OLD | NEW |
(Empty) | |
| 1 |
| 2 // |
| 3 // file: rbbiscan.cpp |
| 4 // |
| 5 // Copyright (C) 2002-2010, International Business Machines Corporation and oth
ers. |
| 6 // All Rights Reserved. |
| 7 // |
| 8 // This file contains the Rule Based Break Iterator Rule Builder functions for |
| 9 // scanning the rules and assembling a parse tree. This is the first phase |
| 10 // of compiling the rules. |
| 11 // |
| 12 // The overall of the rules is managed by class RBBIRuleBuilder, which will |
| 13 // create and use an instance of this class as part of the process. |
| 14 // |
| 15 |
| 16 #include "unicode/utypes.h" |
| 17 |
| 18 #if !UCONFIG_NO_BREAK_ITERATION |
| 19 |
| 20 #include "unicode/unistr.h" |
| 21 #include "unicode/uniset.h" |
| 22 #include "unicode/uchar.h" |
| 23 #include "unicode/uchriter.h" |
| 24 #include "unicode/parsepos.h" |
| 25 #include "unicode/parseerr.h" |
| 26 #include "util.h" |
| 27 #include "cmemory.h" |
| 28 #include "cstring.h" |
| 29 |
| 30 #include "rbbirpt.h" // Contains state table for the rbbi rules parser. |
| 31 // generated by a Perl script. |
| 32 #include "rbbirb.h" |
| 33 #include "rbbinode.h" |
| 34 #include "rbbiscan.h" |
| 35 #include "rbbitblb.h" |
| 36 |
| 37 #include "uassert.h" |
| 38 |
| 39 |
| 40 //------------------------------------------------------------------------------ |
| 41 // |
| 42 // Unicode Set init strings for each of the character classes needed for parsing
a rule file. |
| 43 // (Initialized with hex values for portability to EBCDIC based ma
chines. |
| 44 // Really ugly, but there's no good way to avoid it.) |
| 45 // |
| 46 // The sets are referred to by name in the rbbirpt.txt, which is th
e |
| 47 // source form of the state transition table for the RBBI rule pars
er. |
| 48 // |
| 49 //------------------------------------------------------------------------------ |
| 50 static const UChar gRuleSet_rule_char_pattern[] = { |
| 51 // [ ^ [ \ p { Z } \ u 0 0 2
0 |
| 52 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32
, 0x30, |
| 53 // - \ u 0 0 7 f ] - [ \ p |
| 54 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, |
| 55 // { L } ] - [ \ p { N } ] ] |
| 56 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d
, 0}; |
| 57 |
| 58 static const UChar gRuleSet_name_char_pattern[] = { |
| 59 // [ _ \ p { L } \ p { N } ] |
| 60 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d
, 0}; |
| 61 |
| 62 static const UChar gRuleSet_digit_char_pattern[] = { |
| 63 // [ 0 - 9 ] |
| 64 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; |
| 65 |
| 66 static const UChar gRuleSet_name_start_char_pattern[] = { |
| 67 // [ _ \ p { L } ] |
| 68 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; |
| 69 |
| 70 static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" |
| 71 |
| 72 |
| 73 U_CDECL_BEGIN |
| 74 static void U_CALLCONV RBBISetTable_deleter(void *p) { |
| 75 U_NAMESPACE_QUALIFIER RBBISetTableEl *px = (U_NAMESPACE_QUALIFIER RBBISetTab
leEl *)p; |
| 76 delete px->key; |
| 77 // Note: px->val is owned by the linked list "fSetsListHead" in scanner. |
| 78 // Don't delete the value nodes here. |
| 79 uprv_free(px); |
| 80 } |
| 81 U_CDECL_END |
| 82 |
| 83 U_NAMESPACE_BEGIN |
| 84 |
| 85 //------------------------------------------------------------------------------ |
| 86 // |
| 87 // Constructor. |
| 88 // |
| 89 //------------------------------------------------------------------------------ |
| 90 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) |
| 91 { |
| 92 fRB = rb; |
| 93 fStackPtr = 0; |
| 94 fStack[fStackPtr] = 0; |
| 95 fNodeStackPtr = 0; |
| 96 fRuleNum = 0; |
| 97 fNodeStack[0] = NULL; |
| 98 |
| 99 fSymbolTable = NULL; |
| 100 fSetTable = NULL; |
| 101 |
| 102 fScanIndex = 0; |
| 103 fNextIndex = 0; |
| 104 |
| 105 fReverseRule = FALSE; |
| 106 fLookAheadRule = FALSE; |
| 107 |
| 108 fLineNum = 1; |
| 109 fCharNum = 0; |
| 110 fQuoteMode = FALSE; |
| 111 |
| 112 // Do not check status until after all critical fields are sufficiently init
ialized |
| 113 // that the destructor can run cleanly. |
| 114 if (U_FAILURE(*rb->fStatus)) { |
| 115 return; |
| 116 } |
| 117 |
| 118 // |
| 119 // Set up the constant Unicode Sets. |
| 120 // Note: These could be made static, lazily initialized, and shared amo
ng |
| 121 // all instances of RBBIRuleScanners. BUT this is quite a bit si
mpler, |
| 122 // and the time to build these few sets should be small compared
to a |
| 123 // full break iterator build. |
| 124 fRuleSets[kRuleSet_rule_char-128] = UnicodeSet(gRuleSet_rule_char_patt
ern, *rb->fStatus); |
| 125 UnicodeSet *whitespaceSet = uprv_openRuleWhiteSpaceSet(rb->fStatus); |
| 126 if (U_FAILURE(*rb->fStatus)) { |
| 127 return; |
| 128 } |
| 129 fRuleSets[kRuleSet_white_space-128] = *whitespaceSet; |
| 130 delete whitespaceSet; |
| 131 fRuleSets[kRuleSet_name_char-128] = UnicodeSet(gRuleSet_name_char_patt
ern, *rb->fStatus); |
| 132 fRuleSets[kRuleSet_name_start_char-128] = UnicodeSet(gRuleSet_name_start_cha
r_pattern, *rb->fStatus); |
| 133 fRuleSets[kRuleSet_digit_char-128] = UnicodeSet(gRuleSet_digit_char_pat
tern, *rb->fStatus); |
| 134 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { |
| 135 // This case happens if ICU's data is missing. UnicodeSet tries to look
up property |
| 136 // names from the init string, can't find them, and claims an illegal
arguement. |
| 137 // Change the error so that the actual problem will be clearer to user
s. |
| 138 *rb->fStatus = U_BRK_INIT_ERROR; |
| 139 } |
| 140 if (U_FAILURE(*rb->fStatus)) { |
| 141 return; |
| 142 } |
| 143 |
| 144 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); |
| 145 if (fSymbolTable == NULL) { |
| 146 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; |
| 147 return; |
| 148 } |
| 149 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeStrin
g, NULL, rb->fStatus); |
| 150 if (U_FAILURE(*rb->fStatus)) { |
| 151 return; |
| 152 } |
| 153 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); |
| 154 } |
| 155 |
| 156 |
| 157 |
| 158 //------------------------------------------------------------------------------ |
| 159 // |
| 160 // Destructor |
| 161 // |
| 162 //------------------------------------------------------------------------------ |
| 163 RBBIRuleScanner::~RBBIRuleScanner() { |
| 164 delete fSymbolTable; |
| 165 if (fSetTable != NULL) { |
| 166 uhash_close(fSetTable); |
| 167 fSetTable = NULL; |
| 168 |
| 169 } |
| 170 |
| 171 |
| 172 // Node Stack. |
| 173 // Normally has one entry, which is the entire parse tree for the rules. |
| 174 // If errors occured, there may be additional subtrees left on the stack. |
| 175 while (fNodeStackPtr > 0) { |
| 176 delete fNodeStack[fNodeStackPtr]; |
| 177 fNodeStackPtr--; |
| 178 } |
| 179 |
| 180 } |
| 181 |
| 182 //------------------------------------------------------------------------------ |
| 183 // |
| 184 // doParseAction Do some action during rule parsing. |
| 185 // Called by the parse state machine. |
| 186 // Actions build the parse tree and Unicode Sets, |
| 187 // and maintain the parse stack for nested expressions. |
| 188 // |
| 189 // TODO: unify EParseAction and RBBI_RuleParseAction enum
types. |
| 190 // They represent exactly the same thing. They're
separate |
| 191 // only to work around enum forward declaration res
trictions |
| 192 // in some compilers, while at the same time avoidi
ng multiple |
| 193 // definitions problems. I'm sure that there's a b
etter way. |
| 194 // |
| 195 //------------------------------------------------------------------------------ |
| 196 UBool RBBIRuleScanner::doParseActions(int32_t action) |
| 197 { |
| 198 RBBINode *n = NULL; |
| 199 |
| 200 UBool returnVal = TRUE; |
| 201 |
| 202 switch (action) { |
| 203 |
| 204 case doExprStart: |
| 205 pushNewNode(RBBINode::opStart); |
| 206 fRuleNum++; |
| 207 break; |
| 208 |
| 209 |
| 210 case doExprOrOperator: |
| 211 { |
| 212 fixOpStack(RBBINode::precOpCat); |
| 213 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; |
| 214 RBBINode *orNode = pushNewNode(RBBINode::opOr); |
| 215 orNode->fLeftChild = operandNode; |
| 216 operandNode->fParent = orNode; |
| 217 } |
| 218 break; |
| 219 |
| 220 case doExprCatOperator: |
| 221 // concatenation operator. |
| 222 // For the implicit concatenation of adjacent terms in an expression tha
t are |
| 223 // not separated by any other operator. Action is invoked between the |
| 224 // actions for the two terms. |
| 225 { |
| 226 fixOpStack(RBBINode::precOpCat); |
| 227 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; |
| 228 RBBINode *catNode = pushNewNode(RBBINode::opCat); |
| 229 catNode->fLeftChild = operandNode; |
| 230 operandNode->fParent = catNode; |
| 231 } |
| 232 break; |
| 233 |
| 234 case doLParen: |
| 235 // Open Paren. |
| 236 // The openParen node is a dummy operation type with a low precedence, |
| 237 // which has the affect of ensuring that any real binary op that |
| 238 // follows within the parens binds more tightly to the operands than |
| 239 // stuff outside of the parens. |
| 240 pushNewNode(RBBINode::opLParen); |
| 241 break; |
| 242 |
| 243 case doExprRParen: |
| 244 fixOpStack(RBBINode::precLParen); |
| 245 break; |
| 246 |
| 247 case doNOP: |
| 248 break; |
| 249 |
| 250 case doStartAssign: |
| 251 // We've just scanned "$variable = " |
| 252 // The top of the node stack has the $variable ref node. |
| 253 |
| 254 // Save the start position of the RHS text in the StartExpression node |
| 255 // that precedes the $variableReference node on the stack. |
| 256 // This will eventually be used when saving the full $variable replace
ment |
| 257 // text as a string. |
| 258 n = fNodeStack[fNodeStackPtr-1]; |
| 259 n->fFirstPos = fNextIndex; // move past the '=' |
| 260 |
| 261 // Push a new start-of-expression node; needed to keep parse of the |
| 262 // RHS expression happy. |
| 263 pushNewNode(RBBINode::opStart); |
| 264 break; |
| 265 |
| 266 |
| 267 |
| 268 |
| 269 case doEndAssign: |
| 270 { |
| 271 // We have reached the end of an assignement statement. |
| 272 // Current scan char is the ';' that terminates the assignment. |
| 273 |
| 274 // Terminate expression, leaves expression parse tree rooted in TOS
node. |
| 275 fixOpStack(RBBINode::precStart); |
| 276 |
| 277 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; |
| 278 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; |
| 279 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; |
| 280 |
| 281 // Save original text of right side of assignment, excluding the ter
minating ';' |
| 282 // in the root of the node for the right-hand-side expression. |
| 283 RHSExprNode->fFirstPos = startExprNode->fFirstPos; |
| 284 RHSExprNode->fLastPos = fScanIndex; |
| 285 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLas
tPos, RHSExprNode->fText); |
| 286 |
| 287 // Expression parse tree becomes l. child of the $variable reference
node. |
| 288 varRefNode->fLeftChild = RHSExprNode; |
| 289 RHSExprNode->fParent = varRefNode; |
| 290 |
| 291 // Make a symbol table entry for the $variableRef node. |
| 292 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus)
; |
| 293 if (U_FAILURE(*fRB->fStatus)) { |
| 294 // This is a round-about way to get the parse position set |
| 295 // so that duplicate symbols error messages include a line numb
er. |
| 296 UErrorCode t = *fRB->fStatus; |
| 297 *fRB->fStatus = U_ZERO_ERROR; |
| 298 error(t); |
| 299 } |
| 300 |
| 301 // Clean up the stack. |
| 302 delete startExprNode; |
| 303 fNodeStackPtr-=3; |
| 304 break; |
| 305 } |
| 306 |
| 307 case doEndOfRule: |
| 308 { |
| 309 fixOpStack(RBBINode::precStart); // Terminate expression, leaves ex
pression |
| 310 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node
. |
| 311 break; |
| 312 } |
| 313 #ifdef RBBI_DEBUG |
| 314 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeSt
ack("end of rule");} |
| 315 #endif |
| 316 U_ASSERT(fNodeStackPtr == 1); |
| 317 |
| 318 // If this rule includes a look-ahead '/', add a endMark node to the |
| 319 // expression tree. |
| 320 if (fLookAheadRule) { |
| 321 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; |
| 322 RBBINode *endNode = pushNewNode(RBBINode::endMark); |
| 323 RBBINode *catNode = pushNewNode(RBBINode::opCat); |
| 324 fNodeStackPtr -= 2; |
| 325 catNode->fLeftChild = thisRule; |
| 326 catNode->fRightChild = endNode; |
| 327 fNodeStack[fNodeStackPtr] = catNode; |
| 328 endNode->fVal = fRuleNum; |
| 329 endNode->fLookAheadEnd = TRUE; |
| 330 } |
| 331 |
| 332 // All rule expressions are ORed together. |
| 333 // The ';' that terminates an expression really just functions as a '|'
with |
| 334 // a low operator prededence. |
| 335 // |
| 336 // Each of the four sets of rules are collected separately. |
| 337 // (forward, reverse, safe_forward, safe_reverse) |
| 338 // OR this rule into the appropriate group of them. |
| 339 // |
| 340 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefault
Tree); |
| 341 |
| 342 if (*destRules != NULL) { |
| 343 // This is not the first rule encounted. |
| 344 // OR previous stuff (from *destRules) |
| 345 // with the current rule expression (on the Node Stack) |
| 346 // with the resulting OR expression going to *destRules |
| 347 // |
| 348 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; |
| 349 RBBINode *prevRules = *destRules; |
| 350 RBBINode *orNode = pushNewNode(RBBINode::opOr); |
| 351 orNode->fLeftChild = prevRules; |
| 352 prevRules->fParent = orNode; |
| 353 orNode->fRightChild = thisRule; |
| 354 thisRule->fParent = orNode; |
| 355 *destRules = orNode; |
| 356 } |
| 357 else |
| 358 { |
| 359 // This is the first rule encountered (for this direction). |
| 360 // Just move its parse tree from the stack to *destRules. |
| 361 *destRules = fNodeStack[fNodeStackPtr]; |
| 362 } |
| 363 fReverseRule = FALSE; // in preparation for the next rule. |
| 364 fLookAheadRule = FALSE; |
| 365 fNodeStackPtr = 0; |
| 366 } |
| 367 break; |
| 368 |
| 369 |
| 370 case doRuleError: |
| 371 error(U_BRK_RULE_SYNTAX); |
| 372 returnVal = FALSE; |
| 373 break; |
| 374 |
| 375 |
| 376 case doVariableNameExpectedErr: |
| 377 error(U_BRK_RULE_SYNTAX); |
| 378 break; |
| 379 |
| 380 |
| 381 // |
| 382 // Unary operands + ? * |
| 383 // These all appear after the operand to which they apply. |
| 384 // When we hit one, the operand (may be a whole sub expression) |
| 385 // will be on the top of the stack. |
| 386 // Unary Operator becomes TOS, with the old TOS as its one child. |
| 387 case doUnaryOpPlus: |
| 388 { |
| 389 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; |
| 390 RBBINode *plusNode = pushNewNode(RBBINode::opPlus); |
| 391 plusNode->fLeftChild = operandNode; |
| 392 operandNode->fParent = plusNode; |
| 393 } |
| 394 break; |
| 395 |
| 396 case doUnaryOpQuestion: |
| 397 { |
| 398 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; |
| 399 RBBINode *qNode = pushNewNode(RBBINode::opQuestion); |
| 400 qNode->fLeftChild = operandNode; |
| 401 operandNode->fParent = qNode; |
| 402 } |
| 403 break; |
| 404 |
| 405 case doUnaryOpStar: |
| 406 { |
| 407 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; |
| 408 RBBINode *starNode = pushNewNode(RBBINode::opStar); |
| 409 starNode->fLeftChild = operandNode; |
| 410 operandNode->fParent = starNode; |
| 411 } |
| 412 break; |
| 413 |
| 414 case doRuleChar: |
| 415 // A "Rule Character" is any single character that is a literal part |
| 416 // of the regular expression. Like a, b and c in the expression "(abc*)
| [:L:]" |
| 417 // These are pretty uncommon in break rules; the terms are more commonly |
| 418 // sets. To keep things uniform, treat these characters like as |
| 419 // sets that just happen to contain only one character. |
| 420 { |
| 421 n = pushNewNode(RBBINode::setRef); |
| 422 findSetFor(fC.fChar, n); |
| 423 n->fFirstPos = fScanIndex; |
| 424 n->fLastPos = fNextIndex; |
| 425 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); |
| 426 break; |
| 427 } |
| 428 |
| 429 case doDotAny: |
| 430 // scanned a ".", meaning match any single character. |
| 431 { |
| 432 n = pushNewNode(RBBINode::setRef); |
| 433 findSetFor(kAny, n); |
| 434 n->fFirstPos = fScanIndex; |
| 435 n->fLastPos = fNextIndex; |
| 436 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); |
| 437 break; |
| 438 } |
| 439 |
| 440 case doSlash: |
| 441 // Scanned a '/', which identifies a look-ahead break position in a rule
. |
| 442 n = pushNewNode(RBBINode::lookAhead); |
| 443 n->fVal = fRuleNum; |
| 444 n->fFirstPos = fScanIndex; |
| 445 n->fLastPos = fNextIndex; |
| 446 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); |
| 447 fLookAheadRule = TRUE; |
| 448 break; |
| 449 |
| 450 |
| 451 case doStartTagValue: |
| 452 // Scanned a '{', the opening delimiter for a tag value within a rule. |
| 453 n = pushNewNode(RBBINode::tag); |
| 454 n->fVal = 0; |
| 455 n->fFirstPos = fScanIndex; |
| 456 n->fLastPos = fNextIndex; |
| 457 break; |
| 458 |
| 459 case doTagDigit: |
| 460 // Just scanned a decimal digit that's part of a tag value |
| 461 { |
| 462 n = fNodeStack[fNodeStackPtr]; |
| 463 uint32_t v = u_charDigitValue(fC.fChar); |
| 464 U_ASSERT(v < 10); |
| 465 n->fVal = n->fVal*10 + v; |
| 466 break; |
| 467 } |
| 468 |
| 469 case doTagValue: |
| 470 n = fNodeStack[fNodeStackPtr]; |
| 471 n->fLastPos = fNextIndex; |
| 472 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); |
| 473 break; |
| 474 |
| 475 case doTagExpectedError: |
| 476 error(U_BRK_MALFORMED_RULE_TAG); |
| 477 returnVal = FALSE; |
| 478 break; |
| 479 |
| 480 case doOptionStart: |
| 481 // Scanning a !!option. At the start of string. |
| 482 fOptionStart = fScanIndex; |
| 483 break; |
| 484 |
| 485 case doOptionEnd: |
| 486 { |
| 487 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart
); |
| 488 if (opt == UNICODE_STRING("chain", 5)) { |
| 489 fRB->fChainRules = TRUE; |
| 490 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { |
| 491 fRB->fLBCMNoChain = TRUE; |
| 492 } else if (opt == UNICODE_STRING("forward", 7)) { |
| 493 fRB->fDefaultTree = &fRB->fForwardTree; |
| 494 } else if (opt == UNICODE_STRING("reverse", 7)) { |
| 495 fRB->fDefaultTree = &fRB->fReverseTree; |
| 496 } else if (opt == UNICODE_STRING("safe_forward", 12)) { |
| 497 fRB->fDefaultTree = &fRB->fSafeFwdTree; |
| 498 } else if (opt == UNICODE_STRING("safe_reverse", 12)) { |
| 499 fRB->fDefaultTree = &fRB->fSafeRevTree; |
| 500 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { |
| 501 fRB->fLookAheadHardBreak = TRUE; |
| 502 } else { |
| 503 error(U_BRK_UNRECOGNIZED_OPTION); |
| 504 } |
| 505 } |
| 506 break; |
| 507 |
| 508 case doReverseDir: |
| 509 fReverseRule = TRUE; |
| 510 break; |
| 511 |
| 512 case doStartVariableName: |
| 513 n = pushNewNode(RBBINode::varRef); |
| 514 if (U_FAILURE(*fRB->fStatus)) { |
| 515 break; |
| 516 } |
| 517 n->fFirstPos = fScanIndex; |
| 518 break; |
| 519 |
| 520 case doEndVariableName: |
| 521 n = fNodeStack[fNodeStackPtr]; |
| 522 if (n==NULL || n->fType != RBBINode::varRef) { |
| 523 error(U_BRK_INTERNAL_ERROR); |
| 524 break; |
| 525 } |
| 526 n->fLastPos = fScanIndex; |
| 527 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); |
| 528 // Look the newly scanned name up in the symbol table |
| 529 // If there's an entry, set the l. child of the var ref to the replace
ment expression. |
| 530 // (We also pass through here when scanning assignments, but no harm i
s done, other |
| 531 // than a slight wasted effort that seems hard to avoid. Lookup will
be null) |
| 532 n->fLeftChild = fSymbolTable->lookupNode(n->fText); |
| 533 break; |
| 534 |
| 535 case doCheckVarDef: |
| 536 n = fNodeStack[fNodeStackPtr]; |
| 537 if (n->fLeftChild == NULL) { |
| 538 error(U_BRK_UNDEFINED_VARIABLE); |
| 539 returnVal = FALSE; |
| 540 } |
| 541 break; |
| 542 |
| 543 case doExprFinished: |
| 544 break; |
| 545 |
| 546 case doRuleErrorAssignExpr: |
| 547 error(U_BRK_ASSIGN_ERROR); |
| 548 returnVal = FALSE; |
| 549 break; |
| 550 |
| 551 case doExit: |
| 552 returnVal = FALSE; |
| 553 break; |
| 554 |
| 555 case doScanUnicodeSet: |
| 556 scanSet(); |
| 557 break; |
| 558 |
| 559 default: |
| 560 error(U_BRK_INTERNAL_ERROR); |
| 561 returnVal = FALSE; |
| 562 break; |
| 563 } |
| 564 return returnVal; |
| 565 } |
| 566 |
| 567 |
| 568 |
| 569 |
| 570 //------------------------------------------------------------------------------ |
| 571 // |
| 572 // Error Report a rule parse error. |
| 573 // Only report it if no previous error has been recorded. |
| 574 // |
| 575 //------------------------------------------------------------------------------ |
| 576 void RBBIRuleScanner::error(UErrorCode e) { |
| 577 if (U_SUCCESS(*fRB->fStatus)) { |
| 578 *fRB->fStatus = e; |
| 579 if (fRB->fParseError) { |
| 580 fRB->fParseError->line = fLineNum; |
| 581 fRB->fParseError->offset = fCharNum; |
| 582 fRB->fParseError->preContext[0] = 0; |
| 583 fRB->fParseError->preContext[0] = 0; |
| 584 } |
| 585 } |
| 586 } |
| 587 |
| 588 |
| 589 |
| 590 |
| 591 //------------------------------------------------------------------------------ |
| 592 // |
| 593 // fixOpStack The parse stack holds partially assembled chunks of the parse t
ree. |
| 594 // An entry on the stack may be as small as a single setRef node, |
| 595 // or as large as the parse tree |
| 596 // for an entire expression (this will be the one item left on the
stack |
| 597 // when the parsing of an RBBI rule completes. |
| 598 // |
| 599 // This function is called when a binary operator is encountered. |
| 600 // It looks back up the stack for operators that are not yet assoc
iated |
| 601 // with a right operand, and if the precedence of the stacked oper
ator >= |
| 602 // the precedence of the current operator, binds the operand left, |
| 603 // to the previously encountered operator. |
| 604 // |
| 605 //------------------------------------------------------------------------------ |
| 606 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { |
| 607 RBBINode *n; |
| 608 // printNodeStack("entering fixOpStack()"); |
| 609 for (;;) { |
| 610 n = fNodeStack[fNodeStackPtr-1]; // an operator node |
| 611 if (n->fPrecedence == 0) { |
| 612 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); |
| 613 error(U_BRK_INTERNAL_ERROR); |
| 614 return; |
| 615 } |
| 616 |
| 617 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { |
| 618 // The most recent operand goes with the current operator, |
| 619 // not with the previously stacked one. |
| 620 break; |
| 621 } |
| 622 // Stack operator is a binary op ( '|' or concatenation) |
| 623 // TOS operand becomes right child of this operator. |
| 624 // Resulting subexpression becomes the TOS operand. |
| 625 n->fRightChild = fNodeStack[fNodeStackPtr]; |
| 626 fNodeStack[fNodeStackPtr]->fParent = n; |
| 627 fNodeStackPtr--; |
| 628 // printNodeStack("looping in fixOpStack() "); |
| 629 } |
| 630 |
| 631 if (p <= RBBINode::precLParen) { |
| 632 // Scan is at a right paren or end of expression. |
| 633 // The scanned item must match the stack, or else there was an error. |
| 634 // Discard the left paren (or start expr) node from the stack, |
| 635 // leaving the completed (sub)expression as TOS. |
| 636 if (n->fPrecedence != p) { |
| 637 // Right paren encountered matched start of expression node, or |
| 638 // end of expression matched with a left paren node. |
| 639 error(U_BRK_MISMATCHED_PAREN); |
| 640 } |
| 641 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; |
| 642 fNodeStackPtr--; |
| 643 // Delete the now-discarded LParen or Start node. |
| 644 delete n; |
| 645 } |
| 646 // printNodeStack("leaving fixOpStack()"); |
| 647 } |
| 648 |
| 649 |
| 650 |
| 651 |
| 652 //------------------------------------------------------------------------------ |
| 653 // |
| 654 // findSetFor given a UnicodeString, |
| 655 // - find the corresponding Unicode Set (uset node) |
| 656 // (create one if necessary) |
| 657 // - Set fLeftChild of the caller's node (should be a setRef no
de) |
| 658 // to the uset node |
| 659 // Maintain a hash table of uset nodes, so the same one is alway
s used |
| 660 // for the same string. |
| 661 // If a "to adopt" set is provided and we haven't seen this key
before, |
| 662 // add the provided set to the hash table. |
| 663 // If the string is one (32 bit) char in length, the set contain
s |
| 664 // just one element which is the char in question. |
| 665 // If the string is "any", return a set containing all chars. |
| 666 // |
| 667 //------------------------------------------------------------------------------ |
| 668 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, Unicode
Set *setToAdopt) { |
| 669 |
| 670 RBBISetTableEl *el; |
| 671 |
| 672 // First check whether we've already cached a set for this string. |
| 673 // If so, just use the cached set in the new node. |
| 674 // delete any set provided by the caller, since we own it. |
| 675 el = (RBBISetTableEl *)uhash_get(fSetTable, &s); |
| 676 if (el != NULL) { |
| 677 delete setToAdopt; |
| 678 node->fLeftChild = el->val; |
| 679 U_ASSERT(node->fLeftChild->fType == RBBINode::uset); |
| 680 return; |
| 681 } |
| 682 |
| 683 // Haven't seen this set before. |
| 684 // If the caller didn't provide us with a prebuilt set, |
| 685 // create a new UnicodeSet now. |
| 686 if (setToAdopt == NULL) { |
| 687 if (s.compare(kAny, -1) == 0) { |
| 688 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); |
| 689 } else { |
| 690 UChar32 c; |
| 691 c = s.char32At(0); |
| 692 setToAdopt = new UnicodeSet(c, c); |
| 693 } |
| 694 } |
| 695 |
| 696 // |
| 697 // Make a new uset node to refer to this UnicodeSet |
| 698 // This new uset node becomes the child of the caller's setReference node. |
| 699 // |
| 700 RBBINode *usetNode = new RBBINode(RBBINode::uset); |
| 701 if (usetNode == NULL) { |
| 702 error(U_MEMORY_ALLOCATION_ERROR); |
| 703 return; |
| 704 } |
| 705 usetNode->fInputSet = setToAdopt; |
| 706 usetNode->fParent = node; |
| 707 node->fLeftChild = usetNode; |
| 708 usetNode->fText = s; |
| 709 |
| 710 |
| 711 // |
| 712 // Add the new uset node to the list of all uset nodes. |
| 713 // |
| 714 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); |
| 715 |
| 716 |
| 717 // |
| 718 // Add the new set to the set hash table. |
| 719 // |
| 720 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); |
| 721 UnicodeString *tkey = new UnicodeString(s); |
| 722 if (tkey == NULL || el == NULL || setToAdopt == NULL) { |
| 723 // Delete to avoid memory leak |
| 724 delete tkey; |
| 725 tkey = NULL; |
| 726 uprv_free(el); |
| 727 el = NULL; |
| 728 delete setToAdopt; |
| 729 setToAdopt = NULL; |
| 730 |
| 731 error(U_MEMORY_ALLOCATION_ERROR); |
| 732 return; |
| 733 } |
| 734 el->key = tkey; |
| 735 el->val = usetNode; |
| 736 uhash_put(fSetTable, el->key, el, fRB->fStatus); |
| 737 |
| 738 return; |
| 739 } |
| 740 |
| 741 |
| 742 |
| 743 // |
| 744 // Assorted Unicode character constants. |
| 745 // Numeric because there is no portable way to enter them as literals. |
| 746 // (Think EBCDIC). |
| 747 // |
| 748 static const UChar chCR = 0x0d; // New lines, for terminating c
omments. |
| 749 static const UChar chLF = 0x0a; |
| 750 static const UChar chNEL = 0x85; // NEL newline variant |
| 751 static const UChar chLS = 0x2028; // Unicode Line Separator |
| 752 static const UChar chApos = 0x27; // single quote, for quoted ch
ars. |
| 753 static const UChar chPound = 0x23; // '#', introduces a comment. |
| 754 static const UChar chBackSlash = 0x5c; // '\' introduces a char escap
e |
| 755 static const UChar chLParen = 0x28; |
| 756 static const UChar chRParen = 0x29; |
| 757 |
| 758 |
| 759 //------------------------------------------------------------------------------ |
| 760 // |
| 761 // stripRules Return a rules string without unnecessary |
| 762 // characters. |
| 763 // |
| 764 //------------------------------------------------------------------------------ |
| 765 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { |
| 766 UnicodeString strippedRules; |
| 767 int rulesLength = rules.length(); |
| 768 for (int idx = 0; idx < rulesLength; ) { |
| 769 UChar ch = rules[idx++]; |
| 770 if (ch == chPound) { |
| 771 while (idx < rulesLength |
| 772 && ch != chCR && ch != chLF && ch != chNEL) |
| 773 { |
| 774 ch = rules[idx++]; |
| 775 } |
| 776 } |
| 777 if (!u_isISOControl(ch)) { |
| 778 strippedRules.append(ch); |
| 779 } |
| 780 } |
| 781 // strippedRules = strippedRules.unescape(); |
| 782 return strippedRules; |
| 783 } |
| 784 |
| 785 |
| 786 //------------------------------------------------------------------------------ |
| 787 // |
| 788 // nextCharLL Low Level Next Char from rule input source. |
| 789 // Get a char from the input character iterator, |
| 790 // keep track of input position for error reporting. |
| 791 // |
| 792 //------------------------------------------------------------------------------ |
| 793 UChar32 RBBIRuleScanner::nextCharLL() { |
| 794 UChar32 ch; |
| 795 |
| 796 if (fNextIndex >= fRB->fRules.length()) { |
| 797 return (UChar32)-1; |
| 798 } |
| 799 ch = fRB->fRules.char32At(fNextIndex); |
| 800 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); |
| 801 |
| 802 if (ch == chCR || |
| 803 ch == chNEL || |
| 804 ch == chLS || |
| 805 (ch == chLF && fLastChar != chCR)) { |
| 806 // Character is starting a new line. Bump up the line number, and |
| 807 // reset the column to 0. |
| 808 fLineNum++; |
| 809 fCharNum=0; |
| 810 if (fQuoteMode) { |
| 811 error(U_BRK_NEW_LINE_IN_QUOTED_STRING); |
| 812 fQuoteMode = FALSE; |
| 813 } |
| 814 } |
| 815 else { |
| 816 // Character is not starting a new line. Except in the case of a |
| 817 // LF following a CR, increment the column position. |
| 818 if (ch != chLF) { |
| 819 fCharNum++; |
| 820 } |
| 821 } |
| 822 fLastChar = ch; |
| 823 return ch; |
| 824 } |
| 825 |
| 826 |
| 827 //------------------------------------------------------------------------------ |
| 828 // |
| 829 // nextChar for rules scanning. At this level, we handle stripping |
| 830 // out comments and processing backslash character escapes. |
| 831 // The rest of the rules grammar is handled at the next level up. |
| 832 // |
| 833 //------------------------------------------------------------------------------ |
| 834 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { |
| 835 |
| 836 // Unicode Character constants needed for the processing done by nextChar(), |
| 837 // in hex because literals wont work on EBCDIC machines. |
| 838 |
| 839 fScanIndex = fNextIndex; |
| 840 c.fChar = nextCharLL(); |
| 841 c.fEscaped = FALSE; |
| 842 |
| 843 // |
| 844 // check for '' sequence. |
| 845 // These are recognized in all contexts, whether in quoted text or not. |
| 846 // |
| 847 if (c.fChar == chApos) { |
| 848 if (fRB->fRules.char32At(fNextIndex) == chApos) { |
| 849 c.fChar = nextCharLL(); // get nextChar officially so char
acter counts |
| 850 c.fEscaped = TRUE; // stay correct. |
| 851 } |
| 852 else |
| 853 { |
| 854 // Single quote, by itself. |
| 855 // Toggle quoting mode. |
| 856 // Return either '(' or ')', because quotes cause a grouping of t
he quoted text. |
| 857 fQuoteMode = !fQuoteMode; |
| 858 if (fQuoteMode == TRUE) { |
| 859 c.fChar = chLParen; |
| 860 } else { |
| 861 c.fChar = chRParen; |
| 862 } |
| 863 c.fEscaped = FALSE; // The paren that we return is not escaped. |
| 864 return; |
| 865 } |
| 866 } |
| 867 |
| 868 if (fQuoteMode) { |
| 869 c.fEscaped = TRUE; |
| 870 } |
| 871 else |
| 872 { |
| 873 // We are not in a 'quoted region' of the source. |
| 874 // |
| 875 if (c.fChar == chPound) { |
| 876 // Start of a comment. Consume the rest of it. |
| 877 // The new-line char that terminates the comment is always returned
. |
| 878 // It will be treated as white-space, and serves to break up anythi
ng |
| 879 // that might otherwise incorrectly clump together with a comment
in |
| 880 // the middle (a variable name, for example.) |
| 881 for (;;) { |
| 882 c.fChar = nextCharLL(); |
| 883 if (c.fChar == (UChar32)-1 || // EOF |
| 884 c.fChar == chCR || |
| 885 c.fChar == chLF || |
| 886 c.fChar == chNEL || |
| 887 c.fChar == chLS) {break;} |
| 888 } |
| 889 } |
| 890 if (c.fChar == (UChar32)-1) { |
| 891 return; |
| 892 } |
| 893 |
| 894 // |
| 895 // check for backslash escaped characters. |
| 896 // Use UnicodeString::unescapeAt() to handle them. |
| 897 // |
| 898 if (c.fChar == chBackSlash) { |
| 899 c.fEscaped = TRUE; |
| 900 int32_t startX = fNextIndex; |
| 901 c.fChar = fRB->fRules.unescapeAt(fNextIndex); |
| 902 if (fNextIndex == startX) { |
| 903 error(U_BRK_HEX_DIGITS_EXPECTED); |
| 904 } |
| 905 fCharNum += fNextIndex-startX; |
| 906 } |
| 907 } |
| 908 // putc(c.fChar, stdout); |
| 909 } |
| 910 |
| 911 //------------------------------------------------------------------------------ |
| 912 // |
| 913 // Parse RBBI rules. The state machine for rules parsing is here. |
| 914 // The state tables are hand-written in the file rbbirpt.tx
t, |
| 915 // and converted to the form used here by a perl |
| 916 // script rbbicst.pl |
| 917 // |
| 918 //------------------------------------------------------------------------------ |
| 919 void RBBIRuleScanner::parse() { |
| 920 uint16_t state; |
| 921 const RBBIRuleTableEl *tableEl; |
| 922 |
| 923 if (U_FAILURE(*fRB->fStatus)) { |
| 924 return; |
| 925 } |
| 926 |
| 927 state = 1; |
| 928 nextChar(fC); |
| 929 // |
| 930 // Main loop for the rule parsing state machine. |
| 931 // Runs once per state transition. |
| 932 // Each time through optionally performs, depending on the state table, |
| 933 // - an advance to the the next input char |
| 934 // - an action to be performed. |
| 935 // - pushing or popping a state to/from the local state return stack. |
| 936 // |
| 937 for (;;) { |
| 938 // Bail out if anything has gone wrong. |
| 939 // RBBI rule file parsing stops on the first error encountered. |
| 940 if (U_FAILURE(*fRB->fStatus)) { |
| 941 break; |
| 942 } |
| 943 |
| 944 // Quit if state == 0. This is the normal way to exit the state machine
. |
| 945 // |
| 946 if (state == 0) { |
| 947 break; |
| 948 } |
| 949 |
| 950 // Find the state table element that matches the input char from the rul
e, or the |
| 951 // class of the input character. Start with the first table row for
this |
| 952 // state, then linearly scan forward until we find a row that matches
the |
| 953 // character. The last row for each state always matches all charact
ers, so |
| 954 // the search will stop there, if not before. |
| 955 // |
| 956 tableEl = &gRuleParseStateTable[state]; |
| 957 #ifdef RBBI_DEBUG |
| 958 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { |
| 959 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s
", |
| 960 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); |
| 961 } |
| 962 #endif |
| 963 |
| 964 for (;;) { |
| 965 #ifdef RBBI_DEBUG |
| 966 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBB
IDebugPrintf(".");} |
| 967 #endif |
| 968 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->
fCharClass == fC.fChar) { |
| 969 // Table row specified an individual character, not a set, and |
| 970 // the input character is not escaped, and |
| 971 // the input character matched it. |
| 972 break; |
| 973 } |
| 974 if (tableEl->fCharClass == 255) { |
| 975 // Table row specified default, match anything character class. |
| 976 break; |
| 977 } |
| 978 if (tableEl->fCharClass == 254 && fC.fEscaped) { |
| 979 // Table row specified "escaped" and the char was escaped. |
| 980 break; |
| 981 } |
| 982 if (tableEl->fCharClass == 253 && fC.fEscaped && |
| 983 (fC.fChar == 0x50 || fC.fChar == 0x70 )) { |
| 984 // Table row specified "escaped P" and the char is either 'p' or
'P'. |
| 985 break; |
| 986 } |
| 987 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { |
| 988 // Table row specified eof and we hit eof on the input. |
| 989 break; |
| 990 } |
| 991 |
| 992 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && //
Table specs a char class && |
| 993 fC.fEscaped == FALSE && //
char is not escaped && |
| 994 fC.fChar != (UChar32)-1) { //
char is not EOF |
| 995 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { |
| 996 // Table row specified a character class, or set of characte
rs, |
| 997 // and the current char matches it. |
| 998 break; |
| 999 } |
| 1000 } |
| 1001 |
| 1002 // No match on this row, advance to the next row for this state, |
| 1003 tableEl++; |
| 1004 } |
| 1005 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPu
ts("");} |
| 1006 |
| 1007 // |
| 1008 // We've found the row of the state table that matches the current input |
| 1009 // character from the rules string. |
| 1010 // Perform any action specified by this row in the state table. |
| 1011 if (doParseActions((int32_t)tableEl->fAction) == FALSE) { |
| 1012 // Break out of the state machine loop if the |
| 1013 // the action signalled some kind of error, or |
| 1014 // the action was to exit, occurs on normal end-of-rules-input. |
| 1015 break; |
| 1016 } |
| 1017 |
| 1018 if (tableEl->fPushState != 0) { |
| 1019 fStackPtr++; |
| 1020 if (fStackPtr >= kStackSize) { |
| 1021 error(U_BRK_INTERNAL_ERROR); |
| 1022 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."
); |
| 1023 fStackPtr--; |
| 1024 } |
| 1025 fStack[fStackPtr] = tableEl->fPushState; |
| 1026 } |
| 1027 |
| 1028 if (tableEl->fNextChar) { |
| 1029 nextChar(fC); |
| 1030 } |
| 1031 |
| 1032 // Get the next state from the table entry, or from the |
| 1033 // state stack if the next state was specified as "pop". |
| 1034 if (tableEl->fNextState != 255) { |
| 1035 state = tableEl->fNextState; |
| 1036 } else { |
| 1037 state = fStack[fStackPtr]; |
| 1038 fStackPtr--; |
| 1039 if (fStackPtr < 0) { |
| 1040 error(U_BRK_INTERNAL_ERROR); |
| 1041 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.
"); |
| 1042 fStackPtr++; |
| 1043 } |
| 1044 } |
| 1045 |
| 1046 } |
| 1047 |
| 1048 // |
| 1049 // If there were NO user specified reverse rules, set up the equivalent of "
.*;" |
| 1050 // |
| 1051 if (fRB->fReverseTree == NULL) { |
| 1052 fRB->fReverseTree = pushNewNode(RBBINode::opStar); |
| 1053 RBBINode *operand = pushNewNode(RBBINode::setRef); |
| 1054 findSetFor(kAny, operand); |
| 1055 fRB->fReverseTree->fLeftChild = operand; |
| 1056 operand->fParent = fRB->fReverseTree; |
| 1057 fNodeStackPtr -= 2; |
| 1058 } |
| 1059 |
| 1060 |
| 1061 // |
| 1062 // Parsing of the input RBBI rules is complete. |
| 1063 // We now have a parse tree for the rule expressions |
| 1064 // and a list of all UnicodeSets that are referenced. |
| 1065 // |
| 1066 #ifdef RBBI_DEBUG |
| 1067 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable-
>rbbiSymtablePrint();} |
| 1068 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) |
| 1069 { |
| 1070 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); |
| 1071 fRB->fForwardTree->printTree(TRUE); |
| 1072 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); |
| 1073 fRB->fReverseTree->printTree(TRUE); |
| 1074 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); |
| 1075 fRB->fSafeFwdTree->printTree(TRUE); |
| 1076 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); |
| 1077 fRB->fSafeRevTree->printTree(TRUE); |
| 1078 } |
| 1079 #endif |
| 1080 } |
| 1081 |
| 1082 |
| 1083 //------------------------------------------------------------------------------ |
| 1084 // |
| 1085 // printNodeStack for debugging... |
| 1086 // |
| 1087 //------------------------------------------------------------------------------ |
| 1088 #ifdef RBBI_DEBUG |
| 1089 void RBBIRuleScanner::printNodeStack(const char *title) { |
| 1090 int i; |
| 1091 RBBIDebugPrintf("%s. Dumping node stack...\n", title); |
| 1092 for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);} |
| 1093 } |
| 1094 #endif |
| 1095 |
| 1096 |
| 1097 |
| 1098 |
| 1099 //------------------------------------------------------------------------------ |
| 1100 // |
| 1101 // pushNewNode create a new RBBINode of the specified type and push it |
| 1102 // onto the stack of nodes. |
| 1103 // |
| 1104 //------------------------------------------------------------------------------ |
| 1105 RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { |
| 1106 fNodeStackPtr++; |
| 1107 if (fNodeStackPtr >= kStackSize) { |
| 1108 error(U_BRK_INTERNAL_ERROR); |
| 1109 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); |
| 1110 *fRB->fStatus = U_BRK_INTERNAL_ERROR; |
| 1111 return NULL; |
| 1112 } |
| 1113 fNodeStack[fNodeStackPtr] = new RBBINode(t); |
| 1114 if (fNodeStack[fNodeStackPtr] == NULL) { |
| 1115 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; |
| 1116 } |
| 1117 return fNodeStack[fNodeStackPtr]; |
| 1118 } |
| 1119 |
| 1120 |
| 1121 |
| 1122 //------------------------------------------------------------------------------ |
| 1123 // |
| 1124 // scanSet Construct a UnicodeSet from the text at the current scan |
| 1125 // position. Advance the scan position to the first character |
| 1126 // after the set. |
| 1127 // |
| 1128 // A new RBBI setref node referring to the set is pushed onto the no
de |
| 1129 // stack. |
| 1130 // |
| 1131 // The scan position is normally under the control of the state mach
ine |
| 1132 // that controls rule parsing. UnicodeSets, however, are parsed by |
| 1133 // the UnicodeSet constructor, not by the RBBI rule parser. |
| 1134 // |
| 1135 //------------------------------------------------------------------------------ |
| 1136 void RBBIRuleScanner::scanSet() { |
| 1137 UnicodeSet *uset; |
| 1138 ParsePosition pos; |
| 1139 int startPos; |
| 1140 int i; |
| 1141 |
| 1142 if (U_FAILURE(*fRB->fStatus)) { |
| 1143 return; |
| 1144 } |
| 1145 |
| 1146 pos.setIndex(fScanIndex); |
| 1147 startPos = fScanIndex; |
| 1148 UErrorCode localStatus = U_ZERO_ERROR; |
| 1149 uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE, |
| 1150 fSymbolTable, |
| 1151 localStatus); |
| 1152 if (uset == NULL) { |
| 1153 localStatus = U_MEMORY_ALLOCATION_ERROR; |
| 1154 } |
| 1155 if (U_FAILURE(localStatus)) { |
| 1156 // TODO: Get more accurate position of the error from UnicodeSet's ret
urn info. |
| 1157 // UnicodeSet appears to not be reporting correctly at this time
. |
| 1158 #ifdef RBBI_DEBUG |
| 1159 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.ge
tIndex()); |
| 1160 #endif |
| 1161 error(localStatus); |
| 1162 delete uset; |
| 1163 return; |
| 1164 } |
| 1165 |
| 1166 // Verify that the set contains at least one code point. |
| 1167 // |
| 1168 if (uset->isEmpty()) { |
| 1169 // This set is empty. |
| 1170 // Make it an error, because it almost certainly is not what the user w
anted. |
| 1171 // Also, avoids having to think about corner cases in the tree manipula
tion code |
| 1172 // that occurs later on. |
| 1173 error(U_BRK_RULE_EMPTY_SET); |
| 1174 delete uset; |
| 1175 return; |
| 1176 } |
| 1177 |
| 1178 |
| 1179 // Advance the RBBI parse postion over the UnicodeSet pattern. |
| 1180 // Don't just set fScanIndex because the line/char positions maintained |
| 1181 // for error reporting would be thrown off. |
| 1182 i = pos.getIndex(); |
| 1183 for (;;) { |
| 1184 if (fNextIndex >= i) { |
| 1185 break; |
| 1186 } |
| 1187 nextCharLL(); |
| 1188 } |
| 1189 |
| 1190 if (U_SUCCESS(*fRB->fStatus)) { |
| 1191 RBBINode *n; |
| 1192 |
| 1193 n = pushNewNode(RBBINode::setRef); |
| 1194 n->fFirstPos = startPos; |
| 1195 n->fLastPos = fNextIndex; |
| 1196 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); |
| 1197 // findSetFor() serves several purposes here: |
| 1198 // - Adopts storage for the UnicodeSet, will be responsible for dele
ting. |
| 1199 // - Mantains collection of all sets in use, needed later for establ
ishing |
| 1200 // character categories for run time engine. |
| 1201 // - Eliminates mulitiple instances of the same set. |
| 1202 // - Creates a new uset node if necessary (if this isn't a duplicate
.) |
| 1203 findSetFor(n->fText, n, uset); |
| 1204 } |
| 1205 |
| 1206 } |
| 1207 |
| 1208 U_NAMESPACE_END |
| 1209 |
| 1210 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |