Index: icu46/source/common/rbbirpt.txt |
=================================================================== |
--- icu46/source/common/rbbirpt.txt (revision 0) |
+++ icu46/source/common/rbbirpt.txt (revision 0) |
@@ -0,0 +1,315 @@ |
+ |
+#***************************************************************************** |
+# |
+# Copyright (C) 2002-2003, International Business Machines Corporation and others. |
+# All Rights Reserved. |
+# |
+#***************************************************************************** |
+# |
+# file: rbbirpt.txt |
+# ICU Break Iterator Rule Parser State Table |
+# |
+# This state table is used when reading and parsing a set of RBBI rules. |
+# The rule parser uses a state machine; the data in this file define the |
+# state transitions that occur for each input character. |
+# |
+# *** This file defines the RBBI rule grammar. This is it. |
+# *** The determination of what is accepted is here. |
+# |
+# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays |
+# that are then built with the rule parser. |
+# |
+ |
+# |
+# Here is the syntax of the state definitions in this file: |
+# |
+# |
+#StateName: |
+# input-char n next-state ^push-state action |
+# input-char n next-state ^push-state action |
+# | | | | | |
+# | | | | |--- action to be performed by state machine |
+# | | | | See function RBBIRuleScanner::doParseActions() |
+# | | | | |
+# | | | |--- Push this named state onto the state stack. |
+# | | | Later, when next state is specified as "pop", |
+# | | | the pushed state will become the current state. |
+# | | | |
+# | | |--- Transition to this state if the current input character matches the input |
+# | | character or char class in the left hand column. "pop" causes the next |
+# | | state to be popped from the state stack. |
+# | | |
+# | |--- When making the state transition specified on this line, advance to the next |
+# | character from the input only if 'n' appears here. |
+# | |
+# |--- Character or named character classes to test for. If the current character being scanned |
+# matches, perform the actions and go to the state specified on this line. |
+# The input character is tested sequentially, in the order written. The characters and |
+# character classes tested for do not need to be mutually exclusive. The first match wins. |
+# |
+ |
+ |
+ |
+ |
+# |
+# start state, scan position is at the beginning of the rules file, or in between two rules. |
+# |
+start: |
+ escaped term ^break-rule-end doExprStart |
+ white_space n start |
+ '$' scan-var-name ^assign-or-rule doExprStart |
+ '!' n rev-option |
+ ';' n start # ignore empty rules. |
+ eof exit |
+ default term ^break-rule-end doExprStart |
+ |
+# |
+# break-rule-end: Returned from doing a break-rule expression. |
+# |
+break-rule-end: |
+ ';' n start doEndOfRule |
+ white_space n break-rule-end |
+ default errorDeath doRuleError |
+ |
+ |
+# |
+# ! We've just scanned a '!', indicating either a !!key word flag or a |
+# !Reverse rule. |
+# |
+rev-option: |
+ '!' n option-scan1 |
+ default reverse-rule doReverseDir # no push here: reverse-rule pushes break-rule-end itself, a second push was never popped |
+ |
+option-scan1: |
+ name_start_char n option-scan2 doOptionStart # reached after "!!": first character of the option keyword |
+ default errorDeath doRuleError # "!!" was not followed by a name start character |
+ |
+option-scan2: |
+ name_char n option-scan2 # consume the rest of the keyword |
+ default option-scan3 doOptionEnd # first non-name character ends the keyword; it is not consumed here |
+ |
+option-scan3: |
+ ';' n start # option statement complete, back to scanning for the next rule |
+ white_space n option-scan3 # white space is allowed between the keyword and the ';' |
+ default errorDeath doRuleError |
+ |
+ |
+reverse-rule: |
+ default term ^break-rule-end doExprStart |
+ |
+ |
+# |
+# term. Eat through a single rule character, or a composite thing, which |
+# could be a parenthesized expression, a variable name, or a Unicode Set. |
+# |
+term: |
+ escaped n expr-mod doRuleChar |
+ white_space n term |
+ rule_char n expr-mod doRuleChar |
+ '[' scan-unicode-set ^expr-mod |
+ '(' n term ^expr-mod doLParen |
+ '$' scan-var-name ^term-var-ref |
+ '.' n expr-mod doDotAny |
+ default errorDeath doRuleError |
+ |
+ |
+ |
+# |
+# term-var-ref We've just finished scanning a reference to a $variable. |
+# Check that the variable was defined. |
+# The variable name scanning is in common with assignment statements, |
+# so the check can't be done there. |
+term-var-ref: |
+ default expr-mod doCheckVarDef |
+ |
+ |
+# |
+# expr-mod We've just finished scanning a term, now look for the optional |
+# trailing '*', '?', '+' |
+# |
+expr-mod: |
+ white_space n expr-mod |
+ '*' n expr-cont doUnaryOpStar |
+ '+' n expr-cont doUnaryOpPlus |
+ '?' n expr-cont doUnaryOpQuestion |
+ default expr-cont |
+ |
+ |
+# |
+# expr-cont Expression, continuation. At a point where additional terms are |
+# allowed, but not required. |
+# |
+expr-cont: |
+ escaped term doExprCatOperator |
+ white_space n expr-cont |
+ rule_char term doExprCatOperator |
+ '[' term doExprCatOperator |
+ '(' term doExprCatOperator |
+ '$' term doExprCatOperator |
+ '.' term doExprCatOperator |
+ '/' look-ahead doExprCatOperator |
+ '{' n tag-open doExprCatOperator |
+ '|' n term doExprOrOperator |
+ ')' n pop doExprRParen |
+ default pop doExprFinished |
+ |
+ |
+# |
+# look-ahead Scanning a '/', which identifies a break point, assuming that the |
+# remainder of the expression matches. |
+# |
+# Generate a parse tree as if this was a special kind of input symbol |
+# appearing in an otherwise normal concatenation expression. |
+# |
+look-ahead: |
+ '/' n expr-cont-no-slash doSlash |
+ default errorDeath |
+ |
+ |
+# |
+# expr-cont-no-slash Expression, continuation. At a point where additional terms are |
+# allowed, but not required. Just like |
+# expr-cont, above, except that no '/' |
+# look-ahead symbol is permitted. |
+# |
+expr-cont-no-slash: |
+ escaped term doExprCatOperator |
+ white_space n expr-cont-no-slash # stay in this state so that '/' is still not permitted after white space |
+ rule_char term doExprCatOperator |
+ '[' term doExprCatOperator |
+ '(' term doExprCatOperator |
+ '$' term doExprCatOperator |
+ '.' term doExprCatOperator |
+ '|' n term doExprOrOperator |
+ ')' n pop doExprRParen |
+ default pop doExprFinished |
+ |
+ |
+# |
+# tags scanning a '{', the opening delimiter for a tag that identifies |
+# the kind of match. Scan the whole {dddd} tag, where d=digit |
+# |
+tag-open: |
+ white_space n tag-open # skip white space between the '{' and the first digit |
+ digit_char tag-value doStartTagValue # the first digit is not consumed here; tag-value consumes it |
+ default errorDeath doTagExpectedError |
+ |
+tag-value: |
+ white_space n tag-close # white space ends the digits; tag-close then accepts only white space or '}' |
+ '}' tag-close # the '}' is left for tag-close to consume |
+ digit_char n tag-value doTagDigit |
+ default errorDeath doTagExpectedError |
+ |
+tag-close: |
+ white_space n tag-close |
+ '}' n expr-cont-no-tag doTagValue # tag complete; continue in the state that does not accept another '{' |
+ default errorDeath doTagExpectedError |
+ |
+ |
+ |
+# |
+# expr-cont-no-tag Expression, continuation. At a point where additional terms are |
+# allowed, but not required. Just like |
+# expr-cont, above, except that no "{ddd}" |
+# tagging is permitted. |
+# |
+expr-cont-no-tag: |
+ escaped term doExprCatOperator |
+ white_space n expr-cont-no-tag |
+ rule_char term doExprCatOperator |
+ '[' term doExprCatOperator |
+ '(' term doExprCatOperator |
+ '$' term doExprCatOperator |
+ '.' term doExprCatOperator |
+ '/' look-ahead doExprCatOperator |
+ '|' n term doExprOrOperator |
+ ')' n pop doExprRParen |
+ default pop doExprFinished |
+ |
+ |
+ |
+ |
+# |
+# Variable Name Scanning. |
+# |
+# The state that branched to here must have pushed a return state |
+# to go to after completion of the variable name scanning. |
+# |
+# The current input character must be the $ that introduces the name. |
+# The $ is consumed here rather than in the state that first detected it |
+# so that the doStartVariableName action only needs to happen in one |
+# place (here), and the other states don't need to worry about it. |
+# |
+scan-var-name: |
+ '$' n scan-var-start doStartVariableName |
+ default errorDeath |
+ |
+ |
+scan-var-start: |
+ name_start_char n scan-var-body # first character of the name, following the '$' consumed by scan-var-name |
+ default errorDeath doVariableNameExpectedErr |
+ |
+scan-var-body: |
+ name_char n scan-var-body |
+ default pop doEndVariableName # first non-name character ends the name; it is not consumed, and the pushed state resumes |
+ |
+ |
+ |
+# |
+# scan-unicode-set Unicode Sets are parsed by the UnicodeSet class. |
+# Within the RBBI parser, after finding the first character |
+# of a Unicode Set, we just hand the rule input at that |
+# point off to the Unicode Set constructor, then pick |
+# up parsing after the close of the set. |
+# |
+# The action for this state invokes the UnicodeSet parser. |
+# |
+scan-unicode-set: |
+ '[' n pop doScanUnicodeSet |
+ 'p' n pop doScanUnicodeSet |
+ 'P' n pop doScanUnicodeSet |
+ default errorDeath |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+# |
+# assign-or-rule. A $variable was encountered at the start of something, could be |
+# either an assignment statement or a rule, depending on whether an '=' |
+# follows the variable name. We get to this state when the variable name |
+# scanning does a return. |
+# |
+assign-or-rule: |
+ white_space n assign-or-rule |
+ '=' n term ^assign-end doStartAssign # variable was target of assignment |
+ default term-var-ref ^break-rule-end # variable was a term in a rule |
+ |
+ |
+ |
+# |
+# assign-end This state is entered when the end of the expression on the |
+# right hand side of an assignment is found. We get here via |
+# a pop; this state is pushed when the '=' in an assignment is found. |
+# |
+# The only thing allowed at this point is a ';'. The RHS of an |
+# assignment must look like a rule expression, and we come here |
+# when what is being scanned no longer looks like an expression. |
+# |
+assign-end: |
+ ';' n start doEndAssign |
+ default errorDeath doRuleErrorAssignExpr |
+ |
+ |
+ |
+# |
+# errorDeath. This state is specified as the next state whenever a syntax error |
+# in the source rules is detected. Barring bugs, the state machine will never |
+# actually get here, but will stop because of the action associated with the error. |
+# But, just in case, this state asks the state machine to exit. |
+errorDeath: |
+ default n errorDeath doExit |
+ |
+ |
Property changes on: icu46/source/common/rbbirpt.txt |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |