Index: icu46/source/i18n/regexcst.txt |
=================================================================== |
--- icu46/source/i18n/regexcst.txt (revision 0) |
+++ icu46/source/i18n/regexcst.txt (revision 0) |
@@ -0,0 +1,467 @@ |
+ |
+#***************************************************************************** |
+# |
+# Copyright (C) 2002-2007, International Business Machines Corporation and others. |
+# All Rights Reserved. |
+# |
+#***************************************************************************** |
+# |
+# file: regexcst.txt |
+# ICU Regular Expression Parser State Table |
+# |
+# This state table is used when reading and parsing a regular expression pattern. |
+# The pattern parser uses a state machine; the data in this file define the |
+# state transitions that occur for each input character. |
+# |
+# *** This file defines the regex pattern grammar. This is it. |
+# *** The determination of what is accepted is here. |
+# |
+# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays |
+# that are then built with the pattern parser. |
+# |
+ |
+# |
+# Here is the syntax of the state definitions in this file: |
+# |
+# |
+#StateName: |
+# input-char n next-state ^push-state action |
+# input-char n next-state ^push-state action |
+# | | | | | |
+# | | | | |--- action to be performed by state machine |
+# | | | | See function RegexCompile::doParseActions() |
+# | | | | |
+# | | | |--- Push this named state onto the state stack. |
+# | | | Later, when next state is specified as "pop", |
+# | | | the pushed state will become the current state. |
+# | | | |
+# | | |--- Transition to this state if the current input character matches the input |
+# | | character or char class in the left hand column. "pop" causes the next |
+# | | state to be popped from the state stack. |
+# | | |
+# | |--- When making the state transition specified on this line, advance to the next |
+# | character from the input only if 'n' appears here. |
+# | |
+# |--- Character or named character classes to test for. If the current character being scanned |
+# matches, perform the actions and go to the state specified on this line. |
+# The input character is tested sequentially, in the order written. The characters and |
+# character classes tested for do not need to be mutually exclusive. The first match wins. |
+# |
+ |
+ |
+ |
+ |
+# |
+# start state, scan position is at the beginning of the pattern. |
+# |
+start: |
+ default term doPatStart |
+ |
+ |
+ |
+ |
+# |
+# term. At a position where we can accept the start of most items in a pattern. |
+# |
+term: |
+ quoted n expr-quant doLiteralChar |
+ rule_char n expr-quant doLiteralChar |
+ '[' n set-open ^set-finish doSetBegin |
+ '(' n open-paren |
+ '.' n expr-quant doDotAny |
+ '^' n expr-quant doCaret |
+ '$' n expr-quant doDollar |
+ '\' n backslash |
+ '|' n term doOrOperator |
+ ')' n pop doCloseParen |
+ eof term doPatFinish |
+ default errorDeath doRuleError |
+ |
+ |
+ |
+# |
+# expr-quant We've just finished scanning a term, now look for the optional |
+# trailing quantifier - *, +, ?, *?, etc. |
+# |
+expr-quant: |
+ '*' n quant-star |
+ '+' n quant-plus |
+ '?' n quant-opt |
+ '{' n interval-open doIntervalInit |
+ '(' n open-paren-quant |
+ default expr-cont |
+ |
+ |
+# |
+# expr-cont Expression, continuation. At a point where additional terms are |
+# allowed, but not required. No Quantifiers |
+# |
+expr-cont: |
+ '|' n term doOrOperator |
+ ')' n pop doCloseParen |
+ default term |
+ |
+ |
+# |
+# open-paren-quant Special case handling for comments appearing before a quantifier, |
+# e.g. x(?#comment )* |
+# Open parens from expr-quant come here; anything but a (?# comment |
+# branches into the normal parenthesis sequence as quickly as possible. |
+# |
+open-paren-quant: |
+ '?' n open-paren-quant2 doSuppressComments |
+ default open-paren |
+ |
+open-paren-quant2: |
+ '#' n paren-comment ^expr-quant |
+ default open-paren-extended |
+ |
+ |
+# |
+# open-paren We've got an open paren. We need to scan further to |
+# determine what kind of group it is - plain (, (?:, (?>, or whatever. |
+# |
+open-paren: |
+ '?' n open-paren-extended doSuppressComments |
+ default term ^expr-quant doOpenCaptureParen |
+ |
+open-paren-extended: |
+ ':' n term ^expr-quant doOpenNonCaptureParen # (?: |
+ '>' n term ^expr-quant doOpenAtomicParen # (?> |
+ '=' n term ^expr-cont doOpenLookAhead # (?= |
+ '!' n term ^expr-cont doOpenLookAheadNeg # (?! |
+ '<' n open-paren-lookbehind |
+ '#' n paren-comment ^term |
+ 'i' paren-flag doBeginMatchMode |
+ 'd' paren-flag doBeginMatchMode |
+ 'm' paren-flag doBeginMatchMode |
+ 's' paren-flag doBeginMatchMode |
+ 'u' paren-flag doBeginMatchMode |
+ 'w' paren-flag doBeginMatchMode |
+ 'x' paren-flag doBeginMatchMode |
+ '-' paren-flag doBeginMatchMode |
+ '(' n errorDeath doConditionalExpr |
+ '{' n errorDeath doPerlInline |
+ default errorDeath doBadOpenParenType |
+ |
+open-paren-lookbehind: |
+ '=' n term ^expr-cont doOpenLookBehind # (?<= |
+ '!' n term ^expr-cont doOpenLookBehindNeg # (?<! |
+ default errorDeath doBadOpenParenType |
+ |
+ |
+# |
+# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' |
+# |
+paren-comment: |
+ ')' n pop |
+ eof errorDeath doMismatchedParenErr |
+ default n paren-comment |
+ |
+# |
+# paren-flag Scanned a (?ismx-ismx flag setting |
+# |
+paren-flag: |
+ 'i' n paren-flag doMatchMode |
+ 'd' n paren-flag doMatchMode |
+ 'm' n paren-flag doMatchMode |
+ 's' n paren-flag doMatchMode |
+ 'u' n paren-flag doMatchMode |
+ 'w' n paren-flag doMatchMode |
+ 'x' n paren-flag doMatchMode |
+ '-' n paren-flag doMatchMode |
+ ')' n term doSetMatchMode |
+ ':' n term ^expr-quant doMatchModeParen |
+ default errorDeath doBadModeFlag |
+ |
+ |
+# |
+# quant-star Scanning a '*' quantifier. Need to look ahead to decide |
+# between plain '*', '*?', '*+' |
+# |
+quant-star: |
+ '?' n expr-cont doNGStar # *? |
+ '+' n expr-cont doPossessiveStar # *+ |
+ default expr-cont doStar |
+ |
+ |
+# |
+# quant-plus Scanning a '+' quantifier. Need to look ahead to decide |
+# between plain '+', '+?', '++' |
+# |
+quant-plus: |
+ '?' n expr-cont doNGPlus # +? |
+ '+' n expr-cont doPossessivePlus # ++ |
+ default expr-cont doPlus |
+ |
+ |
+# |
+# quant-opt Scanning a '?' quantifier. Need to look ahead to decide |
+# between plain '?', '??', '?+' |
+# |
+quant-opt: |
+ '?' n expr-cont doNGOpt # ?? |
+ '+' n expr-cont doPossessiveOpt # ?+ |
+ default expr-cont doOpt # ? |
+ |
+ |
+# |
+# Interval scanning a '{', the opening delimiter for an interval specification |
+# {number} or {min, max} or {min,} |
+# |
+interval-open: |
+ digit_char interval-lower |
+ default errorDeath doIntervalError |
+ |
+interval-lower: |
+ digit_char n interval-lower doIntevalLowerDigit |
+ ',' n interval-upper |
+ '}' n interval-type doIntervalSame # {n} |
+ default errorDeath doIntervalError |
+ |
+interval-upper: |
+ digit_char n interval-upper doIntervalUpperDigit |
+ '}' n interval-type |
+ default errorDeath doIntervalError |
+ |
+interval-type: |
+ '?' n expr-cont doNGInterval # {n,m}? |
+ '+' n expr-cont doPossessiveInterval # {n,m}+ |
+ default expr-cont doInterval # {n,m} |
+ |
+ |
+# |
+# backslash # Backslash. Figure out which of the \thingies we have encountered. |
+# The low level next-char function will have preprocessed |
+# some of them already; those won't come here. |
+backslash: |
+ 'A' n term doBackslashA |
+ 'B' n term doBackslashB |
+ 'b' n term doBackslashb |
+ 'd' n expr-quant doBackslashd |
+ 'D' n expr-quant doBackslashD |
+ 'G' n term doBackslashG |
+ 'N' expr-quant doNamedChar # \N{NAME} named char |
+ 'p' expr-quant doProperty # \p{Lu} style property |
+ 'P' expr-quant doProperty |
+ 'Q' n term doEnterQuoteMode |
+ 'S' n expr-quant doBackslashS |
+ 's' n expr-quant doBackslashs |
+ 'W' n expr-quant doBackslashW |
+ 'w' n expr-quant doBackslashw |
+ 'X' n expr-quant doBackslashX |
+ 'Z' n term doBackslashZ |
+ 'z' n term doBackslashz |
+ digit_char n expr-quant doBackRef # Will scan multiple digits |
+ eof errorDeath doEscapeError |
+ default n expr-quant doEscapedLiteralChar |
+ |
+ |
+ |
+# |
+# [set expression] parsing, |
+# All states involved in parsing set expressions have names beginning with "set-" |
+# |
+ |
+set-open: |
+ '^' n set-open2 doSetNegate |
+ ':' set-posix doSetPosixProp |
+ default set-open2 |
+ |
+set-open2: |
+ ']' n set-after-lit doSetLiteral |
+ default set-start |
+ |
+# set-posix: |
+# scanned a '[:' If it really is a [:property:], doSetPosixProp will have |
+# moved the scan to the closing ']'. If it wasn't a property |
+# expression, the scan will still be at the opening ':', which should |
+# be interpreted as a normal set expression. |
+set-posix: |
+ ']' n pop doSetEnd |
+ ':' set-start |
+ default errorDeath doRuleError # should not be possible. |
+ |
+# |
+# set-start after the [ and special case leading characters (^ and/or ]) but before |
+# everything else. A '-' is literal at this point. |
+# |
+set-start: |
+ ']' n pop doSetEnd |
+ '[' n set-open ^set-after-set doSetBeginUnion |
+ '\' n set-escape |
+ '-' n set-start-dash |
+ '&' n set-start-amp |
+ default n set-after-lit doSetLiteral |
+ |
+# set-start-dash Turn "[--" into a syntax error. |
+# "[-x" is good, - and x are literals. |
+# |
+set-start-dash: |
+ '-' errorDeath doRuleError |
+ default set-after-lit doSetAddDash |
+ |
+# set-start-amp Turn "[&&" into a syntax error. |
+# "[&x" is good, & and x are literals. |
+# |
+set-start-amp: |
+ '&' errorDeath doRuleError |
+ default set-after-lit doSetAddAmp |
+ |
+# |
+# set-after-lit The last thing scanned was a literal character within a set. |
+# Can be followed by anything. Single '-' or '&' are |
+# literals in this context, not operators. |
+set-after-lit: |
+ ']' n pop doSetEnd |
+ '[' n set-open ^set-after-set doSetBeginUnion |
+ '-' n set-lit-dash |
+ '&' n set-lit-amp |
+ '\' n set-escape |
+ eof errorDeath doSetNoCloseError |
+ default n set-after-lit doSetLiteral |
+ |
+set-after-set: |
+ ']' n pop doSetEnd |
+ '[' n set-open ^set-after-set doSetBeginUnion |
+ '-' n set-set-dash |
+ '&' n set-set-amp |
+ '\' n set-escape |
+ eof errorDeath doSetNoCloseError |
+ default n set-after-lit doSetLiteral |
+ |
+set-after-range: |
+ ']' n pop doSetEnd |
+ '[' n set-open ^set-after-set doSetBeginUnion |
+ '-' n set-range-dash |
+ '&' n set-range-amp |
+ '\' n set-escape |
+ eof errorDeath doSetNoCloseError |
+ default n set-after-lit doSetLiteral |
+ |
+ |
+# set-after-op |
+# After a -- or && |
+# It is an error to close a set at this point. |
+# |
+set-after-op: |
+ '[' n set-open ^set-after-set doSetBeginUnion |
+ ']' errorDeath doSetOpError |
+ '\' n set-escape |
+ default n set-after-lit doSetLiteral |
+ |
+# |
+# set-set-amp |
+# Have scanned [[set]& |
+# Could be a '&' intersection operator, if a set follows. |
+# Could be the start of a '&&' operator. |
+# Otherwise it is a literal. |
+set-set-amp: |
+ '[' n set-open ^set-after-set doSetBeginIntersection1 |
+ '&' n set-after-op doSetIntersection2 |
+ default set-after-lit doSetAddAmp |
+ |
+ |
+# set-lit-amp Have scanned "[literals&" |
+# Could be a start of "&&" operator or a literal |
+# In [abc&[def]], the '&' is a literal |
+# |
+set-lit-amp: |
+ '&' n set-after-op doSetIntersection2 |
+ default set-after-lit doSetAddAmp |
+ |
+ |
+# |
+# set-set-dash |
+# Have scanned [set]- |
+# Could be a '-' difference operator, if a [set] follows. |
+# Could be the start of a '--' operator. |
+# Otherwise it is a literal. |
+set-set-dash: |
+ '[' n set-open ^set-after-set doSetBeginDifference1 |
+ '-' n set-after-op doSetDifference2 |
+ default set-after-lit doSetAddDash |
+ |
+ |
+# |
+# set-range-dash |
+# scanned a-b- or \w- |
+# any set or range like item where the trailing single '-' should |
+# be literal, not a set difference operation. |
+# A trailing "--" is still a difference operator. |
+set-range-dash: |
+ '-' n set-after-op doSetDifference2 |
+ default set-after-lit doSetAddDash |
+ |
+ |
+set-range-amp: |
+ '&' n set-after-op doSetIntersection2 |
+ default set-after-lit doSetAddAmp |
+ |
+ |
+# set-lit-dash |
+# Have scanned "[literals-" Could be a range or a -- operator or a literal |
+# In [abc-[def]], the '-' is a literal (confirmed with a Java test) |
+# [abc-\p{xx} the '-' is an error |
+# [abc-] the '-' is a literal |
+# [ab-xy] the '-' is a range |
+# |
+set-lit-dash: |
+ '-' n set-after-op doSetDifference2 |
+ '[' set-after-lit doSetAddDash |
+ ']' set-after-lit doSetAddDash |
+ '\' n set-lit-dash-escape |
+ default n set-after-range doSetRange |
+ |
+# set-lit-dash-escape |
+# |
+# scanned "[literal-\" |
+# Could be a range, if the \ introduces an escaped literal char or a named char. |
+# Otherwise it is an error. |
+# |
+set-lit-dash-escape: |
+ 's' errorDeath doSetOpError |
+ 'S' errorDeath doSetOpError |
+ 'w' errorDeath doSetOpError |
+ 'W' errorDeath doSetOpError |
+ 'd' errorDeath doSetOpError |
+ 'D' errorDeath doSetOpError |
+ 'N' set-after-range doSetNamedRange |
+ default n set-after-range doSetRange |
+ |
+ |
+# |
+# set-escape |
+# Common back-slash escape processing within set expressions |
+# |
+set-escape: |
+ 'p' set-after-set doSetProp |
+ 'P' set-after-set doSetProp |
+ 'N' set-after-lit doSetNamedChar |
+ 's' n set-after-range doSetBackslash_s |
+ 'S' n set-after-range doSetBackslash_S |
+ 'w' n set-after-range doSetBackslash_w |
+ 'W' n set-after-range doSetBackslash_W |
+ 'd' n set-after-range doSetBackslash_d |
+ 'D' n set-after-range doSetBackslash_D |
+ default n set-after-lit doSetLiteralEscaped |
+ |
+# |
+# set-finish |
+# Have just encountered the final ']' that completes a [set], and |
+# arrived here via a pop. From here, we exit the set parsing world, and go |
+# back to generic regular expression parsing. |
+# |
+set-finish: |
+ default expr-quant doSetFinish |
+ |
+ |
+# |
+# errorDeath. This state is specified as the next state whenever a syntax error |
+# in the source pattern is detected. Barring bugs, the state machine will never |
+# actually get here, but will stop because of the action associated with the error. |
+# But, just in case, this state asks the state machine to exit. |
+errorDeath: |
+ default n errorDeath doExit |
+ |
+ |
Property changes on: icu46/source/i18n/regexcst.txt |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |