| Index: icu46/source/i18n/regexcst.txt
|
| ===================================================================
|
| --- icu46/source/i18n/regexcst.txt (revision 0)
|
| +++ icu46/source/i18n/regexcst.txt (revision 0)
|
| @@ -0,0 +1,467 @@
|
| +
|
| +#*****************************************************************************
|
| +#
|
| +# Copyright (C) 2002-2007, International Business Machines Corporation and others.
|
| +# All Rights Reserved.
|
| +#
|
| +#*****************************************************************************
|
| +#
|
| +# file: regexcst.txt
|
| +# ICU Regular Expression Parser State Table
|
| +#
|
| +# This state table is used when reading and parsing a regular expression pattern
|
| +# The pattern parser uses a state machine; the data in this file define the
|
| +# state transitions that occur for each input character.
|
| +#
|
| +# *** This file defines the regex pattern grammar. This is it.
|
| +# *** The determination of what is accepted is here.
|
| +#
|
| +# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
|
| +# that are then built with the rule parser.
|
| +#
|
| +
|
| +#
|
| +# Here is the syntax of the state definitions in this file:
|
| +#
|
| +#
|
| +#StateName:
|
| +# input-char n next-state ^push-state action
|
| +# input-char n next-state ^push-state action
|
| +# | | | | |
|
| +# | | | | |--- action to be performed by state machine
|
| +# | | | | See function RegexCompile::doParseActions()
|
| +# | | | |
|
| +# | | | |--- Push this named state onto the state stack.
|
| +# | | | Later, when next state is specified as "pop",
|
| +# | | | the pushed state will become the current state.
|
| +# | | |
|
| +# | | |--- Transition to this state if the current input character matches the input
|
| +# | | character or char class in the left hand column. "pop" causes the next
|
| +# | | state to be popped from the state stack.
|
| +# | |
|
| +# | |--- When making the state transition specified on this line, advance to the next
|
| +# | character from the input only if 'n' appears here.
|
| +# |
|
| +# |--- Character or named character classes to test for. If the current character being scanned
|
| +# matches, perform the actions and go to the state specified on this line.
|
| +# The input character is tested sequentially, in the order written. The characters and
|
| +# character classes tested for do not need to be mutually exclusive. The first match wins.
|
| +#
|
| +
|
| +
|
| +
|
| +
|
| +#
|
| +# start state, scan position is at the beginning of the pattern.
|
| +#
|
| +start:
|
| + default term doPatStart
|
| +
|
| +
|
| +
|
| +
|
| +#
|
| +# term. At a position where we can accept the start of most items in a pattern.
|
| +#
|
| +term:
|
| + quoted n expr-quant doLiteralChar
|
| + rule_char n expr-quant doLiteralChar
|
| + '[' n set-open ^set-finish doSetBegin
|
| + '(' n open-paren
|
| + '.' n expr-quant doDotAny
|
| + '^' n expr-quant doCaret
|
| + '$' n expr-quant doDollar
|
| + '\' n backslash
|
| + '|' n term doOrOperator
|
| + ')' n pop doCloseParen
|
| + eof term doPatFinish
|
| + default errorDeath doRuleError
|
| +
|
| +
|
| +
|
| +#
|
| +# expr-quant We've just finished scanning a term, now look for the optional
|
| +# trailing quantifier - *, +, ?, *?, etc.
|
| +#
|
| +expr-quant:
|
| + '*' n quant-star
|
| + '+' n quant-plus
|
| + '?' n quant-opt
|
| + '{' n interval-open doIntervalInit
|
| + '(' n open-paren-quant
|
| + default expr-cont
|
| +
|
| +
|
| +#
|
| +# expr-cont Expression, continuation. At a point where additional terms are
|
| +# allowed, but not required. No Quantifiers
|
| +#
|
| +expr-cont:
|
| + '|' n term doOrOperator
|
| + ')' n pop doCloseParen
|
| + default term
|
| +
|
| +
|
| +#
|
| +# open-paren-quant Special case handling for comments appearing before a quantifier,
|
| +# e.g. x(?#comment )*
|
| +# Open parens from expr-quant come here; anything but a (?# comment
|
| +# branches into the normal parenthesis sequence as quickly as possible.
|
| +#
|
| +open-paren-quant:
|
| + '?' n open-paren-quant2 doSuppressComments
|
| + default open-paren
|
| +
|
| +open-paren-quant2:
|
| + '#' n paren-comment ^expr-quant
|
| + default open-paren-extended
|
| +
|
| +
|
| +#
|
| +# open-paren We've got an open paren. We need to scan further to
|
| +# determine what kind of parenthesized group it is - plain (, (?:, (?>, or whatever.
|
| +#
|
| +open-paren:
|
| + '?' n open-paren-extended doSuppressComments
|
| + default term ^expr-quant doOpenCaptureParen
|
| +
|
| +open-paren-extended:
|
| + ':' n term ^expr-quant doOpenNonCaptureParen # (?:
|
| + '>' n term ^expr-quant doOpenAtomicParen # (?>
|
| + '=' n term ^expr-cont doOpenLookAhead # (?=
|
| + '!' n term ^expr-cont doOpenLookAheadNeg # (?!
|
| + '<' n open-paren-lookbehind
|
| + '#' n paren-comment ^term
|
| + 'i' paren-flag doBeginMatchMode
|
| + 'd' paren-flag doBeginMatchMode
|
| + 'm' paren-flag doBeginMatchMode
|
| + 's' paren-flag doBeginMatchMode
|
| + 'u' paren-flag doBeginMatchMode
|
| + 'w' paren-flag doBeginMatchMode
|
| + 'x' paren-flag doBeginMatchMode
|
| + '-' paren-flag doBeginMatchMode
|
| + '(' n errorDeath doConditionalExpr
|
| + '{' n errorDeath doPerlInline
|
| + default errorDeath doBadOpenParenType
|
| +
|
| +open-paren-lookbehind:
|
| + '=' n term ^expr-cont doOpenLookBehind # (?<=
|
| + '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
|
| + default errorDeath doBadOpenParenType
|
| +
|
| +
|
| +#
|
| +# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
|
| +#
|
| +paren-comment:
|
| + ')' n pop
|
| + eof errorDeath doMismatchedParenErr
|
| + default n paren-comment
|
| +
|
| +#
|
| +# paren-flag Scanned a (?ismx-ismx flag setting
|
| +#
|
| +paren-flag:
|
| + 'i' n paren-flag doMatchMode
|
| + 'd' n paren-flag doMatchMode
|
| + 'm' n paren-flag doMatchMode
|
| + 's' n paren-flag doMatchMode
|
| + 'u' n paren-flag doMatchMode
|
| + 'w' n paren-flag doMatchMode
|
| + 'x' n paren-flag doMatchMode
|
| + '-' n paren-flag doMatchMode
|
| + ')' n term doSetMatchMode
|
| + ':' n term ^expr-quant doMatchModeParen
|
| + default errorDeath doBadModeFlag
|
| +
|
| +
|
| +#
|
| +# quant-star Scanning a '*' quantifier. Need to look ahead to decide
|
| +# between plain '*', '*?', '*+'
|
| +#
|
| +quant-star:
|
| + '?' n expr-cont doNGStar # *?
|
| + '+' n expr-cont doPossessiveStar # *+
|
| + default expr-cont doStar
|
| +
|
| +
|
| +#
|
| +# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
|
| +# between plain '+', '+?', '++'
|
| +#
|
| +quant-plus:
|
| + '?' n expr-cont doNGPlus # +?
|
| + '+' n expr-cont doPossessivePlus # ++
|
| + default expr-cont doPlus
|
| +
|
| +
|
| +#
|
| +# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
|
| +# between plain '?', '??', '?+'
|
| +#
|
| +quant-opt:
|
| + '?' n expr-cont doNGOpt # ??
|
| + '+' n expr-cont doPossessiveOpt # ?+
|
| + default expr-cont doOpt # ?
|
| +
|
| +
|
| +#
|
| +# Interval scanning a '{', the opening delimiter for an interval specification
|
| +# {number} or {min, max} or {min,}
|
| +#
|
| +interval-open:
|
| + digit_char interval-lower
|
| + default errorDeath doIntervalError
|
| +
|
| +interval-lower:
|
| + digit_char n interval-lower doIntevalLowerDigit
|
| + ',' n interval-upper
|
| + '}' n interval-type doIntervalSame # {n}
|
| + default errorDeath doIntervalError
|
| +
|
| +interval-upper:
|
| + digit_char n interval-upper doIntervalUpperDigit
|
| + '}' n interval-type
|
| + default errorDeath doIntervalError
|
| +
|
| +interval-type:
|
| + '?' n expr-cont doNGInterval # {n,m}?
|
| + '+' n expr-cont doPossessiveInterval # {n,m}+
|
| + default expr-cont doInterval # {m,n}
|
| +
|
| +
|
| +#
|
| +# backslash # Backslash. Figure out which of the \thingies we have encountered.
|
| +# The low level next-char function will have preprocessed
|
| +# some of them already; those won't come here.
|
| +backslash:
|
| + 'A' n term doBackslashA
|
| + 'B' n term doBackslashB
|
| + 'b' n term doBackslashb
|
| + 'd' n expr-quant doBackslashd
|
| + 'D' n expr-quant doBackslashD
|
| + 'G' n term doBackslashG
|
| + 'N' expr-quant doNamedChar # \N{NAME} named char
|
| + 'p' expr-quant doProperty # \p{Lu} style property
|
| + 'P' expr-quant doProperty
|
| + 'Q' n term doEnterQuoteMode
|
| + 'S' n expr-quant doBackslashS
|
| + 's' n expr-quant doBackslashs
|
| + 'W' n expr-quant doBackslashW
|
| + 'w' n expr-quant doBackslashw
|
| + 'X' n expr-quant doBackslashX
|
| + 'Z' n term doBackslashZ
|
| + 'z' n term doBackslashz
|
| + digit_char n expr-quant doBackRef # Will scan multiple digits
|
| + eof errorDeath doEscapeError
|
| + default n expr-quant doEscapedLiteralChar
|
| +
|
| +
|
| +
|
| +#
|
| +# [set expression] parsing,
|
| +# All states involved in parsing set expressions have names beginning with "set-"
|
| +#
|
| +
|
| +set-open:
|
| + '^' n set-open2 doSetNegate
|
| + ':' set-posix doSetPosixProp
|
| + default set-open2
|
| +
|
| +set-open2:
|
| + ']' n set-after-lit doSetLiteral
|
| + default set-start
|
| +
|
| +# set-posix:
|
| +# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
|
| +# moved the scan to the closing ']'. If it wasn't a property
|
| +# expression, the scan will still be at the opening ':', which should
|
| +# be interpreted as a normal set expression.
|
| +set-posix:
|
| + ']' n pop doSetEnd
|
| + ':' set-start
|
| + default errorDeath doRuleError # should not be possible.
|
| +
|
| +#
|
| +# set-start after the [ and special case leading characters (^ and/or ]) but before
|
| +# everything else. A '-' is literal at this point.
|
| +#
|
| +set-start:
|
| + ']' n pop doSetEnd
|
| + '[' n set-open ^set-after-set doSetBeginUnion
|
| + '\' n set-escape
|
| + '-' n set-start-dash
|
| + '&' n set-start-amp
|
| + default n set-after-lit doSetLiteral
|
| +
|
| +# set-start-dash Turn "[--" into a syntax error.
|
| +# "[-x" is good, - and x are literals.
|
| +#
|
| +set-start-dash:
|
| + '-' errorDeath doRuleError
|
| + default set-after-lit doSetAddDash
|
| +
|
| +# set-start-amp Turn "[&&" into a syntax error.
|
| +# "[&x" is good, & and x are literals.
|
| +#
|
| +set-start-amp:
|
| + '&' errorDeath doRuleError
|
| + default set-after-lit doSetAddAmp
|
| +
|
| +#
|
| +# set-after-lit The last thing scanned was a literal character within a set.
|
| +# Can be followed by anything. Single '-' or '&' are
|
| +# literals in this context, not operators.
|
| +set-after-lit:
|
| + ']' n pop doSetEnd
|
| + '[' n set-open ^set-after-set doSetBeginUnion
|
| + '-' n set-lit-dash
|
| + '&' n set-lit-amp
|
| + '\' n set-escape
|
| + eof errorDeath doSetNoCloseError
|
| + default n set-after-lit doSetLiteral
|
| +
|
| +set-after-set:
|
| + ']' n pop doSetEnd
|
| + '[' n set-open ^set-after-set doSetBeginUnion
|
| + '-' n set-set-dash
|
| + '&' n set-set-amp
|
| + '\' n set-escape
|
| + eof errorDeath doSetNoCloseError
|
| + default n set-after-lit doSetLiteral
|
| +
|
| +set-after-range:
|
| + ']' n pop doSetEnd
|
| + '[' n set-open ^set-after-set doSetBeginUnion
|
| + '-' n set-range-dash
|
| + '&' n set-range-amp
|
| + '\' n set-escape
|
| + eof errorDeath doSetNoCloseError
|
| + default n set-after-lit doSetLiteral
|
| +
|
| +
|
| +# set-after-op
|
| +# After a -- or &&
|
| +# It is an error to close a set at this point.
|
| +#
|
| +set-after-op:
|
| + '[' n set-open ^set-after-set doSetBeginUnion
|
| + ']' errorDeath doSetOpError
|
| + '\' n set-escape
|
| + default n set-after-lit doSetLiteral
|
| +
|
| +#
|
| +# set-set-amp
|
| +# Have scanned [[set]&
|
| +# Could be a '&' intersection operator, if a set follows.
|
| +# Could be the start of a '&&' operator.
|
| +# Otherwise is a literal.
|
| +set-set-amp:
|
| + '[' n set-open ^set-after-set doSetBeginIntersection1
|
| + '&' n set-after-op doSetIntersection2
|
| + default set-after-lit doSetAddAmp
|
| +
|
| +
|
| +# set-lit-amp Have scanned "[literals&"
|
| +# Could be a start of "&&" operator or a literal
|
| +# In [abc&[def]], the '&' is a literal
|
| +#
|
| +set-lit-amp:
|
| + '&' n set-after-op doSetIntersection2
|
| + default set-after-lit doSetAddAmp
|
| +
|
| +
|
| +#
|
| +# set-set-dash
|
| +# Have scanned [set]-
|
| +# Could be a '-' difference operator, if a [set] follows.
|
| +# Could be the start of a '--' operator.
|
| +# Otherwise is a literal.
|
| +set-set-dash:
|
| + '[' n set-open ^set-after-set doSetBeginDifference1
|
| + '-' n set-after-op doSetDifference2
|
| + default set-after-lit doSetAddDash
|
| +
|
| +
|
| +#
|
| +# set-range-dash
|
| +# scanned a-b- or \w-
|
| +# any set or range like item where the trailing single '-' should
|
| +# be literal, not a set difference operation.
|
| +# A trailing "--" is still a difference operator.
|
| +set-range-dash:
|
| + '-' n set-after-op doSetDifference2
|
| + default set-after-lit doSetAddDash
|
| +
|
| +
|
| +set-range-amp:
|
| + '&' n set-after-op doSetIntersection2
|
| + default set-after-lit doSetAddAmp
|
| +
|
| +
|
| +# set-lit-dash
|
| +# Have scanned "[literals-" Could be a range or a -- operator or a literal
|
| +# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
|
| +# [abc-\p{xx} the '-' is an error
|
| +# [abc-] the '-' is a literal
|
| +# [ab-xy] the '-' is a range
|
| +#
|
| +set-lit-dash:
|
| + '-' n set-after-op doSetDifference2
|
| + '[' set-after-lit doSetAddDash
|
| + ']' set-after-lit doSetAddDash
|
| + '\' n set-lit-dash-escape
|
| + default n set-after-range doSetRange
|
| +
|
| +# set-lit-dash-escape
|
| +#
|
| +# scanned "[literal-\"
|
| +# Could be a range, if the \ introduces an escaped literal char or a named char.
|
| +# Otherwise it is an error.
|
| +#
|
| +set-lit-dash-escape:
|
| + 's' errorDeath doSetOpError
|
| + 'S' errorDeath doSetOpError
|
| + 'w' errorDeath doSetOpError
|
| + 'W' errorDeath doSetOpError
|
| + 'd' errorDeath doSetOpError
|
| + 'D' errorDeath doSetOpError
|
| + 'N' set-after-range doSetNamedRange
|
| + default n set-after-range doSetRange
|
| +
|
| +
|
| +#
|
| +# set-escape
|
| +# Common back-slash escape processing within set expressions
|
| +#
|
| +set-escape:
|
| + 'p' set-after-set doSetProp
|
| + 'P' set-after-set doSetProp
|
| + 'N' set-after-lit doSetNamedChar
|
| + 's' n set-after-range doSetBackslash_s
|
| + 'S' n set-after-range doSetBackslash_S
|
| + 'w' n set-after-range doSetBackslash_w
|
| + 'W' n set-after-range doSetBackslash_W
|
| + 'd' n set-after-range doSetBackslash_d
|
| + 'D' n set-after-range doSetBackslash_D
|
| + default n set-after-lit doSetLiteralEscaped
|
| +
|
| +#
|
| +# set-finish
|
| +# Have just encountered the final ']' that completes a [set], and
|
| +# arrived here via a pop. From here, we exit the set parsing world, and go
|
| +# back to generic regular expression parsing.
|
| +#
|
| +set-finish:
|
| + default expr-quant doSetFinish
|
| +
|
| +
|
| +#
|
| +# errorDeath. This state is specified as the next state whenever a syntax error
|
| +# in the pattern is detected. Barring bugs, the state machine will never
|
| +# actually get here, but will stop because of the action associated with the error.
|
| +# But, just in case, this state asks the state machine to exit.
|
| +errorDeath:
|
| + default n errorDeath doExit
|
| +
|
| +
|
|
|
| Property changes on: icu46/source/i18n/regexcst.txt
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|