OLD | NEW |
(Empty) | |
| 1 // |
| 2 // regexcmp.h |
| 3 // |
| 4 // Copyright (C) 2002-2010, International Business Machines Corporation and oth
ers. |
| 5 // All Rights Reserved. |
| 6 // |
| 7 // This file contains declarations for the class RegexCompile |
| 8 // |
| 9 // This class is internal to the regular expression implementation. |
| 10 // For the public Regular Expression API, see the file "unicode/regex.h" |
| 11 // |
| 12 |
| 13 |
| 14 #ifndef RBBISCAN_H |
| 15 #define RBBISCAN_H |
| 16 |
| 17 #include "unicode/utypes.h" |
| 18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 19 |
| 20 #include "unicode/uobject.h" |
| 21 #include "unicode/uniset.h" |
| 22 #include "unicode/parseerr.h" |
| 23 #include "uhash.h" |
| 24 #include "uvector.h" |
| 25 |
| 26 |
| 27 |
| 28 U_NAMESPACE_BEGIN |
| 29 |
| 30 |
| 31 //------------------------------------------------------------------------------
-- |
| 32 // |
| 33 // class RegexCompile Contains the regular expression compiler. |
| 34 // |
| 35 //------------------------------------------------------------------------------
-- |
| 36 struct RegexTableEl; |
| 37 class RegexPattern; |
| 38 |
| 39 |
| 40 class RegexCompile : public UMemory { |
| 41 public: |
| 42 |
| 43 enum { |
| 44 kStackSize = 100 // The size of the state stack for |
| 45 }; // pattern parsing. Corresponds roughly |
| 46 // to the depth of parentheses nesting |
| 47 // that is allowed in the rules. |
| 48 |
| 49 struct RegexPatternChar { |
| 50 UChar32 fChar; |
| 51 UBool fQuoted; |
| 52 }; |
| 53 |
| 54 RegexCompile(RegexPattern *rp, UErrorCode &e); |
| 55 |
| 56 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e)
; |
| 57 void compile(UText *pat, UParseError &pp, UErrorCode &e); |
| 58 |
| 59 |
| 60 virtual ~RegexCompile(); |
| 61 |
| 62 void nextChar(RegexPatternChar &c); // Get the next char from th
e input stream. |
| 63 |
| 64 static void cleanup(); // Memory cleanup |
| 65 |
| 66 |
| 67 |
| 68 // Categories of parentheses in pattern. |
| 69 // The category is saved in the compile-time parentheses stack frame, and |
| 70 // determines the code to be generated when the matching close ) is encoun
tered. |
| 71 enum EParenClass { |
| 72 plain = -1, // No special handling |
| 73 capturing = -2, |
| 74 atomic = -3, |
| 75 lookAhead = -4, |
| 76 negLookAhead = -5, |
| 77 flags = -6, |
| 78 lookBehind = -7, |
| 79 lookBehindN = -8 |
| 80 }; |
| 81 |
| 82 private: |
| 83 |
| 84 |
| 85 UBool doParseActions(int32_t a); |
| 86 void error(UErrorCode e); // error reporting conven
ience function. |
| 87 |
| 88 UChar32 nextCharLL(); |
| 89 UChar32 peekCharLL(); |
| 90 UnicodeSet *scanProp(); |
| 91 UnicodeSet *scanPosixProp(); |
| 92 void handleCloseParen(); |
| 93 int32_t blockTopLoc(UBool reserve); // Locate a position in the
compiled pattern |
| 94 // at the top of the just
completed block |
| 95 // or operation, and optio
nally ensure that |
| 96 // there is space to add a
n opcode there. |
| 97 void compileSet(UnicodeSet *theSet); // Generate the compiled pa
ttern for |
| 98 // a reference to a Unico
deSet. |
| 99 void compileInterval(int32_t InitOp, // Generate the code for a
{min,max} quantifier. |
| 100 int32_t LoopOp); |
| 101 UBool compileInlineInterval(); // Generate inline code for
a {min,max} quantifier |
| 102 void literalChar(UChar32 c); // Compile a literal char |
| 103 void fixLiterals(UBool split=FALSE); // Fix literal strings. |
| 104 void insertOp(int32_t where); // Open up a slot for a new
op in the |
| 105 // generated code at the
specified location. |
| 106 void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into
the compiled code, |
| 107 // taking case mode into
account. |
| 108 int32_t minMatchLength(int32_t start, |
| 109 int32_t end); |
| 110 int32_t maxMatchLength(int32_t start, |
| 111 int32_t end); |
| 112 void matchStartType(); |
| 113 void stripNOPs(); |
| 114 |
| 115 void setEval(int32_t op); |
| 116 void setPushOp(int32_t op); |
| 117 UChar32 scanNamedChar(); |
| 118 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negate
d); |
| 119 |
| 120 |
| 121 UErrorCode *fStatus; |
| 122 RegexPattern *fRXPat; |
| 123 UParseError *fParseErr; |
| 124 |
| 125 // |
| 126 // Data associated with low level character scanning |
| 127 // |
| 128 int64_t fScanIndex; // Index of current charact
er being processed |
| 129 // in the rule input stri
ng. |
| 130 UBool fQuoteMode; // Scan is in a \Q...\E quo
ted region |
| 131 UBool fInBackslashQuote; // Scan is between a '\' an
d the following char. |
| 132 UBool fEOLComments; // When scan is just after
'(?', inhibit #... to |
| 133 // end of line comments,
in favor of (?#...) comments. |
| 134 int64_t fLineNum; // Line number in input fil
e. |
| 135 int64_t fCharNum; // Char position within the
line. |
| 136 UChar32 fLastChar; // Previous char, needed to
count CR-LF |
| 137 // as a single line, not
two. |
| 138 UChar32 fPeekChar; // Saved char, if we've sca
nned ahead. |
| 139 |
| 140 |
| 141 RegexPatternChar fC; // Current char for parse s
tate machine |
| 142 // processing. |
| 143 |
| 144 // |
| 145 // Data for the state machine that parses the regular expression. |
| 146 // |
| 147 RegexTableEl **fStateTable; // State Transition Table f
or regex Rule |
| 148 // parsing. index by p[s
tate][char-class] |
| 149 |
| 150 uint16_t fStack[kStackSize]; // State stack, holds sta
te pushes |
| 151 int32_t fStackPtr; // and pops as specified
in the state |
| 152 // transition rules. |
| 153 |
| 154 // |
| 155 // Data associated with the generation of the pcode for the match engine |
| 156 // |
| 157 int32_t fModeFlags; // Match Flags. (Case Inse
nsitive, etc.) |
| 158 // Always has high bit (3
1) set so that flag values |
| 159 // on the paren stack are
distinguished from relocatable |
| 160 // pcode addresses. |
| 161 int32_t fNewModeFlags; // New flags, while compili
ng (?i, holds state |
| 162 // until last flag is sca
nned. |
| 163 UBool fSetModeFlag; // true for (?ismx, false f
or (?-ismx |
| 164 |
| 165 |
| 166 int32_t fStringOpStart; // While a literal string i
s being scanned |
| 167 // holds the start index
within RegexPattern. |
| 168 // fLiteralText where the
string is being stored. |
| 169 |
| 170 int64_t fPatternLength; // Length of the input patt
ern string. |
| 171 |
| 172 UVector32 fParenStack; // parentheses stack. Each
frame consists of |
| 173 // the positions of compi
led pattern operations |
| 174 // needing fixup, followe
d by negative value. The |
| 175 // first entry in each fr
ame is the position of the |
| 176 // spot reserved for use
when a quantifier |
| 177 // needs to add a SAVE at
the start of a (block) |
| 178 // The negative value (-1
, -2,...) indicates |
| 179 // the kind of paren that
opened the frame. Some |
| 180 // need special handling
on close. |
| 181 |
| 182 |
| 183 int32_t fMatchOpenParen; // The position in the comp
iled pattern |
| 184 // of the slot reserved f
or a state save |
| 185 // at the start of the mo
st recently processed |
| 186 // parenthesized block. |
| 187 int32_t fMatchCloseParen; // The position in the patt
ern of the first |
| 188 // location after the mos
t recently processed |
| 189 // parenthesized block. |
| 190 |
| 191 int32_t fIntervalLow; // {lower, upper} interval
quantifier values. |
| 192 int32_t fIntervalUpper; // Placed here temporarily,
when pattern is |
| 193 // initially scanned. Ea
ch new interval |
| 194 // encountered overwrites
these values. |
| 195 // -1 for the upper inter
val value means none |
| 196 // was specified (unlimit
ed occurences.) |
| 197 |
| 198 int64_t fNameStartPos; // Starting position of a \
N{NAME} name in a |
| 199 // pattern, valid while r
emainder of name is |
| 200 // scanned. |
| 201 |
| 202 UStack fSetStack; // Stack of UnicodeSets, us
ed while evaluating |
| 203 // (at compile time) set
expressions within |
| 204 // the pattern. |
| 205 UStack fSetOpStack; // Stack of pending set ope
rators (&&, --, union) |
| 206 |
| 207 UChar32 fLastSetLiteral; // The last single code poi
nt added to a set. |
| 208 // needed when "-y" is sc
anned, and we need |
| 209 // to turn "x-y" into a r
ange. |
| 210 }; |
| 211 |
| 212 // Constant values to be pushed onto fSetOpStack while scanning & evaluating [s
et expressions] |
| 213 // The high 16 bits are the operator precedence, and the low 16 are a code for
the operation itself. |
| 214 |
enum SetOperations {
    // Each value is (precedence << 16) | operation-code.
    setStart         = (0 << 16) | 1,
    setEnd           = (1 << 16) | 2,
    setNegation      = (2 << 16) | 3,
    setCaseClose     = (2 << 16) | 9,
    // '--' set difference operator
    setDifference2   = (3 << 16) | 4,
    // '&&' set intersection operator
    setIntersection2 = (3 << 16) | 5,
    // implicit union of adjacent items
    setUnion         = (4 << 16) | 6,
    // '-', single dash difference op, for compatibility with old UnicodeSet.
    setDifference1   = (4 << 16) | 7,
    // '&', single amp intersection op, for compatibility with old UnicodeSet.
    setIntersection1 = (4 << 16) | 8
};
| 226 |
| 227 U_NAMESPACE_END |
| 228 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 229 #endif // include guard for regexcmp.h |
OLD | NEW |