| OLD | NEW |
| (Empty) |
| 1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // --- SPONSORED LINK -------------------------------------------------- | |
| 6 // If you want to use this library for regular expression matching, | |
| 7 // you should use re2/re2.h, which provides a class RE2 that | |
| 8 // mimics the PCRE interface provided by PCRE's C++ wrappers. | |
| 9 // This header describes the low-level interface used to implement RE2 | |
| 10 // and may change in backwards-incompatible ways from time to time. | |
| 11 // In contrast, RE2's interface will not. | |
| 12 // --------------------------------------------------------------------- | |
| 13 | |
| 14 // Regular expression library: parsing, execution, and manipulation | |
| 15 // of regular expressions. | |
| 16 // | |
| 17 // Any operation that traverses the Regexp structures should be written | |
| 18 // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nest
ed | |
| 19 // regular expressions such as x++++++++++++++++++++... might cause recursive | |
| 20 // traversals to overflow the stack. | |
| 21 // | |
| 22 // It is the caller's responsibility to provide appropriate mutual exclusion | |
| 23 // around manipulation of the regexps. RE2 does this. | |
| 24 // | |
| 25 // PARSING | |
| 26 // | |
| 27 // Regexp::Parse parses regular expressions encoded in UTF-8. | |
| 28 // The default syntax is POSIX extended regular expressions, | |
| 29 // with the following changes: | |
| 30 // | |
| 31 // 1. Backreferences (optional in POSIX EREs) are not supported. | |
| 32 // (Supporting them precludes the use of DFA-based | |
| 33 // matching engines.) | |
| 34 // | |
| 35 // 2. Collating elements and collation classes are not supported. | |
| 36 // (No one has needed or wanted them.) | |
| 37 // | |
| 38 // The exact syntax accepted can be modified by passing flags to | |
| 39 // Regexp::Parse. In particular, many of the basic Perl additions | |
| 40 // are available. The flags are documented below (search for LikePerl). | |
| 41 // | |
| 42 // If parsed with the flag Regexp::Latin1, both the regular expression | |
| 43 // and the input to the matching routines are assumed to be encoded in | |
| 44 // Latin-1, not UTF-8. | |
| 45 // | |
| 46 // EXECUTION | |
| 47 // | |
| 48 // Once Regexp has parsed a regular expression, it provides methods | |
| 49 // to search text using that regular expression. These methods are | |
| 50 // implemented via calling out to other regular expression libraries. | |
| 51 // (Let's call them the sublibraries.) | |
| 52 // | |
| 53 // To call a sublibrary, Regexp does not simply prepare a | |
| 54 // string version of the regular expression and hand it to the | |
| 55 // sublibrary. Instead, Regexp prepares, from its own parsed form, the | |
| 56 // corresponding internal representation used by the sublibrary. | |
| 57 // This has the drawback of needing to know the internal representation | |
| 58 // used by the sublibrary, but it has two important benefits: | |
| 59 // | |
| 60 // 1. The syntax and meaning of regular expressions is guaranteed | |
| 61 // to be that used by Regexp's parser, not the syntax expected | |
| 62 // by the sublibrary. Regexp might accept a restricted or | |
| 63 // expanded syntax for regular expressions as compared with | |
| 64 // the sublibrary. As long as Regexp can translate from its | |
| 65 // internal form into the sublibrary's, clients need not know | |
| 66 // exactly which sublibrary they are using. | |
| 67 // | |
| 68 // 2. The sublibrary parsers are bypassed. For whatever reason, | |
| 69 // sublibrary regular expression parsers often have security | |
| 70 // problems. For example, plan9grep's regular expression parser | |
| 71 // has a buffer overflow in its handling of large character | |
| 72 // classes, and PCRE's parser has had buffer overflow problems | |
| 73 // in the past. Security-team requires sandboxing of sublibrary | |
| 74 // regular expression parsers. Avoiding the sublibrary parsers | |
| 75 // avoids the sandbox. | |
| 76 // | |
| 77 // The execution methods we use now are provided by the compiled form, | |
| 78 // Prog, described in prog.h | |
| 79 // | |
| 80 // MANIPULATION | |
| 81 // | |
| 82 // Unlike other regular expression libraries, Regexp makes its parsed | |
| 83 // form accessible to clients, so that client code can analyze the | |
| 84 // parsed regular expressions. | |
| 85 | |
| 86 #ifndef RE2_REGEXP_H__ | |
| 87 #define RE2_REGEXP_H__ | |
| 88 | |
| 89 #include "util/util.h" | |
| 90 #include "re2/stringpiece.h" | |
| 91 | |
| 92 namespace re2 { | |
| 93 | |
| 94 // Keep in sync with string list kOpcodeNames[] in testing/dump.cc | |
| 95 enum RegexpOp { | |
| 96 // Matches no strings. | |
| 97 kRegexpNoMatch = 1, | |
| 98 | |
| 99 // Matches empty string. | |
| 100 kRegexpEmptyMatch, | |
| 101 | |
| 102 // Matches rune_. | |
| 103 kRegexpLiteral, | |
| 104 | |
| 105 // Matches runes_. | |
| 106 kRegexpLiteralString, | |
| 107 | |
| 108 // Matches concatenation of sub_[0..nsub-1]. | |
| 109 kRegexpConcat, | |
| 110 // Matches union of sub_[0..nsub-1]. | |
| 111 kRegexpAlternate, | |
| 112 | |
| 113 // Matches sub_[0] zero or more times. | |
| 114 kRegexpStar, | |
| 115 // Matches sub_[0] one or more times. | |
| 116 kRegexpPlus, | |
| 117 // Matches sub_[0] zero or one times. | |
| 118 kRegexpQuest, | |
| 119 | |
| 120 // Matches sub_[0] at least min_ times, at most max_ times. | |
| 121 // max_ == -1 means no upper limit. | |
| 122 kRegexpRepeat, | |
| 123 | |
| 124 // Parenthesized (capturing) subexpression. Index is cap_. | |
| 125 // Optionally, capturing name is name_. | |
| 126 kRegexpCapture, | |
| 127 | |
| 128 // Matches any character. | |
| 129 kRegexpAnyChar, | |
| 130 | |
| 131 // Matches any byte [sic]. | |
| 132 kRegexpAnyByte, | |
| 133 | |
| 134 // Matches empty string at beginning of line. | |
| 135 kRegexpBeginLine, | |
| 136 // Matches empty string at end of line. | |
| 137 kRegexpEndLine, | |
| 138 | |
| 139 // Matches word boundary "\b". | |
| 140 kRegexpWordBoundary, | |
| 141 // Matches not-a-word boundary "\B". | |
| 142 kRegexpNoWordBoundary, | |
| 143 | |
| 144 // Matches empty string at beginning of text. | |
| 145 kRegexpBeginText, | |
| 146 // Matches empty string at end of text. | |
| 147 kRegexpEndText, | |
| 148 | |
| 149 // Matches character class given by cc_. | |
| 150 kRegexpCharClass, | |
| 151 | |
| 152 // Forces match of entire expression right now, | |
| 153 // with match ID match_id_ (used by RE2::Set). | |
| 154 kRegexpHaveMatch, | |
| 155 | |
| 156 kMaxRegexpOp = kRegexpHaveMatch, | |
| 157 }; | |
| 158 | |
| 159 // Keep in sync with string list in regexp.cc | |
| 160 enum RegexpStatusCode { | |
| 161 // No error | |
| 162 kRegexpSuccess = 0, | |
| 163 | |
| 164 // Unexpected error | |
| 165 kRegexpInternalError, | |
| 166 | |
| 167 // Parse errors | |
| 168 kRegexpBadEscape, // bad escape sequence | |
| 169 kRegexpBadCharClass, // bad character class | |
| 170 kRegexpBadCharRange, // bad character class range | |
| 171 kRegexpMissingBracket, // missing closing ] | |
| 172 kRegexpMissingParen, // missing closing ) | |
| 173 kRegexpTrailingBackslash, // at end of regexp | |
| 174 kRegexpRepeatArgument, // repeat argument missing, e.g. "*" | |
| 175 kRegexpRepeatSize, // bad repetition argument | |
| 176 kRegexpRepeatOp, // bad repetition operator | |
| 177 kRegexpBadPerlOp, // bad perl operator | |
| 178 kRegexpBadUTF8, // invalid UTF-8 in regexp | |
| 179 kRegexpBadNamedCapture, // bad named capture | |
| 180 }; | |
| 181 | |
| 182 // Error status for certain operations. | |
| 183 class RegexpStatus { | |
| 184 public: | |
| 185 RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} | |
| 186 ~RegexpStatus() { delete tmp_; } | |
| 187 | |
| 188 void set_code(enum RegexpStatusCode code) { code_ = code; } | |
| 189 void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } | |
| 190 void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; } | |
| 191 enum RegexpStatusCode code() const { return code_; } | |
| 192 const StringPiece& error_arg() const { return error_arg_; } | |
| 193 bool ok() const { return code() == kRegexpSuccess; } | |
| 194 | |
| 195 // Copies state from status. | |
| 196 void Copy(const RegexpStatus& status); | |
| 197 | |
| 198 // Returns text equivalent of code, e.g.: | |
| 199 // "Bad character class" | |
| 200 static string CodeText(enum RegexpStatusCode code); | |
| 201 | |
| 202 // Returns text describing error, e.g.: | |
| 203 // "Bad character class: [z-a]" | |
| 204 string Text() const; | |
| 205 | |
| 206 private: | |
| 207 enum RegexpStatusCode code_; // Kind of error | |
| 208 StringPiece error_arg_; // Piece of regexp containing syntax error. | |
| 209 string* tmp_; // Temporary storage, possibly where error_arg_
is. | |
| 210 | |
| 211 DISALLOW_COPY_AND_ASSIGN(RegexpStatus); | |
| 212 }; | |
| 213 | |
| 214 // Walkers to implement Simplify. | |
| 215 class CoalesceWalker; | |
| 216 class SimplifyWalker; | |
| 217 | |
| 218 // Compiled form; see prog.h | |
| 219 class Prog; | |
| 220 | |
| 221 struct RuneRange { | |
| 222 RuneRange() : lo(0), hi(0) { } | |
| 223 RuneRange(int l, int h) : lo(l), hi(h) { } | |
| 224 Rune lo; | |
| 225 Rune hi; | |
| 226 }; | |
| 227 | |
| 228 // Less-than on RuneRanges treats a == b if they overlap at all. | |
| 229 // This lets us look in a set to find the range covering a particular Rune. | |
| 230 struct RuneRangeLess { | |
| 231 bool operator()(const RuneRange& a, const RuneRange& b) const { | |
| 232 return a.hi < b.lo; | |
| 233 } | |
| 234 }; | |
| 235 | |
| 236 class CharClassBuilder; | |
| 237 | |
| 238 class CharClass { | |
| 239 public: | |
| 240 void Delete(); | |
| 241 | |
| 242 typedef RuneRange* iterator; | |
| 243 iterator begin() { return ranges_; } | |
| 244 iterator end() { return ranges_ + nranges_; } | |
| 245 | |
| 246 int size() { return nrunes_; } | |
| 247 bool empty() { return nrunes_ == 0; } | |
| 248 bool full() { return nrunes_ == Runemax+1; } | |
| 249 bool FoldsASCII() { return folds_ascii_; } | |
| 250 | |
| 251 bool Contains(Rune r); | |
| 252 CharClass* Negate(); | |
| 253 | |
| 254 private: | |
| 255 CharClass(); // not implemented | |
| 256 ~CharClass(); // not implemented | |
| 257 static CharClass* New(int maxranges); | |
| 258 | |
| 259 friend class CharClassBuilder; | |
| 260 | |
| 261 bool folds_ascii_; | |
| 262 int nrunes_; | |
| 263 RuneRange *ranges_; | |
| 264 int nranges_; | |
| 265 DISALLOW_COPY_AND_ASSIGN(CharClass); | |
| 266 }; | |
| 267 | |
| 268 class Regexp { | |
| 269 public: | |
| 270 | |
| 271 // Flags for parsing. Can be ORed together. | |
| 272 enum ParseFlags { | |
| 273 NoParseFlags = 0, | |
| 274 FoldCase = 1<<0, // Fold case during matching (case-insensitive). | |
| 275 Literal = 1<<1, // Treat s as literal string instead of a regexp. | |
| 276 ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s | |
| 277 // and [[:space:]] to match newline. | |
| 278 DotNL = 1<<3, // Allow . to match newline. | |
| 279 MatchNL = ClassNL | DotNL, | |
| 280 OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and | |
| 281 // end of text, not around embedded newlines. | |
| 282 // (Perl's default) | |
| 283 Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. | |
| 284 NonGreedy = 1<<6, // Repetition operators are non-greedy by default. | |
| 285 PerlClasses = 1<<7, // Allow Perl character classes like \d. | |
| 286 PerlB = 1<<8, // Allow Perl's \b and \B. | |
| 287 PerlX = 1<<9, // Perl extensions: | |
| 288 // non-capturing parens - (?: ) | |
| 289 // non-greedy operators - *? +? ?? {}? | |
| 290 // flag edits - (?i) (?-i) (?i: ) | |
| 291 // i - FoldCase | |
| 292 // m - !OneLine | |
| 293 // s - DotNL | |
| 294 // U - NonGreedy | |
| 295 // line ends: \A \z | |
| 296 // \Q and \E to disable/enable metacharacters | |
| 297 // (?P<name>expr) for named captures | |
| 298 // \C to match any single byte | |
| 299 UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group | |
| 300 // and \P{Han} for its negation. | |
| 301 NeverNL = 1<<11, // Never match NL, even if the regexp mentions | |
| 302 // it explicitly. | |
| 303 NeverCapture = 1<<12, // Parse all parens as non-capturing. | |
| 304 | |
| 305 // As close to Perl as we can get. | |
| 306 LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | | |
| 307 UnicodeGroups, | |
| 308 | |
| 309 // Internal use only. | |
| 310 WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text | |
| 311 }; | |
| 312 | |
| 313 // Get. No set, Regexps are logically immutable once created. | |
| 314 RegexpOp op() { return static_cast<RegexpOp>(op_); } | |
| 315 int nsub() { return nsub_; } | |
| 316 bool simple() { return simple_ != 0; } | |
| 317 enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_);
} | |
| 318 int Ref(); // For testing. | |
| 319 | |
| 320 Regexp** sub() { | |
| 321 if(nsub_ <= 1) | |
| 322 return &subone_; | |
| 323 else | |
| 324 return submany_; | |
| 325 } | |
| 326 | |
| 327 int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; } | |
| 328 int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; } | |
| 329 Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } | |
| 330 CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } | |
| 331 int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } | |
| 332 const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } | |
| 333 Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } | |
| 334 int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } | |
| 335 int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } | |
| 336 | |
| 337 // Increments reference count, returns object as convenience. | |
| 338 Regexp* Incref(); | |
| 339 | |
| 340 // Decrements reference count and deletes this object if count reaches 0. | |
| 341 void Decref(); | |
| 342 | |
| 343 // Parses string s to produce regular expression, returned. | |
| 344 // Caller must release return value with re->Decref(). | |
| 345 // On failure, sets *status (if status != NULL) and returns NULL. | |
| 346 static Regexp* Parse(const StringPiece& s, ParseFlags flags, | |
| 347 RegexpStatus* status); | |
| 348 | |
| 349 // Returns a _new_ simplified version of the current regexp. | |
| 350 // Does not edit the current regexp. | |
| 351 // Caller must release return value with re->Decref(). | |
| 352 // Simplified means that counted repetition has been rewritten | |
| 353 // into simpler terms and all Perl/POSIX features have been | |
| 354 // removed. The result will capture exactly the same | |
| 355 // subexpressions the original did, unless formatted with ToString. | |
| 356 Regexp* Simplify(); | |
| 357 friend class CoalesceWalker; | |
| 358 friend class SimplifyWalker; | |
| 359 | |
| 360 // Parses the regexp src and then simplifies it and sets *dst to the | |
| 361 // string representation of the simplified form. Returns true on success. | |
| 362 // Returns false and sets *status (if status != NULL) on parse error. | |
| 363 static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, | |
| 364 string* dst, | |
| 365 RegexpStatus* status); | |
| 366 | |
| 367 // Returns the number of capturing groups in the regexp. | |
| 368 int NumCaptures(); | |
| 369 friend class NumCapturesWalker; | |
| 370 | |
| 371 // Returns a map from names to capturing group indices, | |
| 372 // or NULL if the regexp contains no named capture groups. | |
| 373 // The caller is responsible for deleting the map. | |
| 374 map<string, int>* NamedCaptures(); | |
| 375 | |
| 376 // Returns a map from capturing group indices to capturing group | |
| 377 // names or NULL if the regexp contains no named capture groups. The | |
| 378 // caller is responsible for deleting the map. | |
| 379 map<int, string>* CaptureNames(); | |
| 380 | |
| 381 // Returns a string representation of the current regexp, | |
| 382 // using as few parentheses as possible. | |
| 383 string ToString(); | |
| 384 | |
| 385 // Convenience functions. They consume the passed reference, | |
| 386 // so in many cases you should use, e.g., Plus(re->Incref(), flags). | |
| 387 // They do not consume allocated arrays like subs or runes. | |
| 388 static Regexp* Plus(Regexp* sub, ParseFlags flags); | |
| 389 static Regexp* Star(Regexp* sub, ParseFlags flags); | |
| 390 static Regexp* Quest(Regexp* sub, ParseFlags flags); | |
| 391 static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); | |
| 392 static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); | |
| 393 static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); | |
| 394 static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); | |
| 395 static Regexp* NewLiteral(Rune rune, ParseFlags flags); | |
| 396 static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); | |
| 397 static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); | |
| 398 static Regexp* HaveMatch(int match_id, ParseFlags flags); | |
| 399 | |
| 400 // Like Alternate but does not factor out common prefixes. | |
| 401 static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); | |
| 402 | |
| 403 // Debugging function. Returns string format for regexp | |
| 404 // that makes structure clear. Does NOT use regexp syntax. | |
| 405 string Dump(); | |
| 406 | |
| 407 // Helper traversal class, defined fully in walker-inl.h. | |
| 408 template<typename T> class Walker; | |
| 409 | |
| 410 // Compile to Prog. See prog.h | |
| 411 // Reverse prog expects to be run over text backward. | |
| 412 // Construction and execution of prog will | |
| 413 // stay within approximately max_mem bytes of memory. | |
| 414 // If max_mem <= 0, a reasonable default is used. | |
| 415 Prog* CompileToProg(int64 max_mem); | |
| 416 Prog* CompileToReverseProg(int64 max_mem); | |
| 417 | |
| 418 // Whether to expect this library to find exactly the same answer as PCRE | |
| 419 // when running this regexp. Most regexps do mimic PCRE exactly, but a few | |
| 420 // obscure cases behave differently. Technically this is more a property | |
| 421 // of the Prog than the Regexp, but the computation is much easier to do | |
| 422 // on the Regexp. See mimics_pcre.cc for the exact conditions. | |
| 423 bool MimicsPCRE(); | |
| 424 | |
| 425 // Benchmarking function. | |
| 426 void NullWalk(); | |
| 427 | |
| 428 // Whether every match of this regexp must be anchored and | |
| 429 // begin with a non-empty fixed string (perhaps after ASCII | |
| 430 // case-folding). If so, returns the prefix and the sub-regexp that | |
| 431 // follows it. | |
| 432 bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix); | |
| 433 | |
| 434 private: | |
| 435 // Constructor allocates vectors as appropriate for operator. | |
| 436 explicit Regexp(RegexpOp op, ParseFlags parse_flags); | |
| 437 | |
| 438 // Use Decref() instead of delete to release Regexps. | |
| 439 // This is private to catch deletes at compile time. | |
| 440 ~Regexp(); | |
| 441 void Destroy(); | |
| 442 bool QuickDestroy(); | |
| 443 | |
| 444 // Helpers for Parse. Listed here so they can edit Regexps. | |
| 445 class ParseState; | |
| 446 friend class ParseState; | |
| 447 friend bool ParseCharClass(StringPiece* s, Regexp** out_re, | |
| 448 RegexpStatus* status); | |
| 449 | |
| 450 // Helper for testing [sic]. | |
| 451 friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); | |
| 452 | |
| 453 // Computes whether Regexp is already simple. | |
| 454 bool ComputeSimple(); | |
| 455 | |
| 456 // Constructor that generates a concatenation or alternation, | |
| 457 // enforcing the limit on the number of subexpressions for | |
| 458 // a particular Regexp. | |
| 459 static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, | |
| 460 ParseFlags flags, bool can_factor); | |
| 461 | |
| 462 // Returns the leading string that re starts with. | |
| 463 // The returned Rune* points into a piece of re, | |
| 464 // so it must not be used after the caller calls re->Decref(). | |
| 465 static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); | |
| 466 | |
| 467 // Removes the first n leading runes from the beginning of re. | |
| 468 // Edits re in place. | |
| 469 static void RemoveLeadingString(Regexp* re, int n); | |
| 470 | |
| 471 // Returns the leading regexp in re's top-level concatenation. | |
| 472 // The returned Regexp* points at re or a sub-expression of re, | |
| 473 // so it must not be used after the caller calls re->Decref(). | |
| 474 static Regexp* LeadingRegexp(Regexp* re); | |
| 475 | |
| 476 // Removes LeadingRegexp(re) from re and returns the remainder. | |
| 477 // Might edit re in place. | |
| 478 static Regexp* RemoveLeadingRegexp(Regexp* re); | |
| 479 | |
| 480 // Simplifies an alternation of literal strings by factoring out | |
| 481 // common prefixes. | |
| 482 static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); | |
| 483 static int FactorAlternationRecursive(Regexp** sub, int nsub, | |
| 484 ParseFlags flags, int maxdepth); | |
| 485 | |
| 486 // Is a == b? Only efficient on regexps that have not been through | |
| 487 // Simplify yet - the expansion of a kRegexpRepeat will make this | |
| 488 // take a long time. Do not call on such regexps, hence private. | |
| 489 static bool Equal(Regexp* a, Regexp* b); | |
| 490 | |
| 491 // Allocate space for n sub-regexps. | |
| 492 void AllocSub(int n) { | |
| 493 if (n < 0 || static_cast<uint16>(n) != n) | |
| 494 LOG(FATAL) << "Cannot AllocSub " << n; | |
| 495 if (n > 1) | |
| 496 submany_ = new Regexp*[n]; | |
| 497 nsub_ = n; | |
| 498 } | |
| 499 | |
| 500 // Add Rune to LiteralString | |
| 501 void AddRuneToString(Rune r); | |
| 502 | |
| 503 // Swaps this with that, in place. | |
| 504 void Swap(Regexp *that); | |
| 505 | |
| 506 // Operator. See description of operators above. | |
| 507 // uint8 instead of RegexpOp to control space usage. | |
| 508 uint8 op_; | |
| 509 | |
| 510 // Is this regexp structure already simple | |
| 511 // (has it been returned by Simplify)? | |
| 512 // uint8 instead of bool to control space usage. | |
| 513 uint8 simple_; | |
| 514 | |
| 515 // Flags saved from parsing and used during execution. | |
| 516 // (Only FoldCase is used.) | |
| 517 // uint16 instead of ParseFlags to control space usage. | |
| 518 uint16 parse_flags_; | |
| 519 | |
| 520 // Reference count. Exists so that SimplifyRegexp can build | |
| 521 // regexp structures that are dags rather than trees to avoid | |
| 522 // exponential blowup in space requirements. | |
| 523 // uint16 to control space usage. | |
| 524 // The standard regexp routines will never generate a | |
| 525 // ref greater than the maximum repeat count (100), | |
| 526 // but even so, Incref and Decref consult an overflow map | |
| 527 // when ref_ reaches kMaxRef. | |
| 528 uint16 ref_; | |
| 529 static const uint16 kMaxRef = 0xffff; | |
| 530 | |
| 531 // Subexpressions. | |
| 532 // uint16 to control space usage. | |
| 533 // Concat and Alternate handle larger numbers of subexpressions | |
| 534 // by building concatenation or alternation trees. | |
| 535 // Other routines should call Concat or Alternate instead of | |
| 536 // filling in sub() by hand. | |
| 537 uint16 nsub_; | |
| 538 static const uint16 kMaxNsub = 0xffff; | |
| 539 union { | |
| 540 Regexp** submany_; // if nsub_ > 1 | |
| 541 Regexp* subone_; // if nsub_ == 1 | |
| 542 }; | |
| 543 | |
| 544 // Extra space for parse and teardown stacks. | |
| 545 Regexp* down_; | |
| 546 | |
| 547 // Arguments to operator. See description of operators above. | |
| 548 union { | |
| 549 struct { // Repeat | |
| 550 int max_; | |
| 551 int min_; | |
| 552 }; | |
| 553 struct { // Capture | |
| 554 int cap_; | |
| 555 string* name_; | |
| 556 }; | |
| 557 struct { // LiteralString | |
| 558 int nrunes_; | |
| 559 Rune* runes_; | |
| 560 }; | |
| 561 struct { // CharClass | |
| 562 // These two could be in separate union members, | |
| 563 // but it wouldn't save any space (there are other two-word structs) | |
| 564 // and keeping them separate avoids confusion during parsing. | |
| 565 CharClass* cc_; | |
| 566 CharClassBuilder* ccb_; | |
| 567 }; | |
| 568 Rune rune_; // Literal | |
| 569 int match_id_; // HaveMatch | |
| 570 void *the_union_[2]; // as big as any other element, for memset | |
| 571 }; | |
| 572 | |
| 573 DISALLOW_COPY_AND_ASSIGN(Regexp); | |
| 574 }; | |
| 575 | |
| 576 // Character class set: contains non-overlapping, non-abutting RuneRanges. | |
| 577 typedef set<RuneRange, RuneRangeLess> RuneRangeSet; | |
| 578 | |
| 579 class CharClassBuilder { | |
| 580 public: | |
| 581 CharClassBuilder(); | |
| 582 | |
| 583 typedef RuneRangeSet::iterator iterator; | |
| 584 iterator begin() { return ranges_.begin(); } | |
| 585 iterator end() { return ranges_.end(); } | |
| 586 | |
| 587 int size() { return nrunes_; } | |
| 588 bool empty() { return nrunes_ == 0; } | |
| 589 bool full() { return nrunes_ == Runemax+1; } | |
| 590 | |
| 591 bool Contains(Rune r); | |
| 592 bool FoldsASCII(); | |
| 593 bool AddRange(Rune lo, Rune hi); // returns whether class changed | |
| 594 CharClassBuilder* Copy(); | |
| 595 void AddCharClass(CharClassBuilder* cc); | |
| 596 void Negate(); | |
| 597 void RemoveAbove(Rune r); | |
| 598 CharClass* GetCharClass(); | |
| 599 void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); | |
| 600 | |
| 601 private: | |
| 602 static const uint32 AlphaMask = (1<<26) - 1; | |
| 603 uint32 upper_; // bitmap of A-Z | |
| 604 uint32 lower_; // bitmap of a-z | |
| 605 int nrunes_; | |
| 606 RuneRangeSet ranges_; | |
| 607 DISALLOW_COPY_AND_ASSIGN(CharClassBuilder); | |
| 608 }; | |
| 609 | |
| 610 // Tell g++ that bitwise ops on ParseFlags produce ParseFlags. | |
| 611 inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) | |
| 612 { | |
| 613 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(
b)); | |
| 614 } | |
| 615 | |
| 616 inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) | |
| 617 { | |
| 618 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(
b)); | |
| 619 } | |
| 620 | |
| 621 inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) | |
| 622 { | |
| 623 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(
b)); | |
| 624 } | |
| 625 | |
| 626 inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) | |
| 627 { | |
| 628 return static_cast<Regexp::ParseFlags>(~static_cast<int>(a)); | |
| 629 } | |
| 630 | |
| 631 | |
| 632 | |
| 633 } // namespace re2 | |
| 634 | |
| 635 #endif // RE2_REGEXP_H__ | |
| OLD | NEW |