OLD | NEW |
| (Empty) |
1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // --- SPONSORED LINK -------------------------------------------------- | |
6 // If you want to use this library for regular expression matching, | |
7 // you should use re2/re2.h, which provides a class RE2 that | |
8 // mimics the PCRE interface provided by PCRE's C++ wrappers. | |
9 // This header describes the low-level interface used to implement RE2 | |
10 // and may change in backwards-incompatible ways from time to time. | |
11 // In contrast, RE2's interface will not. | |
12 // --------------------------------------------------------------------- | |
13 | |
14 // Regular expression library: parsing, execution, and manipulation | |
15 // of regular expressions. | |
16 // | |
17 // Any operation that traverses the Regexp structures should be written | |
18 // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nest
ed | |
19 // regular expressions such as x++++++++++++++++++++... might cause recursive | |
20 // traversals to overflow the stack. | |
21 // | |
22 // It is the caller's responsibility to provide appropriate mutual exclusion | |
23 // around manipulation of the regexps. RE2 does this. | |
24 // | |
25 // PARSING | |
26 // | |
27 // Regexp::Parse parses regular expressions encoded in UTF-8. | |
28 // The default syntax is POSIX extended regular expressions, | |
29 // with the following changes: | |
30 // | |
31 // 1. Backreferences (optional in POSIX EREs) are not supported. | |
32 // (Supporting them precludes the use of DFA-based | |
33 // matching engines.) | |
34 // | |
35 // 2. Collating elements and collation classes are not supported. | |
36 // (No one has needed or wanted them.) | |
37 // | |
38 // The exact syntax accepted can be modified by passing flags to | |
39 // Regexp::Parse. In particular, many of the basic Perl additions | |
40 // are available. The flags are documented below (search for LikePerl). | |
41 // | |
42 // If parsed with the flag Regexp::Latin1, both the regular expression | |
43 // and the input to the matching routines are assumed to be encoded in | |
44 // Latin-1, not UTF-8. | |
45 // | |
46 // EXECUTION | |
47 // | |
48 // Once Regexp has parsed a regular expression, it provides methods | |
49 // to search text using that regular expression. These methods are | |
50 // implemented via calling out to other regular expression libraries. | |
51 // (Let's call them the sublibraries.) | |
52 // | |
53 // To call a sublibrary, Regexp does not simply prepare a | |
54 // string version of the regular expression and hand it to the | |
55 // sublibrary. Instead, Regexp prepares, from its own parsed form, the | |
56 // corresponding internal representation used by the sublibrary. | |
57 // This has the drawback of needing to know the internal representation | |
58 // used by the sublibrary, but it has two important benefits: | |
59 // | |
60 // 1. The syntax and meaning of regular expressions is guaranteed | |
61 // to be that used by Regexp's parser, not the syntax expected | |
62 // by the sublibrary. Regexp might accept a restricted or | |
63 // expanded syntax for regular expressions as compared with | |
64 // the sublibrary. As long as Regexp can translate from its | |
65 // internal form into the sublibrary's, clients need not know | |
66 // exactly which sublibrary they are using. | |
67 // | |
68 // 2. The sublibrary parsers are bypassed. For whatever reason, | |
69 // sublibrary regular expression parsers often have security | |
70 // problems. For example, plan9grep's regular expression parser | |
71 // has a buffer overflow in its handling of large character | |
72 // classes, and PCRE's parser has had buffer overflow problems | |
73 // in the past. Security-team requires sandboxing of sublibrary | |
74 // regular expression parsers. Avoiding the sublibrary parsers | |
75 // avoids the sandbox. | |
76 // | |
77 // The execution methods we use now are provided by the compiled form, | |
78 // Prog, described in prog.h | |
79 // | |
80 // MANIPULATION | |
81 // | |
82 // Unlike other regular expression libraries, Regexp makes its parsed | |
83 // form accessible to clients, so that client code can analyze the | |
84 // parsed regular expressions. | |
85 | |
86 #ifndef RE2_REGEXP_H__ | |
87 #define RE2_REGEXP_H__ | |
88 | |
89 #include "util/util.h" | |
90 #include "re2/stringpiece.h" | |
91 | |
92 namespace re2 { | |
93 | |
94 // Keep in sync with string list kOpcodeNames[] in testing/dump.cc | |
95 enum RegexpOp { | |
96 // Matches no strings. | |
97 kRegexpNoMatch = 1, | |
98 | |
99 // Matches empty string. | |
100 kRegexpEmptyMatch, | |
101 | |
102 // Matches rune_. | |
103 kRegexpLiteral, | |
104 | |
105 // Matches runes_. | |
106 kRegexpLiteralString, | |
107 | |
108 // Matches concatenation of sub_[0..nsub-1]. | |
109 kRegexpConcat, | |
110 // Matches union of sub_[0..nsub-1]. | |
111 kRegexpAlternate, | |
112 | |
113 // Matches sub_[0] zero or more times. | |
114 kRegexpStar, | |
115 // Matches sub_[0] one or more times. | |
116 kRegexpPlus, | |
117 // Matches sub_[0] zero or one times. | |
118 kRegexpQuest, | |
119 | |
120 // Matches sub_[0] at least min_ times, at most max_ times. | |
121 // max_ == -1 means no upper limit. | |
122 kRegexpRepeat, | |
123 | |
124 // Parenthesized (capturing) subexpression. Index is cap_. | |
125 // Optionally, capturing name is name_. | |
126 kRegexpCapture, | |
127 | |
128 // Matches any character. | |
129 kRegexpAnyChar, | |
130 | |
131 // Matches any byte [sic]. | |
132 kRegexpAnyByte, | |
133 | |
134 // Matches empty string at beginning of line. | |
135 kRegexpBeginLine, | |
136 // Matches empty string at end of line. | |
137 kRegexpEndLine, | |
138 | |
139 // Matches word boundary "\b". | |
140 kRegexpWordBoundary, | |
141 // Matches not-a-word boundary "\B". | |
142 kRegexpNoWordBoundary, | |
143 | |
144 // Matches empty string at beginning of text. | |
145 kRegexpBeginText, | |
146 // Matches empty string at end of text. | |
147 kRegexpEndText, | |
148 | |
149 // Matches character class given by cc_. | |
150 kRegexpCharClass, | |
151 | |
152 // Forces match of entire expression right now, | |
153 // with match ID match_id_ (used by RE2::Set). | |
154 kRegexpHaveMatch, | |
155 | |
156 kMaxRegexpOp = kRegexpHaveMatch, | |
157 }; | |
158 | |
159 // Keep in sync with string list in regexp.cc | |
160 enum RegexpStatusCode { | |
161 // No error | |
162 kRegexpSuccess = 0, | |
163 | |
164 // Unexpected error | |
165 kRegexpInternalError, | |
166 | |
167 // Parse errors | |
168 kRegexpBadEscape, // bad escape sequence | |
169 kRegexpBadCharClass, // bad character class | |
170 kRegexpBadCharRange, // bad character class range | |
171 kRegexpMissingBracket, // missing closing ] | |
172 kRegexpMissingParen, // missing closing ) | |
173 kRegexpTrailingBackslash, // at end of regexp | |
174 kRegexpRepeatArgument, // repeat argument missing, e.g. "*" | |
175 kRegexpRepeatSize, // bad repetition argument | |
176 kRegexpRepeatOp, // bad repetition operator | |
177 kRegexpBadPerlOp, // bad perl operator | |
178 kRegexpBadUTF8, // invalid UTF-8 in regexp | |
179 kRegexpBadNamedCapture, // bad named capture | |
180 }; | |
181 | |
182 // Error status for certain operations. | |
183 class RegexpStatus { | |
184 public: | |
185 RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} | |
186 ~RegexpStatus() { delete tmp_; } | |
187 | |
188 void set_code(enum RegexpStatusCode code) { code_ = code; } | |
189 void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } | |
190 void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; } | |
191 enum RegexpStatusCode code() const { return code_; } | |
192 const StringPiece& error_arg() const { return error_arg_; } | |
193 bool ok() const { return code() == kRegexpSuccess; } | |
194 | |
195 // Copies state from status. | |
196 void Copy(const RegexpStatus& status); | |
197 | |
198 // Returns text equivalent of code, e.g.: | |
199 // "Bad character class" | |
200 static string CodeText(enum RegexpStatusCode code); | |
201 | |
202 // Returns text describing error, e.g.: | |
203 // "Bad character class: [z-a]" | |
204 string Text() const; | |
205 | |
206 private: | |
207 enum RegexpStatusCode code_; // Kind of error | |
208 StringPiece error_arg_; // Piece of regexp containing syntax error. | |
209 string* tmp_; // Temporary storage, possibly where error_arg_
is. | |
210 | |
211 DISALLOW_COPY_AND_ASSIGN(RegexpStatus); | |
212 }; | |
213 | |
214 // Walkers to implement Simplify. | |
215 class CoalesceWalker; | |
216 class SimplifyWalker; | |
217 | |
218 // Compiled form; see prog.h | |
219 class Prog; | |
220 | |
221 struct RuneRange { | |
222 RuneRange() : lo(0), hi(0) { } | |
223 RuneRange(int l, int h) : lo(l), hi(h) { } | |
224 Rune lo; | |
225 Rune hi; | |
226 }; | |
227 | |
228 // Less-than on RuneRanges treats a == b if they overlap at all. | |
229 // This lets us look in a set to find the range covering a particular Rune. | |
230 struct RuneRangeLess { | |
231 bool operator()(const RuneRange& a, const RuneRange& b) const { | |
232 return a.hi < b.lo; | |
233 } | |
234 }; | |
235 | |
236 class CharClassBuilder; | |
237 | |
238 class CharClass { | |
239 public: | |
240 void Delete(); | |
241 | |
242 typedef RuneRange* iterator; | |
243 iterator begin() { return ranges_; } | |
244 iterator end() { return ranges_ + nranges_; } | |
245 | |
246 int size() { return nrunes_; } | |
247 bool empty() { return nrunes_ == 0; } | |
248 bool full() { return nrunes_ == Runemax+1; } | |
249 bool FoldsASCII() { return folds_ascii_; } | |
250 | |
251 bool Contains(Rune r); | |
252 CharClass* Negate(); | |
253 | |
254 private: | |
255 CharClass(); // not implemented | |
256 ~CharClass(); // not implemented | |
257 static CharClass* New(int maxranges); | |
258 | |
259 friend class CharClassBuilder; | |
260 | |
261 bool folds_ascii_; | |
262 int nrunes_; | |
263 RuneRange *ranges_; | |
264 int nranges_; | |
265 DISALLOW_COPY_AND_ASSIGN(CharClass); | |
266 }; | |
267 | |
268 class Regexp { | |
269 public: | |
270 | |
271 // Flags for parsing. Can be ORed together. | |
272 enum ParseFlags { | |
273 NoParseFlags = 0, | |
274 FoldCase = 1<<0, // Fold case during matching (case-insensitive). | |
275 Literal = 1<<1, // Treat s as literal string instead of a regexp. | |
276 ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s | |
277 // and [[:space:]] to match newline. | |
278 DotNL = 1<<3, // Allow . to match newline. | |
279 MatchNL = ClassNL | DotNL, | |
280 OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and | |
281 // end of text, not around embedded newlines. | |
282 // (Perl's default) | |
283 Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. | |
284 NonGreedy = 1<<6, // Repetition operators are non-greedy by default. | |
285 PerlClasses = 1<<7, // Allow Perl character classes like \d. | |
286 PerlB = 1<<8, // Allow Perl's \b and \B. | |
287 PerlX = 1<<9, // Perl extensions: | |
288 // non-capturing parens - (?: ) | |
289 // non-greedy operators - *? +? ?? {}? | |
290 // flag edits - (?i) (?-i) (?i: ) | |
291 // i - FoldCase | |
292 // m - !OneLine | |
293 // s - DotNL | |
294 // U - NonGreedy | |
295 // line ends: \A \z | |
296 // \Q and \E to disable/enable metacharacters | |
297 // (?P<name>expr) for named captures | |
298 // \C to match any single byte | |
299 UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group | |
300 // and \P{Han} for its negation. | |
301 NeverNL = 1<<11, // Never match NL, even if the regexp mentions | |
302 // it explicitly. | |
303 NeverCapture = 1<<12, // Parse all parens as non-capturing. | |
304 | |
305 // As close to Perl as we can get. | |
306 LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | | |
307 UnicodeGroups, | |
308 | |
309 // Internal use only. | |
310 WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text | |
311 }; | |
312 | |
313 // Get. No set, Regexps are logically immutable once created. | |
314 RegexpOp op() { return static_cast<RegexpOp>(op_); } | |
315 int nsub() { return nsub_; } | |
316 bool simple() { return simple_ != 0; } | |
317 enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_);
} | |
318 int Ref(); // For testing. | |
319 | |
320 Regexp** sub() { | |
321 if(nsub_ <= 1) | |
322 return &subone_; | |
323 else | |
324 return submany_; | |
325 } | |
326 | |
327 int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; } | |
328 int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; } | |
329 Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } | |
330 CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } | |
331 int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } | |
332 const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } | |
333 Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } | |
334 int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } | |
335 int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } | |
336 | |
337 // Increments reference count, returns object as convenience. | |
338 Regexp* Incref(); | |
339 | |
340 // Decrements reference count and deletes this object if count reaches 0. | |
341 void Decref(); | |
342 | |
343 // Parses string s to produce regular expression, returned. | |
344 // Caller must release return value with re->Decref(). | |
345 // On failure, sets *status (if status != NULL) and returns NULL. | |
346 static Regexp* Parse(const StringPiece& s, ParseFlags flags, | |
347 RegexpStatus* status); | |
348 | |
349 // Returns a _new_ simplified version of the current regexp. | |
350 // Does not edit the current regexp. | |
351 // Caller must release return value with re->Decref(). | |
352 // Simplified means that counted repetition has been rewritten | |
353 // into simpler terms and all Perl/POSIX features have been | |
354 // removed. The result will capture exactly the same | |
355 // subexpressions the original did, unless formatted with ToString. | |
356 Regexp* Simplify(); | |
357 friend class CoalesceWalker; | |
358 friend class SimplifyWalker; | |
359 | |
360 // Parses the regexp src and then simplifies it and sets *dst to the | |
361 // string representation of the simplified form. Returns true on success. | |
362 // Returns false and sets *status (if status != NULL) on parse error. | |
363 static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, | |
364 string* dst, | |
365 RegexpStatus* status); | |
366 | |
367 // Returns the number of capturing groups in the regexp. | |
368 int NumCaptures(); | |
369 friend class NumCapturesWalker; | |
370 | |
371 // Returns a map from names to capturing group indices, | |
372 // or NULL if the regexp contains no named capture groups. | |
373 // The caller is responsible for deleting the map. | |
374 map<string, int>* NamedCaptures(); | |
375 | |
376 // Returns a map from capturing group indices to capturing group | |
377 // names or NULL if the regexp contains no named capture groups. The | |
378 // caller is responsible for deleting the map. | |
379 map<int, string>* CaptureNames(); | |
380 | |
381 // Returns a string representation of the current regexp, | |
382 // using as few parentheses as possible. | |
383 string ToString(); | |
384 | |
385 // Convenience functions. They consume the passed reference, | |
386 // so in many cases you should use, e.g., Plus(re->Incref(), flags). | |
387 // They do not consume allocated arrays like subs or runes. | |
388 static Regexp* Plus(Regexp* sub, ParseFlags flags); | |
389 static Regexp* Star(Regexp* sub, ParseFlags flags); | |
390 static Regexp* Quest(Regexp* sub, ParseFlags flags); | |
391 static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); | |
392 static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); | |
393 static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); | |
394 static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); | |
395 static Regexp* NewLiteral(Rune rune, ParseFlags flags); | |
396 static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); | |
397 static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); | |
398 static Regexp* HaveMatch(int match_id, ParseFlags flags); | |
399 | |
400 // Like Alternate but does not factor out common prefixes. | |
401 static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); | |
402 | |
403 // Debugging function. Returns string format for regexp | |
404 // that makes structure clear. Does NOT use regexp syntax. | |
405 string Dump(); | |
406 | |
407 // Helper traversal class, defined fully in walker-inl.h. | |
408 template<typename T> class Walker; | |
409 | |
410 // Compile to Prog. See prog.h | |
411 // Reverse prog expects to be run over text backward. | |
412 // Construction and execution of prog will | |
413 // stay within approximately max_mem bytes of memory. | |
414 // If max_mem <= 0, a reasonable default is used. | |
415 Prog* CompileToProg(int64 max_mem); | |
416 Prog* CompileToReverseProg(int64 max_mem); | |
417 | |
418 // Whether to expect this library to find exactly the same answer as PCRE | |
419 // when running this regexp. Most regexps do mimic PCRE exactly, but a few | |
420 // obscure cases behave differently. Technically this is more a property | |
421 // of the Prog than the Regexp, but the computation is much easier to do | |
422 // on the Regexp. See mimics_pcre.cc for the exact conditions. | |
423 bool MimicsPCRE(); | |
424 | |
425 // Benchmarking function. | |
426 void NullWalk(); | |
427 | |
428 // Whether every match of this regexp must be anchored and | |
429 // begin with a non-empty fixed string (perhaps after ASCII | |
430 // case-folding). If so, returns the prefix and the sub-regexp that | |
431 // follows it. | |
432 bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix); | |
433 | |
434 private: | |
435 // Constructor allocates vectors as appropriate for operator. | |
436 explicit Regexp(RegexpOp op, ParseFlags parse_flags); | |
437 | |
438 // Use Decref() instead of delete to release Regexps. | |
439 // This is private to catch deletes at compile time. | |
440 ~Regexp(); | |
441 void Destroy(); | |
442 bool QuickDestroy(); | |
443 | |
444 // Helpers for Parse. Listed here so they can edit Regexps. | |
445 class ParseState; | |
446 friend class ParseState; | |
447 friend bool ParseCharClass(StringPiece* s, Regexp** out_re, | |
448 RegexpStatus* status); | |
449 | |
450 // Helper for testing [sic]. | |
451 friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); | |
452 | |
453 // Computes whether Regexp is already simple. | |
454 bool ComputeSimple(); | |
455 | |
456 // Constructor that generates a concatenation or alternation, | |
457 // enforcing the limit on the number of subexpressions for | |
458 // a particular Regexp. | |
459 static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, | |
460 ParseFlags flags, bool can_factor); | |
461 | |
462 // Returns the leading string that re starts with. | |
463 // The returned Rune* points into a piece of re, | |
464 // so it must not be used after the caller calls re->Decref(). | |
465 static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); | |
466 | |
467 // Removes the first n leading runes from the beginning of re. | |
468 // Edits re in place. | |
469 static void RemoveLeadingString(Regexp* re, int n); | |
470 | |
471 // Returns the leading regexp in re's top-level concatenation. | |
472 // The returned Regexp* points at re or a sub-expression of re, | |
473 // so it must not be used after the caller calls re->Decref(). | |
474 static Regexp* LeadingRegexp(Regexp* re); | |
475 | |
476 // Removes LeadingRegexp(re) from re and returns the remainder. | |
477 // Might edit re in place. | |
478 static Regexp* RemoveLeadingRegexp(Regexp* re); | |
479 | |
480 // Simplifies an alternation of literal strings by factoring out | |
481 // common prefixes. | |
482 static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); | |
483 static int FactorAlternationRecursive(Regexp** sub, int nsub, | |
484 ParseFlags flags, int maxdepth); | |
485 | |
486 // Is a == b? Only efficient on regexps that have not been through | |
487 // Simplify yet - the expansion of a kRegexpRepeat will make this | |
488 // take a long time. Do not call on such regexps, hence private. | |
489 static bool Equal(Regexp* a, Regexp* b); | |
490 | |
491 // Allocate space for n sub-regexps. | |
492 void AllocSub(int n) { | |
493 if (n < 0 || static_cast<uint16>(n) != n) | |
494 LOG(FATAL) << "Cannot AllocSub " << n; | |
495 if (n > 1) | |
496 submany_ = new Regexp*[n]; | |
497 nsub_ = n; | |
498 } | |
499 | |
500 // Add Rune to LiteralString | |
501 void AddRuneToString(Rune r); | |
502 | |
503 // Swaps this with that, in place. | |
504 void Swap(Regexp *that); | |
505 | |
506 // Operator. See description of operators above. | |
507 // uint8 instead of RegexpOp to control space usage. | |
508 uint8 op_; | |
509 | |
510 // Is this regexp structure already simple | |
511 // (has it been returned by Simplify)? | |
512 // uint8 instead of bool to control space usage. | |
513 uint8 simple_; | |
514 | |
515 // Flags saved from parsing and used during execution. | |
516 // (Only FoldCase is used.) | |
517 // uint16 instead of ParseFlags to control space usage. | |
518 uint16 parse_flags_; | |
519 | |
520 // Reference count. Exists so that SimplifyRegexp can build | |
521 // regexp structures that are dags rather than trees to avoid | |
522 // exponential blowup in space requirements. | |
523 // uint16 to control space usage. | |
524 // The standard regexp routines will never generate a | |
525 // ref greater than the maximum repeat count (100), | |
526 // but even so, Incref and Decref consult an overflow map | |
527 // when ref_ reaches kMaxRef. | |
528 uint16 ref_; | |
529 static const uint16 kMaxRef = 0xffff; | |
530 | |
531 // Subexpressions. | |
532 // uint16 to control space usage. | |
533 // Concat and Alternate handle larger numbers of subexpressions | |
534 // by building concatenation or alternation trees. | |
535 // Other routines should call Concat or Alternate instead of | |
536 // filling in sub() by hand. | |
537 uint16 nsub_; | |
538 static const uint16 kMaxNsub = 0xffff; | |
539 union { | |
540 Regexp** submany_; // if nsub_ > 1 | |
541 Regexp* subone_; // if nsub_ == 1 | |
542 }; | |
543 | |
544 // Extra space for parse and teardown stacks. | |
545 Regexp* down_; | |
546 | |
547 // Arguments to operator. See description of operators above. | |
548 union { | |
549 struct { // Repeat | |
550 int max_; | |
551 int min_; | |
552 }; | |
553 struct { // Capture | |
554 int cap_; | |
555 string* name_; | |
556 }; | |
557 struct { // LiteralString | |
558 int nrunes_; | |
559 Rune* runes_; | |
560 }; | |
561 struct { // CharClass | |
562 // These two could be in separate union members, | |
563 // but it wouldn't save any space (there are other two-word structs) | |
564 // and keeping them separate avoids confusion during parsing. | |
565 CharClass* cc_; | |
566 CharClassBuilder* ccb_; | |
567 }; | |
568 Rune rune_; // Literal | |
569 int match_id_; // HaveMatch | |
570 void *the_union_[2]; // as big as any other element, for memset | |
571 }; | |
572 | |
573 DISALLOW_COPY_AND_ASSIGN(Regexp); | |
574 }; | |
575 | |
576 // Character class set: contains non-overlapping, non-abutting RuneRanges. | |
577 typedef set<RuneRange, RuneRangeLess> RuneRangeSet; | |
578 | |
579 class CharClassBuilder { | |
580 public: | |
581 CharClassBuilder(); | |
582 | |
583 typedef RuneRangeSet::iterator iterator; | |
584 iterator begin() { return ranges_.begin(); } | |
585 iterator end() { return ranges_.end(); } | |
586 | |
587 int size() { return nrunes_; } | |
588 bool empty() { return nrunes_ == 0; } | |
589 bool full() { return nrunes_ == Runemax+1; } | |
590 | |
591 bool Contains(Rune r); | |
592 bool FoldsASCII(); | |
593 bool AddRange(Rune lo, Rune hi); // returns whether class changed | |
594 CharClassBuilder* Copy(); | |
595 void AddCharClass(CharClassBuilder* cc); | |
596 void Negate(); | |
597 void RemoveAbove(Rune r); | |
598 CharClass* GetCharClass(); | |
599 void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); | |
600 | |
601 private: | |
602 static const uint32 AlphaMask = (1<<26) - 1; | |
603 uint32 upper_; // bitmap of A-Z | |
604 uint32 lower_; // bitmap of a-z | |
605 int nrunes_; | |
606 RuneRangeSet ranges_; | |
607 DISALLOW_COPY_AND_ASSIGN(CharClassBuilder); | |
608 }; | |
609 | |
610 // Tell g++ that bitwise ops on ParseFlags produce ParseFlags. | |
611 inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) | |
612 { | |
613 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(
b)); | |
614 } | |
615 | |
616 inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) | |
617 { | |
618 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(
b)); | |
619 } | |
620 | |
621 inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) | |
622 { | |
623 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(
b)); | |
624 } | |
625 | |
626 inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) | |
627 { | |
628 return static_cast<Regexp::ParseFlags>(~static_cast<int>(a)); | |
629 } | |
630 | |
631 | |
632 | |
633 } // namespace re2 | |
634 | |
635 #endif // RE2_REGEXP_H__ | |
OLD | NEW |