OLD | NEW |
| (Empty) |
1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 #ifndef RE2_RE2_H | |
6 #define RE2_RE2_H | |
7 | |
8 // C++ interface to the re2 regular-expression library. | |
9 // RE2 supports Perl-style regular expressions (with extensions like | |
10 // \d, \w, \s, ...). | |
11 // | |
12 // ----------------------------------------------------------------------- | |
13 // REGEXP SYNTAX: | |
14 // | |
15 // This module uses the re2 library and hence supports | |
16 // its syntax for regular expressions, which is similar to Perl's with | |
17 // some of the more complicated things thrown away. In particular, | |
18 // backreferences and generalized assertions are not available, nor is \Z. | |
19 // | |
20 // See https://github.com/google/re2/wiki/Syntax for the syntax | |
21 // supported by RE2, and a comparison with PCRE and PERL regexps. | |
22 // | |
23 // For those not familiar with Perl's regular expressions, | |
24 // here are some examples of the most commonly used extensions: | |
25 // | |
26 // "hello (\\w+) world" -- \w matches a "word" character | |
27 // "version (\\d+)" -- \d matches a digit | |
28 // "hello\\s+world" -- \s matches any whitespace character | |
29 // "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary | |
30 // "(?i)hello" -- (?i) turns on case-insensitive matching | |
31 // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible | |
32 // | |
33 // ----------------------------------------------------------------------- | |
34 // MATCHING INTERFACE: | |
35 // | |
36 // The "FullMatch" operation checks that supplied text matches a | |
37 // supplied pattern exactly. | |
38 // | |
39 // Example: successful match | |
40 // CHECK(RE2::FullMatch("hello", "h.*o")); | |
41 // | |
42 // Example: unsuccessful match (requires full match): | |
43 // CHECK(!RE2::FullMatch("hello", "e")); | |
44 // | |
45 // ----------------------------------------------------------------------- | |
46 // UTF-8 AND THE MATCHING INTERFACE: | |
47 // | |
48 // By default, the pattern and input text are interpreted as UTF-8. | |
49 // The RE2::Latin1 option causes them to be interpreted as Latin-1. | |
50 // | |
51 // Example: | |
52 // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern))); | |
53 // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); | |
54 // | |
55 // ----------------------------------------------------------------------- | |
56 // MATCHING WITH SUB-STRING EXTRACTION: | |
57 // | |
58 // You can supply extra pointer arguments to extract matched subpieces. | |
59 // | |
60 // Example: extracts "ruby" into "s" and 1234 into "i" | |
61 // int i; | |
62 // string s; | |
63 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); | |
64 // | |
65 // Example: fails because string cannot be stored in integer | |
66 // CHECK(!RE2::FullMatch("ruby", "(.*)", &i)); | |
67 // | |
68 // Example: fails because there aren't enough sub-patterns: | |
69 // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s)); | |
70 // | |
71 // Example: does not try to extract any extra sub-patterns | |
72 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); | |
73 // | |
74 // Example: does not try to extract into NULL | |
75 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); | |
76 // | |
77 // Example: integer overflow causes failure | |
78 // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); | |
79 // | |
80 // NOTE(rsc): Asking for substrings slows successful matches quite a bit. | |
81 // This may get a little faster in the future, but right now is slower | |
82 // than PCRE. On the other hand, failed matches run *very* fast (faster | |
83 // than PCRE), as do matches without substring extraction. | |
84 // | |
85 // ----------------------------------------------------------------------- | |
86 // PARTIAL MATCHES | |
87 // | |
88 // You can use the "PartialMatch" operation when you want the pattern | |
89 // to match any substring of the text. | |
90 // | |
91 // Example: simple search for a string: | |
92 // CHECK(RE2::PartialMatch("hello", "ell")); | |
93 // | |
94 // Example: find first number in a string | |
95 // int number; | |
96 // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number)); | |
97 // CHECK_EQ(number, 100); | |
98 // | |
99 // ----------------------------------------------------------------------- | |
100 // PRE-COMPILED REGULAR EXPRESSIONS | |
101 // | |
102 // RE2 makes it easy to use any string as a regular expression, without | |
103 // requiring a separate compilation step. | |
104 // | |
105 // If speed is of the essence, you can create a pre-compiled "RE2" | |
106 // object from the pattern and use it multiple times. If you do so, | |
107 // you can typically parse text faster than with sscanf. | |
108 // | |
109 // Example: precompile pattern for faster matching: | |
110 // RE2 pattern("h.*o"); | |
111 // while (ReadLine(&str)) { | |
112 // if (RE2::FullMatch(str, pattern)) ...; | |
113 // } | |
114 // | |
115 // ----------------------------------------------------------------------- | |
116 // SCANNING TEXT INCREMENTALLY | |
117 // | |
118 // The "Consume" operation may be useful if you want to repeatedly | |
119 // match regular expressions at the front of a string and skip over | |
120 // them as they match. This requires use of the "StringPiece" type, | |
121 // which represents a sub-range of a real string. | |
122 // | |
123 // Example: read lines of the form "var = value" from a string. | |
124 // string contents = ...; // Fill string somehow | |
125 // StringPiece input(contents); // Wrap a StringPiece around it | |
126 // | |
127 // string var; | |
128 // int value; | |
129 // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { | |
130 // ...; | |
131 // } | |
132 // | |
133 // Each successful call to "Consume" will set "var/value", and also | |
134 // advance "input" so it points past the matched text. Note that if the | |
135 // regular expression matches an empty string, input will advance | |
136 // by 0 bytes. If the regular expression being used might match | |
137 // an empty string, the loop body must check for this case and either | |
138 // advance the string or break out of the loop. | |
139 // | |
140 // The "FindAndConsume" operation is similar to "Consume" but does not | |
141 // anchor your match at the beginning of the string. For example, you | |
142 // could extract all words from a string by repeatedly calling | |
143 // RE2::FindAndConsume(&input, "(\\w+)", &word) | |
144 // | |
145 // ----------------------------------------------------------------------- | |
146 // USING VARIABLE NUMBER OF ARGUMENTS | |
147 // | |
148 // The above operations require you to know the number of arguments | |
149 // when you write the code. This is not always possible or easy (for | |
150 // example, the regular expression may be calculated at run time). | |
151 // You can use the "N" version of the operations when the number of | |
152 // match arguments are determined at run time. | |
153 // | |
154 // Example: | |
155 // const RE2::Arg* args[10]; | |
156 // int n; | |
157 // // ... populate args with pointers to RE2::Arg values ... | |
158 // // ... set n to the number of RE2::Arg objects ... | |
159 // bool match = RE2::FullMatchN(input, pattern, args, n); | |
160 // | |
161 // The last statement is equivalent to | |
162 // | |
163 // bool match = RE2::FullMatch(input, pattern, | |
164 // *args[0], *args[1], ..., *args[n - 1]); | |
165 // | |
166 // ----------------------------------------------------------------------- | |
167 // PARSING HEX/OCTAL/C-RADIX NUMBERS | |
168 // | |
169 // By default, if you pass a pointer to a numeric value, the | |
170 // corresponding text is interpreted as a base-10 number. You can | |
171 // instead wrap the pointer with a call to one of the operators Hex(), | |
172 // Octal(), or CRadix() to interpret the text in another base. The | |
173 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) | |
174 // prefixes, but defaults to base-10. | |
175 // | |
176 // Example: | |
177 // int a, b, c, d; | |
178 // CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", | |
179 // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)); | |
180 // will leave 64 in a, b, c, and d. | |
181 | |
182 #include <stdint.h> | |
183 #include <map> | |
184 #include <string> | |
185 #include "re2/stringpiece.h" | |
186 #include "re2/variadic_function.h" | |
187 | |
188 #ifndef RE2_HAVE_LONGLONG | |
189 #define RE2_HAVE_LONGLONG 1 | |
190 #endif | |
191 | |
192 namespace re2 { | |
193 | |
194 using std::string; | |
195 using std::map; | |
196 class Mutex; | |
197 class Prog; | |
198 class Regexp; | |
199 | |
200 // The following enum should be used only as a constructor argument to indicate | |
201 // that the variable has static storage class, and that the constructor should | |
202 // do nothing to its state. It indicates to the reader that it is legal to | |
203 // declare a static instance of the class, provided the constructor is given | |
204 // the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a | |
205 // static variable that has a constructor or a destructor because invocation | |
206 // order is undefined. However, IF the type can be initialized by filling with | |
207 // zeroes (which the loader does for static variables), AND the type's | |
208 // destructor does nothing to the storage, then a constructor for static | |
209 // initialization can be declared as | |
210 // explicit MyClass(LinkerInitialized x) {} | |
211 // and invoked as | |
212 // static MyClass my_variable_name(LINKER_INITIALIZED); | |
213 enum LinkerInitialized { LINKER_INITIALIZED }; | |
214 | |
215 // Interface for regular expression matching. Also corresponds to a | |
216 // pre-compiled regular expression. An "RE2" object is safe for | |
217 // concurrent use by multiple threads. | |
218 class RE2 { | |
219 public: | |
220 // We convert user-passed pointers into special Arg objects | |
221 class Arg; | |
222 class Options; | |
223 | |
224 // Defined in set.h. | |
225 class Set; | |
226 | |
227 enum ErrorCode { | |
228 NoError = 0, | |
229 | |
230 // Unexpected error | |
231 ErrorInternal, | |
232 | |
233 // Parse errors | |
234 ErrorBadEscape, // bad escape sequence | |
235 ErrorBadCharClass, // bad character class | |
236 ErrorBadCharRange, // bad character class range | |
237 ErrorMissingBracket, // missing closing ] | |
238 ErrorMissingParen, // missing closing ) | |
239 ErrorTrailingBackslash, // trailing \ at end of regexp | |
240 ErrorRepeatArgument, // repeat argument missing, e.g. "*" | |
241 ErrorRepeatSize, // bad repetition argument | |
242 ErrorRepeatOp, // bad repetition operator | |
243 ErrorBadPerlOp, // bad perl operator | |
244 ErrorBadUTF8, // invalid UTF-8 in regexp | |
245 ErrorBadNamedCapture, // bad named capture group | |
246 ErrorPatternTooLarge // pattern too large (compile failed) | |
247 }; | |
248 | |
249 // Predefined common options. | |
250 // If you need more complicated things, instantiate | |
251 // an Option class, possibly passing one of these to | |
252 // the Option constructor, change the settings, and pass that | |
253 // Option class to the RE2 constructor. | |
254 enum CannedOptions { | |
255 DefaultOptions = 0, | |
256 Latin1, // treat input as Latin-1 (default UTF-8) | |
257 POSIX, // POSIX syntax, leftmost-longest match | |
258 Quiet // do not log about regexp parse errors | |
259 }; | |
260 | |
261 // Need to have the const char* and const string& forms for implicit | |
262 // conversions when passing string literals to FullMatch and PartialMatch. | |
263 // Otherwise the StringPiece form would be sufficient. | |
264 #ifndef SWIG | |
265 RE2(const char* pattern); | |
266 RE2(const string& pattern); | |
267 #endif | |
268 RE2(const StringPiece& pattern); | |
269 RE2(const StringPiece& pattern, const Options& option); | |
270 ~RE2(); | |
271 | |
272 // Returns whether RE2 was created properly. | |
273 bool ok() const { return error_code() == NoError; } | |
274 | |
275 // The string specification for this RE2. E.g. | |
276 // RE2 re("ab*c?d+"); | |
277 // re.pattern(); // "ab*c?d+" | |
278 const string& pattern() const { return pattern_; } | |
279 | |
280 // If RE2 could not be created properly, returns an error string. | |
281 // Else returns the empty string. | |
282 const string& error() const { return *error_; } | |
283 | |
284 // If RE2 could not be created properly, returns an error code. | |
285 // Else returns RE2::NoError (== 0). | |
286 ErrorCode error_code() const { return error_code_; } | |
287 | |
288 // If RE2 could not be created properly, returns the offending | |
289 // portion of the regexp. | |
290 const string& error_arg() const { return error_arg_; } | |
291 | |
292 // Returns the program size, a very approximate measure of a regexp's "cost". | |
293 // Larger numbers are more expensive than smaller numbers. | |
294 int ProgramSize() const; | |
295 | |
296 // EXPERIMENTAL! SUBJECT TO CHANGE! | |
297 // Outputs the program fanout as a histogram bucketed by powers of 2. | |
298 // Returns the number of the largest non-empty bucket. | |
299 int ProgramFanout(map<int, int>* histogram) const; | |
300 | |
301 // Returns the underlying Regexp; not for general use. | |
302 // Returns entire_regexp_ so that callers don't need | |
303 // to know about prefix_ and prefix_foldcase_. | |
304 re2::Regexp* Regexp() const { return entire_regexp_; } | |
305 | |
306 /***** The useful part: the matching interface *****/ | |
307 | |
308 // Matches "text" against "pattern". If pointer arguments are | |
309 // supplied, copies matched sub-patterns into them. | |
310 // | |
311 // You can pass in a "const char*" or a "string" for "text". | |
312 // You can pass in a "const char*" or a "string" or a "RE2" for "pattern". | |
313 // | |
314 // The provided pointer arguments can be pointers to any scalar numeric | |
315 // type, or one of: | |
316 // string (matched piece is copied to string) | |
317 // StringPiece (StringPiece is mutated to point to matched piece) | |
318 // T (where "bool T::ParseFrom(const char*, int)" exists) | |
319 // (void*)NULL (the corresponding matched sub-pattern is not copied) | |
320 // | |
321 // Returns true iff all of the following conditions are satisfied: | |
322 // a. "text" matches "pattern" exactly | |
323 // b. The number of matched sub-patterns is >= number of supplied pointers | |
324 // c. The "i"th argument has a suitable type for holding the | |
325 // string captured as the "i"th sub-pattern. If you pass in | |
326 // NULL for the "i"th argument, or pass fewer arguments than | |
327 // number of sub-patterns, "i"th captured sub-pattern is | |
328 // ignored. | |
329 // | |
330 // CAVEAT: An optional sub-pattern that does not exist in the | |
331 // matched string is assigned the empty string. Therefore, the | |
332 // following will return false (because the empty string is not a | |
333 // valid number): | |
334 // int number; | |
335 // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); | |
336 static bool FullMatchN(const StringPiece& text, const RE2& re, | |
337 const Arg* const args[], int argc); | |
338 static const VariadicFunction2< | |
339 bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch; | |
340 | |
341 // Exactly like FullMatch(), except that "pattern" is allowed to match | |
342 // a substring of "text". | |
343 static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 arg
s | |
344 const Arg* const args[], int argc); | |
345 static const VariadicFunction2< | |
346 bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatc
h; | |
347 | |
348 // Like FullMatch() and PartialMatch(), except that pattern has to | |
349 // match a prefix of "text", and "input" is advanced past the matched | |
350 // text. Note: "input" is modified iff this routine returns true. | |
351 static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args | |
352 const Arg* const args[], int argc); | |
353 static const VariadicFunction2< | |
354 bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume; | |
355 | |
356 // Like Consume(..), but does not anchor the match at the beginning of the | |
357 // string. That is, "pattern" need not start its match at the beginning of | |
358 // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next | |
359 // word in "s" and stores it in "word". | |
360 static bool FindAndConsumeN(StringPiece* input, const RE2& pattern, | |
361 const Arg* const args[], int argc); | |
362 static const VariadicFunction2< | |
363 bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume; | |
364 | |
365 // Replace the first match of "pattern" in "str" with "rewrite". | |
366 // Within "rewrite", backslash-escaped digits (\1 to \9) can be | |
367 // used to insert text matching corresponding parenthesized group | |
368 // from the pattern. \0 in "rewrite" refers to the entire matching | |
369 // text. E.g., | |
370 // | |
371 // string s = "yabba dabba doo"; | |
372 // CHECK(RE2::Replace(&s, "b+", "d")); | |
373 // | |
374 // will leave "s" containing "yada dabba doo" | |
375 // | |
376 // Returns true if the pattern matches and a replacement occurs, | |
377 // false otherwise. | |
378 static bool Replace(string *str, | |
379 const RE2& pattern, | |
380 const StringPiece& rewrite); | |
381 | |
382 // Like Replace(), except replaces successive non-overlapping occurrences | |
383 // of the pattern in the string with the rewrite. E.g. | |
384 // | |
385 // string s = "yabba dabba doo"; | |
386 // CHECK(RE2::GlobalReplace(&s, "b+", "d")); | |
387 // | |
388 // will leave "s" containing "yada dada doo" | |
389 // Replacements are not subject to re-matching. | |
390 // | |
391 // Because GlobalReplace only replaces non-overlapping matches, | |
392 // replacing "ana" within "banana" makes only one replacement, not two. | |
393 // | |
394 // Returns the number of replacements made. | |
395 static int GlobalReplace(string *str, | |
396 const RE2& pattern, | |
397 const StringPiece& rewrite); | |
398 | |
399 // Like Replace, except that if the pattern matches, "rewrite" | |
400 // is copied into "out" with substitutions. The non-matching | |
401 // portions of "text" are ignored. | |
402 // | |
403 // Returns true iff a match occurred and the extraction happened | |
404 // successfully; if no match occurs, the string is left unaffected. | |
405 // | |
406 // REQUIRES: "text" must not alias any part of "*out". | |
407 static bool Extract(const StringPiece &text, | |
408 const RE2& pattern, | |
409 const StringPiece &rewrite, | |
410 string *out); | |
411 | |
412 // Escapes all potentially meaningful regexp characters in | |
413 // 'unquoted'. The returned string, used as a regular expression, | |
414 // will exactly match the original string. For example, | |
415 // 1.5-2.0? | |
416 // may become: | |
417 // 1\.5\-2\.0\? | |
418 static string QuoteMeta(const StringPiece& unquoted); | |
419 | |
420 // Computes range for any strings matching regexp. The min and max can in | |
421 // some cases be arbitrarily precise, so the caller gets to specify the | |
422 // maximum desired length of string returned. | |
423 // | |
424 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any | |
425 // string s that is an anchored match for this regexp satisfies | |
426 // min <= s && s <= max. | |
427 // | |
428 // Note that PossibleMatchRange() will only consider the first copy of an | |
429 // infinitely repeated element (i.e., any regexp element followed by a '*' or | |
430 // '+' operator). Regexps with "{N}" constructions are not affected, as those | |
431 // do not compile down to infinite repetitions. | |
432 // | |
433 // Returns true on success, false on error. | |
434 bool PossibleMatchRange(string* min, string* max, int maxlen) const; | |
435 | |
436 // Generic matching interface | |
437 | |
438 // Type of match. | |
439 enum Anchor { | |
440 UNANCHORED, // No anchoring | |
441 ANCHOR_START, // Anchor at start only | |
442 ANCHOR_BOTH // Anchor at start and end | |
443 }; | |
444 | |
445 // Return the number of capturing subpatterns, or -1 if the | |
446 // regexp wasn't valid on construction. The overall match ($0) | |
447 // does not count: if the regexp is "(a)(b)", returns 2. | |
448 int NumberOfCapturingGroups() const; | |
449 | |
450 // Return a map from names to capturing indices. | |
451 // The map records the index of the leftmost group | |
452 // with the given name. | |
453 // Only valid until the re is deleted. | |
454 const map<string, int>& NamedCapturingGroups() const; | |
455 | |
456 // Return a map from capturing indices to names. | |
457 // The map has no entries for unnamed groups. | |
458 // Only valid until the re is deleted. | |
459 const map<int, string>& CapturingGroupNames() const; | |
460 | |
461 // General matching routine. | |
462 // Match against text starting at offset startpos | |
463 // and stopping the search at offset endpos. | |
464 // Returns true if match found, false if not. | |
465 // On a successful match, fills in match[] (up to nmatch entries) | |
466 // with information about submatches. | |
467 // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, | |
468 // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar", | |
469 // match[3] = NULL, ..., up to match[nmatch-1] = NULL. | |
470 // | |
471 // Don't ask for more match information than you will use: | |
472 // runs much faster with nmatch == 1 than nmatch > 1, and | |
473 // runs even faster if nmatch == 0. | |
474 // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(), | |
475 // but will be handled correctly. | |
476 // | |
477 // Passing text == StringPiece(NULL, 0) will be handled like any other | |
478 // empty string, but note that on return, it will not be possible to tell | |
479 // whether submatch i matched the empty string or did not match: | |
480 // either way, match[i] == NULL. | |
481 bool Match(const StringPiece& text, | |
482 int startpos, | |
483 int endpos, | |
484 Anchor anchor, | |
485 StringPiece *match, | |
486 int nmatch) const; | |
487 | |
488 // Check that the given rewrite string is suitable for use with this | |
489 // regular expression. It checks that: | |
490 // * The regular expression has enough parenthesized subexpressions | |
491 // to satisfy all of the \N tokens in rewrite | |
492 // * The rewrite string doesn't have any syntax errors. E.g., | |
493 // '\' followed by anything other than a digit or '\'. | |
494 // A true return value guarantees that Replace() and Extract() won't | |
495 // fail because of a bad rewrite string. | |
496 bool CheckRewriteString(const StringPiece& rewrite, string* error) const; | |
497 | |
498 // Returns the maximum submatch needed for the rewrite to be done by | |
499 // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. | |
500 static int MaxSubmatch(const StringPiece& rewrite); | |
501 | |
502 // Append the "rewrite" string, with backslash subsitutions from "vec", | |
503 // to string "out". | |
504 // Returns true on success. This method can fail because of a malformed | |
505 // rewrite string. CheckRewriteString guarantees that the rewrite will | |
506 // be sucessful. | |
507 bool Rewrite(string *out, | |
508 const StringPiece &rewrite, | |
509 const StringPiece* vec, | |
510 int veclen) const; | |
511 | |
512 // Constructor options | |
513 class Options { | |
514 public: | |
515 // The options are (defaults in parentheses): | |
516 // | |
517 // utf8 (true) text and pattern are UTF-8; otherwise Latin-1 | |
518 // posix_syntax (false) restrict regexps to POSIX egrep syntax | |
519 // longest_match (false) search for longest match, not first match | |
520 // log_errors (true) log syntax and execution errors to ERROR | |
521 // max_mem (see below) approx. max memory footprint of RE2 | |
522 // literal (false) interpret string as literal, not regexp | |
523 // never_nl (false) never match \n, even if it is in regexp | |
524 // dot_nl (false) dot matches everything including new line | |
525 // never_capture (false) parse all parens as non-capturing | |
526 // case_sensitive (true) match is case-sensitive (regexp can override | |
527 // with (?i) unless in posix_syntax mode) | |
528 // | |
529 // The following options are only consulted when posix_syntax == true. | |
530 // (When posix_syntax == false these features are always enabled and | |
531 // cannot be turned off.) | |
532 // perl_classes (false) allow Perl's \d \s \w \D \S \W | |
533 // word_boundary (false) allow Perl's \b \B (word boundary and not) | |
534 // one_line (false) ^ and $ only match beginning and end of text | |
535 // | |
536 // The max_mem option controls how much memory can be used | |
537 // to hold the compiled form of the regexp (the Prog) and | |
538 // its cached DFA graphs. Code Search placed limits on the number | |
539 // of Prog instructions and DFA states: 10,000 for both. | |
540 // In RE2, those limits would translate to about 240 KB per Prog | |
541 // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a | |
542 // better job of keeping them small than Code Search did). | |
543 // Each RE2 has two Progs (one forward, one reverse), and each Prog | |
544 // can have two DFAs (one first match, one longest match). | |
545 // That makes 4 DFAs: | |
546 // | |
547 // forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches | |
548 // if opt.longest_match() == false | |
549 // forward, longest-match - used for all ANCHOR_BOTH searches, | |
550 // and the other two kinds if | |
551 // opt.longest_match() == true | |
552 // reverse, first-match - never used | |
553 // reverse, longest-match - used as second phase for unanchored searches | |
554 // | |
555 // The RE2 memory budget is statically divided between the two | |
556 // Progs and then the DFAs: two thirds to the forward Prog | |
557 // and one third to the reverse Prog. The forward Prog gives half | |
558 // of what it has left over to each of its DFAs. The reverse Prog | |
559 // gives it all to its longest-match DFA. | |
560 // | |
561 // Once a DFA fills its budget, it flushes its cache and starts over. | |
562 // If this happens too often, RE2 falls back on the NFA implementation. | |
563 | |
564 // For now, make the default budget something close to Code Search. | |
565 static const int kDefaultMaxMem = 8<<20; | |
566 | |
567 enum Encoding { | |
568 EncodingUTF8 = 1, | |
569 EncodingLatin1 | |
570 }; | |
571 | |
572 Options() : | |
573 encoding_(EncodingUTF8), | |
574 posix_syntax_(false), | |
575 longest_match_(false), | |
576 log_errors_(true), | |
577 max_mem_(kDefaultMaxMem), | |
578 literal_(false), | |
579 never_nl_(false), | |
580 dot_nl_(false), | |
581 never_capture_(false), | |
582 case_sensitive_(true), | |
583 perl_classes_(false), | |
584 word_boundary_(false), | |
585 one_line_(false) { | |
586 } | |
587 | |
588 /*implicit*/ Options(CannedOptions); | |
589 | |
590 Encoding encoding() const { return encoding_; } | |
591 void set_encoding(Encoding encoding) { encoding_ = encoding; } | |
592 | |
593 // Legacy interface to encoding. | |
594 // TODO(rsc): Remove once clients have been converted. | |
595 bool utf8() const { return encoding_ == EncodingUTF8; } | |
596 void set_utf8(bool b) { | |
597 if (b) { | |
598 encoding_ = EncodingUTF8; | |
599 } else { | |
600 encoding_ = EncodingLatin1; | |
601 } | |
602 } | |
603 | |
604 bool posix_syntax() const { return posix_syntax_; } | |
605 void set_posix_syntax(bool b) { posix_syntax_ = b; } | |
606 | |
607 bool longest_match() const { return longest_match_; } | |
608 void set_longest_match(bool b) { longest_match_ = b; } | |
609 | |
610 bool log_errors() const { return log_errors_; } | |
611 void set_log_errors(bool b) { log_errors_ = b; } | |
612 | |
613 int64_t max_mem() const { return max_mem_; } | |
614 void set_max_mem(int64_t m) { max_mem_ = m; } | |
615 | |
616 bool literal() const { return literal_; } | |
617 void set_literal(bool b) { literal_ = b; } | |
618 | |
619 bool never_nl() const { return never_nl_; } | |
620 void set_never_nl(bool b) { never_nl_ = b; } | |
621 | |
622 bool dot_nl() const { return dot_nl_; } | |
623 void set_dot_nl(bool b) { dot_nl_ = b; } | |
624 | |
625 bool never_capture() const { return never_capture_; } | |
626 void set_never_capture(bool b) { never_capture_ = b; } | |
627 | |
628 bool case_sensitive() const { return case_sensitive_; } | |
629 void set_case_sensitive(bool b) { case_sensitive_ = b; } | |
630 | |
631 bool perl_classes() const { return perl_classes_; } | |
632 void set_perl_classes(bool b) { perl_classes_ = b; } | |
633 | |
634 bool word_boundary() const { return word_boundary_; } | |
635 void set_word_boundary(bool b) { word_boundary_ = b; } | |
636 | |
637 bool one_line() const { return one_line_; } | |
638 void set_one_line(bool b) { one_line_ = b; } | |
639 | |
640 void Copy(const Options& src) { | |
641 encoding_ = src.encoding_; | |
642 posix_syntax_ = src.posix_syntax_; | |
643 longest_match_ = src.longest_match_; | |
644 log_errors_ = src.log_errors_; | |
645 max_mem_ = src.max_mem_; | |
646 literal_ = src.literal_; | |
647 never_nl_ = src.never_nl_; | |
648 dot_nl_ = src.dot_nl_; | |
649 never_capture_ = src.never_capture_; | |
650 case_sensitive_ = src.case_sensitive_; | |
651 perl_classes_ = src.perl_classes_; | |
652 word_boundary_ = src.word_boundary_; | |
653 one_line_ = src.one_line_; | |
654 } | |
655 | |
656 int ParseFlags() const; | |
657 | |
658 private: | |
659 Encoding encoding_; | |
660 bool posix_syntax_; | |
661 bool longest_match_; | |
662 bool log_errors_; | |
663 int64_t max_mem_; | |
664 bool literal_; | |
665 bool never_nl_; | |
666 bool dot_nl_; | |
667 bool never_capture_; | |
668 bool case_sensitive_; | |
669 bool perl_classes_; | |
670 bool word_boundary_; | |
671 bool one_line_; | |
672 | |
673 //DISALLOW_COPY_AND_ASSIGN(Options); | |
674 Options(const Options&); | |
675 void operator=(const Options&); | |
676 }; | |
677 | |
678 // Returns the options set in the constructor. | |
679 const Options& options() const { return options_; }; | |
680 | |
681 // Argument converters; see below. | |
682 static inline Arg CRadix(short* x); | |
683 static inline Arg CRadix(unsigned short* x); | |
684 static inline Arg CRadix(int* x); | |
685 static inline Arg CRadix(unsigned int* x); | |
686 static inline Arg CRadix(long* x); | |
687 static inline Arg CRadix(unsigned long* x); | |
688 #if RE2_HAVE_LONGLONG | |
689 static inline Arg CRadix(long long* x); | |
690 static inline Arg CRadix(unsigned long long* x); | |
691 #endif | |
692 | |
693 static inline Arg Hex(short* x); | |
694 static inline Arg Hex(unsigned short* x); | |
695 static inline Arg Hex(int* x); | |
696 static inline Arg Hex(unsigned int* x); | |
697 static inline Arg Hex(long* x); | |
698 static inline Arg Hex(unsigned long* x); | |
699 #if RE2_HAVE_LONGLONG | |
700 static inline Arg Hex(long long* x); | |
701 static inline Arg Hex(unsigned long long* x); | |
702 #endif | |
703 | |
704 static inline Arg Octal(short* x); | |
705 static inline Arg Octal(unsigned short* x); | |
706 static inline Arg Octal(int* x); | |
707 static inline Arg Octal(unsigned int* x); | |
708 static inline Arg Octal(long* x); | |
709 static inline Arg Octal(unsigned long* x); | |
710 #if RE2_HAVE_LONGLONG | |
711 static inline Arg Octal(long long* x); | |
712 static inline Arg Octal(unsigned long long* x); | |
713 #endif | |
714 | |
715 private: | |
716 void Init(const StringPiece& pattern, const Options& options); | |
717 | |
718 bool DoMatch(const StringPiece& text, | |
719 Anchor anchor, | |
720 int* consumed, | |
721 const Arg* const args[], | |
722 int n) const; | |
723 | |
724 re2::Prog* ReverseProg() const; | |
725 | |
726 mutable Mutex* mutex_; | |
727 string pattern_; // string regular expression | |
728 Options options_; // option flags | |
729 string prefix_; // required prefix (before regexp_) | |
730 bool prefix_foldcase_; // prefix is ASCII case-insensitive | |
731 re2::Regexp* entire_regexp_; // parsed regular expression | |
732 re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed | |
733 re2::Prog* prog_; // compiled program for regexp | |
734 mutable re2::Prog* rprog_; // reverse program for regexp | |
735 bool is_one_pass_; // can use prog_->SearchOnePass? | |
736 mutable const string* error_; // Error indicator | |
737 // (or points to empty string) | |
738 mutable ErrorCode error_code_; // Error code | |
739 mutable string error_arg_; // Fragment of regexp showing error | |
740 mutable int num_captures_; // Number of capturing groups | |
741 | |
742 // Map from capture names to indices | |
743 mutable const map<string, int>* named_groups_; | |
744 | |
745 // Map from capture indices to names | |
746 mutable const map<int, string>* group_names_; | |
747 | |
748 //DISALLOW_COPY_AND_ASSIGN(RE2); | |
749 RE2(const RE2&); | |
750 void operator=(const RE2&); | |
751 }; | |
752 | |
753 /***** Implementation details *****/ | |
754 | |
755 // Hex/Octal/Binary? | |
756 | |
757 // Special class for parsing into objects that define a ParseFrom() method | |
758 template <class T> | |
759 class _RE2_MatchObject { | |
760 public: | |
761 static inline bool Parse(const char* str, int n, void* dest) { | |
762 if (dest == NULL) return true; | |
763 T* object = reinterpret_cast<T*>(dest); | |
764 return object->ParseFrom(str, n); | |
765 } | |
766 }; | |
767 | |
768 class RE2::Arg { | |
769 public: | |
770 // Empty constructor so we can declare arrays of RE2::Arg | |
771 Arg(); | |
772 | |
773 // Constructor specially designed for NULL arguments | |
774 Arg(void*); | |
775 | |
776 typedef bool (*Parser)(const char* str, int n, void* dest); | |
777 | |
778 // Type-specific parsers | |
779 #define MAKE_PARSER(type,name) \ | |
780 Arg(type* p) : arg_(p), parser_(name) { } \ | |
781 Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ | |
782 | |
783 | |
784 MAKE_PARSER(char, parse_char); | |
785 MAKE_PARSER(signed char, parse_char); | |
786 MAKE_PARSER(unsigned char, parse_uchar); | |
787 MAKE_PARSER(short, parse_short); | |
788 MAKE_PARSER(unsigned short, parse_ushort); | |
789 MAKE_PARSER(int, parse_int); | |
790 MAKE_PARSER(unsigned int, parse_uint); | |
791 MAKE_PARSER(long, parse_long); | |
792 MAKE_PARSER(unsigned long, parse_ulong); | |
793 #if RE2_HAVE_LONGLONG | |
794 MAKE_PARSER(long long, parse_longlong); | |
795 MAKE_PARSER(unsigned long long, parse_ulonglong); | |
796 #endif | |
797 MAKE_PARSER(float, parse_float); | |
798 MAKE_PARSER(double, parse_double); | |
799 MAKE_PARSER(string, parse_string); | |
800 MAKE_PARSER(StringPiece, parse_stringpiece); | |
801 | |
802 #undef MAKE_PARSER | |
803 | |
804 // Generic constructor templates | |
805 template <class T> Arg(T* p) | |
806 : arg_(p), parser_(_RE2_MatchObject<T>::Parse) { } | |
807 template <class T> Arg(T* p, Parser parser) | |
808 : arg_(p), parser_(parser) { } | |
809 | |
810 // Parse the data | |
811 bool Parse(const char* str, int n) const; | |
812 | |
813 private: | |
814 void* arg_; | |
815 Parser parser_; | |
816 | |
817 static bool parse_null (const char* str, int n, void* dest); | |
818 static bool parse_char (const char* str, int n, void* dest); | |
819 static bool parse_uchar (const char* str, int n, void* dest); | |
820 static bool parse_float (const char* str, int n, void* dest); | |
821 static bool parse_double (const char* str, int n, void* dest); | |
822 static bool parse_string (const char* str, int n, void* dest); | |
823 static bool parse_stringpiece (const char* str, int n, void* dest); | |
824 | |
825 #define DECLARE_INTEGER_PARSER(name) \ | |
826 private: \ | |
827 static bool parse_ ## name(const char* str, int n, void* dest); \ | |
828 static bool parse_ ## name ## _radix( \ | |
829 const char* str, int n, void* dest, int radix); \ | |
830 public: \ | |
831 static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ | |
832 static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ | |
833 static bool parse_ ## name ## _cradix(const char* str, int n, void* dest) | |
834 | |
835 DECLARE_INTEGER_PARSER(short); | |
836 DECLARE_INTEGER_PARSER(ushort); | |
837 DECLARE_INTEGER_PARSER(int); | |
838 DECLARE_INTEGER_PARSER(uint); | |
839 DECLARE_INTEGER_PARSER(long); | |
840 DECLARE_INTEGER_PARSER(ulong); | |
841 #if RE2_HAVE_LONGLONG | |
842 DECLARE_INTEGER_PARSER(longlong); | |
843 DECLARE_INTEGER_PARSER(ulonglong); | |
844 #endif | |
845 | |
846 #undef DECLARE_INTEGER_PARSER | |
847 }; | |
848 | |
849 inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } | |
850 inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } | |
851 | |
852 inline bool RE2::Arg::Parse(const char* str, int n) const { | |
853 return (*parser_)(str, n, arg_); | |
854 } | |
855 | |
856 // This part of the parser, appropriate only for ints, deals with bases | |
857 #define MAKE_INTEGER_PARSER(type, name) \ | |
858 inline RE2::Arg RE2::Hex(type* ptr) { \ | |
859 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \ | |
860 inline RE2::Arg RE2::Octal(type* ptr) { \ | |
861 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \ | |
862 inline RE2::Arg RE2::CRadix(type* ptr) { \ | |
863 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); } | |
864 | |
865 MAKE_INTEGER_PARSER(short, short) | |
866 MAKE_INTEGER_PARSER(unsigned short, ushort) | |
867 MAKE_INTEGER_PARSER(int, int) | |
868 MAKE_INTEGER_PARSER(unsigned int, uint) | |
869 MAKE_INTEGER_PARSER(long, long) | |
870 MAKE_INTEGER_PARSER(unsigned long, ulong) | |
871 #if RE2_HAVE_LONGLONG | |
872 MAKE_INTEGER_PARSER(long long, longlong) | |
873 MAKE_INTEGER_PARSER(unsigned long long, ulonglong) | |
874 #endif | |
875 | |
876 #undef MAKE_INTEGER_PARSER | |
877 | |
878 } // namespace re2 | |
879 | |
880 using re2::RE2; | |
881 | |
882 #endif /* RE2_RE2_H */ | |
OLD | NEW |