Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(263)

Side by Side Diff: third_party/re2/re2/re2.h

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/prog.cc ('k') | third_party/re2/re2/re2.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #ifndef RE2_RE2_H
6 #define RE2_RE2_H
7
8 // C++ interface to the re2 regular-expression library.
9 // RE2 supports Perl-style regular expressions (with extensions like
10 // \d, \w, \s, ...).
11 //
12 // -----------------------------------------------------------------------
13 // REGEXP SYNTAX:
14 //
15 // This module uses the re2 library and hence supports
16 // its syntax for regular expressions, which is similar to Perl's with
17 // some of the more complicated things thrown away. In particular,
18 // backreferences and generalized assertions are not available, nor is \Z.
19 //
20 // See https://github.com/google/re2/wiki/Syntax for the syntax
21 // supported by RE2, and a comparison with PCRE and PERL regexps.
22 //
23 // For those not familiar with Perl's regular expressions,
24 // here are some examples of the most commonly used extensions:
25 //
26 // "hello (\\w+) world" -- \w matches a "word" character
27 // "version (\\d+)" -- \d matches a digit
28 // "hello\\s+world" -- \s matches any whitespace character
29 // "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary
30 // "(?i)hello" -- (?i) turns on case-insensitive matching
31 // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
32 //
33 // -----------------------------------------------------------------------
34 // MATCHING INTERFACE:
35 //
36 // The "FullMatch" operation checks that supplied text matches a
37 // supplied pattern exactly.
38 //
39 // Example: successful match
40 // CHECK(RE2::FullMatch("hello", "h.*o"));
41 //
42 // Example: unsuccessful match (requires full match):
43 // CHECK(!RE2::FullMatch("hello", "e"));
44 //
45 // -----------------------------------------------------------------------
46 // UTF-8 AND THE MATCHING INTERFACE:
47 //
48 // By default, the pattern and input text are interpreted as UTF-8.
49 // The RE2::Latin1 option causes them to be interpreted as Latin-1.
50 //
51 // Example:
52 // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
53 // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
54 //
55 // -----------------------------------------------------------------------
56 // MATCHING WITH SUB-STRING EXTRACTION:
57 //
58 // You can supply extra pointer arguments to extract matched subpieces.
59 //
60 // Example: extracts "ruby" into "s" and 1234 into "i"
61 // int i;
62 // string s;
63 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
64 //
65 // Example: fails because string cannot be stored in integer
66 // CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
67 //
68 // Example: fails because there aren't enough sub-patterns:
69 // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
70 //
71 // Example: does not try to extract any extra sub-patterns
72 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
73 //
74 // Example: does not try to extract into NULL
75 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
76 //
77 // Example: integer overflow causes failure
78 // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
79 //
80 // NOTE(rsc): Asking for substrings slows successful matches quite a bit.
81 // This may get a little faster in the future, but right now is slower
82 // than PCRE. On the other hand, failed matches run *very* fast (faster
83 // than PCRE), as do matches without substring extraction.
84 //
85 // -----------------------------------------------------------------------
86 // PARTIAL MATCHES
87 //
88 // You can use the "PartialMatch" operation when you want the pattern
89 // to match any substring of the text.
90 //
91 // Example: simple search for a string:
92 // CHECK(RE2::PartialMatch("hello", "ell"));
93 //
94 // Example: find first number in a string
95 // int number;
96 // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
97 // CHECK_EQ(number, 100);
98 //
99 // -----------------------------------------------------------------------
100 // PRE-COMPILED REGULAR EXPRESSIONS
101 //
102 // RE2 makes it easy to use any string as a regular expression, without
103 // requiring a separate compilation step.
104 //
105 // If speed is of the essence, you can create a pre-compiled "RE2"
106 // object from the pattern and use it multiple times. If you do so,
107 // you can typically parse text faster than with sscanf.
108 //
109 // Example: precompile pattern for faster matching:
110 // RE2 pattern("h.*o");
111 // while (ReadLine(&str)) {
112 // if (RE2::FullMatch(str, pattern)) ...;
113 // }
114 //
115 // -----------------------------------------------------------------------
116 // SCANNING TEXT INCREMENTALLY
117 //
118 // The "Consume" operation may be useful if you want to repeatedly
119 // match regular expressions at the front of a string and skip over
120 // them as they match. This requires use of the "StringPiece" type,
121 // which represents a sub-range of a real string.
122 //
123 // Example: read lines of the form "var = value" from a string.
124 // string contents = ...; // Fill string somehow
125 // StringPiece input(contents); // Wrap a StringPiece around it
126 //
127 // string var;
128 // int value;
129 // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
130 // ...;
131 // }
132 //
133 // Each successful call to "Consume" will set "var/value", and also
134 // advance "input" so it points past the matched text. Note that if the
135 // regular expression matches an empty string, input will advance
136 // by 0 bytes. If the regular expression being used might match
137 // an empty string, the loop body must check for this case and either
138 // advance the string or break out of the loop.
139 //
140 // The "FindAndConsume" operation is similar to "Consume" but does not
141 // anchor your match at the beginning of the string. For example, you
142 // could extract all words from a string by repeatedly calling
143 // RE2::FindAndConsume(&input, "(\\w+)", &word)
144 //
145 // -----------------------------------------------------------------------
146 // USING VARIABLE NUMBER OF ARGUMENTS
147 //
148 // The above operations require you to know the number of arguments
149 // when you write the code. This is not always possible or easy (for
150 // example, the regular expression may be calculated at run time).
151 // You can use the "N" version of the operations when the number of
152 // match arguments are determined at run time.
153 //
154 // Example:
155 // const RE2::Arg* args[10];
156 // int n;
157 // // ... populate args with pointers to RE2::Arg values ...
158 // // ... set n to the number of RE2::Arg objects ...
159 // bool match = RE2::FullMatchN(input, pattern, args, n);
160 //
161 // The last statement is equivalent to
162 //
163 // bool match = RE2::FullMatch(input, pattern,
164 // *args[0], *args[1], ..., *args[n - 1]);
165 //
166 // -----------------------------------------------------------------------
167 // PARSING HEX/OCTAL/C-RADIX NUMBERS
168 //
169 // By default, if you pass a pointer to a numeric value, the
170 // corresponding text is interpreted as a base-10 number. You can
171 // instead wrap the pointer with a call to one of the operators Hex(),
172 // Octal(), or CRadix() to interpret the text in another base. The
173 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
174 // prefixes, but defaults to base-10.
175 //
176 // Example:
177 // int a, b, c, d;
178 // CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
179 // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
180 // will leave 64 in a, b, c, and d.
181
182 #include <stdint.h>
183 #include <map>
184 #include <string>
185 #include "re2/stringpiece.h"
186 #include "re2/variadic_function.h"
187
188 #ifndef RE2_HAVE_LONGLONG
189 #define RE2_HAVE_LONGLONG 1
190 #endif
191
192 namespace re2 {
193
194 using std::string;
195 using std::map;
196 class Mutex;
197 class Prog;
198 class Regexp;
199
200 // The following enum should be used only as a constructor argument to indicate
201 // that the variable has static storage class, and that the constructor should
202 // do nothing to its state. It indicates to the reader that it is legal to
203 // declare a static instance of the class, provided the constructor is given
204 // the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
205 // static variable that has a constructor or a destructor because invocation
206 // order is undefined. However, IF the type can be initialized by filling with
207 // zeroes (which the loader does for static variables), AND the type's
208 // destructor does nothing to the storage, then a constructor for static
209 // initialization can be declared as
210 // explicit MyClass(LinkerInitialized x) {}
211 // and invoked as
212 // static MyClass my_variable_name(LINKER_INITIALIZED);
213 enum LinkerInitialized { LINKER_INITIALIZED };
214
215 // Interface for regular expression matching. Also corresponds to a
216 // pre-compiled regular expression. An "RE2" object is safe for
217 // concurrent use by multiple threads.
218 class RE2 {
219 public:
220 // We convert user-passed pointers into special Arg objects
221 class Arg;
222 class Options;
223
224 // Defined in set.h.
225 class Set;
226
227 enum ErrorCode {
228 NoError = 0,
229
230 // Unexpected error
231 ErrorInternal,
232
233 // Parse errors
234 ErrorBadEscape, // bad escape sequence
235 ErrorBadCharClass, // bad character class
236 ErrorBadCharRange, // bad character class range
237 ErrorMissingBracket, // missing closing ]
238 ErrorMissingParen, // missing closing )
239 ErrorTrailingBackslash, // trailing \ at end of regexp
240 ErrorRepeatArgument, // repeat argument missing, e.g. "*"
241 ErrorRepeatSize, // bad repetition argument
242 ErrorRepeatOp, // bad repetition operator
243 ErrorBadPerlOp, // bad perl operator
244 ErrorBadUTF8, // invalid UTF-8 in regexp
245 ErrorBadNamedCapture, // bad named capture group
246 ErrorPatternTooLarge // pattern too large (compile failed)
247 };
248
249 // Predefined common options.
250 // If you need more complicated things, instantiate
251 // an Option class, possibly passing one of these to
252 // the Option constructor, change the settings, and pass that
253 // Option class to the RE2 constructor.
254 enum CannedOptions {
255 DefaultOptions = 0,
256 Latin1, // treat input as Latin-1 (default UTF-8)
257 POSIX, // POSIX syntax, leftmost-longest match
258 Quiet // do not log about regexp parse errors
259 };
260
261 // Need to have the const char* and const string& forms for implicit
262 // conversions when passing string literals to FullMatch and PartialMatch.
263 // Otherwise the StringPiece form would be sufficient.
264 #ifndef SWIG
265 RE2(const char* pattern);
266 RE2(const string& pattern);
267 #endif
268 RE2(const StringPiece& pattern);
269 RE2(const StringPiece& pattern, const Options& option);
270 ~RE2();
271
272 // Returns whether RE2 was created properly.
273 bool ok() const { return error_code() == NoError; }
274
275 // The string specification for this RE2. E.g.
276 // RE2 re("ab*c?d+");
277 // re.pattern(); // "ab*c?d+"
278 const string& pattern() const { return pattern_; }
279
280 // If RE2 could not be created properly, returns an error string.
281 // Else returns the empty string.
282 const string& error() const { return *error_; }
283
284 // If RE2 could not be created properly, returns an error code.
285 // Else returns RE2::NoError (== 0).
286 ErrorCode error_code() const { return error_code_; }
287
288 // If RE2 could not be created properly, returns the offending
289 // portion of the regexp.
290 const string& error_arg() const { return error_arg_; }
291
292 // Returns the program size, a very approximate measure of a regexp's "cost".
293 // Larger numbers are more expensive than smaller numbers.
294 int ProgramSize() const;
295
296 // EXPERIMENTAL! SUBJECT TO CHANGE!
297 // Outputs the program fanout as a histogram bucketed by powers of 2.
298 // Returns the number of the largest non-empty bucket.
299 int ProgramFanout(map<int, int>* histogram) const;
300
301 // Returns the underlying Regexp; not for general use.
302 // Returns entire_regexp_ so that callers don't need
303 // to know about prefix_ and prefix_foldcase_.
304 re2::Regexp* Regexp() const { return entire_regexp_; }
305
306 /***** The useful part: the matching interface *****/
307
308 // Matches "text" against "pattern". If pointer arguments are
309 // supplied, copies matched sub-patterns into them.
310 //
311 // You can pass in a "const char*" or a "string" for "text".
312 // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
313 //
314 // The provided pointer arguments can be pointers to any scalar numeric
315 // type, or one of:
316 // string (matched piece is copied to string)
317 // StringPiece (StringPiece is mutated to point to matched piece)
318 // T (where "bool T::ParseFrom(const char*, int)" exists)
319 // (void*)NULL (the corresponding matched sub-pattern is not copied)
320 //
321 // Returns true iff all of the following conditions are satisfied:
322 // a. "text" matches "pattern" exactly
323 // b. The number of matched sub-patterns is >= number of supplied pointers
324 // c. The "i"th argument has a suitable type for holding the
325 // string captured as the "i"th sub-pattern. If you pass in
326 // NULL for the "i"th argument, or pass fewer arguments than
327 // number of sub-patterns, "i"th captured sub-pattern is
328 // ignored.
329 //
330 // CAVEAT: An optional sub-pattern that does not exist in the
331 // matched string is assigned the empty string. Therefore, the
332 // following will return false (because the empty string is not a
333 // valid number):
334 // int number;
335 // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
336 static bool FullMatchN(const StringPiece& text, const RE2& re,
337 const Arg* const args[], int argc);
338 static const VariadicFunction2<
339 bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
340
341 // Exactly like FullMatch(), except that "pattern" is allowed to match
342 // a substring of "text".
343 static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 arg s
344 const Arg* const args[], int argc);
345 static const VariadicFunction2<
346 bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatc h;
347
348 // Like FullMatch() and PartialMatch(), except that pattern has to
349 // match a prefix of "text", and "input" is advanced past the matched
350 // text. Note: "input" is modified iff this routine returns true.
351 static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
352 const Arg* const args[], int argc);
353 static const VariadicFunction2<
354 bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
355
356 // Like Consume(..), but does not anchor the match at the beginning of the
357 // string. That is, "pattern" need not start its match at the beginning of
358 // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
359 // word in "s" and stores it in "word".
360 static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
361 const Arg* const args[], int argc);
362 static const VariadicFunction2<
363 bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
364
365 // Replace the first match of "pattern" in "str" with "rewrite".
366 // Within "rewrite", backslash-escaped digits (\1 to \9) can be
367 // used to insert text matching corresponding parenthesized group
368 // from the pattern. \0 in "rewrite" refers to the entire matching
369 // text. E.g.,
370 //
371 // string s = "yabba dabba doo";
372 // CHECK(RE2::Replace(&s, "b+", "d"));
373 //
374 // will leave "s" containing "yada dabba doo"
375 //
376 // Returns true if the pattern matches and a replacement occurs,
377 // false otherwise.
378 static bool Replace(string *str,
379 const RE2& pattern,
380 const StringPiece& rewrite);
381
382 // Like Replace(), except replaces successive non-overlapping occurrences
383 // of the pattern in the string with the rewrite. E.g.
384 //
385 // string s = "yabba dabba doo";
386 // CHECK(RE2::GlobalReplace(&s, "b+", "d"));
387 //
388 // will leave "s" containing "yada dada doo"
389 // Replacements are not subject to re-matching.
390 //
391 // Because GlobalReplace only replaces non-overlapping matches,
392 // replacing "ana" within "banana" makes only one replacement, not two.
393 //
394 // Returns the number of replacements made.
395 static int GlobalReplace(string *str,
396 const RE2& pattern,
397 const StringPiece& rewrite);
398
399 // Like Replace, except that if the pattern matches, "rewrite"
400 // is copied into "out" with substitutions. The non-matching
401 // portions of "text" are ignored.
402 //
403 // Returns true iff a match occurred and the extraction happened
404 // successfully; if no match occurs, the string is left unaffected.
405 //
406 // REQUIRES: "text" must not alias any part of "*out".
407 static bool Extract(const StringPiece &text,
408 const RE2& pattern,
409 const StringPiece &rewrite,
410 string *out);
411
412 // Escapes all potentially meaningful regexp characters in
413 // 'unquoted'. The returned string, used as a regular expression,
414 // will exactly match the original string. For example,
415 // 1.5-2.0?
416 // may become:
417 // 1\.5\-2\.0\?
418 static string QuoteMeta(const StringPiece& unquoted);
419
420 // Computes range for any strings matching regexp. The min and max can in
421 // some cases be arbitrarily precise, so the caller gets to specify the
422 // maximum desired length of string returned.
423 //
424 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
425 // string s that is an anchored match for this regexp satisfies
426 // min <= s && s <= max.
427 //
428 // Note that PossibleMatchRange() will only consider the first copy of an
429 // infinitely repeated element (i.e., any regexp element followed by a '*' or
430 // '+' operator). Regexps with "{N}" constructions are not affected, as those
431 // do not compile down to infinite repetitions.
432 //
433 // Returns true on success, false on error.
434 bool PossibleMatchRange(string* min, string* max, int maxlen) const;
435
436 // Generic matching interface
437
438 // Type of match.
439 enum Anchor {
440 UNANCHORED, // No anchoring
441 ANCHOR_START, // Anchor at start only
442 ANCHOR_BOTH // Anchor at start and end
443 };
444
445 // Return the number of capturing subpatterns, or -1 if the
446 // regexp wasn't valid on construction. The overall match ($0)
447 // does not count: if the regexp is "(a)(b)", returns 2.
448 int NumberOfCapturingGroups() const;
449
450 // Return a map from names to capturing indices.
451 // The map records the index of the leftmost group
452 // with the given name.
453 // Only valid until the re is deleted.
454 const map<string, int>& NamedCapturingGroups() const;
455
456 // Return a map from capturing indices to names.
457 // The map has no entries for unnamed groups.
458 // Only valid until the re is deleted.
459 const map<int, string>& CapturingGroupNames() const;
460
461 // General matching routine.
462 // Match against text starting at offset startpos
463 // and stopping the search at offset endpos.
464 // Returns true if match found, false if not.
465 // On a successful match, fills in match[] (up to nmatch entries)
466 // with information about submatches.
467 // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
468 // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
469 // match[3] = NULL, ..., up to match[nmatch-1] = NULL.
470 //
471 // Don't ask for more match information than you will use:
472 // runs much faster with nmatch == 1 than nmatch > 1, and
473 // runs even faster if nmatch == 0.
474 // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
475 // but will be handled correctly.
476 //
477 // Passing text == StringPiece(NULL, 0) will be handled like any other
478 // empty string, but note that on return, it will not be possible to tell
479 // whether submatch i matched the empty string or did not match:
480 // either way, match[i] == NULL.
481 bool Match(const StringPiece& text,
482 int startpos,
483 int endpos,
484 Anchor anchor,
485 StringPiece *match,
486 int nmatch) const;
487
488 // Check that the given rewrite string is suitable for use with this
489 // regular expression. It checks that:
490 // * The regular expression has enough parenthesized subexpressions
491 // to satisfy all of the \N tokens in rewrite
492 // * The rewrite string doesn't have any syntax errors. E.g.,
493 // '\' followed by anything other than a digit or '\'.
494 // A true return value guarantees that Replace() and Extract() won't
495 // fail because of a bad rewrite string.
496 bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
497
498 // Returns the maximum submatch needed for the rewrite to be done by
499 // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
500 static int MaxSubmatch(const StringPiece& rewrite);
501
502 // Append the "rewrite" string, with backslash subsitutions from "vec",
503 // to string "out".
504 // Returns true on success. This method can fail because of a malformed
505 // rewrite string. CheckRewriteString guarantees that the rewrite will
506 // be sucessful.
507 bool Rewrite(string *out,
508 const StringPiece &rewrite,
509 const StringPiece* vec,
510 int veclen) const;
511
512 // Constructor options
513 class Options {
514 public:
515 // The options are (defaults in parentheses):
516 //
517 // utf8 (true) text and pattern are UTF-8; otherwise Latin-1
518 // posix_syntax (false) restrict regexps to POSIX egrep syntax
519 // longest_match (false) search for longest match, not first match
520 // log_errors (true) log syntax and execution errors to ERROR
521 // max_mem (see below) approx. max memory footprint of RE2
522 // literal (false) interpret string as literal, not regexp
523 // never_nl (false) never match \n, even if it is in regexp
524 // dot_nl (false) dot matches everything including new line
525 // never_capture (false) parse all parens as non-capturing
526 // case_sensitive (true) match is case-sensitive (regexp can override
527 // with (?i) unless in posix_syntax mode)
528 //
529 // The following options are only consulted when posix_syntax == true.
530 // (When posix_syntax == false these features are always enabled and
531 // cannot be turned off.)
532 // perl_classes (false) allow Perl's \d \s \w \D \S \W
533 // word_boundary (false) allow Perl's \b \B (word boundary and not)
534 // one_line (false) ^ and $ only match beginning and end of text
535 //
536 // The max_mem option controls how much memory can be used
537 // to hold the compiled form of the regexp (the Prog) and
538 // its cached DFA graphs. Code Search placed limits on the number
539 // of Prog instructions and DFA states: 10,000 for both.
540 // In RE2, those limits would translate to about 240 KB per Prog
541 // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
542 // better job of keeping them small than Code Search did).
543 // Each RE2 has two Progs (one forward, one reverse), and each Prog
544 // can have two DFAs (one first match, one longest match).
545 // That makes 4 DFAs:
546 //
547 // forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
548 // if opt.longest_match() == false
549 // forward, longest-match - used for all ANCHOR_BOTH searches,
550 // and the other two kinds if
551 // opt.longest_match() == true
552 // reverse, first-match - never used
553 // reverse, longest-match - used as second phase for unanchored searches
554 //
555 // The RE2 memory budget is statically divided between the two
556 // Progs and then the DFAs: two thirds to the forward Prog
557 // and one third to the reverse Prog. The forward Prog gives half
558 // of what it has left over to each of its DFAs. The reverse Prog
559 // gives it all to its longest-match DFA.
560 //
561 // Once a DFA fills its budget, it flushes its cache and starts over.
562 // If this happens too often, RE2 falls back on the NFA implementation.
563
564 // For now, make the default budget something close to Code Search.
565 static const int kDefaultMaxMem = 8<<20;
566
567 enum Encoding {
568 EncodingUTF8 = 1,
569 EncodingLatin1
570 };
571
572 Options() :
573 encoding_(EncodingUTF8),
574 posix_syntax_(false),
575 longest_match_(false),
576 log_errors_(true),
577 max_mem_(kDefaultMaxMem),
578 literal_(false),
579 never_nl_(false),
580 dot_nl_(false),
581 never_capture_(false),
582 case_sensitive_(true),
583 perl_classes_(false),
584 word_boundary_(false),
585 one_line_(false) {
586 }
587
588 /*implicit*/ Options(CannedOptions);
589
590 Encoding encoding() const { return encoding_; }
591 void set_encoding(Encoding encoding) { encoding_ = encoding; }
592
593 // Legacy interface to encoding.
594 // TODO(rsc): Remove once clients have been converted.
595 bool utf8() const { return encoding_ == EncodingUTF8; }
596 void set_utf8(bool b) {
597 if (b) {
598 encoding_ = EncodingUTF8;
599 } else {
600 encoding_ = EncodingLatin1;
601 }
602 }
603
604 bool posix_syntax() const { return posix_syntax_; }
605 void set_posix_syntax(bool b) { posix_syntax_ = b; }
606
607 bool longest_match() const { return longest_match_; }
608 void set_longest_match(bool b) { longest_match_ = b; }
609
610 bool log_errors() const { return log_errors_; }
611 void set_log_errors(bool b) { log_errors_ = b; }
612
613 int64_t max_mem() const { return max_mem_; }
614 void set_max_mem(int64_t m) { max_mem_ = m; }
615
616 bool literal() const { return literal_; }
617 void set_literal(bool b) { literal_ = b; }
618
619 bool never_nl() const { return never_nl_; }
620 void set_never_nl(bool b) { never_nl_ = b; }
621
622 bool dot_nl() const { return dot_nl_; }
623 void set_dot_nl(bool b) { dot_nl_ = b; }
624
625 bool never_capture() const { return never_capture_; }
626 void set_never_capture(bool b) { never_capture_ = b; }
627
628 bool case_sensitive() const { return case_sensitive_; }
629 void set_case_sensitive(bool b) { case_sensitive_ = b; }
630
631 bool perl_classes() const { return perl_classes_; }
632 void set_perl_classes(bool b) { perl_classes_ = b; }
633
634 bool word_boundary() const { return word_boundary_; }
635 void set_word_boundary(bool b) { word_boundary_ = b; }
636
637 bool one_line() const { return one_line_; }
638 void set_one_line(bool b) { one_line_ = b; }
639
640 void Copy(const Options& src) {
641 encoding_ = src.encoding_;
642 posix_syntax_ = src.posix_syntax_;
643 longest_match_ = src.longest_match_;
644 log_errors_ = src.log_errors_;
645 max_mem_ = src.max_mem_;
646 literal_ = src.literal_;
647 never_nl_ = src.never_nl_;
648 dot_nl_ = src.dot_nl_;
649 never_capture_ = src.never_capture_;
650 case_sensitive_ = src.case_sensitive_;
651 perl_classes_ = src.perl_classes_;
652 word_boundary_ = src.word_boundary_;
653 one_line_ = src.one_line_;
654 }
655
656 int ParseFlags() const;
657
658 private:
659 Encoding encoding_;
660 bool posix_syntax_;
661 bool longest_match_;
662 bool log_errors_;
663 int64_t max_mem_;
664 bool literal_;
665 bool never_nl_;
666 bool dot_nl_;
667 bool never_capture_;
668 bool case_sensitive_;
669 bool perl_classes_;
670 bool word_boundary_;
671 bool one_line_;
672
673 //DISALLOW_COPY_AND_ASSIGN(Options);
674 Options(const Options&);
675 void operator=(const Options&);
676 };
677
678 // Returns the options set in the constructor.
679 const Options& options() const { return options_; };
680
681 // Argument converters; see below.
682 static inline Arg CRadix(short* x);
683 static inline Arg CRadix(unsigned short* x);
684 static inline Arg CRadix(int* x);
685 static inline Arg CRadix(unsigned int* x);
686 static inline Arg CRadix(long* x);
687 static inline Arg CRadix(unsigned long* x);
688 #if RE2_HAVE_LONGLONG
689 static inline Arg CRadix(long long* x);
690 static inline Arg CRadix(unsigned long long* x);
691 #endif
692
693 static inline Arg Hex(short* x);
694 static inline Arg Hex(unsigned short* x);
695 static inline Arg Hex(int* x);
696 static inline Arg Hex(unsigned int* x);
697 static inline Arg Hex(long* x);
698 static inline Arg Hex(unsigned long* x);
699 #if RE2_HAVE_LONGLONG
700 static inline Arg Hex(long long* x);
701 static inline Arg Hex(unsigned long long* x);
702 #endif
703
704 static inline Arg Octal(short* x);
705 static inline Arg Octal(unsigned short* x);
706 static inline Arg Octal(int* x);
707 static inline Arg Octal(unsigned int* x);
708 static inline Arg Octal(long* x);
709 static inline Arg Octal(unsigned long* x);
710 #if RE2_HAVE_LONGLONG
711 static inline Arg Octal(long long* x);
712 static inline Arg Octal(unsigned long long* x);
713 #endif
714
715 private:
716 void Init(const StringPiece& pattern, const Options& options);
717
718 bool DoMatch(const StringPiece& text,
719 Anchor anchor,
720 int* consumed,
721 const Arg* const args[],
722 int n) const;
723
724 re2::Prog* ReverseProg() const;
725
726 mutable Mutex* mutex_;
727 string pattern_; // string regular expression
728 Options options_; // option flags
729 string prefix_; // required prefix (before regexp_)
730 bool prefix_foldcase_; // prefix is ASCII case-insensitive
731 re2::Regexp* entire_regexp_; // parsed regular expression
732 re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
733 re2::Prog* prog_; // compiled program for regexp
734 mutable re2::Prog* rprog_; // reverse program for regexp
735 bool is_one_pass_; // can use prog_->SearchOnePass?
736 mutable const string* error_; // Error indicator
737 // (or points to empty string)
738 mutable ErrorCode error_code_; // Error code
739 mutable string error_arg_; // Fragment of regexp showing error
740 mutable int num_captures_; // Number of capturing groups
741
742 // Map from capture names to indices
743 mutable const map<string, int>* named_groups_;
744
745 // Map from capture indices to names
746 mutable const map<int, string>* group_names_;
747
748 //DISALLOW_COPY_AND_ASSIGN(RE2);
749 RE2(const RE2&);
750 void operator=(const RE2&);
751 };
752
753 /***** Implementation details *****/
754
755 // Hex/Octal/Binary?
756
757 // Special class for parsing into objects that define a ParseFrom() method
758 template <class T>
759 class _RE2_MatchObject {
760 public:
761 static inline bool Parse(const char* str, int n, void* dest) {
762 if (dest == NULL) return true;
763 T* object = reinterpret_cast<T*>(dest);
764 return object->ParseFrom(str, n);
765 }
766 };
767
768 class RE2::Arg {
769 public:
770 // Empty constructor so we can declare arrays of RE2::Arg
771 Arg();
772
773 // Constructor specially designed for NULL arguments
774 Arg(void*);
775
776 typedef bool (*Parser)(const char* str, int n, void* dest);
777
778 // Type-specific parsers
779 #define MAKE_PARSER(type,name) \
780 Arg(type* p) : arg_(p), parser_(name) { } \
781 Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
782
783
784 MAKE_PARSER(char, parse_char);
785 MAKE_PARSER(signed char, parse_char);
786 MAKE_PARSER(unsigned char, parse_uchar);
787 MAKE_PARSER(short, parse_short);
788 MAKE_PARSER(unsigned short, parse_ushort);
789 MAKE_PARSER(int, parse_int);
790 MAKE_PARSER(unsigned int, parse_uint);
791 MAKE_PARSER(long, parse_long);
792 MAKE_PARSER(unsigned long, parse_ulong);
793 #if RE2_HAVE_LONGLONG
794 MAKE_PARSER(long long, parse_longlong);
795 MAKE_PARSER(unsigned long long, parse_ulonglong);
796 #endif
797 MAKE_PARSER(float, parse_float);
798 MAKE_PARSER(double, parse_double);
799 MAKE_PARSER(string, parse_string);
800 MAKE_PARSER(StringPiece, parse_stringpiece);
801
802 #undef MAKE_PARSER
803
804 // Generic constructor templates
805 template <class T> Arg(T* p)
806 : arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
807 template <class T> Arg(T* p, Parser parser)
808 : arg_(p), parser_(parser) { }
809
810 // Parse the data
811 bool Parse(const char* str, int n) const;
812
813 private:
814 void* arg_;
815 Parser parser_;
816
817 static bool parse_null (const char* str, int n, void* dest);
818 static bool parse_char (const char* str, int n, void* dest);
819 static bool parse_uchar (const char* str, int n, void* dest);
820 static bool parse_float (const char* str, int n, void* dest);
821 static bool parse_double (const char* str, int n, void* dest);
822 static bool parse_string (const char* str, int n, void* dest);
823 static bool parse_stringpiece (const char* str, int n, void* dest);
824
825 #define DECLARE_INTEGER_PARSER(name) \
826 private: \
827 static bool parse_ ## name(const char* str, int n, void* dest); \
828 static bool parse_ ## name ## _radix( \
829 const char* str, int n, void* dest, int radix); \
830 public: \
831 static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
832 static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
833 static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
834
835 DECLARE_INTEGER_PARSER(short);
836 DECLARE_INTEGER_PARSER(ushort);
837 DECLARE_INTEGER_PARSER(int);
838 DECLARE_INTEGER_PARSER(uint);
839 DECLARE_INTEGER_PARSER(long);
840 DECLARE_INTEGER_PARSER(ulong);
841 #if RE2_HAVE_LONGLONG
842 DECLARE_INTEGER_PARSER(longlong);
843 DECLARE_INTEGER_PARSER(ulonglong);
844 #endif
845
846 #undef DECLARE_INTEGER_PARSER
847 };
848
849 inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
850 inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
851
852 inline bool RE2::Arg::Parse(const char* str, int n) const {
853 return (*parser_)(str, n, arg_);
854 }
855
856 // This part of the parser, appropriate only for ints, deals with bases
857 #define MAKE_INTEGER_PARSER(type, name) \
858 inline RE2::Arg RE2::Hex(type* ptr) { \
859 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
860 inline RE2::Arg RE2::Octal(type* ptr) { \
861 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
862 inline RE2::Arg RE2::CRadix(type* ptr) { \
863 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
864
865 MAKE_INTEGER_PARSER(short, short)
866 MAKE_INTEGER_PARSER(unsigned short, ushort)
867 MAKE_INTEGER_PARSER(int, int)
868 MAKE_INTEGER_PARSER(unsigned int, uint)
869 MAKE_INTEGER_PARSER(long, long)
870 MAKE_INTEGER_PARSER(unsigned long, ulong)
871 #if RE2_HAVE_LONGLONG
872 MAKE_INTEGER_PARSER(long long, longlong)
873 MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
874 #endif
875
876 #undef MAKE_INTEGER_PARSER
877
878 } // namespace re2
879
880 using re2::RE2;
881
882 #endif /* RE2_RE2_H */
OLDNEW
« no previous file with comments | « third_party/re2/re2/prog.cc ('k') | third_party/re2/re2/re2.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698