third_party/re2/re2/re2.h - Issue 10575037: Include RE2 library

Side by Side Diff: third_party/re2/re2/re2.h

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Fixed windows include dirs Created 8 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.

	2 // Use of this source code is governed by a BSD-style

	3 // license that can be found in the LICENSE file.

	4

	5 #ifndef RE2_RE2_H

	6 #define RE2_RE2_H

	7

	8 #define kDefaultMaxMem (8<<20)

	9

	10 // C++ interface to the re2 regular-expression library.

	11 // RE2 supports Perl-style regular expressions (with extensions like

	12 // \d, \w, \s, ...).

	13 //

	14 // -----------------------------------------------------------------------

	15 // REGEXP SYNTAX:

	16 //

	17 // This module uses the re2 library and hence supports

	18 // its syntax for regular expressions, which is similar to Perl's with

	19 // some of the more complicated things thrown away. In particular,

	20 // backreferences and generalized assertions are not available, nor is \Z.

	21 //

	22 // See http://code.google.com/p/re2/wiki/Syntax for the syntax

	23 // supported by RE2, and a comparison with PCRE and PERL regexps.

	24 //

	25 // For those not familiar with Perl's regular expressions,

	26 // here are some examples of the most commonly used extensions:

	27 //

	28 // "hello (\\w+) world" -- \w matches a "word" character

	29 // "version (\\d+)" -- \d matches a digit

	30 // "hello\\s+world" -- \s matches any whitespace character

	31 // "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary

	32 // "(?i)hello" -- (?i) turns on case-insensitive matching

	33 // "/\\*(.*?)\\*/" -- .*? matches .* minimum no. of times possible

	34 //

	35 // -----------------------------------------------------------------------

	36 // MATCHING INTERFACE:

	37 //

	38 // The "FullMatch" operation checks that supplied text matches a

	39 // supplied pattern exactly.

	40 //

	41 // Example: successful match

	42 // CHECK(RE2::FullMatch("hello", "h.*o"));

	43 //

	44 // Example: unsuccessful match (requires full match):

	45 // CHECK(!RE2::FullMatch("hello", "e"));

	46 //

	47 // -----------------------------------------------------------------------

	48 // UTF-8 AND THE MATCHING INTERFACE:

	49 //

	50 // By default, the pattern and input text are interpreted as UTF-8.

	51 // The RE2::Latin1 option causes them to be interpreted as Latin-1.

	52 //

	53 // Example:

	54 // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));

	55 // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));

	56 //

	57 // -----------------------------------------------------------------------

	58 // MATCHING WITH SUB-STRING EXTRACTION:

	59 //

	60 // You can supply extra pointer arguments to extract matched subpieces.

	61 //

	62 // Example: extracts "ruby" into "s" and 1234 into "i"

	63 // int i;

	64 // string s;

	65 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));

	66 //

	67 // Example: fails because string cannot be stored in integer

	68 // CHECK(!RE2::FullMatch("ruby", "(.*)", &i));

	69 //

	70 // Example: fails because there aren't enough sub-patterns:

	71 // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));

	72 //

	73 // Example: does not try to extract any extra sub-patterns

	74 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));

	75 //

	76 // Example: does not try to extract into NULL

	77 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));

	78 //

	79 // Example: integer overflow causes failure

	80 // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));

	81 //

	82 // NOTE(rsc): Asking for substrings slows successful matches quite a bit.

	83 // This may get a little faster in the future, but right now is slower

	84 // than PCRE. On the other hand, failed matches run very fast (faster

	85 // than PCRE), as do matches without substring extraction.

	86 //

	87 // -----------------------------------------------------------------------

	88 // PARTIAL MATCHES

	89 //

	90 // You can use the "PartialMatch" operation when you want the pattern

	91 // to match any substring of the text.

	92 //

	93 // Example: simple search for a string:

	94 // CHECK(RE2::PartialMatch("hello", "ell"));

	95 //

	96 // Example: find first number in a string

	97 // int number;

	98 // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));

	99 // CHECK_EQ(number, 100);

	100 //

	101 // -----------------------------------------------------------------------

	102 // PRE-COMPILED REGULAR EXPRESSIONS

	103 //

	104 // RE2 makes it easy to use any string as a regular expression, without

	105 // requiring a separate compilation step.

	106 //

	107 // If speed is of the essence, you can create a pre-compiled "RE2"

	108 // object from the pattern and use it multiple times. If you do so,

	109 // you can typically parse text faster than with sscanf.

	110 //

	111 // Example: precompile pattern for faster matching:

	112 // RE2 pattern("h.*o");

	113 // while (ReadLine(&str)) {

	114 // if (RE2::FullMatch(str, pattern)) ...;

	115 // }

	116 //

	117 // -----------------------------------------------------------------------

	118 // SCANNING TEXT INCREMENTALLY

	119 //

	120 // The "Consume" operation may be useful if you want to repeatedly

	121 // match regular expressions at the front of a string and skip over

	122 // them as they match. This requires use of the "StringPiece" type,

	123 // which represents a sub-range of a real string.

	124 //

	125 // Example: read lines of the form "var = value" from a string.

	126 // string contents = ...; // Fill string somehow

	127 // StringPiece input(contents); // Wrap a StringPiece around it

	128 //

	129 // string var;

	130 // int value;

	131 // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {

	132 // ...;

	133 // }

	134 //

	135 // Each successful call to "Consume" will set "var/value", and also

	136 // advance "input" so it points past the matched text. Note that if the

	137 // regular expression matches an empty string, input will advance

	138 // by 0 bytes. If the regular expression being used might match

	139 // an empty string, the loop body must check for this case and either

	140 // advance the string or break out of the loop.

	141 //

	142 // The "FindAndConsume" operation is similar to "Consume" but does not

	143 // anchor your match at the beginning of the string. For example, you

	144 // could extract all words from a string by repeatedly calling

	145 // RE2::FindAndConsume(&input, "(\\w+)", &word)

	146 //

	147 // -----------------------------------------------------------------------

	148 // USING VARIABLE NUMBER OF ARGUMENTS

	149 //

	150 // The above operations require you to know the number of arguments

	151 // when you write the code. This is not always possible or easy (for

	152 // example, the regular expression may be calculated at run time).

	153 // You can use the "N" version of the operations when the number of

	154 // match arguments are determined at run time.

	155 //

	156 // Example:

	157 // const RE2::Arg* args[10];

	158 // int n;

	159 // // ... populate args with pointers to RE2::Arg values ...

	160 // // ... set n to the number of RE2::Arg objects ...

	161 // bool match = RE2::FullMatchN(input, pattern, args, n);

	162 //

	163 // The last statement is equivalent to

	164 //

	165 // bool match = RE2::FullMatch(input, pattern,

	166 // *args[0], *args[1], ..., *args[n - 1]);

	167 //

	168 // -----------------------------------------------------------------------

	169 // PARSING HEX/OCTAL/C-RADIX NUMBERS

	170 //

	171 // By default, if you pass a pointer to a numeric value, the

	172 // corresponding text is interpreted as a base-10 number. You can

	173 // instead wrap the pointer with a call to one of the operators Hex(),

	174 // Octal(), or CRadix() to interpret the text in another base. The

	175 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)

	176 // prefixes, but defaults to base-10.

	177 //

	178 // Example:

	179 // int a, b, c, d;

	180 // CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",

	181 // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));

	182 // will leave 64 in a, b, c, and d.

	183

	184

	185 #include <stdint.h>

	186 #include <map>

	187 #include <string>

	188 #include "re2/stringpiece.h"

	189 #include "re2/variadic_function.h"

	190

	191 namespace re2 {

	192 using std::string;

	193 using std::map;

	194 class Mutex;

	195 class Prog;

	196 class Regexp;

	197

	198 // Interface for regular expression matching. Also corresponds to a

	199 // pre-compiled regular expression. An "RE2" object is safe for

	200 // concurrent use by multiple threads.

	201 class RE2 {

	202 public:

	203 // We convert user-passed pointers into special Arg objects

	204 class Arg;

	205 class Options;

	206

	207 // Defined in set.h.

	208 class Set;

	209

	210 enum ErrorCode {

	211 NoError = 0,

	212

	213 // Unexpected error

	214 ErrorInternal,

	215

	216 // Parse errors

	217 ErrorBadEscape, // bad escape sequence

	218 ErrorBadCharClass, // bad character class

	219 ErrorBadCharRange, // bad character class range

	220 ErrorMissingBracket, // missing closing ]

	221 ErrorMissingParen, // missing closing )

	222 ErrorTrailingBackslash, // trailing \ at end of regexp

	223 ErrorRepeatArgument, // repeat argument missing, e.g. "*"

	224 ErrorRepeatSize, // bad repetition argument

	225 ErrorRepeatOp, // bad repetition operator

	226 ErrorBadPerlOp, // bad perl operator

	227 ErrorBadUTF8, // invalid UTF-8 in regexp

	228 ErrorBadNamedCapture, // bad named capture group

	229 ErrorPatternTooLarge, // pattern too large (compile failed)

	230 };

	231

	232 // Predefined common options.

	233 // If you need more complicated things, instantiate

	234 // an Option class, change the settings, and pass it to the

	235 // RE2 constructor.

	236 static const Options DefaultOptions;

	237 static const Options Latin1; // treat input as Latin-1 (default UTF-8)

	238 //static const Options POSIX; // POSIX syntax, leftmost-longest match

	239 static const Options Quiet; // do not log about regexp parse errors

	240

	241 // Need to have the const char* and const string& forms for implicit

	242 // conversions when passing string literals to FullMatch and PartialMatch.

	243 // Otherwise the StringPiece form would be sufficient.

	244 #ifndef SWIG

	245 RE2(const char* pattern);

	246 RE2(const string& pattern);

	247 #endif

	248 RE2(const StringPiece& pattern);

	249 RE2(const StringPiece& pattern, const Options& option);

	250 ~RE2();

	251

	252 // Returns whether RE2 was created properly.

	253 bool ok() const { return error_code() == NoError; }

	254

	255 // The string specification for this RE2. E.g.

	256 // RE2 re("ab*c?d+");

	257 // re.pattern(); // "ab*c?d+"

	258 const string& pattern() const { return pattern_; }

	259

	260 // If RE2 could not be created properly, returns an error string.

	261 // Else returns the empty string.

	262 const string& error() const { return *error_; }

	263

	264 // If RE2 could not be created properly, returns an error code.

	265 // Else returns RE2::NoError (== 0).

	266 ErrorCode error_code() const { return error_code_; }

	267

	268 // If RE2 could not be created properly, returns the offending

	269 // portion of the regexp.

	270 const string& error_arg() const { return error_arg_; }

	271

	272 // Returns the program size, a very approximate measure of a regexp's "cost".

	273 // Larger numbers are more expensive than smaller numbers.

	274 int ProgramSize() const;

	275

	276 // Returns the underlying Regexp; not for general use.

	277 // Returns entire_regexp_ so that callers don't need

	278 // to know about prefix_ and prefix_foldcase_.

	279 re2::Regexp* Regexp() const { return entire_regexp_; }

	280

	281 /*** The useful part: the matching interface ***/

	282

	283 // Matches "text" against "pattern". If pointer arguments are

	284 // supplied, copies matched sub-patterns into them.

	285 //

	286 // You can pass in a "const char*" or a "string" for "text".

	287 // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".

	288 //

	289 // The provided pointer arguments can be pointers to any scalar numeric

	290 // type, or one of:

	291 // string (matched piece is copied to string)

	292 // StringPiece (StringPiece is mutated to point to matched piece)

	293 // T (where "bool T::ParseFrom(const char*, int)" exists)

	294 // (void*)NULL (the corresponding matched sub-pattern is not copied)

	295 //

	296 // Returns true iff all of the following conditions are satisfied:

	297 // a. "text" matches "pattern" exactly

	298 // b. The number of matched sub-patterns is >= number of supplied pointers

	299 // c. The "i"th argument has a suitable type for holding the

	300 // string captured as the "i"th sub-pattern. If you pass in

	301 // NULL for the "i"th argument, or pass fewer arguments than

	302 // number of sub-patterns, "i"th captured sub-pattern is

	303 // ignored.

	304 //

	305 // CAVEAT: An optional sub-pattern that does not exist in the

	306 // matched string is assigned the empty string. Therefore, the

	307 // following will return false (because the empty string is not a

	308 // valid number):

	309 // int number;

	310 // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);

	311 static bool FullMatchN(const StringPiece& text, const RE2& re,

	312 const Arg* const args[], int argc);

	313 static const VariadicFunction2<

	314 bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;

	315

	316 // Exactly like FullMatch(), except that "pattern" is allowed to match

	317 // a substring of "text".

	318 static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 arg s

	319 const Arg* const args[], int argc);

	320 static const VariadicFunction2<

	321 bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatc h;

	322

	323 // Like FullMatch() and PartialMatch(), except that pattern has to

	324 // match a prefix of "text", and "input" is advanced past the matched

	325 // text. Note: "input" is modified iff this routine returns true.

	326 static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args

	327 const Arg* const args[], int argc);

	328 static const VariadicFunction2<

	329 bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;

	330

	331 // Like Consume(..), but does not anchor the match at the beginning of the

	332 // string. That is, "pattern" need not start its match at the beginning of

	333 // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next

	334 // word in "s" and stores it in "word".

	335 static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,

	336 const Arg* const args[], int argc);

	337 static const VariadicFunction2<

	338 bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;

	339

	340 // Replace the first match of "pattern" in "str" with "rewrite".

	341 // Within "rewrite", backslash-escaped digits (\1 to \9) can be

	342 // used to insert text matching corresponding parenthesized group

	343 // from the pattern. \0 in "rewrite" refers to the entire matching

	344 // text. E.g.,

	345 //

	346 // string s = "yabba dabba doo";

	347 // CHECK(RE2::Replace(&s, "b+", "d"));

	348 //

	349 // will leave "s" containing "yada dabba doo"

	350 //

	351 // Returns true if the pattern matches and a replacement occurs,

	352 // false otherwise.

	353 static bool Replace(string *str,

	354 const RE2& pattern,

	355 const StringPiece& rewrite);

	356

	357 // Like Replace(), except replaces successive non-overlapping occurrences

	358 // of the pattern in the string with the rewrite. E.g.

	359 //

	360 // string s = "yabba dabba doo";

	361 // CHECK(RE2::GlobalReplace(&s, "b+", "d"));

	362 //

	363 // will leave "s" containing "yada dada doo"

	364 // Replacements are not subject to re-matching.

	365 //

	366 // Because GlobalReplace only replaces non-overlapping matches,

	367 // replacing "ana" within "banana" makes only one replacement, not two.

	368 //

	369 // Returns the number of replacements made.

	370 static int GlobalReplace(string *str,

	371 const RE2& pattern,

	372 const StringPiece& rewrite);

	373

	374 // Like Replace, except that if the pattern matches, "rewrite"

	375 // is copied into "out" with substitutions. The non-matching

	376 // portions of "text" are ignored.

	377 //

	378 // Returns true iff a match occurred and the extraction happened

	379 // successfully; if no match occurs, the string is left unaffected.

	380 static bool Extract(const StringPiece &text,

	381 const RE2& pattern,

	382 const StringPiece &rewrite,

	383 string *out);

	384

	385 // Escapes all potentially meaningful regexp characters in

	386 // 'unquoted'. The returned string, used as a regular expression,

	387 // will exactly match the original string. For example,

	388 // 1.5-2.0?

	389 // may become:

	390 // 1\.5\-2\.0\?

	391 static string QuoteMeta(const StringPiece& unquoted);

	392

	393 // Computes range for any strings matching regexp. The min and max can in

	394 // some cases be arbitrarily precise, so the caller gets to specify the

	395 // maximum desired length of string returned.

	396 //

	397 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any

	398 // string s that is an anchored match for this regexp satisfies

	399 // min <= s && s <= max.

	400 //

	401 // Note that PossibleMatchRange() will only consider the first copy of an

	402 // infinitely repeated element (i.e., any regexp element followed by a '*' or

	403 // '+' operator). Regexps with "{N}" constructions are not affected, as those

	404 // do not compile down to infinite repetitions.

	405 //

	406 // Returns true on success, false on error.

	407 bool PossibleMatchRange(string* min, string* max, int maxlen) const;

	408

	409 // Generic matching interface

	410

	411 // Type of match.

	412 enum Anchor {

	413 UNANCHORED, // No anchoring

	414 ANCHOR_START, // Anchor at start only

	415 ANCHOR_BOTH, // Anchor at start and end

	416 };

	417

	418 // Return the number of capturing subpatterns, or -1 if the

	419 // regexp wasn't valid on construction. The overall match ($0)

	420 // does not count: if the regexp is "(a)(b)", returns 2.

	421 int NumberOfCapturingGroups() const;

	422

	423

	424 // Return a map from names to capturing indices.

	425 // The map records the index of the leftmost group

	426 // with the given name.

	427 // Only valid until the re is deleted.

	428 const map<string, int>& NamedCapturingGroups() const;

	429

	430 // Return a map from capturing indices to names.

	431 // The map has no entries for unnamed groups.

	432 // Only valid until the re is deleted.

	433 const map<int, string>& CapturingGroupNames() const;

	434

	435 // General matching routine.

	436 // Match against text starting at offset startpos

	437 // and stopping the search at offset endpos.

	438 // Returns true if match found, false if not.

	439 // On a successful match, fills in match[] (up to nmatch entries)

	440 // with information about submatches.

	441 // I.e. matching RE2("(foo)\|(bar)baz") on "barbazbla" will return true,

	442 // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",

	443 // match[3] = NULL, ..., up to match[nmatch-1] = NULL.

	444 //

	445 // Don't ask for more match information than you will use:

	446 // runs much faster with nmatch == 1 than nmatch > 1, and

	447 // runs even faster if nmatch == 0.

	448 // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),

	449 // but will be handled correctly.

	450 //

	451 // Passing text == StringPiece(NULL, 0) will be handled like any other

	452 // empty string, but note that on return, it will not be possible to tell

	453 // whether submatch i matched the empty string or did not match:

	454 // either way, match[i] == NULL.

	455 bool Match(const StringPiece& text,

	456 int startpos,

	457 int endpos,

	458 Anchor anchor,

	459 StringPiece *match,

	460 int nmatch) const;

	461

	462 // Check that the given rewrite string is suitable for use with this

	463 // regular expression. It checks that:

	464 // * The regular expression has enough parenthesized subexpressions

	465 // to satisfy all of the \N tokens in rewrite

	466 // * The rewrite string doesn't have any syntax errors. E.g.,

	467 // '\' followed by anything other than a digit or '\'.

	468 // A true return value guarantees that Replace() and Extract() won't

	469 // fail because of a bad rewrite string.

	470 bool CheckRewriteString(const StringPiece& rewrite, string* error) const;

	471

	472 // Constructor options

	473 class Options {

	474 public:

	475 // The options are (defaults in parentheses):

	476 //

	477 // utf8 (true) text and pattern are UTF-8; otherwise Latin-1

	478 // posix_syntax (false) restrict regexps to POSIX egrep syntax

	479 // longest_match (false) search for longest match, not first match

	480 // log_errors (true) log syntax and execution errors to ERROR

	481 // max_mem (see below) approx. max memory footprint of RE2

	482 // literal (false) interpret string as literal, not regexp

	483 // never_nl (false) never match \n, even if it is in regexp

	484 // case_sensitive (true) match is case-sensitive (regexp can override

	485 // with (?i) unless in posix_syntax mode)

	486 //

	487 // The following options are only consulted when posix_syntax == true.

	488 // (When posix_syntax == false these features are always enabled and

	489 // cannot be turned off.)

	490 // perl_classes (false) allow Perl's \d \s \w \D \S \W

	491 // word_boundary (false) allow Perl's \b \B (word boundary and not)

	492 // one_line (false) ^ and $ only match beginning and end of text

	493 //

	494 // The max_mem option controls how much memory can be used

	495 // to hold the compiled form of the regexp (the Prog) and

	496 // its cached DFA graphs. Code Search placed limits on the number

	497 // of Prog instructions and DFA states: 10,000 for both.

	498 // In RE2, those limits would translate to about 240 KB per Prog

	499 // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a

	500 // better job of keeping them small than Code Search did).

	501 // Each RE2 has two Progs (one forward, one reverse), and each Prog

	502 // can have two DFAs (one first match, one longest match).

	503 // That makes 4 DFAs:

	504 //

	505 // forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches

	506 // if opt.longest_match() == false

	507 // forward, longest-match - used for all ANCHOR_BOTH searches,

	508 // and the other two kinds if

	509 // opt.longest_match() == true

	510 // reverse, first-match - never used

	511 // reverse, longest-match - used as second phase for unanchored searches

	512 //

	513 // The RE2 memory budget is statically divided between the two

	514 // Progs and then the DFAs: two thirds to the forward Prog

	515 // and one third to the reverse Prog. The forward Prog gives half

	516 // of what it has left over to each of its DFAs. The reverse Prog

	517 // gives it all to its longest-match DFA.

	518 //

	519 // Once a DFA fills its budget, it flushes its cache and starts over.

	520 // If this happens too often, RE2 falls back on the NFA implementation.

	521

	522 enum Encoding {

	523 EncodingUTF8 = 1,

	524 EncodingLatin1

	525 };

	526

	527 Options() :

	528 encoding_(EncodingUTF8),

	529 posix_syntax_(false),

	530 longest_match_(false),

	531 log_errors_(true),

	532 max_mem_(kDefaultMaxMem),

	533 literal_(false),

	534 never_nl_(false),

	535 case_sensitive_(true),

	536 perl_classes_(false),

	537 word_boundary_(false),

	538 one_line_(false) {

	539 }

	540

	541 Encoding encoding() const { return encoding_; }

	542 void set_encoding(Encoding encoding) { encoding_ = encoding; }

	543

	544 // Legacy interface to encoding.

	545 // TODO(rsc): Remove once clients have been converted.

	546 bool utf8() const { return encoding_ == EncodingUTF8; }

	547 void set_utf8(bool b) {

	548 if (b) {

	549 encoding_ = EncodingUTF8;

	550 } else {

	551 encoding_ = EncodingLatin1;

	552 }

	553 }

	554

	555 bool posix_syntax() const { return posix_syntax_; }

	556 void set_posix_syntax(bool b) { posix_syntax_ = b; }

	557

	558 bool longest_match() const { return longest_match_; }

	559 void set_longest_match(bool b) { longest_match_ = b; }

	560

	561 bool log_errors() const { return log_errors_; }

	562 void set_log_errors(bool b) { log_errors_ = b; }

	563

	564 int max_mem() const { return max_mem_; }

	565 void set_max_mem(int m) { max_mem_ = m; }

	566

	567 bool literal() const { return literal_; }

	568 void set_literal(bool b) { literal_ = b; }

	569

	570 bool never_nl() const { return never_nl_; }

	571 void set_never_nl(bool b) { never_nl_ = b; }

	572

	573 bool case_sensitive() const { return case_sensitive_; }

	574 void set_case_sensitive(bool b) { case_sensitive_ = b; }

	575

	576 bool perl_classes() const { return perl_classes_; }

	577 void set_perl_classes(bool b) { perl_classes_ = b; }

	578

	579 bool word_boundary() const { return word_boundary_; }

	580 void set_word_boundary(bool b) { word_boundary_ = b; }

	581

	582 bool one_line() const { return one_line_; }

	583 void set_one_line(bool b) { one_line_ = b; }

	584

	585 void Copy(const Options& src) {

	586 encoding_ = src.encoding_;

	587 posix_syntax_ = src.posix_syntax_;

	588 longest_match_ = src.longest_match_;

	589 log_errors_ = src.log_errors_;

	590 max_mem_ = src.max_mem_;

	591 literal_ = src.literal_;

	592 never_nl_ = src.never_nl_;

	593 case_sensitive_ = src.case_sensitive_;

	594 perl_classes_ = src.perl_classes_;

	595 word_boundary_ = src.word_boundary_;

	596 one_line_ = src.one_line_;

	597 }

	598

	599 int ParseFlags() const;

	600

	601 private:

	602 // Private constructor for defining constants like RE2::Latin1.

	603 friend class RE2;

	604 Options(Encoding encoding,

	605 bool posix_syntax,

	606 bool longest_match,

	607 bool log_errors) :

	608 encoding_(encoding),

	609 posix_syntax_(posix_syntax),

	610 longest_match_(longest_match),

	611 log_errors_(log_errors),

	612 max_mem_(kDefaultMaxMem),

	613 literal_(false),

	614 never_nl_(false),

	615 case_sensitive_(true),

	616 perl_classes_(false),

	617 word_boundary_(false),

	618 one_line_(false) {

	619 }

	620

	621 Encoding encoding_;

	622 bool posix_syntax_;

	623 bool longest_match_;

	624 bool log_errors_;

	625 int64_t max_mem_;

	626 bool literal_;

	627 bool never_nl_;

	628 bool case_sensitive_;

	629 bool perl_classes_;

	630 bool word_boundary_;

	631 bool one_line_;

	632

	633 //DISALLOW_EVIL_CONSTRUCTORS(Options);

	634 Options(const Options&);

	635 void operator=(const Options&);

	636 };

	637

	638 // Returns the options set in the constructor.

	639 const Options& options() const { return options_; };

	640

	641 // Argument converters; see below.

	642 static inline Arg CRadix(short* x);

	643 static inline Arg CRadix(unsigned short* x);

	644 static inline Arg CRadix(int* x);

	645 static inline Arg CRadix(unsigned int* x);

	646 static inline Arg CRadix(long* x);

	647 static inline Arg CRadix(unsigned long* x);

	648 static inline Arg CRadix(long long* x);

	649 static inline Arg CRadix(unsigned long long* x);

	650

	651 static inline Arg Hex(short* x);

	652 static inline Arg Hex(unsigned short* x);

	653 static inline Arg Hex(int* x);

	654 static inline Arg Hex(unsigned int* x);

	655 static inline Arg Hex(long* x);

	656 static inline Arg Hex(unsigned long* x);

	657 static inline Arg Hex(long long* x);

	658 static inline Arg Hex(unsigned long long* x);

	659

	660 static inline Arg Octal(short* x);

	661 static inline Arg Octal(unsigned short* x);

	662 static inline Arg Octal(int* x);

	663 static inline Arg Octal(unsigned int* x);

	664 static inline Arg Octal(long* x);

	665 static inline Arg Octal(unsigned long* x);

	666 static inline Arg Octal(long long* x);

	667 static inline Arg Octal(unsigned long long* x);

	668

	669 private:

	670 void Init(const StringPiece& pattern, const Options& options);

	671

	672 bool Rewrite(string *out,

	673 const StringPiece &rewrite,

	674 const StringPiece* vec,

	675 int veclen) const;

	676

	677 bool DoMatch(const StringPiece& text,

	678 Anchor anchor,

	679 int* consumed,

	680 const Arg* const args[],

	681 int n) const;

	682

	683 re2::Prog* ReverseProg() const;

	684

	685 mutable Mutex* mutex_;

	686 string pattern_; // string regular expression

	687 Options options_; // option flags

	688 string prefix_; // required prefix (before regexp_)

	689 bool prefix_foldcase_; // prefix is ASCII case-insensitive

	690 re2::Regexp* entire_regexp_; // parsed regular expression

	691 re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed

	692 re2::Prog* prog_; // compiled program for regexp

	693 mutable re2::Prog* rprog_; // reverse program for regexp

	694 bool is_one_pass_; // can use prog_->SearchOnePass?

	695 mutable const string* error_; // Error indicator

	696 // (or points to empty string)

	697 mutable ErrorCode error_code_; // Error code

	698 mutable string error_arg_; // Fragment of regexp showing error

	699 mutable int num_captures_; // Number of capturing groups

	700

	701 // Map from capture names to indices

	702 mutable const map<string, int>* named_groups_;

	703

	704 // Map from capture indices to names

	705 mutable const map<int, string>* group_names_;

	706

	707 //DISALLOW_EVIL_CONSTRUCTORS(RE2);

	708 RE2(const RE2&);

	709 void operator=(const RE2&);

	710 };

	711

	712 /*** Implementation details ***/

	713

	714 // Hex/Octal/Binary?

	715

	716 // Special class for parsing into objects that define a ParseFrom() method

	717 template <class T>

	718 class _RE2_MatchObject {

	719 public:

	720 static inline bool Parse(const char* str, int n, void* dest) {

	721 if (dest == NULL) return true;

	722 T* object = reinterpret_cast<T*>(dest);

	723 return object->ParseFrom(str, n);

	724 }

	725 };

	726

	727 class RE2::Arg {

	728 public:

	729 // Empty constructor so we can declare arrays of RE2::Arg

	730 Arg();

	731

	732 // Constructor specially designed for NULL arguments

	733 Arg(void*);

	734

	735 typedef bool (Parser)(const char str, int n, void* dest);

	736

	737 // Type-specific parsers

	738 #define MAKE_PARSER(type,name) \

	739 Arg(type* p) : arg_(p), parser_(name) { } \

	740 Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \

	741

	742

	743 MAKE_PARSER(char, parse_char);

	744 MAKE_PARSER(signed char, parse_char);

	745 MAKE_PARSER(unsigned char, parse_uchar);

	746 MAKE_PARSER(short, parse_short);

	747 MAKE_PARSER(unsigned short, parse_ushort);

	748 MAKE_PARSER(int, parse_int);

	749 MAKE_PARSER(unsigned int, parse_uint);

	750 MAKE_PARSER(long, parse_long);

	751 MAKE_PARSER(unsigned long, parse_ulong);

	752 MAKE_PARSER(long long, parse_longlong);

	753 MAKE_PARSER(unsigned long long, parse_ulonglong);

	754 MAKE_PARSER(float, parse_float);

	755 MAKE_PARSER(double, parse_double);

	756 MAKE_PARSER(string, parse_string);

	757 MAKE_PARSER(StringPiece, parse_stringpiece);

	758

	759 #undef MAKE_PARSER

	760

	761 // Generic constructor

	762 template <class T> Arg(T*, Parser parser);

	763 // Generic constructor template

	764 template <class T> Arg(T* p)

	765 : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {

	766 }

	767

	768 // Parse the data

	769 bool Parse(const char* str, int n) const;

	770

	771 private:

	772 void* arg_;

	773 Parser parser_;

	774

	775 static bool parse_null (const char* str, int n, void* dest);

	776 static bool parse_char (const char* str, int n, void* dest);

	777 static bool parse_uchar (const char* str, int n, void* dest);

	778 static bool parse_float (const char* str, int n, void* dest);

	779 static bool parse_double (const char* str, int n, void* dest);

	780 static bool parse_string (const char* str, int n, void* dest);

	781 static bool parse_stringpiece (const char* str, int n, void* dest);

	782

	783 #define DECLARE_INTEGER_PARSER(name) \

	784 private: \

	785 static bool parse_ ## name(const char* str, int n, void* dest); \

	786 static bool parse_ ## name ## _radix( \

	787 const char* str, int n, void* dest, int radix); \

	788 public: \

	789 static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \

	790 static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \

	791 static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)

	792

	793 DECLARE_INTEGER_PARSER(short);

	794 DECLARE_INTEGER_PARSER(ushort);

	795 DECLARE_INTEGER_PARSER(int);

	796 DECLARE_INTEGER_PARSER(uint);

	797 DECLARE_INTEGER_PARSER(long);

	798 DECLARE_INTEGER_PARSER(ulong);

	799 DECLARE_INTEGER_PARSER(longlong);

	800 DECLARE_INTEGER_PARSER(ulonglong);

	801

	802 #undef DECLARE_INTEGER_PARSER

	803 };

	804

	805 inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }

	806 inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }

	807

	808 inline bool RE2::Arg::Parse(const char* str, int n) const {

	809 return (*parser_)(str, n, arg_);

	810 }

	811

	812 // This part of the parser, appropriate only for ints, deals with bases

	813 #define MAKE_INTEGER_PARSER(type, name) \

	814 inline RE2::Arg RE2::Hex(type* ptr) { \

	815 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \

	816 inline RE2::Arg RE2::Octal(type* ptr) { \

	817 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \

	818 inline RE2::Arg RE2::CRadix(type* ptr) { \

	819 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }

	820

	821 MAKE_INTEGER_PARSER(short, short);

	822 MAKE_INTEGER_PARSER(unsigned short, ushort);

	823 MAKE_INTEGER_PARSER(int, int);

	824 MAKE_INTEGER_PARSER(unsigned int, uint);

	825 MAKE_INTEGER_PARSER(long, long);

	826 MAKE_INTEGER_PARSER(unsigned long, ulong);

	827 MAKE_INTEGER_PARSER(long long, longlong);

	828 MAKE_INTEGER_PARSER(unsigned long long, ulonglong);

	829

	830 #undef MAKE_INTEGER_PARSER

	831

	832 } // namespace re2

	833

	834 using re2::RE2;

	835

	836 #endif /* RE2_RE2_H */

OLD	NEW

« no previous file with comments | « third_party/re2/re2/prog.cc ('k') | third_party/re2/re2/re2.cc » ('j') | third_party/re2/util/valgrind.cc » ('J')