third_party/re2/re2/re2.h - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/re2.h

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 #ifndef RE2_RE2_H

6 #define RE2_RE2_H

7

8 // C++ interface to the re2 regular-expression library.

9 // RE2 supports Perl-style regular expressions (with extensions like

10 // \d, \w, \s, ...).

11 //

12 // -----------------------------------------------------------------------

13 // REGEXP SYNTAX:

14 //

15 // This module uses the re2 library and hence supports

16 // its syntax for regular expressions, which is similar to Perl's with

17 // some of the more complicated things thrown away. In particular,

18 // backreferences and generalized assertions are not available, nor is \Z.

19 //

20 // See https://github.com/google/re2/wiki/Syntax for the syntax

21 // supported by RE2, and a comparison with PCRE and PERL regexps.

22 //

23 // For those not familiar with Perl's regular expressions,

24 // here are some examples of the most commonly used extensions:

25 //

26 // "hello (\\w+) world" -- \w matches a "word" character

27 // "version (\\d+)" -- \d matches a digit

28 // "hello\\s+world" -- \s matches any whitespace character

29 // "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary

30 // "(?i)hello" -- (?i) turns on case-insensitive matching

31 // "/\\(.?)\\/" -- .? matches . minimum no. of times possible

32 //

33 // -----------------------------------------------------------------------

34 // MATCHING INTERFACE:

35 //

36 // The "FullMatch" operation checks that supplied text matches a

37 // supplied pattern exactly.

38 //

39 // Example: successful match

40 // CHECK(RE2::FullMatch("hello", "h.*o"));

41 //

42 // Example: unsuccessful match (requires full match):

43 // CHECK(!RE2::FullMatch("hello", "e"));

44 //

45 // -----------------------------------------------------------------------

46 // UTF-8 AND THE MATCHING INTERFACE:

47 //

48 // By default, the pattern and input text are interpreted as UTF-8.

49 // The RE2::Latin1 option causes them to be interpreted as Latin-1.

50 //

51 // Example:

52 // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));

53 // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));

54 //

55 // -----------------------------------------------------------------------

56 // MATCHING WITH SUB-STRING EXTRACTION:

57 //

58 // You can supply extra pointer arguments to extract matched subpieces.

59 //

60 // Example: extracts "ruby" into "s" and 1234 into "i"

61 // int i;

62 // string s;

63 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));

64 //

65 // Example: fails because string cannot be stored in integer

66 // CHECK(!RE2::FullMatch("ruby", "(.*)", &i));

67 //

68 // Example: fails because there aren't enough sub-patterns:

69 // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));

70 //

71 // Example: does not try to extract any extra sub-patterns

72 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));

73 //

74 // Example: does not try to extract into NULL

75 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));

76 //

77 // Example: integer overflow causes failure

78 // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));

79 //

80 // NOTE(rsc): Asking for substrings slows successful matches quite a bit.

81 // This may get a little faster in the future, but right now is slower

82 // than PCRE. On the other hand, failed matches run very fast (faster

83 // than PCRE), as do matches without substring extraction.

84 //

85 // -----------------------------------------------------------------------

86 // PARTIAL MATCHES

87 //

88 // You can use the "PartialMatch" operation when you want the pattern

89 // to match any substring of the text.

90 //

91 // Example: simple search for a string:

92 // CHECK(RE2::PartialMatch("hello", "ell"));

93 //

94 // Example: find first number in a string

95 // int number;

96 // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));

97 // CHECK_EQ(number, 100);

98 //

99 // -----------------------------------------------------------------------

100 // PRE-COMPILED REGULAR EXPRESSIONS

101 //

102 // RE2 makes it easy to use any string as a regular expression, without

103 // requiring a separate compilation step.

104 //

105 // If speed is of the essence, you can create a pre-compiled "RE2"

106 // object from the pattern and use it multiple times. If you do so,

107 // you can typically parse text faster than with sscanf.

108 //

109 // Example: precompile pattern for faster matching:

110 // RE2 pattern("h.*o");

111 // while (ReadLine(&str)) {

112 // if (RE2::FullMatch(str, pattern)) ...;

113 // }

114 //

115 // -----------------------------------------------------------------------

116 // SCANNING TEXT INCREMENTALLY

117 //

118 // The "Consume" operation may be useful if you want to repeatedly

119 // match regular expressions at the front of a string and skip over

120 // them as they match. This requires use of the "StringPiece" type,

121 // which represents a sub-range of a real string.

122 //

123 // Example: read lines of the form "var = value" from a string.

124 // string contents = ...; // Fill string somehow

125 // StringPiece input(contents); // Wrap a StringPiece around it

126 //

127 // string var;

128 // int value;

129 // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {

130 // ...;

131 // }

132 //

133 // Each successful call to "Consume" will set "var/value", and also

134 // advance "input" so it points past the matched text. Note that if the

135 // regular expression matches an empty string, input will advance

136 // by 0 bytes. If the regular expression being used might match

137 // an empty string, the loop body must check for this case and either

138 // advance the string or break out of the loop.

139 //

140 // The "FindAndConsume" operation is similar to "Consume" but does not

141 // anchor your match at the beginning of the string. For example, you

142 // could extract all words from a string by repeatedly calling

143 // RE2::FindAndConsume(&input, "(\\w+)", &word)

144 //

145 // -----------------------------------------------------------------------

146 // USING VARIABLE NUMBER OF ARGUMENTS

147 //

148 // The above operations require you to know the number of arguments

149 // when you write the code. This is not always possible or easy (for

150 // example, the regular expression may be calculated at run time).

151 // You can use the "N" version of the operations when the number of

152 // match arguments are determined at run time.

153 //

154 // Example:

155 // const RE2::Arg* args[10];

156 // int n;

157 // // ... populate args with pointers to RE2::Arg values ...

158 // // ... set n to the number of RE2::Arg objects ...

159 // bool match = RE2::FullMatchN(input, pattern, args, n);

160 //

161 // The last statement is equivalent to

162 //

163 // bool match = RE2::FullMatch(input, pattern,

164 // args[0], args[1], ..., *args[n - 1]);

165 //

166 // -----------------------------------------------------------------------

167 // PARSING HEX/OCTAL/C-RADIX NUMBERS

168 //

169 // By default, if you pass a pointer to a numeric value, the

170 // corresponding text is interpreted as a base-10 number. You can

171 // instead wrap the pointer with a call to one of the operators Hex(),

172 // Octal(), or CRadix() to interpret the text in another base. The

173 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)

174 // prefixes, but defaults to base-10.

175 //

176 // Example:

177 // int a, b, c, d;

178 // CHECK(RE2::FullMatch("100 40 0100 0x40", "(.) (.) (.) (.)",

179 // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));

180 // will leave 64 in a, b, c, and d.

181

182 #include <stdint.h>

183 #include <map>

184 #include <string>

185 #include "re2/stringpiece.h"

186 #include "re2/variadic_function.h"

187

188 #ifndef RE2_HAVE_LONGLONG

189 #define RE2_HAVE_LONGLONG 1

190 #endif

191

192 namespace re2 {

193

194 using std::string;

195 using std::map;

196 class Mutex;

197 class Prog;

198 class Regexp;

199

200 // The following enum should be used only as a constructor argument to indicate

201 // that the variable has static storage class, and that the constructor should

202 // do nothing to its state. It indicates to the reader that it is legal to

203 // declare a static instance of the class, provided the constructor is given

204 // the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a

205 // static variable that has a constructor or a destructor because invocation

206 // order is undefined. However, IF the type can be initialized by filling with

207 // zeroes (which the loader does for static variables), AND the type's

208 // destructor does nothing to the storage, then a constructor for static

209 // initialization can be declared as

210 // explicit MyClass(LinkerInitialized x) {}

211 // and invoked as

212 // static MyClass my_variable_name(LINKER_INITIALIZED);

213 enum LinkerInitialized { LINKER_INITIALIZED };

214

215 // Interface for regular expression matching. Also corresponds to a

216 // pre-compiled regular expression. An "RE2" object is safe for

217 // concurrent use by multiple threads.

218 class RE2 {

219 public:

220 // We convert user-passed pointers into special Arg objects

221 class Arg;

222 class Options;

223

224 // Defined in set.h.

225 class Set;

226

227 enum ErrorCode {

228 NoError = 0,

229

230 // Unexpected error

231 ErrorInternal,

232

233 // Parse errors

234 ErrorBadEscape, // bad escape sequence

235 ErrorBadCharClass, // bad character class

236 ErrorBadCharRange, // bad character class range

237 ErrorMissingBracket, // missing closing ]

238 ErrorMissingParen, // missing closing )

239 ErrorTrailingBackslash, // trailing \ at end of regexp

240 ErrorRepeatArgument, // repeat argument missing, e.g. "*"

241 ErrorRepeatSize, // bad repetition argument

242 ErrorRepeatOp, // bad repetition operator

243 ErrorBadPerlOp, // bad perl operator

244 ErrorBadUTF8, // invalid UTF-8 in regexp

245 ErrorBadNamedCapture, // bad named capture group

246 ErrorPatternTooLarge // pattern too large (compile failed)

247 };

248

249 // Predefined common options.

250 // If you need more complicated things, instantiate

251 // an Option class, possibly passing one of these to

252 // the Option constructor, change the settings, and pass that

253 // Option class to the RE2 constructor.

254 enum CannedOptions {

255 DefaultOptions = 0,

256 Latin1, // treat input as Latin-1 (default UTF-8)

257 POSIX, // POSIX syntax, leftmost-longest match

258 Quiet // do not log about regexp parse errors

259 };

260

261 // Need to have the const char* and const string& forms for implicit

262 // conversions when passing string literals to FullMatch and PartialMatch.

263 // Otherwise the StringPiece form would be sufficient.

264 #ifndef SWIG

265 RE2(const char* pattern);

266 RE2(const string& pattern);

267 #endif

268 RE2(const StringPiece& pattern);

269 RE2(const StringPiece& pattern, const Options& option);

270 ~RE2();

271

272 // Returns whether RE2 was created properly.

273 bool ok() const { return error_code() == NoError; }

274

275 // The string specification for this RE2. E.g.

276 // RE2 re("ab*c?d+");

277 // re.pattern(); // "ab*c?d+"

278 const string& pattern() const { return pattern_; }

279

280 // If RE2 could not be created properly, returns an error string.

281 // Else returns the empty string.

282 const string& error() const { return *error_; }

283

284 // If RE2 could not be created properly, returns an error code.

285 // Else returns RE2::NoError (== 0).

286 ErrorCode error_code() const { return error_code_; }

287

288 // If RE2 could not be created properly, returns the offending

289 // portion of the regexp.

290 const string& error_arg() const { return error_arg_; }

291

292 // Returns the program size, a very approximate measure of a regexp's "cost".

293 // Larger numbers are more expensive than smaller numbers.

294 int ProgramSize() const;

295

296 // EXPERIMENTAL! SUBJECT TO CHANGE!

297 // Outputs the program fanout as a histogram bucketed by powers of 2.

298 // Returns the number of the largest non-empty bucket.

299 int ProgramFanout(map<int, int>* histogram) const;

300

301 // Returns the underlying Regexp; not for general use.

302 // Returns entire_regexp_ so that callers don't need

303 // to know about prefix_ and prefix_foldcase_.

304 re2::Regexp* Regexp() const { return entire_regexp_; }

305

306 /*** The useful part: the matching interface ***/

307

308 // Matches "text" against "pattern". If pointer arguments are

309 // supplied, copies matched sub-patterns into them.

310 //

311 // You can pass in a "const char*" or a "string" for "text".

312 // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".

313 //

314 // The provided pointer arguments can be pointers to any scalar numeric

315 // type, or one of:

316 // string (matched piece is copied to string)

317 // StringPiece (StringPiece is mutated to point to matched piece)

318 // T (where "bool T::ParseFrom(const char*, int)" exists)

319 // (void*)NULL (the corresponding matched sub-pattern is not copied)

320 //

321 // Returns true iff all of the following conditions are satisfied:

322 // a. "text" matches "pattern" exactly

323 // b. The number of matched sub-patterns is >= number of supplied pointers

324 // c. The "i"th argument has a suitable type for holding the

325 // string captured as the "i"th sub-pattern. If you pass in

326 // NULL for the "i"th argument, or pass fewer arguments than

327 // number of sub-patterns, "i"th captured sub-pattern is

328 // ignored.

329 //

330 // CAVEAT: An optional sub-pattern that does not exist in the

331 // matched string is assigned the empty string. Therefore, the

332 // following will return false (because the empty string is not a

333 // valid number):

334 // int number;

335 // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);

336 static bool FullMatchN(const StringPiece& text, const RE2& re,

337 const Arg* const args[], int argc);

338 static const VariadicFunction2<

339 bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;

340

341 // Exactly like FullMatch(), except that "pattern" is allowed to match

342 // a substring of "text".

343 static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 arg s

344 const Arg* const args[], int argc);

345 static const VariadicFunction2<

346 bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatc h;

347

348 // Like FullMatch() and PartialMatch(), except that pattern has to

349 // match a prefix of "text", and "input" is advanced past the matched

350 // text. Note: "input" is modified iff this routine returns true.

351 static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args

352 const Arg* const args[], int argc);

353 static const VariadicFunction2<

354 bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;

355

356 // Like Consume(..), but does not anchor the match at the beginning of the

357 // string. That is, "pattern" need not start its match at the beginning of

358 // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next

359 // word in "s" and stores it in "word".

360 static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,

361 const Arg* const args[], int argc);

362 static const VariadicFunction2<

363 bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;

364

365 // Replace the first match of "pattern" in "str" with "rewrite".

366 // Within "rewrite", backslash-escaped digits (\1 to \9) can be

367 // used to insert text matching corresponding parenthesized group

368 // from the pattern. \0 in "rewrite" refers to the entire matching

369 // text. E.g.,

370 //

371 // string s = "yabba dabba doo";

372 // CHECK(RE2::Replace(&s, "b+", "d"));

373 //

374 // will leave "s" containing "yada dabba doo"

375 //

376 // Returns true if the pattern matches and a replacement occurs,

377 // false otherwise.

378 static bool Replace(string *str,

379 const RE2& pattern,

380 const StringPiece& rewrite);

381

382 // Like Replace(), except replaces successive non-overlapping occurrences

383 // of the pattern in the string with the rewrite. E.g.

384 //

385 // string s = "yabba dabba doo";

386 // CHECK(RE2::GlobalReplace(&s, "b+", "d"));

387 //

388 // will leave "s" containing "yada dada doo"

389 // Replacements are not subject to re-matching.

390 //

391 // Because GlobalReplace only replaces non-overlapping matches,

392 // replacing "ana" within "banana" makes only one replacement, not two.

393 //

394 // Returns the number of replacements made.

395 static int GlobalReplace(string *str,

396 const RE2& pattern,

397 const StringPiece& rewrite);

398

399 // Like Replace, except that if the pattern matches, "rewrite"

400 // is copied into "out" with substitutions. The non-matching

401 // portions of "text" are ignored.

402 //

403 // Returns true iff a match occurred and the extraction happened

404 // successfully; if no match occurs, the string is left unaffected.

405 //

406 // REQUIRES: "text" must not alias any part of "*out".

407 static bool Extract(const StringPiece &text,

408 const RE2& pattern,

409 const StringPiece &rewrite,

410 string *out);

411

412 // Escapes all potentially meaningful regexp characters in

413 // 'unquoted'. The returned string, used as a regular expression,

414 // will exactly match the original string. For example,

415 // 1.5-2.0?

416 // may become:

417 // 1\.5\-2\.0\?

418 static string QuoteMeta(const StringPiece& unquoted);

419

420 // Computes range for any strings matching regexp. The min and max can in

421 // some cases be arbitrarily precise, so the caller gets to specify the

422 // maximum desired length of string returned.

423 //

424 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any

425 // string s that is an anchored match for this regexp satisfies

426 // min <= s && s <= max.

427 //

428 // Note that PossibleMatchRange() will only consider the first copy of an

429 // infinitely repeated element (i.e., any regexp element followed by a '*' or

430 // '+' operator). Regexps with "{N}" constructions are not affected, as those

431 // do not compile down to infinite repetitions.

432 //

433 // Returns true on success, false on error.

434 bool PossibleMatchRange(string* min, string* max, int maxlen) const;

435

436 // Generic matching interface

437

438 // Type of match.

439 enum Anchor {

440 UNANCHORED, // No anchoring

441 ANCHOR_START, // Anchor at start only

442 ANCHOR_BOTH // Anchor at start and end

443 };

444

445 // Return the number of capturing subpatterns, or -1 if the

446 // regexp wasn't valid on construction. The overall match ($0)

447 // does not count: if the regexp is "(a)(b)", returns 2.

448 int NumberOfCapturingGroups() const;

449

450 // Return a map from names to capturing indices.

451 // The map records the index of the leftmost group

452 // with the given name.

453 // Only valid until the re is deleted.

454 const map<string, int>& NamedCapturingGroups() const;

455

456 // Return a map from capturing indices to names.

457 // The map has no entries for unnamed groups.

458 // Only valid until the re is deleted.

459 const map<int, string>& CapturingGroupNames() const;

460

461 // General matching routine.

462 // Match against text starting at offset startpos

463 // and stopping the search at offset endpos.

464 // Returns true if match found, false if not.

465 // On a successful match, fills in match[] (up to nmatch entries)

466 // with information about submatches.

467 // I.e. matching RE2("(foo)\|(bar)baz") on "barbazbla" will return true,

468 // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",

469 // match[3] = NULL, ..., up to match[nmatch-1] = NULL.

470 //

471 // Don't ask for more match information than you will use:

472 // runs much faster with nmatch == 1 than nmatch > 1, and

473 // runs even faster if nmatch == 0.

474 // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),

475 // but will be handled correctly.

476 //

477 // Passing text == StringPiece(NULL, 0) will be handled like any other

478 // empty string, but note that on return, it will not be possible to tell

479 // whether submatch i matched the empty string or did not match:

480 // either way, match[i] == NULL.

481 bool Match(const StringPiece& text,

482 int startpos,

483 int endpos,

484 Anchor anchor,

485 StringPiece *match,

486 int nmatch) const;

487

488 // Check that the given rewrite string is suitable for use with this

489 // regular expression. It checks that:

490 // * The regular expression has enough parenthesized subexpressions

491 // to satisfy all of the \N tokens in rewrite

492 // * The rewrite string doesn't have any syntax errors. E.g.,

493 // '\' followed by anything other than a digit or '\'.

494 // A true return value guarantees that Replace() and Extract() won't

495 // fail because of a bad rewrite string.

496 bool CheckRewriteString(const StringPiece& rewrite, string* error) const;

497

498 // Returns the maximum submatch needed for the rewrite to be done by

499 // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.

500 static int MaxSubmatch(const StringPiece& rewrite);

501

502 // Append the "rewrite" string, with backslash subsitutions from "vec",

503 // to string "out".

504 // Returns true on success. This method can fail because of a malformed

505 // rewrite string. CheckRewriteString guarantees that the rewrite will

506 // be sucessful.

507 bool Rewrite(string *out,

508 const StringPiece &rewrite,

509 const StringPiece* vec,

510 int veclen) const;

511

512 // Constructor options

513 class Options {

514 public:

515 // The options are (defaults in parentheses):

516 //

517 // utf8 (true) text and pattern are UTF-8; otherwise Latin-1

518 // posix_syntax (false) restrict regexps to POSIX egrep syntax

519 // longest_match (false) search for longest match, not first match

520 // log_errors (true) log syntax and execution errors to ERROR

521 // max_mem (see below) approx. max memory footprint of RE2

522 // literal (false) interpret string as literal, not regexp

523 // never_nl (false) never match \n, even if it is in regexp

524 // dot_nl (false) dot matches everything including new line

525 // never_capture (false) parse all parens as non-capturing

526 // case_sensitive (true) match is case-sensitive (regexp can override

527 // with (?i) unless in posix_syntax mode)

528 //

529 // The following options are only consulted when posix_syntax == true.

530 // (When posix_syntax == false these features are always enabled and

531 // cannot be turned off.)

532 // perl_classes (false) allow Perl's \d \s \w \D \S \W

533 // word_boundary (false) allow Perl's \b \B (word boundary and not)

534 // one_line (false) ^ and $ only match beginning and end of text

535 //

536 // The max_mem option controls how much memory can be used

537 // to hold the compiled form of the regexp (the Prog) and

538 // its cached DFA graphs. Code Search placed limits on the number

539 // of Prog instructions and DFA states: 10,000 for both.

540 // In RE2, those limits would translate to about 240 KB per Prog

541 // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a

542 // better job of keeping them small than Code Search did).

543 // Each RE2 has two Progs (one forward, one reverse), and each Prog

544 // can have two DFAs (one first match, one longest match).

545 // That makes 4 DFAs:

546 //

547 // forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches

548 // if opt.longest_match() == false

549 // forward, longest-match - used for all ANCHOR_BOTH searches,

550 // and the other two kinds if

551 // opt.longest_match() == true

552 // reverse, first-match - never used

553 // reverse, longest-match - used as second phase for unanchored searches

554 //

555 // The RE2 memory budget is statically divided between the two

556 // Progs and then the DFAs: two thirds to the forward Prog

557 // and one third to the reverse Prog. The forward Prog gives half

558 // of what it has left over to each of its DFAs. The reverse Prog

559 // gives it all to its longest-match DFA.

560 //

561 // Once a DFA fills its budget, it flushes its cache and starts over.

562 // If this happens too often, RE2 falls back on the NFA implementation.

563

564 // For now, make the default budget something close to Code Search.

565 static const int kDefaultMaxMem = 8<<20;

566

567 enum Encoding {

568 EncodingUTF8 = 1,

569 EncodingLatin1

570 };

571

572 Options() :

573 encoding_(EncodingUTF8),

574 posix_syntax_(false),

575 longest_match_(false),

576 log_errors_(true),

577 max_mem_(kDefaultMaxMem),

578 literal_(false),

579 never_nl_(false),

580 dot_nl_(false),

581 never_capture_(false),

582 case_sensitive_(true),

583 perl_classes_(false),

584 word_boundary_(false),

585 one_line_(false) {

586 }

587

588 /implicit/ Options(CannedOptions);

589

590 Encoding encoding() const { return encoding_; }

591 void set_encoding(Encoding encoding) { encoding_ = encoding; }

592

593 // Legacy interface to encoding.

594 // TODO(rsc): Remove once clients have been converted.

595 bool utf8() const { return encoding_ == EncodingUTF8; }

596 void set_utf8(bool b) {

597 if (b) {

598 encoding_ = EncodingUTF8;

599 } else {

600 encoding_ = EncodingLatin1;

601 }

602 }

603

604 bool posix_syntax() const { return posix_syntax_; }

605 void set_posix_syntax(bool b) { posix_syntax_ = b; }

606

607 bool longest_match() const { return longest_match_; }

608 void set_longest_match(bool b) { longest_match_ = b; }

609

610 bool log_errors() const { return log_errors_; }

611 void set_log_errors(bool b) { log_errors_ = b; }

612

613 int64_t max_mem() const { return max_mem_; }

614 void set_max_mem(int64_t m) { max_mem_ = m; }

615

616 bool literal() const { return literal_; }

617 void set_literal(bool b) { literal_ = b; }

618

619 bool never_nl() const { return never_nl_; }

620 void set_never_nl(bool b) { never_nl_ = b; }

621

622 bool dot_nl() const { return dot_nl_; }

623 void set_dot_nl(bool b) { dot_nl_ = b; }

624

625 bool never_capture() const { return never_capture_; }

626 void set_never_capture(bool b) { never_capture_ = b; }

627

628 bool case_sensitive() const { return case_sensitive_; }

629 void set_case_sensitive(bool b) { case_sensitive_ = b; }

630

631 bool perl_classes() const { return perl_classes_; }

632 void set_perl_classes(bool b) { perl_classes_ = b; }

633

634 bool word_boundary() const { return word_boundary_; }

635 void set_word_boundary(bool b) { word_boundary_ = b; }

636

637 bool one_line() const { return one_line_; }

638 void set_one_line(bool b) { one_line_ = b; }

639

640 void Copy(const Options& src) {

641 encoding_ = src.encoding_;

642 posix_syntax_ = src.posix_syntax_;

643 longest_match_ = src.longest_match_;

644 log_errors_ = src.log_errors_;

645 max_mem_ = src.max_mem_;

646 literal_ = src.literal_;

647 never_nl_ = src.never_nl_;

648 dot_nl_ = src.dot_nl_;

649 never_capture_ = src.never_capture_;

650 case_sensitive_ = src.case_sensitive_;

651 perl_classes_ = src.perl_classes_;

652 word_boundary_ = src.word_boundary_;

653 one_line_ = src.one_line_;

654 }

655

656 int ParseFlags() const;

657

658 private:

659 Encoding encoding_;

660 bool posix_syntax_;

661 bool longest_match_;

662 bool log_errors_;

663 int64_t max_mem_;

664 bool literal_;

665 bool never_nl_;

666 bool dot_nl_;

667 bool never_capture_;

668 bool case_sensitive_;

669 bool perl_classes_;

670 bool word_boundary_;

671 bool one_line_;

672

673 //DISALLOW_COPY_AND_ASSIGN(Options);

674 Options(const Options&);

675 void operator=(const Options&);

676 };

677

678 // Returns the options set in the constructor.

679 const Options& options() const { return options_; };

680

681 // Argument converters; see below.

682 static inline Arg CRadix(short* x);

683 static inline Arg CRadix(unsigned short* x);

684 static inline Arg CRadix(int* x);

685 static inline Arg CRadix(unsigned int* x);

686 static inline Arg CRadix(long* x);

687 static inline Arg CRadix(unsigned long* x);

688 #if RE2_HAVE_LONGLONG

689 static inline Arg CRadix(long long* x);

690 static inline Arg CRadix(unsigned long long* x);

691 #endif

692

693 static inline Arg Hex(short* x);

694 static inline Arg Hex(unsigned short* x);

695 static inline Arg Hex(int* x);

696 static inline Arg Hex(unsigned int* x);

697 static inline Arg Hex(long* x);

698 static inline Arg Hex(unsigned long* x);

699 #if RE2_HAVE_LONGLONG

700 static inline Arg Hex(long long* x);

701 static inline Arg Hex(unsigned long long* x);

702 #endif

703

704 static inline Arg Octal(short* x);

705 static inline Arg Octal(unsigned short* x);

706 static inline Arg Octal(int* x);

707 static inline Arg Octal(unsigned int* x);

708 static inline Arg Octal(long* x);

709 static inline Arg Octal(unsigned long* x);

710 #if RE2_HAVE_LONGLONG

711 static inline Arg Octal(long long* x);

712 static inline Arg Octal(unsigned long long* x);

713 #endif

714

715 private:

716 void Init(const StringPiece& pattern, const Options& options);

717

718 bool DoMatch(const StringPiece& text,

719 Anchor anchor,

720 int* consumed,

721 const Arg* const args[],

722 int n) const;

723

724 re2::Prog* ReverseProg() const;

725

726 mutable Mutex* mutex_;

727 string pattern_; // string regular expression

728 Options options_; // option flags

729 string prefix_; // required prefix (before regexp_)

730 bool prefix_foldcase_; // prefix is ASCII case-insensitive

731 re2::Regexp* entire_regexp_; // parsed regular expression

732 re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed

733 re2::Prog* prog_; // compiled program for regexp

734 mutable re2::Prog* rprog_; // reverse program for regexp

735 bool is_one_pass_; // can use prog_->SearchOnePass?

736 mutable const string* error_; // Error indicator

737 // (or points to empty string)

738 mutable ErrorCode error_code_; // Error code

739 mutable string error_arg_; // Fragment of regexp showing error

740 mutable int num_captures_; // Number of capturing groups

741

742 // Map from capture names to indices

743 mutable const map<string, int>* named_groups_;

744

745 // Map from capture indices to names

746 mutable const map<int, string>* group_names_;

747

748 //DISALLOW_COPY_AND_ASSIGN(RE2);

749 RE2(const RE2&);

750 void operator=(const RE2&);

751 };

752

753 /*** Implementation details ***/

754

755 // Hex/Octal/Binary?

756

757 // Special class for parsing into objects that define a ParseFrom() method

758 template <class T>

759 class _RE2_MatchObject {

760 public:

761 static inline bool Parse(const char* str, int n, void* dest) {

762 if (dest == NULL) return true;

763 T* object = reinterpret_cast<T*>(dest);

764 return object->ParseFrom(str, n);

765 }

766 };

767

768 class RE2::Arg {

769 public:

770 // Empty constructor so we can declare arrays of RE2::Arg

771 Arg();

772

773 // Constructor specially designed for NULL arguments

774 Arg(void*);

775

776 typedef bool (Parser)(const char str, int n, void* dest);

777

778 // Type-specific parsers

779 #define MAKE_PARSER(type,name) \

780 Arg(type* p) : arg_(p), parser_(name) { } \

781 Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \

782

783

784 MAKE_PARSER(char, parse_char);

785 MAKE_PARSER(signed char, parse_char);

786 MAKE_PARSER(unsigned char, parse_uchar);

787 MAKE_PARSER(short, parse_short);

788 MAKE_PARSER(unsigned short, parse_ushort);

789 MAKE_PARSER(int, parse_int);

790 MAKE_PARSER(unsigned int, parse_uint);

791 MAKE_PARSER(long, parse_long);

792 MAKE_PARSER(unsigned long, parse_ulong);

793 #if RE2_HAVE_LONGLONG

794 MAKE_PARSER(long long, parse_longlong);

795 MAKE_PARSER(unsigned long long, parse_ulonglong);

796 #endif

797 MAKE_PARSER(float, parse_float);

798 MAKE_PARSER(double, parse_double);

799 MAKE_PARSER(string, parse_string);

800 MAKE_PARSER(StringPiece, parse_stringpiece);

801

802 #undef MAKE_PARSER

803

804 // Generic constructor templates

805 template <class T> Arg(T* p)

806 : arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }

807 template <class T> Arg(T* p, Parser parser)

808 : arg_(p), parser_(parser) { }

809

810 // Parse the data

811 bool Parse(const char* str, int n) const;

812

813 private:

814 void* arg_;

815 Parser parser_;

816

817 static bool parse_null (const char* str, int n, void* dest);

818 static bool parse_char (const char* str, int n, void* dest);

819 static bool parse_uchar (const char* str, int n, void* dest);

820 static bool parse_float (const char* str, int n, void* dest);

821 static bool parse_double (const char* str, int n, void* dest);

822 static bool parse_string (const char* str, int n, void* dest);

823 static bool parse_stringpiece (const char* str, int n, void* dest);

824

825 #define DECLARE_INTEGER_PARSER(name) \

826 private: \

827 static bool parse_ ## name(const char* str, int n, void* dest); \

828 static bool parse_ ## name ## _radix( \

829 const char* str, int n, void* dest, int radix); \

830 public: \

831 static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \

832 static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \

833 static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)

834

835 DECLARE_INTEGER_PARSER(short);

836 DECLARE_INTEGER_PARSER(ushort);

837 DECLARE_INTEGER_PARSER(int);

838 DECLARE_INTEGER_PARSER(uint);

839 DECLARE_INTEGER_PARSER(long);

840 DECLARE_INTEGER_PARSER(ulong);

841 #if RE2_HAVE_LONGLONG

842 DECLARE_INTEGER_PARSER(longlong);

843 DECLARE_INTEGER_PARSER(ulonglong);

844 #endif

845

846 #undef DECLARE_INTEGER_PARSER

847 };

848

849 inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }

850 inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }

851

852 inline bool RE2::Arg::Parse(const char* str, int n) const {

853 return (*parser_)(str, n, arg_);

854 }

855

856 // This part of the parser, appropriate only for ints, deals with bases

857 #define MAKE_INTEGER_PARSER(type, name) \

858 inline RE2::Arg RE2::Hex(type* ptr) { \

859 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \

860 inline RE2::Arg RE2::Octal(type* ptr) { \

861 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \

862 inline RE2::Arg RE2::CRadix(type* ptr) { \

863 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }

864

865 MAKE_INTEGER_PARSER(short, short)

866 MAKE_INTEGER_PARSER(unsigned short, ushort)

867 MAKE_INTEGER_PARSER(int, int)

868 MAKE_INTEGER_PARSER(unsigned int, uint)

869 MAKE_INTEGER_PARSER(long, long)

870 MAKE_INTEGER_PARSER(unsigned long, ulong)

871 #if RE2_HAVE_LONGLONG

872 MAKE_INTEGER_PARSER(long long, longlong)

873 MAKE_INTEGER_PARSER(unsigned long long, ulonglong)

874 #endif

875

876 #undef MAKE_INTEGER_PARSER

877

878 } // namespace re2

879

880 using re2::RE2;

881

882 #endif /* RE2_RE2_H */

OLD	NEW

« no previous file with comments | « third_party/re2/re2/prog.cc ('k') | third_party/re2/re2/re2.cc » ('j') | no next file with comments »