Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(190)

Side by Side Diff: third_party/re2/re2/re2.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/re2.h ('k') | third_party/re2/re2/regexp.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9
10 #include "re2/re2.h"
11
12 #include <stdio.h>
13 #include <string>
14 #include <errno.h>
15 #include "util/util.h"
16 #include "util/flags.h"
17 #include "util/sparse_array.h"
18 #include "re2/prog.h"
19 #include "re2/regexp.h"
20
21 DEFINE_bool(trace_re2, false, "trace RE2 execution");
22
23 namespace re2 {
24
25 // Maximum number of args we can set
26 static const int kMaxArgs = 16;
27 static const int kVecSize = 1+kMaxArgs;
28
29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful lMatchN> RE2::FullMatch = {};
30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par tialMatchN> RE2::PartialMatch = {};
31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {};
32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo nsumeN> RE2::FindAndConsume = {};
33
34 // This will trigger LNK2005 error in MSVC.
35 #ifndef _MSC_VER
36 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
37 #endif
38
39 RE2::Options::Options(RE2::CannedOptions opt)
40 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
41 posix_syntax_(opt == RE2::POSIX),
42 longest_match_(opt == RE2::POSIX),
43 log_errors_(opt != RE2::Quiet),
44 max_mem_(kDefaultMaxMem),
45 literal_(false),
46 never_nl_(false),
47 dot_nl_(false),
48 never_capture_(false),
49 case_sensitive_(true),
50 perl_classes_(false),
51 word_boundary_(false),
52 one_line_(false) {
53 }
54
55 // static empty things for use as const references.
56 // To avoid global constructors, initialized on demand.
57 GLOBAL_MUTEX(empty_mutex);
58 static const string *empty_string;
59 static const map<string, int> *empty_named_groups;
60 static const map<int, string> *empty_group_names;
61
62 static void InitEmpty() {
63 GLOBAL_MUTEX_LOCK(empty_mutex);
64 if (empty_string == NULL) {
65 empty_string = new string;
66 empty_named_groups = new map<string, int>;
67 empty_group_names = new map<int, string>;
68 }
69 GLOBAL_MUTEX_UNLOCK(empty_mutex);
70 }
71
72 // Converts from Regexp error code to RE2 error code.
73 // Maybe some day they will diverge. In any event, this
74 // hides the existence of Regexp from RE2 users.
75 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
76 switch (code) {
77 case re2::kRegexpSuccess:
78 return RE2::NoError;
79 case re2::kRegexpInternalError:
80 return RE2::ErrorInternal;
81 case re2::kRegexpBadEscape:
82 return RE2::ErrorBadEscape;
83 case re2::kRegexpBadCharClass:
84 return RE2::ErrorBadCharClass;
85 case re2::kRegexpBadCharRange:
86 return RE2::ErrorBadCharRange;
87 case re2::kRegexpMissingBracket:
88 return RE2::ErrorMissingBracket;
89 case re2::kRegexpMissingParen:
90 return RE2::ErrorMissingParen;
91 case re2::kRegexpTrailingBackslash:
92 return RE2::ErrorTrailingBackslash;
93 case re2::kRegexpRepeatArgument:
94 return RE2::ErrorRepeatArgument;
95 case re2::kRegexpRepeatSize:
96 return RE2::ErrorRepeatSize;
97 case re2::kRegexpRepeatOp:
98 return RE2::ErrorRepeatOp;
99 case re2::kRegexpBadPerlOp:
100 return RE2::ErrorBadPerlOp;
101 case re2::kRegexpBadUTF8:
102 return RE2::ErrorBadUTF8;
103 case re2::kRegexpBadNamedCapture:
104 return RE2::ErrorBadNamedCapture;
105 }
106 return RE2::ErrorInternal;
107 }
108
109 static string trunc(const StringPiece& pattern) {
110 if (pattern.size() < 100)
111 return pattern.as_string();
112 return pattern.substr(0, 100).as_string() + "...";
113 }
114
115
116 RE2::RE2(const char* pattern) {
117 Init(pattern, DefaultOptions);
118 }
119
120 RE2::RE2(const string& pattern) {
121 Init(pattern, DefaultOptions);
122 }
123
124 RE2::RE2(const StringPiece& pattern) {
125 Init(pattern, DefaultOptions);
126 }
127
128 RE2::RE2(const StringPiece& pattern, const Options& options) {
129 Init(pattern, options);
130 }
131
132 int RE2::Options::ParseFlags() const {
133 int flags = Regexp::ClassNL;
134 switch (encoding()) {
135 default:
136 if (log_errors())
137 LOG(ERROR) << "Unknown encoding " << encoding();
138 break;
139 case RE2::Options::EncodingUTF8:
140 break;
141 case RE2::Options::EncodingLatin1:
142 flags |= Regexp::Latin1;
143 break;
144 }
145
146 if (!posix_syntax())
147 flags |= Regexp::LikePerl;
148
149 if (literal())
150 flags |= Regexp::Literal;
151
152 if (never_nl())
153 flags |= Regexp::NeverNL;
154
155 if (dot_nl())
156 flags |= Regexp::DotNL;
157
158 if (never_capture())
159 flags |= Regexp::NeverCapture;
160
161 if (!case_sensitive())
162 flags |= Regexp::FoldCase;
163
164 if (perl_classes())
165 flags |= Regexp::PerlClasses;
166
167 if (word_boundary())
168 flags |= Regexp::PerlB;
169
170 if (one_line())
171 flags |= Regexp::OneLine;
172
173 return flags;
174 }
175
176 void RE2::Init(const StringPiece& pattern, const Options& options) {
177 mutex_ = new Mutex;
178 pattern_ = pattern.as_string();
179 options_.Copy(options);
180 InitEmpty();
181 error_ = empty_string;
182 error_code_ = NoError;
183 suffix_regexp_ = NULL;
184 entire_regexp_ = NULL;
185 prog_ = NULL;
186 rprog_ = NULL;
187 named_groups_ = NULL;
188 group_names_ = NULL;
189 num_captures_ = -1;
190
191 RegexpStatus status;
192 entire_regexp_ = Regexp::Parse(
193 pattern_,
194 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
195 &status);
196 if (entire_regexp_ == NULL) {
197 if (error_ == empty_string)
198 error_ = new string(status.Text());
199 if (options_.log_errors()) {
200 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
201 << status.Text();
202 }
203 error_arg_ = status.error_arg().as_string();
204 error_code_ = RegexpErrorToRE2(status.code());
205 return;
206 }
207
208 prefix_.clear();
209 prefix_foldcase_ = false;
210 re2::Regexp* suffix;
211 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
212 suffix_regexp_ = suffix;
213 else
214 suffix_regexp_ = entire_regexp_->Incref();
215
216 // Two thirds of the memory goes to the forward Prog,
217 // one third to the reverse prog, because the forward
218 // Prog has two DFAs but the reverse prog has one.
219 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
220 if (prog_ == NULL) {
221 if (options_.log_errors())
222 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
223 error_ = new string("pattern too large - compile failed");
224 error_code_ = RE2::ErrorPatternTooLarge;
225 return;
226 }
227
228 // Could delay this until the first match call that
229 // cares about submatch information, but the one-pass
230 // machine's memory gets cut from the DFA memory budget,
231 // and that is harder to do if the DFA has already
232 // been built.
233 is_one_pass_ = prog_->IsOnePass();
234 }
235
236 // Returns rprog_, computing it if needed.
237 re2::Prog* RE2::ReverseProg() const {
238 MutexLock l(mutex_);
239 if (rprog_ == NULL && error_ == empty_string) {
240 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
241 if (rprog_ == NULL) {
242 if (options_.log_errors())
243 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
244 error_ = new string("pattern too large - reverse compile failed");
245 error_code_ = RE2::ErrorPatternTooLarge;
246 return NULL;
247 }
248 }
249 return rprog_;
250 }
251
252 RE2::~RE2() {
253 if (suffix_regexp_)
254 suffix_regexp_->Decref();
255 if (entire_regexp_)
256 entire_regexp_->Decref();
257 delete mutex_;
258 delete prog_;
259 delete rprog_;
260 if (error_ != empty_string)
261 delete error_;
262 if (named_groups_ != NULL && named_groups_ != empty_named_groups)
263 delete named_groups_;
264 if (group_names_ != NULL && group_names_ != empty_group_names)
265 delete group_names_;
266 }
267
268 int RE2::ProgramSize() const {
269 if (prog_ == NULL)
270 return -1;
271 return prog_->size();
272 }
273
274 int RE2::ProgramFanout(map<int, int>* histogram) const {
275 if (prog_ == NULL)
276 return -1;
277 SparseArray<int> fanout(prog_->size());
278 prog_->Fanout(&fanout);
279 histogram->clear();
280 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
281 // TODO(junyer): Optimise this?
282 int bucket = 0;
283 while (1 << bucket < i->second) {
284 bucket++;
285 }
286 (*histogram)[bucket]++;
287 }
288 return histogram->rbegin()->first;
289 }
290
291 // Returns num_captures_, computing it if needed, or -1 if the
292 // regexp wasn't valid on construction.
293 int RE2::NumberOfCapturingGroups() const {
294 MutexLock l(mutex_);
295 if (suffix_regexp_ == NULL)
296 return -1;
297 if (num_captures_ == -1)
298 num_captures_ = suffix_regexp_->NumCaptures();
299 return num_captures_;
300 }
301
302 // Returns named_groups_, computing it if needed.
303 const map<string, int>& RE2::NamedCapturingGroups() const {
304 MutexLock l(mutex_);
305 if (!ok())
306 return *empty_named_groups;
307 if (named_groups_ == NULL) {
308 named_groups_ = suffix_regexp_->NamedCaptures();
309 if (named_groups_ == NULL)
310 named_groups_ = empty_named_groups;
311 }
312 return *named_groups_;
313 }
314
315 // Returns group_names_, computing it if needed.
316 const map<int, string>& RE2::CapturingGroupNames() const {
317 MutexLock l(mutex_);
318 if (!ok())
319 return *empty_group_names;
320 if (group_names_ == NULL) {
321 group_names_ = suffix_regexp_->CaptureNames();
322 if (group_names_ == NULL)
323 group_names_ = empty_group_names;
324 }
325 return *group_names_;
326 }
327
328 /***** Convenience interfaces *****/
329
330 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
331 const Arg* const args[], int n) {
332 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
333 }
334
335 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
336 const Arg* const args[], int n) {
337 return re.DoMatch(text, UNANCHORED, NULL, args, n);
338 }
339
340 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
341 const Arg* const args[], int n) {
342 int consumed;
343 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
344 input->remove_prefix(consumed);
345 return true;
346 } else {
347 return false;
348 }
349 }
350
351 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
352 const Arg* const args[], int n) {
353 int consumed;
354 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
355 input->remove_prefix(consumed);
356 return true;
357 } else {
358 return false;
359 }
360 }
361
362 // Returns the maximum submatch needed for the rewrite to be done by Replace().
363 // E.g. if rewrite == "foo \\2,\\1", returns 2.
364 int RE2::MaxSubmatch(const StringPiece& rewrite) {
365 int max = 0;
366 for (const char *s = rewrite.data(), *end = s + rewrite.size();
367 s < end; s++) {
368 if (*s == '\\') {
369 s++;
370 int c = (s < end) ? *s : -1;
371 if (isdigit(c)) {
372 int n = (c - '0');
373 if (n > max)
374 max = n;
375 }
376 }
377 }
378 return max;
379 }
380
381 bool RE2::Replace(string *str,
382 const RE2& re,
383 const StringPiece& rewrite) {
384 StringPiece vec[kVecSize];
385 int nvec = 1 + MaxSubmatch(rewrite);
386 if (nvec > arraysize(vec))
387 return false;
388 if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec))
389 return false;
390
391 string s;
392 if (!re.Rewrite(&s, rewrite, vec, nvec))
393 return false;
394
395 assert(vec[0].begin() >= str->data());
396 assert(vec[0].end() <= str->data()+str->size());
397 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
398 return true;
399 }
400
401 int RE2::GlobalReplace(string *str,
402 const RE2& re,
403 const StringPiece& rewrite) {
404 StringPiece vec[kVecSize];
405 int nvec = 1 + MaxSubmatch(rewrite);
406 if (nvec > arraysize(vec))
407 return false;
408
409 const char* p = str->data();
410 const char* ep = p + str->size();
411 const char* lastend = NULL;
412 string out;
413 int count = 0;
414 while (p <= ep) {
415 if (!re.Match(*str, static_cast<int>(p - str->data()),
416 static_cast<int>(str->size()), UNANCHORED, vec, nvec))
417 break;
418 if (p < vec[0].begin())
419 out.append(p, vec[0].begin() - p);
420 if (vec[0].begin() == lastend && vec[0].size() == 0) {
421 // Disallow empty match at end of last match: skip ahead.
422 if (p < ep)
423 out.append(p, 1);
424 p++;
425 continue;
426 }
427 re.Rewrite(&out, rewrite, vec, nvec);
428 p = vec[0].end();
429 lastend = p;
430 count++;
431 }
432
433 if (count == 0)
434 return 0;
435
436 if (p < ep)
437 out.append(p, ep - p);
438 swap(out, *str);
439 return count;
440 }
441
442 bool RE2::Extract(const StringPiece &text,
443 const RE2& re,
444 const StringPiece &rewrite,
445 string *out) {
446 StringPiece vec[kVecSize];
447 int nvec = 1 + MaxSubmatch(rewrite);
448 if (nvec > arraysize(vec))
449 return false;
450
451 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
452 return false;
453
454 out->clear();
455 return re.Rewrite(out, rewrite, vec, nvec);
456 }
457
458 string RE2::QuoteMeta(const StringPiece& unquoted) {
459 string result;
460 result.reserve(unquoted.size() << 1);
461
462 // Escape any ascii character not in [A-Za-z_0-9].
463 //
464 // Note that it's legal to escape a character even if it has no
465 // special meaning in a regular expression -- so this function does
466 // that. (This also makes it identical to the perl function of the
467 // same name except for the null-character special case;
468 // see `perldoc -f quotemeta`.)
469 for (int ii = 0; ii < unquoted.length(); ++ii) {
470 // Note that using 'isalnum' here raises the benchmark time from
471 // 32ns to 58ns:
472 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
473 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
474 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
475 unquoted[ii] != '_' &&
476 // If this is the part of a UTF8 or Latin1 character, we need
477 // to copy this byte without escaping. Experimentally this is
478 // what works correctly with the regexp library.
479 !(unquoted[ii] & 128)) {
480 if (unquoted[ii] == '\0') { // Special handling for null chars.
481 // Note that this special handling is not strictly required for RE2,
482 // but this quoting is required for other regexp libraries such as
483 // PCRE.
484 // Can't use "\\0" since the next character might be a digit.
485 result += "\\x00";
486 continue;
487 }
488 result += '\\';
489 }
490 result += unquoted[ii];
491 }
492
493 return result;
494 }
495
496 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
497 if (prog_ == NULL)
498 return false;
499
500 int n = static_cast<int>(prefix_.size());
501 if (n > maxlen)
502 n = maxlen;
503
504 // Determine initial min max from prefix_ literal.
505 string pmin, pmax;
506 pmin = prefix_.substr(0, n);
507 pmax = prefix_.substr(0, n);
508 if (prefix_foldcase_) {
509 // prefix is ASCII lowercase; change pmin to uppercase.
510 for (int i = 0; i < n; i++) {
511 if ('a' <= pmin[i] && pmin[i] <= 'z')
512 pmin[i] += 'A' - 'a';
513 }
514 }
515
516 // Add to prefix min max using PossibleMatchRange on regexp.
517 string dmin, dmax;
518 maxlen -= n;
519 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
520 pmin += dmin;
521 pmax += dmax;
522 } else if (pmax.size() > 0) {
523 // prog_->PossibleMatchRange has failed us,
524 // but we still have useful information from prefix_.
525 // Round up pmax to allow any possible suffix.
526 pmax = PrefixSuccessor(pmax);
527 } else {
528 // Nothing useful.
529 *min = "";
530 *max = "";
531 return false;
532 }
533
534 *min = pmin;
535 *max = pmax;
536 return true;
537 }
538
539 // Avoid possible locale nonsense in standard strcasecmp.
540 // The string a is known to be all lowercase.
541 static int ascii_strcasecmp(const char* a, const char* b, int len) {
542 const char *ae = a + len;
543
544 for (; a < ae; a++, b++) {
545 uint8 x = *a;
546 uint8 y = *b;
547 if ('A' <= y && y <= 'Z')
548 y += 'a' - 'A';
549 if (x != y)
550 return x - y;
551 }
552 return 0;
553 }
554
555
556 /***** Actual matching and rewriting code *****/
557
558 bool RE2::Match(const StringPiece& text,
559 int startpos,
560 int endpos,
561 Anchor re_anchor,
562 StringPiece* submatch,
563 int nsubmatch) const {
564 if (!ok() || suffix_regexp_ == NULL) {
565 if (options_.log_errors())
566 LOG(ERROR) << "Invalid RE2: " << *error_;
567 return false;
568 }
569
570 if (startpos < 0 || startpos > endpos || endpos > text.size()) {
571 if (options_.log_errors())
572 LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
573 << "startpos: " << startpos << ", "
574 << "endpos: " << endpos << ", "
575 << "text size: " << text.size() << "]";
576 return false;
577 }
578
579 StringPiece subtext = text;
580 subtext.remove_prefix(startpos);
581 subtext.remove_suffix(text.size() - endpos);
582
583 // Use DFAs to find exact location of match, filter out non-matches.
584
585 // Don't ask for the location if we won't use it.
586 // SearchDFA can do extra optimizations in that case.
587 StringPiece match;
588 StringPiece* matchp = &match;
589 if (nsubmatch == 0)
590 matchp = NULL;
591
592 int ncap = 1 + NumberOfCapturingGroups();
593 if (ncap > nsubmatch)
594 ncap = nsubmatch;
595
596 // If the regexp is anchored explicitly, must not be in middle of text.
597 if (prog_->anchor_start() && startpos != 0)
598 return false;
599
600 // If the regexp is anchored explicitly, update re_anchor
601 // so that we can potentially fall into a faster case below.
602 if (prog_->anchor_start() && prog_->anchor_end())
603 re_anchor = ANCHOR_BOTH;
604 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
605 re_anchor = ANCHOR_START;
606
607 // Check for the required prefix, if any.
608 int prefixlen = 0;
609 if (!prefix_.empty()) {
610 if (startpos != 0)
611 return false;
612 prefixlen = static_cast<int>(prefix_.size());
613 if (prefixlen > subtext.size())
614 return false;
615 if (prefix_foldcase_) {
616 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
617 return false;
618 } else {
619 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
620 return false;
621 }
622 subtext.remove_prefix(prefixlen);
623 // If there is a required prefix, the anchor must be at least ANCHOR_START.
624 if (re_anchor != ANCHOR_BOTH)
625 re_anchor = ANCHOR_START;
626 }
627
628 Prog::Anchor anchor = Prog::kUnanchored;
629 Prog::MatchKind kind = Prog::kFirstMatch;
630 if (options_.longest_match())
631 kind = Prog::kLongestMatch;
632 bool skipped_test = false;
633
634 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
635
636 // SearchBitState allocates a bit vector of size prog_->size() * text.size().
637 // It also allocates a stack of 3-word structures which could potentially
638 // grow as large as prog_->size() * text.size() but in practice is much
639 // smaller.
640 // Conditions for using SearchBitState:
641 const int MaxBitStateProg = 500; // prog_->size() <= Max.
642 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
643 bool can_bit_state = prog_->size() <= MaxBitStateProg;
644 int bit_state_text_max = MaxBitStateVector / prog_->size();
645
646 bool dfa_failed = false;
647 switch (re_anchor) {
648 default:
649 case UNANCHORED: {
650 if (!prog_->SearchDFA(subtext, text, anchor, kind,
651 matchp, &dfa_failed, NULL)) {
652 if (dfa_failed) {
653 // Fall back to NFA below.
654 skipped_test = true;
655 if (FLAGS_trace_re2)
656 LOG(INFO) << "Match " << trunc(pattern_)
657 << " [" << CEscape(subtext) << "]"
658 << " DFA failed.";
659 break;
660 }
661 if (FLAGS_trace_re2)
662 LOG(INFO) << "Match " << trunc(pattern_)
663 << " [" << CEscape(subtext) << "]"
664 << " used DFA - no match.";
665 return false;
666 }
667 if (FLAGS_trace_re2)
668 LOG(INFO) << "Match " << trunc(pattern_)
669 << " [" << CEscape(subtext) << "]"
670 << " used DFA - match";
671 if (matchp == NULL) // Matched. Don't care where
672 return true;
673 // SearchDFA set match[0].end() but didn't know where the
674 // match started. Run the regexp backward from match[0].end()
675 // to find the longest possible match -- that's where it started.
676 Prog* prog = ReverseProg();
677 if (prog == NULL)
678 return false;
679 if (!prog->SearchDFA(match, text, Prog::kAnchored,
680 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
681 if (dfa_failed) {
682 // Fall back to NFA below.
683 skipped_test = true;
684 if (FLAGS_trace_re2)
685 LOG(INFO) << "Match " << trunc(pattern_)
686 << " [" << CEscape(subtext) << "]"
687 << " reverse DFA failed.";
688 break;
689 }
690 if (FLAGS_trace_re2)
691 LOG(INFO) << "Match " << trunc(pattern_)
692 << " [" << CEscape(subtext) << "]"
693 << " DFA inconsistency.";
694 if (options_.log_errors())
695 LOG(ERROR) << "DFA inconsistency";
696 return false;
697 }
698 if (FLAGS_trace_re2)
699 LOG(INFO) << "Match " << trunc(pattern_)
700 << " [" << CEscape(subtext) << "]"
701 << " used reverse DFA.";
702 break;
703 }
704
705 case ANCHOR_BOTH:
706 case ANCHOR_START:
707 if (re_anchor == ANCHOR_BOTH)
708 kind = Prog::kFullMatch;
709 anchor = Prog::kAnchored;
710
711 // If only a small amount of text and need submatch
712 // information anyway and we're going to use OnePass or BitState
713 // to get it, we might as well not even bother with the DFA:
714 // OnePass or BitState will be fast enough.
715 // On tiny texts, OnePass outruns even the DFA, and
716 // it doesn't have the shared state and occasional mutex that
717 // the DFA does.
718 if (can_one_pass && text.size() <= 4096 &&
719 (ncap > 1 || text.size() <= 8)) {
720 if (FLAGS_trace_re2)
721 LOG(INFO) << "Match " << trunc(pattern_)
722 << " [" << CEscape(subtext) << "]"
723 << " skipping DFA for OnePass.";
724 skipped_test = true;
725 break;
726 }
727 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
728 if (FLAGS_trace_re2)
729 LOG(INFO) << "Match " << trunc(pattern_)
730 << " [" << CEscape(subtext) << "]"
731 << " skipping DFA for BitState.";
732 skipped_test = true;
733 break;
734 }
735 if (!prog_->SearchDFA(subtext, text, anchor, kind,
736 &match, &dfa_failed, NULL)) {
737 if (dfa_failed) {
738 if (FLAGS_trace_re2)
739 LOG(INFO) << "Match " << trunc(pattern_)
740 << " [" << CEscape(subtext) << "]"
741 << " DFA failed.";
742 skipped_test = true;
743 break;
744 }
745 if (FLAGS_trace_re2)
746 LOG(INFO) << "Match " << trunc(pattern_)
747 << " [" << CEscape(subtext) << "]"
748 << " used DFA - no match.";
749 return false;
750 }
751 break;
752 }
753
754 if (!skipped_test && ncap <= 1) {
755 // We know exactly where it matches. That's enough.
756 if (ncap == 1)
757 submatch[0] = match;
758 } else {
759 StringPiece subtext1;
760 if (skipped_test) {
761 // DFA ran out of memory or was skipped:
762 // need to search in entire original text.
763 subtext1 = subtext;
764 } else {
765 // DFA found the exact match location:
766 // let NFA run an anchored, full match search
767 // to find submatch locations.
768 subtext1 = match;
769 anchor = Prog::kAnchored;
770 kind = Prog::kFullMatch;
771 }
772
773 if (can_one_pass && anchor != Prog::kUnanchored) {
774 if (FLAGS_trace_re2)
775 LOG(INFO) << "Match " << trunc(pattern_)
776 << " [" << CEscape(subtext) << "]"
777 << " using OnePass.";
778 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
779 if (!skipped_test && options_.log_errors())
780 LOG(ERROR) << "SearchOnePass inconsistency";
781 return false;
782 }
783 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
784 if (FLAGS_trace_re2)
785 LOG(INFO) << "Match " << trunc(pattern_)
786 << " [" << CEscape(subtext) << "]"
787 << " using BitState.";
788 if (!prog_->SearchBitState(subtext1, text, anchor,
789 kind, submatch, ncap)) {
790 if (!skipped_test && options_.log_errors())
791 LOG(ERROR) << "SearchBitState inconsistency";
792 return false;
793 }
794 } else {
795 if (FLAGS_trace_re2)
796 LOG(INFO) << "Match " << trunc(pattern_)
797 << " [" << CEscape(subtext) << "]"
798 << " using NFA.";
799 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
800 if (!skipped_test && options_.log_errors())
801 LOG(ERROR) << "SearchNFA inconsistency";
802 return false;
803 }
804 }
805 }
806
807 // Adjust overall match for required prefix that we stripped off.
808 if (prefixlen > 0 && nsubmatch > 0)
809 submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
810 submatch[0].size() + prefixlen);
811
812 // Zero submatches that don't exist in the regexp.
813 for (int i = ncap; i < nsubmatch; i++)
814 submatch[i] = NULL;
815 return true;
816 }
817
818 // Internal matcher - like Match() but takes Args not StringPieces.
819 bool RE2::DoMatch(const StringPiece& text,
820 Anchor anchor,
821 int* consumed,
822 const Arg* const* args,
823 int n) const {
824 if (!ok()) {
825 if (options_.log_errors())
826 LOG(ERROR) << "Invalid RE2: " << *error_;
827 return false;
828 }
829
830 // Count number of capture groups needed.
831 int nvec;
832 if (n == 0 && consumed == NULL)
833 nvec = 0;
834 else
835 nvec = n+1;
836
837 StringPiece* vec;
838 StringPiece stkvec[kVecSize];
839 StringPiece* heapvec = NULL;
840
841 if (nvec <= arraysize(stkvec)) {
842 vec = stkvec;
843 } else {
844 vec = new StringPiece[nvec];
845 heapvec = vec;
846 }
847
848 if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
849 delete[] heapvec;
850 return false;
851 }
852
853 if (consumed != NULL)
854 *consumed = static_cast<int>(vec[0].end() - text.begin());
855
856 if (n == 0 || args == NULL) {
857 // We are not interested in results
858 delete[] heapvec;
859 return true;
860 }
861
862 int ncap = NumberOfCapturingGroups();
863 if (ncap < n) {
864 // RE has fewer capturing groups than number of arg pointers passed in
865 VLOG(1) << "Asked for " << n << " but only have " << ncap;
866 delete[] heapvec;
867 return false;
868 }
869
870 // If we got here, we must have matched the whole pattern.
871 for (int i = 0; i < n; i++) {
872 const StringPiece& s = vec[i+1];
873 if (!args[i]->Parse(s.data(), s.size())) {
874 // TODO: Should we indicate what the error was?
875 VLOG(1) << "Parse error on #" << i << " " << s << " "
876 << (void*)s.data() << "/" << s.size();
877 delete[] heapvec;
878 return false;
879 }
880 }
881
882 delete[] heapvec;
883 return true;
884 }
885
886 // Append the "rewrite" string, with backslash subsitutions from "vec",
887 // to string "out".
888 bool RE2::Rewrite(string *out, const StringPiece &rewrite,
889 const StringPiece *vec, int veclen) const {
890 for (const char *s = rewrite.data(), *end = s + rewrite.size();
891 s < end; s++) {
892 if (*s != '\\') {
893 out->push_back(*s);
894 continue;
895 }
896 s++;
897 int c = (s < end) ? *s : -1;
898 if (isdigit(c)) {
899 int n = (c - '0');
900 if (n >= veclen) {
901 if (options_.log_errors()) {
902 LOG(ERROR) << "requested group " << n
903 << " in regexp " << rewrite.data();
904 }
905 return false;
906 }
907 StringPiece snip = vec[n];
908 if (snip.size() > 0)
909 out->append(snip.data(), snip.size());
910 } else if (c == '\\') {
911 out->push_back('\\');
912 } else {
913 if (options_.log_errors())
914 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
915 return false;
916 }
917 }
918 return true;
919 }
920
921 // Checks that the rewrite string is well-formed with respect to this
922 // regular expression.
923 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
924 int max_token = -1;
925 for (const char *s = rewrite.data(), *end = s + rewrite.size();
926 s < end; s++) {
927 int c = *s;
928 if (c != '\\') {
929 continue;
930 }
931 if (++s == end) {
932 *error = "Rewrite schema error: '\\' not allowed at end.";
933 return false;
934 }
935 c = *s;
936 if (c == '\\') {
937 continue;
938 }
939 if (!isdigit(c)) {
940 *error = "Rewrite schema error: "
941 "'\\' must be followed by a digit or '\\'.";
942 return false;
943 }
944 int n = (c - '0');
945 if (max_token < n) {
946 max_token = n;
947 }
948 }
949
950 if (max_token > NumberOfCapturingGroups()) {
951 SStringPrintf(error, "Rewrite schema requests %d matches, "
952 "but the regexp only has %d parenthesized subexpressions.",
953 max_token, NumberOfCapturingGroups());
954 return false;
955 }
956 return true;
957 }
958
959 /***** Parsers for various types *****/
960
961 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
962 // We fail if somebody asked us to store into a non-NULL void* pointer
963 return (dest == NULL);
964 }
965
966 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
967 if (dest == NULL) return true;
968 reinterpret_cast<string*>(dest)->assign(str, n);
969 return true;
970 }
971
972 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
973 if (dest == NULL) return true;
974 reinterpret_cast<StringPiece*>(dest)->set(str, n);
975 return true;
976 }
977
978 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
979 if (n != 1) return false;
980 if (dest == NULL) return true;
981 *(reinterpret_cast<char*>(dest)) = str[0];
982 return true;
983 }
984
985 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
986 if (n != 1) return false;
987 if (dest == NULL) return true;
988 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
989 return true;
990 }
991
992 // Largest number spec that we are willing to parse
993 static const int kMaxNumberLength = 32;
994
995 // REQUIRES "buf" must have length at least nbuf.
996 // Copies "str" into "buf" and null-terminates.
997 // Overwrites *np with the new length.
998 static const char* TerminateNumber(char* buf, int nbuf, const char* str, int* np ,
999 bool accept_spaces) {
1000 int n = *np;
1001 if (n <= 0) return "";
1002 if (n > 0 && isspace(*str)) {
1003 // We are less forgiving than the strtoxxx() routines and do not
1004 // allow leading spaces. We do allow leading spaces for floats.
1005 if (!accept_spaces) {
1006 return "";
1007 }
1008 while (n > 0 && isspace(*str)) {
1009 n--;
1010 str++;
1011 }
1012 }
1013
1014 // Although buf has a fixed maximum size, we can still handle
1015 // arbitrarily large integers correctly by omitting leading zeros.
1016 // (Numbers that are still too long will be out of range.)
1017 // Before deciding whether str is too long,
1018 // remove leading zeros with s/000+/00/.
1019 // Leaving the leading two zeros in place means that
1020 // we don't change 0000x123 (invalid) into 0x123 (valid).
1021 // Skip over leading - before replacing.
1022 bool neg = false;
1023 if (n >= 1 && str[0] == '-') {
1024 neg = true;
1025 n--;
1026 str++;
1027 }
1028
1029 if (n >= 3 && str[0] == '0' && str[1] == '0') {
1030 while (n >= 3 && str[2] == '0') {
1031 n--;
1032 str++;
1033 }
1034 }
1035
1036 if (neg) { // make room in buf for -
1037 n++;
1038 str--;
1039 }
1040
1041 if (n > nbuf-1) return "";
1042
1043 memmove(buf, str, n);
1044 if (neg) {
1045 buf[0] = '-';
1046 }
1047 buf[n] = '\0';
1048 *np = n;
1049 return buf;
1050 }
1051
1052 bool RE2::Arg::parse_long_radix(const char* str,
1053 int n,
1054 void* dest,
1055 int radix) {
1056 if (n == 0) return false;
1057 char buf[kMaxNumberLength+1];
1058 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1059 char* end;
1060 errno = 0;
1061 long r = strtol(str, &end, radix);
1062 if (end != str + n) return false; // Leftover junk
1063 if (errno) return false;
1064 if (dest == NULL) return true;
1065 *(reinterpret_cast<long*>(dest)) = r;
1066 return true;
1067 }
1068
1069 bool RE2::Arg::parse_ulong_radix(const char* str,
1070 int n,
1071 void* dest,
1072 int radix) {
1073 if (n == 0) return false;
1074 char buf[kMaxNumberLength+1];
1075 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1076 if (str[0] == '-') {
1077 // strtoul() will silently accept negative numbers and parse
1078 // them. This module is more strict and treats them as errors.
1079 return false;
1080 }
1081
1082 char* end;
1083 errno = 0;
1084 unsigned long r = strtoul(str, &end, radix);
1085 if (end != str + n) return false; // Leftover junk
1086 if (errno) return false;
1087 if (dest == NULL) return true;
1088 *(reinterpret_cast<unsigned long*>(dest)) = r;
1089 return true;
1090 }
1091
1092 bool RE2::Arg::parse_short_radix(const char* str,
1093 int n,
1094 void* dest,
1095 int radix) {
1096 long r;
1097 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1098 if ((short)r != r) return false; // Out of range
1099 if (dest == NULL) return true;
1100 *(reinterpret_cast<short*>(dest)) = (short)r;
1101 return true;
1102 }
1103
1104 bool RE2::Arg::parse_ushort_radix(const char* str,
1105 int n,
1106 void* dest,
1107 int radix) {
1108 unsigned long r;
1109 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1110 if ((ushort)r != r) return false; // Out of range
1111 if (dest == NULL) return true;
1112 *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r;
1113 return true;
1114 }
1115
1116 bool RE2::Arg::parse_int_radix(const char* str,
1117 int n,
1118 void* dest,
1119 int radix) {
1120 long r;
1121 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1122 if ((int)r != r) return false; // Out of range
1123 if (dest == NULL) return true;
1124 *(reinterpret_cast<int*>(dest)) = r;
1125 return true;
1126 }
1127
1128 bool RE2::Arg::parse_uint_radix(const char* str,
1129 int n,
1130 void* dest,
1131 int radix) {
1132 unsigned long r;
1133 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1134 if ((uint)r != r) return false; // Out of range
1135 if (dest == NULL) return true;
1136 *(reinterpret_cast<unsigned int*>(dest)) = r;
1137 return true;
1138 }
1139
1140 #if RE2_HAVE_LONGLONG
1141 bool RE2::Arg::parse_longlong_radix(const char* str,
1142 int n,
1143 void* dest,
1144 int radix) {
1145 if (n == 0) return false;
1146 char buf[kMaxNumberLength+1];
1147 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1148 char* end;
1149 errno = 0;
1150 int64 r = strtoll(str, &end, radix);
1151 if (end != str + n) return false; // Leftover junk
1152 if (errno) return false;
1153 if (dest == NULL) return true;
1154 *(reinterpret_cast<int64*>(dest)) = r;
1155 return true;
1156 }
1157
1158 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1159 int n,
1160 void* dest,
1161 int radix) {
1162 if (n == 0) return false;
1163 char buf[kMaxNumberLength+1];
1164 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1165 if (str[0] == '-') {
1166 // strtoull() will silently accept negative numbers and parse
1167 // them. This module is more strict and treats them as errors.
1168 return false;
1169 }
1170 char* end;
1171 errno = 0;
1172 uint64 r = strtoull(str, &end, radix);
1173 if (end != str + n) return false; // Leftover junk
1174 if (errno) return false;
1175 if (dest == NULL) return true;
1176 *(reinterpret_cast<uint64*>(dest)) = r;
1177 return true;
1178 }
1179 #endif
1180
1181 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
1182 if (n == 0) return false;
1183 static const int kMaxLength = 200;
1184 char buf[kMaxLength+1];
1185 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1186 char* end;
1187 errno = 0;
1188 double r;
1189 if (isfloat) {
1190 r = strtof(str, &end);
1191 } else {
1192 r = strtod(str, &end);
1193 }
1194 if (end != str + n) return false; // Leftover junk
1195 if (errno) return false;
1196 if (dest == NULL) return true;
1197 if (isfloat) {
1198 *(reinterpret_cast<float*>(dest)) = (float)r;
1199 } else {
1200 *(reinterpret_cast<double*>(dest)) = r;
1201 }
1202 return true;
1203 }
1204
1205 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1206 return parse_double_float(str, n, false, dest);
1207 }
1208
1209 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1210 return parse_double_float(str, n, true, dest);
1211 }
1212
1213
1214 #define DEFINE_INTEGER_PARSERS(name) \
1215 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \
1216 return parse_##name##_radix(str, n, dest, 10); \
1217 } \
1218 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
1219 return parse_##name##_radix(str, n, dest, 16); \
1220 } \
1221 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
1222 return parse_##name##_radix(str, n, dest, 8); \
1223 } \
1224 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1225 return parse_##name##_radix(str, n, dest, 0); \
1226 }
1227
1228 DEFINE_INTEGER_PARSERS(short);
1229 DEFINE_INTEGER_PARSERS(ushort);
1230 DEFINE_INTEGER_PARSERS(int);
1231 DEFINE_INTEGER_PARSERS(uint);
1232 DEFINE_INTEGER_PARSERS(long);
1233 DEFINE_INTEGER_PARSERS(ulong);
1234 DEFINE_INTEGER_PARSERS(longlong);
1235 DEFINE_INTEGER_PARSERS(ulonglong);
1236
1237 #undef DEFINE_INTEGER_PARSERS
1238
1239 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/re2.h ('k') | third_party/re2/re2/regexp.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698