Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: third_party/re2/re2/re2.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Removed valgrind specific code that is in Chromium already Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9
10 #include "re2/re2.h"
11
12 #include <stdio.h>
13 #include <string>
14 #ifdef WIN32
15 #define strtoll _strtoi64
16 #define strtoull _strtoui64
17 #define strtof strtod
18 #else
19 #include <pthread.h>
20 #endif
21 #include <errno.h>
22 #include "util/util.h"
23 #include "util/flags.h"
24 #include "re2/prog.h"
25 #include "re2/regexp.h"
26
27 DEFINE_bool(trace_re2, false, "trace RE2 execution");
28
29 namespace re2 {
30
31 // Maximum number of args we can set
32 static const int kMaxArgs = 16;
33 static const int kVecSize = 1+kMaxArgs;
34
35 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful lMatchN> RE2::FullMatch;
36 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par tialMatchN> RE2::PartialMatch;
37 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
38 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo nsumeN> RE2::FindAndConsume;
39
40 // Commonly-used option sets; arguments to constructor are:
41 // utf8 input
42 // posix syntax
43 // longest match
44 // log errors
45 const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true
46 const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true) ;
47 //const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);
48 const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);
49
50 // If a regular expression has no error, its error_ field points here
51 static const string empty_string;
52
53 // Converts from Regexp error code to RE2 error code.
54 // Maybe some day they will diverge. In any event, this
55 // hides the existence of Regexp from RE2 users.
56 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
57 switch (code) {
58 case re2::kRegexpSuccess:
59 return RE2::NoError;
60 case re2::kRegexpInternalError:
61 return RE2::ErrorInternal;
62 case re2::kRegexpBadEscape:
63 return RE2::ErrorBadEscape;
64 case re2::kRegexpBadCharClass:
65 return RE2::ErrorBadCharClass;
66 case re2::kRegexpBadCharRange:
67 return RE2::ErrorBadCharRange;
68 case re2::kRegexpMissingBracket:
69 return RE2::ErrorMissingBracket;
70 case re2::kRegexpMissingParen:
71 return RE2::ErrorMissingParen;
72 case re2::kRegexpTrailingBackslash:
73 return RE2::ErrorTrailingBackslash;
74 case re2::kRegexpRepeatArgument:
75 return RE2::ErrorRepeatArgument;
76 case re2::kRegexpRepeatSize:
77 return RE2::ErrorRepeatSize;
78 case re2::kRegexpRepeatOp:
79 return RE2::ErrorRepeatOp;
80 case re2::kRegexpBadPerlOp:
81 return RE2::ErrorBadPerlOp;
82 case re2::kRegexpBadUTF8:
83 return RE2::ErrorBadUTF8;
84 case re2::kRegexpBadNamedCapture:
85 return RE2::ErrorBadNamedCapture;
86 }
87 return RE2::ErrorInternal;
88 }
89
90 static string trunc(const StringPiece& pattern) {
91 if (pattern.size() < 100)
92 return pattern.as_string();
93 return pattern.substr(0, 100).as_string() + "...";
94 }
95
96
97 RE2::RE2(const char* pattern) {
98 Init(pattern, DefaultOptions);
99 }
100
101 RE2::RE2(const string& pattern) {
102 Init(pattern, DefaultOptions);
103 }
104
105 RE2::RE2(const StringPiece& pattern) {
106 Init(pattern, DefaultOptions);
107 }
108
109 RE2::RE2(const StringPiece& pattern, const Options& options) {
110 Init(pattern, options);
111 }
112
113 int RE2::Options::ParseFlags() const {
114 int flags = Regexp::ClassNL;
115 switch (encoding()) {
116 default:
117 LOG(ERROR) << "Unknown encoding " << encoding();
118 break;
119 case RE2::Options::EncodingUTF8:
120 break;
121 case RE2::Options::EncodingLatin1:
122 flags |= Regexp::Latin1;
123 break;
124 }
125
126 if (!posix_syntax())
127 flags |= Regexp::LikePerl;
128
129 if (literal())
130 flags |= Regexp::Literal;
131
132 if (never_nl())
133 flags |= Regexp::NeverNL;
134
135 if (!case_sensitive())
136 flags |= Regexp::FoldCase;
137
138 if (perl_classes())
139 flags |= Regexp::PerlClasses;
140
141 if (word_boundary())
142 flags |= Regexp::PerlB;
143
144 if (one_line())
145 flags |= Regexp::OneLine;
146
147 return flags;
148 }
149
150 void RE2::Init(const StringPiece& pattern, const Options& options) {
151 mutex_ = new Mutex;
152 pattern_ = pattern.as_string();
153 options_.Copy(options);
154 error_ = &empty_string;
155 error_code_ = NoError;
156 suffix_regexp_ = NULL;
157 entire_regexp_ = NULL;
158 prog_ = NULL;
159 rprog_ = NULL;
160 named_groups_ = NULL;
161 group_names_ = NULL;
162 num_captures_ = -1;
163
164 RegexpStatus status;
165 entire_regexp_ = Regexp::Parse(
166 pattern_,
167 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
168 &status);
169 if (entire_regexp_ == NULL) {
170 if (error_ == &empty_string)
171 error_ = new string(status.Text());
172 if (options_.log_errors()) {
173 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
174 << status.Text();
175 }
176 error_arg_ = status.error_arg().as_string();
177 error_code_ = RegexpErrorToRE2(status.code());
178 return;
179 }
180
181 prefix_.clear();
182 prefix_foldcase_ = false;
183 re2::Regexp* suffix;
184 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
185 suffix_regexp_ = suffix;
186 else
187 suffix_regexp_ = entire_regexp_->Incref();
188
189 // Two thirds of the memory goes to the forward Prog,
190 // one third to the reverse prog, because the forward
191 // Prog has two DFAs but the reverse prog has one.
192 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
193 if (prog_ == NULL) {
194 if (options_.log_errors())
195 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
196 error_ = new string("pattern too large - compile failed");
197 error_code_ = RE2::ErrorPatternTooLarge;
198 return;
199 }
200
201 // Could delay this until the first match call that
202 // cares about submatch information, but the one-pass
203 // machine's memory gets cut from the DFA memory budget,
204 // and that is harder to do if the DFA has already
205 // been built.
206 is_one_pass_ = prog_->IsOnePass();
207 }
208
209 // Returns rprog_, computing it if needed.
210 re2::Prog* RE2::ReverseProg() const {
211 MutexLock l(mutex_);
212 if (rprog_ == NULL && error_ == &empty_string) {
213 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
214 if (rprog_ == NULL) {
215 if (options_.log_errors())
216 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
217 error_ = new string("pattern too large - reverse compile failed");
218 error_code_ = RE2::ErrorPatternTooLarge;
219 return NULL;
220 }
221 }
222 return rprog_;
223 }
224
225 static const map<string, int> empty_named_groups;
226 static const map<int, string> empty_group_names;
227
228 RE2::~RE2() {
229 if (suffix_regexp_)
230 suffix_regexp_->Decref();
231 if (entire_regexp_)
232 entire_regexp_->Decref();
233 delete mutex_;
234 delete prog_;
235 delete rprog_;
236 if (error_ != &empty_string)
237 delete error_;
238 if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
239 delete named_groups_;
240 if (group_names_ != NULL && group_names_ != &empty_group_names)
241 delete group_names_;
242 }
243
244 int RE2::ProgramSize() const {
245 if (prog_ == NULL)
246 return -1;
247 return prog_->size();
248 }
249
250 // Returns named_groups_, computing it if needed.
251 const map<string, int>& RE2::NamedCapturingGroups() const {
252 MutexLock l(mutex_);
253 if (!ok())
254 return empty_named_groups;
255 if (named_groups_ == NULL) {
256 named_groups_ = suffix_regexp_->NamedCaptures();
257 if (named_groups_ == NULL)
258 named_groups_ = &empty_named_groups;
259 }
260 return *named_groups_;
261 }
262
263 // Returns group_names_, computing it if needed.
264 const map<int, string>& RE2::CapturingGroupNames() const {
265 MutexLock l(mutex_);
266 if (!ok())
267 return empty_group_names;
268 if (group_names_ == NULL) {
269 group_names_ = suffix_regexp_->CaptureNames();
270 if (group_names_ == NULL)
271 group_names_ = &empty_group_names;
272 }
273 return *group_names_;
274 }
275
276 /***** Convenience interfaces *****/
277
278 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
279 const Arg* const args[], int n) {
280 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
281 }
282
283 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
284 const Arg* const args[], int n) {
285 return re.DoMatch(text, UNANCHORED, NULL, args, n);
286 }
287
288 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
289 const Arg* const args[], int n) {
290 int consumed;
291 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
292 input->remove_prefix(consumed);
293 return true;
294 } else {
295 return false;
296 }
297 }
298
299 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
300 const Arg* const args[], int n) {
301 int consumed;
302 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
303 input->remove_prefix(consumed);
304 return true;
305 } else {
306 return false;
307 }
308 }
309
310 // Returns the maximum submatch needed for the rewrite to be done by Replace().
311 // E.g. if rewrite == "foo \\2,\\1", returns 2.
312 static int MaxSubmatch(const StringPiece& rewrite) {
313 int max = 0;
314 for (const char *s = rewrite.data(), *end = s + rewrite.size();
315 s < end; s++) {
316 if (*s == '\\') {
317 s++;
318 int c = (s < end) ? *s : -1;
319 if (isdigit(c)) {
320 int n = (c - '0');
321 if (n > max)
322 max = n;
323 }
324 }
325 }
326 return max;
327 }
328
329 bool RE2::Replace(string *str,
330 const RE2& re,
331 const StringPiece& rewrite) {
332 StringPiece vec[kVecSize];
333 int nvec = 1 + MaxSubmatch(rewrite);
334 if (nvec > arraysize(vec))
335 return false;
336 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
337 return false;
338
339 string s;
340 if (!re.Rewrite(&s, rewrite, vec, nvec))
341 return false;
342
343 assert(vec[0].begin() >= str->data());
344 assert(vec[0].end() <= str->data()+str->size());
345 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
346 return true;
347 }
348
349 int RE2::GlobalReplace(string *str,
350 const RE2& re,
351 const StringPiece& rewrite) {
352 StringPiece vec[kVecSize];
353 int nvec = 1 + MaxSubmatch(rewrite);
354 if (nvec > arraysize(vec))
355 return false;
356
357 const char* p = str->data();
358 const char* ep = p + str->size();
359 const char* lastend = NULL;
360 string out;
361 int count = 0;
362 while (p <= ep) {
363 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
364 break;
365 if (p < vec[0].begin())
366 out.append(p, vec[0].begin() - p);
367 if (vec[0].begin() == lastend && vec[0].size() == 0) {
368 // Disallow empty match at end of last match: skip ahead.
369 if (p < ep)
370 out.append(p, 1);
371 p++;
372 continue;
373 }
374 re.Rewrite(&out, rewrite, vec, nvec);
375 p = vec[0].end();
376 lastend = p;
377 count++;
378 }
379
380 if (count == 0)
381 return 0;
382
383 if (p < ep)
384 out.append(p, ep - p);
385 swap(out, *str);
386 return count;
387 }
388
389 bool RE2::Extract(const StringPiece &text,
390 const RE2& re,
391 const StringPiece &rewrite,
392 string *out) {
393 StringPiece vec[kVecSize];
394 int nvec = 1 + MaxSubmatch(rewrite);
395 if (nvec > arraysize(vec))
396 return false;
397
398 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
399 return false;
400
401 out->clear();
402 return re.Rewrite(out, rewrite, vec, nvec);
403 }
404
405 string RE2::QuoteMeta(const StringPiece& unquoted) {
406 string result;
407 result.reserve(unquoted.size() << 1);
408
409 // Escape any ascii character not in [A-Za-z_0-9].
410 //
411 // Note that it's legal to escape a character even if it has no
412 // special meaning in a regular expression -- so this function does
413 // that. (This also makes it identical to the perl function of the
414 // same name except for the null-character special case;
415 // see `perldoc -f quotemeta`.)
416 for (int ii = 0; ii < unquoted.length(); ++ii) {
417 // Note that using 'isalnum' here raises the benchmark time from
418 // 32ns to 58ns:
419 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
420 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
421 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
422 unquoted[ii] != '_' &&
423 // If this is the part of a UTF8 or Latin1 character, we need
424 // to copy this byte without escaping. Experimentally this is
425 // what works correctly with the regexp library.
426 !(unquoted[ii] & 128)) {
427 if (unquoted[ii] == '\0') { // Special handling for null chars.
428 // Note that this special handling is not strictly required for RE2,
429 // but this quoting is required for other regexp libraries such as
430 // PCRE.
431 // Can't use "\\0" since the next character might be a digit.
432 result += "\\x00";
433 continue;
434 }
435 result += '\\';
436 }
437 result += unquoted[ii];
438 }
439
440 return result;
441 }
442
443 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
444 if (prog_ == NULL)
445 return false;
446
447 int n = prefix_.size();
448 if (n > maxlen)
449 n = maxlen;
450
451 // Determine initial min max from prefix_ literal.
452 string pmin, pmax;
453 pmin = prefix_.substr(0, n);
454 pmax = prefix_.substr(0, n);
455 if (prefix_foldcase_) {
456 // prefix is ASCII lowercase; change pmin to uppercase.
457 for (int i = 0; i < n; i++) {
458 if ('a' <= pmin[i] && pmin[i] <= 'z')
459 pmin[i] += 'A' - 'a';
460 }
461 }
462
463 // Add to prefix min max using PossibleMatchRange on regexp.
464 string dmin, dmax;
465 maxlen -= n;
466 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
467 pmin += dmin;
468 pmax += dmax;
469 } else if (pmax.size() > 0) {
470 // prog_->PossibleMatchRange has failed us,
471 // but we still have useful information from prefix_.
472 // Round up pmax to allow any possible suffix.
473 pmax = PrefixSuccessor(pmax);
474 } else {
475 // Nothing useful.
476 *min = "";
477 *max = "";
478 return false;
479 }
480
481 *min = pmin;
482 *max = pmax;
483 return true;
484 }
485
486 // Avoid possible locale nonsense in standard strcasecmp.
487 // The string a is known to be all lowercase.
488 static int ascii_strcasecmp(const char* a, const char* b, int len) {
489 const char *ae = a + len;
490
491 for (; a < ae; a++, b++) {
492 uint8 x = *a;
493 uint8 y = *b;
494 if ('A' <= y && y <= 'Z')
495 y += 'a' - 'A';
496 if (x != y)
497 return x - y;
498 }
499 return 0;
500 }
501
502
503 /***** Actual matching and rewriting code *****/
504
505 bool RE2::Match(const StringPiece& text,
506 int startpos,
507 int endpos,
508 Anchor re_anchor,
509 StringPiece* submatch,
510 int nsubmatch) const {
511 if (!ok() || suffix_regexp_ == NULL) {
512 if (options_.log_errors())
513 LOG(ERROR) << "Invalid RE2: " << *error_;
514 return false;
515 }
516
517 if (startpos < 0 || startpos > endpos || endpos > text.size()) {
518 LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
519 return false;
520 }
521
522 StringPiece subtext = text;
523 subtext.remove_prefix(startpos);
524 subtext.remove_suffix(text.size() - endpos);
525
526 // Use DFAs to find exact location of match, filter out non-matches.
527
528 // Don't ask for the location if we won't use it.
529 // SearchDFA can do extra optimizations in that case.
530 StringPiece match;
531 StringPiece* matchp = &match;
532 if (nsubmatch == 0)
533 matchp = NULL;
534
535 int ncap = 1 + NumberOfCapturingGroups();
536 if (ncap > nsubmatch)
537 ncap = nsubmatch;
538
539 // If the regexp is anchored explicitly, must not be in middle of text.
540 if (prog_->anchor_start() && startpos != 0)
541 return false;
542
543 // If the regexp is anchored explicitly, update re_anchor
544 // so that we can potentially fall into a faster case below.
545 if (prog_->anchor_start() && prog_->anchor_end())
546 re_anchor = ANCHOR_BOTH;
547 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
548 re_anchor = ANCHOR_START;
549
550 // Check for the required prefix, if any.
551 int prefixlen = 0;
552 if (!prefix_.empty()) {
553 if (startpos != 0)
554 return false;
555 prefixlen = prefix_.size();
556 if (prefixlen > subtext.size())
557 return false;
558 if (prefix_foldcase_) {
559 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
560 return false;
561 } else {
562 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
563 return false;
564 }
565 subtext.remove_prefix(prefixlen);
566 // If there is a required prefix, the anchor must be at least ANCHOR_START.
567 if (re_anchor != ANCHOR_BOTH)
568 re_anchor = ANCHOR_START;
569 }
570
571 Prog::Anchor anchor = Prog::kUnanchored;
572 Prog::MatchKind kind = Prog::kFirstMatch;
573 if (options_.longest_match())
574 kind = Prog::kLongestMatch;
575 bool skipped_test = false;
576
577 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
578
579 // SearchBitState allocates a bit vector of size prog_->size() * text.size().
580 // It also allocates a stack of 3-word structures which could potentially
581 // grow as large as prog_->size() * text.size() but in practice is much
582 // smaller.
583 // Conditions for using SearchBitState:
584 const int MaxBitStateProg = 500; // prog_->size() <= Max.
585 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
586 bool can_bit_state = prog_->size() <= MaxBitStateProg;
587 int bit_state_text_max = MaxBitStateVector / prog_->size();
588
589 bool dfa_failed = false;
590 switch (re_anchor) {
591 default:
592 case UNANCHORED: {
593 if (!prog_->SearchDFA(subtext, text, anchor, kind,
594 matchp, &dfa_failed, NULL)) {
595 if (dfa_failed) {
596 // Fall back to NFA below.
597 skipped_test = true;
598 if (FLAGS_trace_re2)
599 LOG(INFO) << "Match " << trunc(pattern_)
600 << " [" << CEscape(subtext) << "]"
601 << " DFA failed.";
602 break;
603 }
604 if (FLAGS_trace_re2)
605 LOG(INFO) << "Match " << trunc(pattern_)
606 << " [" << CEscape(subtext) << "]"
607 << " used DFA - no match.";
608 return false;
609 }
610 if (FLAGS_trace_re2)
611 LOG(INFO) << "Match " << trunc(pattern_)
612 << " [" << CEscape(subtext) << "]"
613 << " used DFA - match";
614 if (matchp == NULL) // Matched. Don't care where
615 return true;
616 // SearchDFA set match[0].end() but didn't know where the
617 // match started. Run the regexp backward from match[0].end()
618 // to find the longest possible match -- that's where it started.
619 Prog* prog = ReverseProg();
620 if (prog == NULL)
621 return false;
622 if (!prog->SearchDFA(match, text, Prog::kAnchored,
623 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
624 if (dfa_failed) {
625 // Fall back to NFA below.
626 skipped_test = true;
627 if (FLAGS_trace_re2)
628 LOG(INFO) << "Match " << trunc(pattern_)
629 << " [" << CEscape(subtext) << "]"
630 << " reverse DFA failed.";
631 break;
632 }
633 if (FLAGS_trace_re2)
634 LOG(INFO) << "Match " << trunc(pattern_)
635 << " [" << CEscape(subtext) << "]"
636 << " DFA inconsistency.";
637 LOG(ERROR) << "DFA inconsistency";
638 return false;
639 }
640 if (FLAGS_trace_re2)
641 LOG(INFO) << "Match " << trunc(pattern_)
642 << " [" << CEscape(subtext) << "]"
643 << " used reverse DFA.";
644 break;
645 }
646
647 case ANCHOR_BOTH:
648 case ANCHOR_START:
649 if (re_anchor == ANCHOR_BOTH)
650 kind = Prog::kFullMatch;
651 anchor = Prog::kAnchored;
652
653 // If only a small amount of text and need submatch
654 // information anyway and we're going to use OnePass or BitState
655 // to get it, we might as well not even bother with the DFA:
656 // OnePass or BitState will be fast enough.
657 // On tiny texts, OnePass outruns even the DFA, and
658 // it doesn't have the shared state and occasional mutex that
659 // the DFA does.
660 if (can_one_pass && text.size() <= 4096 &&
661 (ncap > 1 || text.size() <= 8)) {
662 if (FLAGS_trace_re2)
663 LOG(INFO) << "Match " << trunc(pattern_)
664 << " [" << CEscape(subtext) << "]"
665 << " skipping DFA for OnePass.";
666 skipped_test = true;
667 break;
668 }
669 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
670 if (FLAGS_trace_re2)
671 LOG(INFO) << "Match " << trunc(pattern_)
672 << " [" << CEscape(subtext) << "]"
673 << " skipping DFA for BitState.";
674 skipped_test = true;
675 break;
676 }
677 if (!prog_->SearchDFA(subtext, text, anchor, kind,
678 &match, &dfa_failed, NULL)) {
679 if (dfa_failed) {
680 if (FLAGS_trace_re2)
681 LOG(INFO) << "Match " << trunc(pattern_)
682 << " [" << CEscape(subtext) << "]"
683 << " DFA failed.";
684 skipped_test = true;
685 break;
686 }
687 if (FLAGS_trace_re2)
688 LOG(INFO) << "Match " << trunc(pattern_)
689 << " [" << CEscape(subtext) << "]"
690 << " used DFA - no match.";
691 return false;
692 }
693 break;
694 }
695
696 if (!skipped_test && ncap <= 1) {
697 // We know exactly where it matches. That's enough.
698 if (ncap == 1)
699 submatch[0] = match;
700 } else {
701 StringPiece subtext1;
702 if (skipped_test) {
703 // DFA ran out of memory or was skipped:
704 // need to search in entire original text.
705 subtext1 = subtext;
706 } else {
707 // DFA found the exact match location:
708 // let NFA run an anchored, full match search
709 // to find submatch locations.
710 subtext1 = match;
711 anchor = Prog::kAnchored;
712 kind = Prog::kFullMatch;
713 }
714
715 if (can_one_pass && anchor != Prog::kUnanchored) {
716 if (FLAGS_trace_re2)
717 LOG(INFO) << "Match " << trunc(pattern_)
718 << " [" << CEscape(subtext) << "]"
719 << " using OnePass.";
720 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
721 if (!skipped_test)
722 LOG(ERROR) << "SearchOnePass inconsistency";
723 return false;
724 }
725 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
726 if (FLAGS_trace_re2)
727 LOG(INFO) << "Match " << trunc(pattern_)
728 << " [" << CEscape(subtext) << "]"
729 << " using BitState.";
730 if (!prog_->SearchBitState(subtext1, text, anchor,
731 kind, submatch, ncap)) {
732 if (!skipped_test)
733 LOG(ERROR) << "SearchBitState inconsistency";
734 return false;
735 }
736 } else {
737 if (FLAGS_trace_re2)
738 LOG(INFO) << "Match " << trunc(pattern_)
739 << " [" << CEscape(subtext) << "]"
740 << " using NFA.";
741 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
742 if (!skipped_test)
743 LOG(ERROR) << "SearchNFA inconsistency";
744 return false;
745 }
746 }
747 }
748
749 // Adjust overall match for required prefix that we stripped off.
750 if (prefixlen > 0 && nsubmatch > 0)
751 submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
752 submatch[0].size() + prefixlen);
753
754 // Zero submatches that don't exist in the regexp.
755 for (int i = ncap; i < nsubmatch; i++)
756 submatch[i] = NULL;
757 return true;
758 }
759
760 // Internal matcher - like Match() but takes Args not StringPieces.
761 bool RE2::DoMatch(const StringPiece& text,
762 Anchor anchor,
763 int* consumed,
764 const Arg* const* args,
765 int n) const {
766 if (!ok()) {
767 if (options_.log_errors())
768 LOG(ERROR) << "Invalid RE2: " << *error_;
769 return false;
770 }
771
772 // Count number of capture groups needed.
773 int nvec;
774 if (n == 0 && consumed == NULL)
775 nvec = 0;
776 else
777 nvec = n+1;
778
779 StringPiece* vec;
780 StringPiece stkvec[kVecSize];
781 StringPiece* heapvec = NULL;
782
783 if (nvec <= arraysize(stkvec)) {
784 vec = stkvec;
785 } else {
786 vec = new StringPiece[nvec];
787 heapvec = vec;
788 }
789
790 if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
791 delete[] heapvec;
792 return false;
793 }
794
795 if(consumed != NULL)
796 *consumed = vec[0].end() - text.begin();
797
798 if (n == 0 || args == NULL) {
799 // We are not interested in results
800 delete[] heapvec;
801 return true;
802 }
803
804 int ncap = NumberOfCapturingGroups();
805 if (ncap < n) {
806 // RE has fewer capturing groups than number of arg pointers passed in
807 VLOG(1) << "Asked for " << n << " but only have " << ncap;
808 delete[] heapvec;
809 return false;
810 }
811
812 // If we got here, we must have matched the whole pattern.
813 for (int i = 0; i < n; i++) {
814 const StringPiece& s = vec[i+1];
815 if (!args[i]->Parse(s.data(), s.size())) {
816 // TODO: Should we indicate what the error was?
817 VLOG(1) << "Parse error on #" << i << " " << s << " "
818 << (void*)s.data() << "/" << s.size();
819 delete[] heapvec;
820 return false;
821 }
822 }
823
824 delete[] heapvec;
825 return true;
826 }
827
828 // Append the "rewrite" string, with backslash subsitutions from "vec",
829 // to string "out".
830 bool RE2::Rewrite(string *out, const StringPiece &rewrite,
831 const StringPiece *vec, int veclen) const {
832 for (const char *s = rewrite.data(), *end = s + rewrite.size();
833 s < end; s++) {
834 int c = *s;
835 if (c == '\\') {
836 s++;
837 c = (s < end) ? *s : -1;
838 if (isdigit(c)) {
839 int n = (c - '0');
840 if (n >= veclen) {
841 LOG(ERROR) << "requested group " << n
842 << " in regexp " << rewrite.data();
843 return false;
844 }
845 StringPiece snip = vec[n];
846 if (snip.size() > 0)
847 out->append(snip.data(), snip.size());
848 } else if (c == '\\') {
849 out->push_back('\\');
850 } else {
851 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
852 return false;
853 }
854 } else {
855 out->push_back(c);
856 }
857 }
858 return true;
859 }
860
861 // Return the number of capturing subpatterns, or -1 if the
862 // regexp wasn't valid on construction.
863 int RE2::NumberOfCapturingGroups() const {
864 if (suffix_regexp_ == NULL)
865 return -1;
866 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
867 " multiple threads end up doing the same work in parallel.");
868 if (num_captures_ == -1)
869 num_captures_ = suffix_regexp_->NumCaptures();
870 return num_captures_;
871 }
872
873 // Checks that the rewrite string is well-formed with respect to this
874 // regular expression.
875 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
876 int max_token = -1;
877 for (const char *s = rewrite.data(), *end = s + rewrite.size();
878 s < end; s++) {
879 int c = *s;
880 if (c != '\\') {
881 continue;
882 }
883 if (++s == end) {
884 *error = "Rewrite schema error: '\\' not allowed at end.";
885 return false;
886 }
887 c = *s;
888 if (c == '\\') {
889 continue;
890 }
891 if (!isdigit(c)) {
892 *error = "Rewrite schema error: "
893 "'\\' must be followed by a digit or '\\'.";
894 return false;
895 }
896 int n = (c - '0');
897 if (max_token < n) {
898 max_token = n;
899 }
900 }
901
902 if (max_token > NumberOfCapturingGroups()) {
903 SStringPrintf(error, "Rewrite schema requests %d matches, "
904 "but the regexp only has %d parenthesized subexpressions.",
905 max_token, NumberOfCapturingGroups());
906 return false;
907 }
908 return true;
909 }
910
911 /***** Parsers for various types *****/
912
913 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
914 // We fail if somebody asked us to store into a non-NULL void* pointer
915 return (dest == NULL);
916 }
917
918 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
919 if (dest == NULL) return true;
920 reinterpret_cast<string*>(dest)->assign(str, n);
921 return true;
922 }
923
924 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
925 if (dest == NULL) return true;
926 reinterpret_cast<StringPiece*>(dest)->set(str, n);
927 return true;
928 }
929
930 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
931 if (n != 1) return false;
932 if (dest == NULL) return true;
933 *(reinterpret_cast<char*>(dest)) = str[0];
934 return true;
935 }
936
937 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
938 if (n != 1) return false;
939 if (dest == NULL) return true;
940 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
941 return true;
942 }
943
944 // Largest number spec that we are willing to parse
945 static const int kMaxNumberLength = 32;
946
947 // REQUIRES "buf" must have length at least kMaxNumberLength+1
948 // Copies "str" into "buf" and null-terminates.
949 // Overwrites *np with the new length.
950 static const char* TerminateNumber(char* buf, const char* str, int* np) {
951 int n = *np;
952 if (n <= 0) return "";
953 if (n > 0 && isspace(*str)) {
954 // We are less forgiving than the strtoxxx() routines and do not
955 // allow leading spaces.
956 return "";
957 }
958
959 // Although buf has a fixed maximum size, we can still handle
960 // arbitrarily large integers correctly by omitting leading zeros.
961 // (Numbers that are still too long will be out of range.)
962 // Before deciding whether str is too long,
963 // remove leading zeros with s/000+/00/.
964 // Leaving the leading two zeros in place means that
965 // we don't change 0000x123 (invalid) into 0x123 (valid).
966 // Skip over leading - before replacing.
967 bool neg = false;
968 if (n >= 1 && str[0] == '-') {
969 neg = true;
970 n--;
971 str++;
972 }
973
974 if (n >= 3 && str[0] == '0' && str[1] == '0') {
975 while (n >= 3 && str[2] == '0') {
976 n--;
977 str++;
978 }
979 }
980
981 if (neg) { // make room in buf for -
982 n++;
983 str--;
984 }
985
986 if (n > kMaxNumberLength) return "";
987
988 memmove(buf, str, n);
989 if (neg) {
990 buf[0] = '-';
991 }
992 buf[n] = '\0';
993 *np = n;
994 return buf;
995 }
996
997 bool RE2::Arg::parse_long_radix(const char* str,
998 int n,
999 void* dest,
1000 int radix) {
1001 if (n == 0) return false;
1002 char buf[kMaxNumberLength+1];
1003 str = TerminateNumber(buf, str, &n);
1004 char* end;
1005 errno = 0;
1006 long r = strtol(str, &end, radix);
1007 if (end != str + n) return false; // Leftover junk
1008 if (errno) return false;
1009 if (dest == NULL) return true;
1010 *(reinterpret_cast<long*>(dest)) = r;
1011 return true;
1012 }
1013
1014 bool RE2::Arg::parse_ulong_radix(const char* str,
1015 int n,
1016 void* dest,
1017 int radix) {
1018 if (n == 0) return false;
1019 char buf[kMaxNumberLength+1];
1020 str = TerminateNumber(buf, str, &n);
1021 if (str[0] == '-') {
1022 // strtoul() will silently accept negative numbers and parse
1023 // them. This module is more strict and treats them as errors.
1024 return false;
1025 }
1026
1027 char* end;
1028 errno = 0;
1029 unsigned long r = strtoul(str, &end, radix);
1030 if (end != str + n) return false; // Leftover junk
1031 if (errno) return false;
1032 if (dest == NULL) return true;
1033 *(reinterpret_cast<unsigned long*>(dest)) = r;
1034 return true;
1035 }
1036
1037 bool RE2::Arg::parse_short_radix(const char* str,
1038 int n,
1039 void* dest,
1040 int radix) {
1041 long r;
1042 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1043 if ((short)r != r) return false; // Out of range
1044 if (dest == NULL) return true;
1045 *(reinterpret_cast<short*>(dest)) = r;
1046 return true;
1047 }
1048
1049 bool RE2::Arg::parse_ushort_radix(const char* str,
1050 int n,
1051 void* dest,
1052 int radix) {
1053 unsigned long r;
1054 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1055 if ((ushort)r != r) return false; // Out of range
1056 if (dest == NULL) return true;
1057 *(reinterpret_cast<unsigned short*>(dest)) = r;
1058 return true;
1059 }
1060
1061 bool RE2::Arg::parse_int_radix(const char* str,
1062 int n,
1063 void* dest,
1064 int radix) {
1065 long r;
1066 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1067 if ((int)r != r) return false; // Out of range
1068 if (dest == NULL) return true;
1069 *(reinterpret_cast<int*>(dest)) = r;
1070 return true;
1071 }
1072
1073 bool RE2::Arg::parse_uint_radix(const char* str,
1074 int n,
1075 void* dest,
1076 int radix) {
1077 unsigned long r;
1078 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1079 if ((uint)r != r) return false; // Out of range
1080 if (dest == NULL) return true;
1081 *(reinterpret_cast<unsigned int*>(dest)) = r;
1082 return true;
1083 }
1084
1085 bool RE2::Arg::parse_longlong_radix(const char* str,
1086 int n,
1087 void* dest,
1088 int radix) {
1089 if (n == 0) return false;
1090 char buf[kMaxNumberLength+1];
1091 str = TerminateNumber(buf, str, &n);
1092 char* end;
1093 errno = 0;
1094 int64 r = strtoll(str, &end, radix);
1095 if (end != str + n) return false; // Leftover junk
1096 if (errno) return false;
1097 if (dest == NULL) return true;
1098 *(reinterpret_cast<int64*>(dest)) = r;
1099 return true;
1100 }
1101
1102 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1103 int n,
1104 void* dest,
1105 int radix) {
1106 if (n == 0) return false;
1107 char buf[kMaxNumberLength+1];
1108 str = TerminateNumber(buf, str, &n);
1109 if (str[0] == '-') {
1110 // strtoull() will silently accept negative numbers and parse
1111 // them. This module is more strict and treats them as errors.
1112 return false;
1113 }
1114 char* end;
1115 errno = 0;
1116 uint64 r = strtoull(str, &end, radix);
1117 if (end != str + n) return false; // Leftover junk
1118 if (errno) return false;
1119 if (dest == NULL) return true;
1120 *(reinterpret_cast<uint64*>(dest)) = r;
1121 return true;
1122 }
1123
1124 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
1125 if (n == 0) return false;
1126 static const int kMaxLength = 200;
1127 char buf[kMaxLength];
1128 if (n >= kMaxLength) return false;
1129 memcpy(buf, str, n);
1130 buf[n] = '\0';
1131 errno = 0;
1132 char* end;
1133 double r;
1134 if (isfloat) {
1135 r = strtof(buf, &end);
1136 } else {
1137 r = strtod(buf, &end);
1138 }
1139 if (end != buf + n) return false; // Leftover junk
1140 if (errno) return false;
1141 if (dest == NULL) return true;
1142 if (isfloat) {
1143 *(reinterpret_cast<float*>(dest)) = r;
1144 } else {
1145 *(reinterpret_cast<double*>(dest)) = r;
1146 }
1147 return true;
1148 }
1149
1150 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1151 return parse_double_float(str, n, false, dest);
1152 }
1153
1154 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1155 return parse_double_float(str, n, true, dest);
1156 }
1157
1158
1159 #define DEFINE_INTEGER_PARSERS(name) \
1160 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \
1161 return parse_##name##_radix(str, n, dest, 10); \
1162 } \
1163 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
1164 return parse_##name##_radix(str, n, dest, 16); \
1165 } \
1166 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
1167 return parse_##name##_radix(str, n, dest, 8); \
1168 } \
1169 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1170 return parse_##name##_radix(str, n, dest, 0); \
1171 }
1172
1173 DEFINE_INTEGER_PARSERS(short);
1174 DEFINE_INTEGER_PARSERS(ushort);
1175 DEFINE_INTEGER_PARSERS(int);
1176 DEFINE_INTEGER_PARSERS(uint);
1177 DEFINE_INTEGER_PARSERS(long);
1178 DEFINE_INTEGER_PARSERS(ulong);
1179 DEFINE_INTEGER_PARSERS(longlong);
1180 DEFINE_INTEGER_PARSERS(ulonglong);
1181
1182 #undef DEFINE_INTEGER_PARSERS
1183
1184 } // namespace re2
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698