third_party/re2/re2/re2.cc - Issue 10575037: Include RE2 library

Side by Side Diff: third_party/re2/re2/re2.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Removed valgrind specific code that is in Chromium already Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.

	2 // Use of this source code is governed by a BSD-style

	3 // license that can be found in the LICENSE file.

	4

	5 // Regular expression interface RE2.

	6 //

	7 // Originally the PCRE C++ wrapper, but adapted to use

	8 // the new automata-based regular expression engines.

	9

	10 #include "re2/re2.h"

	11

	12 #include <stdio.h>

	13 #include <string>

	14 #ifdef WIN32

	15 #define strtoll _strtoi64

	16 #define strtoull _strtoui64

	17 #define strtof strtod

	18 #else

	19 #include <pthread.h>

	20 #endif

	21 #include <errno.h>

	22 #include "util/util.h"

	23 #include "util/flags.h"

	24 #include "re2/prog.h"

	25 #include "re2/regexp.h"

	26

	27 DEFINE_bool(trace_re2, false, "trace RE2 execution");

	28

	29 namespace re2 {

	30

	31 // Maximum number of args we can set

	32 static const int kMaxArgs = 16;

	33 static const int kVecSize = 1+kMaxArgs;

	34

	35 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful lMatchN> RE2::FullMatch;

	36 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par tialMatchN> RE2::PartialMatch;

	37 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;

	38 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo nsumeN> RE2::FindAndConsume;

	39

	40 // Commonly-used option sets; arguments to constructor are:

	41 // utf8 input

	42 // posix syntax

	43 // longest match

	44 // log errors

	45 const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true

	46 const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true) ;

	47 //const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);

	48 const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);

	49

	50 // If a regular expression has no error, its error_ field points here

	51 static const string empty_string;

	52

	53 // Converts from Regexp error code to RE2 error code.

	54 // Maybe some day they will diverge. In any event, this

	55 // hides the existence of Regexp from RE2 users.

	56 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {

	57 switch (code) {

	58 case re2::kRegexpSuccess:

	59 return RE2::NoError;

	60 case re2::kRegexpInternalError:

	61 return RE2::ErrorInternal;

	62 case re2::kRegexpBadEscape:

	63 return RE2::ErrorBadEscape;

	64 case re2::kRegexpBadCharClass:

	65 return RE2::ErrorBadCharClass;

	66 case re2::kRegexpBadCharRange:

	67 return RE2::ErrorBadCharRange;

	68 case re2::kRegexpMissingBracket:

	69 return RE2::ErrorMissingBracket;

	70 case re2::kRegexpMissingParen:

	71 return RE2::ErrorMissingParen;

	72 case re2::kRegexpTrailingBackslash:

	73 return RE2::ErrorTrailingBackslash;

	74 case re2::kRegexpRepeatArgument:

	75 return RE2::ErrorRepeatArgument;

	76 case re2::kRegexpRepeatSize:

	77 return RE2::ErrorRepeatSize;

	78 case re2::kRegexpRepeatOp:

	79 return RE2::ErrorRepeatOp;

	80 case re2::kRegexpBadPerlOp:

	81 return RE2::ErrorBadPerlOp;

	82 case re2::kRegexpBadUTF8:

	83 return RE2::ErrorBadUTF8;

	84 case re2::kRegexpBadNamedCapture:

	85 return RE2::ErrorBadNamedCapture;

	86 }

	87 return RE2::ErrorInternal;

	88 }

	89

	90 static string trunc(const StringPiece& pattern) {

	91 if (pattern.size() < 100)

	92 return pattern.as_string();

	93 return pattern.substr(0, 100).as_string() + "...";

	94 }

	95

	96

	97 RE2::RE2(const char* pattern) {

	98 Init(pattern, DefaultOptions);

	99 }

	100

	101 RE2::RE2(const string& pattern) {

	102 Init(pattern, DefaultOptions);

	103 }

	104

	105 RE2::RE2(const StringPiece& pattern) {

	106 Init(pattern, DefaultOptions);

	107 }

	108

	109 RE2::RE2(const StringPiece& pattern, const Options& options) {

	110 Init(pattern, options);

	111 }

	112

	113 int RE2::Options::ParseFlags() const {

	114 int flags = Regexp::ClassNL;

	115 switch (encoding()) {

	116 default:

	117 LOG(ERROR) << "Unknown encoding " << encoding();

	118 break;

	119 case RE2::Options::EncodingUTF8:

	120 break;

	121 case RE2::Options::EncodingLatin1:

	122 flags \|= Regexp::Latin1;

	123 break;

	124 }

	125

	126 if (!posix_syntax())

	127 flags \|= Regexp::LikePerl;

	128

	129 if (literal())

	130 flags \|= Regexp::Literal;

	131

	132 if (never_nl())

	133 flags \|= Regexp::NeverNL;

	134

	135 if (!case_sensitive())

	136 flags \|= Regexp::FoldCase;

	137

	138 if (perl_classes())

	139 flags \|= Regexp::PerlClasses;

	140

	141 if (word_boundary())

	142 flags \|= Regexp::PerlB;

	143

	144 if (one_line())

	145 flags \|= Regexp::OneLine;

	146

	147 return flags;

	148 }

	149

	150 void RE2::Init(const StringPiece& pattern, const Options& options) {

	151 mutex_ = new Mutex;

	152 pattern_ = pattern.as_string();

	153 options_.Copy(options);

	154 error_ = &empty_string;

	155 error_code_ = NoError;

	156 suffix_regexp_ = NULL;

	157 entire_regexp_ = NULL;

	158 prog_ = NULL;

	159 rprog_ = NULL;

	160 named_groups_ = NULL;

	161 group_names_ = NULL;

	162 num_captures_ = -1;

	163

	164 RegexpStatus status;

	165 entire_regexp_ = Regexp::Parse(

	166 pattern_,

	167 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),

	168 &status);

	169 if (entire_regexp_ == NULL) {

	170 if (error_ == &empty_string)

	171 error_ = new string(status.Text());

	172 if (options_.log_errors()) {

	173 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "

	174 << status.Text();

	175 }

	176 error_arg_ = status.error_arg().as_string();

	177 error_code_ = RegexpErrorToRE2(status.code());

	178 return;

	179 }

	180

	181 prefix_.clear();

	182 prefix_foldcase_ = false;

	183 re2::Regexp* suffix;

	184 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))

	185 suffix_regexp_ = suffix;

	186 else

	187 suffix_regexp_ = entire_regexp_->Incref();

	188

	189 // Two thirds of the memory goes to the forward Prog,

	190 // one third to the reverse prog, because the forward

	191 // Prog has two DFAs but the reverse prog has one.

	192 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);

	193 if (prog_ == NULL) {

	194 if (options_.log_errors())

	195 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";

	196 error_ = new string("pattern too large - compile failed");

	197 error_code_ = RE2::ErrorPatternTooLarge;

	198 return;

	199 }

	200

	201 // Could delay this until the first match call that

	202 // cares about submatch information, but the one-pass

	203 // machine's memory gets cut from the DFA memory budget,

	204 // and that is harder to do if the DFA has already

	205 // been built.

	206 is_one_pass_ = prog_->IsOnePass();

	207 }

	208

	209 // Returns rprog_, computing it if needed.

	210 re2::Prog* RE2::ReverseProg() const {

	211 MutexLock l(mutex_);

	212 if (rprog_ == NULL && error_ == &empty_string) {

	213 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);

	214 if (rprog_ == NULL) {

	215 if (options_.log_errors())

	216 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";

	217 error_ = new string("pattern too large - reverse compile failed");

	218 error_code_ = RE2::ErrorPatternTooLarge;

	219 return NULL;

	220 }

	221 }

	222 return rprog_;

	223 }

	224

	225 static const map<string, int> empty_named_groups;

	226 static const map<int, string> empty_group_names;

	227

	228 RE2::~RE2() {

	229 if (suffix_regexp_)

	230 suffix_regexp_->Decref();

	231 if (entire_regexp_)

	232 entire_regexp_->Decref();

	233 delete mutex_;

	234 delete prog_;

	235 delete rprog_;

	236 if (error_ != &empty_string)

	237 delete error_;

	238 if (named_groups_ != NULL && named_groups_ != &empty_named_groups)

	239 delete named_groups_;

	240 if (group_names_ != NULL && group_names_ != &empty_group_names)

	241 delete group_names_;

	242 }

	243

	244 int RE2::ProgramSize() const {

	245 if (prog_ == NULL)

	246 return -1;

	247 return prog_->size();

	248 }

	249

	250 // Returns named_groups_, computing it if needed.

	251 const map<string, int>& RE2::NamedCapturingGroups() const {

	252 MutexLock l(mutex_);

	253 if (!ok())

	254 return empty_named_groups;

	255 if (named_groups_ == NULL) {

	256 named_groups_ = suffix_regexp_->NamedCaptures();

	257 if (named_groups_ == NULL)

	258 named_groups_ = &empty_named_groups;

	259 }

	260 return *named_groups_;

	261 }

	262

	263 // Returns group_names_, computing it if needed.

	264 const map<int, string>& RE2::CapturingGroupNames() const {

	265 MutexLock l(mutex_);

	266 if (!ok())

	267 return empty_group_names;

	268 if (group_names_ == NULL) {

	269 group_names_ = suffix_regexp_->CaptureNames();

	270 if (group_names_ == NULL)

	271 group_names_ = &empty_group_names;

	272 }

	273 return *group_names_;

	274 }

	275

	276 /*** Convenience interfaces ***/

	277

	278 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,

	279 const Arg* const args[], int n) {

	280 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);

	281 }

	282

	283 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,

	284 const Arg* const args[], int n) {

	285 return re.DoMatch(text, UNANCHORED, NULL, args, n);

	286 }

	287

	288 bool RE2::ConsumeN(StringPiece* input, const RE2& re,

	289 const Arg* const args[], int n) {

	290 int consumed;

	291 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {

	292 input->remove_prefix(consumed);

	293 return true;

	294 } else {

	295 return false;

	296 }

	297 }

	298

	299 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,

	300 const Arg* const args[], int n) {

	301 int consumed;

	302 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {

	303 input->remove_prefix(consumed);

	304 return true;

	305 } else {

	306 return false;

	307 }

	308 }

	309

	310 // Returns the maximum submatch needed for the rewrite to be done by Replace().

	311 // E.g. if rewrite == "foo \\2,\\1", returns 2.

	312 static int MaxSubmatch(const StringPiece& rewrite) {

	313 int max = 0;

	314 for (const char s = rewrite.data(), end = s + rewrite.size();

	315 s < end; s++) {

	316 if (*s == '\\') {

	317 s++;

	318 int c = (s < end) ? *s : -1;

	319 if (isdigit(c)) {

	320 int n = (c - '0');

	321 if (n > max)

	322 max = n;

	323 }

	324 }

	325 }

	326 return max;

	327 }

	328

	329 bool RE2::Replace(string *str,

	330 const RE2& re,

	331 const StringPiece& rewrite) {

	332 StringPiece vec[kVecSize];

	333 int nvec = 1 + MaxSubmatch(rewrite);

	334 if (nvec > arraysize(vec))

	335 return false;

	336 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))

	337 return false;

	338

	339 string s;

	340 if (!re.Rewrite(&s, rewrite, vec, nvec))

	341 return false;

	342

	343 assert(vec[0].begin() >= str->data());

	344 assert(vec[0].end() <= str->data()+str->size());

	345 str->replace(vec[0].data() - str->data(), vec[0].size(), s);

	346 return true;

	347 }

	348

	349 int RE2::GlobalReplace(string *str,

	350 const RE2& re,

	351 const StringPiece& rewrite) {

	352 StringPiece vec[kVecSize];

	353 int nvec = 1 + MaxSubmatch(rewrite);

	354 if (nvec > arraysize(vec))

	355 return false;

	356

	357 const char* p = str->data();

	358 const char* ep = p + str->size();

	359 const char* lastend = NULL;

	360 string out;

	361 int count = 0;

	362 while (p <= ep) {

	363 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))

	364 break;

	365 if (p < vec[0].begin())

	366 out.append(p, vec[0].begin() - p);

	367 if (vec[0].begin() == lastend && vec[0].size() == 0) {

	368 // Disallow empty match at end of last match: skip ahead.

	369 if (p < ep)

	370 out.append(p, 1);

	371 p++;

	372 continue;

	373 }

	374 re.Rewrite(&out, rewrite, vec, nvec);

	375 p = vec[0].end();

	376 lastend = p;

	377 count++;

	378 }

	379

	380 if (count == 0)

	381 return 0;

	382

	383 if (p < ep)

	384 out.append(p, ep - p);

	385 swap(out, *str);

	386 return count;

	387 }

	388

	389 bool RE2::Extract(const StringPiece &text,

	390 const RE2& re,

	391 const StringPiece &rewrite,

	392 string *out) {

	393 StringPiece vec[kVecSize];

	394 int nvec = 1 + MaxSubmatch(rewrite);

	395 if (nvec > arraysize(vec))

	396 return false;

	397

	398 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))

	399 return false;

	400

	401 out->clear();

	402 return re.Rewrite(out, rewrite, vec, nvec);

	403 }

	404

	405 string RE2::QuoteMeta(const StringPiece& unquoted) {

	406 string result;

	407 result.reserve(unquoted.size() << 1);

	408

	409 // Escape any ascii character not in [A-Za-z_0-9].

	410 //

	411 // Note that it's legal to escape a character even if it has no

	412 // special meaning in a regular expression -- so this function does

	413 // that. (This also makes it identical to the perl function of the

	414 // same name except for the null-character special case;

	415 // see `perldoc -f quotemeta`.)

	416 for (int ii = 0; ii < unquoted.length(); ++ii) {

	417 // Note that using 'isalnum' here raises the benchmark time from

	418 // 32ns to 58ns:

	419 if ((unquoted[ii] < 'a' \|\| unquoted[ii] > 'z') &&

	420 (unquoted[ii] < 'A' \|\| unquoted[ii] > 'Z') &&

	421 (unquoted[ii] < '0' \|\| unquoted[ii] > '9') &&

	422 unquoted[ii] != '_' &&

	423 // If this is the part of a UTF8 or Latin1 character, we need

	424 // to copy this byte without escaping. Experimentally this is

	425 // what works correctly with the regexp library.

	426 !(unquoted[ii] & 128)) {

	427 if (unquoted[ii] == '\0') { // Special handling for null chars.

	428 // Note that this special handling is not strictly required for RE2,

	429 // but this quoting is required for other regexp libraries such as

	430 // PCRE.

	431 // Can't use "\\0" since the next character might be a digit.

	432 result += "\\x00";

	433 continue;

	434 }

	435 result += '\\';

	436 }

	437 result += unquoted[ii];

	438 }

	439

	440 return result;

	441 }

	442

	443 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {

	444 if (prog_ == NULL)

	445 return false;

	446

	447 int n = prefix_.size();

	448 if (n > maxlen)

	449 n = maxlen;

	450

	451 // Determine initial min max from prefix_ literal.

	452 string pmin, pmax;

	453 pmin = prefix_.substr(0, n);

	454 pmax = prefix_.substr(0, n);

	455 if (prefix_foldcase_) {

	456 // prefix is ASCII lowercase; change pmin to uppercase.

	457 for (int i = 0; i < n; i++) {

	458 if ('a' <= pmin[i] && pmin[i] <= 'z')

	459 pmin[i] += 'A' - 'a';

	460 }

	461 }

	462

	463 // Add to prefix min max using PossibleMatchRange on regexp.

	464 string dmin, dmax;

	465 maxlen -= n;

	466 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {

	467 pmin += dmin;

	468 pmax += dmax;

	469 } else if (pmax.size() > 0) {

	470 // prog_->PossibleMatchRange has failed us,

	471 // but we still have useful information from prefix_.

	472 // Round up pmax to allow any possible suffix.

	473 pmax = PrefixSuccessor(pmax);

	474 } else {

	475 // Nothing useful.

	476 *min = "";

	477 *max = "";

	478 return false;

	479 }

	480

	481 *min = pmin;

	482 *max = pmax;

	483 return true;

	484 }

	485

	486 // Avoid possible locale nonsense in standard strcasecmp.

	487 // The string a is known to be all lowercase.

	488 static int ascii_strcasecmp(const char* a, const char* b, int len) {

	489 const char *ae = a + len;

	490

	491 for (; a < ae; a++, b++) {

	492 uint8 x = *a;

	493 uint8 y = *b;

	494 if ('A' <= y && y <= 'Z')

	495 y += 'a' - 'A';

	496 if (x != y)

	497 return x - y;

	498 }

	499 return 0;

	500 }

	501

	502

	503 /*** Actual matching and rewriting code ***/

	504

	505 bool RE2::Match(const StringPiece& text,

	506 int startpos,

	507 int endpos,

	508 Anchor re_anchor,

	509 StringPiece* submatch,

	510 int nsubmatch) const {

	511 if (!ok() \|\| suffix_regexp_ == NULL) {

	512 if (options_.log_errors())

	513 LOG(ERROR) << "Invalid RE2: " << *error_;

	514 return false;

	515 }

	516

	517 if (startpos < 0 \|\| startpos > endpos \|\| endpos > text.size()) {

	518 LOG(ERROR) << "RE2: invalid startpos, endpos pair.";

	519 return false;

	520 }

	521

	522 StringPiece subtext = text;

	523 subtext.remove_prefix(startpos);

	524 subtext.remove_suffix(text.size() - endpos);

	525

	526 // Use DFAs to find exact location of match, filter out non-matches.

	527

	528 // Don't ask for the location if we won't use it.

	529 // SearchDFA can do extra optimizations in that case.

	530 StringPiece match;

	531 StringPiece* matchp = &match;

	532 if (nsubmatch == 0)

	533 matchp = NULL;

	534

	535 int ncap = 1 + NumberOfCapturingGroups();

	536 if (ncap > nsubmatch)

	537 ncap = nsubmatch;

	538

	539 // If the regexp is anchored explicitly, must not be in middle of text.

	540 if (prog_->anchor_start() && startpos != 0)

	541 return false;

	542

	543 // If the regexp is anchored explicitly, update re_anchor

	544 // so that we can potentially fall into a faster case below.

	545 if (prog_->anchor_start() && prog_->anchor_end())

	546 re_anchor = ANCHOR_BOTH;

	547 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)

	548 re_anchor = ANCHOR_START;

	549

	550 // Check for the required prefix, if any.

	551 int prefixlen = 0;

	552 if (!prefix_.empty()) {

	553 if (startpos != 0)

	554 return false;

	555 prefixlen = prefix_.size();

	556 if (prefixlen > subtext.size())

	557 return false;

	558 if (prefix_foldcase_) {

	559 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)

	560 return false;

	561 } else {

	562 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)

	563 return false;

	564 }

	565 subtext.remove_prefix(prefixlen);

	566 // If there is a required prefix, the anchor must be at least ANCHOR_START.

	567 if (re_anchor != ANCHOR_BOTH)

	568 re_anchor = ANCHOR_START;

	569 }

	570

	571 Prog::Anchor anchor = Prog::kUnanchored;

	572 Prog::MatchKind kind = Prog::kFirstMatch;

	573 if (options_.longest_match())

	574 kind = Prog::kLongestMatch;

	575 bool skipped_test = false;

	576

	577 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);

	578

	579 // SearchBitState allocates a bit vector of size prog_->size() * text.size().

	580 // It also allocates a stack of 3-word structures which could potentially

	581 // grow as large as prog_->size() * text.size() but in practice is much

	582 // smaller.

	583 // Conditions for using SearchBitState:

	584 const int MaxBitStateProg = 500; // prog_->size() <= Max.

	585 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)

	586 bool can_bit_state = prog_->size() <= MaxBitStateProg;

	587 int bit_state_text_max = MaxBitStateVector / prog_->size();

	588

	589 bool dfa_failed = false;

	590 switch (re_anchor) {

	591 default:

	592 case UNANCHORED: {

	593 if (!prog_->SearchDFA(subtext, text, anchor, kind,

	594 matchp, &dfa_failed, NULL)) {

	595 if (dfa_failed) {

	596 // Fall back to NFA below.

	597 skipped_test = true;

	598 if (FLAGS_trace_re2)

	599 LOG(INFO) << "Match " << trunc(pattern_)

	600 << " [" << CEscape(subtext) << "]"

	601 << " DFA failed.";

	602 break;

	603 }

	604 if (FLAGS_trace_re2)

	605 LOG(INFO) << "Match " << trunc(pattern_)

	606 << " [" << CEscape(subtext) << "]"

	607 << " used DFA - no match.";

	608 return false;

	609 }

	610 if (FLAGS_trace_re2)

	611 LOG(INFO) << "Match " << trunc(pattern_)

	612 << " [" << CEscape(subtext) << "]"

	613 << " used DFA - match";

	614 if (matchp == NULL) // Matched. Don't care where

	615 return true;

	616 // SearchDFA set match[0].end() but didn't know where the

	617 // match started. Run the regexp backward from match[0].end()

	618 // to find the longest possible match -- that's where it started.

	619 Prog* prog = ReverseProg();

	620 if (prog == NULL)

	621 return false;

	622 if (!prog->SearchDFA(match, text, Prog::kAnchored,

	623 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {

	624 if (dfa_failed) {

	625 // Fall back to NFA below.

	626 skipped_test = true;

	627 if (FLAGS_trace_re2)

	628 LOG(INFO) << "Match " << trunc(pattern_)

	629 << " [" << CEscape(subtext) << "]"

	630 << " reverse DFA failed.";

	631 break;

	632 }

	633 if (FLAGS_trace_re2)

	634 LOG(INFO) << "Match " << trunc(pattern_)

	635 << " [" << CEscape(subtext) << "]"

	636 << " DFA inconsistency.";

	637 LOG(ERROR) << "DFA inconsistency";

	638 return false;

	639 }

	640 if (FLAGS_trace_re2)

	641 LOG(INFO) << "Match " << trunc(pattern_)

	642 << " [" << CEscape(subtext) << "]"

	643 << " used reverse DFA.";

	644 break;

	645 }

	646

	647 case ANCHOR_BOTH:

	648 case ANCHOR_START:

	649 if (re_anchor == ANCHOR_BOTH)

	650 kind = Prog::kFullMatch;

	651 anchor = Prog::kAnchored;

	652

	653 // If only a small amount of text and need submatch

	654 // information anyway and we're going to use OnePass or BitState

	655 // to get it, we might as well not even bother with the DFA:

	656 // OnePass or BitState will be fast enough.

	657 // On tiny texts, OnePass outruns even the DFA, and

	658 // it doesn't have the shared state and occasional mutex that

	659 // the DFA does.

	660 if (can_one_pass && text.size() <= 4096 &&

	661 (ncap > 1 \|\| text.size() <= 8)) {

	662 if (FLAGS_trace_re2)

	663 LOG(INFO) << "Match " << trunc(pattern_)

	664 << " [" << CEscape(subtext) << "]"

	665 << " skipping DFA for OnePass.";

	666 skipped_test = true;

	667 break;

	668 }

	669 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {

	670 if (FLAGS_trace_re2)

	671 LOG(INFO) << "Match " << trunc(pattern_)

	672 << " [" << CEscape(subtext) << "]"

	673 << " skipping DFA for BitState.";

	674 skipped_test = true;

	675 break;

	676 }

	677 if (!prog_->SearchDFA(subtext, text, anchor, kind,

	678 &match, &dfa_failed, NULL)) {

	679 if (dfa_failed) {

	680 if (FLAGS_trace_re2)

	681 LOG(INFO) << "Match " << trunc(pattern_)

	682 << " [" << CEscape(subtext) << "]"

	683 << " DFA failed.";

	684 skipped_test = true;

	685 break;

	686 }

	687 if (FLAGS_trace_re2)

	688 LOG(INFO) << "Match " << trunc(pattern_)

	689 << " [" << CEscape(subtext) << "]"

	690 << " used DFA - no match.";

	691 return false;

	692 }

	693 break;

	694 }

	695

	696 if (!skipped_test && ncap <= 1) {

	697 // We know exactly where it matches. That's enough.

	698 if (ncap == 1)

	699 submatch[0] = match;

	700 } else {

	701 StringPiece subtext1;

	702 if (skipped_test) {

	703 // DFA ran out of memory or was skipped:

	704 // need to search in entire original text.

	705 subtext1 = subtext;

	706 } else {

	707 // DFA found the exact match location:

	708 // let NFA run an anchored, full match search

	709 // to find submatch locations.

	710 subtext1 = match;

	711 anchor = Prog::kAnchored;

	712 kind = Prog::kFullMatch;

	713 }

	714

	715 if (can_one_pass && anchor != Prog::kUnanchored) {

	716 if (FLAGS_trace_re2)

	717 LOG(INFO) << "Match " << trunc(pattern_)

	718 << " [" << CEscape(subtext) << "]"

	719 << " using OnePass.";

	720 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {

	721 if (!skipped_test)

	722 LOG(ERROR) << "SearchOnePass inconsistency";

	723 return false;

	724 }

	725 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {

	726 if (FLAGS_trace_re2)

	727 LOG(INFO) << "Match " << trunc(pattern_)

	728 << " [" << CEscape(subtext) << "]"

	729 << " using BitState.";

	730 if (!prog_->SearchBitState(subtext1, text, anchor,

	731 kind, submatch, ncap)) {

	732 if (!skipped_test)

	733 LOG(ERROR) << "SearchBitState inconsistency";

	734 return false;

	735 }

	736 } else {

	737 if (FLAGS_trace_re2)

	738 LOG(INFO) << "Match " << trunc(pattern_)

	739 << " [" << CEscape(subtext) << "]"

	740 << " using NFA.";

	741 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {

	742 if (!skipped_test)

	743 LOG(ERROR) << "SearchNFA inconsistency";

	744 return false;

	745 }

	746 }

	747 }

	748

	749 // Adjust overall match for required prefix that we stripped off.

	750 if (prefixlen > 0 && nsubmatch > 0)

	751 submatch[0] = StringPiece(submatch[0].begin() - prefixlen,

	752 submatch[0].size() + prefixlen);

	753

	754 // Zero submatches that don't exist in the regexp.

	755 for (int i = ncap; i < nsubmatch; i++)

	756 submatch[i] = NULL;

	757 return true;

	758 }

	759

	760 // Internal matcher - like Match() but takes Args not StringPieces.

	761 bool RE2::DoMatch(const StringPiece& text,

	762 Anchor anchor,

	763 int* consumed,

	764 const Arg* const* args,

	765 int n) const {

	766 if (!ok()) {

	767 if (options_.log_errors())

	768 LOG(ERROR) << "Invalid RE2: " << *error_;

	769 return false;

	770 }

	771

	772 // Count number of capture groups needed.

	773 int nvec;

	774 if (n == 0 && consumed == NULL)

	775 nvec = 0;

	776 else

	777 nvec = n+1;

	778

	779 StringPiece* vec;

	780 StringPiece stkvec[kVecSize];

	781 StringPiece* heapvec = NULL;

	782

	783 if (nvec <= arraysize(stkvec)) {

	784 vec = stkvec;

	785 } else {

	786 vec = new StringPiece[nvec];

	787 heapvec = vec;

	788 }

	789

	790 if (!Match(text, 0, text.size(), anchor, vec, nvec)) {

	791 delete[] heapvec;

	792 return false;

	793 }

	794

	795 if(consumed != NULL)

	796 *consumed = vec[0].end() - text.begin();

	797

	798 if (n == 0 \|\| args == NULL) {

	799 // We are not interested in results

	800 delete[] heapvec;

	801 return true;

	802 }

	803

	804 int ncap = NumberOfCapturingGroups();

	805 if (ncap < n) {

	806 // RE has fewer capturing groups than number of arg pointers passed in

	807 VLOG(1) << "Asked for " << n << " but only have " << ncap;

	808 delete[] heapvec;

	809 return false;

	810 }

	811

	812 // If we got here, we must have matched the whole pattern.

	813 for (int i = 0; i < n; i++) {

	814 const StringPiece& s = vec[i+1];

	815 if (!args[i]->Parse(s.data(), s.size())) {

	816 // TODO: Should we indicate what the error was?

	817 VLOG(1) << "Parse error on #" << i << " " << s << " "

	818 << (void*)s.data() << "/" << s.size();

	819 delete[] heapvec;

	820 return false;

	821 }

	822 }

	823

	824 delete[] heapvec;

	825 return true;

	826 }

	827

	828 // Append the "rewrite" string, with backslash subsitutions from "vec",

	829 // to string "out".

	830 bool RE2::Rewrite(string *out, const StringPiece &rewrite,

	831 const StringPiece *vec, int veclen) const {

	832 for (const char s = rewrite.data(), end = s + rewrite.size();

	833 s < end; s++) {

	834 int c = *s;

	835 if (c == '\\') {

	836 s++;

	837 c = (s < end) ? *s : -1;

	838 if (isdigit(c)) {

	839 int n = (c - '0');

	840 if (n >= veclen) {

	841 LOG(ERROR) << "requested group " << n

	842 << " in regexp " << rewrite.data();

	843 return false;

	844 }

	845 StringPiece snip = vec[n];

	846 if (snip.size() > 0)

	847 out->append(snip.data(), snip.size());

	848 } else if (c == '\\') {

	849 out->push_back('\\');

	850 } else {

	851 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();

	852 return false;

	853 }

	854 } else {

	855 out->push_back(c);

	856 }

	857 }

	858 return true;

	859 }

	860

	861 // Return the number of capturing subpatterns, or -1 if the

	862 // regexp wasn't valid on construction.

	863 int RE2::NumberOfCapturingGroups() const {

	864 if (suffix_regexp_ == NULL)

	865 return -1;

	866 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"

	867 " multiple threads end up doing the same work in parallel.");

	868 if (num_captures_ == -1)

	869 num_captures_ = suffix_regexp_->NumCaptures();

	870 return num_captures_;

	871 }

	872

	873 // Checks that the rewrite string is well-formed with respect to this

	874 // regular expression.

	875 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {

	876 int max_token = -1;

	877 for (const char s = rewrite.data(), end = s + rewrite.size();

	878 s < end; s++) {

	879 int c = *s;

	880 if (c != '\\') {

	881 continue;

	882 }

	883 if (++s == end) {

	884 *error = "Rewrite schema error: '\\' not allowed at end.";

	885 return false;

	886 }

	887 c = *s;

	888 if (c == '\\') {

	889 continue;

	890 }

	891 if (!isdigit(c)) {

	892 *error = "Rewrite schema error: "

	893 "'\\' must be followed by a digit or '\\'.";

	894 return false;

	895 }

	896 int n = (c - '0');

	897 if (max_token < n) {

	898 max_token = n;

	899 }

	900 }

	901

	902 if (max_token > NumberOfCapturingGroups()) {

	903 SStringPrintf(error, "Rewrite schema requests %d matches, "

	904 "but the regexp only has %d parenthesized subexpressions.",

	905 max_token, NumberOfCapturingGroups());

	906 return false;

	907 }

	908 return true;

	909 }

	910

	911 /*** Parsers for various types ***/

	912

	913 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {

	914 // We fail if somebody asked us to store into a non-NULL void* pointer

	915 return (dest == NULL);

	916 }

	917

	918 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {

	919 if (dest == NULL) return true;

	920 reinterpret_cast<string*>(dest)->assign(str, n);

	921 return true;

	922 }

	923

	924 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {

	925 if (dest == NULL) return true;

	926 reinterpret_cast<StringPiece*>(dest)->set(str, n);

	927 return true;

	928 }

	929

	930 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {

	931 if (n != 1) return false;

	932 if (dest == NULL) return true;

	933 (reinterpret_cast<char>(dest)) = str[0];

	934 return true;

	935 }

	936

	937 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {

	938 if (n != 1) return false;

	939 if (dest == NULL) return true;

	940 (reinterpret_cast<unsigned char>(dest)) = str[0];

	941 return true;

	942 }

	943

	944 // Largest number spec that we are willing to parse

	945 static const int kMaxNumberLength = 32;

	946

	947 // REQUIRES "buf" must have length at least kMaxNumberLength+1

	948 // Copies "str" into "buf" and null-terminates.

	949 // Overwrites *np with the new length.

	950 static const char* TerminateNumber(char* buf, const char* str, int* np) {

	951 int n = *np;

	952 if (n <= 0) return "";

	953 if (n > 0 && isspace(*str)) {

	954 // We are less forgiving than the strtoxxx() routines and do not

	955 // allow leading spaces.

	956 return "";

	957 }

	958

	959 // Although buf has a fixed maximum size, we can still handle

	960 // arbitrarily large integers correctly by omitting leading zeros.

	961 // (Numbers that are still too long will be out of range.)

	962 // Before deciding whether str is too long,

	963 // remove leading zeros with s/000+/00/.

	964 // Leaving the leading two zeros in place means that

	965 // we don't change 0000x123 (invalid) into 0x123 (valid).

	966 // Skip over leading - before replacing.

	967 bool neg = false;

	968 if (n >= 1 && str[0] == '-') {

	969 neg = true;

	970 n--;

	971 str++;

	972 }

	973

	974 if (n >= 3 && str[0] == '0' && str[1] == '0') {

	975 while (n >= 3 && str[2] == '0') {

	976 n--;

	977 str++;

	978 }

	979 }

	980

	981 if (neg) { // make room in buf for -

	982 n++;

	983 str--;

	984 }

	985

	986 if (n > kMaxNumberLength) return "";

	987

	988 memmove(buf, str, n);

	989 if (neg) {

	990 buf[0] = '-';

	991 }

	992 buf[n] = '\0';

	993 *np = n;

	994 return buf;

	995 }

	996

	997 bool RE2::Arg::parse_long_radix(const char* str,

	998 int n,

	999 void* dest,

	1000 int radix) {

	1001 if (n == 0) return false;

	1002 char buf[kMaxNumberLength+1];

	1003 str = TerminateNumber(buf, str, &n);

	1004 char* end;

	1005 errno = 0;

	1006 long r = strtol(str, &end, radix);

	1007 if (end != str + n) return false; // Leftover junk

	1008 if (errno) return false;

	1009 if (dest == NULL) return true;

	1010 (reinterpret_cast<long>(dest)) = r;

	1011 return true;

	1012 }

	1013

	1014 bool RE2::Arg::parse_ulong_radix(const char* str,

	1015 int n,

	1016 void* dest,

	1017 int radix) {

	1018 if (n == 0) return false;

	1019 char buf[kMaxNumberLength+1];

	1020 str = TerminateNumber(buf, str, &n);

	1021 if (str[0] == '-') {

	1022 // strtoul() will silently accept negative numbers and parse

	1023 // them. This module is more strict and treats them as errors.

	1024 return false;

	1025 }

	1026

	1027 char* end;

	1028 errno = 0;

	1029 unsigned long r = strtoul(str, &end, radix);

	1030 if (end != str + n) return false; // Leftover junk

	1031 if (errno) return false;

	1032 if (dest == NULL) return true;

	1033 (reinterpret_cast<unsigned long>(dest)) = r;

	1034 return true;

	1035 }

	1036

	1037 bool RE2::Arg::parse_short_radix(const char* str,

	1038 int n,

	1039 void* dest,

	1040 int radix) {

	1041 long r;

	1042 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

	1043 if ((short)r != r) return false; // Out of range

	1044 if (dest == NULL) return true;

	1045 (reinterpret_cast<short>(dest)) = r;

	1046 return true;

	1047 }

	1048

	1049 bool RE2::Arg::parse_ushort_radix(const char* str,

	1050 int n,

	1051 void* dest,

	1052 int radix) {

	1053 unsigned long r;

	1054 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

	1055 if ((ushort)r != r) return false; // Out of range

	1056 if (dest == NULL) return true;

	1057 (reinterpret_cast<unsigned short>(dest)) = r;

	1058 return true;

	1059 }

	1060

	1061 bool RE2::Arg::parse_int_radix(const char* str,

	1062 int n,

	1063 void* dest,

	1064 int radix) {

	1065 long r;

	1066 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

	1067 if ((int)r != r) return false; // Out of range

	1068 if (dest == NULL) return true;

	1069 (reinterpret_cast<int>(dest)) = r;

	1070 return true;

	1071 }

	1072

	1073 bool RE2::Arg::parse_uint_radix(const char* str,

	1074 int n,

	1075 void* dest,

	1076 int radix) {

	1077 unsigned long r;

	1078 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

	1079 if ((uint)r != r) return false; // Out of range

	1080 if (dest == NULL) return true;

	1081 (reinterpret_cast<unsigned int>(dest)) = r;

	1082 return true;

	1083 }

	1084

	1085 bool RE2::Arg::parse_longlong_radix(const char* str,

	1086 int n,

	1087 void* dest,

	1088 int radix) {

	1089 if (n == 0) return false;

	1090 char buf[kMaxNumberLength+1];

	1091 str = TerminateNumber(buf, str, &n);

	1092 char* end;

	1093 errno = 0;

	1094 int64 r = strtoll(str, &end, radix);

	1095 if (end != str + n) return false; // Leftover junk

	1096 if (errno) return false;

	1097 if (dest == NULL) return true;

	1098 (reinterpret_cast<int64>(dest)) = r;

	1099 return true;

	1100 }

	1101

	1102 bool RE2::Arg::parse_ulonglong_radix(const char* str,

	1103 int n,

	1104 void* dest,

	1105 int radix) {

	1106 if (n == 0) return false;

	1107 char buf[kMaxNumberLength+1];

	1108 str = TerminateNumber(buf, str, &n);

	1109 if (str[0] == '-') {

	1110 // strtoull() will silently accept negative numbers and parse

	1111 // them. This module is more strict and treats them as errors.

	1112 return false;

	1113 }

	1114 char* end;

	1115 errno = 0;

	1116 uint64 r = strtoull(str, &end, radix);

	1117 if (end != str + n) return false; // Leftover junk

	1118 if (errno) return false;

	1119 if (dest == NULL) return true;

	1120 (reinterpret_cast<uint64>(dest)) = r;

	1121 return true;

	1122 }

	1123

	1124 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {

	1125 if (n == 0) return false;

	1126 static const int kMaxLength = 200;

	1127 char buf[kMaxLength];

	1128 if (n >= kMaxLength) return false;

	1129 memcpy(buf, str, n);

	1130 buf[n] = '\0';

	1131 errno = 0;

	1132 char* end;

	1133 double r;

	1134 if (isfloat) {

	1135 r = strtof(buf, &end);

	1136 } else {

	1137 r = strtod(buf, &end);

	1138 }

	1139 if (end != buf + n) return false; // Leftover junk

	1140 if (errno) return false;

	1141 if (dest == NULL) return true;

	1142 if (isfloat) {

	1143 (reinterpret_cast<float>(dest)) = r;

	1144 } else {

	1145 (reinterpret_cast<double>(dest)) = r;

	1146 }

	1147 return true;

	1148 }

	1149

	1150 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {

	1151 return parse_double_float(str, n, false, dest);

	1152 }

	1153

	1154 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {

	1155 return parse_double_float(str, n, true, dest);

	1156 }

	1157

	1158

	1159 #define DEFINE_INTEGER_PARSERS(name) \

	1160 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \

	1161 return parse_##name##_radix(str, n, dest, 10); \

	1162 } \

	1163 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \

	1164 return parse_##name##_radix(str, n, dest, 16); \

	1165 } \

	1166 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \

	1167 return parse_##name##_radix(str, n, dest, 8); \

	1168 } \

	1169 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \

	1170 return parse_##name##_radix(str, n, dest, 0); \

	1171 }

	1172

	1173 DEFINE_INTEGER_PARSERS(short);

	1174 DEFINE_INTEGER_PARSERS(ushort);

	1175 DEFINE_INTEGER_PARSERS(int);

	1176 DEFINE_INTEGER_PARSERS(uint);

	1177 DEFINE_INTEGER_PARSERS(long);

	1178 DEFINE_INTEGER_PARSERS(ulong);

	1179 DEFINE_INTEGER_PARSERS(longlong);

	1180 DEFINE_INTEGER_PARSERS(ulonglong);

	1181

	1182 #undef DEFINE_INTEGER_PARSERS

	1183

	1184 } // namespace re2

OLD	NEW

« third_party/re2/README.chromium ('K') | « third_party/re2/re2/re2.h ('k') | third_party/re2/re2/regexp.h » ('j') | no next file with comments »