third_party/re2/re2/re2.cc - Issue 1544433002: Replace RE2 import with a dependency

Unified Diff: third_party/re2/re2/re2.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/re2/re2/re2.cc

diff --git a/third_party/re2/re2/re2.cc b/third_party/re2/re2/re2.cc

deleted file mode 100644

index b3e582f0580e97a80a69287d12a9e58e8183fd04..0000000000000000000000000000000000000000

--- a/third_party/re2/re2/re2.cc

+++ /dev/null

@@ -1,1239 +0,0 @@

-// Use of this source code is governed by a BSD-style

-// license that can be found in the LICENSE file.

-// Regular expression interface RE2.

-//

-// Originally the PCRE C++ wrapper, but adapted to use

-// the new automata-based regular expression engines.

-#include "re2/re2.h"

-#include <stdio.h>

-#include <string>

-#include <errno.h>

-#include "util/util.h"

-#include "util/flags.h"

-#include "util/sparse_array.h"

-#include "re2/prog.h"

-#include "re2/regexp.h"

-DEFINE_bool(trace_re2, false, "trace RE2 execution");

-namespace re2 {

-// Maximum number of args we can set

-static const int kMaxArgs = 16;

-static const int kVecSize = 1+kMaxArgs;

-const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch = {};

-const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch = {};

-const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {};

-const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume = {};

-// This will trigger LNK2005 error in MSVC.

-#ifndef _MSC_VER

-const int RE2::Options::kDefaultMaxMem; // initialized in re2.h

-#endif

-RE2::Options::Options(RE2::CannedOptions opt)

- : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),

- posix_syntax_(opt == RE2::POSIX),

- longest_match_(opt == RE2::POSIX),

- log_errors_(opt != RE2::Quiet),

- max_mem_(kDefaultMaxMem),

- literal_(false),

- never_nl_(false),

- dot_nl_(false),

- never_capture_(false),

- case_sensitive_(true),

- perl_classes_(false),

- word_boundary_(false),

- one_line_(false) {

-// static empty things for use as const references.

-// To avoid global constructors, initialized on demand.

-GLOBAL_MUTEX(empty_mutex);

-static const string *empty_string;

-static const map<string, int> *empty_named_groups;

-static const map<int, string> *empty_group_names;

-static void InitEmpty() {

- GLOBAL_MUTEX_LOCK(empty_mutex);

- if (empty_string == NULL) {

- empty_string = new string;

- empty_named_groups = new map<string, int>;

- empty_group_names = new map<int, string>;

- }

- GLOBAL_MUTEX_UNLOCK(empty_mutex);

-// Converts from Regexp error code to RE2 error code.

-// Maybe some day they will diverge. In any event, this

-// hides the existence of Regexp from RE2 users.

-static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {

- switch (code) {

- case re2::kRegexpSuccess:

- return RE2::NoError;

- case re2::kRegexpInternalError:

- return RE2::ErrorInternal;

- case re2::kRegexpBadEscape:

- return RE2::ErrorBadEscape;

- case re2::kRegexpBadCharClass:

- return RE2::ErrorBadCharClass;

- case re2::kRegexpBadCharRange:

- return RE2::ErrorBadCharRange;

- case re2::kRegexpMissingBracket:

- return RE2::ErrorMissingBracket;

- case re2::kRegexpMissingParen:

- return RE2::ErrorMissingParen;

- case re2::kRegexpTrailingBackslash:

- return RE2::ErrorTrailingBackslash;

- case re2::kRegexpRepeatArgument:

- return RE2::ErrorRepeatArgument;

- case re2::kRegexpRepeatSize:

- return RE2::ErrorRepeatSize;

- case re2::kRegexpRepeatOp:

- return RE2::ErrorRepeatOp;

- case re2::kRegexpBadPerlOp:

- return RE2::ErrorBadPerlOp;

- case re2::kRegexpBadUTF8:

- return RE2::ErrorBadUTF8;

- case re2::kRegexpBadNamedCapture:

- return RE2::ErrorBadNamedCapture;

- }

- return RE2::ErrorInternal;

-static string trunc(const StringPiece& pattern) {

- if (pattern.size() < 100)

- return pattern.as_string();

- return pattern.substr(0, 100).as_string() + "...";

-RE2::RE2(const char* pattern) {

- Init(pattern, DefaultOptions);

-RE2::RE2(const string& pattern) {

- Init(pattern, DefaultOptions);

-RE2::RE2(const StringPiece& pattern) {

- Init(pattern, DefaultOptions);

-RE2::RE2(const StringPiece& pattern, const Options& options) {

- Init(pattern, options);

-int RE2::Options::ParseFlags() const {

- int flags = Regexp::ClassNL;

- switch (encoding()) {

- default:

- if (log_errors())

- LOG(ERROR) << "Unknown encoding " << encoding();

- break;

- case RE2::Options::EncodingUTF8:

- break;

- case RE2::Options::EncodingLatin1:

- flags |= Regexp::Latin1;

- break;

- }

- if (!posix_syntax())

- flags |= Regexp::LikePerl;

- if (literal())

- flags |= Regexp::Literal;

- if (never_nl())

- flags |= Regexp::NeverNL;

- if (dot_nl())

- flags |= Regexp::DotNL;

- if (never_capture())

- flags |= Regexp::NeverCapture;

- if (!case_sensitive())

- flags |= Regexp::FoldCase;

- if (perl_classes())

- flags |= Regexp::PerlClasses;

- if (word_boundary())

- flags |= Regexp::PerlB;

- if (one_line())

- flags |= Regexp::OneLine;

- return flags;

-void RE2::Init(const StringPiece& pattern, const Options& options) {

- mutex_ = new Mutex;

- pattern_ = pattern.as_string();

- options_.Copy(options);

- InitEmpty();

- error_ = empty_string;

- error_code_ = NoError;

- suffix_regexp_ = NULL;

- entire_regexp_ = NULL;

- prog_ = NULL;

- rprog_ = NULL;

- named_groups_ = NULL;

- group_names_ = NULL;

- num_captures_ = -1;

- RegexpStatus status;

- entire_regexp_ = Regexp::Parse(

- pattern_,

- static_cast<Regexp::ParseFlags>(options_.ParseFlags()),

- &status);

- if (entire_regexp_ == NULL) {

- if (error_ == empty_string)

- error_ = new string(status.Text());

- if (options_.log_errors()) {

- LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "

- << status.Text();

- }

- error_arg_ = status.error_arg().as_string();

- error_code_ = RegexpErrorToRE2(status.code());

- return;

- }

- prefix_.clear();

- prefix_foldcase_ = false;

- re2::Regexp* suffix;

- if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))

- suffix_regexp_ = suffix;

- else

- suffix_regexp_ = entire_regexp_->Incref();

- // Two thirds of the memory goes to the forward Prog,

- // one third to the reverse prog, because the forward

- // Prog has two DFAs but the reverse prog has one.

- prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);

- if (prog_ == NULL) {

- if (options_.log_errors())

- LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";

- error_ = new string("pattern too large - compile failed");

- error_code_ = RE2::ErrorPatternTooLarge;

- return;

- }

- // Could delay this until the first match call that

- // cares about submatch information, but the one-pass

- // machine's memory gets cut from the DFA memory budget,

- // and that is harder to do if the DFA has already

- // been built.

- is_one_pass_ = prog_->IsOnePass();

-// Returns rprog_, computing it if needed.

-re2::Prog* RE2::ReverseProg() const {

- MutexLock l(mutex_);

- if (rprog_ == NULL && error_ == empty_string) {

- rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);

- if (rprog_ == NULL) {

- if (options_.log_errors())

- LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";

- error_ = new string("pattern too large - reverse compile failed");

- error_code_ = RE2::ErrorPatternTooLarge;

- return NULL;

- }

- return rprog_;

-RE2::~RE2() {

- if (suffix_regexp_)

- suffix_regexp_->Decref();

- if (entire_regexp_)

- entire_regexp_->Decref();

- delete mutex_;

- delete prog_;

- delete rprog_;

- if (error_ != empty_string)

- delete error_;

- if (named_groups_ != NULL && named_groups_ != empty_named_groups)

- delete named_groups_;

- if (group_names_ != NULL && group_names_ != empty_group_names)

- delete group_names_;

-int RE2::ProgramSize() const {

- if (prog_ == NULL)

- return -1;

- return prog_->size();

-int RE2::ProgramFanout(map<int, int>* histogram) const {

- if (prog_ == NULL)

- return -1;

- SparseArray<int> fanout(prog_->size());

- prog_->Fanout(&fanout);

- histogram->clear();

- for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {

- // TODO(junyer): Optimise this?

- int bucket = 0;

- while (1 << bucket < i->second) {

- bucket++;

- }

- (*histogram)[bucket]++;

- }

- return histogram->rbegin()->first;

-// Returns num_captures_, computing it if needed, or -1 if the

-// regexp wasn't valid on construction.

-int RE2::NumberOfCapturingGroups() const {

- MutexLock l(mutex_);

- if (suffix_regexp_ == NULL)

- return -1;

- if (num_captures_ == -1)

- num_captures_ = suffix_regexp_->NumCaptures();

- return num_captures_;

-// Returns named_groups_, computing it if needed.

-const map<string, int>& RE2::NamedCapturingGroups() const {

- MutexLock l(mutex_);

- if (!ok())

- return *empty_named_groups;

- if (named_groups_ == NULL) {

- named_groups_ = suffix_regexp_->NamedCaptures();

- if (named_groups_ == NULL)

- named_groups_ = empty_named_groups;

- }

- return *named_groups_;

-// Returns group_names_, computing it if needed.

-const map<int, string>& RE2::CapturingGroupNames() const {

- MutexLock l(mutex_);

- if (!ok())

- return *empty_group_names;

- if (group_names_ == NULL) {

- group_names_ = suffix_regexp_->CaptureNames();

- if (group_names_ == NULL)

- group_names_ = empty_group_names;

- }

- return *group_names_;

-/***** Convenience interfaces *****/

-bool RE2::FullMatchN(const StringPiece& text, const RE2& re,

- const Arg* const args[], int n) {

- return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);

-bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,

- const Arg* const args[], int n) {

- return re.DoMatch(text, UNANCHORED, NULL, args, n);

-bool RE2::ConsumeN(StringPiece* input, const RE2& re,

- const Arg* const args[], int n) {

- int consumed;

- if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {

- input->remove_prefix(consumed);

- return true;

- } else {

- return false;

- }

-bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,

- const Arg* const args[], int n) {

- int consumed;

- if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {

- input->remove_prefix(consumed);

- return true;

- } else {

- return false;

- }

-// Returns the maximum submatch needed for the rewrite to be done by Replace().

-// E.g. if rewrite == "foo \\2,\\1", returns 2.

-int RE2::MaxSubmatch(const StringPiece& rewrite) {

- int max = 0;

- for (const char *s = rewrite.data(), *end = s + rewrite.size();

- s < end; s++) {

- if (*s == '\\') {

- s++;

- int c = (s < end) ? *s : -1;

- if (isdigit(c)) {

- int n = (c - '0');

- if (n > max)

- max = n;

- }

- return max;

-bool RE2::Replace(string *str,

- const RE2& re,

- const StringPiece& rewrite) {

- StringPiece vec[kVecSize];

- int nvec = 1 + MaxSubmatch(rewrite);

- if (nvec > arraysize(vec))

- return false;

- if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec))

- return false;

- string s;

- if (!re.Rewrite(&s, rewrite, vec, nvec))

- return false;

- assert(vec[0].begin() >= str->data());

- assert(vec[0].end() <= str->data()+str->size());

- str->replace(vec[0].data() - str->data(), vec[0].size(), s);

- return true;

-int RE2::GlobalReplace(string *str,

- const RE2& re,

- const StringPiece& rewrite) {

- StringPiece vec[kVecSize];

- int nvec = 1 + MaxSubmatch(rewrite);

- if (nvec > arraysize(vec))

- return false;

- const char* p = str->data();

- const char* ep = p + str->size();

- const char* lastend = NULL;

- string out;

- int count = 0;

- while (p <= ep) {

- if (!re.Match(*str, static_cast<int>(p - str->data()),

- static_cast<int>(str->size()), UNANCHORED, vec, nvec))

- break;

- if (p < vec[0].begin())

- out.append(p, vec[0].begin() - p);

- if (vec[0].begin() == lastend && vec[0].size() == 0) {

- // Disallow empty match at end of last match: skip ahead.

- if (p < ep)

- out.append(p, 1);

- p++;

- continue;

- }

- re.Rewrite(&out, rewrite, vec, nvec);

- p = vec[0].end();

- lastend = p;

- count++;

- }

- if (count == 0)

- return 0;

- if (p < ep)

- out.append(p, ep - p);

- swap(out, *str);

- return count;

-bool RE2::Extract(const StringPiece &text,

- const RE2& re,

- const StringPiece &rewrite,

- string *out) {

- StringPiece vec[kVecSize];

- int nvec = 1 + MaxSubmatch(rewrite);

- if (nvec > arraysize(vec))

- return false;

- if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))

- return false;

- out->clear();

- return re.Rewrite(out, rewrite, vec, nvec);

-string RE2::QuoteMeta(const StringPiece& unquoted) {

- string result;

- result.reserve(unquoted.size() << 1);

- // Escape any ascii character not in [A-Za-z_0-9].

- //

- // Note that it's legal to escape a character even if it has no

- // special meaning in a regular expression -- so this function does

- // that. (This also makes it identical to the perl function of the

- // same name except for the null-character special case;

- // see `perldoc -f quotemeta`.)

- for (int ii = 0; ii < unquoted.length(); ++ii) {

- // Note that using 'isalnum' here raises the benchmark time from

- // 32ns to 58ns:

- if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&

- (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&

- (unquoted[ii] < '0' || unquoted[ii] > '9') &&

- unquoted[ii] != '_' &&

- // If this is the part of a UTF8 or Latin1 character, we need

- // to copy this byte without escaping. Experimentally this is

- // what works correctly with the regexp library.

- !(unquoted[ii] & 128)) {

- if (unquoted[ii] == '\0') { // Special handling for null chars.

- // Note that this special handling is not strictly required for RE2,

- // but this quoting is required for other regexp libraries such as

- // PCRE.

- // Can't use "\\0" since the next character might be a digit.

- result += "\\x00";

- continue;

- }

- result += '\\';

- }

- result += unquoted[ii];

- }

- return result;

-bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {

- if (prog_ == NULL)

- return false;

- int n = static_cast<int>(prefix_.size());

- if (n > maxlen)

- n = maxlen;

- // Determine initial min max from prefix_ literal.

- string pmin, pmax;

- pmin = prefix_.substr(0, n);

- pmax = prefix_.substr(0, n);

- if (prefix_foldcase_) {

- // prefix is ASCII lowercase; change pmin to uppercase.

- for (int i = 0; i < n; i++) {

- if ('a' <= pmin[i] && pmin[i] <= 'z')

- pmin[i] += 'A' - 'a';

- }

- // Add to prefix min max using PossibleMatchRange on regexp.

- string dmin, dmax;

- maxlen -= n;

- if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {

- pmin += dmin;

- pmax += dmax;

- } else if (pmax.size() > 0) {

- // prog_->PossibleMatchRange has failed us,

- // but we still have useful information from prefix_.

- // Round up pmax to allow any possible suffix.

- pmax = PrefixSuccessor(pmax);

- } else {

- // Nothing useful.

- *min = "";

- *max = "";

- return false;

- }

- *min = pmin;

- *max = pmax;

- return true;

-// Avoid possible locale nonsense in standard strcasecmp.

-// The string a is known to be all lowercase.

-static int ascii_strcasecmp(const char* a, const char* b, int len) {

- const char *ae = a + len;

- for (; a < ae; a++, b++) {

- uint8 x = *a;

- uint8 y = *b;

- if ('A' <= y && y <= 'Z')

- y += 'a' - 'A';

- if (x != y)

- return x - y;

- }

- return 0;

-/***** Actual matching and rewriting code *****/

-bool RE2::Match(const StringPiece& text,

- int startpos,

- int endpos,

- Anchor re_anchor,

- StringPiece* submatch,

- int nsubmatch) const {

- if (!ok() || suffix_regexp_ == NULL) {

- if (options_.log_errors())

- LOG(ERROR) << "Invalid RE2: " << *error_;

- return false;

- }

- if (startpos < 0 || startpos > endpos || endpos > text.size()) {

- if (options_.log_errors())

- LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["

- << "startpos: " << startpos << ", "

- << "endpos: " << endpos << ", "

- << "text size: " << text.size() << "]";

- return false;

- }

- StringPiece subtext = text;

- subtext.remove_prefix(startpos);

- subtext.remove_suffix(text.size() - endpos);

- // Use DFAs to find exact location of match, filter out non-matches.

- // Don't ask for the location if we won't use it.

- // SearchDFA can do extra optimizations in that case.

- StringPiece match;

- StringPiece* matchp = &match;

- if (nsubmatch == 0)

- matchp = NULL;

- int ncap = 1 + NumberOfCapturingGroups();

- if (ncap > nsubmatch)

- ncap = nsubmatch;

- // If the regexp is anchored explicitly, must not be in middle of text.

- if (prog_->anchor_start() && startpos != 0)

- return false;

- // If the regexp is anchored explicitly, update re_anchor

- // so that we can potentially fall into a faster case below.

- if (prog_->anchor_start() && prog_->anchor_end())

- re_anchor = ANCHOR_BOTH;

- else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)

- re_anchor = ANCHOR_START;

- // Check for the required prefix, if any.

- int prefixlen = 0;

- if (!prefix_.empty()) {

- if (startpos != 0)

- return false;

- prefixlen = static_cast<int>(prefix_.size());

- if (prefixlen > subtext.size())

- return false;

- if (prefix_foldcase_) {

- if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)

- return false;

- } else {

- if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)

- return false;

- }

- subtext.remove_prefix(prefixlen);

- // If there is a required prefix, the anchor must be at least ANCHOR_START.

- if (re_anchor != ANCHOR_BOTH)

- re_anchor = ANCHOR_START;

- }

- Prog::Anchor anchor = Prog::kUnanchored;

- Prog::MatchKind kind = Prog::kFirstMatch;

- if (options_.longest_match())

- kind = Prog::kLongestMatch;

- bool skipped_test = false;

- bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);

- // SearchBitState allocates a bit vector of size prog_->size() * text.size().

- // It also allocates a stack of 3-word structures which could potentially

- // grow as large as prog_->size() * text.size() but in practice is much

- // smaller.

- // Conditions for using SearchBitState:

- const int MaxBitStateProg = 500; // prog_->size() <= Max.

- const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)

- bool can_bit_state = prog_->size() <= MaxBitStateProg;

- int bit_state_text_max = MaxBitStateVector / prog_->size();

- bool dfa_failed = false;

- switch (re_anchor) {

- default:

- case UNANCHORED: {

- if (!prog_->SearchDFA(subtext, text, anchor, kind,

- matchp, &dfa_failed, NULL)) {

- if (dfa_failed) {

- // Fall back to NFA below.

- skipped_test = true;

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " DFA failed.";

- break;

- }

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " used DFA - no match.";

- return false;

- }

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " used DFA - match";

- if (matchp == NULL) // Matched. Don't care where

- return true;

- // SearchDFA set match[0].end() but didn't know where the

- // match started. Run the regexp backward from match[0].end()

- // to find the longest possible match -- that's where it started.

- Prog* prog = ReverseProg();

- if (prog == NULL)

- return false;

- if (!prog->SearchDFA(match, text, Prog::kAnchored,

- Prog::kLongestMatch, &match, &dfa_failed, NULL)) {

- if (dfa_failed) {

- // Fall back to NFA below.

- skipped_test = true;

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " reverse DFA failed.";

- break;

- }

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " DFA inconsistency.";

- if (options_.log_errors())

- LOG(ERROR) << "DFA inconsistency";

- return false;

- }

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " used reverse DFA.";

- break;

- }

- case ANCHOR_BOTH:

- case ANCHOR_START:

- if (re_anchor == ANCHOR_BOTH)

- kind = Prog::kFullMatch;

- anchor = Prog::kAnchored;

- // If only a small amount of text and need submatch

- // information anyway and we're going to use OnePass or BitState

- // to get it, we might as well not even bother with the DFA:

- // OnePass or BitState will be fast enough.

- // On tiny texts, OnePass outruns even the DFA, and

- // it doesn't have the shared state and occasional mutex that

- // the DFA does.

- if (can_one_pass && text.size() <= 4096 &&

- (ncap > 1 || text.size() <= 8)) {

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " skipping DFA for OnePass.";

- skipped_test = true;

- break;

- }

- if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " skipping DFA for BitState.";

- skipped_test = true;

- break;

- }

- if (!prog_->SearchDFA(subtext, text, anchor, kind,

- &match, &dfa_failed, NULL)) {

- if (dfa_failed) {

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " DFA failed.";

- skipped_test = true;

- break;

- }

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " used DFA - no match.";

- return false;

- }

- break;

- }

- if (!skipped_test && ncap <= 1) {

- // We know exactly where it matches. That's enough.

- if (ncap == 1)

- submatch[0] = match;

- } else {

- StringPiece subtext1;

- if (skipped_test) {

- // DFA ran out of memory or was skipped:

- // need to search in entire original text.

- subtext1 = subtext;

- } else {

- // DFA found the exact match location:

- // let NFA run an anchored, full match search

- // to find submatch locations.

- subtext1 = match;

- anchor = Prog::kAnchored;

- kind = Prog::kFullMatch;

- }

- if (can_one_pass && anchor != Prog::kUnanchored) {

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " using OnePass.";

- if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {

- if (!skipped_test && options_.log_errors())

- LOG(ERROR) << "SearchOnePass inconsistency";

- return false;

- }

- } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " using BitState.";

- if (!prog_->SearchBitState(subtext1, text, anchor,

- kind, submatch, ncap)) {

- if (!skipped_test && options_.log_errors())

- LOG(ERROR) << "SearchBitState inconsistency";

- return false;

- }

- } else {

- if (FLAGS_trace_re2)

- LOG(INFO) << "Match " << trunc(pattern_)

- << " [" << CEscape(subtext) << "]"

- << " using NFA.";

- if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {

- if (!skipped_test && options_.log_errors())

- LOG(ERROR) << "SearchNFA inconsistency";

- return false;

- }

- // Adjust overall match for required prefix that we stripped off.

- if (prefixlen > 0 && nsubmatch > 0)

- submatch[0] = StringPiece(submatch[0].begin() - prefixlen,

- submatch[0].size() + prefixlen);

- // Zero submatches that don't exist in the regexp.

- for (int i = ncap; i < nsubmatch; i++)

- submatch[i] = NULL;

- return true;

-// Internal matcher - like Match() but takes Args not StringPieces.

-bool RE2::DoMatch(const StringPiece& text,

- Anchor anchor,

- int* consumed,

- const Arg* const* args,

- int n) const {

- if (!ok()) {

- if (options_.log_errors())

- LOG(ERROR) << "Invalid RE2: " << *error_;

- return false;

- }

- // Count number of capture groups needed.

- int nvec;

- if (n == 0 && consumed == NULL)

- nvec = 0;

- else

- nvec = n+1;

- StringPiece* vec;

- StringPiece stkvec[kVecSize];

- StringPiece* heapvec = NULL;

- if (nvec <= arraysize(stkvec)) {

- vec = stkvec;

- } else {

- vec = new StringPiece[nvec];

- heapvec = vec;

- }

- if (!Match(text, 0, text.size(), anchor, vec, nvec)) {

- delete[] heapvec;

- return false;

- }

- if (consumed != NULL)

- *consumed = static_cast<int>(vec[0].end() - text.begin());

- if (n == 0 || args == NULL) {

- // We are not interested in results

- delete[] heapvec;

- return true;

- }

- int ncap = NumberOfCapturingGroups();

- if (ncap < n) {

- // RE has fewer capturing groups than number of arg pointers passed in

- VLOG(1) << "Asked for " << n << " but only have " << ncap;

- delete[] heapvec;

- return false;

- }

- // If we got here, we must have matched the whole pattern.

- for (int i = 0; i < n; i++) {

- const StringPiece& s = vec[i+1];

- if (!args[i]->Parse(s.data(), s.size())) {

- // TODO: Should we indicate what the error was?

- VLOG(1) << "Parse error on #" << i << " " << s << " "

- << (void*)s.data() << "/" << s.size();

- delete[] heapvec;

- return false;

- }

- delete[] heapvec;

- return true;

-// Append the "rewrite" string, with backslash subsitutions from "vec",

-// to string "out".

-bool RE2::Rewrite(string *out, const StringPiece &rewrite,

- const StringPiece *vec, int veclen) const {

- for (const char *s = rewrite.data(), *end = s + rewrite.size();

- s < end; s++) {

- if (*s != '\\') {

- out->push_back(*s);

- continue;

- }

- s++;

- int c = (s < end) ? *s : -1;

- if (isdigit(c)) {

- int n = (c - '0');

- if (n >= veclen) {

- if (options_.log_errors()) {

- LOG(ERROR) << "requested group " << n

- << " in regexp " << rewrite.data();

- }

- return false;

- }

- StringPiece snip = vec[n];

- if (snip.size() > 0)

- out->append(snip.data(), snip.size());

- } else if (c == '\\') {

- out->push_back('\\');

- } else {

- if (options_.log_errors())

- LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();

- return false;

- }

- return true;

-// Checks that the rewrite string is well-formed with respect to this

-// regular expression.

-bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {

- int max_token = -1;

- for (const char *s = rewrite.data(), *end = s + rewrite.size();

- s < end; s++) {

- int c = *s;

- if (c != '\\') {

- continue;

- }

- if (++s == end) {

- *error = "Rewrite schema error: '\\' not allowed at end.";

- return false;

- }

- c = *s;

- if (c == '\\') {

- continue;

- }

- if (!isdigit(c)) {

- *error = "Rewrite schema error: "

- "'\\' must be followed by a digit or '\\'.";

- return false;

- }

- int n = (c - '0');

- if (max_token < n) {

- max_token = n;

- }

- if (max_token > NumberOfCapturingGroups()) {

- SStringPrintf(error, "Rewrite schema requests %d matches, "

- "but the regexp only has %d parenthesized subexpressions.",

- max_token, NumberOfCapturingGroups());

- return false;

- }

- return true;

-/***** Parsers for various types *****/

-bool RE2::Arg::parse_null(const char* str, int n, void* dest) {

- // We fail if somebody asked us to store into a non-NULL void* pointer

- return (dest == NULL);

-bool RE2::Arg::parse_string(const char* str, int n, void* dest) {

- if (dest == NULL) return true;

- reinterpret_cast<string*>(dest)->assign(str, n);

- return true;

-bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {

- if (dest == NULL) return true;

- reinterpret_cast<StringPiece*>(dest)->set(str, n);

- return true;

-bool RE2::Arg::parse_char(const char* str, int n, void* dest) {

- if (n != 1) return false;

- if (dest == NULL) return true;

- *(reinterpret_cast<char*>(dest)) = str[0];

- return true;

-bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {

- if (n != 1) return false;

- if (dest == NULL) return true;

- *(reinterpret_cast<unsigned char*>(dest)) = str[0];

- return true;

-// Largest number spec that we are willing to parse

-static const int kMaxNumberLength = 32;

-// REQUIRES "buf" must have length at least nbuf.

-// Copies "str" into "buf" and null-terminates.

-// Overwrites *np with the new length.

-static const char* TerminateNumber(char* buf, int nbuf, const char* str, int* np,

- bool accept_spaces) {

- int n = *np;

- if (n <= 0) return "";

- if (n > 0 && isspace(*str)) {

- // We are less forgiving than the strtoxxx() routines and do not

- // allow leading spaces. We do allow leading spaces for floats.

- if (!accept_spaces) {

- return "";

- }

- while (n > 0 && isspace(*str)) {

- n--;

- str++;

- }

- // Although buf has a fixed maximum size, we can still handle

- // arbitrarily large integers correctly by omitting leading zeros.

- // (Numbers that are still too long will be out of range.)

- // Before deciding whether str is too long,

- // remove leading zeros with s/000+/00/.

- // Leaving the leading two zeros in place means that

- // we don't change 0000x123 (invalid) into 0x123 (valid).

- // Skip over leading - before replacing.

- bool neg = false;

- if (n >= 1 && str[0] == '-') {

- neg = true;

- n--;

- str++;

- }

- if (n >= 3 && str[0] == '0' && str[1] == '0') {

- while (n >= 3 && str[2] == '0') {

- n--;

- str++;

- }

- if (neg) { // make room in buf for -

- n++;

- str--;

- }

- if (n > nbuf-1) return "";

- memmove(buf, str, n);

- if (neg) {

- buf[0] = '-';

- }

- buf[n] = '\0';

- *np = n;

- return buf;

-bool RE2::Arg::parse_long_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- if (n == 0) return false;

- char buf[kMaxNumberLength+1];

- str = TerminateNumber(buf, sizeof buf, str, &n, false);

- char* end;

- errno = 0;

- long r = strtol(str, &end, radix);

- if (end != str + n) return false; // Leftover junk

- if (errno) return false;

- if (dest == NULL) return true;

- *(reinterpret_cast<long*>(dest)) = r;

- return true;

-bool RE2::Arg::parse_ulong_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- if (n == 0) return false;

- char buf[kMaxNumberLength+1];

- str = TerminateNumber(buf, sizeof buf, str, &n, false);

- if (str[0] == '-') {

- // strtoul() will silently accept negative numbers and parse

- // them. This module is more strict and treats them as errors.

- return false;

- }

- char* end;

- errno = 0;

- unsigned long r = strtoul(str, &end, radix);

- if (end != str + n) return false; // Leftover junk

- if (errno) return false;

- if (dest == NULL) return true;

- *(reinterpret_cast<unsigned long*>(dest)) = r;

- return true;

-bool RE2::Arg::parse_short_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- long r;

- if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

- if ((short)r != r) return false; // Out of range

- if (dest == NULL) return true;

- *(reinterpret_cast<short*>(dest)) = (short)r;

- return true;

-bool RE2::Arg::parse_ushort_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- unsigned long r;

- if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

- if ((ushort)r != r) return false; // Out of range

- if (dest == NULL) return true;

- *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r;

- return true;

-bool RE2::Arg::parse_int_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- long r;

- if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

- if ((int)r != r) return false; // Out of range

- if (dest == NULL) return true;

- *(reinterpret_cast<int*>(dest)) = r;

- return true;

-bool RE2::Arg::parse_uint_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- unsigned long r;

- if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

- if ((uint)r != r) return false; // Out of range

- if (dest == NULL) return true;

- *(reinterpret_cast<unsigned int*>(dest)) = r;

- return true;

-#if RE2_HAVE_LONGLONG

-bool RE2::Arg::parse_longlong_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- if (n == 0) return false;

- char buf[kMaxNumberLength+1];

- str = TerminateNumber(buf, sizeof buf, str, &n, false);

- char* end;

- errno = 0;

- int64 r = strtoll(str, &end, radix);

- if (end != str + n) return false; // Leftover junk

- if (errno) return false;

- if (dest == NULL) return true;

- *(reinterpret_cast<int64*>(dest)) = r;

- return true;

-bool RE2::Arg::parse_ulonglong_radix(const char* str,

- int n,

- void* dest,

- int radix) {

- if (n == 0) return false;

- char buf[kMaxNumberLength+1];

- str = TerminateNumber(buf, sizeof buf, str, &n, false);

- if (str[0] == '-') {

- // strtoull() will silently accept negative numbers and parse

- // them. This module is more strict and treats them as errors.

- return false;

- }

- char* end;

- errno = 0;

- uint64 r = strtoull(str, &end, radix);

- if (end != str + n) return false; // Leftover junk

- if (errno) return false;

- if (dest == NULL) return true;

- *(reinterpret_cast<uint64*>(dest)) = r;

- return true;

-#endif

-static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {

- if (n == 0) return false;

- static const int kMaxLength = 200;

- char buf[kMaxLength+1];

- str = TerminateNumber(buf, sizeof buf, str, &n, true);

- char* end;

- errno = 0;

- double r;

- if (isfloat) {

- r = strtof(str, &end);

- } else {

- r = strtod(str, &end);

- }

- if (end != str + n) return false; // Leftover junk

- if (errno) return false;

- if (dest == NULL) return true;

- if (isfloat) {

- *(reinterpret_cast<float*>(dest)) = (float)r;

- } else {

- *(reinterpret_cast<double*>(dest)) = r;

- }

- return true;

-bool RE2::Arg::parse_double(const char* str, int n, void* dest) {

- return parse_double_float(str, n, false, dest);

-bool RE2::Arg::parse_float(const char* str, int n, void* dest) {

- return parse_double_float(str, n, true, dest);

-#define DEFINE_INTEGER_PARSERS(name) \

- bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \

- return parse_##name##_radix(str, n, dest, 10); \

- } \

- bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \

- return parse_##name##_radix(str, n, dest, 16); \

- } \

- bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \

- return parse_##name##_radix(str, n, dest, 8); \

- } \

- bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \

- return parse_##name##_radix(str, n, dest, 0); \

- }

-DEFINE_INTEGER_PARSERS(short);

-DEFINE_INTEGER_PARSERS(ushort);

-DEFINE_INTEGER_PARSERS(int);

-DEFINE_INTEGER_PARSERS(uint);

-DEFINE_INTEGER_PARSERS(long);

-DEFINE_INTEGER_PARSERS(ulong);

-DEFINE_INTEGER_PARSERS(longlong);

-DEFINE_INTEGER_PARSERS(ulonglong);

-#undef DEFINE_INTEGER_PARSERS

-} // namespace re2

« no previous file with comments | « third_party/re2/re2/re2.h ('k') | third_party/re2/re2/regexp.h » ('j') | no next file with comments »