| OLD | NEW |
| 1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. |
| 2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 // Regular expression interface RE2. | 5 // Regular expression interface RE2. |
| 6 // | 6 // |
| 7 // Originally the PCRE C++ wrapper, but adapted to use | 7 // Originally the PCRE C++ wrapper, but adapted to use |
| 8 // the new automata-based regular expression engines. | 8 // the new automata-based regular expression engines. |
| 9 | 9 |
| 10 #include "re2/re2.h" | 10 #include "re2/re2.h" |
| 11 | 11 |
| 12 #include <stdio.h> | 12 #include <stdio.h> |
| 13 #include <string> | 13 #include <string> |
| 14 #ifdef WIN32 | |
| 15 #define strtoll _strtoi64 | |
| 16 #define strtoull _strtoui64 | |
| 17 #define strtof strtod | |
| 18 #else | |
| 19 #include <pthread.h> | |
| 20 #endif | |
| 21 #include <errno.h> | 14 #include <errno.h> |
| 22 #include "util/util.h" | 15 #include "util/util.h" |
| 23 #include "util/flags.h" | 16 #include "util/flags.h" |
| 17 #include "util/sparse_array.h" |
| 24 #include "re2/prog.h" | 18 #include "re2/prog.h" |
| 25 #include "re2/regexp.h" | 19 #include "re2/regexp.h" |
| 26 | 20 |
| 27 DEFINE_bool(trace_re2, false, "trace RE2 execution"); | 21 DEFINE_bool(trace_re2, false, "trace RE2 execution"); |
| 28 | 22 |
| 29 namespace re2 { | 23 namespace re2 { |
| 30 | 24 |
| 31 // Maximum number of args we can set | 25 // Maximum number of args we can set |
| 32 static const int kMaxArgs = 16; | 26 static const int kMaxArgs = 16; |
| 33 static const int kVecSize = 1+kMaxArgs; | 27 static const int kVecSize = 1+kMaxArgs; |
| 34 | 28 |
| 35 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful
lMatchN> RE2::FullMatch = {}; | 29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful
lMatchN> RE2::FullMatch = {}; |
| 36 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par
tialMatchN> RE2::PartialMatch = {}; | 30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par
tialMatchN> RE2::PartialMatch = {}; |
| 37 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN>
RE2::Consume = {}; | 31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN>
RE2::Consume = {}; |
| 38 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo
nsumeN> RE2::FindAndConsume = {}; | 32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo
nsumeN> RE2::FindAndConsume = {}; |
| 39 | 33 |
| 40 #define kDefaultMaxMem (8<<20) | 34 // This will trigger LNK2005 error in MSVC. |
| 41 | 35 #ifndef _MSC_VER |
| 42 RE2::Options::Options() | 36 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h |
| 43 : encoding_(EncodingUTF8), | 37 #endif |
| 44 posix_syntax_(false), | |
| 45 longest_match_(false), | |
| 46 log_errors_(true), | |
| 47 max_mem_(kDefaultMaxMem), | |
| 48 literal_(false), | |
| 49 never_nl_(false), | |
| 50 never_capture_(false), | |
| 51 case_sensitive_(true), | |
| 52 perl_classes_(false), | |
| 53 word_boundary_(false), | |
| 54 one_line_(false) { | |
| 55 } | |
| 56 | 38 |
| 57 RE2::Options::Options(RE2::CannedOptions opt) | 39 RE2::Options::Options(RE2::CannedOptions opt) |
| 58 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), | 40 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), |
| 59 posix_syntax_(opt == RE2::POSIX), | 41 posix_syntax_(opt == RE2::POSIX), |
| 60 longest_match_(opt == RE2::POSIX), | 42 longest_match_(opt == RE2::POSIX), |
| 61 log_errors_(opt != RE2::Quiet), | 43 log_errors_(opt != RE2::Quiet), |
| 62 max_mem_(kDefaultMaxMem), | 44 max_mem_(kDefaultMaxMem), |
| 63 literal_(false), | 45 literal_(false), |
| 64 never_nl_(false), | 46 never_nl_(false), |
| 47 dot_nl_(false), |
| 65 never_capture_(false), | 48 never_capture_(false), |
| 66 case_sensitive_(true), | 49 case_sensitive_(true), |
| 67 perl_classes_(false), | 50 perl_classes_(false), |
| 68 word_boundary_(false), | 51 word_boundary_(false), |
| 69 one_line_(false) { | 52 one_line_(false) { |
| 70 } | 53 } |
| 71 | 54 |
| 72 // static empty things for use as const references. | 55 // static empty things for use as const references. |
| 73 // To avoid global constructors, initialized on demand. | 56 // To avoid global constructors, initialized on demand. |
| 74 GLOBAL_MUTEX(empty_mutex); | 57 GLOBAL_MUTEX(empty_mutex); |
| (...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 162 | 145 |
| 163 if (!posix_syntax()) | 146 if (!posix_syntax()) |
| 164 flags |= Regexp::LikePerl; | 147 flags |= Regexp::LikePerl; |
| 165 | 148 |
| 166 if (literal()) | 149 if (literal()) |
| 167 flags |= Regexp::Literal; | 150 flags |= Regexp::Literal; |
| 168 | 151 |
| 169 if (never_nl()) | 152 if (never_nl()) |
| 170 flags |= Regexp::NeverNL; | 153 flags |= Regexp::NeverNL; |
| 171 | 154 |
| 155 if (dot_nl()) |
| 156 flags |= Regexp::DotNL; |
| 157 |
| 172 if (never_capture()) | 158 if (never_capture()) |
| 173 flags |= Regexp::NeverCapture; | 159 flags |= Regexp::NeverCapture; |
| 174 | 160 |
| 175 if (!case_sensitive()) | 161 if (!case_sensitive()) |
| 176 flags |= Regexp::FoldCase; | 162 flags |= Regexp::FoldCase; |
| 177 | 163 |
| 178 if (perl_classes()) | 164 if (perl_classes()) |
| 179 flags |= Regexp::PerlClasses; | 165 flags |= Regexp::PerlClasses; |
| 180 | 166 |
| 181 if (word_boundary()) | 167 if (word_boundary()) |
| (...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 278 if (group_names_ != NULL && group_names_ != empty_group_names) | 264 if (group_names_ != NULL && group_names_ != empty_group_names) |
| 279 delete group_names_; | 265 delete group_names_; |
| 280 } | 266 } |
| 281 | 267 |
| 282 int RE2::ProgramSize() const { | 268 int RE2::ProgramSize() const { |
| 283 if (prog_ == NULL) | 269 if (prog_ == NULL) |
| 284 return -1; | 270 return -1; |
| 285 return prog_->size(); | 271 return prog_->size(); |
| 286 } | 272 } |
| 287 | 273 |
| 274 int RE2::ProgramFanout(map<int, int>* histogram) const { |
| 275 if (prog_ == NULL) |
| 276 return -1; |
| 277 SparseArray<int> fanout(prog_->size()); |
| 278 prog_->Fanout(&fanout); |
| 279 histogram->clear(); |
| 280 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { |
| 281 // TODO(junyer): Optimise this? |
| 282 int bucket = 0; |
| 283 while (1 << bucket < i->second) { |
| 284 bucket++; |
| 285 } |
| 286 (*histogram)[bucket]++; |
| 287 } |
| 288 return histogram->rbegin()->first; |
| 289 } |
| 290 |
| 291 // Returns num_captures_, computing it if needed, or -1 if the |
| 292 // regexp wasn't valid on construction. |
| 293 int RE2::NumberOfCapturingGroups() const { |
| 294 MutexLock l(mutex_); |
| 295 if (suffix_regexp_ == NULL) |
| 296 return -1; |
| 297 if (num_captures_ == -1) |
| 298 num_captures_ = suffix_regexp_->NumCaptures(); |
| 299 return num_captures_; |
| 300 } |
| 301 |
| 288 // Returns named_groups_, computing it if needed. | 302 // Returns named_groups_, computing it if needed. |
| 289 const map<string, int>& RE2::NamedCapturingGroups() const { | 303 const map<string, int>& RE2::NamedCapturingGroups() const { |
| 290 MutexLock l(mutex_); | 304 MutexLock l(mutex_); |
| 291 if (!ok()) | 305 if (!ok()) |
| 292 return *empty_named_groups; | 306 return *empty_named_groups; |
| 293 if (named_groups_ == NULL) { | 307 if (named_groups_ == NULL) { |
| 294 named_groups_ = suffix_regexp_->NamedCaptures(); | 308 named_groups_ = suffix_regexp_->NamedCaptures(); |
| 295 if (named_groups_ == NULL) | 309 if (named_groups_ == NULL) |
| 296 named_groups_ = empty_named_groups; | 310 named_groups_ = empty_named_groups; |
| 297 } | 311 } |
| 298 return *named_groups_; | 312 return *named_groups_; |
| 299 } | 313 } |
| 300 | 314 |
| 301 // Returns group_names_, computing it if needed. | 315 // Returns group_names_, computing it if needed. |
| 302 const map<int, string>& RE2::CapturingGroupNames() const { | 316 const map<int, string>& RE2::CapturingGroupNames() const { |
| 303 MutexLock l(mutex_); | 317 MutexLock l(mutex_); |
| 304 if (!ok()) | 318 if (!ok()) |
| 305 return *empty_group_names; | 319 return *empty_group_names; |
| 306 if (group_names_ == NULL) { | 320 if (group_names_ == NULL) { |
| 307 group_names_ = suffix_regexp_->CaptureNames(); | 321 group_names_ = suffix_regexp_->CaptureNames(); |
| 308 if (group_names_ == NULL) | 322 if (group_names_ == NULL) |
| 309 group_names_ = empty_group_names; | 323 group_names_ = empty_group_names; |
| 310 } | 324 } |
| 311 return *group_names_; | 325 return *group_names_; |
| 312 } | 326 } |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 364 return max; | 378 return max; |
| 365 } | 379 } |
| 366 | 380 |
| 367 bool RE2::Replace(string *str, | 381 bool RE2::Replace(string *str, |
| 368 const RE2& re, | 382 const RE2& re, |
| 369 const StringPiece& rewrite) { | 383 const StringPiece& rewrite) { |
| 370 StringPiece vec[kVecSize]; | 384 StringPiece vec[kVecSize]; |
| 371 int nvec = 1 + MaxSubmatch(rewrite); | 385 int nvec = 1 + MaxSubmatch(rewrite); |
| 372 if (nvec > arraysize(vec)) | 386 if (nvec > arraysize(vec)) |
| 373 return false; | 387 return false; |
| 374 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) | 388 if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec)) |
| 375 return false; | 389 return false; |
| 376 | 390 |
| 377 string s; | 391 string s; |
| 378 if (!re.Rewrite(&s, rewrite, vec, nvec)) | 392 if (!re.Rewrite(&s, rewrite, vec, nvec)) |
| 379 return false; | 393 return false; |
| 380 | 394 |
| 381 assert(vec[0].begin() >= str->data()); | 395 assert(vec[0].begin() >= str->data()); |
| 382 assert(vec[0].end() <= str->data()+str->size()); | 396 assert(vec[0].end() <= str->data()+str->size()); |
| 383 str->replace(vec[0].data() - str->data(), vec[0].size(), s); | 397 str->replace(vec[0].data() - str->data(), vec[0].size(), s); |
| 384 return true; | 398 return true; |
| 385 } | 399 } |
| 386 | 400 |
| 387 int RE2::GlobalReplace(string *str, | 401 int RE2::GlobalReplace(string *str, |
| 388 const RE2& re, | 402 const RE2& re, |
| 389 const StringPiece& rewrite) { | 403 const StringPiece& rewrite) { |
| 390 StringPiece vec[kVecSize]; | 404 StringPiece vec[kVecSize]; |
| 391 int nvec = 1 + MaxSubmatch(rewrite); | 405 int nvec = 1 + MaxSubmatch(rewrite); |
| 392 if (nvec > arraysize(vec)) | 406 if (nvec > arraysize(vec)) |
| 393 return false; | 407 return false; |
| 394 | 408 |
| 395 const char* p = str->data(); | 409 const char* p = str->data(); |
| 396 const char* ep = p + str->size(); | 410 const char* ep = p + str->size(); |
| 397 const char* lastend = NULL; | 411 const char* lastend = NULL; |
| 398 string out; | 412 string out; |
| 399 int count = 0; | 413 int count = 0; |
| 400 while (p <= ep) { | 414 while (p <= ep) { |
| 401 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) | 415 if (!re.Match(*str, static_cast<int>(p - str->data()), |
| 416 static_cast<int>(str->size()), UNANCHORED, vec, nvec)) |
| 402 break; | 417 break; |
| 403 if (p < vec[0].begin()) | 418 if (p < vec[0].begin()) |
| 404 out.append(p, vec[0].begin() - p); | 419 out.append(p, vec[0].begin() - p); |
| 405 if (vec[0].begin() == lastend && vec[0].size() == 0) { | 420 if (vec[0].begin() == lastend && vec[0].size() == 0) { |
| 406 // Disallow empty match at end of last match: skip ahead. | 421 // Disallow empty match at end of last match: skip ahead. |
| 407 if (p < ep) | 422 if (p < ep) |
| 408 out.append(p, 1); | 423 out.append(p, 1); |
| 409 p++; | 424 p++; |
| 410 continue; | 425 continue; |
| 411 } | 426 } |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 475 result += unquoted[ii]; | 490 result += unquoted[ii]; |
| 476 } | 491 } |
| 477 | 492 |
| 478 return result; | 493 return result; |
| 479 } | 494 } |
| 480 | 495 |
| 481 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { | 496 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { |
| 482 if (prog_ == NULL) | 497 if (prog_ == NULL) |
| 483 return false; | 498 return false; |
| 484 | 499 |
| 485 int n = prefix_.size(); | 500 int n = static_cast<int>(prefix_.size()); |
| 486 if (n > maxlen) | 501 if (n > maxlen) |
| 487 n = maxlen; | 502 n = maxlen; |
| 488 | 503 |
| 489 // Determine initial min max from prefix_ literal. | 504 // Determine initial min max from prefix_ literal. |
| 490 string pmin, pmax; | 505 string pmin, pmax; |
| 491 pmin = prefix_.substr(0, n); | 506 pmin = prefix_.substr(0, n); |
| 492 pmax = prefix_.substr(0, n); | 507 pmax = prefix_.substr(0, n); |
| 493 if (prefix_foldcase_) { | 508 if (prefix_foldcase_) { |
| 494 // prefix is ASCII lowercase; change pmin to uppercase. | 509 // prefix is ASCII lowercase; change pmin to uppercase. |
| 495 for (int i = 0; i < n; i++) { | 510 for (int i = 0; i < n; i++) { |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 547 StringPiece* submatch, | 562 StringPiece* submatch, |
| 548 int nsubmatch) const { | 563 int nsubmatch) const { |
| 549 if (!ok() || suffix_regexp_ == NULL) { | 564 if (!ok() || suffix_regexp_ == NULL) { |
| 550 if (options_.log_errors()) | 565 if (options_.log_errors()) |
| 551 LOG(ERROR) << "Invalid RE2: " << *error_; | 566 LOG(ERROR) << "Invalid RE2: " << *error_; |
| 552 return false; | 567 return false; |
| 553 } | 568 } |
| 554 | 569 |
| 555 if (startpos < 0 || startpos > endpos || endpos > text.size()) { | 570 if (startpos < 0 || startpos > endpos || endpos > text.size()) { |
| 556 if (options_.log_errors()) | 571 if (options_.log_errors()) |
| 557 LOG(ERROR) << "RE2: invalid startpos, endpos pair."; | 572 LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" |
| 573 << "startpos: " << startpos << ", " |
| 574 << "endpos: " << endpos << ", " |
| 575 << "text size: " << text.size() << "]"; |
| 558 return false; | 576 return false; |
| 559 } | 577 } |
| 560 | 578 |
| 561 StringPiece subtext = text; | 579 StringPiece subtext = text; |
| 562 subtext.remove_prefix(startpos); | 580 subtext.remove_prefix(startpos); |
| 563 subtext.remove_suffix(text.size() - endpos); | 581 subtext.remove_suffix(text.size() - endpos); |
| 564 | 582 |
| 565 // Use DFAs to find exact location of match, filter out non-matches. | 583 // Use DFAs to find exact location of match, filter out non-matches. |
| 566 | 584 |
| 567 // Don't ask for the location if we won't use it. | 585 // Don't ask for the location if we won't use it. |
| (...skipping 16 matching lines...) Expand all Loading... |
| 584 if (prog_->anchor_start() && prog_->anchor_end()) | 602 if (prog_->anchor_start() && prog_->anchor_end()) |
| 585 re_anchor = ANCHOR_BOTH; | 603 re_anchor = ANCHOR_BOTH; |
| 586 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) | 604 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) |
| 587 re_anchor = ANCHOR_START; | 605 re_anchor = ANCHOR_START; |
| 588 | 606 |
| 589 // Check for the required prefix, if any. | 607 // Check for the required prefix, if any. |
| 590 int prefixlen = 0; | 608 int prefixlen = 0; |
| 591 if (!prefix_.empty()) { | 609 if (!prefix_.empty()) { |
| 592 if (startpos != 0) | 610 if (startpos != 0) |
| 593 return false; | 611 return false; |
| 594 prefixlen = prefix_.size(); | 612 prefixlen = static_cast<int>(prefix_.size()); |
| 595 if (prefixlen > subtext.size()) | 613 if (prefixlen > subtext.size()) |
| 596 return false; | 614 return false; |
| 597 if (prefix_foldcase_) { | 615 if (prefix_foldcase_) { |
| 598 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) | 616 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
| 599 return false; | 617 return false; |
| 600 } else { | 618 } else { |
| 601 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) | 619 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
| 602 return false; | 620 return false; |
| 603 } | 621 } |
| 604 subtext.remove_prefix(prefixlen); | 622 subtext.remove_prefix(prefixlen); |
| (...skipping 220 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 825 } else { | 843 } else { |
| 826 vec = new StringPiece[nvec]; | 844 vec = new StringPiece[nvec]; |
| 827 heapvec = vec; | 845 heapvec = vec; |
| 828 } | 846 } |
| 829 | 847 |
| 830 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { | 848 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { |
| 831 delete[] heapvec; | 849 delete[] heapvec; |
| 832 return false; | 850 return false; |
| 833 } | 851 } |
| 834 | 852 |
| 835 if(consumed != NULL) | 853 if (consumed != NULL) |
| 836 *consumed = vec[0].end() - text.begin(); | 854 *consumed = static_cast<int>(vec[0].end() - text.begin()); |
| 837 | 855 |
| 838 if (n == 0 || args == NULL) { | 856 if (n == 0 || args == NULL) { |
| 839 // We are not interested in results | 857 // We are not interested in results |
| 840 delete[] heapvec; | 858 delete[] heapvec; |
| 841 return true; | 859 return true; |
| 842 } | 860 } |
| 843 | 861 |
| 844 int ncap = NumberOfCapturingGroups(); | 862 int ncap = NumberOfCapturingGroups(); |
| 845 if (ncap < n) { | 863 if (ncap < n) { |
| 846 // RE has fewer capturing groups than number of arg pointers passed in | 864 // RE has fewer capturing groups than number of arg pointers passed in |
| 847 VLOG(1) << "Asked for " << n << " but only have " << ncap; | 865 VLOG(1) << "Asked for " << n << " but only have " << ncap; |
| 848 delete[] heapvec; | 866 delete[] heapvec; |
| 849 return false; | 867 return false; |
| 850 } | 868 } |
| 851 | 869 |
| 852 // If we got here, we must have matched the whole pattern. | 870 // If we got here, we must have matched the whole pattern. |
| 853 for (int i = 0; i < n; i++) { | 871 for (int i = 0; i < n; i++) { |
| 854 const StringPiece& s = vec[i+1]; | 872 const StringPiece& s = vec[i+1]; |
| 855 if (!args[i]->Parse(s.data(), s.size())) { | 873 if (!args[i]->Parse(s.data(), s.size())) { |
| 856 // TODO: Should we indicate what the error was? | 874 // TODO: Should we indicate what the error was? |
| 857 VLOG(1) << "Parse error on #" << i << " " << s << " " | 875 VLOG(1) << "Parse error on #" << i << " " << s << " " |
| 858 » << (void*)s.data() << "/" << s.size(); | 876 << (void*)s.data() << "/" << s.size(); |
| 859 delete[] heapvec; | 877 delete[] heapvec; |
| 860 return false; | 878 return false; |
| 861 } | 879 } |
| 862 } | 880 } |
| 863 | 881 |
| 864 delete[] heapvec; | 882 delete[] heapvec; |
| 865 return true; | 883 return true; |
| 866 } | 884 } |
| 867 | 885 |
| 868 // Append the "rewrite" string, with backslash subsitutions from "vec", | 886 // Append the "rewrite" string, with backslash subsitutions from "vec", |
| 869 // to string "out". | 887 // to string "out". |
| 870 bool RE2::Rewrite(string *out, const StringPiece &rewrite, | 888 bool RE2::Rewrite(string *out, const StringPiece &rewrite, |
| 871 const StringPiece *vec, int veclen) const { | 889 const StringPiece *vec, int veclen) const { |
| 872 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | 890 for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
| 873 s < end; s++) { | 891 s < end; s++) { |
| 874 int c = *s; | 892 if (*s != '\\') { |
| 875 if (c == '\\') { | 893 out->push_back(*s); |
| 876 s++; | 894 continue; |
| 877 c = (s < end) ? *s : -1; | 895 } |
| 878 if (isdigit(c)) { | 896 s++; |
| 879 int n = (c - '0'); | 897 int c = (s < end) ? *s : -1; |
| 880 if (n >= veclen) { | 898 if (isdigit(c)) { |
| 881 if (options_.log_errors()) { | 899 int n = (c - '0'); |
| 882 LOG(ERROR) << "requested group " << n | 900 if (n >= veclen) { |
| 883 << " in regexp " << rewrite.data(); | 901 if (options_.log_errors()) { |
| 884 } | 902 LOG(ERROR) << "requested group " << n |
| 885 return false; | 903 << " in regexp " << rewrite.data(); |
| 886 } | 904 } |
| 887 StringPiece snip = vec[n]; | |
| 888 if (snip.size() > 0) | |
| 889 out->append(snip.data(), snip.size()); | |
| 890 } else if (c == '\\') { | |
| 891 out->push_back('\\'); | |
| 892 } else { | |
| 893 if (options_.log_errors()) | |
| 894 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); | |
| 895 return false; | 905 return false; |
| 896 } | 906 } |
| 907 StringPiece snip = vec[n]; |
| 908 if (snip.size() > 0) |
| 909 out->append(snip.data(), snip.size()); |
| 910 } else if (c == '\\') { |
| 911 out->push_back('\\'); |
| 897 } else { | 912 } else { |
| 898 out->push_back(c); | 913 if (options_.log_errors()) |
| 914 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); |
| 915 return false; |
| 899 } | 916 } |
| 900 } | 917 } |
| 901 return true; | 918 return true; |
| 902 } | 919 } |
| 903 | 920 |
| 904 // Return the number of capturing subpatterns, or -1 if the | |
| 905 // regexp wasn't valid on construction. | |
| 906 int RE2::NumberOfCapturingGroups() const { | |
| 907 if (suffix_regexp_ == NULL) | |
| 908 return -1; | |
| 909 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case" | |
| 910 " multiple threads end up doing the same work in parallel."); | |
| 911 if (num_captures_ == -1) | |
| 912 num_captures_ = suffix_regexp_->NumCaptures(); | |
| 913 return num_captures_; | |
| 914 } | |
| 915 | |
| 916 // Checks that the rewrite string is well-formed with respect to this | 921 // Checks that the rewrite string is well-formed with respect to this |
| 917 // regular expression. | 922 // regular expression. |
| 918 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { | 923 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { |
| 919 int max_token = -1; | 924 int max_token = -1; |
| 920 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | 925 for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
| 921 s < end; s++) { | 926 s < end; s++) { |
| 922 int c = *s; | 927 int c = *s; |
| 923 if (c != '\\') { | 928 if (c != '\\') { |
| 924 continue; | 929 continue; |
| 925 } | 930 } |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 980 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { | 985 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { |
| 981 if (n != 1) return false; | 986 if (n != 1) return false; |
| 982 if (dest == NULL) return true; | 987 if (dest == NULL) return true; |
| 983 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; | 988 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
| 984 return true; | 989 return true; |
| 985 } | 990 } |
| 986 | 991 |
| 987 // Largest number spec that we are willing to parse | 992 // Largest number spec that we are willing to parse |
| 988 static const int kMaxNumberLength = 32; | 993 static const int kMaxNumberLength = 32; |
| 989 | 994 |
| 990 // REQUIRES "buf" must have length at least kMaxNumberLength+1 | 995 // REQUIRES "buf" must have length at least nbuf. |
| 991 // Copies "str" into "buf" and null-terminates. | 996 // Copies "str" into "buf" and null-terminates. |
| 992 // Overwrites *np with the new length. | 997 // Overwrites *np with the new length. |
| 993 static const char* TerminateNumber(char* buf, const char* str, int* np) { | 998 static const char* TerminateNumber(char* buf, int nbuf, const char* str, int* np
, |
| 999 bool accept_spaces) { |
| 994 int n = *np; | 1000 int n = *np; |
| 995 if (n <= 0) return ""; | 1001 if (n <= 0) return ""; |
| 996 if (n > 0 && isspace(*str)) { | 1002 if (n > 0 && isspace(*str)) { |
| 997 // We are less forgiving than the strtoxxx() routines and do not | 1003 // We are less forgiving than the strtoxxx() routines and do not |
| 998 // allow leading spaces. | 1004 // allow leading spaces. We do allow leading spaces for floats. |
| 999 return ""; | 1005 if (!accept_spaces) { |
| 1006 return ""; |
| 1007 } |
| 1008 while (n > 0 && isspace(*str)) { |
| 1009 n--; |
| 1010 str++; |
| 1011 } |
| 1000 } | 1012 } |
| 1001 | 1013 |
| 1002 // Although buf has a fixed maximum size, we can still handle | 1014 // Although buf has a fixed maximum size, we can still handle |
| 1003 // arbitrarily large integers correctly by omitting leading zeros. | 1015 // arbitrarily large integers correctly by omitting leading zeros. |
| 1004 // (Numbers that are still too long will be out of range.) | 1016 // (Numbers that are still too long will be out of range.) |
| 1005 // Before deciding whether str is too long, | 1017 // Before deciding whether str is too long, |
| 1006 // remove leading zeros with s/000+/00/. | 1018 // remove leading zeros with s/000+/00/. |
| 1007 // Leaving the leading two zeros in place means that | 1019 // Leaving the leading two zeros in place means that |
| 1008 // we don't change 0000x123 (invalid) into 0x123 (valid). | 1020 // we don't change 0000x123 (invalid) into 0x123 (valid). |
| 1009 // Skip over leading - before replacing. | 1021 // Skip over leading - before replacing. |
| 1010 bool neg = false; | 1022 bool neg = false; |
| 1011 if (n >= 1 && str[0] == '-') { | 1023 if (n >= 1 && str[0] == '-') { |
| 1012 neg = true; | 1024 neg = true; |
| 1013 n--; | 1025 n--; |
| 1014 str++; | 1026 str++; |
| 1015 } | 1027 } |
| 1016 | 1028 |
| 1017 if (n >= 3 && str[0] == '0' && str[1] == '0') { | 1029 if (n >= 3 && str[0] == '0' && str[1] == '0') { |
| 1018 while (n >= 3 && str[2] == '0') { | 1030 while (n >= 3 && str[2] == '0') { |
| 1019 n--; | 1031 n--; |
| 1020 str++; | 1032 str++; |
| 1021 } | 1033 } |
| 1022 } | 1034 } |
| 1023 | 1035 |
| 1024 if (neg) { // make room in buf for - | 1036 if (neg) { // make room in buf for - |
| 1025 n++; | 1037 n++; |
| 1026 str--; | 1038 str--; |
| 1027 } | 1039 } |
| 1028 | 1040 |
| 1029 if (n > kMaxNumberLength) return ""; | 1041 if (n > nbuf-1) return ""; |
| 1030 | 1042 |
| 1031 memmove(buf, str, n); | 1043 memmove(buf, str, n); |
| 1032 if (neg) { | 1044 if (neg) { |
| 1033 buf[0] = '-'; | 1045 buf[0] = '-'; |
| 1034 } | 1046 } |
| 1035 buf[n] = '\0'; | 1047 buf[n] = '\0'; |
| 1036 *np = n; | 1048 *np = n; |
| 1037 return buf; | 1049 return buf; |
| 1038 } | 1050 } |
| 1039 | 1051 |
| 1040 bool RE2::Arg::parse_long_radix(const char* str, | 1052 bool RE2::Arg::parse_long_radix(const char* str, |
| 1041 int n, | 1053 int n, |
| 1042 void* dest, | 1054 void* dest, |
| 1043 int radix) { | 1055 int radix) { |
| 1044 if (n == 0) return false; | 1056 if (n == 0) return false; |
| 1045 char buf[kMaxNumberLength+1]; | 1057 char buf[kMaxNumberLength+1]; |
| 1046 str = TerminateNumber(buf, str, &n); | 1058 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
| 1047 char* end; | 1059 char* end; |
| 1048 errno = 0; | 1060 errno = 0; |
| 1049 long r = strtol(str, &end, radix); | 1061 long r = strtol(str, &end, radix); |
| 1050 if (end != str + n) return false; // Leftover junk | 1062 if (end != str + n) return false; // Leftover junk |
| 1051 if (errno) return false; | 1063 if (errno) return false; |
| 1052 if (dest == NULL) return true; | 1064 if (dest == NULL) return true; |
| 1053 *(reinterpret_cast<long*>(dest)) = r; | 1065 *(reinterpret_cast<long*>(dest)) = r; |
| 1054 return true; | 1066 return true; |
| 1055 } | 1067 } |
| 1056 | 1068 |
| 1057 bool RE2::Arg::parse_ulong_radix(const char* str, | 1069 bool RE2::Arg::parse_ulong_radix(const char* str, |
| 1058 int n, | 1070 int n, |
| 1059 void* dest, | 1071 void* dest, |
| 1060 int radix) { | 1072 int radix) { |
| 1061 if (n == 0) return false; | 1073 if (n == 0) return false; |
| 1062 char buf[kMaxNumberLength+1]; | 1074 char buf[kMaxNumberLength+1]; |
| 1063 str = TerminateNumber(buf, str, &n); | 1075 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
| 1064 if (str[0] == '-') { | 1076 if (str[0] == '-') { |
| 1065 // strtoul() will silently accept negative numbers and parse | 1077 // strtoul() will silently accept negative numbers and parse |
| 1066 // them. This module is more strict and treats them as errors. | 1078 // them. This module is more strict and treats them as errors. |
| 1067 return false; | 1079 return false; |
| 1068 } | 1080 } |
| 1069 | 1081 |
| 1070 char* end; | 1082 char* end; |
| 1071 errno = 0; | 1083 errno = 0; |
| 1072 unsigned long r = strtoul(str, &end, radix); | 1084 unsigned long r = strtoul(str, &end, radix); |
| 1073 if (end != str + n) return false; // Leftover junk | 1085 if (end != str + n) return false; // Leftover junk |
| 1074 if (errno) return false; | 1086 if (errno) return false; |
| 1075 if (dest == NULL) return true; | 1087 if (dest == NULL) return true; |
| 1076 *(reinterpret_cast<unsigned long*>(dest)) = r; | 1088 *(reinterpret_cast<unsigned long*>(dest)) = r; |
| 1077 return true; | 1089 return true; |
| 1078 } | 1090 } |
| 1079 | 1091 |
| 1080 bool RE2::Arg::parse_short_radix(const char* str, | 1092 bool RE2::Arg::parse_short_radix(const char* str, |
| 1081 int n, | 1093 int n, |
| 1082 void* dest, | 1094 void* dest, |
| 1083 int radix) { | 1095 int radix) { |
| 1084 long r; | 1096 long r; |
| 1085 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | 1097 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
| 1086 if ((short)r != r) return false; // Out of range | 1098 if ((short)r != r) return false; // Out of range |
| 1087 if (dest == NULL) return true; | 1099 if (dest == NULL) return true; |
| 1088 *(reinterpret_cast<short*>(dest)) = r; | 1100 *(reinterpret_cast<short*>(dest)) = (short)r; |
| 1089 return true; | 1101 return true; |
| 1090 } | 1102 } |
| 1091 | 1103 |
| 1092 bool RE2::Arg::parse_ushort_radix(const char* str, | 1104 bool RE2::Arg::parse_ushort_radix(const char* str, |
| 1093 int n, | 1105 int n, |
| 1094 void* dest, | 1106 void* dest, |
| 1095 int radix) { | 1107 int radix) { |
| 1096 unsigned long r; | 1108 unsigned long r; |
| 1097 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | 1109 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
| 1098 if ((ushort)r != r) return false; // Out of range | 1110 if ((ushort)r != r) return false; // Out of range |
| 1099 if (dest == NULL) return true; | 1111 if (dest == NULL) return true; |
| 1100 *(reinterpret_cast<unsigned short*>(dest)) = r; | 1112 *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; |
| 1101 return true; | 1113 return true; |
| 1102 } | 1114 } |
| 1103 | 1115 |
| 1104 bool RE2::Arg::parse_int_radix(const char* str, | 1116 bool RE2::Arg::parse_int_radix(const char* str, |
| 1105 int n, | 1117 int n, |
| 1106 void* dest, | 1118 void* dest, |
| 1107 int radix) { | 1119 int radix) { |
| 1108 long r; | 1120 long r; |
| 1109 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | 1121 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
| 1110 if ((int)r != r) return false; // Out of range | 1122 if ((int)r != r) return false; // Out of range |
| 1111 if (dest == NULL) return true; | 1123 if (dest == NULL) return true; |
| 1112 *(reinterpret_cast<int*>(dest)) = r; | 1124 *(reinterpret_cast<int*>(dest)) = r; |
| 1113 return true; | 1125 return true; |
| 1114 } | 1126 } |
| 1115 | 1127 |
| 1116 bool RE2::Arg::parse_uint_radix(const char* str, | 1128 bool RE2::Arg::parse_uint_radix(const char* str, |
| 1117 int n, | 1129 int n, |
| 1118 void* dest, | 1130 void* dest, |
| 1119 int radix) { | 1131 int radix) { |
| 1120 unsigned long r; | 1132 unsigned long r; |
| 1121 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | 1133 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
| 1122 if ((uint)r != r) return false; // Out of range | 1134 if ((uint)r != r) return false; // Out of range |
| 1123 if (dest == NULL) return true; | 1135 if (dest == NULL) return true; |
| 1124 *(reinterpret_cast<unsigned int*>(dest)) = r; | 1136 *(reinterpret_cast<unsigned int*>(dest)) = r; |
| 1125 return true; | 1137 return true; |
| 1126 } | 1138 } |
| 1127 | 1139 |
| 1140 #if RE2_HAVE_LONGLONG |
| 1128 bool RE2::Arg::parse_longlong_radix(const char* str, | 1141 bool RE2::Arg::parse_longlong_radix(const char* str, |
| 1129 int n, | 1142 int n, |
| 1130 void* dest, | 1143 void* dest, |
| 1131 int radix) { | 1144 int radix) { |
| 1132 if (n == 0) return false; | 1145 if (n == 0) return false; |
| 1133 char buf[kMaxNumberLength+1]; | 1146 char buf[kMaxNumberLength+1]; |
| 1134 str = TerminateNumber(buf, str, &n); | 1147 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
| 1135 char* end; | 1148 char* end; |
| 1136 errno = 0; | 1149 errno = 0; |
| 1137 int64 r = strtoll(str, &end, radix); | 1150 int64 r = strtoll(str, &end, radix); |
| 1138 if (end != str + n) return false; // Leftover junk | 1151 if (end != str + n) return false; // Leftover junk |
| 1139 if (errno) return false; | 1152 if (errno) return false; |
| 1140 if (dest == NULL) return true; | 1153 if (dest == NULL) return true; |
| 1141 *(reinterpret_cast<int64*>(dest)) = r; | 1154 *(reinterpret_cast<int64*>(dest)) = r; |
| 1142 return true; | 1155 return true; |
| 1143 } | 1156 } |
| 1144 | 1157 |
| 1145 bool RE2::Arg::parse_ulonglong_radix(const char* str, | 1158 bool RE2::Arg::parse_ulonglong_radix(const char* str, |
| 1146 int n, | 1159 int n, |
| 1147 void* dest, | 1160 void* dest, |
| 1148 int radix) { | 1161 int radix) { |
| 1149 if (n == 0) return false; | 1162 if (n == 0) return false; |
| 1150 char buf[kMaxNumberLength+1]; | 1163 char buf[kMaxNumberLength+1]; |
| 1151 str = TerminateNumber(buf, str, &n); | 1164 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
| 1152 if (str[0] == '-') { | 1165 if (str[0] == '-') { |
| 1153 // strtoull() will silently accept negative numbers and parse | 1166 // strtoull() will silently accept negative numbers and parse |
| 1154 // them. This module is more strict and treats them as errors. | 1167 // them. This module is more strict and treats them as errors. |
| 1155 return false; | 1168 return false; |
| 1156 } | 1169 } |
| 1157 char* end; | 1170 char* end; |
| 1158 errno = 0; | 1171 errno = 0; |
| 1159 uint64 r = strtoull(str, &end, radix); | 1172 uint64 r = strtoull(str, &end, radix); |
| 1160 if (end != str + n) return false; // Leftover junk | 1173 if (end != str + n) return false; // Leftover junk |
| 1161 if (errno) return false; | 1174 if (errno) return false; |
| 1162 if (dest == NULL) return true; | 1175 if (dest == NULL) return true; |
| 1163 *(reinterpret_cast<uint64*>(dest)) = r; | 1176 *(reinterpret_cast<uint64*>(dest)) = r; |
| 1164 return true; | 1177 return true; |
| 1165 } | 1178 } |
| 1179 #endif |
| 1166 | 1180 |
| 1167 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest)
{ | 1181 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest)
{ |
| 1168 if (n == 0) return false; | 1182 if (n == 0) return false; |
| 1169 static const int kMaxLength = 200; | 1183 static const int kMaxLength = 200; |
| 1170 char buf[kMaxLength]; | 1184 char buf[kMaxLength+1]; |
| 1171 if (n >= kMaxLength) return false; | 1185 str = TerminateNumber(buf, sizeof buf, str, &n, true); |
| 1172 memcpy(buf, str, n); | 1186 char* end; |
| 1173 buf[n] = '\0'; | |
| 1174 errno = 0; | 1187 errno = 0; |
| 1175 char* end; | |
| 1176 double r; | 1188 double r; |
| 1177 if (isfloat) { | 1189 if (isfloat) { |
| 1178 r = strtof(buf, &end); | 1190 r = strtof(str, &end); |
| 1179 } else { | 1191 } else { |
| 1180 r = strtod(buf, &end); | 1192 r = strtod(str, &end); |
| 1181 } | 1193 } |
| 1182 if (end != buf + n) return false; // Leftover junk | 1194 if (end != str + n) return false; // Leftover junk |
| 1183 if (errno) return false; | 1195 if (errno) return false; |
| 1184 if (dest == NULL) return true; | 1196 if (dest == NULL) return true; |
| 1185 if (isfloat) { | 1197 if (isfloat) { |
| 1186 *(reinterpret_cast<float*>(dest)) = r; | 1198 *(reinterpret_cast<float*>(dest)) = (float)r; |
| 1187 } else { | 1199 } else { |
| 1188 *(reinterpret_cast<double*>(dest)) = r; | 1200 *(reinterpret_cast<double*>(dest)) = r; |
| 1189 } | 1201 } |
| 1190 return true; | 1202 return true; |
| 1191 } | 1203 } |
| 1192 | 1204 |
| 1193 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { | 1205 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { |
| 1194 return parse_double_float(str, n, false, dest); | 1206 return parse_double_float(str, n, false, dest); |
| 1195 } | 1207 } |
| 1196 | 1208 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 1218 DEFINE_INTEGER_PARSERS(int); | 1230 DEFINE_INTEGER_PARSERS(int); |
| 1219 DEFINE_INTEGER_PARSERS(uint); | 1231 DEFINE_INTEGER_PARSERS(uint); |
| 1220 DEFINE_INTEGER_PARSERS(long); | 1232 DEFINE_INTEGER_PARSERS(long); |
| 1221 DEFINE_INTEGER_PARSERS(ulong); | 1233 DEFINE_INTEGER_PARSERS(ulong); |
| 1222 DEFINE_INTEGER_PARSERS(longlong); | 1234 DEFINE_INTEGER_PARSERS(longlong); |
| 1223 DEFINE_INTEGER_PARSERS(ulonglong); | 1235 DEFINE_INTEGER_PARSERS(ulonglong); |
| 1224 | 1236 |
| 1225 #undef DEFINE_INTEGER_PARSERS | 1237 #undef DEFINE_INTEGER_PARSERS |
| 1226 | 1238 |
| 1227 } // namespace re2 | 1239 } // namespace re2 |
| OLD | NEW |