OLD | NEW |
1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Regular expression interface RE2. | 5 // Regular expression interface RE2. |
6 // | 6 // |
7 // Originally the PCRE C++ wrapper, but adapted to use | 7 // Originally the PCRE C++ wrapper, but adapted to use |
8 // the new automata-based regular expression engines. | 8 // the new automata-based regular expression engines. |
9 | 9 |
10 #include "re2/re2.h" | 10 #include "re2/re2.h" |
11 | 11 |
12 #include <stdio.h> | 12 #include <stdio.h> |
13 #include <string> | 13 #include <string> |
14 #ifdef WIN32 | |
15 #define strtoll _strtoi64 | |
16 #define strtoull _strtoui64 | |
17 #define strtof strtod | |
18 #else | |
19 #include <pthread.h> | |
20 #endif | |
21 #include <errno.h> | 14 #include <errno.h> |
22 #include "util/util.h" | 15 #include "util/util.h" |
23 #include "util/flags.h" | 16 #include "util/flags.h" |
| 17 #include "util/sparse_array.h" |
24 #include "re2/prog.h" | 18 #include "re2/prog.h" |
25 #include "re2/regexp.h" | 19 #include "re2/regexp.h" |
26 | 20 |
27 DEFINE_bool(trace_re2, false, "trace RE2 execution"); | 21 DEFINE_bool(trace_re2, false, "trace RE2 execution"); |
28 | 22 |
29 namespace re2 { | 23 namespace re2 { |
30 | 24 |
31 // Maximum number of args we can set | 25 // Maximum number of args we can set |
32 static const int kMaxArgs = 16; | 26 static const int kMaxArgs = 16; |
33 static const int kVecSize = 1+kMaxArgs; | 27 static const int kVecSize = 1+kMaxArgs; |
34 | 28 |
35 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful
lMatchN> RE2::FullMatch = {}; | 29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful
lMatchN> RE2::FullMatch = {}; |
36 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par
tialMatchN> RE2::PartialMatch = {}; | 30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par
tialMatchN> RE2::PartialMatch = {}; |
37 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN>
RE2::Consume = {}; | 31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN>
RE2::Consume = {}; |
38 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo
nsumeN> RE2::FindAndConsume = {}; | 32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo
nsumeN> RE2::FindAndConsume = {}; |
39 | 33 |
40 #define kDefaultMaxMem (8<<20) | 34 // This will trigger LNK2005 error in MSVC. |
41 | 35 #ifndef _MSC_VER |
42 RE2::Options::Options() | 36 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h |
43 : encoding_(EncodingUTF8), | 37 #endif |
44 posix_syntax_(false), | |
45 longest_match_(false), | |
46 log_errors_(true), | |
47 max_mem_(kDefaultMaxMem), | |
48 literal_(false), | |
49 never_nl_(false), | |
50 never_capture_(false), | |
51 case_sensitive_(true), | |
52 perl_classes_(false), | |
53 word_boundary_(false), | |
54 one_line_(false) { | |
55 } | |
56 | 38 |
57 RE2::Options::Options(RE2::CannedOptions opt) | 39 RE2::Options::Options(RE2::CannedOptions opt) |
58 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), | 40 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), |
59 posix_syntax_(opt == RE2::POSIX), | 41 posix_syntax_(opt == RE2::POSIX), |
60 longest_match_(opt == RE2::POSIX), | 42 longest_match_(opt == RE2::POSIX), |
61 log_errors_(opt != RE2::Quiet), | 43 log_errors_(opt != RE2::Quiet), |
62 max_mem_(kDefaultMaxMem), | 44 max_mem_(kDefaultMaxMem), |
63 literal_(false), | 45 literal_(false), |
64 never_nl_(false), | 46 never_nl_(false), |
| 47 dot_nl_(false), |
65 never_capture_(false), | 48 never_capture_(false), |
66 case_sensitive_(true), | 49 case_sensitive_(true), |
67 perl_classes_(false), | 50 perl_classes_(false), |
68 word_boundary_(false), | 51 word_boundary_(false), |
69 one_line_(false) { | 52 one_line_(false) { |
70 } | 53 } |
71 | 54 |
72 // static empty things for use as const references. | 55 // static empty things for use as const references. |
73 // To avoid global constructors, initialized on demand. | 56 // To avoid global constructors, initialized on demand. |
74 GLOBAL_MUTEX(empty_mutex); | 57 GLOBAL_MUTEX(empty_mutex); |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
162 | 145 |
163 if (!posix_syntax()) | 146 if (!posix_syntax()) |
164 flags |= Regexp::LikePerl; | 147 flags |= Regexp::LikePerl; |
165 | 148 |
166 if (literal()) | 149 if (literal()) |
167 flags |= Regexp::Literal; | 150 flags |= Regexp::Literal; |
168 | 151 |
169 if (never_nl()) | 152 if (never_nl()) |
170 flags |= Regexp::NeverNL; | 153 flags |= Regexp::NeverNL; |
171 | 154 |
| 155 if (dot_nl()) |
| 156 flags |= Regexp::DotNL; |
| 157 |
172 if (never_capture()) | 158 if (never_capture()) |
173 flags |= Regexp::NeverCapture; | 159 flags |= Regexp::NeverCapture; |
174 | 160 |
175 if (!case_sensitive()) | 161 if (!case_sensitive()) |
176 flags |= Regexp::FoldCase; | 162 flags |= Regexp::FoldCase; |
177 | 163 |
178 if (perl_classes()) | 164 if (perl_classes()) |
179 flags |= Regexp::PerlClasses; | 165 flags |= Regexp::PerlClasses; |
180 | 166 |
181 if (word_boundary()) | 167 if (word_boundary()) |
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
278 if (group_names_ != NULL && group_names_ != empty_group_names) | 264 if (group_names_ != NULL && group_names_ != empty_group_names) |
279 delete group_names_; | 265 delete group_names_; |
280 } | 266 } |
281 | 267 |
282 int RE2::ProgramSize() const { | 268 int RE2::ProgramSize() const { |
283 if (prog_ == NULL) | 269 if (prog_ == NULL) |
284 return -1; | 270 return -1; |
285 return prog_->size(); | 271 return prog_->size(); |
286 } | 272 } |
287 | 273 |
| 274 int RE2::ProgramFanout(map<int, int>* histogram) const { |
| 275 if (prog_ == NULL) |
| 276 return -1; |
| 277 SparseArray<int> fanout(prog_->size()); |
| 278 prog_->Fanout(&fanout); |
| 279 histogram->clear(); |
| 280 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { |
| 281 // TODO(junyer): Optimise this? |
| 282 int bucket = 0; |
| 283 while (1 << bucket < i->second) { |
| 284 bucket++; |
| 285 } |
| 286 (*histogram)[bucket]++; |
| 287 } |
| 288 return histogram->rbegin()->first; |
| 289 } |
| 290 |
| 291 // Returns num_captures_, computing it if needed, or -1 if the |
| 292 // regexp wasn't valid on construction. |
| 293 int RE2::NumberOfCapturingGroups() const { |
| 294 MutexLock l(mutex_); |
| 295 if (suffix_regexp_ == NULL) |
| 296 return -1; |
| 297 if (num_captures_ == -1) |
| 298 num_captures_ = suffix_regexp_->NumCaptures(); |
| 299 return num_captures_; |
| 300 } |
| 301 |
288 // Returns named_groups_, computing it if needed. | 302 // Returns named_groups_, computing it if needed. |
289 const map<string, int>& RE2::NamedCapturingGroups() const { | 303 const map<string, int>& RE2::NamedCapturingGroups() const { |
290 MutexLock l(mutex_); | 304 MutexLock l(mutex_); |
291 if (!ok()) | 305 if (!ok()) |
292 return *empty_named_groups; | 306 return *empty_named_groups; |
293 if (named_groups_ == NULL) { | 307 if (named_groups_ == NULL) { |
294 named_groups_ = suffix_regexp_->NamedCaptures(); | 308 named_groups_ = suffix_regexp_->NamedCaptures(); |
295 if (named_groups_ == NULL) | 309 if (named_groups_ == NULL) |
296 named_groups_ = empty_named_groups; | 310 named_groups_ = empty_named_groups; |
297 } | 311 } |
298 return *named_groups_; | 312 return *named_groups_; |
299 } | 313 } |
300 | 314 |
301 // Returns group_names_, computing it if needed. | 315 // Returns group_names_, computing it if needed. |
302 const map<int, string>& RE2::CapturingGroupNames() const { | 316 const map<int, string>& RE2::CapturingGroupNames() const { |
303 MutexLock l(mutex_); | 317 MutexLock l(mutex_); |
304 if (!ok()) | 318 if (!ok()) |
305 return *empty_group_names; | 319 return *empty_group_names; |
306 if (group_names_ == NULL) { | 320 if (group_names_ == NULL) { |
307 group_names_ = suffix_regexp_->CaptureNames(); | 321 group_names_ = suffix_regexp_->CaptureNames(); |
308 if (group_names_ == NULL) | 322 if (group_names_ == NULL) |
309 group_names_ = empty_group_names; | 323 group_names_ = empty_group_names; |
310 } | 324 } |
311 return *group_names_; | 325 return *group_names_; |
312 } | 326 } |
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
364 return max; | 378 return max; |
365 } | 379 } |
366 | 380 |
367 bool RE2::Replace(string *str, | 381 bool RE2::Replace(string *str, |
368 const RE2& re, | 382 const RE2& re, |
369 const StringPiece& rewrite) { | 383 const StringPiece& rewrite) { |
370 StringPiece vec[kVecSize]; | 384 StringPiece vec[kVecSize]; |
371 int nvec = 1 + MaxSubmatch(rewrite); | 385 int nvec = 1 + MaxSubmatch(rewrite); |
372 if (nvec > arraysize(vec)) | 386 if (nvec > arraysize(vec)) |
373 return false; | 387 return false; |
374 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) | 388 if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec)) |
375 return false; | 389 return false; |
376 | 390 |
377 string s; | 391 string s; |
378 if (!re.Rewrite(&s, rewrite, vec, nvec)) | 392 if (!re.Rewrite(&s, rewrite, vec, nvec)) |
379 return false; | 393 return false; |
380 | 394 |
381 assert(vec[0].begin() >= str->data()); | 395 assert(vec[0].begin() >= str->data()); |
382 assert(vec[0].end() <= str->data()+str->size()); | 396 assert(vec[0].end() <= str->data()+str->size()); |
383 str->replace(vec[0].data() - str->data(), vec[0].size(), s); | 397 str->replace(vec[0].data() - str->data(), vec[0].size(), s); |
384 return true; | 398 return true; |
385 } | 399 } |
386 | 400 |
387 int RE2::GlobalReplace(string *str, | 401 int RE2::GlobalReplace(string *str, |
388 const RE2& re, | 402 const RE2& re, |
389 const StringPiece& rewrite) { | 403 const StringPiece& rewrite) { |
390 StringPiece vec[kVecSize]; | 404 StringPiece vec[kVecSize]; |
391 int nvec = 1 + MaxSubmatch(rewrite); | 405 int nvec = 1 + MaxSubmatch(rewrite); |
392 if (nvec > arraysize(vec)) | 406 if (nvec > arraysize(vec)) |
393 return false; | 407 return false; |
394 | 408 |
395 const char* p = str->data(); | 409 const char* p = str->data(); |
396 const char* ep = p + str->size(); | 410 const char* ep = p + str->size(); |
397 const char* lastend = NULL; | 411 const char* lastend = NULL; |
398 string out; | 412 string out; |
399 int count = 0; | 413 int count = 0; |
400 while (p <= ep) { | 414 while (p <= ep) { |
401 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) | 415 if (!re.Match(*str, static_cast<int>(p - str->data()), |
| 416 static_cast<int>(str->size()), UNANCHORED, vec, nvec)) |
402 break; | 417 break; |
403 if (p < vec[0].begin()) | 418 if (p < vec[0].begin()) |
404 out.append(p, vec[0].begin() - p); | 419 out.append(p, vec[0].begin() - p); |
405 if (vec[0].begin() == lastend && vec[0].size() == 0) { | 420 if (vec[0].begin() == lastend && vec[0].size() == 0) { |
406 // Disallow empty match at end of last match: skip ahead. | 421 // Disallow empty match at end of last match: skip ahead. |
407 if (p < ep) | 422 if (p < ep) |
408 out.append(p, 1); | 423 out.append(p, 1); |
409 p++; | 424 p++; |
410 continue; | 425 continue; |
411 } | 426 } |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
475 result += unquoted[ii]; | 490 result += unquoted[ii]; |
476 } | 491 } |
477 | 492 |
478 return result; | 493 return result; |
479 } | 494 } |
480 | 495 |
481 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { | 496 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { |
482 if (prog_ == NULL) | 497 if (prog_ == NULL) |
483 return false; | 498 return false; |
484 | 499 |
485 int n = prefix_.size(); | 500 int n = static_cast<int>(prefix_.size()); |
486 if (n > maxlen) | 501 if (n > maxlen) |
487 n = maxlen; | 502 n = maxlen; |
488 | 503 |
489 // Determine initial min max from prefix_ literal. | 504 // Determine initial min max from prefix_ literal. |
490 string pmin, pmax; | 505 string pmin, pmax; |
491 pmin = prefix_.substr(0, n); | 506 pmin = prefix_.substr(0, n); |
492 pmax = prefix_.substr(0, n); | 507 pmax = prefix_.substr(0, n); |
493 if (prefix_foldcase_) { | 508 if (prefix_foldcase_) { |
494 // prefix is ASCII lowercase; change pmin to uppercase. | 509 // prefix is ASCII lowercase; change pmin to uppercase. |
495 for (int i = 0; i < n; i++) { | 510 for (int i = 0; i < n; i++) { |
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
547 StringPiece* submatch, | 562 StringPiece* submatch, |
548 int nsubmatch) const { | 563 int nsubmatch) const { |
549 if (!ok() || suffix_regexp_ == NULL) { | 564 if (!ok() || suffix_regexp_ == NULL) { |
550 if (options_.log_errors()) | 565 if (options_.log_errors()) |
551 LOG(ERROR) << "Invalid RE2: " << *error_; | 566 LOG(ERROR) << "Invalid RE2: " << *error_; |
552 return false; | 567 return false; |
553 } | 568 } |
554 | 569 |
555 if (startpos < 0 || startpos > endpos || endpos > text.size()) { | 570 if (startpos < 0 || startpos > endpos || endpos > text.size()) { |
556 if (options_.log_errors()) | 571 if (options_.log_errors()) |
557 LOG(ERROR) << "RE2: invalid startpos, endpos pair."; | 572 LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" |
| 573 << "startpos: " << startpos << ", " |
| 574 << "endpos: " << endpos << ", " |
| 575 << "text size: " << text.size() << "]"; |
558 return false; | 576 return false; |
559 } | 577 } |
560 | 578 |
561 StringPiece subtext = text; | 579 StringPiece subtext = text; |
562 subtext.remove_prefix(startpos); | 580 subtext.remove_prefix(startpos); |
563 subtext.remove_suffix(text.size() - endpos); | 581 subtext.remove_suffix(text.size() - endpos); |
564 | 582 |
565 // Use DFAs to find exact location of match, filter out non-matches. | 583 // Use DFAs to find exact location of match, filter out non-matches. |
566 | 584 |
567 // Don't ask for the location if we won't use it. | 585 // Don't ask for the location if we won't use it. |
(...skipping 16 matching lines...) Expand all Loading... |
584 if (prog_->anchor_start() && prog_->anchor_end()) | 602 if (prog_->anchor_start() && prog_->anchor_end()) |
585 re_anchor = ANCHOR_BOTH; | 603 re_anchor = ANCHOR_BOTH; |
586 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) | 604 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) |
587 re_anchor = ANCHOR_START; | 605 re_anchor = ANCHOR_START; |
588 | 606 |
589 // Check for the required prefix, if any. | 607 // Check for the required prefix, if any. |
590 int prefixlen = 0; | 608 int prefixlen = 0; |
591 if (!prefix_.empty()) { | 609 if (!prefix_.empty()) { |
592 if (startpos != 0) | 610 if (startpos != 0) |
593 return false; | 611 return false; |
594 prefixlen = prefix_.size(); | 612 prefixlen = static_cast<int>(prefix_.size()); |
595 if (prefixlen > subtext.size()) | 613 if (prefixlen > subtext.size()) |
596 return false; | 614 return false; |
597 if (prefix_foldcase_) { | 615 if (prefix_foldcase_) { |
598 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) | 616 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
599 return false; | 617 return false; |
600 } else { | 618 } else { |
601 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) | 619 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
602 return false; | 620 return false; |
603 } | 621 } |
604 subtext.remove_prefix(prefixlen); | 622 subtext.remove_prefix(prefixlen); |
(...skipping 220 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
825 } else { | 843 } else { |
826 vec = new StringPiece[nvec]; | 844 vec = new StringPiece[nvec]; |
827 heapvec = vec; | 845 heapvec = vec; |
828 } | 846 } |
829 | 847 |
830 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { | 848 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { |
831 delete[] heapvec; | 849 delete[] heapvec; |
832 return false; | 850 return false; |
833 } | 851 } |
834 | 852 |
835 if(consumed != NULL) | 853 if (consumed != NULL) |
836 *consumed = vec[0].end() - text.begin(); | 854 *consumed = static_cast<int>(vec[0].end() - text.begin()); |
837 | 855 |
838 if (n == 0 || args == NULL) { | 856 if (n == 0 || args == NULL) { |
839 // We are not interested in results | 857 // We are not interested in results |
840 delete[] heapvec; | 858 delete[] heapvec; |
841 return true; | 859 return true; |
842 } | 860 } |
843 | 861 |
844 int ncap = NumberOfCapturingGroups(); | 862 int ncap = NumberOfCapturingGroups(); |
845 if (ncap < n) { | 863 if (ncap < n) { |
846 // RE has fewer capturing groups than number of arg pointers passed in | 864 // RE has fewer capturing groups than number of arg pointers passed in |
847 VLOG(1) << "Asked for " << n << " but only have " << ncap; | 865 VLOG(1) << "Asked for " << n << " but only have " << ncap; |
848 delete[] heapvec; | 866 delete[] heapvec; |
849 return false; | 867 return false; |
850 } | 868 } |
851 | 869 |
852 // If we got here, we must have matched the whole pattern. | 870 // If we got here, we must have matched the whole pattern. |
853 for (int i = 0; i < n; i++) { | 871 for (int i = 0; i < n; i++) { |
854 const StringPiece& s = vec[i+1]; | 872 const StringPiece& s = vec[i+1]; |
855 if (!args[i]->Parse(s.data(), s.size())) { | 873 if (!args[i]->Parse(s.data(), s.size())) { |
856 // TODO: Should we indicate what the error was? | 874 // TODO: Should we indicate what the error was? |
857 VLOG(1) << "Parse error on #" << i << " " << s << " " | 875 VLOG(1) << "Parse error on #" << i << " " << s << " " |
858 » << (void*)s.data() << "/" << s.size(); | 876 << (void*)s.data() << "/" << s.size(); |
859 delete[] heapvec; | 877 delete[] heapvec; |
860 return false; | 878 return false; |
861 } | 879 } |
862 } | 880 } |
863 | 881 |
864 delete[] heapvec; | 882 delete[] heapvec; |
865 return true; | 883 return true; |
866 } | 884 } |
867 | 885 |
868 // Append the "rewrite" string, with backslash subsitutions from "vec", | 886 // Append the "rewrite" string, with backslash subsitutions from "vec", |
869 // to string "out". | 887 // to string "out". |
870 bool RE2::Rewrite(string *out, const StringPiece &rewrite, | 888 bool RE2::Rewrite(string *out, const StringPiece &rewrite, |
871 const StringPiece *vec, int veclen) const { | 889 const StringPiece *vec, int veclen) const { |
872 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | 890 for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
873 s < end; s++) { | 891 s < end; s++) { |
874 int c = *s; | 892 if (*s != '\\') { |
875 if (c == '\\') { | 893 out->push_back(*s); |
876 s++; | 894 continue; |
877 c = (s < end) ? *s : -1; | 895 } |
878 if (isdigit(c)) { | 896 s++; |
879 int n = (c - '0'); | 897 int c = (s < end) ? *s : -1; |
880 if (n >= veclen) { | 898 if (isdigit(c)) { |
881 if (options_.log_errors()) { | 899 int n = (c - '0'); |
882 LOG(ERROR) << "requested group " << n | 900 if (n >= veclen) { |
883 << " in regexp " << rewrite.data(); | 901 if (options_.log_errors()) { |
884 } | 902 LOG(ERROR) << "requested group " << n |
885 return false; | 903 << " in regexp " << rewrite.data(); |
886 } | 904 } |
887 StringPiece snip = vec[n]; | |
888 if (snip.size() > 0) | |
889 out->append(snip.data(), snip.size()); | |
890 } else if (c == '\\') { | |
891 out->push_back('\\'); | |
892 } else { | |
893 if (options_.log_errors()) | |
894 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); | |
895 return false; | 905 return false; |
896 } | 906 } |
| 907 StringPiece snip = vec[n]; |
| 908 if (snip.size() > 0) |
| 909 out->append(snip.data(), snip.size()); |
| 910 } else if (c == '\\') { |
| 911 out->push_back('\\'); |
897 } else { | 912 } else { |
898 out->push_back(c); | 913 if (options_.log_errors()) |
| 914 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); |
| 915 return false; |
899 } | 916 } |
900 } | 917 } |
901 return true; | 918 return true; |
902 } | 919 } |
903 | 920 |
904 // Return the number of capturing subpatterns, or -1 if the | |
905 // regexp wasn't valid on construction. | |
906 int RE2::NumberOfCapturingGroups() const { | |
907 if (suffix_regexp_ == NULL) | |
908 return -1; | |
909 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case" | |
910 " multiple threads end up doing the same work in parallel."); | |
911 if (num_captures_ == -1) | |
912 num_captures_ = suffix_regexp_->NumCaptures(); | |
913 return num_captures_; | |
914 } | |
915 | |
916 // Checks that the rewrite string is well-formed with respect to this | 921 // Checks that the rewrite string is well-formed with respect to this |
917 // regular expression. | 922 // regular expression. |
918 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { | 923 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { |
919 int max_token = -1; | 924 int max_token = -1; |
920 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | 925 for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
921 s < end; s++) { | 926 s < end; s++) { |
922 int c = *s; | 927 int c = *s; |
923 if (c != '\\') { | 928 if (c != '\\') { |
924 continue; | 929 continue; |
925 } | 930 } |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
980 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { | 985 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { |
981 if (n != 1) return false; | 986 if (n != 1) return false; |
982 if (dest == NULL) return true; | 987 if (dest == NULL) return true; |
983 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; | 988 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
984 return true; | 989 return true; |
985 } | 990 } |
986 | 991 |
987 // Largest number spec that we are willing to parse | 992 // Largest number spec that we are willing to parse |
988 static const int kMaxNumberLength = 32; | 993 static const int kMaxNumberLength = 32; |
989 | 994 |
990 // REQUIRES "buf" must have length at least kMaxNumberLength+1 | 995 // REQUIRES "buf" must have length at least nbuf. |
991 // Copies "str" into "buf" and null-terminates. | 996 // Copies "str" into "buf" and null-terminates. |
992 // Overwrites *np with the new length. | 997 // Overwrites *np with the new length. |
993 static const char* TerminateNumber(char* buf, const char* str, int* np) { | 998 static const char* TerminateNumber(char* buf, int nbuf, const char* str, int* np
, |
| 999 bool accept_spaces) { |
994 int n = *np; | 1000 int n = *np; |
995 if (n <= 0) return ""; | 1001 if (n <= 0) return ""; |
996 if (n > 0 && isspace(*str)) { | 1002 if (n > 0 && isspace(*str)) { |
997 // We are less forgiving than the strtoxxx() routines and do not | 1003 // We are less forgiving than the strtoxxx() routines and do not |
998 // allow leading spaces. | 1004 // allow leading spaces. We do allow leading spaces for floats. |
999 return ""; | 1005 if (!accept_spaces) { |
| 1006 return ""; |
| 1007 } |
| 1008 while (n > 0 && isspace(*str)) { |
| 1009 n--; |
| 1010 str++; |
| 1011 } |
1000 } | 1012 } |
1001 | 1013 |
1002 // Although buf has a fixed maximum size, we can still handle | 1014 // Although buf has a fixed maximum size, we can still handle |
1003 // arbitrarily large integers correctly by omitting leading zeros. | 1015 // arbitrarily large integers correctly by omitting leading zeros. |
1004 // (Numbers that are still too long will be out of range.) | 1016 // (Numbers that are still too long will be out of range.) |
1005 // Before deciding whether str is too long, | 1017 // Before deciding whether str is too long, |
1006 // remove leading zeros with s/000+/00/. | 1018 // remove leading zeros with s/000+/00/. |
1007 // Leaving the leading two zeros in place means that | 1019 // Leaving the leading two zeros in place means that |
1008 // we don't change 0000x123 (invalid) into 0x123 (valid). | 1020 // we don't change 0000x123 (invalid) into 0x123 (valid). |
1009 // Skip over leading - before replacing. | 1021 // Skip over leading - before replacing. |
1010 bool neg = false; | 1022 bool neg = false; |
1011 if (n >= 1 && str[0] == '-') { | 1023 if (n >= 1 && str[0] == '-') { |
1012 neg = true; | 1024 neg = true; |
1013 n--; | 1025 n--; |
1014 str++; | 1026 str++; |
1015 } | 1027 } |
1016 | 1028 |
1017 if (n >= 3 && str[0] == '0' && str[1] == '0') { | 1029 if (n >= 3 && str[0] == '0' && str[1] == '0') { |
1018 while (n >= 3 && str[2] == '0') { | 1030 while (n >= 3 && str[2] == '0') { |
1019 n--; | 1031 n--; |
1020 str++; | 1032 str++; |
1021 } | 1033 } |
1022 } | 1034 } |
1023 | 1035 |
1024 if (neg) { // make room in buf for - | 1036 if (neg) { // make room in buf for - |
1025 n++; | 1037 n++; |
1026 str--; | 1038 str--; |
1027 } | 1039 } |
1028 | 1040 |
1029 if (n > kMaxNumberLength) return ""; | 1041 if (n > nbuf-1) return ""; |
1030 | 1042 |
1031 memmove(buf, str, n); | 1043 memmove(buf, str, n); |
1032 if (neg) { | 1044 if (neg) { |
1033 buf[0] = '-'; | 1045 buf[0] = '-'; |
1034 } | 1046 } |
1035 buf[n] = '\0'; | 1047 buf[n] = '\0'; |
1036 *np = n; | 1048 *np = n; |
1037 return buf; | 1049 return buf; |
1038 } | 1050 } |
1039 | 1051 |
1040 bool RE2::Arg::parse_long_radix(const char* str, | 1052 bool RE2::Arg::parse_long_radix(const char* str, |
1041 int n, | 1053 int n, |
1042 void* dest, | 1054 void* dest, |
1043 int radix) { | 1055 int radix) { |
1044 if (n == 0) return false; | 1056 if (n == 0) return false; |
1045 char buf[kMaxNumberLength+1]; | 1057 char buf[kMaxNumberLength+1]; |
1046 str = TerminateNumber(buf, str, &n); | 1058 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1047 char* end; | 1059 char* end; |
1048 errno = 0; | 1060 errno = 0; |
1049 long r = strtol(str, &end, radix); | 1061 long r = strtol(str, &end, radix); |
1050 if (end != str + n) return false; // Leftover junk | 1062 if (end != str + n) return false; // Leftover junk |
1051 if (errno) return false; | 1063 if (errno) return false; |
1052 if (dest == NULL) return true; | 1064 if (dest == NULL) return true; |
1053 *(reinterpret_cast<long*>(dest)) = r; | 1065 *(reinterpret_cast<long*>(dest)) = r; |
1054 return true; | 1066 return true; |
1055 } | 1067 } |
1056 | 1068 |
1057 bool RE2::Arg::parse_ulong_radix(const char* str, | 1069 bool RE2::Arg::parse_ulong_radix(const char* str, |
1058 int n, | 1070 int n, |
1059 void* dest, | 1071 void* dest, |
1060 int radix) { | 1072 int radix) { |
1061 if (n == 0) return false; | 1073 if (n == 0) return false; |
1062 char buf[kMaxNumberLength+1]; | 1074 char buf[kMaxNumberLength+1]; |
1063 str = TerminateNumber(buf, str, &n); | 1075 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1064 if (str[0] == '-') { | 1076 if (str[0] == '-') { |
1065 // strtoul() will silently accept negative numbers and parse | 1077 // strtoul() will silently accept negative numbers and parse |
1066 // them. This module is more strict and treats them as errors. | 1078 // them. This module is more strict and treats them as errors. |
1067 return false; | 1079 return false; |
1068 } | 1080 } |
1069 | 1081 |
1070 char* end; | 1082 char* end; |
1071 errno = 0; | 1083 errno = 0; |
1072 unsigned long r = strtoul(str, &end, radix); | 1084 unsigned long r = strtoul(str, &end, radix); |
1073 if (end != str + n) return false; // Leftover junk | 1085 if (end != str + n) return false; // Leftover junk |
1074 if (errno) return false; | 1086 if (errno) return false; |
1075 if (dest == NULL) return true; | 1087 if (dest == NULL) return true; |
1076 *(reinterpret_cast<unsigned long*>(dest)) = r; | 1088 *(reinterpret_cast<unsigned long*>(dest)) = r; |
1077 return true; | 1089 return true; |
1078 } | 1090 } |
1079 | 1091 |
1080 bool RE2::Arg::parse_short_radix(const char* str, | 1092 bool RE2::Arg::parse_short_radix(const char* str, |
1081 int n, | 1093 int n, |
1082 void* dest, | 1094 void* dest, |
1083 int radix) { | 1095 int radix) { |
1084 long r; | 1096 long r; |
1085 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | 1097 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
1086 if ((short)r != r) return false; // Out of range | 1098 if ((short)r != r) return false; // Out of range |
1087 if (dest == NULL) return true; | 1099 if (dest == NULL) return true; |
1088 *(reinterpret_cast<short*>(dest)) = r; | 1100 *(reinterpret_cast<short*>(dest)) = (short)r; |
1089 return true; | 1101 return true; |
1090 } | 1102 } |
1091 | 1103 |
1092 bool RE2::Arg::parse_ushort_radix(const char* str, | 1104 bool RE2::Arg::parse_ushort_radix(const char* str, |
1093 int n, | 1105 int n, |
1094 void* dest, | 1106 void* dest, |
1095 int radix) { | 1107 int radix) { |
1096 unsigned long r; | 1108 unsigned long r; |
1097 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | 1109 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
1098 if ((ushort)r != r) return false; // Out of range | 1110 if ((ushort)r != r) return false; // Out of range |
1099 if (dest == NULL) return true; | 1111 if (dest == NULL) return true; |
1100 *(reinterpret_cast<unsigned short*>(dest)) = r; | 1112 *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; |
1101 return true; | 1113 return true; |
1102 } | 1114 } |
1103 | 1115 |
1104 bool RE2::Arg::parse_int_radix(const char* str, | 1116 bool RE2::Arg::parse_int_radix(const char* str, |
1105 int n, | 1117 int n, |
1106 void* dest, | 1118 void* dest, |
1107 int radix) { | 1119 int radix) { |
1108 long r; | 1120 long r; |
1109 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | 1121 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
1110 if ((int)r != r) return false; // Out of range | 1122 if ((int)r != r) return false; // Out of range |
1111 if (dest == NULL) return true; | 1123 if (dest == NULL) return true; |
1112 *(reinterpret_cast<int*>(dest)) = r; | 1124 *(reinterpret_cast<int*>(dest)) = r; |
1113 return true; | 1125 return true; |
1114 } | 1126 } |
1115 | 1127 |
1116 bool RE2::Arg::parse_uint_radix(const char* str, | 1128 bool RE2::Arg::parse_uint_radix(const char* str, |
1117 int n, | 1129 int n, |
1118 void* dest, | 1130 void* dest, |
1119 int radix) { | 1131 int radix) { |
1120 unsigned long r; | 1132 unsigned long r; |
1121 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | 1133 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
1122 if ((uint)r != r) return false; // Out of range | 1134 if ((uint)r != r) return false; // Out of range |
1123 if (dest == NULL) return true; | 1135 if (dest == NULL) return true; |
1124 *(reinterpret_cast<unsigned int*>(dest)) = r; | 1136 *(reinterpret_cast<unsigned int*>(dest)) = r; |
1125 return true; | 1137 return true; |
1126 } | 1138 } |
1127 | 1139 |
| 1140 #if RE2_HAVE_LONGLONG |
1128 bool RE2::Arg::parse_longlong_radix(const char* str, | 1141 bool RE2::Arg::parse_longlong_radix(const char* str, |
1129 int n, | 1142 int n, |
1130 void* dest, | 1143 void* dest, |
1131 int radix) { | 1144 int radix) { |
1132 if (n == 0) return false; | 1145 if (n == 0) return false; |
1133 char buf[kMaxNumberLength+1]; | 1146 char buf[kMaxNumberLength+1]; |
1134 str = TerminateNumber(buf, str, &n); | 1147 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1135 char* end; | 1148 char* end; |
1136 errno = 0; | 1149 errno = 0; |
1137 int64 r = strtoll(str, &end, radix); | 1150 int64 r = strtoll(str, &end, radix); |
1138 if (end != str + n) return false; // Leftover junk | 1151 if (end != str + n) return false; // Leftover junk |
1139 if (errno) return false; | 1152 if (errno) return false; |
1140 if (dest == NULL) return true; | 1153 if (dest == NULL) return true; |
1141 *(reinterpret_cast<int64*>(dest)) = r; | 1154 *(reinterpret_cast<int64*>(dest)) = r; |
1142 return true; | 1155 return true; |
1143 } | 1156 } |
1144 | 1157 |
1145 bool RE2::Arg::parse_ulonglong_radix(const char* str, | 1158 bool RE2::Arg::parse_ulonglong_radix(const char* str, |
1146 int n, | 1159 int n, |
1147 void* dest, | 1160 void* dest, |
1148 int radix) { | 1161 int radix) { |
1149 if (n == 0) return false; | 1162 if (n == 0) return false; |
1150 char buf[kMaxNumberLength+1]; | 1163 char buf[kMaxNumberLength+1]; |
1151 str = TerminateNumber(buf, str, &n); | 1164 str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1152 if (str[0] == '-') { | 1165 if (str[0] == '-') { |
1153 // strtoull() will silently accept negative numbers and parse | 1166 // strtoull() will silently accept negative numbers and parse |
1154 // them. This module is more strict and treats them as errors. | 1167 // them. This module is more strict and treats them as errors. |
1155 return false; | 1168 return false; |
1156 } | 1169 } |
1157 char* end; | 1170 char* end; |
1158 errno = 0; | 1171 errno = 0; |
1159 uint64 r = strtoull(str, &end, radix); | 1172 uint64 r = strtoull(str, &end, radix); |
1160 if (end != str + n) return false; // Leftover junk | 1173 if (end != str + n) return false; // Leftover junk |
1161 if (errno) return false; | 1174 if (errno) return false; |
1162 if (dest == NULL) return true; | 1175 if (dest == NULL) return true; |
1163 *(reinterpret_cast<uint64*>(dest)) = r; | 1176 *(reinterpret_cast<uint64*>(dest)) = r; |
1164 return true; | 1177 return true; |
1165 } | 1178 } |
| 1179 #endif |
1166 | 1180 |
1167 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest)
{ | 1181 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest)
{ |
1168 if (n == 0) return false; | 1182 if (n == 0) return false; |
1169 static const int kMaxLength = 200; | 1183 static const int kMaxLength = 200; |
1170 char buf[kMaxLength]; | 1184 char buf[kMaxLength+1]; |
1171 if (n >= kMaxLength) return false; | 1185 str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1172 memcpy(buf, str, n); | 1186 char* end; |
1173 buf[n] = '\0'; | |
1174 errno = 0; | 1187 errno = 0; |
1175 char* end; | |
1176 double r; | 1188 double r; |
1177 if (isfloat) { | 1189 if (isfloat) { |
1178 r = strtof(buf, &end); | 1190 r = strtof(str, &end); |
1179 } else { | 1191 } else { |
1180 r = strtod(buf, &end); | 1192 r = strtod(str, &end); |
1181 } | 1193 } |
1182 if (end != buf + n) return false; // Leftover junk | 1194 if (end != str + n) return false; // Leftover junk |
1183 if (errno) return false; | 1195 if (errno) return false; |
1184 if (dest == NULL) return true; | 1196 if (dest == NULL) return true; |
1185 if (isfloat) { | 1197 if (isfloat) { |
1186 *(reinterpret_cast<float*>(dest)) = r; | 1198 *(reinterpret_cast<float*>(dest)) = (float)r; |
1187 } else { | 1199 } else { |
1188 *(reinterpret_cast<double*>(dest)) = r; | 1200 *(reinterpret_cast<double*>(dest)) = r; |
1189 } | 1201 } |
1190 return true; | 1202 return true; |
1191 } | 1203 } |
1192 | 1204 |
1193 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { | 1205 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { |
1194 return parse_double_float(str, n, false, dest); | 1206 return parse_double_float(str, n, false, dest); |
1195 } | 1207 } |
1196 | 1208 |
(...skipping 21 matching lines...) Expand all Loading... |
1218 DEFINE_INTEGER_PARSERS(int); | 1230 DEFINE_INTEGER_PARSERS(int); |
1219 DEFINE_INTEGER_PARSERS(uint); | 1231 DEFINE_INTEGER_PARSERS(uint); |
1220 DEFINE_INTEGER_PARSERS(long); | 1232 DEFINE_INTEGER_PARSERS(long); |
1221 DEFINE_INTEGER_PARSERS(ulong); | 1233 DEFINE_INTEGER_PARSERS(ulong); |
1222 DEFINE_INTEGER_PARSERS(longlong); | 1234 DEFINE_INTEGER_PARSERS(longlong); |
1223 DEFINE_INTEGER_PARSERS(ulonglong); | 1235 DEFINE_INTEGER_PARSERS(ulonglong); |
1224 | 1236 |
1225 #undef DEFINE_INTEGER_PARSERS | 1237 #undef DEFINE_INTEGER_PARSERS |
1226 | 1238 |
1227 } // namespace re2 | 1239 } // namespace re2 |
OLD | NEW |