| OLD | NEW |
| (Empty) |
| 1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Regular expression interface RE2. | |
| 6 // | |
| 7 // Originally the PCRE C++ wrapper, but adapted to use | |
| 8 // the new automata-based regular expression engines. | |
| 9 | |
| 10 #include "re2/re2.h" | |
| 11 | |
| 12 #include <stdio.h> | |
| 13 #include <string> | |
| 14 #include <errno.h> | |
| 15 #include "util/util.h" | |
| 16 #include "util/flags.h" | |
| 17 #include "util/sparse_array.h" | |
| 18 #include "re2/prog.h" | |
| 19 #include "re2/regexp.h" | |
| 20 | |
| 21 DEFINE_bool(trace_re2, false, "trace RE2 execution"); | |
| 22 | |
| 23 namespace re2 { | |
| 24 | |
| 25 // Maximum number of args we can set | |
| 26 static const int kMaxArgs = 16; | |
| 27 static const int kVecSize = 1+kMaxArgs; | |
| 28 | |
| 29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful
lMatchN> RE2::FullMatch = {}; | |
| 30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par
tialMatchN> RE2::PartialMatch = {}; | |
| 31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN>
RE2::Consume = {}; | |
| 32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo
nsumeN> RE2::FindAndConsume = {}; | |
| 33 | |
| 34 // This will trigger LNK2005 error in MSVC. | |
| 35 #ifndef _MSC_VER | |
| 36 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h | |
| 37 #endif | |
| 38 | |
| 39 RE2::Options::Options(RE2::CannedOptions opt) | |
| 40 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), | |
| 41 posix_syntax_(opt == RE2::POSIX), | |
| 42 longest_match_(opt == RE2::POSIX), | |
| 43 log_errors_(opt != RE2::Quiet), | |
| 44 max_mem_(kDefaultMaxMem), | |
| 45 literal_(false), | |
| 46 never_nl_(false), | |
| 47 dot_nl_(false), | |
| 48 never_capture_(false), | |
| 49 case_sensitive_(true), | |
| 50 perl_classes_(false), | |
| 51 word_boundary_(false), | |
| 52 one_line_(false) { | |
| 53 } | |
| 54 | |
| 55 // static empty things for use as const references. | |
| 56 // To avoid global constructors, initialized on demand. | |
| 57 GLOBAL_MUTEX(empty_mutex); | |
| 58 static const string *empty_string; | |
| 59 static const map<string, int> *empty_named_groups; | |
| 60 static const map<int, string> *empty_group_names; | |
| 61 | |
| 62 static void InitEmpty() { | |
| 63 GLOBAL_MUTEX_LOCK(empty_mutex); | |
| 64 if (empty_string == NULL) { | |
| 65 empty_string = new string; | |
| 66 empty_named_groups = new map<string, int>; | |
| 67 empty_group_names = new map<int, string>; | |
| 68 } | |
| 69 GLOBAL_MUTEX_UNLOCK(empty_mutex); | |
| 70 } | |
| 71 | |
| 72 // Converts from Regexp error code to RE2 error code. | |
| 73 // Maybe some day they will diverge. In any event, this | |
| 74 // hides the existence of Regexp from RE2 users. | |
| 75 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { | |
| 76 switch (code) { | |
| 77 case re2::kRegexpSuccess: | |
| 78 return RE2::NoError; | |
| 79 case re2::kRegexpInternalError: | |
| 80 return RE2::ErrorInternal; | |
| 81 case re2::kRegexpBadEscape: | |
| 82 return RE2::ErrorBadEscape; | |
| 83 case re2::kRegexpBadCharClass: | |
| 84 return RE2::ErrorBadCharClass; | |
| 85 case re2::kRegexpBadCharRange: | |
| 86 return RE2::ErrorBadCharRange; | |
| 87 case re2::kRegexpMissingBracket: | |
| 88 return RE2::ErrorMissingBracket; | |
| 89 case re2::kRegexpMissingParen: | |
| 90 return RE2::ErrorMissingParen; | |
| 91 case re2::kRegexpTrailingBackslash: | |
| 92 return RE2::ErrorTrailingBackslash; | |
| 93 case re2::kRegexpRepeatArgument: | |
| 94 return RE2::ErrorRepeatArgument; | |
| 95 case re2::kRegexpRepeatSize: | |
| 96 return RE2::ErrorRepeatSize; | |
| 97 case re2::kRegexpRepeatOp: | |
| 98 return RE2::ErrorRepeatOp; | |
| 99 case re2::kRegexpBadPerlOp: | |
| 100 return RE2::ErrorBadPerlOp; | |
| 101 case re2::kRegexpBadUTF8: | |
| 102 return RE2::ErrorBadUTF8; | |
| 103 case re2::kRegexpBadNamedCapture: | |
| 104 return RE2::ErrorBadNamedCapture; | |
| 105 } | |
| 106 return RE2::ErrorInternal; | |
| 107 } | |
| 108 | |
| 109 static string trunc(const StringPiece& pattern) { | |
| 110 if (pattern.size() < 100) | |
| 111 return pattern.as_string(); | |
| 112 return pattern.substr(0, 100).as_string() + "..."; | |
| 113 } | |
| 114 | |
| 115 | |
| 116 RE2::RE2(const char* pattern) { | |
| 117 Init(pattern, DefaultOptions); | |
| 118 } | |
| 119 | |
| 120 RE2::RE2(const string& pattern) { | |
| 121 Init(pattern, DefaultOptions); | |
| 122 } | |
| 123 | |
| 124 RE2::RE2(const StringPiece& pattern) { | |
| 125 Init(pattern, DefaultOptions); | |
| 126 } | |
| 127 | |
| 128 RE2::RE2(const StringPiece& pattern, const Options& options) { | |
| 129 Init(pattern, options); | |
| 130 } | |
| 131 | |
| 132 int RE2::Options::ParseFlags() const { | |
| 133 int flags = Regexp::ClassNL; | |
| 134 switch (encoding()) { | |
| 135 default: | |
| 136 if (log_errors()) | |
| 137 LOG(ERROR) << "Unknown encoding " << encoding(); | |
| 138 break; | |
| 139 case RE2::Options::EncodingUTF8: | |
| 140 break; | |
| 141 case RE2::Options::EncodingLatin1: | |
| 142 flags |= Regexp::Latin1; | |
| 143 break; | |
| 144 } | |
| 145 | |
| 146 if (!posix_syntax()) | |
| 147 flags |= Regexp::LikePerl; | |
| 148 | |
| 149 if (literal()) | |
| 150 flags |= Regexp::Literal; | |
| 151 | |
| 152 if (never_nl()) | |
| 153 flags |= Regexp::NeverNL; | |
| 154 | |
| 155 if (dot_nl()) | |
| 156 flags |= Regexp::DotNL; | |
| 157 | |
| 158 if (never_capture()) | |
| 159 flags |= Regexp::NeverCapture; | |
| 160 | |
| 161 if (!case_sensitive()) | |
| 162 flags |= Regexp::FoldCase; | |
| 163 | |
| 164 if (perl_classes()) | |
| 165 flags |= Regexp::PerlClasses; | |
| 166 | |
| 167 if (word_boundary()) | |
| 168 flags |= Regexp::PerlB; | |
| 169 | |
| 170 if (one_line()) | |
| 171 flags |= Regexp::OneLine; | |
| 172 | |
| 173 return flags; | |
| 174 } | |
| 175 | |
| 176 void RE2::Init(const StringPiece& pattern, const Options& options) { | |
| 177 mutex_ = new Mutex; | |
| 178 pattern_ = pattern.as_string(); | |
| 179 options_.Copy(options); | |
| 180 InitEmpty(); | |
| 181 error_ = empty_string; | |
| 182 error_code_ = NoError; | |
| 183 suffix_regexp_ = NULL; | |
| 184 entire_regexp_ = NULL; | |
| 185 prog_ = NULL; | |
| 186 rprog_ = NULL; | |
| 187 named_groups_ = NULL; | |
| 188 group_names_ = NULL; | |
| 189 num_captures_ = -1; | |
| 190 | |
| 191 RegexpStatus status; | |
| 192 entire_regexp_ = Regexp::Parse( | |
| 193 pattern_, | |
| 194 static_cast<Regexp::ParseFlags>(options_.ParseFlags()), | |
| 195 &status); | |
| 196 if (entire_regexp_ == NULL) { | |
| 197 if (error_ == empty_string) | |
| 198 error_ = new string(status.Text()); | |
| 199 if (options_.log_errors()) { | |
| 200 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " | |
| 201 << status.Text(); | |
| 202 } | |
| 203 error_arg_ = status.error_arg().as_string(); | |
| 204 error_code_ = RegexpErrorToRE2(status.code()); | |
| 205 return; | |
| 206 } | |
| 207 | |
| 208 prefix_.clear(); | |
| 209 prefix_foldcase_ = false; | |
| 210 re2::Regexp* suffix; | |
| 211 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) | |
| 212 suffix_regexp_ = suffix; | |
| 213 else | |
| 214 suffix_regexp_ = entire_regexp_->Incref(); | |
| 215 | |
| 216 // Two thirds of the memory goes to the forward Prog, | |
| 217 // one third to the reverse prog, because the forward | |
| 218 // Prog has two DFAs but the reverse prog has one. | |
| 219 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); | |
| 220 if (prog_ == NULL) { | |
| 221 if (options_.log_errors()) | |
| 222 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; | |
| 223 error_ = new string("pattern too large - compile failed"); | |
| 224 error_code_ = RE2::ErrorPatternTooLarge; | |
| 225 return; | |
| 226 } | |
| 227 | |
| 228 // Could delay this until the first match call that | |
| 229 // cares about submatch information, but the one-pass | |
| 230 // machine's memory gets cut from the DFA memory budget, | |
| 231 // and that is harder to do if the DFA has already | |
| 232 // been built. | |
| 233 is_one_pass_ = prog_->IsOnePass(); | |
| 234 } | |
| 235 | |
| 236 // Returns rprog_, computing it if needed. | |
| 237 re2::Prog* RE2::ReverseProg() const { | |
| 238 MutexLock l(mutex_); | |
| 239 if (rprog_ == NULL && error_ == empty_string) { | |
| 240 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); | |
| 241 if (rprog_ == NULL) { | |
| 242 if (options_.log_errors()) | |
| 243 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; | |
| 244 error_ = new string("pattern too large - reverse compile failed"); | |
| 245 error_code_ = RE2::ErrorPatternTooLarge; | |
| 246 return NULL; | |
| 247 } | |
| 248 } | |
| 249 return rprog_; | |
| 250 } | |
| 251 | |
| 252 RE2::~RE2() { | |
| 253 if (suffix_regexp_) | |
| 254 suffix_regexp_->Decref(); | |
| 255 if (entire_regexp_) | |
| 256 entire_regexp_->Decref(); | |
| 257 delete mutex_; | |
| 258 delete prog_; | |
| 259 delete rprog_; | |
| 260 if (error_ != empty_string) | |
| 261 delete error_; | |
| 262 if (named_groups_ != NULL && named_groups_ != empty_named_groups) | |
| 263 delete named_groups_; | |
| 264 if (group_names_ != NULL && group_names_ != empty_group_names) | |
| 265 delete group_names_; | |
| 266 } | |
| 267 | |
| 268 int RE2::ProgramSize() const { | |
| 269 if (prog_ == NULL) | |
| 270 return -1; | |
| 271 return prog_->size(); | |
| 272 } | |
| 273 | |
| 274 int RE2::ProgramFanout(map<int, int>* histogram) const { | |
| 275 if (prog_ == NULL) | |
| 276 return -1; | |
| 277 SparseArray<int> fanout(prog_->size()); | |
| 278 prog_->Fanout(&fanout); | |
| 279 histogram->clear(); | |
| 280 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { | |
| 281 // TODO(junyer): Optimise this? | |
| 282 int bucket = 0; | |
| 283 while (1 << bucket < i->second) { | |
| 284 bucket++; | |
| 285 } | |
| 286 (*histogram)[bucket]++; | |
| 287 } | |
| 288 return histogram->rbegin()->first; | |
| 289 } | |
| 290 | |
| 291 // Returns num_captures_, computing it if needed, or -1 if the | |
| 292 // regexp wasn't valid on construction. | |
| 293 int RE2::NumberOfCapturingGroups() const { | |
| 294 MutexLock l(mutex_); | |
| 295 if (suffix_regexp_ == NULL) | |
| 296 return -1; | |
| 297 if (num_captures_ == -1) | |
| 298 num_captures_ = suffix_regexp_->NumCaptures(); | |
| 299 return num_captures_; | |
| 300 } | |
| 301 | |
| 302 // Returns named_groups_, computing it if needed. | |
| 303 const map<string, int>& RE2::NamedCapturingGroups() const { | |
| 304 MutexLock l(mutex_); | |
| 305 if (!ok()) | |
| 306 return *empty_named_groups; | |
| 307 if (named_groups_ == NULL) { | |
| 308 named_groups_ = suffix_regexp_->NamedCaptures(); | |
| 309 if (named_groups_ == NULL) | |
| 310 named_groups_ = empty_named_groups; | |
| 311 } | |
| 312 return *named_groups_; | |
| 313 } | |
| 314 | |
| 315 // Returns group_names_, computing it if needed. | |
| 316 const map<int, string>& RE2::CapturingGroupNames() const { | |
| 317 MutexLock l(mutex_); | |
| 318 if (!ok()) | |
| 319 return *empty_group_names; | |
| 320 if (group_names_ == NULL) { | |
| 321 group_names_ = suffix_regexp_->CaptureNames(); | |
| 322 if (group_names_ == NULL) | |
| 323 group_names_ = empty_group_names; | |
| 324 } | |
| 325 return *group_names_; | |
| 326 } | |
| 327 | |
| 328 /***** Convenience interfaces *****/ | |
| 329 | |
| 330 bool RE2::FullMatchN(const StringPiece& text, const RE2& re, | |
| 331 const Arg* const args[], int n) { | |
| 332 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); | |
| 333 } | |
| 334 | |
| 335 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, | |
| 336 const Arg* const args[], int n) { | |
| 337 return re.DoMatch(text, UNANCHORED, NULL, args, n); | |
| 338 } | |
| 339 | |
| 340 bool RE2::ConsumeN(StringPiece* input, const RE2& re, | |
| 341 const Arg* const args[], int n) { | |
| 342 int consumed; | |
| 343 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { | |
| 344 input->remove_prefix(consumed); | |
| 345 return true; | |
| 346 } else { | |
| 347 return false; | |
| 348 } | |
| 349 } | |
| 350 | |
| 351 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, | |
| 352 const Arg* const args[], int n) { | |
| 353 int consumed; | |
| 354 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { | |
| 355 input->remove_prefix(consumed); | |
| 356 return true; | |
| 357 } else { | |
| 358 return false; | |
| 359 } | |
| 360 } | |
| 361 | |
| 362 // Returns the maximum submatch needed for the rewrite to be done by Replace(). | |
| 363 // E.g. if rewrite == "foo \\2,\\1", returns 2. | |
| 364 int RE2::MaxSubmatch(const StringPiece& rewrite) { | |
| 365 int max = 0; | |
| 366 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | |
| 367 s < end; s++) { | |
| 368 if (*s == '\\') { | |
| 369 s++; | |
| 370 int c = (s < end) ? *s : -1; | |
| 371 if (isdigit(c)) { | |
| 372 int n = (c - '0'); | |
| 373 if (n > max) | |
| 374 max = n; | |
| 375 } | |
| 376 } | |
| 377 } | |
| 378 return max; | |
| 379 } | |
| 380 | |
| 381 bool RE2::Replace(string *str, | |
| 382 const RE2& re, | |
| 383 const StringPiece& rewrite) { | |
| 384 StringPiece vec[kVecSize]; | |
| 385 int nvec = 1 + MaxSubmatch(rewrite); | |
| 386 if (nvec > arraysize(vec)) | |
| 387 return false; | |
| 388 if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec)) | |
| 389 return false; | |
| 390 | |
| 391 string s; | |
| 392 if (!re.Rewrite(&s, rewrite, vec, nvec)) | |
| 393 return false; | |
| 394 | |
| 395 assert(vec[0].begin() >= str->data()); | |
| 396 assert(vec[0].end() <= str->data()+str->size()); | |
| 397 str->replace(vec[0].data() - str->data(), vec[0].size(), s); | |
| 398 return true; | |
| 399 } | |
| 400 | |
| 401 int RE2::GlobalReplace(string *str, | |
| 402 const RE2& re, | |
| 403 const StringPiece& rewrite) { | |
| 404 StringPiece vec[kVecSize]; | |
| 405 int nvec = 1 + MaxSubmatch(rewrite); | |
| 406 if (nvec > arraysize(vec)) | |
| 407 return false; | |
| 408 | |
| 409 const char* p = str->data(); | |
| 410 const char* ep = p + str->size(); | |
| 411 const char* lastend = NULL; | |
| 412 string out; | |
| 413 int count = 0; | |
| 414 while (p <= ep) { | |
| 415 if (!re.Match(*str, static_cast<int>(p - str->data()), | |
| 416 static_cast<int>(str->size()), UNANCHORED, vec, nvec)) | |
| 417 break; | |
| 418 if (p < vec[0].begin()) | |
| 419 out.append(p, vec[0].begin() - p); | |
| 420 if (vec[0].begin() == lastend && vec[0].size() == 0) { | |
| 421 // Disallow empty match at end of last match: skip ahead. | |
| 422 if (p < ep) | |
| 423 out.append(p, 1); | |
| 424 p++; | |
| 425 continue; | |
| 426 } | |
| 427 re.Rewrite(&out, rewrite, vec, nvec); | |
| 428 p = vec[0].end(); | |
| 429 lastend = p; | |
| 430 count++; | |
| 431 } | |
| 432 | |
| 433 if (count == 0) | |
| 434 return 0; | |
| 435 | |
| 436 if (p < ep) | |
| 437 out.append(p, ep - p); | |
| 438 swap(out, *str); | |
| 439 return count; | |
| 440 } | |
| 441 | |
| 442 bool RE2::Extract(const StringPiece &text, | |
| 443 const RE2& re, | |
| 444 const StringPiece &rewrite, | |
| 445 string *out) { | |
| 446 StringPiece vec[kVecSize]; | |
| 447 int nvec = 1 + MaxSubmatch(rewrite); | |
| 448 if (nvec > arraysize(vec)) | |
| 449 return false; | |
| 450 | |
| 451 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) | |
| 452 return false; | |
| 453 | |
| 454 out->clear(); | |
| 455 return re.Rewrite(out, rewrite, vec, nvec); | |
| 456 } | |
| 457 | |
| 458 string RE2::QuoteMeta(const StringPiece& unquoted) { | |
| 459 string result; | |
| 460 result.reserve(unquoted.size() << 1); | |
| 461 | |
| 462 // Escape any ascii character not in [A-Za-z_0-9]. | |
| 463 // | |
| 464 // Note that it's legal to escape a character even if it has no | |
| 465 // special meaning in a regular expression -- so this function does | |
| 466 // that. (This also makes it identical to the perl function of the | |
| 467 // same name except for the null-character special case; | |
| 468 // see `perldoc -f quotemeta`.) | |
| 469 for (int ii = 0; ii < unquoted.length(); ++ii) { | |
| 470 // Note that using 'isalnum' here raises the benchmark time from | |
| 471 // 32ns to 58ns: | |
| 472 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && | |
| 473 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && | |
| 474 (unquoted[ii] < '0' || unquoted[ii] > '9') && | |
| 475 unquoted[ii] != '_' && | |
| 476 // If this is the part of a UTF8 or Latin1 character, we need | |
| 477 // to copy this byte without escaping. Experimentally this is | |
| 478 // what works correctly with the regexp library. | |
| 479 !(unquoted[ii] & 128)) { | |
| 480 if (unquoted[ii] == '\0') { // Special handling for null chars. | |
| 481 // Note that this special handling is not strictly required for RE2, | |
| 482 // but this quoting is required for other regexp libraries such as | |
| 483 // PCRE. | |
| 484 // Can't use "\\0" since the next character might be a digit. | |
| 485 result += "\\x00"; | |
| 486 continue; | |
| 487 } | |
| 488 result += '\\'; | |
| 489 } | |
| 490 result += unquoted[ii]; | |
| 491 } | |
| 492 | |
| 493 return result; | |
| 494 } | |
| 495 | |
| 496 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { | |
| 497 if (prog_ == NULL) | |
| 498 return false; | |
| 499 | |
| 500 int n = static_cast<int>(prefix_.size()); | |
| 501 if (n > maxlen) | |
| 502 n = maxlen; | |
| 503 | |
| 504 // Determine initial min max from prefix_ literal. | |
| 505 string pmin, pmax; | |
| 506 pmin = prefix_.substr(0, n); | |
| 507 pmax = prefix_.substr(0, n); | |
| 508 if (prefix_foldcase_) { | |
| 509 // prefix is ASCII lowercase; change pmin to uppercase. | |
| 510 for (int i = 0; i < n; i++) { | |
| 511 if ('a' <= pmin[i] && pmin[i] <= 'z') | |
| 512 pmin[i] += 'A' - 'a'; | |
| 513 } | |
| 514 } | |
| 515 | |
| 516 // Add to prefix min max using PossibleMatchRange on regexp. | |
| 517 string dmin, dmax; | |
| 518 maxlen -= n; | |
| 519 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { | |
| 520 pmin += dmin; | |
| 521 pmax += dmax; | |
| 522 } else if (pmax.size() > 0) { | |
| 523 // prog_->PossibleMatchRange has failed us, | |
| 524 // but we still have useful information from prefix_. | |
| 525 // Round up pmax to allow any possible suffix. | |
| 526 pmax = PrefixSuccessor(pmax); | |
| 527 } else { | |
| 528 // Nothing useful. | |
| 529 *min = ""; | |
| 530 *max = ""; | |
| 531 return false; | |
| 532 } | |
| 533 | |
| 534 *min = pmin; | |
| 535 *max = pmax; | |
| 536 return true; | |
| 537 } | |
| 538 | |
| 539 // Avoid possible locale nonsense in standard strcasecmp. | |
| 540 // The string a is known to be all lowercase. | |
| 541 static int ascii_strcasecmp(const char* a, const char* b, int len) { | |
| 542 const char *ae = a + len; | |
| 543 | |
| 544 for (; a < ae; a++, b++) { | |
| 545 uint8 x = *a; | |
| 546 uint8 y = *b; | |
| 547 if ('A' <= y && y <= 'Z') | |
| 548 y += 'a' - 'A'; | |
| 549 if (x != y) | |
| 550 return x - y; | |
| 551 } | |
| 552 return 0; | |
| 553 } | |
| 554 | |
| 555 | |
| 556 /***** Actual matching and rewriting code *****/ | |
| 557 | |
| 558 bool RE2::Match(const StringPiece& text, | |
| 559 int startpos, | |
| 560 int endpos, | |
| 561 Anchor re_anchor, | |
| 562 StringPiece* submatch, | |
| 563 int nsubmatch) const { | |
| 564 if (!ok() || suffix_regexp_ == NULL) { | |
| 565 if (options_.log_errors()) | |
| 566 LOG(ERROR) << "Invalid RE2: " << *error_; | |
| 567 return false; | |
| 568 } | |
| 569 | |
| 570 if (startpos < 0 || startpos > endpos || endpos > text.size()) { | |
| 571 if (options_.log_errors()) | |
| 572 LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" | |
| 573 << "startpos: " << startpos << ", " | |
| 574 << "endpos: " << endpos << ", " | |
| 575 << "text size: " << text.size() << "]"; | |
| 576 return false; | |
| 577 } | |
| 578 | |
| 579 StringPiece subtext = text; | |
| 580 subtext.remove_prefix(startpos); | |
| 581 subtext.remove_suffix(text.size() - endpos); | |
| 582 | |
| 583 // Use DFAs to find exact location of match, filter out non-matches. | |
| 584 | |
| 585 // Don't ask for the location if we won't use it. | |
| 586 // SearchDFA can do extra optimizations in that case. | |
| 587 StringPiece match; | |
| 588 StringPiece* matchp = &match; | |
| 589 if (nsubmatch == 0) | |
| 590 matchp = NULL; | |
| 591 | |
| 592 int ncap = 1 + NumberOfCapturingGroups(); | |
| 593 if (ncap > nsubmatch) | |
| 594 ncap = nsubmatch; | |
| 595 | |
| 596 // If the regexp is anchored explicitly, must not be in middle of text. | |
| 597 if (prog_->anchor_start() && startpos != 0) | |
| 598 return false; | |
| 599 | |
| 600 // If the regexp is anchored explicitly, update re_anchor | |
| 601 // so that we can potentially fall into a faster case below. | |
| 602 if (prog_->anchor_start() && prog_->anchor_end()) | |
| 603 re_anchor = ANCHOR_BOTH; | |
| 604 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) | |
| 605 re_anchor = ANCHOR_START; | |
| 606 | |
| 607 // Check for the required prefix, if any. | |
| 608 int prefixlen = 0; | |
| 609 if (!prefix_.empty()) { | |
| 610 if (startpos != 0) | |
| 611 return false; | |
| 612 prefixlen = static_cast<int>(prefix_.size()); | |
| 613 if (prefixlen > subtext.size()) | |
| 614 return false; | |
| 615 if (prefix_foldcase_) { | |
| 616 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) | |
| 617 return false; | |
| 618 } else { | |
| 619 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) | |
| 620 return false; | |
| 621 } | |
| 622 subtext.remove_prefix(prefixlen); | |
| 623 // If there is a required prefix, the anchor must be at least ANCHOR_START. | |
| 624 if (re_anchor != ANCHOR_BOTH) | |
| 625 re_anchor = ANCHOR_START; | |
| 626 } | |
| 627 | |
| 628 Prog::Anchor anchor = Prog::kUnanchored; | |
| 629 Prog::MatchKind kind = Prog::kFirstMatch; | |
| 630 if (options_.longest_match()) | |
| 631 kind = Prog::kLongestMatch; | |
| 632 bool skipped_test = false; | |
| 633 | |
| 634 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); | |
| 635 | |
| 636 // SearchBitState allocates a bit vector of size prog_->size() * text.size(). | |
| 637 // It also allocates a stack of 3-word structures which could potentially | |
| 638 // grow as large as prog_->size() * text.size() but in practice is much | |
| 639 // smaller. | |
| 640 // Conditions for using SearchBitState: | |
| 641 const int MaxBitStateProg = 500; // prog_->size() <= Max. | |
| 642 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits) | |
| 643 bool can_bit_state = prog_->size() <= MaxBitStateProg; | |
| 644 int bit_state_text_max = MaxBitStateVector / prog_->size(); | |
| 645 | |
| 646 bool dfa_failed = false; | |
| 647 switch (re_anchor) { | |
| 648 default: | |
| 649 case UNANCHORED: { | |
| 650 if (!prog_->SearchDFA(subtext, text, anchor, kind, | |
| 651 matchp, &dfa_failed, NULL)) { | |
| 652 if (dfa_failed) { | |
| 653 // Fall back to NFA below. | |
| 654 skipped_test = true; | |
| 655 if (FLAGS_trace_re2) | |
| 656 LOG(INFO) << "Match " << trunc(pattern_) | |
| 657 << " [" << CEscape(subtext) << "]" | |
| 658 << " DFA failed."; | |
| 659 break; | |
| 660 } | |
| 661 if (FLAGS_trace_re2) | |
| 662 LOG(INFO) << "Match " << trunc(pattern_) | |
| 663 << " [" << CEscape(subtext) << "]" | |
| 664 << " used DFA - no match."; | |
| 665 return false; | |
| 666 } | |
| 667 if (FLAGS_trace_re2) | |
| 668 LOG(INFO) << "Match " << trunc(pattern_) | |
| 669 << " [" << CEscape(subtext) << "]" | |
| 670 << " used DFA - match"; | |
| 671 if (matchp == NULL) // Matched. Don't care where | |
| 672 return true; | |
| 673 // SearchDFA set match[0].end() but didn't know where the | |
| 674 // match started. Run the regexp backward from match[0].end() | |
| 675 // to find the longest possible match -- that's where it started. | |
| 676 Prog* prog = ReverseProg(); | |
| 677 if (prog == NULL) | |
| 678 return false; | |
| 679 if (!prog->SearchDFA(match, text, Prog::kAnchored, | |
| 680 Prog::kLongestMatch, &match, &dfa_failed, NULL)) { | |
| 681 if (dfa_failed) { | |
| 682 // Fall back to NFA below. | |
| 683 skipped_test = true; | |
| 684 if (FLAGS_trace_re2) | |
| 685 LOG(INFO) << "Match " << trunc(pattern_) | |
| 686 << " [" << CEscape(subtext) << "]" | |
| 687 << " reverse DFA failed."; | |
| 688 break; | |
| 689 } | |
| 690 if (FLAGS_trace_re2) | |
| 691 LOG(INFO) << "Match " << trunc(pattern_) | |
| 692 << " [" << CEscape(subtext) << "]" | |
| 693 << " DFA inconsistency."; | |
| 694 if (options_.log_errors()) | |
| 695 LOG(ERROR) << "DFA inconsistency"; | |
| 696 return false; | |
| 697 } | |
| 698 if (FLAGS_trace_re2) | |
| 699 LOG(INFO) << "Match " << trunc(pattern_) | |
| 700 << " [" << CEscape(subtext) << "]" | |
| 701 << " used reverse DFA."; | |
| 702 break; | |
| 703 } | |
| 704 | |
| 705 case ANCHOR_BOTH: | |
| 706 case ANCHOR_START: | |
| 707 if (re_anchor == ANCHOR_BOTH) | |
| 708 kind = Prog::kFullMatch; | |
| 709 anchor = Prog::kAnchored; | |
| 710 | |
| 711 // If only a small amount of text and need submatch | |
| 712 // information anyway and we're going to use OnePass or BitState | |
| 713 // to get it, we might as well not even bother with the DFA: | |
| 714 // OnePass or BitState will be fast enough. | |
| 715 // On tiny texts, OnePass outruns even the DFA, and | |
| 716 // it doesn't have the shared state and occasional mutex that | |
| 717 // the DFA does. | |
| 718 if (can_one_pass && text.size() <= 4096 && | |
| 719 (ncap > 1 || text.size() <= 8)) { | |
| 720 if (FLAGS_trace_re2) | |
| 721 LOG(INFO) << "Match " << trunc(pattern_) | |
| 722 << " [" << CEscape(subtext) << "]" | |
| 723 << " skipping DFA for OnePass."; | |
| 724 skipped_test = true; | |
| 725 break; | |
| 726 } | |
| 727 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { | |
| 728 if (FLAGS_trace_re2) | |
| 729 LOG(INFO) << "Match " << trunc(pattern_) | |
| 730 << " [" << CEscape(subtext) << "]" | |
| 731 << " skipping DFA for BitState."; | |
| 732 skipped_test = true; | |
| 733 break; | |
| 734 } | |
| 735 if (!prog_->SearchDFA(subtext, text, anchor, kind, | |
| 736 &match, &dfa_failed, NULL)) { | |
| 737 if (dfa_failed) { | |
| 738 if (FLAGS_trace_re2) | |
| 739 LOG(INFO) << "Match " << trunc(pattern_) | |
| 740 << " [" << CEscape(subtext) << "]" | |
| 741 << " DFA failed."; | |
| 742 skipped_test = true; | |
| 743 break; | |
| 744 } | |
| 745 if (FLAGS_trace_re2) | |
| 746 LOG(INFO) << "Match " << trunc(pattern_) | |
| 747 << " [" << CEscape(subtext) << "]" | |
| 748 << " used DFA - no match."; | |
| 749 return false; | |
| 750 } | |
| 751 break; | |
| 752 } | |
| 753 | |
| 754 if (!skipped_test && ncap <= 1) { | |
| 755 // We know exactly where it matches. That's enough. | |
| 756 if (ncap == 1) | |
| 757 submatch[0] = match; | |
| 758 } else { | |
| 759 StringPiece subtext1; | |
| 760 if (skipped_test) { | |
| 761 // DFA ran out of memory or was skipped: | |
| 762 // need to search in entire original text. | |
| 763 subtext1 = subtext; | |
| 764 } else { | |
| 765 // DFA found the exact match location: | |
| 766 // let NFA run an anchored, full match search | |
| 767 // to find submatch locations. | |
| 768 subtext1 = match; | |
| 769 anchor = Prog::kAnchored; | |
| 770 kind = Prog::kFullMatch; | |
| 771 } | |
| 772 | |
| 773 if (can_one_pass && anchor != Prog::kUnanchored) { | |
| 774 if (FLAGS_trace_re2) | |
| 775 LOG(INFO) << "Match " << trunc(pattern_) | |
| 776 << " [" << CEscape(subtext) << "]" | |
| 777 << " using OnePass."; | |
| 778 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { | |
| 779 if (!skipped_test && options_.log_errors()) | |
| 780 LOG(ERROR) << "SearchOnePass inconsistency"; | |
| 781 return false; | |
| 782 } | |
| 783 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { | |
| 784 if (FLAGS_trace_re2) | |
| 785 LOG(INFO) << "Match " << trunc(pattern_) | |
| 786 << " [" << CEscape(subtext) << "]" | |
| 787 << " using BitState."; | |
| 788 if (!prog_->SearchBitState(subtext1, text, anchor, | |
| 789 kind, submatch, ncap)) { | |
| 790 if (!skipped_test && options_.log_errors()) | |
| 791 LOG(ERROR) << "SearchBitState inconsistency"; | |
| 792 return false; | |
| 793 } | |
| 794 } else { | |
| 795 if (FLAGS_trace_re2) | |
| 796 LOG(INFO) << "Match " << trunc(pattern_) | |
| 797 << " [" << CEscape(subtext) << "]" | |
| 798 << " using NFA."; | |
| 799 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { | |
| 800 if (!skipped_test && options_.log_errors()) | |
| 801 LOG(ERROR) << "SearchNFA inconsistency"; | |
| 802 return false; | |
| 803 } | |
| 804 } | |
| 805 } | |
| 806 | |
| 807 // Adjust overall match for required prefix that we stripped off. | |
| 808 if (prefixlen > 0 && nsubmatch > 0) | |
| 809 submatch[0] = StringPiece(submatch[0].begin() - prefixlen, | |
| 810 submatch[0].size() + prefixlen); | |
| 811 | |
| 812 // Zero submatches that don't exist in the regexp. | |
| 813 for (int i = ncap; i < nsubmatch; i++) | |
| 814 submatch[i] = NULL; | |
| 815 return true; | |
| 816 } | |
| 817 | |
| 818 // Internal matcher - like Match() but takes Args not StringPieces. | |
| 819 bool RE2::DoMatch(const StringPiece& text, | |
| 820 Anchor anchor, | |
| 821 int* consumed, | |
| 822 const Arg* const* args, | |
| 823 int n) const { | |
| 824 if (!ok()) { | |
| 825 if (options_.log_errors()) | |
| 826 LOG(ERROR) << "Invalid RE2: " << *error_; | |
| 827 return false; | |
| 828 } | |
| 829 | |
| 830 // Count number of capture groups needed. | |
| 831 int nvec; | |
| 832 if (n == 0 && consumed == NULL) | |
| 833 nvec = 0; | |
| 834 else | |
| 835 nvec = n+1; | |
| 836 | |
| 837 StringPiece* vec; | |
| 838 StringPiece stkvec[kVecSize]; | |
| 839 StringPiece* heapvec = NULL; | |
| 840 | |
| 841 if (nvec <= arraysize(stkvec)) { | |
| 842 vec = stkvec; | |
| 843 } else { | |
| 844 vec = new StringPiece[nvec]; | |
| 845 heapvec = vec; | |
| 846 } | |
| 847 | |
| 848 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { | |
| 849 delete[] heapvec; | |
| 850 return false; | |
| 851 } | |
| 852 | |
| 853 if (consumed != NULL) | |
| 854 *consumed = static_cast<int>(vec[0].end() - text.begin()); | |
| 855 | |
| 856 if (n == 0 || args == NULL) { | |
| 857 // We are not interested in results | |
| 858 delete[] heapvec; | |
| 859 return true; | |
| 860 } | |
| 861 | |
| 862 int ncap = NumberOfCapturingGroups(); | |
| 863 if (ncap < n) { | |
| 864 // RE has fewer capturing groups than number of arg pointers passed in | |
| 865 VLOG(1) << "Asked for " << n << " but only have " << ncap; | |
| 866 delete[] heapvec; | |
| 867 return false; | |
| 868 } | |
| 869 | |
| 870 // If we got here, we must have matched the whole pattern. | |
| 871 for (int i = 0; i < n; i++) { | |
| 872 const StringPiece& s = vec[i+1]; | |
| 873 if (!args[i]->Parse(s.data(), s.size())) { | |
| 874 // TODO: Should we indicate what the error was? | |
| 875 VLOG(1) << "Parse error on #" << i << " " << s << " " | |
| 876 << (void*)s.data() << "/" << s.size(); | |
| 877 delete[] heapvec; | |
| 878 return false; | |
| 879 } | |
| 880 } | |
| 881 | |
| 882 delete[] heapvec; | |
| 883 return true; | |
| 884 } | |
| 885 | |
| 886 // Append the "rewrite" string, with backslash subsitutions from "vec", | |
| 887 // to string "out". | |
| 888 bool RE2::Rewrite(string *out, const StringPiece &rewrite, | |
| 889 const StringPiece *vec, int veclen) const { | |
| 890 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | |
| 891 s < end; s++) { | |
| 892 if (*s != '\\') { | |
| 893 out->push_back(*s); | |
| 894 continue; | |
| 895 } | |
| 896 s++; | |
| 897 int c = (s < end) ? *s : -1; | |
| 898 if (isdigit(c)) { | |
| 899 int n = (c - '0'); | |
| 900 if (n >= veclen) { | |
| 901 if (options_.log_errors()) { | |
| 902 LOG(ERROR) << "requested group " << n | |
| 903 << " in regexp " << rewrite.data(); | |
| 904 } | |
| 905 return false; | |
| 906 } | |
| 907 StringPiece snip = vec[n]; | |
| 908 if (snip.size() > 0) | |
| 909 out->append(snip.data(), snip.size()); | |
| 910 } else if (c == '\\') { | |
| 911 out->push_back('\\'); | |
| 912 } else { | |
| 913 if (options_.log_errors()) | |
| 914 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); | |
| 915 return false; | |
| 916 } | |
| 917 } | |
| 918 return true; | |
| 919 } | |
| 920 | |
| 921 // Checks that the rewrite string is well-formed with respect to this | |
| 922 // regular expression. | |
| 923 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { | |
| 924 int max_token = -1; | |
| 925 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | |
| 926 s < end; s++) { | |
| 927 int c = *s; | |
| 928 if (c != '\\') { | |
| 929 continue; | |
| 930 } | |
| 931 if (++s == end) { | |
| 932 *error = "Rewrite schema error: '\\' not allowed at end."; | |
| 933 return false; | |
| 934 } | |
| 935 c = *s; | |
| 936 if (c == '\\') { | |
| 937 continue; | |
| 938 } | |
| 939 if (!isdigit(c)) { | |
| 940 *error = "Rewrite schema error: " | |
| 941 "'\\' must be followed by a digit or '\\'."; | |
| 942 return false; | |
| 943 } | |
| 944 int n = (c - '0'); | |
| 945 if (max_token < n) { | |
| 946 max_token = n; | |
| 947 } | |
| 948 } | |
| 949 | |
| 950 if (max_token > NumberOfCapturingGroups()) { | |
| 951 SStringPrintf(error, "Rewrite schema requests %d matches, " | |
| 952 "but the regexp only has %d parenthesized subexpressions.", | |
| 953 max_token, NumberOfCapturingGroups()); | |
| 954 return false; | |
| 955 } | |
| 956 return true; | |
| 957 } | |
| 958 | |
| 959 /***** Parsers for various types *****/ | |
| 960 | |
| 961 bool RE2::Arg::parse_null(const char* str, int n, void* dest) { | |
| 962 // We fail if somebody asked us to store into a non-NULL void* pointer | |
| 963 return (dest == NULL); | |
| 964 } | |
| 965 | |
| 966 bool RE2::Arg::parse_string(const char* str, int n, void* dest) { | |
| 967 if (dest == NULL) return true; | |
| 968 reinterpret_cast<string*>(dest)->assign(str, n); | |
| 969 return true; | |
| 970 } | |
| 971 | |
| 972 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) { | |
| 973 if (dest == NULL) return true; | |
| 974 reinterpret_cast<StringPiece*>(dest)->set(str, n); | |
| 975 return true; | |
| 976 } | |
| 977 | |
| 978 bool RE2::Arg::parse_char(const char* str, int n, void* dest) { | |
| 979 if (n != 1) return false; | |
| 980 if (dest == NULL) return true; | |
| 981 *(reinterpret_cast<char*>(dest)) = str[0]; | |
| 982 return true; | |
| 983 } | |
| 984 | |
| 985 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { | |
| 986 if (n != 1) return false; | |
| 987 if (dest == NULL) return true; | |
| 988 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; | |
| 989 return true; | |
| 990 } | |
| 991 | |
| 992 // Largest number spec that we are willing to parse | |
| 993 static const int kMaxNumberLength = 32; | |
| 994 | |
| 995 // REQUIRES "buf" must have length at least nbuf. | |
| 996 // Copies "str" into "buf" and null-terminates. | |
| 997 // Overwrites *np with the new length. | |
| 998 static const char* TerminateNumber(char* buf, int nbuf, const char* str, int* np
, | |
| 999 bool accept_spaces) { | |
| 1000 int n = *np; | |
| 1001 if (n <= 0) return ""; | |
| 1002 if (n > 0 && isspace(*str)) { | |
| 1003 // We are less forgiving than the strtoxxx() routines and do not | |
| 1004 // allow leading spaces. We do allow leading spaces for floats. | |
| 1005 if (!accept_spaces) { | |
| 1006 return ""; | |
| 1007 } | |
| 1008 while (n > 0 && isspace(*str)) { | |
| 1009 n--; | |
| 1010 str++; | |
| 1011 } | |
| 1012 } | |
| 1013 | |
| 1014 // Although buf has a fixed maximum size, we can still handle | |
| 1015 // arbitrarily large integers correctly by omitting leading zeros. | |
| 1016 // (Numbers that are still too long will be out of range.) | |
| 1017 // Before deciding whether str is too long, | |
| 1018 // remove leading zeros with s/000+/00/. | |
| 1019 // Leaving the leading two zeros in place means that | |
| 1020 // we don't change 0000x123 (invalid) into 0x123 (valid). | |
| 1021 // Skip over leading - before replacing. | |
| 1022 bool neg = false; | |
| 1023 if (n >= 1 && str[0] == '-') { | |
| 1024 neg = true; | |
| 1025 n--; | |
| 1026 str++; | |
| 1027 } | |
| 1028 | |
| 1029 if (n >= 3 && str[0] == '0' && str[1] == '0') { | |
| 1030 while (n >= 3 && str[2] == '0') { | |
| 1031 n--; | |
| 1032 str++; | |
| 1033 } | |
| 1034 } | |
| 1035 | |
| 1036 if (neg) { // make room in buf for - | |
| 1037 n++; | |
| 1038 str--; | |
| 1039 } | |
| 1040 | |
| 1041 if (n > nbuf-1) return ""; | |
| 1042 | |
| 1043 memmove(buf, str, n); | |
| 1044 if (neg) { | |
| 1045 buf[0] = '-'; | |
| 1046 } | |
| 1047 buf[n] = '\0'; | |
| 1048 *np = n; | |
| 1049 return buf; | |
| 1050 } | |
| 1051 | |
| 1052 bool RE2::Arg::parse_long_radix(const char* str, | |
| 1053 int n, | |
| 1054 void* dest, | |
| 1055 int radix) { | |
| 1056 if (n == 0) return false; | |
| 1057 char buf[kMaxNumberLength+1]; | |
| 1058 str = TerminateNumber(buf, sizeof buf, str, &n, false); | |
| 1059 char* end; | |
| 1060 errno = 0; | |
| 1061 long r = strtol(str, &end, radix); | |
| 1062 if (end != str + n) return false; // Leftover junk | |
| 1063 if (errno) return false; | |
| 1064 if (dest == NULL) return true; | |
| 1065 *(reinterpret_cast<long*>(dest)) = r; | |
| 1066 return true; | |
| 1067 } | |
| 1068 | |
| 1069 bool RE2::Arg::parse_ulong_radix(const char* str, | |
| 1070 int n, | |
| 1071 void* dest, | |
| 1072 int radix) { | |
| 1073 if (n == 0) return false; | |
| 1074 char buf[kMaxNumberLength+1]; | |
| 1075 str = TerminateNumber(buf, sizeof buf, str, &n, false); | |
| 1076 if (str[0] == '-') { | |
| 1077 // strtoul() will silently accept negative numbers and parse | |
| 1078 // them. This module is more strict and treats them as errors. | |
| 1079 return false; | |
| 1080 } | |
| 1081 | |
| 1082 char* end; | |
| 1083 errno = 0; | |
| 1084 unsigned long r = strtoul(str, &end, radix); | |
| 1085 if (end != str + n) return false; // Leftover junk | |
| 1086 if (errno) return false; | |
| 1087 if (dest == NULL) return true; | |
| 1088 *(reinterpret_cast<unsigned long*>(dest)) = r; | |
| 1089 return true; | |
| 1090 } | |
| 1091 | |
| 1092 bool RE2::Arg::parse_short_radix(const char* str, | |
| 1093 int n, | |
| 1094 void* dest, | |
| 1095 int radix) { | |
| 1096 long r; | |
| 1097 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | |
| 1098 if ((short)r != r) return false; // Out of range | |
| 1099 if (dest == NULL) return true; | |
| 1100 *(reinterpret_cast<short*>(dest)) = (short)r; | |
| 1101 return true; | |
| 1102 } | |
| 1103 | |
| 1104 bool RE2::Arg::parse_ushort_radix(const char* str, | |
| 1105 int n, | |
| 1106 void* dest, | |
| 1107 int radix) { | |
| 1108 unsigned long r; | |
| 1109 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | |
| 1110 if ((ushort)r != r) return false; // Out of range | |
| 1111 if (dest == NULL) return true; | |
| 1112 *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; | |
| 1113 return true; | |
| 1114 } | |
| 1115 | |
| 1116 bool RE2::Arg::parse_int_radix(const char* str, | |
| 1117 int n, | |
| 1118 void* dest, | |
| 1119 int radix) { | |
| 1120 long r; | |
| 1121 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | |
| 1122 if ((int)r != r) return false; // Out of range | |
| 1123 if (dest == NULL) return true; | |
| 1124 *(reinterpret_cast<int*>(dest)) = r; | |
| 1125 return true; | |
| 1126 } | |
| 1127 | |
| 1128 bool RE2::Arg::parse_uint_radix(const char* str, | |
| 1129 int n, | |
| 1130 void* dest, | |
| 1131 int radix) { | |
| 1132 unsigned long r; | |
| 1133 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | |
| 1134 if ((uint)r != r) return false; // Out of range | |
| 1135 if (dest == NULL) return true; | |
| 1136 *(reinterpret_cast<unsigned int*>(dest)) = r; | |
| 1137 return true; | |
| 1138 } | |
| 1139 | |
| 1140 #if RE2_HAVE_LONGLONG | |
| 1141 bool RE2::Arg::parse_longlong_radix(const char* str, | |
| 1142 int n, | |
| 1143 void* dest, | |
| 1144 int radix) { | |
| 1145 if (n == 0) return false; | |
| 1146 char buf[kMaxNumberLength+1]; | |
| 1147 str = TerminateNumber(buf, sizeof buf, str, &n, false); | |
| 1148 char* end; | |
| 1149 errno = 0; | |
| 1150 int64 r = strtoll(str, &end, radix); | |
| 1151 if (end != str + n) return false; // Leftover junk | |
| 1152 if (errno) return false; | |
| 1153 if (dest == NULL) return true; | |
| 1154 *(reinterpret_cast<int64*>(dest)) = r; | |
| 1155 return true; | |
| 1156 } | |
| 1157 | |
| 1158 bool RE2::Arg::parse_ulonglong_radix(const char* str, | |
| 1159 int n, | |
| 1160 void* dest, | |
| 1161 int radix) { | |
| 1162 if (n == 0) return false; | |
| 1163 char buf[kMaxNumberLength+1]; | |
| 1164 str = TerminateNumber(buf, sizeof buf, str, &n, false); | |
| 1165 if (str[0] == '-') { | |
| 1166 // strtoull() will silently accept negative numbers and parse | |
| 1167 // them. This module is more strict and treats them as errors. | |
| 1168 return false; | |
| 1169 } | |
| 1170 char* end; | |
| 1171 errno = 0; | |
| 1172 uint64 r = strtoull(str, &end, radix); | |
| 1173 if (end != str + n) return false; // Leftover junk | |
| 1174 if (errno) return false; | |
| 1175 if (dest == NULL) return true; | |
| 1176 *(reinterpret_cast<uint64*>(dest)) = r; | |
| 1177 return true; | |
| 1178 } | |
| 1179 #endif | |
| 1180 | |
| 1181 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest)
{ | |
| 1182 if (n == 0) return false; | |
| 1183 static const int kMaxLength = 200; | |
| 1184 char buf[kMaxLength+1]; | |
| 1185 str = TerminateNumber(buf, sizeof buf, str, &n, true); | |
| 1186 char* end; | |
| 1187 errno = 0; | |
| 1188 double r; | |
| 1189 if (isfloat) { | |
| 1190 r = strtof(str, &end); | |
| 1191 } else { | |
| 1192 r = strtod(str, &end); | |
| 1193 } | |
| 1194 if (end != str + n) return false; // Leftover junk | |
| 1195 if (errno) return false; | |
| 1196 if (dest == NULL) return true; | |
| 1197 if (isfloat) { | |
| 1198 *(reinterpret_cast<float*>(dest)) = (float)r; | |
| 1199 } else { | |
| 1200 *(reinterpret_cast<double*>(dest)) = r; | |
| 1201 } | |
| 1202 return true; | |
| 1203 } | |
| 1204 | |
| 1205 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { | |
| 1206 return parse_double_float(str, n, false, dest); | |
| 1207 } | |
| 1208 | |
| 1209 bool RE2::Arg::parse_float(const char* str, int n, void* dest) { | |
| 1210 return parse_double_float(str, n, true, dest); | |
| 1211 } | |
| 1212 | |
| 1213 | |
| 1214 #define DEFINE_INTEGER_PARSERS(name) \ | |
| 1215 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ | |
| 1216 return parse_##name##_radix(str, n, dest, 10); \ | |
| 1217 } \ | |
| 1218 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ | |
| 1219 return parse_##name##_radix(str, n, dest, 16); \ | |
| 1220 } \ | |
| 1221 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ | |
| 1222 return parse_##name##_radix(str, n, dest, 8); \ | |
| 1223 } \ | |
| 1224 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ | |
| 1225 return parse_##name##_radix(str, n, dest, 0); \ | |
| 1226 } | |
| 1227 | |
| 1228 DEFINE_INTEGER_PARSERS(short); | |
| 1229 DEFINE_INTEGER_PARSERS(ushort); | |
| 1230 DEFINE_INTEGER_PARSERS(int); | |
| 1231 DEFINE_INTEGER_PARSERS(uint); | |
| 1232 DEFINE_INTEGER_PARSERS(long); | |
| 1233 DEFINE_INTEGER_PARSERS(ulong); | |
| 1234 DEFINE_INTEGER_PARSERS(longlong); | |
| 1235 DEFINE_INTEGER_PARSERS(ulonglong); | |
| 1236 | |
| 1237 #undef DEFINE_INTEGER_PARSERS | |
| 1238 | |
| 1239 } // namespace re2 | |
| OLD | NEW |