| OLD | NEW |
| (Empty) |
| 1 // Copyright 2003-2009 Google Inc. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google. | |
| 6 // The main changes are the addition of the HitLimit method and | |
| 7 // compilation as PCRE in namespace re2. | |
| 8 | |
| 9 #include <errno.h> | |
| 10 #include <limits> | |
| 11 #include "util/util.h" | |
| 12 #include "util/flags.h" | |
| 13 #include "util/pcre.h" | |
| 14 | |
| 15 #define PCREPORT(level) LOG(level) | |
| 16 | |
| 17 // Default PCRE limits. | |
| 18 // Defaults chosen to allow a plausible amount of CPU and | |
| 19 // not exceed main thread stacks. Note that other threads | |
| 20 // often have smaller stacks, and therefore tightening | |
| 21 // regexp_stack_limit may frequently be necessary. | |
| 22 DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); | |
| 23 DEFINE_int32(regexp_match_limit, 1000000, | |
| 24 "default PCRE match limit (function calls)"); | |
| 25 | |
| 26 #ifndef USEPCRE | |
| 27 | |
| 28 // Fake just enough of the PCRE API to allow this file to build. :) | |
| 29 | |
| 30 struct pcre_extra { | |
| 31 int flags; | |
| 32 int match_limit; | |
| 33 int match_limit_recursion; | |
| 34 }; | |
| 35 | |
| 36 #define PCRE_EXTRA_MATCH_LIMIT 0 | |
| 37 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 | |
| 38 #define PCRE_ANCHORED 0 | |
| 39 #define PCRE_NOTEMPTY 0 | |
| 40 #define PCRE_ERROR_NOMATCH 1 | |
| 41 #define PCRE_ERROR_MATCHLIMIT 2 | |
| 42 #define PCRE_ERROR_RECURSIONLIMIT 3 | |
| 43 #define PCRE_INFO_CAPTURECOUNT 0 | |
| 44 | |
| 45 void pcre_free(void*) { | |
| 46 } | |
| 47 | |
| 48 pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { | |
| 49 return NULL; | |
| 50 } | |
| 51 | |
| 52 int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*,
int) { | |
| 53 return 0; | |
| 54 } | |
| 55 | |
| 56 int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { | |
| 57 return 0; | |
| 58 } | |
| 59 | |
| 60 #endif | |
| 61 | |
| 62 namespace re2 { | |
| 63 | |
| 64 // Maximum number of args we can set | |
| 65 static const int kMaxArgs = 16; | |
| 66 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace | |
| 67 | |
| 68 // Approximate size of a recursive invocation of PCRE's | |
| 69 // internal "match()" frame. This varies depending on the | |
| 70 // compiler and architecture, of course, so the constant is | |
| 71 // just a conservative estimate. To find the exact number, | |
| 72 // run regexp_unittest with --regexp_stack_limit=0 under | |
| 73 // a debugger and look at the frames when it crashes. | |
| 74 // The exact frame size was 656 in production on 2008/02/03. | |
| 75 static const int kPCREFrameSize = 700; | |
| 76 | |
| 77 // Special name for missing C++ arguments. | |
| 78 PCRE::Arg PCRE::no_more_args((void*)NULL); | |
| 79 | |
| 80 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; | |
| 81 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; | |
| 82 const PCRE::ConsumeFunctor PCRE::Consume = { }; | |
| 83 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; | |
| 84 | |
| 85 // If a regular expression has no error, its error_ field points here | |
| 86 static const string empty_string; | |
| 87 | |
| 88 void PCRE::Init(const char* pattern, Option options, int match_limit, | |
| 89 int stack_limit, bool report_errors) { | |
| 90 pattern_ = pattern; | |
| 91 options_ = options; | |
| 92 match_limit_ = match_limit; | |
| 93 stack_limit_ = stack_limit; | |
| 94 hit_limit_ = false; | |
| 95 error_ = &empty_string; | |
| 96 report_errors_ = report_errors; | |
| 97 re_full_ = NULL; | |
| 98 re_partial_ = NULL; | |
| 99 | |
| 100 if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { | |
| 101 error_ = new string("illegal regexp option"); | |
| 102 PCREPORT(ERROR) | |
| 103 << "Error compiling '" << pattern << "': illegal regexp option"; | |
| 104 } else { | |
| 105 re_partial_ = Compile(UNANCHORED); | |
| 106 if (re_partial_ != NULL) { | |
| 107 re_full_ = Compile(ANCHOR_BOTH); | |
| 108 } | |
| 109 } | |
| 110 } | |
| 111 | |
| 112 PCRE::PCRE(const char* pattern) { | |
| 113 Init(pattern, None, 0, 0, true); | |
| 114 } | |
| 115 PCRE::PCRE(const char* pattern, Option option) { | |
| 116 Init(pattern, option, 0, 0, true); | |
| 117 } | |
| 118 PCRE::PCRE(const string& pattern) { | |
| 119 Init(pattern.c_str(), None, 0, 0, true); | |
| 120 } | |
| 121 PCRE::PCRE(const string& pattern, Option option) { | |
| 122 Init(pattern.c_str(), option, 0, 0, true); | |
| 123 } | |
| 124 PCRE::PCRE(const string& pattern, const PCRE_Options& re_option) { | |
| 125 Init(pattern.c_str(), re_option.option(), re_option.match_limit(), | |
| 126 re_option.stack_limit(), re_option.report_errors()); | |
| 127 } | |
| 128 | |
| 129 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { | |
| 130 Init(pattern, re_option.option(), re_option.match_limit(), | |
| 131 re_option.stack_limit(), re_option.report_errors()); | |
| 132 } | |
| 133 | |
| 134 PCRE::~PCRE() { | |
| 135 if (re_full_ != NULL) pcre_free(re_full_); | |
| 136 if (re_partial_ != NULL) pcre_free(re_partial_); | |
| 137 if (error_ != &empty_string) delete error_; | |
| 138 } | |
| 139 | |
| 140 pcre* PCRE::Compile(Anchor anchor) { | |
| 141 // Special treatment for anchoring. This is needed because at | |
| 142 // runtime pcre only provides an option for anchoring at the | |
| 143 // beginning of a string. | |
| 144 // | |
| 145 // There are three types of anchoring we want: | |
| 146 // UNANCHORED Compile the original pattern, and use | |
| 147 // a pcre unanchored match. | |
| 148 // ANCHOR_START Compile the original pattern, and use | |
| 149 // a pcre anchored match. | |
| 150 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern | |
| 151 // and use a pcre anchored match. | |
| 152 | |
| 153 const char* error = ""; | |
| 154 int eoffset; | |
| 155 pcre* re; | |
| 156 if (anchor != ANCHOR_BOTH) { | |
| 157 re = pcre_compile(pattern_.c_str(), | |
| 158 (options_ & EnabledCompileOptions), | |
| 159 &error, &eoffset, NULL); | |
| 160 } else { | |
| 161 // Tack a '\z' at the end of PCRE. Parenthesize it first so that | |
| 162 // the '\z' applies to all top-level alternatives in the regexp. | |
| 163 string wrapped = "(?:"; // A non-counting grouping operator | |
| 164 wrapped += pattern_; | |
| 165 wrapped += ")\\z"; | |
| 166 re = pcre_compile(wrapped.c_str(), | |
| 167 (options_ & EnabledCompileOptions), | |
| 168 &error, &eoffset, NULL); | |
| 169 } | |
| 170 if (re == NULL) { | |
| 171 if (error_ == &empty_string) error_ = new string(error); | |
| 172 PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; | |
| 173 } | |
| 174 return re; | |
| 175 } | |
| 176 | |
| 177 /***** Convenience interfaces *****/ | |
| 178 | |
| 179 bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, | |
| 180 const PCRE& re, | |
| 181 const Arg& a0, | |
| 182 const Arg& a1, | |
| 183 const Arg& a2, | |
| 184 const Arg& a3, | |
| 185 const Arg& a4, | |
| 186 const Arg& a5, | |
| 187 const Arg& a6, | |
| 188 const Arg& a7, | |
| 189 const Arg& a8, | |
| 190 const Arg& a9, | |
| 191 const Arg& a10, | |
| 192 const Arg& a11, | |
| 193 const Arg& a12, | |
| 194 const Arg& a13, | |
| 195 const Arg& a14, | |
| 196 const Arg& a15) const { | |
| 197 const Arg* args[kMaxArgs]; | |
| 198 int n = 0; | |
| 199 if (&a0 == &no_more_args) goto done; args[n++] = &a0; | |
| 200 if (&a1 == &no_more_args) goto done; args[n++] = &a1; | |
| 201 if (&a2 == &no_more_args) goto done; args[n++] = &a2; | |
| 202 if (&a3 == &no_more_args) goto done; args[n++] = &a3; | |
| 203 if (&a4 == &no_more_args) goto done; args[n++] = &a4; | |
| 204 if (&a5 == &no_more_args) goto done; args[n++] = &a5; | |
| 205 if (&a6 == &no_more_args) goto done; args[n++] = &a6; | |
| 206 if (&a7 == &no_more_args) goto done; args[n++] = &a7; | |
| 207 if (&a8 == &no_more_args) goto done; args[n++] = &a8; | |
| 208 if (&a9 == &no_more_args) goto done; args[n++] = &a9; | |
| 209 if (&a10 == &no_more_args) goto done; args[n++] = &a10; | |
| 210 if (&a11 == &no_more_args) goto done; args[n++] = &a11; | |
| 211 if (&a12 == &no_more_args) goto done; args[n++] = &a12; | |
| 212 if (&a13 == &no_more_args) goto done; args[n++] = &a13; | |
| 213 if (&a14 == &no_more_args) goto done; args[n++] = &a14; | |
| 214 if (&a15 == &no_more_args) goto done; args[n++] = &a15; | |
| 215 done: | |
| 216 | |
| 217 int consumed; | |
| 218 int vec[kVecSize] = {}; | |
| 219 return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); | |
| 220 } | |
| 221 | |
| 222 bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, | |
| 223 const PCRE& re, | |
| 224 const Arg& a0, | |
| 225 const Arg& a1, | |
| 226 const Arg& a2, | |
| 227 const Arg& a3, | |
| 228 const Arg& a4, | |
| 229 const Arg& a5, | |
| 230 const Arg& a6, | |
| 231 const Arg& a7, | |
| 232 const Arg& a8, | |
| 233 const Arg& a9, | |
| 234 const Arg& a10, | |
| 235 const Arg& a11, | |
| 236 const Arg& a12, | |
| 237 const Arg& a13, | |
| 238 const Arg& a14, | |
| 239 const Arg& a15) const { | |
| 240 const Arg* args[kMaxArgs]; | |
| 241 int n = 0; | |
| 242 if (&a0 == &no_more_args) goto done; args[n++] = &a0; | |
| 243 if (&a1 == &no_more_args) goto done; args[n++] = &a1; | |
| 244 if (&a2 == &no_more_args) goto done; args[n++] = &a2; | |
| 245 if (&a3 == &no_more_args) goto done; args[n++] = &a3; | |
| 246 if (&a4 == &no_more_args) goto done; args[n++] = &a4; | |
| 247 if (&a5 == &no_more_args) goto done; args[n++] = &a5; | |
| 248 if (&a6 == &no_more_args) goto done; args[n++] = &a6; | |
| 249 if (&a7 == &no_more_args) goto done; args[n++] = &a7; | |
| 250 if (&a8 == &no_more_args) goto done; args[n++] = &a8; | |
| 251 if (&a9 == &no_more_args) goto done; args[n++] = &a9; | |
| 252 if (&a10 == &no_more_args) goto done; args[n++] = &a10; | |
| 253 if (&a11 == &no_more_args) goto done; args[n++] = &a11; | |
| 254 if (&a12 == &no_more_args) goto done; args[n++] = &a12; | |
| 255 if (&a13 == &no_more_args) goto done; args[n++] = &a13; | |
| 256 if (&a14 == &no_more_args) goto done; args[n++] = &a14; | |
| 257 if (&a15 == &no_more_args) goto done; args[n++] = &a15; | |
| 258 done: | |
| 259 | |
| 260 int consumed; | |
| 261 int vec[kVecSize] = {}; | |
| 262 return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); | |
| 263 } | |
| 264 | |
| 265 bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, | |
| 266 const PCRE& pattern, | |
| 267 const Arg& a0, | |
| 268 const Arg& a1, | |
| 269 const Arg& a2, | |
| 270 const Arg& a3, | |
| 271 const Arg& a4, | |
| 272 const Arg& a5, | |
| 273 const Arg& a6, | |
| 274 const Arg& a7, | |
| 275 const Arg& a8, | |
| 276 const Arg& a9, | |
| 277 const Arg& a10, | |
| 278 const Arg& a11, | |
| 279 const Arg& a12, | |
| 280 const Arg& a13, | |
| 281 const Arg& a14, | |
| 282 const Arg& a15) const { | |
| 283 const Arg* args[kMaxArgs]; | |
| 284 int n = 0; | |
| 285 if (&a0 == &no_more_args) goto done; args[n++] = &a0; | |
| 286 if (&a1 == &no_more_args) goto done; args[n++] = &a1; | |
| 287 if (&a2 == &no_more_args) goto done; args[n++] = &a2; | |
| 288 if (&a3 == &no_more_args) goto done; args[n++] = &a3; | |
| 289 if (&a4 == &no_more_args) goto done; args[n++] = &a4; | |
| 290 if (&a5 == &no_more_args) goto done; args[n++] = &a5; | |
| 291 if (&a6 == &no_more_args) goto done; args[n++] = &a6; | |
| 292 if (&a7 == &no_more_args) goto done; args[n++] = &a7; | |
| 293 if (&a8 == &no_more_args) goto done; args[n++] = &a8; | |
| 294 if (&a9 == &no_more_args) goto done; args[n++] = &a9; | |
| 295 if (&a10 == &no_more_args) goto done; args[n++] = &a10; | |
| 296 if (&a11 == &no_more_args) goto done; args[n++] = &a11; | |
| 297 if (&a12 == &no_more_args) goto done; args[n++] = &a12; | |
| 298 if (&a13 == &no_more_args) goto done; args[n++] = &a13; | |
| 299 if (&a14 == &no_more_args) goto done; args[n++] = &a14; | |
| 300 if (&a15 == &no_more_args) goto done; args[n++] = &a15; | |
| 301 done: | |
| 302 | |
| 303 int consumed; | |
| 304 int vec[kVecSize] = {}; | |
| 305 if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, | |
| 306 args, n, vec, kVecSize)) { | |
| 307 input->remove_prefix(consumed); | |
| 308 return true; | |
| 309 } else { | |
| 310 return false; | |
| 311 } | |
| 312 } | |
| 313 | |
| 314 bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, | |
| 315 const PCRE& pattern, | |
| 316 const Arg& a0, | |
| 317 const Arg& a1, | |
| 318 const Arg& a2, | |
| 319 const Arg& a3, | |
| 320 const Arg& a4, | |
| 321 const Arg& a5, | |
| 322 const Arg& a6, | |
| 323 const Arg& a7, | |
| 324 const Arg& a8, | |
| 325 const Arg& a9, | |
| 326 const Arg& a10, | |
| 327 const Arg& a11, | |
| 328 const Arg& a12, | |
| 329 const Arg& a13, | |
| 330 const Arg& a14, | |
| 331 const Arg& a15) const { | |
| 332 const Arg* args[kMaxArgs]; | |
| 333 int n = 0; | |
| 334 if (&a0 == &no_more_args) goto done; args[n++] = &a0; | |
| 335 if (&a1 == &no_more_args) goto done; args[n++] = &a1; | |
| 336 if (&a2 == &no_more_args) goto done; args[n++] = &a2; | |
| 337 if (&a3 == &no_more_args) goto done; args[n++] = &a3; | |
| 338 if (&a4 == &no_more_args) goto done; args[n++] = &a4; | |
| 339 if (&a5 == &no_more_args) goto done; args[n++] = &a5; | |
| 340 if (&a6 == &no_more_args) goto done; args[n++] = &a6; | |
| 341 if (&a7 == &no_more_args) goto done; args[n++] = &a7; | |
| 342 if (&a8 == &no_more_args) goto done; args[n++] = &a8; | |
| 343 if (&a9 == &no_more_args) goto done; args[n++] = &a9; | |
| 344 if (&a10 == &no_more_args) goto done; args[n++] = &a10; | |
| 345 if (&a11 == &no_more_args) goto done; args[n++] = &a11; | |
| 346 if (&a12 == &no_more_args) goto done; args[n++] = &a12; | |
| 347 if (&a13 == &no_more_args) goto done; args[n++] = &a13; | |
| 348 if (&a14 == &no_more_args) goto done; args[n++] = &a14; | |
| 349 if (&a15 == &no_more_args) goto done; args[n++] = &a15; | |
| 350 done: | |
| 351 | |
| 352 int consumed; | |
| 353 int vec[kVecSize] = {}; | |
| 354 if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, | |
| 355 args, n, vec, kVecSize)) { | |
| 356 input->remove_prefix(consumed); | |
| 357 return true; | |
| 358 } else { | |
| 359 return false; | |
| 360 } | |
| 361 } | |
| 362 | |
| 363 bool PCRE::Replace(string *str, | |
| 364 const PCRE& pattern, | |
| 365 const StringPiece& rewrite) { | |
| 366 int vec[kVecSize] = {}; | |
| 367 int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); | |
| 368 if (matches == 0) | |
| 369 return false; | |
| 370 | |
| 371 string s; | |
| 372 if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) | |
| 373 return false; | |
| 374 | |
| 375 assert(vec[0] >= 0); | |
| 376 assert(vec[1] >= 0); | |
| 377 str->replace(vec[0], vec[1] - vec[0], s); | |
| 378 return true; | |
| 379 } | |
| 380 | |
| 381 int PCRE::GlobalReplace(string *str, | |
| 382 const PCRE& pattern, | |
| 383 const StringPiece& rewrite) { | |
| 384 int count = 0; | |
| 385 int vec[kVecSize] = {}; | |
| 386 string out; | |
| 387 int start = 0; | |
| 388 bool last_match_was_empty_string = false; | |
| 389 | |
| 390 while (start <= static_cast<int>(str->size())) { | |
| 391 // If the previous match was for the empty string, we shouldn't | |
| 392 // just match again: we'll match in the same way and get an | |
| 393 // infinite loop. Instead, we do the match in a special way: | |
| 394 // anchored -- to force another try at the same position -- | |
| 395 // and with a flag saying that this time, ignore empty matches. | |
| 396 // If this special match returns, that means there's a non-empty | |
| 397 // match at this position as well, and we can continue. If not, | |
| 398 // we do what perl does, and just advance by one. | |
| 399 // Notice that perl prints '@@@' for this; | |
| 400 // perl -le '$_ = "aa"; s/b*|aa/@/g; print' | |
| 401 int matches; | |
| 402 if (last_match_was_empty_string) { | |
| 403 matches = pattern.TryMatch(*str, start, ANCHOR_START, false, | |
| 404 vec, kVecSize); | |
| 405 if (matches <= 0) { | |
| 406 if (start < static_cast<int>(str->size())) | |
| 407 out.push_back((*str)[start]); | |
| 408 start++; | |
| 409 last_match_was_empty_string = false; | |
| 410 continue; | |
| 411 } | |
| 412 } else { | |
| 413 matches = pattern.TryMatch(*str, start, UNANCHORED, true, | |
| 414 vec, kVecSize); | |
| 415 if (matches <= 0) | |
| 416 break; | |
| 417 } | |
| 418 int matchstart = vec[0], matchend = vec[1]; | |
| 419 assert(matchstart >= start); | |
| 420 assert(matchend >= matchstart); | |
| 421 | |
| 422 out.append(*str, start, matchstart - start); | |
| 423 pattern.Rewrite(&out, rewrite, *str, vec, matches); | |
| 424 start = matchend; | |
| 425 count++; | |
| 426 last_match_was_empty_string = (matchstart == matchend); | |
| 427 } | |
| 428 | |
| 429 if (count == 0) | |
| 430 return 0; | |
| 431 | |
| 432 if (start < static_cast<int>(str->size())) | |
| 433 out.append(*str, start, static_cast<int>(str->size()) - start); | |
| 434 swap(out, *str); | |
| 435 return count; | |
| 436 } | |
| 437 | |
| 438 bool PCRE::Extract(const StringPiece &text, | |
| 439 const PCRE& pattern, | |
| 440 const StringPiece &rewrite, | |
| 441 string *out) { | |
| 442 int vec[kVecSize] = {}; | |
| 443 int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); | |
| 444 if (matches == 0) | |
| 445 return false; | |
| 446 out->clear(); | |
| 447 return pattern.Rewrite(out, rewrite, text, vec, matches); | |
| 448 } | |
| 449 | |
| 450 string PCRE::QuoteMeta(const StringPiece& unquoted) { | |
| 451 string result; | |
| 452 result.reserve(unquoted.size() << 1); | |
| 453 | |
| 454 // Escape any ascii character not in [A-Za-z_0-9]. | |
| 455 // | |
| 456 // Note that it's legal to escape a character even if it has no | |
| 457 // special meaning in a regular expression -- so this function does | |
| 458 // that. (This also makes it identical to the perl function of the | |
| 459 // same name except for the null-character special case; | |
| 460 // see `perldoc -f quotemeta`.) | |
| 461 for (int ii = 0; ii < unquoted.length(); ++ii) { | |
| 462 // Note that using 'isalnum' here raises the benchmark time from | |
| 463 // 32ns to 58ns: | |
| 464 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && | |
| 465 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && | |
| 466 (unquoted[ii] < '0' || unquoted[ii] > '9') && | |
| 467 unquoted[ii] != '_' && | |
| 468 // If this is the part of a UTF8 or Latin1 character, we need | |
| 469 // to copy this byte without escaping. Experimentally this is | |
| 470 // what works correctly with the regexp library. | |
| 471 !(unquoted[ii] & 128)) { | |
| 472 if (unquoted[ii] == '\0') { // Special handling for null chars. | |
| 473 // Can't use "\\0" since the next character might be a digit. | |
| 474 result += "\\x00"; | |
| 475 continue; | |
| 476 } | |
| 477 result += '\\'; | |
| 478 } | |
| 479 result += unquoted[ii]; | |
| 480 } | |
| 481 | |
| 482 return result; | |
| 483 } | |
| 484 | |
| 485 /***** Actual matching and rewriting code *****/ | |
| 486 | |
| 487 bool PCRE::HitLimit() { | |
| 488 return hit_limit_ != 0; | |
| 489 } | |
| 490 | |
| 491 void PCRE::ClearHitLimit() { | |
| 492 hit_limit_ = 0; | |
| 493 } | |
| 494 | |
| 495 int PCRE::TryMatch(const StringPiece& text, | |
| 496 int startpos, | |
| 497 Anchor anchor, | |
| 498 bool empty_ok, | |
| 499 int *vec, | |
| 500 int vecsize) const { | |
| 501 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; | |
| 502 if (re == NULL) { | |
| 503 PCREPORT(ERROR) << "Matching against invalid re: " << *error_; | |
| 504 return 0; | |
| 505 } | |
| 506 | |
| 507 int match_limit = match_limit_; | |
| 508 if (match_limit <= 0) { | |
| 509 match_limit = FLAGS_regexp_match_limit; | |
| 510 } | |
| 511 | |
| 512 int stack_limit = stack_limit_; | |
| 513 if (stack_limit <= 0) { | |
| 514 stack_limit = FLAGS_regexp_stack_limit; | |
| 515 } | |
| 516 | |
| 517 pcre_extra extra = { 0 }; | |
| 518 if (match_limit > 0) { | |
| 519 extra.flags |= PCRE_EXTRA_MATCH_LIMIT; | |
| 520 extra.match_limit = match_limit; | |
| 521 } | |
| 522 if (stack_limit > 0) { | |
| 523 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; | |
| 524 extra.match_limit_recursion = stack_limit / kPCREFrameSize; | |
| 525 } | |
| 526 | |
| 527 int options = 0; | |
| 528 if (anchor != UNANCHORED) | |
| 529 options |= PCRE_ANCHORED; | |
| 530 if (!empty_ok) | |
| 531 options |= PCRE_NOTEMPTY; | |
| 532 | |
| 533 int rc = pcre_exec(re, // The regular expression object | |
| 534 &extra, | |
| 535 (text.data() == NULL) ? "" : text.data(), | |
| 536 text.size(), | |
| 537 startpos, | |
| 538 options, | |
| 539 vec, | |
| 540 vecsize); | |
| 541 | |
| 542 // Handle errors | |
| 543 if (rc == 0) { | |
| 544 // pcre_exec() returns 0 as a special case when the number of | |
| 545 // capturing subpatterns exceeds the size of the vector. | |
| 546 // When this happens, there is a match and the output vector | |
| 547 // is filled, but we miss out on the positions of the extra subpatterns. | |
| 548 rc = vecsize / 2; | |
| 549 } else if (rc < 0) { | |
| 550 switch (rc) { | |
| 551 case PCRE_ERROR_NOMATCH: | |
| 552 return 0; | |
| 553 case PCRE_ERROR_MATCHLIMIT: | |
| 554 // Writing to hit_limit is not safe if multiple threads | |
| 555 // are using the PCRE, but the flag is only intended | |
| 556 // for use by unit tests anyway, so we let it go. | |
| 557 hit_limit_ = true; | |
| 558 PCREPORT(WARNING) << "Exceeded match limit of " << match_limit | |
| 559 << " when matching '" << pattern_ << "'" | |
| 560 << " against text that is " << text.size() << " bytes."; | |
| 561 return 0; | |
| 562 case PCRE_ERROR_RECURSIONLIMIT: | |
| 563 // See comment about hit_limit above. | |
| 564 hit_limit_ = true; | |
| 565 PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit | |
| 566 << " when matching '" << pattern_ << "'" | |
| 567 << " against text that is " << text.size() << " bytes."; | |
| 568 return 0; | |
| 569 default: | |
| 570 // There are other return codes from pcre.h : | |
| 571 // PCRE_ERROR_NULL (-2) | |
| 572 // PCRE_ERROR_BADOPTION (-3) | |
| 573 // PCRE_ERROR_BADMAGIC (-4) | |
| 574 // PCRE_ERROR_UNKNOWN_NODE (-5) | |
| 575 // PCRE_ERROR_NOMEMORY (-6) | |
| 576 // PCRE_ERROR_NOSUBSTRING (-7) | |
| 577 // ... | |
| 578 PCREPORT(ERROR) << "Unexpected return code: " << rc | |
| 579 << " when matching '" << pattern_ << "'" | |
| 580 << ", re=" << re | |
| 581 << ", text=" << text | |
| 582 << ", vec=" << vec | |
| 583 << ", vecsize=" << vecsize; | |
| 584 return 0; | |
| 585 } | |
| 586 } | |
| 587 | |
| 588 return rc; | |
| 589 } | |
| 590 | |
| 591 bool PCRE::DoMatchImpl(const StringPiece& text, | |
| 592 Anchor anchor, | |
| 593 int* consumed, | |
| 594 const Arg* const* args, | |
| 595 int n, | |
| 596 int* vec, | |
| 597 int vecsize) const { | |
| 598 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace | |
| 599 int matches = TryMatch(text, 0, anchor, true, vec, vecsize); | |
| 600 assert(matches >= 0); // TryMatch never returns negatives | |
| 601 if (matches == 0) | |
| 602 return false; | |
| 603 | |
| 604 *consumed = vec[1]; | |
| 605 | |
| 606 if (n == 0 || args == NULL) { | |
| 607 // We are not interested in results | |
| 608 return true; | |
| 609 } | |
| 610 if (NumberOfCapturingGroups() < n) { | |
| 611 // PCRE has fewer capturing groups than number of arg pointers passed in | |
| 612 return false; | |
| 613 } | |
| 614 | |
| 615 // If we got here, we must have matched the whole pattern. | |
| 616 // We do not need (can not do) any more checks on the value of 'matches' here | |
| 617 // -- see the comment for TryMatch. | |
| 618 for (int i = 0; i < n; i++) { | |
| 619 const int start = vec[2*(i+1)]; | |
| 620 const int limit = vec[2*(i+1)+1]; | |
| 621 if (!args[i]->Parse(text.data() + start, limit-start)) { | |
| 622 // TODO: Should we indicate what the error was? | |
| 623 return false; | |
| 624 } | |
| 625 } | |
| 626 | |
| 627 return true; | |
| 628 } | |
| 629 | |
| 630 bool PCRE::DoMatch(const StringPiece& text, | |
| 631 Anchor anchor, | |
| 632 int* consumed, | |
| 633 const Arg* const args[], | |
| 634 int n) const { | |
| 635 assert(n >= 0); | |
| 636 const int vecsize = (1 + n) * 3; // results + PCRE workspace | |
| 637 // (as for kVecSize) | |
| 638 int* vec = new int[vecsize]; | |
| 639 bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); | |
| 640 delete[] vec; | |
| 641 return b; | |
| 642 } | |
| 643 | |
| 644 bool PCRE::Rewrite(string *out, const StringPiece &rewrite, | |
| 645 const StringPiece &text, int *vec, int veclen) const { | |
| 646 int number_of_capturing_groups = NumberOfCapturingGroups(); | |
| 647 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | |
| 648 s < end; s++) { | |
| 649 int c = *s; | |
| 650 if (c == '\\') { | |
| 651 c = *++s; | |
| 652 if (isdigit(c)) { | |
| 653 int n = (c - '0'); | |
| 654 if (n >= veclen) { | |
| 655 if (n <= number_of_capturing_groups) { | |
| 656 // unmatched optional capturing group. treat | |
| 657 // its value as empty string; i.e., nothing to append. | |
| 658 } else { | |
| 659 PCREPORT(ERROR) << "requested group " << n | |
| 660 << " in regexp " << rewrite.data(); | |
| 661 return false; | |
| 662 } | |
| 663 } | |
| 664 int start = vec[2 * n]; | |
| 665 if (start >= 0) | |
| 666 out->append(text.data() + start, vec[2 * n + 1] - start); | |
| 667 } else if (c == '\\') { | |
| 668 out->push_back('\\'); | |
| 669 } else { | |
| 670 PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); | |
| 671 return false; | |
| 672 } | |
| 673 } else { | |
| 674 out->push_back(c); | |
| 675 } | |
| 676 } | |
| 677 return true; | |
| 678 } | |
| 679 | |
| 680 bool PCRE::CheckRewriteString(const StringPiece& rewrite, string* error) const { | |
| 681 int max_token = -1; | |
| 682 for (const char *s = rewrite.data(), *end = s + rewrite.size(); | |
| 683 s < end; s++) { | |
| 684 int c = *s; | |
| 685 if (c != '\\') { | |
| 686 continue; | |
| 687 } | |
| 688 if (++s == end) { | |
| 689 *error = "Rewrite schema error: '\\' not allowed at end."; | |
| 690 return false; | |
| 691 } | |
| 692 c = *s; | |
| 693 if (c == '\\') { | |
| 694 continue; | |
| 695 } | |
| 696 if (!isdigit(c)) { | |
| 697 *error = "Rewrite schema error: " | |
| 698 "'\\' must be followed by a digit or '\\'."; | |
| 699 return false; | |
| 700 } | |
| 701 int n = (c - '0'); | |
| 702 if (max_token < n) { | |
| 703 max_token = n; | |
| 704 } | |
| 705 } | |
| 706 | |
| 707 if (max_token > NumberOfCapturingGroups()) { | |
| 708 SStringPrintf(error, "Rewrite schema requests %d matches, " | |
| 709 "but the regexp only has %d parenthesized subexpressions.", | |
| 710 max_token, NumberOfCapturingGroups()); | |
| 711 return false; | |
| 712 } | |
| 713 return true; | |
| 714 } | |
| 715 | |
| 716 | |
| 717 // Return the number of capturing subpatterns, or -1 if the | |
| 718 // regexp wasn't valid on construction. | |
| 719 int PCRE::NumberOfCapturingGroups() const { | |
| 720 if (re_partial_ == NULL) return -1; | |
| 721 | |
| 722 int result; | |
| 723 CHECK(pcre_fullinfo(re_partial_, // The regular expression object | |
| 724 NULL, // We did not study the pattern | |
| 725 PCRE_INFO_CAPTURECOUNT, | |
| 726 &result) == 0); | |
| 727 return result; | |
| 728 } | |
| 729 | |
| 730 | |
| 731 /***** Parsers for various types *****/ | |
| 732 | |
| 733 bool PCRE::Arg::parse_null(const char* str, int n, void* dest) { | |
| 734 // We fail if somebody asked us to store into a non-NULL void* pointer | |
| 735 return (dest == NULL); | |
| 736 } | |
| 737 | |
| 738 bool PCRE::Arg::parse_string(const char* str, int n, void* dest) { | |
| 739 if (dest == NULL) return true; | |
| 740 reinterpret_cast<string*>(dest)->assign(str, n); | |
| 741 return true; | |
| 742 } | |
| 743 | |
| 744 bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) { | |
| 745 if (dest == NULL) return true; | |
| 746 reinterpret_cast<StringPiece*>(dest)->set(str, n); | |
| 747 return true; | |
| 748 } | |
| 749 | |
| 750 bool PCRE::Arg::parse_char(const char* str, int n, void* dest) { | |
| 751 if (n != 1) return false; | |
| 752 if (dest == NULL) return true; | |
| 753 *(reinterpret_cast<char*>(dest)) = str[0]; | |
| 754 return true; | |
| 755 } | |
| 756 | |
| 757 bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) { | |
| 758 if (n != 1) return false; | |
| 759 if (dest == NULL) return true; | |
| 760 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; | |
| 761 return true; | |
| 762 } | |
| 763 | |
| 764 // Largest number spec that we are willing to parse | |
| 765 static const int kMaxNumberLength = 32; | |
| 766 | |
| 767 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 | |
| 768 // PCREQUIPCRES "n > 0" | |
| 769 // Copies "str" into "buf" and null-terminates if necessary. | |
| 770 // Returns one of: | |
| 771 // a. "str" if no termination is needed | |
| 772 // b. "buf" if the string was copied and null-terminated | |
| 773 // c. "" if the input was invalid and has no hope of being parsed | |
| 774 static const char* TerminateNumber(char* buf, const char* str, int n) { | |
| 775 if ((n > 0) && isspace(*str)) { | |
| 776 // We are less forgiving than the strtoxxx() routines and do not | |
| 777 // allow leading spaces. | |
| 778 return ""; | |
| 779 } | |
| 780 | |
| 781 // See if the character right after the input text may potentially | |
| 782 // look like a digit. | |
| 783 if (isdigit(str[n]) || | |
| 784 ((str[n] >= 'a') && (str[n] <= 'f')) || | |
| 785 ((str[n] >= 'A') && (str[n] <= 'F'))) { | |
| 786 if (n > kMaxNumberLength) return ""; // Input too big to be a valid number | |
| 787 memcpy(buf, str, n); | |
| 788 buf[n] = '\0'; | |
| 789 return buf; | |
| 790 } else { | |
| 791 // We can parse right out of the supplied string, so return it. | |
| 792 return str; | |
| 793 } | |
| 794 } | |
| 795 | |
| 796 bool PCRE::Arg::parse_long_radix(const char* str, | |
| 797 int n, | |
| 798 void* dest, | |
| 799 int radix) { | |
| 800 if (n == 0) return false; | |
| 801 char buf[kMaxNumberLength+1]; | |
| 802 str = TerminateNumber(buf, str, n); | |
| 803 char* end; | |
| 804 errno = 0; | |
| 805 long r = strtol(str, &end, radix); | |
| 806 if (end != str + n) return false; // Leftover junk | |
| 807 if (errno) return false; | |
| 808 if (dest == NULL) return true; | |
| 809 *(reinterpret_cast<long*>(dest)) = r; | |
| 810 return true; | |
| 811 } | |
| 812 | |
| 813 bool PCRE::Arg::parse_ulong_radix(const char* str, | |
| 814 int n, | |
| 815 void* dest, | |
| 816 int radix) { | |
| 817 if (n == 0) return false; | |
| 818 char buf[kMaxNumberLength+1]; | |
| 819 str = TerminateNumber(buf, str, n); | |
| 820 if (str[0] == '-') { | |
| 821 // strtoul() will silently accept negative numbers and parse | |
| 822 // them. This module is more strict and treats them as errors. | |
| 823 return false; | |
| 824 } | |
| 825 | |
| 826 char* end; | |
| 827 errno = 0; | |
| 828 unsigned long r = strtoul(str, &end, radix); | |
| 829 if (end != str + n) return false; // Leftover junk | |
| 830 if (errno) return false; | |
| 831 if (dest == NULL) return true; | |
| 832 *(reinterpret_cast<unsigned long*>(dest)) = r; | |
| 833 return true; | |
| 834 } | |
| 835 | |
| 836 bool PCRE::Arg::parse_short_radix(const char* str, | |
| 837 int n, | |
| 838 void* dest, | |
| 839 int radix) { | |
| 840 long r; | |
| 841 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | |
| 842 if ((short)r != r) return false; // Out of range | |
| 843 if (dest == NULL) return true; | |
| 844 *(reinterpret_cast<short*>(dest)) = (short)r; | |
| 845 return true; | |
| 846 } | |
| 847 | |
| 848 bool PCRE::Arg::parse_ushort_radix(const char* str, | |
| 849 int n, | |
| 850 void* dest, | |
| 851 int radix) { | |
| 852 unsigned long r; | |
| 853 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | |
| 854 if ((ushort)r != r) return false; // Out of range | |
| 855 if (dest == NULL) return true; | |
| 856 *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; | |
| 857 return true; | |
| 858 } | |
| 859 | |
| 860 bool PCRE::Arg::parse_int_radix(const char* str, | |
| 861 int n, | |
| 862 void* dest, | |
| 863 int radix) { | |
| 864 long r; | |
| 865 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | |
| 866 if ((int)r != r) return false; // Out of range | |
| 867 if (dest == NULL) return true; | |
| 868 *(reinterpret_cast<int*>(dest)) = r; | |
| 869 return true; | |
| 870 } | |
| 871 | |
| 872 bool PCRE::Arg::parse_uint_radix(const char* str, | |
| 873 int n, | |
| 874 void* dest, | |
| 875 int radix) { | |
| 876 unsigned long r; | |
| 877 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | |
| 878 if ((uint)r != r) return false; // Out of range | |
| 879 if (dest == NULL) return true; | |
| 880 *(reinterpret_cast<unsigned int*>(dest)) = r; | |
| 881 return true; | |
| 882 } | |
| 883 | |
| 884 bool PCRE::Arg::parse_longlong_radix(const char* str, | |
| 885 int n, | |
| 886 void* dest, | |
| 887 int radix) { | |
| 888 if (n == 0) return false; | |
| 889 char buf[kMaxNumberLength+1]; | |
| 890 str = TerminateNumber(buf, str, n); | |
| 891 char* end; | |
| 892 errno = 0; | |
| 893 int64 r = strtoll(str, &end, radix); | |
| 894 if (end != str + n) return false; // Leftover junk | |
| 895 if (errno) return false; | |
| 896 if (dest == NULL) return true; | |
| 897 *(reinterpret_cast<int64*>(dest)) = r; | |
| 898 return true; | |
| 899 } | |
| 900 | |
| 901 bool PCRE::Arg::parse_ulonglong_radix(const char* str, | |
| 902 int n, | |
| 903 void* dest, | |
| 904 int radix) { | |
| 905 if (n == 0) return false; | |
| 906 char buf[kMaxNumberLength+1]; | |
| 907 str = TerminateNumber(buf, str, n); | |
| 908 if (str[0] == '-') { | |
| 909 // strtoull() will silently accept negative numbers and parse | |
| 910 // them. This module is more strict and treats them as errors. | |
| 911 return false; | |
| 912 } | |
| 913 char* end; | |
| 914 errno = 0; | |
| 915 uint64 r = strtoull(str, &end, radix); | |
| 916 if (end != str + n) return false; // Leftover junk | |
| 917 if (errno) return false; | |
| 918 if (dest == NULL) return true; | |
| 919 *(reinterpret_cast<uint64*>(dest)) = r; | |
| 920 return true; | |
| 921 } | |
| 922 | |
| 923 bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { | |
| 924 if (n == 0) return false; | |
| 925 static const int kMaxLength = 200; | |
| 926 char buf[kMaxLength]; | |
| 927 if (n >= kMaxLength) return false; | |
| 928 memcpy(buf, str, n); | |
| 929 buf[n] = '\0'; | |
| 930 errno = 0; | |
| 931 char* end; | |
| 932 double r = strtod(buf, &end); | |
| 933 if (end != buf + n) { | |
| 934 #ifdef _WIN32 | |
| 935 // Microsoft's strtod() doesn't handle inf and nan, so we have to | |
| 936 // handle it explicitly. Speed is not important here because this | |
| 937 // code is only called in unit tests. | |
| 938 bool pos = true; | |
| 939 const char* i = buf; | |
| 940 if ('-' == *i) { | |
| 941 pos = false; | |
| 942 ++i; | |
| 943 } else if ('+' == *i) { | |
| 944 ++i; | |
| 945 } | |
| 946 if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) { | |
| 947 r = std::numeric_limits<double>::infinity(); | |
| 948 if (!pos) | |
| 949 r = -r; | |
| 950 } else if (0 == stricmp(i, "nan")) { | |
| 951 r = std::numeric_limits<double>::quiet_NaN(); | |
| 952 } else { | |
| 953 return false; | |
| 954 } | |
| 955 #else | |
| 956 return false; // Leftover junk | |
| 957 #endif | |
| 958 } | |
| 959 if (errno) return false; | |
| 960 if (dest == NULL) return true; | |
| 961 *(reinterpret_cast<double*>(dest)) = r; | |
| 962 return true; | |
| 963 } | |
| 964 | |
| 965 bool PCRE::Arg::parse_float(const char* str, int n, void* dest) { | |
| 966 double r; | |
| 967 if (!parse_double(str, n, &r)) return false; | |
| 968 if (dest == NULL) return true; | |
| 969 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); | |
| 970 return true; | |
| 971 } | |
| 972 | |
| 973 | |
| 974 #define DEFINE_INTEGER_PARSERS(name) \ | |
| 975 bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \ | |
| 976 return parse_##name##_radix(str, n, dest, 10); \ | |
| 977 } \ | |
| 978 bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ | |
| 979 return parse_##name##_radix(str, n, dest, 16); \ | |
| 980 } \ | |
| 981 bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ | |
| 982 return parse_##name##_radix(str, n, dest, 8); \ | |
| 983 } \ | |
| 984 bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ | |
| 985 return parse_##name##_radix(str, n, dest, 0); \ | |
| 986 } | |
| 987 | |
| 988 DEFINE_INTEGER_PARSERS(short); | |
| 989 DEFINE_INTEGER_PARSERS(ushort); | |
| 990 DEFINE_INTEGER_PARSERS(int); | |
| 991 DEFINE_INTEGER_PARSERS(uint); | |
| 992 DEFINE_INTEGER_PARSERS(long); | |
| 993 DEFINE_INTEGER_PARSERS(ulong); | |
| 994 DEFINE_INTEGER_PARSERS(longlong); | |
| 995 DEFINE_INTEGER_PARSERS(ulonglong); | |
| 996 | |
| 997 #undef DEFINE_INTEGER_PARSERS | |
| 998 | |
| 999 } // namespace re2 | |
| OLD | NEW |