| OLD | NEW |
| (Empty) |
| 1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Regular expression representation. | |
| 6 // Tested by parse_test.cc | |
| 7 | |
| 8 #include "util/util.h" | |
| 9 #include "re2/regexp.h" | |
| 10 #include "re2/stringpiece.h" | |
| 11 #include "re2/walker-inl.h" | |
| 12 | |
| 13 namespace re2 { | |
| 14 | |
| 15 // Constructor. Allocates vectors as appropriate for operator. | |
| 16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) | |
| 17 : op_(static_cast<uint8>(op)), | |
| 18 simple_(false), | |
| 19 parse_flags_(static_cast<uint16>(parse_flags)), | |
| 20 ref_(1), | |
| 21 nsub_(0), | |
| 22 down_(NULL) { | |
| 23 subone_ = NULL; | |
| 24 memset(the_union_, 0, sizeof the_union_); | |
| 25 } | |
| 26 | |
| 27 // Destructor. Assumes already cleaned up children. | |
| 28 // Private: use Decref() instead of delete to destroy Regexps. | |
| 29 // Can't call Decref on the sub-Regexps here because | |
| 30 // that could cause arbitrarily deep recursion, so | |
| 31 // required Decref() to have handled them for us. | |
| 32 Regexp::~Regexp() { | |
| 33 if (nsub_ > 0) | |
| 34 LOG(DFATAL) << "Regexp not destroyed."; | |
| 35 | |
| 36 switch (op_) { | |
| 37 default: | |
| 38 break; | |
| 39 case kRegexpCapture: | |
| 40 delete name_; | |
| 41 break; | |
| 42 case kRegexpLiteralString: | |
| 43 delete[] runes_; | |
| 44 break; | |
| 45 case kRegexpCharClass: | |
| 46 if (cc_) | |
| 47 cc_->Delete(); | |
| 48 delete ccb_; | |
| 49 break; | |
| 50 } | |
| 51 } | |
| 52 | |
| 53 // If it's possible to destroy this regexp without recurring, | |
| 54 // do so and return true. Else return false. | |
| 55 bool Regexp::QuickDestroy() { | |
| 56 if (nsub_ == 0) { | |
| 57 delete this; | |
| 58 return true; | |
| 59 } | |
| 60 return false; | |
| 61 } | |
| 62 | |
| 63 static map<Regexp*, int> *ref_map; | |
| 64 GLOBAL_MUTEX(ref_mutex); | |
| 65 | |
| 66 int Regexp::Ref() { | |
| 67 if (ref_ < kMaxRef) | |
| 68 return ref_; | |
| 69 | |
| 70 GLOBAL_MUTEX_LOCK(ref_mutex); | |
| 71 int r = 0; | |
| 72 if (ref_map != NULL) { | |
| 73 r = (*ref_map)[this]; | |
| 74 } | |
| 75 GLOBAL_MUTEX_UNLOCK(ref_mutex); | |
| 76 return r; | |
| 77 } | |
| 78 | |
| 79 // Increments reference count, returns object as convenience. | |
| 80 Regexp* Regexp::Incref() { | |
| 81 if (ref_ >= kMaxRef-1) { | |
| 82 // Store ref count in overflow map. | |
| 83 GLOBAL_MUTEX_LOCK(ref_mutex); | |
| 84 if (ref_map == NULL) { | |
| 85 ref_map = new map<Regexp*, int>; | |
| 86 } | |
| 87 if (ref_ == kMaxRef) { | |
| 88 // already overflowed | |
| 89 (*ref_map)[this]++; | |
| 90 } else { | |
| 91 // overflowing now | |
| 92 (*ref_map)[this] = kMaxRef; | |
| 93 ref_ = kMaxRef; | |
| 94 } | |
| 95 GLOBAL_MUTEX_UNLOCK(ref_mutex); | |
| 96 return this; | |
| 97 } | |
| 98 | |
| 99 ref_++; | |
| 100 return this; | |
| 101 } | |
| 102 | |
| 103 // Decrements reference count and deletes this object if count reaches 0. | |
| 104 void Regexp::Decref() { | |
| 105 if (ref_ == kMaxRef) { | |
| 106 // Ref count is stored in overflow map. | |
| 107 GLOBAL_MUTEX_LOCK(ref_mutex); | |
| 108 int r = (*ref_map)[this] - 1; | |
| 109 if (r < kMaxRef) { | |
| 110 ref_ = static_cast<uint16>(r); | |
| 111 ref_map->erase(this); | |
| 112 } else { | |
| 113 (*ref_map)[this] = r; | |
| 114 } | |
| 115 GLOBAL_MUTEX_UNLOCK(ref_mutex); | |
| 116 return; | |
| 117 } | |
| 118 ref_--; | |
| 119 if (ref_ == 0) | |
| 120 Destroy(); | |
| 121 } | |
| 122 | |
| 123 // Deletes this object; ref count has count reached 0. | |
| 124 void Regexp::Destroy() { | |
| 125 if (QuickDestroy()) | |
| 126 return; | |
| 127 | |
| 128 // Handle recursive Destroy with explicit stack | |
| 129 // to avoid arbitrarily deep recursion on process stack [sigh]. | |
| 130 down_ = NULL; | |
| 131 Regexp* stack = this; | |
| 132 while (stack != NULL) { | |
| 133 Regexp* re = stack; | |
| 134 stack = re->down_; | |
| 135 if (re->ref_ != 0) | |
| 136 LOG(DFATAL) << "Bad reference count " << re->ref_; | |
| 137 if (re->nsub_ > 0) { | |
| 138 Regexp** subs = re->sub(); | |
| 139 for (int i = 0; i < re->nsub_; i++) { | |
| 140 Regexp* sub = subs[i]; | |
| 141 if (sub == NULL) | |
| 142 continue; | |
| 143 if (sub->ref_ == kMaxRef) | |
| 144 sub->Decref(); | |
| 145 else | |
| 146 --sub->ref_; | |
| 147 if (sub->ref_ == 0 && !sub->QuickDestroy()) { | |
| 148 sub->down_ = stack; | |
| 149 stack = sub; | |
| 150 } | |
| 151 } | |
| 152 if (re->nsub_ > 1) | |
| 153 delete[] subs; | |
| 154 re->nsub_ = 0; | |
| 155 } | |
| 156 delete re; | |
| 157 } | |
| 158 } | |
| 159 | |
| 160 void Regexp::AddRuneToString(Rune r) { | |
| 161 DCHECK(op_ == kRegexpLiteralString); | |
| 162 if (nrunes_ == 0) { | |
| 163 // start with 8 | |
| 164 runes_ = new Rune[8]; | |
| 165 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { | |
| 166 // double on powers of two | |
| 167 Rune *old = runes_; | |
| 168 runes_ = new Rune[nrunes_ * 2]; | |
| 169 for (int i = 0; i < nrunes_; i++) | |
| 170 runes_[i] = old[i]; | |
| 171 delete[] old; | |
| 172 } | |
| 173 | |
| 174 runes_[nrunes_++] = r; | |
| 175 } | |
| 176 | |
| 177 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { | |
| 178 Regexp* re = new Regexp(kRegexpHaveMatch, flags); | |
| 179 re->match_id_ = match_id; | |
| 180 return re; | |
| 181 } | |
| 182 | |
| 183 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { | |
| 184 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) | |
| 185 return sub; | |
| 186 Regexp* re = new Regexp(kRegexpPlus, flags); | |
| 187 re->AllocSub(1); | |
| 188 re->sub()[0] = sub; | |
| 189 return re; | |
| 190 } | |
| 191 | |
| 192 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { | |
| 193 if (sub->op() == kRegexpStar && sub->parse_flags() == flags) | |
| 194 return sub; | |
| 195 Regexp* re = new Regexp(kRegexpStar, flags); | |
| 196 re->AllocSub(1); | |
| 197 re->sub()[0] = sub; | |
| 198 return re; | |
| 199 } | |
| 200 | |
| 201 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { | |
| 202 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) | |
| 203 return sub; | |
| 204 Regexp* re = new Regexp(kRegexpQuest, flags); | |
| 205 re->AllocSub(1); | |
| 206 re->sub()[0] = sub; | |
| 207 return re; | |
| 208 } | |
| 209 | |
| 210 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, | |
| 211 ParseFlags flags, bool can_factor) { | |
| 212 if (nsub == 1) | |
| 213 return sub[0]; | |
| 214 | |
| 215 if (nsub == 0) { | |
| 216 if (op == kRegexpAlternate) | |
| 217 return new Regexp(kRegexpNoMatch, flags); | |
| 218 else | |
| 219 return new Regexp(kRegexpEmptyMatch, flags); | |
| 220 } | |
| 221 | |
| 222 Regexp** subcopy = NULL; | |
| 223 if (op == kRegexpAlternate && can_factor) { | |
| 224 // Going to edit sub; make a copy so we don't step on caller. | |
| 225 subcopy = new Regexp*[nsub]; | |
| 226 memmove(subcopy, sub, nsub * sizeof sub[0]); | |
| 227 sub = subcopy; | |
| 228 nsub = FactorAlternation(sub, nsub, flags); | |
| 229 if (nsub == 1) { | |
| 230 Regexp* re = sub[0]; | |
| 231 delete[] subcopy; | |
| 232 return re; | |
| 233 } | |
| 234 } | |
| 235 | |
| 236 if (nsub > kMaxNsub) { | |
| 237 // Too many subexpressions to fit in a single Regexp. | |
| 238 // Make a two-level tree. Two levels gets us to 65535^2. | |
| 239 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; | |
| 240 Regexp* re = new Regexp(op, flags); | |
| 241 re->AllocSub(nbigsub); | |
| 242 Regexp** subs = re->sub(); | |
| 243 for (int i = 0; i < nbigsub - 1; i++) | |
| 244 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); | |
| 245 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, | |
| 246 nsub - (nbigsub-1)*kMaxNsub, flags, | |
| 247 false); | |
| 248 delete[] subcopy; | |
| 249 return re; | |
| 250 } | |
| 251 | |
| 252 Regexp* re = new Regexp(op, flags); | |
| 253 re->AllocSub(nsub); | |
| 254 Regexp** subs = re->sub(); | |
| 255 for (int i = 0; i < nsub; i++) | |
| 256 subs[i] = sub[i]; | |
| 257 | |
| 258 delete[] subcopy; | |
| 259 return re; | |
| 260 } | |
| 261 | |
| 262 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { | |
| 263 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); | |
| 264 } | |
| 265 | |
| 266 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { | |
| 267 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); | |
| 268 } | |
| 269 | |
| 270 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { | |
| 271 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); | |
| 272 } | |
| 273 | |
| 274 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { | |
| 275 Regexp* re = new Regexp(kRegexpCapture, flags); | |
| 276 re->AllocSub(1); | |
| 277 re->sub()[0] = sub; | |
| 278 re->cap_ = cap; | |
| 279 return re; | |
| 280 } | |
| 281 | |
| 282 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { | |
| 283 Regexp* re = new Regexp(kRegexpRepeat, flags); | |
| 284 re->AllocSub(1); | |
| 285 re->sub()[0] = sub; | |
| 286 re->min_ = min; | |
| 287 re->max_ = max; | |
| 288 return re; | |
| 289 } | |
| 290 | |
| 291 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { | |
| 292 Regexp* re = new Regexp(kRegexpLiteral, flags); | |
| 293 re->rune_ = rune; | |
| 294 return re; | |
| 295 } | |
| 296 | |
| 297 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { | |
| 298 if (nrunes <= 0) | |
| 299 return new Regexp(kRegexpEmptyMatch, flags); | |
| 300 if (nrunes == 1) | |
| 301 return NewLiteral(runes[0], flags); | |
| 302 Regexp* re = new Regexp(kRegexpLiteralString, flags); | |
| 303 for (int i = 0; i < nrunes; i++) | |
| 304 re->AddRuneToString(runes[i]); | |
| 305 return re; | |
| 306 } | |
| 307 | |
| 308 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { | |
| 309 Regexp* re = new Regexp(kRegexpCharClass, flags); | |
| 310 re->cc_ = cc; | |
| 311 return re; | |
| 312 } | |
| 313 | |
| 314 // Swaps this and that in place. | |
| 315 void Regexp::Swap(Regexp* that) { | |
| 316 // Can use memmove because Regexp is just a struct (no vtable). | |
| 317 char tmp[sizeof *this]; | |
| 318 memmove(tmp, this, sizeof tmp); | |
| 319 memmove(this, that, sizeof tmp); | |
| 320 memmove(that, tmp, sizeof tmp); | |
| 321 } | |
| 322 | |
| 323 // Tests equality of all top-level structure but not subregexps. | |
| 324 static bool TopEqual(Regexp* a, Regexp* b) { | |
| 325 if (a->op() != b->op()) | |
| 326 return false; | |
| 327 | |
| 328 switch (a->op()) { | |
| 329 case kRegexpNoMatch: | |
| 330 case kRegexpEmptyMatch: | |
| 331 case kRegexpAnyChar: | |
| 332 case kRegexpAnyByte: | |
| 333 case kRegexpBeginLine: | |
| 334 case kRegexpEndLine: | |
| 335 case kRegexpWordBoundary: | |
| 336 case kRegexpNoWordBoundary: | |
| 337 case kRegexpBeginText: | |
| 338 return true; | |
| 339 | |
| 340 case kRegexpEndText: | |
| 341 // The parse flags remember whether it's \z or (?-m:$), | |
| 342 // which matters when testing against PCRE. | |
| 343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; | |
| 344 | |
| 345 case kRegexpLiteral: | |
| 346 return a->rune() == b->rune() && | |
| 347 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; | |
| 348 | |
| 349 case kRegexpLiteralString: | |
| 350 return a->nrunes() == b->nrunes() && | |
| 351 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && | |
| 352 memcmp(a->runes(), b->runes(), | |
| 353 a->nrunes() * sizeof a->runes()[0]) == 0; | |
| 354 | |
| 355 case kRegexpAlternate: | |
| 356 case kRegexpConcat: | |
| 357 return a->nsub() == b->nsub(); | |
| 358 | |
| 359 case kRegexpStar: | |
| 360 case kRegexpPlus: | |
| 361 case kRegexpQuest: | |
| 362 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; | |
| 363 | |
| 364 case kRegexpRepeat: | |
| 365 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && | |
| 366 a->min() == b->min() && | |
| 367 a->max() == b->max(); | |
| 368 | |
| 369 case kRegexpCapture: | |
| 370 return a->cap() == b->cap() && a->name() == b->name(); | |
| 371 | |
| 372 case kRegexpHaveMatch: | |
| 373 return a->match_id() == b->match_id(); | |
| 374 | |
| 375 case kRegexpCharClass: { | |
| 376 CharClass* acc = a->cc(); | |
| 377 CharClass* bcc = b->cc(); | |
| 378 return acc->size() == bcc->size() && | |
| 379 acc->end() - acc->begin() == bcc->end() - bcc->begin() && | |
| 380 memcmp(acc->begin(), bcc->begin(), | |
| 381 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; | |
| 382 } | |
| 383 } | |
| 384 | |
| 385 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); | |
| 386 return 0; | |
| 387 } | |
| 388 | |
| 389 bool Regexp::Equal(Regexp* a, Regexp* b) { | |
| 390 if (a == NULL || b == NULL) | |
| 391 return a == b; | |
| 392 | |
| 393 if (!TopEqual(a, b)) | |
| 394 return false; | |
| 395 | |
| 396 // Fast path: | |
| 397 // return without allocating vector if there are no subregexps. | |
| 398 switch (a->op()) { | |
| 399 case kRegexpAlternate: | |
| 400 case kRegexpConcat: | |
| 401 case kRegexpStar: | |
| 402 case kRegexpPlus: | |
| 403 case kRegexpQuest: | |
| 404 case kRegexpRepeat: | |
| 405 case kRegexpCapture: | |
| 406 break; | |
| 407 | |
| 408 default: | |
| 409 return true; | |
| 410 } | |
| 411 | |
| 412 // Committed to doing real work. | |
| 413 // The stack (vector) has pairs of regexps waiting to | |
| 414 // be compared. The regexps are only equal if | |
| 415 // all the pairs end up being equal. | |
| 416 vector<Regexp*> stk; | |
| 417 | |
| 418 for (;;) { | |
| 419 // Invariant: TopEqual(a, b) == true. | |
| 420 Regexp* a2; | |
| 421 Regexp* b2; | |
| 422 switch (a->op()) { | |
| 423 default: | |
| 424 break; | |
| 425 case kRegexpAlternate: | |
| 426 case kRegexpConcat: | |
| 427 for (int i = 0; i < a->nsub(); i++) { | |
| 428 a2 = a->sub()[i]; | |
| 429 b2 = b->sub()[i]; | |
| 430 if (!TopEqual(a2, b2)) | |
| 431 return false; | |
| 432 stk.push_back(a2); | |
| 433 stk.push_back(b2); | |
| 434 } | |
| 435 break; | |
| 436 | |
| 437 case kRegexpStar: | |
| 438 case kRegexpPlus: | |
| 439 case kRegexpQuest: | |
| 440 case kRegexpRepeat: | |
| 441 case kRegexpCapture: | |
| 442 a2 = a->sub()[0]; | |
| 443 b2 = b->sub()[0]; | |
| 444 if (!TopEqual(a2, b2)) | |
| 445 return false; | |
| 446 // Really: | |
| 447 // stk.push_back(a2); | |
| 448 // stk.push_back(b2); | |
| 449 // break; | |
| 450 // but faster to assign directly and loop. | |
| 451 a = a2; | |
| 452 b = b2; | |
| 453 continue; | |
| 454 } | |
| 455 | |
| 456 size_t n = stk.size(); | |
| 457 if (n == 0) | |
| 458 break; | |
| 459 | |
| 460 DCHECK_GE(n, 2); | |
| 461 a = stk[n-2]; | |
| 462 b = stk[n-1]; | |
| 463 stk.resize(n-2); | |
| 464 } | |
| 465 | |
| 466 return true; | |
| 467 } | |
| 468 | |
| 469 // Keep in sync with enum RegexpStatusCode in regexp.h | |
| 470 static const char *kErrorStrings[] = { | |
| 471 "no error", | |
| 472 "unexpected error", | |
| 473 "invalid escape sequence", | |
| 474 "invalid character class", | |
| 475 "invalid character class range", | |
| 476 "missing ]", | |
| 477 "missing )", | |
| 478 "trailing \\", | |
| 479 "no argument for repetition operator", | |
| 480 "invalid repetition size", | |
| 481 "bad repetition operator", | |
| 482 "invalid perl operator", | |
| 483 "invalid UTF-8", | |
| 484 "invalid named capture group", | |
| 485 }; | |
| 486 | |
| 487 string RegexpStatus::CodeText(enum RegexpStatusCode code) { | |
| 488 if (code < 0 || code >= arraysize(kErrorStrings)) | |
| 489 code = kRegexpInternalError; | |
| 490 return kErrorStrings[code]; | |
| 491 } | |
| 492 | |
| 493 string RegexpStatus::Text() const { | |
| 494 if (error_arg_.empty()) | |
| 495 return CodeText(code_); | |
| 496 string s; | |
| 497 s.append(CodeText(code_)); | |
| 498 s.append(": "); | |
| 499 s.append(error_arg_.data(), error_arg_.size()); | |
| 500 return s; | |
| 501 } | |
| 502 | |
| 503 void RegexpStatus::Copy(const RegexpStatus& status) { | |
| 504 code_ = status.code_; | |
| 505 error_arg_ = status.error_arg_; | |
| 506 } | |
| 507 | |
| 508 typedef int Ignored; // Walker<void> doesn't exist | |
| 509 | |
| 510 // Walker subclass to count capturing parens in regexp. | |
| 511 class NumCapturesWalker : public Regexp::Walker<Ignored> { | |
| 512 public: | |
| 513 NumCapturesWalker() : ncapture_(0) {} | |
| 514 int ncapture() { return ncapture_; } | |
| 515 | |
| 516 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { | |
| 517 if (re->op() == kRegexpCapture) | |
| 518 ncapture_++; | |
| 519 return ignored; | |
| 520 } | |
| 521 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { | |
| 522 // Should never be called: we use Walk not WalkExponential. | |
| 523 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; | |
| 524 return ignored; | |
| 525 } | |
| 526 | |
| 527 private: | |
| 528 int ncapture_; | |
| 529 DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker); | |
| 530 }; | |
| 531 | |
| 532 int Regexp::NumCaptures() { | |
| 533 NumCapturesWalker w; | |
| 534 w.Walk(this, 0); | |
| 535 return w.ncapture(); | |
| 536 } | |
| 537 | |
| 538 // Walker class to build map of named capture groups and their indices. | |
| 539 class NamedCapturesWalker : public Regexp::Walker<Ignored> { | |
| 540 public: | |
| 541 NamedCapturesWalker() : map_(NULL) {} | |
| 542 ~NamedCapturesWalker() { delete map_; } | |
| 543 | |
| 544 map<string, int>* TakeMap() { | |
| 545 map<string, int>* m = map_; | |
| 546 map_ = NULL; | |
| 547 return m; | |
| 548 } | |
| 549 | |
| 550 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { | |
| 551 if (re->op() == kRegexpCapture && re->name() != NULL) { | |
| 552 // Allocate map once we find a name. | |
| 553 if (map_ == NULL) | |
| 554 map_ = new map<string, int>; | |
| 555 | |
| 556 // Record first occurrence of each name. | |
| 557 // (The rule is that if you have the same name | |
| 558 // multiple times, only the leftmost one counts.) | |
| 559 if (map_->find(*re->name()) == map_->end()) | |
| 560 (*map_)[*re->name()] = re->cap(); | |
| 561 } | |
| 562 return ignored; | |
| 563 } | |
| 564 | |
| 565 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { | |
| 566 // Should never be called: we use Walk not WalkExponential. | |
| 567 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; | |
| 568 return ignored; | |
| 569 } | |
| 570 | |
| 571 private: | |
| 572 map<string, int>* map_; | |
| 573 DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker); | |
| 574 }; | |
| 575 | |
| 576 map<string, int>* Regexp::NamedCaptures() { | |
| 577 NamedCapturesWalker w; | |
| 578 w.Walk(this, 0); | |
| 579 return w.TakeMap(); | |
| 580 } | |
| 581 | |
| 582 // Walker class to build map from capture group indices to their names. | |
| 583 class CaptureNamesWalker : public Regexp::Walker<Ignored> { | |
| 584 public: | |
| 585 CaptureNamesWalker() : map_(NULL) {} | |
| 586 ~CaptureNamesWalker() { delete map_; } | |
| 587 | |
| 588 map<int, string>* TakeMap() { | |
| 589 map<int, string>* m = map_; | |
| 590 map_ = NULL; | |
| 591 return m; | |
| 592 } | |
| 593 | |
| 594 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { | |
| 595 if (re->op() == kRegexpCapture && re->name() != NULL) { | |
| 596 // Allocate map once we find a name. | |
| 597 if (map_ == NULL) | |
| 598 map_ = new map<int, string>; | |
| 599 | |
| 600 (*map_)[re->cap()] = *re->name(); | |
| 601 } | |
| 602 return ignored; | |
| 603 } | |
| 604 | |
| 605 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { | |
| 606 // Should never be called: we use Walk not WalkExponential. | |
| 607 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; | |
| 608 return ignored; | |
| 609 } | |
| 610 | |
| 611 private: | |
| 612 map<int, string>* map_; | |
| 613 DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker); | |
| 614 }; | |
| 615 | |
| 616 map<int, string>* Regexp::CaptureNames() { | |
| 617 CaptureNamesWalker w; | |
| 618 w.Walk(this, 0); | |
| 619 return w.TakeMap(); | |
| 620 } | |
| 621 | |
| 622 // Determines whether regexp matches must be anchored | |
| 623 // with a fixed string prefix. If so, returns the prefix and | |
| 624 // the regexp that remains after the prefix. The prefix might | |
| 625 // be ASCII case-insensitive. | |
| 626 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { | |
| 627 // No need for a walker: the regexp must be of the form | |
| 628 // 1. some number of ^ anchors | |
| 629 // 2. a literal char or string | |
| 630 // 3. the rest | |
| 631 prefix->clear(); | |
| 632 *foldcase = false; | |
| 633 *suffix = NULL; | |
| 634 if (op_ != kRegexpConcat) | |
| 635 return false; | |
| 636 | |
| 637 // Some number of anchors, then a literal or concatenation. | |
| 638 int i = 0; | |
| 639 Regexp** sub = this->sub(); | |
| 640 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) | |
| 641 i++; | |
| 642 if (i == 0 || i >= nsub_) | |
| 643 return false; | |
| 644 | |
| 645 Regexp* re = sub[i]; | |
| 646 switch (re->op_) { | |
| 647 default: | |
| 648 return false; | |
| 649 | |
| 650 case kRegexpLiteralString: | |
| 651 // Convert to string in proper encoding. | |
| 652 if (re->parse_flags() & Latin1) { | |
| 653 prefix->resize(re->nrunes_); | |
| 654 for (int j = 0; j < re->nrunes_; j++) | |
| 655 (*prefix)[j] = static_cast<char>(re->runes_[j]); | |
| 656 } else { | |
| 657 // Convert to UTF-8 in place. | |
| 658 // Assume worst-case space and then trim. | |
| 659 prefix->resize(re->nrunes_ * UTFmax); | |
| 660 char *p = &(*prefix)[0]; | |
| 661 for (int j = 0; j < re->nrunes_; j++) { | |
| 662 Rune r = re->runes_[j]; | |
| 663 if (r < Runeself) | |
| 664 *p++ = static_cast<char>(r); | |
| 665 else | |
| 666 p += runetochar(p, &r); | |
| 667 } | |
| 668 prefix->resize(p - &(*prefix)[0]); | |
| 669 } | |
| 670 break; | |
| 671 | |
| 672 case kRegexpLiteral: | |
| 673 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { | |
| 674 prefix->append(1, static_cast<char>(re->rune_)); | |
| 675 } else { | |
| 676 char buf[UTFmax]; | |
| 677 prefix->append(buf, runetochar(buf, &re->rune_)); | |
| 678 } | |
| 679 break; | |
| 680 } | |
| 681 *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; | |
| 682 i++; | |
| 683 | |
| 684 // The rest. | |
| 685 if (i < nsub_) { | |
| 686 for (int j = i; j < nsub_; j++) | |
| 687 sub[j]->Incref(); | |
| 688 re = Concat(sub + i, nsub_ - i, parse_flags()); | |
| 689 } else { | |
| 690 re = new Regexp(kRegexpEmptyMatch, parse_flags()); | |
| 691 } | |
| 692 *suffix = re; | |
| 693 return true; | |
| 694 } | |
| 695 | |
| 696 // Character class builder is a balanced binary tree (STL set) | |
| 697 // containing non-overlapping, non-abutting RuneRanges. | |
| 698 // The less-than operator used in the tree treats two | |
| 699 // ranges as equal if they overlap at all, so that | |
| 700 // lookups for a particular Rune are possible. | |
| 701 | |
| 702 CharClassBuilder::CharClassBuilder() { | |
| 703 nrunes_ = 0; | |
| 704 upper_ = 0; | |
| 705 lower_ = 0; | |
| 706 } | |
| 707 | |
| 708 // Add lo-hi to the class; return whether class got bigger. | |
| 709 bool CharClassBuilder::AddRange(Rune lo, Rune hi) { | |
| 710 if (hi < lo) | |
| 711 return false; | |
| 712 | |
| 713 if (lo <= 'z' && hi >= 'A') { | |
| 714 // Overlaps some alpha, maybe not all. | |
| 715 // Update bitmaps telling which ASCII letters are in the set. | |
| 716 Rune lo1 = max<Rune>(lo, 'A'); | |
| 717 Rune hi1 = min<Rune>(hi, 'Z'); | |
| 718 if (lo1 <= hi1) | |
| 719 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); | |
| 720 | |
| 721 lo1 = max<Rune>(lo, 'a'); | |
| 722 hi1 = min<Rune>(hi, 'z'); | |
| 723 if (lo1 <= hi1) | |
| 724 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); | |
| 725 } | |
| 726 | |
| 727 { // Check whether lo, hi is already in the class. | |
| 728 iterator it = ranges_.find(RuneRange(lo, lo)); | |
| 729 if (it != end() && it->lo <= lo && hi <= it->hi) | |
| 730 return false; | |
| 731 } | |
| 732 | |
| 733 // Look for a range abutting lo on the left. | |
| 734 // If it exists, take it out and increase our range. | |
| 735 if (lo > 0) { | |
| 736 iterator it = ranges_.find(RuneRange(lo-1, lo-1)); | |
| 737 if (it != end()) { | |
| 738 lo = it->lo; | |
| 739 if (it->hi > hi) | |
| 740 hi = it->hi; | |
| 741 nrunes_ -= it->hi - it->lo + 1; | |
| 742 ranges_.erase(it); | |
| 743 } | |
| 744 } | |
| 745 | |
| 746 // Look for a range abutting hi on the right. | |
| 747 // If it exists, take it out and increase our range. | |
| 748 if (hi < Runemax) { | |
| 749 iterator it = ranges_.find(RuneRange(hi+1, hi+1)); | |
| 750 if (it != end()) { | |
| 751 hi = it->hi; | |
| 752 nrunes_ -= it->hi - it->lo + 1; | |
| 753 ranges_.erase(it); | |
| 754 } | |
| 755 } | |
| 756 | |
| 757 // Look for ranges between lo and hi. Take them out. | |
| 758 // This is only safe because the set has no overlapping ranges. | |
| 759 // We've already removed any ranges abutting lo and hi, so | |
| 760 // any that overlap [lo, hi] must be contained within it. | |
| 761 for (;;) { | |
| 762 iterator it = ranges_.find(RuneRange(lo, hi)); | |
| 763 if (it == end()) | |
| 764 break; | |
| 765 nrunes_ -= it->hi - it->lo + 1; | |
| 766 ranges_.erase(it); | |
| 767 } | |
| 768 | |
| 769 // Finally, add [lo, hi]. | |
| 770 nrunes_ += hi - lo + 1; | |
| 771 ranges_.insert(RuneRange(lo, hi)); | |
| 772 return true; | |
| 773 } | |
| 774 | |
| 775 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { | |
| 776 for (iterator it = cc->begin(); it != cc->end(); ++it) | |
| 777 AddRange(it->lo, it->hi); | |
| 778 } | |
| 779 | |
| 780 bool CharClassBuilder::Contains(Rune r) { | |
| 781 return ranges_.find(RuneRange(r, r)) != end(); | |
| 782 } | |
| 783 | |
| 784 // Does the character class behave the same on A-Z as on a-z? | |
| 785 bool CharClassBuilder::FoldsASCII() { | |
| 786 return ((upper_ ^ lower_) & AlphaMask) == 0; | |
| 787 } | |
| 788 | |
| 789 CharClassBuilder* CharClassBuilder::Copy() { | |
| 790 CharClassBuilder* cc = new CharClassBuilder; | |
| 791 for (iterator it = begin(); it != end(); ++it) | |
| 792 cc->ranges_.insert(RuneRange(it->lo, it->hi)); | |
| 793 cc->upper_ = upper_; | |
| 794 cc->lower_ = lower_; | |
| 795 cc->nrunes_ = nrunes_; | |
| 796 return cc; | |
| 797 } | |
| 798 | |
| 799 | |
| 800 | |
| 801 void CharClassBuilder::RemoveAbove(Rune r) { | |
| 802 if (r >= Runemax) | |
| 803 return; | |
| 804 | |
| 805 if (r < 'z') { | |
| 806 if (r < 'a') | |
| 807 lower_ = 0; | |
| 808 else | |
| 809 lower_ &= AlphaMask >> ('z' - r); | |
| 810 } | |
| 811 | |
| 812 if (r < 'Z') { | |
| 813 if (r < 'A') | |
| 814 upper_ = 0; | |
| 815 else | |
| 816 upper_ &= AlphaMask >> ('Z' - r); | |
| 817 } | |
| 818 | |
| 819 for (;;) { | |
| 820 | |
| 821 iterator it = ranges_.find(RuneRange(r + 1, Runemax)); | |
| 822 if (it == end()) | |
| 823 break; | |
| 824 RuneRange rr = *it; | |
| 825 ranges_.erase(it); | |
| 826 nrunes_ -= rr.hi - rr.lo + 1; | |
| 827 if (rr.lo <= r) { | |
| 828 rr.hi = r; | |
| 829 ranges_.insert(rr); | |
| 830 nrunes_ += rr.hi - rr.lo + 1; | |
| 831 } | |
| 832 } | |
| 833 } | |
| 834 | |
| 835 void CharClassBuilder::Negate() { | |
| 836 // Build up negation and then copy in. | |
| 837 // Could edit ranges in place, but C++ won't let me. | |
| 838 vector<RuneRange> v; | |
| 839 v.reserve(ranges_.size() + 1); | |
| 840 | |
| 841 // In negation, first range begins at 0, unless | |
| 842 // the current class begins at 0. | |
| 843 iterator it = begin(); | |
| 844 if (it == end()) { | |
| 845 v.push_back(RuneRange(0, Runemax)); | |
| 846 } else { | |
| 847 int nextlo = 0; | |
| 848 if (it->lo == 0) { | |
| 849 nextlo = it->hi + 1; | |
| 850 ++it; | |
| 851 } | |
| 852 for (; it != end(); ++it) { | |
| 853 v.push_back(RuneRange(nextlo, it->lo - 1)); | |
| 854 nextlo = it->hi + 1; | |
| 855 } | |
| 856 if (nextlo <= Runemax) | |
| 857 v.push_back(RuneRange(nextlo, Runemax)); | |
| 858 } | |
| 859 | |
| 860 ranges_.clear(); | |
| 861 for (size_t i = 0; i < v.size(); i++) | |
| 862 ranges_.insert(v[i]); | |
| 863 | |
| 864 upper_ = AlphaMask & ~upper_; | |
| 865 lower_ = AlphaMask & ~lower_; | |
| 866 nrunes_ = Runemax+1 - nrunes_; | |
| 867 } | |
| 868 | |
| 869 // Character class is a sorted list of ranges. | |
| 870 // The ranges are allocated in the same block as the header, | |
| 871 // necessitating a special allocator and Delete method. | |
| 872 | |
| 873 CharClass* CharClass::New(int maxranges) { | |
| 874 CharClass* cc; | |
| 875 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; | |
| 876 cc = reinterpret_cast<CharClass*>(data); | |
| 877 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); | |
| 878 cc->nranges_ = 0; | |
| 879 cc->folds_ascii_ = false; | |
| 880 cc->nrunes_ = 0; | |
| 881 return cc; | |
| 882 } | |
| 883 | |
| 884 void CharClass::Delete() { | |
| 885 uint8 *data = reinterpret_cast<uint8*>(this); | |
| 886 delete[] data; | |
| 887 } | |
| 888 | |
| 889 CharClass* CharClass::Negate() { | |
| 890 CharClass* cc = CharClass::New(nranges_+1); | |
| 891 cc->folds_ascii_ = folds_ascii_; | |
| 892 cc->nrunes_ = Runemax + 1 - nrunes_; | |
| 893 int n = 0; | |
| 894 int nextlo = 0; | |
| 895 for (CharClass::iterator it = begin(); it != end(); ++it) { | |
| 896 if (it->lo == nextlo) { | |
| 897 nextlo = it->hi + 1; | |
| 898 } else { | |
| 899 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); | |
| 900 nextlo = it->hi + 1; | |
| 901 } | |
| 902 } | |
| 903 if (nextlo <= Runemax) | |
| 904 cc->ranges_[n++] = RuneRange(nextlo, Runemax); | |
| 905 cc->nranges_ = n; | |
| 906 return cc; | |
| 907 } | |
| 908 | |
| 909 bool CharClass::Contains(Rune r) { | |
| 910 RuneRange* rr = ranges_; | |
| 911 int n = nranges_; | |
| 912 while (n > 0) { | |
| 913 int m = n/2; | |
| 914 if (rr[m].hi < r) { | |
| 915 rr += m+1; | |
| 916 n -= m+1; | |
| 917 } else if (r < rr[m].lo) { | |
| 918 n = m; | |
| 919 } else { // rr[m].lo <= r && r <= rr[m].hi | |
| 920 return true; | |
| 921 } | |
| 922 } | |
| 923 return false; | |
| 924 } | |
| 925 | |
| 926 CharClass* CharClassBuilder::GetCharClass() { | |
| 927 CharClass* cc = CharClass::New(static_cast<int>(ranges_.size())); | |
| 928 int n = 0; | |
| 929 for (iterator it = begin(); it != end(); ++it) | |
| 930 cc->ranges_[n++] = *it; | |
| 931 cc->nranges_ = n; | |
| 932 DCHECK_LE(n, static_cast<int>(ranges_.size())); | |
| 933 cc->nrunes_ = nrunes_; | |
| 934 cc->folds_ascii_ = FoldsASCII(); | |
| 935 return cc; | |
| 936 } | |
| 937 | |
| 938 } // namespace re2 | |
| OLD | NEW |