OLD | NEW |
(Empty) | |
| 1 // Copyright 2006 The RE2 Authors. All Rights Reserved. |
| 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. |
| 4 |
| 5 // Regular expression representation. |
| 6 // Tested by parse_test.cc |
| 7 |
| 8 #include "util/util.h" |
| 9 #include "re2/regexp.h" |
| 10 #include "re2/stringpiece.h" |
| 11 #include "re2/walker-inl.h" |
| 12 |
| 13 namespace re2 { |
| 14 |
| 15 // Constructor. Allocates vectors as appropriate for operator. |
| 16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) |
| 17 : op_(op), |
| 18 simple_(false), |
| 19 parse_flags_(static_cast<uint16>(parse_flags)), |
| 20 ref_(1), |
| 21 nsub_(0), |
| 22 down_(NULL) { |
| 23 subone_ = NULL; |
| 24 memset(the_union_, 0, sizeof the_union_); |
| 25 } |
| 26 |
| 27 // Destructor. Assumes already cleaned up children. |
| 28 // Private: use Decref() instead of delete to destroy Regexps. |
| 29 // Can't call Decref on the sub-Regexps here because |
| 30 // that could cause arbitrarily deep recursion, so |
| 31 // we require Decref() to have handled them for us. |
| 32 Regexp::~Regexp() { |
| 33 if (nsub_ > 0) |
| 34 LOG(DFATAL) << "Regexp not destroyed."; |
| 35 |
| 36 switch (op_) { |
| 37 default: |
| 38 break; |
| 39 case kRegexpCapture: |
| 40 delete name_; |
| 41 break; |
| 42 case kRegexpLiteralString: |
| 43 delete[] runes_; |
| 44 break; |
| 45 case kRegexpCharClass: |
| 46 cc_->Delete(); |
| 47 delete ccb_; |
| 48 break; |
| 49 } |
| 50 } |
| 51 |
| 52 // If it's possible to destroy this regexp without recursing, |
| 53 // do so and return true. Else return false. |
| 54 bool Regexp::QuickDestroy() { |
| 55 if (nsub_ == 0) { |
| 56 delete this; |
| 57 return true; |
| 58 } |
| 59 return false; |
| 60 } |
| 61 |
| 62 static map<Regexp*, int> ref_map; |
| 63 static Mutex ref_mutex; |
| 64 |
| 65 int Regexp::Ref() { |
| 66 if (ref_ < kMaxRef) |
| 67 return ref_; |
| 68 |
| 69 MutexLock l(&ref_mutex); |
| 70 return ref_map[this]; |
| 71 } |
| 72 |
| 73 // Increments reference count, returns object as convenience. |
| 74 Regexp* Regexp::Incref() { |
| 75 if (ref_ >= kMaxRef-1) { |
| 76 // Store ref count in overflow map. |
| 77 MutexLock l(&ref_mutex); |
| 78 if (ref_ == kMaxRef) { // already overflowed |
| 79 ref_map[this]++; |
| 80 return this; |
| 81 } |
| 82 // overflowing now |
| 83 ref_map[this] = kMaxRef; |
| 84 ref_ = kMaxRef; |
| 85 return this; |
| 86 } |
| 87 |
| 88 ref_++; |
| 89 return this; |
| 90 } |
| 91 |
| 92 // Decrements reference count and deletes this object if count reaches 0. |
| 93 void Regexp::Decref() { |
| 94 if (ref_ == kMaxRef) { |
| 95 // Ref count is stored in overflow map. |
| 96 MutexLock l(&ref_mutex); |
| 97 int r = ref_map[this] - 1; |
| 98 if (r < kMaxRef) { |
| 99 ref_ = r; |
| 100 ref_map.erase(this); |
| 101 } else { |
| 102 ref_map[this] = r; |
| 103 } |
| 104 return; |
| 105 } |
| 106 ref_--; |
| 107 if (ref_ == 0) |
| 108 Destroy(); |
| 109 } |
| 110 |
| 111 // Deletes this object; ref count has reached 0. |
| 112 void Regexp::Destroy() { |
| 113 if (QuickDestroy()) |
| 114 return; |
| 115 |
| 116 // Handle recursive Destroy with explicit stack |
| 117 // to avoid arbitrarily deep recursion on process stack [sigh]. |
| 118 down_ = NULL; |
| 119 Regexp* stack = this; |
| 120 while (stack != NULL) { |
| 121 Regexp* re = stack; |
| 122 stack = re->down_; |
| 123 if (re->ref_ != 0) |
| 124 LOG(DFATAL) << "Bad reference count " << re->ref_; |
| 125 if (re->nsub_ > 0) { |
| 126 Regexp** subs = re->sub(); |
| 127 for (int i = 0; i < re->nsub_; i++) { |
| 128 Regexp* sub = subs[i]; |
| 129 if (sub == NULL) |
| 130 continue; |
| 131 if (sub->ref_ == kMaxRef) |
| 132 sub->Decref(); |
| 133 else |
| 134 --sub->ref_; |
| 135 if (sub->ref_ == 0 && !sub->QuickDestroy()) { |
| 136 sub->down_ = stack; |
| 137 stack = sub; |
| 138 } |
| 139 } |
| 140 if (re->nsub_ > 1) |
| 141 delete[] subs; |
| 142 re->nsub_ = 0; |
| 143 } |
| 144 delete re; |
| 145 } |
| 146 } |
| 147 |
| 148 void Regexp::AddRuneToString(Rune r) { |
| 149 DCHECK(op_ == kRegexpLiteralString); |
| 150 if (nrunes_ == 0) { |
| 151 // start with 8 |
| 152 runes_ = new Rune[8]; |
| 153 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { |
| 154 // double on powers of two |
| 155 Rune *old = runes_; |
| 156 runes_ = new Rune[nrunes_ * 2]; |
| 157 for (int i = 0; i < nrunes_; i++) |
| 158 runes_[i] = old[i]; |
| 159 delete[] old; |
| 160 } |
| 161 |
| 162 runes_[nrunes_++] = r; |
| 163 } |
| 164 |
| 165 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { |
| 166 Regexp* re = new Regexp(kRegexpHaveMatch, flags); |
| 167 re->match_id_ = match_id; |
| 168 return re; |
| 169 } |
| 170 |
| 171 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { |
| 172 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) |
| 173 return sub; |
| 174 Regexp* re = new Regexp(kRegexpPlus, flags); |
| 175 re->AllocSub(1); |
| 176 re->sub()[0] = sub; |
| 177 return re; |
| 178 } |
| 179 |
| 180 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { |
| 181 if (sub->op() == kRegexpStar && sub->parse_flags() == flags) |
| 182 return sub; |
| 183 Regexp* re = new Regexp(kRegexpStar, flags); |
| 184 re->AllocSub(1); |
| 185 re->sub()[0] = sub; |
| 186 return re; |
| 187 } |
| 188 |
| 189 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { |
| 190 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) |
| 191 return sub; |
| 192 Regexp* re = new Regexp(kRegexpQuest, flags); |
| 193 re->AllocSub(1); |
| 194 re->sub()[0] = sub; |
| 195 return re; |
| 196 } |
| 197 |
| 198 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, |
| 199 ParseFlags flags, bool can_factor) { |
| 200 if (nsub == 1) |
| 201 return sub[0]; |
| 202 |
| 203 Regexp** subcopy = NULL; |
| 204 if (op == kRegexpAlternate && can_factor) { |
| 205 // Going to edit sub; make a copy so we don't step on caller. |
| 206 subcopy = new Regexp*[nsub]; |
| 207 memmove(subcopy, sub, nsub * sizeof sub[0]); |
| 208 sub = subcopy; |
| 209 nsub = FactorAlternation(sub, nsub, flags); |
| 210 if (nsub == 1) { |
| 211 Regexp* re = sub[0]; |
| 212 delete[] subcopy; |
| 213 return re; |
| 214 } |
| 215 } |
| 216 |
| 217 if (nsub > kMaxNsub) { |
| 218 // Too many subexpressions to fit in a single Regexp. |
| 219 // Make a two-level tree. Two levels gets us to 65535^2. |
| 220 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; |
| 221 Regexp* re = new Regexp(op, flags); |
| 222 re->AllocSub(nbigsub); |
| 223 Regexp** subs = re->sub(); |
| 224 for (int i = 0; i < nbigsub - 1; i++) |
| 225 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); |
| 226 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, |
| 227 nsub - (nbigsub-1)*kMaxNsub, flags, |
| 228 false); |
| 229 delete[] subcopy; |
| 230 return re; |
| 231 } |
| 232 |
| 233 Regexp* re = new Regexp(op, flags); |
| 234 re->AllocSub(nsub); |
| 235 Regexp** subs = re->sub(); |
| 236 for (int i = 0; i < nsub; i++) |
| 237 subs[i] = sub[i]; |
| 238 |
| 239 delete[] subcopy; |
| 240 return re; |
| 241 } |
| 242 |
| 243 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { |
| 244 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); |
| 245 } |
| 246 |
| 247 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { |
| 248 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); |
| 249 } |
| 250 |
| 251 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { |
| 252 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); |
| 253 } |
| 254 |
| 255 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { |
| 256 Regexp* re = new Regexp(kRegexpCapture, flags); |
| 257 re->AllocSub(1); |
| 258 re->sub()[0] = sub; |
| 259 re->cap_ = cap; |
| 260 return re; |
| 261 } |
| 262 |
| 263 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { |
| 264 Regexp* re = new Regexp(kRegexpRepeat, flags); |
| 265 re->AllocSub(1); |
| 266 re->sub()[0] = sub; |
| 267 re->min_ = min; |
| 268 re->max_ = max; |
| 269 return re; |
| 270 } |
| 271 |
| 272 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { |
| 273 Regexp* re = new Regexp(kRegexpLiteral, flags); |
| 274 re->rune_ = rune; |
| 275 return re; |
| 276 } |
| 277 |
| 278 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { |
| 279 if (nrunes <= 0) |
| 280 return new Regexp(kRegexpEmptyMatch, flags); |
| 281 if (nrunes == 1) |
| 282 return NewLiteral(runes[0], flags); |
| 283 Regexp* re = new Regexp(kRegexpLiteralString, flags); |
| 284 for (int i = 0; i < nrunes; i++) |
| 285 re->AddRuneToString(runes[i]); |
| 286 return re; |
| 287 } |
| 288 |
| 289 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { |
| 290 Regexp* re = new Regexp(kRegexpCharClass, flags); |
| 291 re->cc_ = cc; |
| 292 return re; |
| 293 } |
| 294 |
| 295 // Swaps this and that in place. |
| 296 void Regexp::Swap(Regexp* that) { |
| 297 // Can use memmove because Regexp is just a struct (no vtable). |
| 298 char tmp[sizeof *this]; |
| 299 memmove(tmp, this, sizeof tmp); |
| 300 memmove(this, that, sizeof tmp); |
| 301 memmove(that, tmp, sizeof tmp); |
| 302 } |
| 303 |
| 304 // Tests equality of all top-level structure but not subregexps. |
| 305 static bool TopEqual(Regexp* a, Regexp* b) { |
| 306 if (a->op() != b->op()) |
| 307 return false; |
| 308 |
| 309 switch (a->op()) { |
| 310 case kRegexpNoMatch: |
| 311 case kRegexpEmptyMatch: |
| 312 case kRegexpAnyChar: |
| 313 case kRegexpAnyByte: |
| 314 case kRegexpBeginLine: |
| 315 case kRegexpEndLine: |
| 316 case kRegexpWordBoundary: |
| 317 case kRegexpNoWordBoundary: |
| 318 case kRegexpBeginText: |
| 319 return true; |
| 320 |
| 321 case kRegexpEndText: |
| 322 // The parse flags remember whether it's \z or (?-m:$), |
| 323 // which matters when testing against PCRE. |
| 324 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; |
| 325 |
| 326 case kRegexpLiteral: |
| 327 return a->rune() == b->rune() && |
| 328 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; |
| 329 |
| 330 case kRegexpLiteralString: |
| 331 return a->nrunes() == b->nrunes() && |
| 332 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && |
| 333 memcmp(a->runes(), b->runes(), |
| 334 a->nrunes() * sizeof a->runes()[0]) == 0; |
| 335 |
| 336 case kRegexpAlternate: |
| 337 case kRegexpConcat: |
| 338 return a->nsub() == b->nsub(); |
| 339 |
| 340 case kRegexpStar: |
| 341 case kRegexpPlus: |
| 342 case kRegexpQuest: |
| 343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; |
| 344 |
| 345 case kRegexpRepeat: |
| 346 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && |
| 347 a->min() == b->min() && |
| 348 a->max() == b->max(); |
| 349 |
| 350 case kRegexpCapture: |
| 351 return a->cap() == b->cap() && a->name() == b->name(); |
| 352 |
| 353 case kRegexpHaveMatch: |
| 354 return a->match_id() == b->match_id(); |
| 355 |
| 356 case kRegexpCharClass: { |
| 357 CharClass* acc = a->cc(); |
| 358 CharClass* bcc = b->cc(); |
| 359 return acc->size() == bcc->size() && |
| 360 acc->end() - acc->begin() == bcc->end() - bcc->begin() && |
| 361 memcmp(acc->begin(), bcc->begin(), |
| 362 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; |
| 363 } |
| 364 } |
| 365 |
| 366 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); |
| 367 return 0; |
| 368 } |
| 369 |
| 370 bool Regexp::Equal(Regexp* a, Regexp* b) { |
| 371 if (a == NULL || b == NULL) |
| 372 return a == b; |
| 373 |
| 374 if (!TopEqual(a, b)) |
| 375 return false; |
| 376 |
| 377 // Fast path: |
| 378 // return without allocating vector if there are no subregexps. |
| 379 switch (a->op()) { |
| 380 case kRegexpAlternate: |
| 381 case kRegexpConcat: |
| 382 case kRegexpStar: |
| 383 case kRegexpPlus: |
| 384 case kRegexpQuest: |
| 385 case kRegexpRepeat: |
| 386 case kRegexpCapture: |
| 387 break; |
| 388 |
| 389 default: |
| 390 return true; |
| 391 } |
| 392 |
| 393 // Committed to doing real work. |
| 394 // The stack (vector) has pairs of regexps waiting to |
| 395 // be compared. The regexps are only equal if |
| 396 // all the pairs end up being equal. |
| 397 vector<Regexp*> stk; |
| 398 |
| 399 for (;;) { |
| 400 // Invariant: TopEqual(a, b) == true. |
| 401 Regexp* a2; |
| 402 Regexp* b2; |
| 403 switch (a->op()) { |
| 404 default: |
| 405 break; |
| 406 case kRegexpAlternate: |
| 407 case kRegexpConcat: |
| 408 for (int i = 0; i < a->nsub(); i++) { |
| 409 a2 = a->sub()[i]; |
| 410 b2 = b->sub()[i]; |
| 411 if (!TopEqual(a2, b2)) |
| 412 return false; |
| 413 stk.push_back(a2); |
| 414 stk.push_back(b2); |
| 415 } |
| 416 break; |
| 417 |
| 418 case kRegexpStar: |
| 419 case kRegexpPlus: |
| 420 case kRegexpQuest: |
| 421 case kRegexpRepeat: |
| 422 case kRegexpCapture: |
| 423 a2 = a->sub()[0]; |
| 424 b2 = b->sub()[0]; |
| 425 if (!TopEqual(a2, b2)) |
| 426 return false; |
| 427 // Really: |
| 428 // stk.push_back(a2); |
| 429 // stk.push_back(b2); |
| 430 // break; |
| 431 // but faster to assign directly and loop. |
| 432 a = a2; |
| 433 b = b2; |
| 434 continue; |
| 435 } |
| 436 |
| 437 int n = stk.size(); |
| 438 if (n == 0) |
| 439 break; |
| 440 |
| 441 a = stk[n-2]; |
| 442 b = stk[n-1]; |
| 443 stk.resize(n-2); |
| 444 } |
| 445 |
| 446 return true; |
| 447 } |
| 448 |
| 449 // Keep in sync with enum RegexpStatusCode in regexp.h |
| 450 static const string kErrorStrings[] = { |
| 451 "no error", |
| 452 "unexpected error", |
| 453 "invalid escape sequence", |
| 454 "invalid character class", |
| 455 "invalid character class range", |
| 456 "missing ]", |
| 457 "missing )", |
| 458 "trailing \\", |
| 459 "no argument for repetition operator", |
| 460 "invalid repetition size", |
| 461 "bad repetition operator", |
| 462 "invalid perl operator", |
| 463 "invalid UTF-8", |
| 464 "invalid named capture group", |
| 465 }; |
| 466 |
| 467 const string& RegexpStatus::CodeText(enum RegexpStatusCode code) { |
| 468 if (code < 0 || code >= arraysize(kErrorStrings)) |
| 469 code = kRegexpInternalError; |
| 470 return kErrorStrings[code]; |
| 471 } |
| 472 |
| 473 string RegexpStatus::Text() const { |
| 474 if (error_arg_.empty()) |
| 475 return CodeText(code_); |
| 476 string s; |
| 477 s.append(CodeText(code_)); |
| 478 s.append(": "); |
| 479 s.append(error_arg_.data(), error_arg_.size()); |
| 480 return s; |
| 481 } |
| 482 |
| 483 void RegexpStatus::Copy(const RegexpStatus& status) { |
| 484 code_ = status.code_; |
| 485 error_arg_ = status.error_arg_; |
| 486 } |
| 487 |
| 488 typedef int Ignored; // Walker<void> doesn't exist |
| 489 |
| 490 // Walker subclass to count capturing parens in regexp. |
| 491 class NumCapturesWalker : public Regexp::Walker<Ignored> { |
| 492 public: |
| 493 NumCapturesWalker() : ncapture_(0) {} |
| 494 int ncapture() { return ncapture_; } |
| 495 |
| 496 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| 497 if (re->op() == kRegexpCapture) |
| 498 ncapture_++; |
| 499 return ignored; |
| 500 } |
| 501 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| 502 // Should never be called: we use Walk not WalkExponential. |
| 503 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; |
| 504 return ignored; |
| 505 } |
| 506 |
| 507 private: |
| 508 int ncapture_; |
| 509 DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker); |
| 510 }; |
| 511 |
| 512 int Regexp::NumCaptures() { |
| 513 NumCapturesWalker w; |
| 514 w.Walk(this, 0); |
| 515 return w.ncapture(); |
| 516 } |
| 517 |
| 518 // Walker class to build map of named capture groups and their indices. |
| 519 class NamedCapturesWalker : public Regexp::Walker<Ignored> { |
| 520 public: |
| 521 NamedCapturesWalker() : map_(NULL) {} |
| 522 ~NamedCapturesWalker() { delete map_; } |
| 523 |
| 524 map<string, int>* TakeMap() { |
| 525 map<string, int>* m = map_; |
| 526 map_ = NULL; |
| 527 return m; |
| 528 } |
| 529 |
| 530 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| 531 if (re->op() == kRegexpCapture && re->name() != NULL) { |
| 532 // Allocate map once we find a name. |
| 533 if (map_ == NULL) |
| 534 map_ = new map<string, int>; |
| 535 |
| 536 // Record first occurrence of each name. |
| 537 // (The rule is that if you have the same name |
| 538 // multiple times, only the leftmost one counts.) |
| 539 if (map_->find(*re->name()) == map_->end()) |
| 540 (*map_)[*re->name()] = re->cap(); |
| 541 } |
| 542 return ignored; |
| 543 } |
| 544 |
| 545 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| 546 // Should never be called: we use Walk not WalkExponential. |
| 547 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; |
| 548 return ignored; |
| 549 } |
| 550 |
| 551 private: |
| 552 map<string, int>* map_; |
| 553 DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker); |
| 554 }; |
| 555 |
| 556 map<string, int>* Regexp::NamedCaptures() { |
| 557 NamedCapturesWalker w; |
| 558 w.Walk(this, 0); |
| 559 return w.TakeMap(); |
| 560 } |
| 561 |
| 562 // Walker class to build map from capture group indices to their names. |
| 563 class CaptureNamesWalker : public Regexp::Walker<Ignored> { |
| 564 public: |
| 565 CaptureNamesWalker() : map_(NULL) {} |
| 566 ~CaptureNamesWalker() { delete map_; } |
| 567 |
| 568 map<int, string>* TakeMap() { |
| 569 map<int, string>* m = map_; |
| 570 map_ = NULL; |
| 571 return m; |
| 572 } |
| 573 |
| 574 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| 575 if (re->op() == kRegexpCapture && re->name() != NULL) { |
| 576 // Allocate map once we find a name. |
| 577 if (map_ == NULL) |
| 578 map_ = new map<int, string>; |
| 579 |
| 580 (*map_)[re->cap()] = *re->name(); |
| 581 } |
| 582 return ignored; |
| 583 } |
| 584 |
| 585 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| 586 // Should never be called: we use Walk not WalkExponential. |
| 587 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; |
| 588 return ignored; |
| 589 } |
| 590 |
| 591 private: |
| 592 map<int, string>* map_; |
| 593 DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker); |
| 594 }; |
| 595 |
| 596 map<int, string>* Regexp::CaptureNames() { |
| 597 CaptureNamesWalker w; |
| 598 w.Walk(this, 0); |
| 599 return w.TakeMap(); |
| 600 } |
| 601 |
| 602 // Determines whether regexp matches must be anchored |
| 603 // with a fixed string prefix. If so, returns the prefix and |
| 604 // the regexp that remains after the prefix. The prefix might |
| 605 // be ASCII case-insensitive. |
| 606 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { |
| 607 // No need for a walker: the regexp must be of the form |
| 608 // 1. some number of ^ anchors |
| 609 // 2. a literal char or string |
| 610 // 3. the rest |
| 611 prefix->clear(); |
| 612 *foldcase = false; |
| 613 *suffix = NULL; |
| 614 if (op_ != kRegexpConcat) |
| 615 return false; |
| 616 |
| 617 // Some number of anchors, then a literal or concatenation. |
| 618 int i = 0; |
| 619 Regexp** sub = this->sub(); |
| 620 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) |
| 621 i++; |
| 622 if (i == 0 || i >= nsub_) |
| 623 return false; |
| 624 |
| 625 Regexp* re = sub[i]; |
| 626 switch (re->op_) { |
| 627 default: |
| 628 return false; |
| 629 |
| 630 case kRegexpLiteralString: |
| 631 // Convert to string in proper encoding. |
| 632 if (re->parse_flags() & Latin1) { |
| 633 prefix->resize(re->nrunes_); |
| 634 for (int j = 0; j < re->nrunes_; j++) |
| 635 (*prefix)[j] = re->runes_[j]; |
| 636 } else { |
| 637 // Convert to UTF-8 in place. |
| 638 // Assume worst-case space and then trim. |
| 639 prefix->resize(re->nrunes_ * UTFmax); |
| 640 char *p = &(*prefix)[0]; |
| 641 for (int j = 0; j < re->nrunes_; j++) { |
| 642 Rune r = re->runes_[j]; |
| 643 if (r < Runeself) |
| 644 *p++ = r; |
| 645 else |
| 646 p += runetochar(p, &r); |
| 647 } |
| 648 prefix->resize(p - &(*prefix)[0]); |
| 649 } |
| 650 break; |
| 651 |
| 652 case kRegexpLiteral: |
| 653 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { |
| 654 prefix->append(1, re->rune_); |
| 655 } else { |
| 656 char buf[UTFmax]; |
| 657 prefix->append(buf, runetochar(buf, &re->rune_)); |
| 658 } |
| 659 break; |
| 660 } |
| 661 *foldcase = (sub[i]->parse_flags() & FoldCase); |
| 662 i++; |
| 663 |
| 664 // The rest. |
| 665 if (i < nsub_) { |
| 666 for (int j = i; j < nsub_; j++) |
| 667 sub[j]->Incref(); |
| 668 re = Concat(sub + i, nsub_ - i, parse_flags()); |
| 669 } else { |
| 670 re = new Regexp(kRegexpEmptyMatch, parse_flags()); |
| 671 } |
| 672 *suffix = re; |
| 673 return true; |
| 674 } |
| 675 |
| 676 // Character class builder is a balanced binary tree (STL set) |
| 677 // containing non-overlapping, non-abutting RuneRanges. |
| 678 // The less-than operator used in the tree treats two |
| 679 // ranges as equal if they overlap at all, so that |
| 680 // lookups for a particular Rune are possible. |
| 681 |
| 682 CharClassBuilder::CharClassBuilder() { |
| 683 nrunes_ = 0; |
| 684 upper_ = 0; |
| 685 lower_ = 0; |
| 686 } |
| 687 |
| 688 // Add lo-hi to the class; return whether class got bigger. |
| 689 bool CharClassBuilder::AddRange(Rune lo, Rune hi) { |
| 690 if (hi < lo) |
| 691 return false; |
| 692 |
| 693 if (lo <= 'z' && hi >= 'A') { |
| 694 // Overlaps some alpha, maybe not all. |
| 695 // Update bitmaps telling which ASCII letters are in the set. |
| 696 Rune lo1 = max<Rune>(lo, 'A'); |
| 697 Rune hi1 = min<Rune>(hi, 'Z'); |
| 698 if (lo1 <= hi1) |
| 699 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); |
| 700 |
| 701 lo1 = max<Rune>(lo, 'a'); |
| 702 hi1 = min<Rune>(hi, 'z'); |
| 703 if (lo1 <= hi1) |
| 704 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); |
| 705 } |
| 706 |
| 707 { // Check whether lo, hi is already in the class. |
| 708 iterator it = ranges_.find(RuneRange(lo, lo)); |
| 709 if (it != end() && it->lo <= lo && hi <= it->hi) |
| 710 return false; |
| 711 } |
| 712 |
| 713 // Look for a range abutting lo on the left. |
| 714 // If it exists, take it out and increase our range. |
| 715 if (lo > 0) { |
| 716 iterator it = ranges_.find(RuneRange(lo-1, lo-1)); |
| 717 if (it != end()) { |
| 718 lo = it->lo; |
| 719 if (it->hi > hi) |
| 720 hi = it->hi; |
| 721 nrunes_ -= it->hi - it->lo + 1; |
| 722 ranges_.erase(it); |
| 723 } |
| 724 } |
| 725 |
| 726 // Look for a range abutting hi on the right. |
| 727 // If it exists, take it out and increase our range. |
| 728 if (hi < Runemax) { |
| 729 iterator it = ranges_.find(RuneRange(hi+1, hi+1)); |
| 730 if (it != end()) { |
| 731 hi = it->hi; |
| 732 nrunes_ -= it->hi - it->lo + 1; |
| 733 ranges_.erase(it); |
| 734 } |
| 735 } |
| 736 |
| 737 // Look for ranges between lo and hi. Take them out. |
| 738 // This is only safe because the set has no overlapping ranges. |
| 739 // We've already removed any ranges abutting lo and hi, so |
| 740 // any that overlap [lo, hi] must be contained within it. |
| 741 for (;;) { |
| 742 iterator it = ranges_.find(RuneRange(lo, hi)); |
| 743 if (it == end()) |
| 744 break; |
| 745 nrunes_ -= it->hi - it->lo + 1; |
| 746 ranges_.erase(it); |
| 747 } |
| 748 |
| 749 // Finally, add [lo, hi]. |
| 750 nrunes_ += hi - lo + 1; |
| 751 ranges_.insert(RuneRange(lo, hi)); |
| 752 return true; |
| 753 } |
| 754 |
| 755 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { |
| 756 for (iterator it = cc->begin(); it != cc->end(); ++it) |
| 757 AddRange(it->lo, it->hi); |
| 758 } |
| 759 |
| 760 bool CharClassBuilder::Contains(Rune r) { |
| 761 return ranges_.find(RuneRange(r, r)) != end(); |
| 762 } |
| 763 |
| 764 // Does the character class behave the same on A-Z as on a-z? |
| 765 bool CharClassBuilder::FoldsASCII() { |
| 766 return ((upper_ ^ lower_) & AlphaMask) == 0; |
| 767 } |
| 768 |
| 769 CharClassBuilder* CharClassBuilder::Copy() { |
| 770 CharClassBuilder* cc = new CharClassBuilder; |
| 771 for (iterator it = begin(); it != end(); ++it) |
| 772 cc->ranges_.insert(RuneRange(it->lo, it->hi)); |
| 773 cc->upper_ = upper_; |
| 774 cc->lower_ = lower_; |
| 775 cc->nrunes_ = nrunes_; |
| 776 return cc; |
| 777 } |
| 778 |
| 779 |
| 780 |
| 781 void CharClassBuilder::RemoveAbove(Rune r) { |
| 782 if (r >= Runemax) |
| 783 return; |
| 784 |
| 785 if (r < 'z') { |
| 786 if (r < 'a') |
| 787 lower_ = 0; |
| 788 else |
| 789 lower_ &= AlphaMask >> ('z' - r); |
| 790 } |
| 791 |
| 792 if (r < 'Z') { |
| 793 if (r < 'A') |
| 794 upper_ = 0; |
| 795 else |
| 796 upper_ &= AlphaMask >> ('Z' - r); |
| 797 } |
| 798 |
| 799 for (;;) { |
| 800 |
| 801 iterator it = ranges_.find(RuneRange(r + 1, Runemax)); |
| 802 if (it == end()) |
| 803 break; |
| 804 RuneRange rr = *it; |
| 805 ranges_.erase(it); |
| 806 nrunes_ -= rr.hi - rr.lo + 1; |
| 807 if (rr.lo <= r) { |
| 808 rr.hi = r; |
| 809 ranges_.insert(rr); |
| 810 nrunes_ += rr.hi - rr.lo + 1; |
| 811 } |
| 812 } |
| 813 } |
| 814 |
| 815 void CharClassBuilder::Negate() { |
| 816 // Build up negation and then copy in. |
| 817 // Could edit ranges in place, but C++ won't let me. |
| 818 vector<RuneRange> v; |
| 819 v.reserve(ranges_.size() + 1); |
| 820 |
| 821 // In negation, first range begins at 0, unless |
| 822 // the current class begins at 0. |
| 823 iterator it = begin(); |
| 824 if (it == end()) { |
| 825 v.push_back(RuneRange(0, Runemax)); |
| 826 } else { |
| 827 int nextlo = 0; |
| 828 if (it->lo == 0) { |
| 829 nextlo = it->hi + 1; |
| 830 ++it; |
| 831 } |
| 832 for (; it != end(); ++it) { |
| 833 v.push_back(RuneRange(nextlo, it->lo - 1)); |
| 834 nextlo = it->hi + 1; |
| 835 } |
| 836 if (nextlo <= Runemax) |
| 837 v.push_back(RuneRange(nextlo, Runemax)); |
| 838 } |
| 839 |
| 840 ranges_.clear(); |
| 841 for (int i = 0; i < v.size(); i++) |
| 842 ranges_.insert(v[i]); |
| 843 |
| 844 upper_ = AlphaMask & ~upper_; |
| 845 lower_ = AlphaMask & ~lower_; |
| 846 nrunes_ = Runemax+1 - nrunes_; |
| 847 } |
| 848 |
| 849 // Character class is a sorted list of ranges. |
| 850 // The ranges are allocated in the same block as the header, |
| 851 // necessitating a special allocator and Delete method. |
| 852 |
| 853 CharClass* CharClass::New(int maxranges) { |
| 854 CharClass* cc; |
| 855 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; |
| 856 cc = reinterpret_cast<CharClass*>(data); |
| 857 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); |
| 858 cc->nranges_ = 0; |
| 859 cc->folds_ascii_ = false; |
| 860 cc->nrunes_ = 0; |
| 861 return cc; |
| 862 } |
| 863 |
| 864 void CharClass::Delete() { |
| 865 if (this == NULL) |
| 866 return; |
| 867 uint8 *data = reinterpret_cast<uint8*>(this); |
| 868 delete[] data; |
| 869 } |
| 870 |
| 871 CharClass* CharClass::Negate() { |
| 872 CharClass* cc = CharClass::New(nranges_+1); |
| 873 cc->folds_ascii_ = folds_ascii_; |
| 874 cc->nrunes_ = Runemax + 1 - nrunes_; |
| 875 int n = 0; |
| 876 int nextlo = 0; |
| 877 for (CharClass::iterator it = begin(); it != end(); ++it) { |
| 878 if (it->lo == nextlo) { |
| 879 nextlo = it->hi + 1; |
| 880 } else { |
| 881 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); |
| 882 nextlo = it->hi + 1; |
| 883 } |
| 884 } |
| 885 if (nextlo <= Runemax) |
| 886 cc->ranges_[n++] = RuneRange(nextlo, Runemax); |
| 887 cc->nranges_ = n; |
| 888 return cc; |
| 889 } |
| 890 |
| 891 bool CharClass::Contains(Rune r) { |
| 892 RuneRange* rr = ranges_; |
| 893 int n = nranges_; |
| 894 while (n > 0) { |
| 895 int m = n/2; |
| 896 if (rr[m].hi < r) { |
| 897 rr += m+1; |
| 898 n -= m+1; |
| 899 } else if (r < rr[m].lo) { |
| 900 n = m; |
| 901 } else { // rr[m].lo <= r && r <= rr[m].hi |
| 902 return true; |
| 903 } |
| 904 } |
| 905 return false; |
| 906 } |
| 907 |
| 908 CharClass* CharClassBuilder::GetCharClass() { |
| 909 CharClass* cc = CharClass::New(ranges_.size()); |
| 910 int n = 0; |
| 911 for (iterator it = begin(); it != end(); ++it) |
| 912 cc->ranges_[n++] = *it; |
| 913 cc->nranges_ = n; |
| 914 DCHECK_LE(n, ranges_.size()); |
| 915 cc->nrunes_ = nrunes_; |
| 916 cc->folds_ascii_ = FoldsASCII(); |
| 917 return cc; |
| 918 } |
| 919 |
| 920 } // namespace re2 |
OLD | NEW |