Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(86)

Side by Side Diff: src/jsregexp.h

Issue 9961009: Regexp: Unify the lookahead code for the BoyerMoore-like skipping (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | src/jsregexp.cc » ('j') | src/jsregexp.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 22 matching lines...) Expand all
33 #include "zone-inl.h" 33 #include "zone-inl.h"
34 34
35 namespace v8 { 35 namespace v8 {
36 namespace internal { 36 namespace internal {
37 37
38 class NodeVisitor; 38 class NodeVisitor;
39 class RegExpCompiler; 39 class RegExpCompiler;
40 class RegExpMacroAssembler; 40 class RegExpMacroAssembler;
41 class RegExpNode; 41 class RegExpNode;
42 class RegExpTree; 42 class RegExpTree;
43 class BoyerMooreLookahead;
43 44
44 class RegExpImpl { 45 class RegExpImpl {
45 public: 46 public:
46 // Whether V8 is compiled with native regexp support or not. 47 // Whether V8 is compiled with native regexp support or not.
47 static bool UsesNativeRegExp() { 48 static bool UsesNativeRegExp() {
48 #ifdef V8_INTERPRETED_REGEXP 49 #ifdef V8_INTERPRETED_REGEXP
49 return false; 50 return false;
50 #else 51 #else
51 return true; 52 return true;
52 #endif 53 #endif
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after
217 // Represents the location of one element relative to the intersection of 218 // Represents the location of one element relative to the intersection of
218 // two sets. Corresponds to the four areas of a Venn diagram. 219 // two sets. Corresponds to the four areas of a Venn diagram.
219 enum ElementInSetsRelation { 220 enum ElementInSetsRelation {
220 kInsideNone = 0, 221 kInsideNone = 0,
221 kInsideFirst = 1, 222 kInsideFirst = 1,
222 kInsideSecond = 2, 223 kInsideSecond = 2,
223 kInsideBoth = 3 224 kInsideBoth = 3
224 }; 225 };
225 226
226 227
227 // Represents the relation of two sets.
228 // Sets can be either disjoint, partially or fully overlapping, or equal.
229 class SetRelation BASE_EMBEDDED {
230 public:
231 // Relation is represented by a bit saying whether there are elements in
232 // one set that is not in the other, and a bit saying that there are elements
233 // that are in both sets.
234
235 // Location of an element. Corresponds to the internal areas of
236 // a Venn diagram.
237 enum {
238 kInFirst = 1 << kInsideFirst,
239 kInSecond = 1 << kInsideSecond,
240 kInBoth = 1 << kInsideBoth
241 };
242 SetRelation() : bits_(0) {}
243 ~SetRelation() {}
244 // Add the existence of objects in a particular
245 void SetElementsInFirstSet() { bits_ |= kInFirst; }
246 void SetElementsInSecondSet() { bits_ |= kInSecond; }
247 void SetElementsInBothSets() { bits_ |= kInBoth; }
248 // Check the currently known relation of the sets (common functions only,
249 // for other combinations, use value() to get the bits and check them
250 // manually).
251 // Sets are completely disjoint.
252 bool Disjoint() { return (bits_ & kInBoth) == 0; }
253 // Sets are equal.
254 bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }
255 // First set contains second.
256 bool Contains() { return (bits_ & kInSecond) == 0; }
257 // Second set contains first.
258 bool ContainedIn() { return (bits_ & kInFirst) == 0; }
259 bool NonTrivialIntersection() {
260 return (bits_ == (kInFirst | kInSecond | kInBoth));
261 }
262 int value() { return bits_; }
263
264 private:
265 int bits_;
266 };
267
268
269 class CharacterRange { 228 class CharacterRange {
270 public: 229 public:
271 CharacterRange() : from_(0), to_(0) { } 230 CharacterRange() : from_(0), to_(0) { }
272 // For compatibility with the CHECK_OK macro 231 // For compatibility with the CHECK_OK macro
273 CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT 232 CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT
274 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { } 233 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }
275 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges); 234 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges);
276 static Vector<const uc16> GetWordBounds(); 235 static Vector<const int> GetWordBounds();
277 static inline CharacterRange Singleton(uc16 value) { 236 static inline CharacterRange Singleton(uc16 value) {
278 return CharacterRange(value, value); 237 return CharacterRange(value, value);
279 } 238 }
280 static inline CharacterRange Range(uc16 from, uc16 to) { 239 static inline CharacterRange Range(uc16 from, uc16 to) {
281 ASSERT(from <= to); 240 ASSERT(from <= to);
282 return CharacterRange(from, to); 241 return CharacterRange(from, to);
283 } 242 }
284 static inline CharacterRange Everything() { 243 static inline CharacterRange Everything() {
285 return CharacterRange(0, 0xFFFF); 244 return CharacterRange(0, 0xFFFF);
286 } 245 }
287 bool Contains(uc16 i) { return from_ <= i && i <= to_; } 246 bool Contains(uc16 i) { return from_ <= i && i <= to_; }
288 uc16 from() const { return from_; } 247 uc16 from() const { return from_; }
289 void set_from(uc16 value) { from_ = value; } 248 void set_from(uc16 value) { from_ = value; }
290 uc16 to() const { return to_; } 249 uc16 to() const { return to_; }
291 void set_to(uc16 value) { to_ = value; } 250 void set_to(uc16 value) { to_ = value; }
292 bool is_valid() { return from_ <= to_; } 251 bool is_valid() { return from_ <= to_; }
293 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } 252 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
294 bool IsSingleton() { return (from_ == to_); } 253 bool IsSingleton() { return (from_ == to_); }
295 void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii); 254 void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii);
296 static void Split(ZoneList<CharacterRange>* base, 255 static void Split(ZoneList<CharacterRange>* base,
297 Vector<const uc16> overlay, 256 Vector<const int> overlay,
298 ZoneList<CharacterRange>** included, 257 ZoneList<CharacterRange>** included,
299 ZoneList<CharacterRange>** excluded); 258 ZoneList<CharacterRange>** excluded);
300 // Whether a range list is in canonical form: Ranges ordered by from value, 259 // Whether a range list is in canonical form: Ranges ordered by from value,
301 // and ranges non-overlapping and non-adjacent. 260 // and ranges non-overlapping and non-adjacent.
302 static bool IsCanonical(ZoneList<CharacterRange>* ranges); 261 static bool IsCanonical(ZoneList<CharacterRange>* ranges);
303 // Convert range list to canonical form. The characters covered by the ranges 262 // Convert range list to canonical form. The characters covered by the ranges
304 // will still be the same, but no character is in more than one range, and 263 // will still be the same, but no character is in more than one range, and
305 // adjacent ranges are merged. The resulting list may be shorter than the 264 // adjacent ranges are merged. The resulting list may be shorter than the
306 // original, but cannot be longer. 265 // original, but cannot be longer.
307 static void Canonicalize(ZoneList<CharacterRange>* ranges); 266 static void Canonicalize(ZoneList<CharacterRange>* ranges);
308 // Check how the set of characters defined by a CharacterRange list relates
309 // to the set of word characters. List must be in canonical form.
310 static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);
311 // Takes two character range lists (representing character sets) in canonical
312 // form and merges them.
313 // The characters that are only covered by the first set are added to
314 // first_set_only_out. the characters that are only in the second set are
315 // added to second_set_only_out, and the characters that are in both are
316 // added to both_sets_out.
317 // The pointers to first_set_only_out, second_set_only_out and both_sets_out
318 // should be to empty lists, but they need not be distinct, and may be NULL.
319 // If NULL, the characters are dropped, and if two arguments are the same
320 // pointer, the result is the union of the two sets that would be created
321 // if the pointers had been distinct.
322 // This way, the Merge function can compute all the usual set operations:
323 // union (all three out-sets are equal), intersection (only both_sets_out is
324 // non-NULL), and set difference (only first_set is non-NULL).
325 static void Merge(ZoneList<CharacterRange>* first_set,
326 ZoneList<CharacterRange>* second_set,
327 ZoneList<CharacterRange>* first_set_only_out,
328 ZoneList<CharacterRange>* second_set_only_out,
329 ZoneList<CharacterRange>* both_sets_out);
330 // Negate the contents of a character range in canonical form. 267 // Negate the contents of a character range in canonical form.
331 static void Negate(ZoneList<CharacterRange>* src, 268 static void Negate(ZoneList<CharacterRange>* src,
332 ZoneList<CharacterRange>* dst); 269 ZoneList<CharacterRange>* dst);
333 static const int kStartMarker = (1 << 24); 270 static const int kStartMarker = (1 << 24);
334 static const int kPayloadMask = (1 << 24) - 1; 271 static const int kPayloadMask = (1 << 24) - 1;
335 272
336 private: 273 private:
337 uc16 from_; 274 uc16 from_;
338 uc16 to_; 275 uc16 to_;
339 }; 276 };
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
414 private: 351 private:
415 // There can't be a static empty set since it allocates its 352 // There can't be a static empty set since it allocates its
416 // successors in a zone and caches them. 353 // successors in a zone and caches them.
417 OutSet* empty() { return &empty_; } 354 OutSet* empty() { return &empty_; }
418 OutSet empty_; 355 OutSet empty_;
419 ZoneSplayTree<Config>* tree() { return &tree_; } 356 ZoneSplayTree<Config>* tree() { return &tree_; }
420 ZoneSplayTree<Config> tree_; 357 ZoneSplayTree<Config> tree_;
421 }; 358 };
422 359
423 360
424 // Improve the speed that we scan for an initial point where a non-anchored
425 // regexp can match by using a Boyer-Moore-like table. This is done by
426 // identifying non-greedy non-capturing loops in the nodes that eat any
427 // character one at a time. For example in the middle of the regexp
428 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
429 // inserted at the start of any non-anchored regexp.
430 //
431 // When we have found such a loop we look ahead in the nodes to find the set of
432 // characters that can come at given distances. For example for the regexp
433 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
434 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
435 // the lookahead info where the set of characters is reasonably constrained. In
436 // our example this is from index 1 to 2 (0 is not constrained). We can now
437 // look 3 characters ahead and if we don't find one of [f, o] (the union of
438 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
439 //
440 // For Unicode input strings we do the same, but modulo 128.
441 //
442 // We also look at the first string fed to the regexp and use that to get a hint
443 // of the character frequencies in the inputs. This affects the assessment of
444 // whether the set of characters is 'reasonably constrained'.
445 //
446 // We also have another lookahead mechanism (called quick check in the code),
447 // which uses a wide load of multiple characters followed by a mask and compare
448 // to determine whether a match is possible at this point.
449 class BoyerMooreLookahead {
450 public:
451 BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler);
452
453 int length() { return length_; }
454 int max_char() { return max_char_; }
455 RegExpCompiler* compiler() { return compiler_; }
456
457 static const int kTooManyCharacters = 32;
458
459 int Count(int map_number) {
460 ZoneList<bool>* map = bitmaps_->at(map_number);
461 if (map == NULL) return map_length_;
462 int count = 0;
463 for (int i = 0; i < map_length_; i++) {
464 if (map->at(i)) count++;
465 }
466 return count;
467 }
468
469 void Set(int map_number, int character) {
470 if (character > max_char_) return;
471 ZoneList<bool>* map = bitmaps_->at(map_number);
472 if (map == NULL) return;
473 map->at(character & (map_length_ - 1)) = true;
474 }
475
476 void SetAll(int map_number) {
477 bitmaps_->at(map_number) = NULL;
478 }
479
480 void SetRest(int from_map) {
481 for (int i = from_map; i < length_; i++) SetAll(i);
482 }
483 bool EmitSkipInstructions(RegExpMacroAssembler* masm);
484
485 private:
486 // This is the value obtained by EatsAtLeast. If we do not have at least this
487 // many characters left in the sample string then the match is bound to fail.
488 // Therefore it is OK to read a character this far ahead of the current match
489 // point.
490 int length_;
491 // We conservatively consider all character values modulo this length. For
492 // ASCII there is no loss of precision, since this has a value of 128.
493 int map_length_;
494 RegExpCompiler* compiler_;
495 // 0x7f for ASCII, 0xffff for UTF-16.
496 int max_char_;
497 ZoneList<ZoneList<bool>*>* bitmaps_;
498
499 int GetSkipTable(int min_lookahead,
500 int max_lookahead,
501 Handle<ByteArray> boolean_skip_table);
502 bool FindWorthwhileInterval(int* from, int* to);
503 int FindBestInterval(
504 int max_number_of_chars, int old_biggest_points, int* from, int* to);
505 };
506
507
508 #define FOR_EACH_NODE_TYPE(VISIT) \ 361 #define FOR_EACH_NODE_TYPE(VISIT) \
509 VISIT(End) \ 362 VISIT(End) \
510 VISIT(Action) \ 363 VISIT(Action) \
511 VISIT(Choice) \ 364 VISIT(Choice) \
512 VISIT(BackReference) \ 365 VISIT(BackReference) \
513 VISIT(Assertion) \ 366 VISIT(Assertion) \
514 VISIT(Text) 367 VISIT(Text)
515 368
516 369
517 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 370 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after
680 uint32_t mask_; 533 uint32_t mask_;
681 uint32_t value_; 534 uint32_t value_;
682 // If set to true, there is no way this quick check can match at all. 535 // If set to true, there is no way this quick check can match at all.
683 // E.g., if it requires to be at the start of the input, and isn't. 536 // E.g., if it requires to be at the start of the input, and isn't.
684 bool cannot_match_; 537 bool cannot_match_;
685 }; 538 };
686 539
687 540
688 class RegExpNode: public ZoneObject { 541 class RegExpNode: public ZoneObject {
689 public: 542 public:
690 RegExpNode() : first_character_set_(NULL), trace_count_(0) { } 543 RegExpNode() : first_character_set_(NULL), trace_count_(0) {
544 bm_info_[0] = bm_info_[1] = NULL;
545 }
691 virtual ~RegExpNode(); 546 virtual ~RegExpNode();
692 virtual void Accept(NodeVisitor* visitor) = 0; 547 virtual void Accept(NodeVisitor* visitor) = 0;
693 // Generates a goto to this node or actually generates the code at this point. 548 // Generates a goto to this node or actually generates the code at this point.
694 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; 549 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
695 // How many characters must this node consume at a minimum in order to 550 // How many characters must this node consume at a minimum in order to
696 // succeed. If we have found at least 'still_to_find' characters that 551 // succeed. If we have found at least 'still_to_find' characters that
697 // must be consumed there is no need to ask any following nodes whether 552 // must be consumed there is no need to ask any following nodes whether
698 // they are sure to eat any more characters. The not_at_start argument is 553 // they are sure to eat any more characters. The not_at_start argument is
699 // used to indicate that we know we are not at the start of the input. In 554 // used to indicate that we know we are not at the start of the input. In
700 // this case anchored branches will always fail and can be ignored when 555 // this case anchored branches will always fail and can be ignored when
(...skipping 23 matching lines...) Expand all
724 // Only returns the successor for a text node of length 1 that matches any 579 // Only returns the successor for a text node of length 1 that matches any
725 // character and that has no guards on it. 580 // character and that has no guards on it.
726 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 581 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
727 RegExpCompiler* compiler) { 582 RegExpCompiler* compiler) {
728 return NULL; 583 return NULL;
729 } 584 }
730 585
731 // Collects information on the possible code units (mod 128) that can match if 586 // Collects information on the possible code units (mod 128) that can match if
732 // we look forward. This is used for a Boyer-Moore-like string searching 587 // we look forward. This is used for a Boyer-Moore-like string searching
733 // implementation. TODO(erikcorry): This should share more code with 588 // implementation. TODO(erikcorry): This should share more code with
734 // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet. 589 // EatsAtLeast, GetQuickCheckDetails.
735 virtual void FillInBMInfo( 590 virtual void FillInBMInfo(
736 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 591 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
737 UNREACHABLE(); 592 UNREACHABLE();
738 } 593 }
739 594
740 Label* label() { return &label_; } 595 Label* label() { return &label_; }
741 // If non-generic code is generated for a node (i.e. the node is not at the 596 // If non-generic code is generated for a node (i.e. the node is not at the
742 // start of the trace) then it cannot be reused. This variable sets a limit 597 // start of the trace) then it cannot be reused. This variable sets a limit
743 // on how often we allow that to happen before we insist on starting a new 598 // on how often we allow that to happen before we insist on starting a new
744 // trace and generating generic code for a node that can be reused by flushing 599 // trace and generating generic code for a node that can be reused by flushing
745 // the deferred actions in the current trace and generating a goto. 600 // the deferred actions in the current trace and generating a goto.
746 static const int kMaxCopiesCodeGenerated = 10; 601 static const int kMaxCopiesCodeGenerated = 10;
747 602
748 NodeInfo* info() { return &info_; } 603 NodeInfo* info() { return &info_; }
749 604
750 void AddSibling(RegExpNode* node) { siblings_.Add(node); } 605 void AddSibling(RegExpNode* node) { siblings_.Add(node); }
751 606
752 // Static version of EnsureSibling that expresses the fact that the 607 // Static version of EnsureSibling that expresses the fact that the
753 // result has the same type as the input. 608 // result has the same type as the input.
754 template <class C> 609 template <class C>
755 static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) { 610 static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) {
756 return static_cast<C*>(node->EnsureSibling(info, cloned)); 611 return static_cast<C*>(node->EnsureSibling(info, cloned));
757 } 612 }
758 613
759 SiblingList* siblings() { return &siblings_; } 614 SiblingList* siblings() { return &siblings_; }
760 void set_siblings(SiblingList* other) { siblings_ = *other; } 615 void set_siblings(SiblingList* other) { siblings_ = *other; }
761 616
762 // Return the set of possible next characters recognized by the regexp
763 // (or a safe subset, potentially the set of all characters).
764 ZoneList<CharacterRange>* FirstCharacterSet();
765
766 // Compute (if possible within the budget of traversed nodes) the
767 // possible first characters of the input matched by this node and
768 // its continuation. Returns the remaining budget after the computation.
769 // If the budget is spent, the result is negative, and the cached
770 // first_character_set_ value isn't set.
771 virtual int ComputeFirstCharacterSet(int budget);
772
773 // Get and set the cached first character set value. 617 // Get and set the cached first character set value.
774 ZoneList<CharacterRange>* first_character_set() { 618 ZoneList<CharacterRange>* first_character_set() {
775 return first_character_set_; 619 return first_character_set_;
776 } 620 }
777 void set_first_character_set(ZoneList<CharacterRange>* character_set) { 621 void set_first_character_set(ZoneList<CharacterRange>* character_set) {
778 first_character_set_ = character_set; 622 first_character_set_ = character_set;
779 } 623 }
624 BoyerMooreLookahead* bm_info(bool not_at_start) {
625 return bm_info_[not_at_start ? 1 : 0];
626 }
780 627
781 protected: 628 protected:
782 enum LimitResult { DONE, CONTINUE }; 629 enum LimitResult { DONE, CONTINUE };
783 static const int kComputeFirstCharacterSetFail = -1; 630 static const int kComputeFirstCharacterSetFail = -1;
784 631
785 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); 632 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
786 633
787 // Returns a sibling of this node whose interests and assumptions 634 // Returns a sibling of this node whose interests and assumptions
788 // match the ones in the given node info. If no sibling exists NULL 635 // match the ones in the given node info. If no sibling exists NULL
789 // is returned. 636 // is returned.
790 RegExpNode* TryGetSibling(NodeInfo* info); 637 RegExpNode* TryGetSibling(NodeInfo* info);
791 638
792 // Returns a sibling of this node whose interests match the ones in 639 // Returns a sibling of this node whose interests match the ones in
793 // the given node info. The info must not contain any assertions. 640 // the given node info. The info must not contain any assertions.
794 // If no node exists a new one will be created by cloning the current 641 // If no node exists a new one will be created by cloning the current
795 // node. The result will always be an instance of the same concrete 642 // node. The result will always be an instance of the same concrete
796 // class as this node. 643 // class as this node.
797 RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned); 644 RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned);
798 645
799 // Returns a clone of this node initialized using the copy constructor 646 // Returns a clone of this node initialized using the copy constructor
800 // of its concrete class. Note that the node may have to be pre- 647 // of its concrete class. Note that the node may have to be pre-
801 // processed before it is on a usable state. 648 // processed before it is on a usable state.
802 virtual RegExpNode* Clone() = 0; 649 virtual RegExpNode* Clone() = 0;
803 650
651 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
652 bm_info_[not_at_start ? 1 : 0] = bm;
653 }
654
804 private: 655 private:
805 static const int kFirstCharBudget = 10; 656 static const int kFirstCharBudget = 10;
806 Label label_; 657 Label label_;
807 NodeInfo info_; 658 NodeInfo info_;
808 SiblingList siblings_; 659 SiblingList siblings_;
809 ZoneList<CharacterRange>* first_character_set_; 660 ZoneList<CharacterRange>* first_character_set_;
810 // This variable keeps track of how many times code has been generated for 661 // This variable keeps track of how many times code has been generated for
811 // this node (in different traces). We don't keep track of where the 662 // this node (in different traces). We don't keep track of where the
812 // generated code is located unless the code is generated at the start of 663 // generated code is located unless the code is generated at the start of
813 // a trace, in which case it is generic and can be reused by flushing the 664 // a trace, in which case it is generic and can be reused by flushing the
814 // deferred operations in the current trace and generating a goto. 665 // deferred operations in the current trace and generating a goto.
815 int trace_count_; 666 int trace_count_;
667 BoyerMooreLookahead* bm_info_[2];
816 }; 668 };
817 669
818 670
819 // A simple closed interval. 671 // A simple closed interval.
820 class Interval { 672 class Interval {
821 public: 673 public:
822 Interval() : from_(kNone), to_(kNone) { } 674 Interval() : from_(kNone), to_(kNone) { }
823 Interval(int from, int to) : from_(from), to_(to) { } 675 Interval(int from, int to) : from_(from), to_(to) { }
824 Interval Union(Interval that) { 676 Interval Union(Interval that) {
825 if (that.from_ == kNone) 677 if (that.from_ == kNone)
826 return *this; 678 return *this;
827 else if (from_ == kNone) 679 else if (from_ == kNone)
828 return that; 680 return that;
829 else 681 else
830 return Interval(Min(from_, that.from_), Max(to_, that.to_)); 682 return Interval(Min(from_, that.from_), Max(to_, that.to_));
831 } 683 }
832 bool Contains(int value) { 684 bool Contains(int value) {
833 return (from_ <= value) && (value <= to_); 685 return (from_ <= value) && (value <= to_);
834 } 686 }
835 bool is_empty() { return from_ == kNone; } 687 bool is_empty() { return from_ == kNone; }
836 int from() { return from_; } 688 int from() const { return from_; }
837 int to() { return to_; } 689 int to() const { return to_; }
838 static Interval Empty() { return Interval(); } 690 static Interval Empty() { return Interval(); }
839 static const int kNone = -1; 691 static const int kNone = -1;
840 private: 692 private:
841 int from_; 693 int from_;
842 int to_; 694 int to_;
843 }; 695 };
844 696
845 697
846 class SeqRegExpNode: public RegExpNode { 698 class SeqRegExpNode: public RegExpNode {
847 public: 699 public:
848 explicit SeqRegExpNode(RegExpNode* on_success) 700 explicit SeqRegExpNode(RegExpNode* on_success)
849 : on_success_(on_success) { } 701 : on_success_(on_success) { }
850 RegExpNode* on_success() { return on_success_; } 702 RegExpNode* on_success() { return on_success_; }
851 void set_on_success(RegExpNode* node) { on_success_ = node; } 703 void set_on_success(RegExpNode* node) { on_success_ = node; }
852 virtual void FillInBMInfo( 704 virtual void FillInBMInfo(
853 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 705 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
854 on_success_->FillInBMInfo(offset, bm, not_at_start); 706 on_success_->FillInBMInfo(offset, bm, not_at_start);
707 if (offset == 0) set_bm_info(not_at_start, bm);
855 } 708 }
856 private: 709 private:
857 RegExpNode* on_success_; 710 RegExpNode* on_success_;
858 }; 711 };
859 712
860 713
861 class ActionNode: public SeqRegExpNode { 714 class ActionNode: public SeqRegExpNode {
862 public: 715 public:
863 enum Type { 716 enum Type {
864 SET_REGISTER, 717 SET_REGISTER,
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
898 bool not_at_start) { 751 bool not_at_start) {
899 return on_success()->GetQuickCheckDetails( 752 return on_success()->GetQuickCheckDetails(
900 details, compiler, filled_in, not_at_start); 753 details, compiler, filled_in, not_at_start);
901 } 754 }
902 virtual void FillInBMInfo( 755 virtual void FillInBMInfo(
903 int offset, BoyerMooreLookahead* bm, bool not_at_start); 756 int offset, BoyerMooreLookahead* bm, bool not_at_start);
904 Type type() { return type_; } 757 Type type() { return type_; }
905 // TODO(erikcorry): We should allow some action nodes in greedy loops. 758 // TODO(erikcorry): We should allow some action nodes in greedy loops.
906 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 759 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
907 virtual ActionNode* Clone() { return new ActionNode(*this); } 760 virtual ActionNode* Clone() { return new ActionNode(*this); }
908 virtual int ComputeFirstCharacterSet(int budget);
909 761
910 private: 762 private:
911 union { 763 union {
912 struct { 764 struct {
913 int reg; 765 int reg;
914 int value; 766 int value;
915 } u_store_register; 767 } u_store_register;
916 struct { 768 struct {
917 int reg; 769 int reg;
918 } u_increment_register; 770 } u_increment_register;
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
971 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 823 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
972 RegExpCompiler* compiler); 824 RegExpCompiler* compiler);
973 virtual void FillInBMInfo( 825 virtual void FillInBMInfo(
974 int offset, BoyerMooreLookahead* bm, bool not_at_start); 826 int offset, BoyerMooreLookahead* bm, bool not_at_start);
975 virtual TextNode* Clone() { 827 virtual TextNode* Clone() {
976 TextNode* result = new TextNode(*this); 828 TextNode* result = new TextNode(*this);
977 result->CalculateOffsets(); 829 result->CalculateOffsets();
978 return result; 830 return result;
979 } 831 }
980 void CalculateOffsets(); 832 void CalculateOffsets();
981 virtual int ComputeFirstCharacterSet(int budget);
982 833
983 private: 834 private:
984 enum TextEmitPassType { 835 enum TextEmitPassType {
985 NON_ASCII_MATCH, // Check for characters that can't match. 836 NON_ASCII_MATCH, // Check for characters that can't match.
986 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. 837 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
987 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. 838 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
988 CASE_CHARACTER_MATCH, // Case-independent single character check. 839 CASE_CHARACTER_MATCH, // Case-independent single character check.
989 CHARACTER_CLASS_MATCH // Character class. 840 CHARACTER_CLASS_MATCH // Character class.
990 }; 841 };
991 static bool SkipPass(int pass, bool ignore_case); 842 static bool SkipPass(int pass, bool ignore_case);
(...skipping 10 matching lines...) Expand all
1002 }; 853 };
1003 854
1004 855
1005 class AssertionNode: public SeqRegExpNode { 856 class AssertionNode: public SeqRegExpNode {
1006 public: 857 public:
1007 enum AssertionNodeType { 858 enum AssertionNodeType {
1008 AT_END, 859 AT_END,
1009 AT_START, 860 AT_START,
1010 AT_BOUNDARY, 861 AT_BOUNDARY,
1011 AT_NON_BOUNDARY, 862 AT_NON_BOUNDARY,
1012 AFTER_NEWLINE, 863 AFTER_NEWLINE
1013 // Types not directly expressible in regexp syntax.
1014 // Used for modifying a boundary node if its following character is
1015 // known to be word and/or non-word.
1016 AFTER_NONWORD_CHARACTER,
1017 AFTER_WORD_CHARACTER
1018 }; 864 };
1019 static AssertionNode* AtEnd(RegExpNode* on_success) { 865 static AssertionNode* AtEnd(RegExpNode* on_success) {
1020 return new AssertionNode(AT_END, on_success); 866 return new AssertionNode(AT_END, on_success);
1021 } 867 }
1022 static AssertionNode* AtStart(RegExpNode* on_success) { 868 static AssertionNode* AtStart(RegExpNode* on_success) {
1023 return new AssertionNode(AT_START, on_success); 869 return new AssertionNode(AT_START, on_success);
1024 } 870 }
1025 static AssertionNode* AtBoundary(RegExpNode* on_success) { 871 static AssertionNode* AtBoundary(RegExpNode* on_success) {
1026 return new AssertionNode(AT_BOUNDARY, on_success); 872 return new AssertionNode(AT_BOUNDARY, on_success);
1027 } 873 }
1028 static AssertionNode* AtNonBoundary(RegExpNode* on_success) { 874 static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
1029 return new AssertionNode(AT_NON_BOUNDARY, on_success); 875 return new AssertionNode(AT_NON_BOUNDARY, on_success);
1030 } 876 }
1031 static AssertionNode* AfterNewline(RegExpNode* on_success) { 877 static AssertionNode* AfterNewline(RegExpNode* on_success) {
1032 return new AssertionNode(AFTER_NEWLINE, on_success); 878 return new AssertionNode(AFTER_NEWLINE, on_success);
1033 } 879 }
1034 virtual void Accept(NodeVisitor* visitor); 880 virtual void Accept(NodeVisitor* visitor);
1035 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 881 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1036 virtual int EatsAtLeast(int still_to_find, 882 virtual int EatsAtLeast(int still_to_find,
1037 int recursion_depth, 883 int recursion_depth,
1038 bool not_at_start); 884 bool not_at_start);
1039 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 885 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1040 RegExpCompiler* compiler, 886 RegExpCompiler* compiler,
1041 int filled_in, 887 int filled_in,
1042 bool not_at_start); 888 bool not_at_start);
1043 virtual void FillInBMInfo( 889 virtual void FillInBMInfo(
1044 int offset, BoyerMooreLookahead* bm, bool not_at_start); 890 int offset, BoyerMooreLookahead* bm, bool not_at_start);
1045 virtual int ComputeFirstCharacterSet(int budget);
1046 virtual AssertionNode* Clone() { return new AssertionNode(*this); } 891 virtual AssertionNode* Clone() { return new AssertionNode(*this); }
1047 AssertionNodeType type() { return type_; } 892 AssertionNodeType type() { return type_; }
1048 void set_type(AssertionNodeType type) { type_ = type; } 893 void set_type(AssertionNodeType type) { type_ = type; }
1049 894
1050 private: 895 private:
896 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
897 enum IfPrevious { kIsNonWord, kIsWord };
898 void BacktrackIfPrevious(RegExpCompiler* compiler,
899 Trace* trace,
900 IfPrevious backtrack_if_previous);
1051 AssertionNode(AssertionNodeType t, RegExpNode* on_success) 901 AssertionNode(AssertionNodeType t, RegExpNode* on_success)
1052 : SeqRegExpNode(on_success), type_(t) { } 902 : SeqRegExpNode(on_success), type_(t) { }
1053 AssertionNodeType type_; 903 AssertionNodeType type_;
1054 }; 904 };
1055 905
1056 906
1057 class BackReferenceNode: public SeqRegExpNode { 907 class BackReferenceNode: public SeqRegExpNode {
1058 public: 908 public:
1059 BackReferenceNode(int start_reg, 909 BackReferenceNode(int start_reg,
1060 int end_reg, 910 int end_reg,
1061 RegExpNode* on_success) 911 RegExpNode* on_success)
1062 : SeqRegExpNode(on_success), 912 : SeqRegExpNode(on_success),
1063 start_reg_(start_reg), 913 start_reg_(start_reg),
1064 end_reg_(end_reg) { } 914 end_reg_(end_reg) { }
1065 virtual void Accept(NodeVisitor* visitor); 915 virtual void Accept(NodeVisitor* visitor);
1066 int start_register() { return start_reg_; } 916 int start_register() { return start_reg_; }
1067 int end_register() { return end_reg_; } 917 int end_register() { return end_reg_; }
1068 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 918 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1069 virtual int EatsAtLeast(int still_to_find, 919 virtual int EatsAtLeast(int still_to_find,
1070 int recursion_depth, 920 int recursion_depth,
1071 bool not_at_start); 921 bool not_at_start);
1072 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 922 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1073 RegExpCompiler* compiler, 923 RegExpCompiler* compiler,
1074 int characters_filled_in, 924 int characters_filled_in,
1075 bool not_at_start) { 925 bool not_at_start) {
1076 return; 926 return;
1077 } 927 }
1078 virtual void FillInBMInfo( 928 virtual void FillInBMInfo(
1079 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 929 int offset, BoyerMooreLookahead* bm, bool not_at_start);
1080 // Working out the set of characters that a backreference can match is too
1081 // hard, so we just say that any character can match.
1082 bm->SetRest(offset);
1083 }
1084 virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } 930 virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
1085 virtual int ComputeFirstCharacterSet(int budget);
1086 931
1087 private: 932 private:
1088 int start_reg_; 933 int start_reg_;
1089 int end_reg_; 934 int end_reg_;
1090 }; 935 };
1091 936
1092 937
1093 class EndNode: public RegExpNode { 938 class EndNode: public RegExpNode {
1094 public: 939 public:
1095 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; 940 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after
1242 virtual int EatsAtLeast(int still_to_find, 1087 virtual int EatsAtLeast(int still_to_find,
1243 int recursion_depth, 1088 int recursion_depth,
1244 bool not_at_start); 1089 bool not_at_start);
1245 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1090 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1246 RegExpCompiler* compiler, 1091 RegExpCompiler* compiler,
1247 int characters_filled_in, 1092 int characters_filled_in,
1248 bool not_at_start); 1093 bool not_at_start);
1249 virtual void FillInBMInfo( 1094 virtual void FillInBMInfo(
1250 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 1095 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
1251 alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start); 1096 alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start);
1097 if (offset == 0) set_bm_info(not_at_start, bm);
1252 } 1098 }
1253 // For a negative lookahead we don't emit the quick check for the 1099 // For a negative lookahead we don't emit the quick check for the
1254 // alternative that is expected to fail. This is because quick check code 1100 // alternative that is expected to fail. This is because quick check code
1255 // starts by loading enough characters for the alternative that takes fewest 1101 // starts by loading enough characters for the alternative that takes fewest
1256 // characters, but on a negative lookahead the negative branch did not take 1102 // characters, but on a negative lookahead the negative branch did not take
1257 // part in that calculation (EatsAtLeast) so the assumptions don't hold. 1103 // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1258 virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; } 1104 virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
1259 virtual int ComputeFirstCharacterSet(int budget);
1260 }; 1105 };
1261 1106
1262 1107
1263 class LoopChoiceNode: public ChoiceNode { 1108 class LoopChoiceNode: public ChoiceNode {
1264 public: 1109 public:
1265 explicit LoopChoiceNode(bool body_can_be_zero_length) 1110 explicit LoopChoiceNode(bool body_can_be_zero_length)
1266 : ChoiceNode(2), 1111 : ChoiceNode(2),
1267 loop_node_(NULL), 1112 loop_node_(NULL),
1268 continue_node_(NULL), 1113 continue_node_(NULL),
1269 body_can_be_zero_length_(body_can_be_zero_length) { } 1114 body_can_be_zero_length_(body_can_be_zero_length) { }
1270 void AddLoopAlternative(GuardedAlternative alt); 1115 void AddLoopAlternative(GuardedAlternative alt);
1271 void AddContinueAlternative(GuardedAlternative alt); 1116 void AddContinueAlternative(GuardedAlternative alt);
1272 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 1117 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1273 virtual int EatsAtLeast(int still_to_find, 1118 virtual int EatsAtLeast(int still_to_find,
1274 int recursion_depth, 1119 int recursion_depth,
1275 bool not_at_start); 1120 bool not_at_start);
1276 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1121 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1277 RegExpCompiler* compiler, 1122 RegExpCompiler* compiler,
1278 int characters_filled_in, 1123 int characters_filled_in,
1279 bool not_at_start); 1124 bool not_at_start);
1280 virtual void FillInBMInfo( 1125 virtual void FillInBMInfo(
1281 int offset, BoyerMooreLookahead* bm, bool not_at_start); 1126 int offset, BoyerMooreLookahead* bm, bool not_at_start);
1282 virtual int ComputeFirstCharacterSet(int budget);
1283 virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } 1127 virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
1284 RegExpNode* loop_node() { return loop_node_; } 1128 RegExpNode* loop_node() { return loop_node_; }
1285 RegExpNode* continue_node() { return continue_node_; } 1129 RegExpNode* continue_node() { return continue_node_; }
1286 bool body_can_be_zero_length() { return body_can_be_zero_length_; } 1130 bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1287 virtual void Accept(NodeVisitor* visitor); 1131 virtual void Accept(NodeVisitor* visitor);
1288 1132
1289 private: 1133 private:
1290 // AddAlternative is made private for loop nodes because alternatives 1134 // AddAlternative is made private for loop nodes because alternatives
1291 // should not be added freely, we need to keep track of which node 1135 // should not be added freely, we need to keep track of which node
1292 // goes back to the node itself. 1136 // goes back to the node itself.
1293 void AddAlternative(GuardedAlternative node) { 1137 void AddAlternative(GuardedAlternative node) {
1294 ChoiceNode::AddAlternative(node); 1138 ChoiceNode::AddAlternative(node);
1295 } 1139 }
1296 1140
1297 RegExpNode* loop_node_; 1141 RegExpNode* loop_node_;
1298 RegExpNode* continue_node_; 1142 RegExpNode* continue_node_;
1299 bool body_can_be_zero_length_; 1143 bool body_can_be_zero_length_;
1300 }; 1144 };
1301 1145
1302 1146
1147 // Improve the speed that we scan for an initial point where a non-anchored
1148 // regexp can match by using a Boyer-Moore-like table. This is done by
1149 // identifying non-greedy non-capturing loops in the nodes that eat any
1150 // character one at a time. For example in the middle of the regexp
1151 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1152 // inserted at the start of any non-anchored regexp.
1153 //
1154 // When we have found such a loop we look ahead in the nodes to find the set of
1155 // characters that can come at given distances. For example for the regexp
1156 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1157 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1158 // the lookahead info where the set of characters is reasonably constrained. In
1159 // our example this is from index 1 to 2 (0 is not constrained). We can now
1160 // look 3 characters ahead and if we don't find one of [f, o] (the union of
1161 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1162 //
1163 // For Unicode input strings we do the same, but modulo 128.
1164 //
1165 // We also look at the first string fed to the regexp and use that to get a hint
1166 // of the character frequencies in the inputs. This affects the assessment of
1167 // whether the set of characters is 'reasonably constrained'.
1168 //
1169 // We also have another lookahead mechanism (called quick check in the code),
1170 // which uses a wide load of multiple characters followed by a mask and compare
1171 // to determine whether a match is possible at this point.
1172 enum ContainedInLattice {
1173 kNotYet = 0,
1174 kLatticeIn = 1,
1175 kLatticeOut = 2,
1176 kLatticeUnknown = 3 // Can also mean both in and out.
1177 };
1178
1179
1180 ContainedInLattice AddRange(ContainedInLattice a,
1181 const int* ranges,
1182 int ranges_size,
1183 Interval new_range);
1184
1185
1186 class BoyerMoorePositionInfo : public ZoneObject {
1187 public:
1188 BoyerMoorePositionInfo()
1189 : map_(new ZoneList<bool>(kMapSize)),
1190 map_count_(0),
1191 w_(kNotYet),
1192 s_(kNotYet),
1193 d_(kNotYet),
1194 surrogate_(kNotYet) {
1195 for (int i = 0; i < kMapSize; i++) {
1196 map_->Add(false);
1197 }
1198 }
1199
1200 bool& at(int i) { return map_->at(i); }
1201
1202 static const int kMapSize = 128;
1203 static const int kMask = kMapSize -1 ;
ulan 2012/04/16 13:52:22 Should be kMapSize - 1;
Erik Corry 2012/04/17 07:51:45 Done.
1204
1205 int map_count() const { return map_count_; }
1206
1207 void Set(int character);
1208 void SetInterval(const Interval& interval);
1209 void SetAll();
1210 bool is_non_word() { return w_ == kLatticeOut; }
1211 bool is_word() { return w_ == kLatticeIn; }
1212
1213 private:
1214 ZoneList<bool>* map_;
1215 int map_count_; // Number of set bits in the map.
1216 ContainedInLattice w_; // The \w character class.
1217 ContainedInLattice s_; // The \s character class.
1218 ContainedInLattice d_; // The \d character class.
1219 ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1220 };
1221
1222
1223 class BoyerMooreLookahead : public ZoneObject {
1224 public:
1225 BoyerMooreLookahead(int length, RegExpCompiler* compiler);
1226
1227 int length() { return length_; }
1228 int max_char() { return max_char_; }
1229 RegExpCompiler* compiler() { return compiler_; }
1230
1231 int Count(int map_number) {
1232 return bitmaps_->at(map_number)->map_count();
1233 }
1234
1235 BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1236
1237 void Set(int map_number, int character) {
1238 if (character > max_char_) return;
1239 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1240 info->Set(character);
1241 }
1242
1243 void SetInterval(int map_number, const Interval& interval) {
1244 if (interval.from() > max_char_) return;
1245 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1246 if (interval.to() > max_char_) {
1247 info->SetInterval(Interval(interval.from(), max_char_));
1248 } else {
1249 info->SetInterval(interval);
1250 }
1251 }
1252
1253 void SetAll(int map_number) {
1254 bitmaps_->at(map_number)->SetAll();
1255 }
1256
1257 void SetRest(int from_map) {
1258 for (int i = from_map; i < length_; i++) SetAll(i);
1259 }
1260 bool EmitSkipInstructions(RegExpMacroAssembler* masm);
1261
1262 private:
1263 // This is the value obtained by EatsAtLeast. If we do not have at least this
1264 // many characters left in the sample string then the match is bound to fail.
1265 // Therefore it is OK to read a character this far ahead of the current match
1266 // point.
1267 int length_;
1268 RegExpCompiler* compiler_;
1269 // 0x7f for ASCII, 0xffff for UTF-16.
1270 int max_char_;
1271 ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1272
1273 int GetSkipTable(int min_lookahead,
1274 int max_lookahead,
1275 Handle<ByteArray> boolean_skip_table);
1276 bool FindWorthwhileInterval(int* from, int* to);
1277 int FindBestInterval(
1278 int max_number_of_chars, int old_biggest_points, int* from, int* to);
1279 };
1280
1281
1303 // There are many ways to generate code for a node. This class encapsulates 1282 // There are many ways to generate code for a node. This class encapsulates
1304 // the current way we should be generating. In other words it encapsulates 1283 // the current way we should be generating. In other words it encapsulates
1305 // the current state of the code generator. The effect of this is that we 1284 // the current state of the code generator. The effect of this is that we
1306 // generate code for paths that the matcher can take through the regular 1285 // generate code for paths that the matcher can take through the regular
1307 // expression. A given node in the regexp can be code-generated several times 1286 // expression. A given node in the regexp can be code-generated several times
1308 // as it can be part of several traces. For example for the regexp: 1287 // as it can be part of several traces. For example for the regexp:
1309 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part 1288 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1310 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code 1289 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1311 // to match foo is generated only once (the traces have a common prefix). The 1290 // to match foo is generated only once (the traces have a common prefix). The
1312 // code to store the capture is deferred and generated (twice) after the places 1291 // code to store the capture is deferred and generated (twice) after the places
(...skipping 314 matching lines...) Expand 10 before | Expand all | Expand 10 after
1627 int* vector_; 1606 int* vector_;
1628 int offsets_vector_length_; 1607 int offsets_vector_length_;
1629 1608
1630 friend class ExternalReference; 1609 friend class ExternalReference;
1631 }; 1610 };
1632 1611
1633 1612
1634 } } // namespace v8::internal 1613 } } // namespace v8::internal
1635 1614
1636 #endif // V8_JSREGEXP_H_ 1615 #endif // V8_JSREGEXP_H_
OLDNEW
« no previous file with comments | « no previous file | src/jsregexp.cc » ('j') | src/jsregexp.cc » ('J')

Powered by Google App Engine
This is Rietveld 408576698