Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(147)

Side by Side Diff: src/jsregexp.cc

Issue 8188: Some new regexp infrastructure. (Closed)
Patch Set: Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution. 11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its 12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived 13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission. 14 // from this software without specific prior written permission.
15 // 15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 #include "v8.h" 28 #include "v8.h"
29 29
30 #include "ast.h"
30 #include "execution.h" 31 #include "execution.h"
31 #include "factory.h" 32 #include "factory.h"
32 #include "jsregexp.h" 33 #include "jsregexp.h"
33 #include "third_party/jscre/pcre.h" 34 #include "third_party/jscre/pcre.h"
34 #include "platform.h" 35 #include "platform.h"
35 #include "runtime.h" 36 #include "runtime.h"
36 #include "top.h" 37 #include "top.h"
37 #include "compilation-cache.h" 38 #include "compilation-cache.h"
39 #include "string-stream.h"
40 #include <set>
38 41
39 namespace v8 { namespace internal { 42 namespace v8 { namespace internal {
40 43
41 44
42 #define CAPTURE_INDEX 0 45 #define CAPTURE_INDEX 0
43 #define INTERNAL_INDEX 1 46 #define INTERNAL_INDEX 1
44 47
45 static Failure* malloc_failure; 48 static Failure* malloc_failure;
46 49
47 static void* JSREMalloc(size_t size) { 50 static void* JSREMalloc(size_t size) {
(...skipping 483 matching lines...) Expand 10 before | Expand all | Expand 10 after
531 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); 534 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
532 return Smi::cast(value->get(CAPTURE_INDEX))->value(); 535 return Smi::cast(value->get(CAPTURE_INDEX))->value();
533 } 536 }
534 537
535 538
536 ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) { 539 ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) {
537 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); 540 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
538 return ByteArray::cast(value->get(INTERNAL_INDEX)); 541 return ByteArray::cast(value->get(INTERNAL_INDEX));
539 } 542 }
540 543
544
545 // -------------------------------------------------------------------
546 // New regular expression engine
547
548
549 template <typename Char> class ExecutionState;
550
551
552 template <typename Char>
553 class DotPrinter {
554 public:
555 DotPrinter() : stream_(&alloc_) { }
556 void PrintNode(RegExpNode<Char>* node);
557 void Visit(RegExpNode<Char>* node);
558 StringStream* stream() { return &stream_; }
559 private:
560 HeapStringAllocator alloc_;
561 StringStream stream_;
562 std::set<RegExpNode<Char>*> seen_;
563 };
564
565
566 template <typename Char>
567 class RegExpCompiler: public RegExpVisitor {
568 public:
569 RegExpCompiler() { }
570 RegExpNode<Char>* Compile(RegExpTree* tree, RegExpNode<Char>* rest) {
571 return static_cast<RegExpNode<Char>*>(tree->Accept(this, rest));
572 }
573 #define MAKE_CASE(Name) virtual void* Visit##Name(RegExp##Name*, void*);
574 FOR_EACH_REG_EXP_NODE_TYPE(MAKE_CASE)
575 #undef MAKE_CASE
576 };
577
578
579 template <typename Char>
580 class RegExpNode: public ZoneObject {
581 public:
582 virtual ~RegExpNode() { }
583 virtual void EmitDot(DotPrinter<Char>* out);
584 virtual bool Step(ExecutionState<Char>* state) = 0;
585 };
586
587
588 template <typename Char>
589 class SeqRegExpNode: public RegExpNode<Char> {
590 public:
591 SeqRegExpNode(RegExpNode<Char>* next) : next_(next) { }
592 RegExpNode<Char>* next() { return next_; }
593 private:
594 RegExpNode<Char>* next_;
595 };
596
597
598 template <typename Char>
599 class EndNode: public RegExpNode<Char> {
600 public:
601 virtual void EmitDot(DotPrinter<Char>* out);
602 virtual bool Step(ExecutionState<Char>* state);
603 };
604
605
606 template <typename Char>
607 class AtomNode: public SeqRegExpNode<Char> {
608 public:
609 AtomNode(Vector<const uc16> data, RegExpNode<Char>* next)
610 : SeqRegExpNode<Char>(next),
611 data_(data) { }
612 virtual void EmitDot(DotPrinter<Char>* out);
613 virtual bool Step(ExecutionState<Char>* state);
614 Vector<const uc16> data() { return data_; }
615 private:
616 Vector<const uc16> data_;
617 };
618
619
620 template <typename Char>
621 class CharacterClassNode: public SeqRegExpNode<Char> {
622 public:
623 CharacterClassNode(ZoneList<CharacterRange>* ranges, RegExpNode<Char>* next)
624 : SeqRegExpNode<Char>(next),
625 ranges_(ranges) { }
626 virtual void EmitDot(DotPrinter<Char>* out);
627 virtual bool Step(ExecutionState<Char>* state);
628 ZoneList<CharacterRange>* ranges() { return ranges_; }
629 private:
630 ZoneList<CharacterRange>* ranges_;
631 };
632
633
634 template <typename Char>
635 class ChoiceNode: public RegExpNode<Char> {
636 public:
637 ChoiceNode(ZoneList<RegExpNode<Char>*>* choices) : choices_(choices) { }
638 virtual void EmitDot(DotPrinter<Char>* out);
639 virtual bool Step(ExecutionState<Char>* state);
640 ZoneList<RegExpNode<Char>*>* choices() { return choices_; }
641 private:
642 ZoneList<RegExpNode<Char>*>* choices_;
643 };
644
645
646 // -------------------------------------------------------------------
647 // Dot/dotty output
Erik Corry 2008/10/27 14:58:44 Cool idea :-)
648
649
650 template <typename Char>
651 void DotPrinter<Char>::PrintNode(RegExpNode<Char>* node) {
652 stream()->Add("digraph G {\n");
653 Visit(node);
654 stream()->Add("}\n");
655 printf("%s", *(stream()->ToCString()));
656 }
657
658
659 template <typename Char>
660 void DotPrinter<Char>::Visit(RegExpNode<Char>* node) {
661 if (seen_.find(node) != seen_.end())
662 return;
663 seen_.insert(node);
664 node->EmitDot(this);
665 }
666
667
668 template <typename Char>
669 void RegExpNode<Char>::EmitDot(DotPrinter<Char>* out) {
670 UNIMPLEMENTED();
671 }
672
673
674 template <typename Char>
675 void ChoiceNode<Char>::EmitDot(DotPrinter<Char>* out) {
676 out->stream()->Add("n%p [label=\"?\"];\n", this);
677 for (int i = 0; i < choices()->length(); i++) {
678 out->stream()->Add("n%p -> n%p [label=\"%i\"];\n", this, choices()->at(i), i );
679 out->Visit(choices()->at(i));
680 }
681 }
682
683
684 template <typename Char>
685 void AtomNode<Char>::EmitDot(DotPrinter<Char>* out) {
686 out->stream()->Add("n%p [label=\"'%w'\"];\n", this, data());
687 out->stream()->Add("n%p -> n%p;\n", this, this->next());
688 out->Visit(this->next());
689 }
690
691
692 template <typename Char>
693 void EndNode<Char>::EmitDot(DotPrinter<Char>* out) {
694 out->stream()->Add("n%p [style=bold, label=\"done\"];\n", this);
695 }
696
697
698 template <typename Char>
699 void CharacterClassNode<Char>::EmitDot(DotPrinter<Char>* out) {
700 out->stream()->Add("n%p [label=\"[...]\"];\n", this);
701 out->stream()->Add("n%p -> n%p;\n", this, this->next());
702 out->Visit(this->next());
703 }
704
705
706 // -------------------------------------------------------------------
707 // Tree to graph conversion
708
709
710 template <typename Char>
711 void* RegExpCompiler<Char>::VisitAtom(RegExpAtom* that, void* rest) {
712 return new AtomNode<Char>(that->data(),
713 static_cast<RegExpNode<Char>*>(rest));
714 }
715
716
717 template <typename Char>
718 void* RegExpCompiler<Char>::VisitCharacterClass(RegExpCharacterClass* that,
719 void* rest) {
720 return new CharacterClassNode<Char>(that->ranges(),
721 static_cast<RegExpNode<Char>*>(rest));
722 }
723
724
725 template <typename Char>
726 void* RegExpCompiler<Char>::VisitDisjunction(RegExpDisjunction* that,
727 void* rest_ptr) {
728 RegExpNode<Char>* rest = static_cast<RegExpNode<Char>*>(rest_ptr);
729 ZoneList<RegExpTree*>* children = that->nodes();
730 int length = children->length();
731 ZoneList<RegExpNode<Char>*>* choices
732 = new ZoneList<RegExpNode<Char>*>(length);
733 for (int i = 0; i < length; i++)
734 choices->Add(Compile(children->at(i), rest));
735 return new ChoiceNode<Char>(choices);
736 }
737
738
739 template <typename Char>
740 void* RegExpCompiler<Char>::VisitQuantifier(RegExpQuantifier* that,
741 void* rest_ptr) {
742 RegExpNode<Char>* rest = static_cast<RegExpNode<Char>*>(rest_ptr);
743 if (that->max() >= RegExpQuantifier::kInfinity) {
744 // Don't try to count the number of iterations if the max it too
745 // large.
746 if (that->min() != 0) {
747 UNIMPLEMENTED();
748 }
749 ZoneList<RegExpNode<Char>*>* loop_choices
750 = new ZoneList<RegExpNode<Char>*>(2);
751 RegExpNode<Char>* loop_node = new ChoiceNode<Char>(loop_choices);
752 RegExpNode<Char>* body_node = Compile(that->body(), loop_node);
753 if (that->is_greedy()) {
754 loop_choices->Add(body_node);
755 loop_choices->Add(rest);
756 } else {
757 loop_choices->Add(rest);
758 loop_choices->Add(body_node);
759 }
760 return loop_node;
761 } else {
762 UNIMPLEMENTED();
763 return NULL;
764 }
765 }
766
767
768 template <typename Char>
769 void* RegExpCompiler<Char>::VisitAssertion(RegExpAssertion* that,
770 void* rest) {
771 UNIMPLEMENTED();
772 return NULL;
773 }
774
775
776 template <typename Char>
777 void* RegExpCompiler<Char>::VisitCapture(RegExpCapture* that, void* rest) {
778 UNIMPLEMENTED();
779 return NULL;
780 }
781
782
783 template <typename Char>
784 void* RegExpCompiler<Char>::VisitLookahead(RegExpLookahead* that,
785 void* rest) {
786 UNIMPLEMENTED();
787 return NULL;
788 }
789
790
791 template <typename Char>
792 void* RegExpCompiler<Char>::VisitEmpty(RegExpEmpty* that, void* rest) {
793 return rest;
794 }
795
796
797 template <typename Char>
798 void* RegExpCompiler<Char>::VisitAlternative(RegExpAlternative* that,
799 void* rest) {
800 ZoneList<RegExpTree*>* children = that->nodes();
801 RegExpNode<Char>* current = static_cast<RegExpNode<Char>*>(rest);
802 for (int i = children->length() - 1; i >= 0; i--) {
803 current = Compile(children->at(i), current);
804 }
805 return current;
806 }
807
808
809 // -------------------------------------------------------------------
810 // Execution
811
812
813 template <typename Char>
814 class ExecutionState {
815 public:
816 ExecutionState(RegExpNode<Char>* start, Vector<Char> input)
817 : current_(start),
818 input_(input),
819 pos_(0),
820 backtrack_stack_(8),
821 is_done_(false) { }
822
823 class BacktrackState {
824 public:
825 BacktrackState(ChoiceNode<Char>* choice, int next, int pos)
826 : choice_(choice),
827 next_(next),
828 pos_(pos) { }
829 ChoiceNode<Char>* choice() { return choice_; }
830 int next() { return next_; }
831 void set_next(int value) { next_ = value; }
832 int pos() { return pos_; }
833 private:
834 ChoiceNode<Char>* choice_;
835 int next_;
836 int pos_;
837 };
838
839 // Execute a single step, returning true if it succeeded
840 inline bool Step() { return current()->Step(this); }
841
842 // Stores the given choice node and the execution state on the
843 // backtrack stack.
844 void SaveBacktrack(ChoiceNode<Char>* choice);
845
846 // Reverts to the next unused backtrack if there is one. Returns
847 // false exactly if there was no backtrack to restore.
848 bool Backtrack();
849
850 Char current_char() { return input()[pos()]; }
851
852 void Advance(int delta, RegExpNode<Char>* next) {
853 pos_ += delta;
854 current_ = next;
855 }
856
857 bool AtEnd() { return pos_ >= input_.length(); }
858
859 bool is_done() { return is_done_; }
860 void set_done() { is_done_ = true; }
861
862 List<BacktrackState>* backtrack_stack() { return &backtrack_stack_; }
863 RegExpNode<Char>* current() { return current_; }
864 void set_current(RegExpNode<Char>* value) { current_ = value; }
865 Vector<Char> input() { return input_; }
866 int pos() { return pos_; }
867 private:
868 RegExpNode<Char>* current_;
869 Vector<Char> input_;
870 int pos_;
871 List<BacktrackState> backtrack_stack_;
872 bool is_done_;
873 };
874
875
876 template <typename Char>
877 void ExecutionState<Char>::SaveBacktrack(ChoiceNode<Char>* choice) {
878 ASSERT(choice->choices()->length() > 1);
879 if (FLAG_trace_regexps) {
880 PrintF("Setting up backtrack on level %i for choice %p\n",
881 backtrack_stack()->length(),
882 choice);
883 }
884 backtrack_stack()->Add(BacktrackState(choice, 1, pos_));
885 }
886
887
888 template <typename Char>
889 bool ExecutionState<Char>::Backtrack() {
890 if (backtrack_stack()->is_empty()) return false;
891 BacktrackState& top = backtrack_stack()->at(backtrack_stack()->length() - 1);
892 ZoneList<RegExpNode<Char>*>* choices = top.choice()->choices();
893 int next_index = top.next();
894 current_ = choices->at(next_index);
895 pos_ = top.pos();
896 if (FLAG_trace_regexps) {
897 PrintF("Backtracking to %p[%i] on level %i\n",
898 top.choice(),
899 next_index,
900 backtrack_stack()->length() - 1);
901 }
902 if (next_index == choices->length() - 1) {
903 if (FLAG_trace_regexps)
904 PrintF("Popping backtrack on level %i\n",
Lasse Reichstein 2008/10/27 13:12:58 Move to same line as "if" or wrap in squiggly brac
905 backtrack_stack()->length() - 1);
906 // If this was the last alternative we're done with this backtrack
907 // state and can pop it off the stack.
908 backtrack_stack()->RemoveLast();
909 } else {
910 if (FLAG_trace_regexps)
911 PrintF("Advancing backtrack on level %i\n",
912 backtrack_stack()->length() - 1);
913 // Otherwise we set the next choice to visit if this one fails.
914 top.set_next(next_index + 1);
915 }
916 return true;
917 }
918
919
920 template <typename Char>
921 bool ChoiceNode<Char>::Step(ExecutionState<Char>* state) {
922 state->SaveBacktrack(this);
923 state->set_current(this->choices()->at(0));
924 return true;
925 }
926
927
928 template <typename Char>
929 bool AtomNode<Char>::Step(ExecutionState<Char>* state) {
930 Vector<const uc16> data = this->data();
931 int length = data.length();
932 Vector<Char> input = state->input();
933 int p = state->pos();
934 if (p + length > input.length())
935 return false;
Lasse Reichstein 2008/10/27 13:12:58 Same line as "if" or squiggly braces. There are s
936 for (int i = 0; i < length; i++, p++) {
937 if (data[i] != input[p])
938 return false;
939 }
940 state->Advance(length, this->next());
941 return true;
942 }
943
944
945 template <typename Char>
946 bool CharacterClassNode<Char>::Step(ExecutionState<Char>* state) {
947 if (state->AtEnd()) return false;
948 ZoneList<CharacterRange>* ranges = this->ranges();
949 unsigned current = state->current_char();
950 for (int i = 0; i < ranges->length(); i++) {
951 CharacterRange& range = ranges->at(i);
952 if (range.is_special()) {
953 switch (range.from()) {
954 case '.':
Lasse Reichstein 2008/10/27 13:12:58 Is just the "." in the initial implicit ".*?"? Oth
Christian Plesner Hansen 2008/10/27 18:57:02 This is the general '.'. However, I don't expect
955 state->Advance(1, this->next());
956 return true;
957 default:
958 UNIMPLEMENTED();
959 }
960 } else {
961 if (range.from() <= current && current <= range.to()) {
962 state->Advance(1, this->next());
963 return true;
964 }
965 }
966 }
967 return false;
968 }
969
970
971 template <typename Char>
972 bool EndNode<Char>::Step(ExecutionState<Char>* state) {
973 state->set_done();
974 return false;
975 }
976
977
978 template <typename Char>
979 bool RegExpEngine::Execute(RegExpNode<Char>* start, Vector<Char> input) {
980 ExecutionState<Char> state(start, input);
981 if (FLAG_trace_regexps) {
982 PrintF("Beginning regexp execution\n");
983 }
984 while (true) {
985 if (!state.Step() && (state.is_done() || !state.Backtrack()))
986 break;
Lasse Reichstein 2008/10/27 13:12:58 Is there a reason for splitting the if and the whi
Christian Plesner Hansen 2008/10/27 18:57:02 No, I've fixed it.
987 }
988 if (FLAG_trace_regexps) {
989 PrintF("Matching %s\n", state.is_done() ? "succeeded" : "failed");
990 }
991 return state.is_done();
992 }
993
994
995 template <typename Char>
996 RegExpNode<Char>* RegExpEngine::Compile(RegExpTree* regexp) {
997 RegExpNode<Char>* end = new EndNode<Char>();
998 RegExpCompiler<Char> compiler;
999 return compiler.Compile(regexp, end);
1000 }
1001
1002
1003 template
1004 RegExpNode<const char>* RegExpEngine::Compile<const char>(RegExpTree* regexp);
1005
1006 template
1007 RegExpNode<const uc16>* RegExpEngine::Compile<const uc16>(RegExpTree* regexp);
1008
1009 template
1010 bool RegExpEngine::Execute<const char>(RegExpNode<const char>* start,
1011 Vector<const char> input);
1012
1013 template
1014 bool RegExpEngine::Execute<const uc32>(RegExpNode<const uc32>* start,
1015 Vector<const uc32> input);
Lasse Reichstein 2008/10/27 13:12:58 Should input really be uc32?
Christian Plesner Hansen 2008/10/27 18:57:02 No. It was getting late...
1016
1017
541 }} // namespace v8::internal 1018 }} // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698