Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1618)

Side by Side Diff: src/jsregexp.cc

Issue 8732: Character classes (Closed)
Patch Set: Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 14 matching lines...) Expand all
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 #include <set> 28 #include <set>
29 29
30 #include "v8.h" 30 #include "v8.h"
31 31
32 #include "ast.h" 32 #include "ast.h"
33 #include "execution.h" 33 #include "execution.h"
34 #include "factory.h" 34 #include "factory.h"
35 #include "jsregexp.h" 35 #include "jsregexp-inl.h"
36 #include "third_party/jscre/pcre.h" 36 #include "third_party/jscre/pcre.h"
37 #include "platform.h" 37 #include "platform.h"
38 #include "runtime.h" 38 #include "runtime.h"
39 #include "top.h" 39 #include "top.h"
40 #include "compilation-cache.h" 40 #include "compilation-cache.h"
41 #include "string-stream.h" 41 #include "string-stream.h"
42 42
43 43
44 namespace v8 { namespace internal { 44 namespace v8 { namespace internal {
45 45
(...skipping 767 matching lines...) Expand 10 before | Expand all | Expand 10 after
813 void* rest) { 813 void* rest) {
814 ZoneList<RegExpTree*>* children = that->nodes(); 814 ZoneList<RegExpTree*>* children = that->nodes();
815 RegExpNode<Char>* current = static_cast<RegExpNode<Char>*>(rest); 815 RegExpNode<Char>* current = static_cast<RegExpNode<Char>*>(rest);
816 for (int i = children->length() - 1; i >= 0; i--) { 816 for (int i = children->length() - 1; i >= 0; i--) {
817 current = Compile(children->at(i), current); 817 current = Compile(children->at(i), current);
818 } 818 }
819 return current; 819 return current;
820 } 820 }
821 821
822 822
823 bool CharacterClass::Contains(uc16 c) {
824 switch (type()) {
825 case FIELD: {
826 int offset = segment_start(segment_);
827 if (c < offset) return false;
828 int index = c - offset;
829 if (index >= CharacterClass::kFieldMax) return false;
830 return (data_.u_field & long_bit(index)) != 0;
831 }
832 case RANGES: {
833 int offset = segment_start(segment_);
834 if (c < offset) return false;
835 bool is_on = false;
836 unsigned i = 0;
837 while (i < count_) {
838 int delta = 0;
839 int nibble = read_nibble(i);
840 int bit_offset = 0;
841 while ((nibble & 0x8) == 0) {
842 delta |= nibble << bit_offset;
843 bit_offset += 3;
844 nibble = read_nibble(++i);
845 }
846 delta |= (nibble & 0x7) << bit_offset;
847 i++;
848 offset += delta;
849 if (c < offset)
850 return is_on;
851 is_on = !is_on;
852 }
853 return false;
854 }
855 case UNION: {
856 return data_.u_union.left->Contains(c)
857 || data_.u_union.right->Contains(c);
858 }
859 default:
860 UNREACHABLE();
861 return false;
862 }
863 }
864
865
866 template <int kCount>
867 CharacterClass* StaticCharacterClassAllocator<kCount>::Allocate() {
868 ASSERT(used_ < kCount);
869 CharacterClass* result = &preallocated_[used_];
870 used_++;
871 return result;
872 }
873
874
875 class StaticCharacterClasses {
876 public:
877 StaticCharacterClasses();
878 CharacterClass* GetClass(uc16 tag);
879 static StaticCharacterClasses* instance_;
880 private:
881 StaticCharacterClassAllocator<5> static_allocator_;
882 CharacterClass digit_;
883 CharacterClass whitespace_;
884 CharacterClass word_;
885 };
886
887
888 StaticCharacterClasses* StaticCharacterClasses::instance_ = NULL;
889
890
891 StaticCharacterClasses::StaticCharacterClasses() {
892 #define MAKE_CLASS(Name)\
893 CharacterClass::Ranges(Vector<CharacterClass::Range>(k##Name##Ranges,\
894 k##Name##RangeCount), \
895 &static_allocator_)
896
897 const int kDigitRangeCount = 1;
898 CharacterClass::Range kDigitRanges[kDigitRangeCount] = {
899 CharacterClass::Range('0', '9')
900 };
901 digit_ = MAKE_CLASS(Digit);
902
903 const int kWhiteSpaceRangeCount = 10;
904 CharacterClass::Range kWhiteSpaceRanges[kWhiteSpaceRangeCount] = {
905 CharacterClass::Range(0x0009, 0x0009), CharacterClass::Range(0x000B, 0x000C),
906 CharacterClass::Range(0x0020, 0x0020), CharacterClass::Range(0x00A0, 0x00A0),
907 CharacterClass::Range(0x1680, 0x1680), CharacterClass::Range(0x180E, 0x180E),
908 CharacterClass::Range(0x2000, 0x200A), CharacterClass::Range(0x202F, 0x202F),
909 CharacterClass::Range(0x205F, 0x205F), CharacterClass::Range(0x3000, 0x3000)
910 };
911 whitespace_ = MAKE_CLASS(WhiteSpace);
912
913 const int kWordRangeCount = 4;
914 static CharacterClass::Range kWordRanges[kWordRangeCount] = {
915 CharacterClass::Range('0', '9'), CharacterClass::Range('A', 'Z'),
916 CharacterClass::Range('_', '_'), CharacterClass::Range('a', 'z')
917 };
918 word_ = MAKE_CLASS(Word);
919
920 #undef MAKE_CLASS
921 }
922
923
924 CharacterClass* StaticCharacterClasses::GetClass(uc16 tag) {
925 switch (tag) {
926 case 'd':
927 return &digit_;
928 case 's':
929 return &whitespace_;
930 case 'w':
931 return &word_;
932 default:
933 UNREACHABLE();
934 return NULL;
935 }
936 }
937
938
939 CharacterClass* CharacterClass::GetCharacterClass(uc16 tag) {
940 if (StaticCharacterClasses::instance_ == NULL)
941 StaticCharacterClasses::instance_ = new StaticCharacterClasses();
942 return StaticCharacterClasses::instance_->GetClass(tag);
943 }
944
945
946 CharacterClass CharacterClass::Ranges(Vector<Range> boundaries,
947 CharacterClassAllocator* alloc) {
948 CharacterClass result;
949 result.InitializeRangesFrom(boundaries, alloc);
950 return result;
951 }
952
953
954 void CharacterClass::InitializeFieldFrom(Vector<Range> boundaries) {
955 ASSERT(boundaries.length() > 0);
956 ASSERT_EQ(EMPTY, type_);
957 type_ = FIELD;
958 segment_ = segment_of(boundaries.first().from());
959 ASSERT_EQ(static_cast<int>(segment_),
960 static_cast<int>(segment_of(boundaries.last().to())));
961 data_.u_field = 0;
962 for (int i = 0; i < boundaries.length(); i++) {
963 int start = boundaries[i].from();
964 int end = boundaries[i].to();
965 // Mask in the range
966 uint64_t t0 = ~static_cast<uint64_t>(0) >> (63 - (end & kSegmentMask));
967 uint64_t t1 = ~static_cast<uint64_t>(0) << (start & kSegmentMask);
968 data_.u_field |= (t0 & t1);
969 }
970 }
971
972
973 void CharacterClass::InitializeRangesFrom(Vector<Range> boundaries,
974 CharacterClassAllocator* alloc) {
975 ASSERT(boundaries.length() > 0);
976 ASSERT_EQ(EMPTY, type_);
977 int start_segment = segment_of(boundaries.first().from());
978 int end_segment = segment_of(boundaries.last().to());
979 if (start_segment == end_segment) {
980 InitializeFieldFrom(boundaries);
981 return;
982 }
983 type_ = RANGES;
984 segment_ = start_segment;
985 data_.u_ranges = 0;
986 int offset = 0;
987 uc16 last_boundary = segment_start(segment_);
988 int i;
989 for (i = 0; i < boundaries.length(); i++) {
990 Range current = boundaries[i];
991 ASSERT(last_boundary <= current.from());
992 int word = current.from() - last_boundary;
993 // We have to repeat the write operation to write both the delta
994 // from the last range to the start of this one, and the delta
995 // from the start to the end of this range.
996 for (int j = 0; j < 2; j++) {
997 do {
998 if (offset >= 16)
999 goto overflow;
1000 byte nibble = (word & 0x7);
1001 int next_word = word >> 3;
1002 if (next_word == 0)
1003 nibble |= 0x8;
1004 write_nibble(offset, nibble);
1005 word = next_word;
1006 offset++;
1007 } while (word > 0);
1008 word = current.to() - current.from() + 1;
1009 }
1010 count_ = offset;
1011 last_boundary = current.to() + 1;
1012 continue;
1013 overflow:
1014 CharacterClass* left = alloc->Allocate();
1015 ASSERT(i > 0);
1016 if (segment_of(boundaries[i-1].to()) == segment_) {
1017 // If what we've written so far fits within a single segment we
1018 // can fit it into a bitfield.
1019 left->InitializeFieldFrom(boundaries.SubVector(0, i));
Lasse Reichstein 2008/10/30 10:28:42 Try to read some more first, and see if more range
1020 } else {
1021 // Otherwise we copy the work we've done so far into the left
1022 // heap-allocated charclass.
1023 *left = *this;
Lasse Reichstein 2008/10/30 10:28:42 As discussed, the segments won't necessarily be di
1024 }
1025 CharacterClass* right = alloc->Allocate();
1026 right->InitializeRangesFrom(boundaries.SubVector(i, boundaries.length()),
1027 alloc);
1028 type_ = UNION;
1029 data_.u_union.left = left;
1030 data_.u_union.right = right;
1031 return;
1032 }
1033 }
1034
1035
823 // ------------------------------------------------------------------- 1036 // -------------------------------------------------------------------
824 // Execution 1037 // Execution
825 1038
826 1039
827 template <typename Char> 1040 template <typename Char>
828 class ExecutionState { 1041 class ExecutionState {
829 public: 1042 public:
830 ExecutionState(RegExpNode<Char>* start, Vector<Char> input) 1043 ExecutionState(RegExpNode<Char>* start, Vector<Char> input)
831 : current_(start), 1044 : current_(start),
832 input_(input), 1045 input_(input),
(...skipping 188 matching lines...) Expand 10 before | Expand all | Expand 10 after
1021 template 1234 template
1022 bool RegExpEngine::Execute<const char>(RegExpNode<const char>* start, 1235 bool RegExpEngine::Execute<const char>(RegExpNode<const char>* start,
1023 Vector<const char> input); 1236 Vector<const char> input);
1024 1237
1025 template 1238 template
1026 bool RegExpEngine::Execute<const uc16>(RegExpNode<const uc16>* start, 1239 bool RegExpEngine::Execute<const uc16>(RegExpNode<const uc16>* start,
1027 Vector<const uc16> input); 1240 Vector<const uc16> input);
1028 1241
1029 1242
1030 }} // namespace v8::internal 1243 }} // namespace v8::internal
OLDNEW
« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | src/jsregexp-inl.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698