Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: lookaround builder Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include "src/ast/ast.h" 7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
65 Handle<String> error_text) { 65 Handle<String> error_text) {
66 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text)); 66 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
67 } 67 }
68 68
69 69
70 ContainedInLattice AddRange(ContainedInLattice containment, 70 ContainedInLattice AddRange(ContainedInLattice containment,
71 const int* ranges, 71 const int* ranges,
72 int ranges_length, 72 int ranges_length,
73 Interval new_range) { 73 Interval new_range) {
74 DCHECK((ranges_length & 1) == 1); 74 DCHECK((ranges_length & 1) == 1);
75 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); 75 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
76 if (containment == kLatticeUnknown) return containment; 76 if (containment == kLatticeUnknown) return containment;
77 bool inside = false; 77 bool inside = false;
78 int last = 0; 78 int last = 0;
79 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 79 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
80 // Consider the range from last to ranges[i]. 80 // Consider the range from last to ranges[i].
81 // We haven't got to the new range yet. 81 // We haven't got to the new range yet.
82 if (ranges[i] <= new_range.from()) continue; 82 if (ranges[i] <= new_range.from()) continue;
83 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 83 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
84 // inclusive, but the values in ranges are not. 84 // inclusive, but the values in ranges are not.
85 if (last <= new_range.from() && new_range.to() < ranges[i]) { 85 if (last <= new_range.from() && new_range.to() < ranges[i]) {
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
138 138
139 Handle<Object> result; 139 Handle<Object> result;
140 if (in_cache) { 140 if (in_cache) {
141 re->set_data(*cached); 141 re->set_data(*cached);
142 return re; 142 return re;
143 } 143 }
144 pattern = String::Flatten(pattern); 144 pattern = String::Flatten(pattern);
145 PostponeInterruptsScope postpone(isolate); 145 PostponeInterruptsScope postpone(isolate);
146 RegExpCompileData parse_result; 146 RegExpCompileData parse_result;
147 FlatStringReader reader(isolate, pattern); 147 FlatStringReader reader(isolate, pattern);
148 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, 148 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
149 flags & JSRegExp::kMultiline, 149 &parse_result)) {
150 flags & JSRegExp::kUnicode, &parse_result)) {
151 // Throw an exception if we fail to parse the pattern. 150 // Throw an exception if we fail to parse the pattern.
152 return ThrowRegExpException(re, pattern, parse_result.error); 151 return ThrowRegExpException(re, pattern, parse_result.error);
153 } 152 }
154 153
155 bool has_been_compiled = false; 154 bool has_been_compiled = false;
156 155
157 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) && 156 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
158 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) { 157 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
159 // Parse-tree is a single atom that is equal to the pattern. 158 // Parse-tree is a single atom that is equal to the pattern.
160 AtomCompile(re, pattern, flags, pattern); 159 AtomCompile(re, pattern, flags, pattern);
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after
364 ThrowRegExpException(re, error_message); 363 ThrowRegExpException(re, error_message);
365 return false; 364 return false;
366 } 365 }
367 366
368 JSRegExp::Flags flags = re->GetFlags(); 367 JSRegExp::Flags flags = re->GetFlags();
369 368
370 Handle<String> pattern(re->Pattern()); 369 Handle<String> pattern(re->Pattern());
371 pattern = String::Flatten(pattern); 370 pattern = String::Flatten(pattern);
372 RegExpCompileData compile_data; 371 RegExpCompileData compile_data;
373 FlatStringReader reader(isolate, pattern); 372 FlatStringReader reader(isolate, pattern);
374 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, 373 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
375 flags & JSRegExp::kMultiline, 374 &compile_data)) {
376 flags & JSRegExp::kUnicode, &compile_data)) {
377 // Throw an exception if we fail to parse the pattern. 375 // Throw an exception if we fail to parse the pattern.
378 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 376 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
379 USE(ThrowRegExpException(re, pattern, compile_data.error)); 377 USE(ThrowRegExpException(re, pattern, compile_data.error));
380 return false; 378 return false;
381 } 379 }
382 RegExpEngine::CompilationResult result = RegExpEngine::Compile( 380 RegExpEngine::CompilationResult result =
383 isolate, &zone, &compile_data, flags & JSRegExp::kIgnoreCase, 381 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
384 flags & JSRegExp::kGlobal, flags & JSRegExp::kMultiline, 382 sample_subject, is_one_byte);
385 flags & JSRegExp::kSticky, pattern, sample_subject, is_one_byte);
386 if (result.error_message != NULL) { 383 if (result.error_message != NULL) {
387 // Unable to compile regexp. 384 // Unable to compile regexp.
388 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 385 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
389 CStrVector(result.error_message)).ToHandleChecked(); 386 CStrVector(result.error_message)).ToHandleChecked();
390 ThrowRegExpException(re, error_message); 387 ThrowRegExpException(re, error_message);
391 return false; 388 return false;
392 } 389 }
393 390
394 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 391 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
395 data->set(JSRegExp::code_index(is_one_byte), result.code); 392 data->set(JSRegExp::code_index(is_one_byte), result.code);
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after
938 935
939 private: 936 private:
940 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 937 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
941 int total_samples_; 938 int total_samples_;
942 }; 939 };
943 940
944 941
945 class RegExpCompiler { 942 class RegExpCompiler {
946 public: 943 public:
947 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 944 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
948 bool ignore_case, bool is_one_byte); 945 JSRegExp::Flags flags, bool is_one_byte);
949 946
950 int AllocateRegister() { 947 int AllocateRegister() {
951 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 948 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
952 reg_exp_too_big_ = true; 949 reg_exp_too_big_ = true;
953 return next_register_; 950 return next_register_;
954 } 951 }
955 return next_register_++; 952 return next_register_++;
956 } 953 }
957 954
955 // Lookarounds to match lone surrogates for unicode character class matches
956 // are never nested. We can therefore reuse registers.
957 int UnicodeLookaroundStackRegister() {
958 if (unicode_lookaround_stack_register_ == kNoRegister) {
959 unicode_lookaround_stack_register_ = AllocateRegister();
960 }
961 return unicode_lookaround_stack_register_;
962 }
963
964 int UnicodeLookaroundPositionRegister() {
965 if (unicode_lookaround_position_register_ == kNoRegister) {
966 unicode_lookaround_position_register_ = AllocateRegister();
967 }
968 return unicode_lookaround_position_register_;
969 }
970
958 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 971 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
959 RegExpNode* start, 972 RegExpNode* start,
960 int capture_count, 973 int capture_count,
961 Handle<String> pattern); 974 Handle<String> pattern);
962 975
963 inline void AddWork(RegExpNode* node) { 976 inline void AddWork(RegExpNode* node) {
964 if (!node->on_work_list() && !node->label()->is_bound()) { 977 if (!node->on_work_list() && !node->label()->is_bound()) {
965 node->set_on_work_list(true); 978 node->set_on_work_list(true);
966 work_list_->Add(node); 979 work_list_->Add(node);
967 } 980 }
968 } 981 }
969 982
970 static const int kImplementationOffset = 0; 983 static const int kImplementationOffset = 0;
971 static const int kNumberOfRegistersOffset = 0; 984 static const int kNumberOfRegistersOffset = 0;
972 static const int kCodeOffset = 1; 985 static const int kCodeOffset = 1;
973 986
974 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 987 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
975 EndNode* accept() { return accept_; } 988 EndNode* accept() { return accept_; }
976 989
977 static const int kMaxRecursion = 100; 990 static const int kMaxRecursion = 100;
978 inline int recursion_depth() { return recursion_depth_; } 991 inline int recursion_depth() { return recursion_depth_; }
979 inline void IncrementRecursionDepth() { recursion_depth_++; } 992 inline void IncrementRecursionDepth() { recursion_depth_++; }
980 inline void DecrementRecursionDepth() { recursion_depth_--; } 993 inline void DecrementRecursionDepth() { recursion_depth_--; }
981 994
982 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 995 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
983 996
984 inline bool ignore_case() { return ignore_case_; } 997 inline bool ignore_case() { return flags_ & JSRegExp::kIgnoreCase; }
erikcorry 2016/01/20 10:47:05 Do we allow implicit int->bool conversions?
Yang 2016/01/20 13:06:20 We do this else where, for example in src/compiler
998 inline bool unicode() { return flags_ & JSRegExp::kUnicode; }
985 inline bool one_byte() { return one_byte_; } 999 inline bool one_byte() { return one_byte_; }
986 inline bool optimize() { return optimize_; } 1000 inline bool optimize() { return optimize_; }
987 inline void set_optimize(bool value) { optimize_ = value; } 1001 inline void set_optimize(bool value) { optimize_ = value; }
988 inline bool limiting_recursion() { return limiting_recursion_; } 1002 inline bool limiting_recursion() { return limiting_recursion_; }
989 inline void set_limiting_recursion(bool value) { 1003 inline void set_limiting_recursion(bool value) {
990 limiting_recursion_ = value; 1004 limiting_recursion_ = value;
991 } 1005 }
992 bool read_backward() { return read_backward_; } 1006 bool read_backward() { return read_backward_; }
993 void set_read_backward(bool value) { read_backward_ = value; } 1007 void set_read_backward(bool value) { read_backward_ = value; }
994 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1008 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
995 1009
996 int current_expansion_factor() { return current_expansion_factor_; } 1010 int current_expansion_factor() { return current_expansion_factor_; }
997 void set_current_expansion_factor(int value) { 1011 void set_current_expansion_factor(int value) {
998 current_expansion_factor_ = value; 1012 current_expansion_factor_ = value;
999 } 1013 }
1000 1014
1001 Isolate* isolate() const { return isolate_; } 1015 Isolate* isolate() const { return isolate_; }
1002 Zone* zone() const { return zone_; } 1016 Zone* zone() const { return zone_; }
1003 1017
1004 static const int kNoRegister = -1; 1018 static const int kNoRegister = -1;
1005 1019
1006 private: 1020 private:
1007 EndNode* accept_; 1021 EndNode* accept_;
1008 int next_register_; 1022 int next_register_;
1023 int unicode_lookaround_stack_register_;
1024 int unicode_lookaround_position_register_;
1009 List<RegExpNode*>* work_list_; 1025 List<RegExpNode*>* work_list_;
1010 int recursion_depth_; 1026 int recursion_depth_;
1011 RegExpMacroAssembler* macro_assembler_; 1027 RegExpMacroAssembler* macro_assembler_;
1012 bool ignore_case_; 1028 JSRegExp::Flags flags_;
1013 bool one_byte_; 1029 bool one_byte_;
1014 bool reg_exp_too_big_; 1030 bool reg_exp_too_big_;
1015 bool limiting_recursion_; 1031 bool limiting_recursion_;
1016 bool optimize_; 1032 bool optimize_;
1017 bool read_backward_; 1033 bool read_backward_;
1018 int current_expansion_factor_; 1034 int current_expansion_factor_;
1019 FrequencyCollator frequency_collator_; 1035 FrequencyCollator frequency_collator_;
1020 Isolate* isolate_; 1036 Isolate* isolate_;
1021 Zone* zone_; 1037 Zone* zone_;
1022 }; 1038 };
(...skipping 11 matching lines...) Expand all
1034 1050
1035 1051
1036 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1052 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1037 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1053 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1038 } 1054 }
1039 1055
1040 1056
1041 // Attempts to compile the regexp using an Irregexp code generator. Returns 1057 // Attempts to compile the regexp using an Irregexp code generator. Returns
1042 // a fixed array or a null handle depending on whether it succeeded. 1058 // a fixed array or a null handle depending on whether it succeeded.
1043 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 1059 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1044 bool ignore_case, bool one_byte) 1060 JSRegExp::Flags flags, bool one_byte)
1045 : next_register_(2 * (capture_count + 1)), 1061 : next_register_(2 * (capture_count + 1)),
1062 unicode_lookaround_stack_register_(kNoRegister),
1063 unicode_lookaround_position_register_(kNoRegister),
1046 work_list_(NULL), 1064 work_list_(NULL),
1047 recursion_depth_(0), 1065 recursion_depth_(0),
1048 ignore_case_(ignore_case), 1066 flags_(flags),
1049 one_byte_(one_byte), 1067 one_byte_(one_byte),
1050 reg_exp_too_big_(false), 1068 reg_exp_too_big_(false),
1051 limiting_recursion_(false), 1069 limiting_recursion_(false),
1052 optimize_(FLAG_regexp_optimization), 1070 optimize_(FLAG_regexp_optimization),
1053 read_backward_(false), 1071 read_backward_(false),
1054 current_expansion_factor_(1), 1072 current_expansion_factor_(1),
1055 frequency_collator_(), 1073 frequency_collator_(),
1056 isolate_(isolate), 1074 isolate_(isolate),
1057 zone_(zone) { 1075 zone_(zone) {
1058 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1076 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
(...skipping 629 matching lines...) Expand 10 before | Expand all | Expand 10 after
1688 typedef bool EmitCharacterFunction(Isolate* isolate, 1706 typedef bool EmitCharacterFunction(Isolate* isolate,
1689 RegExpCompiler* compiler, 1707 RegExpCompiler* compiler,
1690 uc16 c, 1708 uc16 c,
1691 Label* on_failure, 1709 Label* on_failure,
1692 int cp_offset, 1710 int cp_offset,
1693 bool check, 1711 bool check,
1694 bool preloaded); 1712 bool preloaded);
1695 1713
1696 // Only emits letters (things that have case). Only used for case independent 1714 // Only emits letters (things that have case). Only used for case independent
1697 // matches. 1715 // matches.
1698 static inline bool EmitAtomLetter(Isolate* isolate, 1716 static inline bool EmitAtomLetter(Isolate* isolate,
erikcorry 2016/01/21 10:58:56 This is not /u-ified. Does it get called in /ui m
Yang 2016/01/21 11:49:04 No. This only gets called if the TextNode element
1699 RegExpCompiler* compiler, 1717 RegExpCompiler* compiler,
1700 uc16 c, 1718 uc16 c,
1701 Label* on_failure, 1719 Label* on_failure,
1702 int cp_offset, 1720 int cp_offset,
1703 bool check, 1721 bool check,
1704 bool preloaded) { 1722 bool preloaded) {
1705 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1723 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1706 bool one_byte = compiler->one_byte(); 1724 bool one_byte = compiler->one_byte();
1707 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1725 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1708 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1726 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
(...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after
2091 flip ? even_label : odd_label); 2109 flip ? even_label : odd_label);
2092 } 2110 }
2093 } 2111 }
2094 2112
2095 2113
2096 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2114 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2097 RegExpCharacterClass* cc, bool one_byte, 2115 RegExpCharacterClass* cc, bool one_byte,
2098 Label* on_failure, int cp_offset, bool check_offset, 2116 Label* on_failure, int cp_offset, bool check_offset,
2099 bool preloaded, Zone* zone) { 2117 bool preloaded, Zone* zone) {
2100 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2118 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2101 if (!CharacterRange::IsCanonical(ranges)) { 2119 CharacterRange::Canonicalize(ranges);
2102 CharacterRange::Canonicalize(ranges);
2103 }
2104 2120
2105 int max_char; 2121 int max_char;
2106 if (one_byte) { 2122 if (one_byte) {
2107 max_char = String::kMaxOneByteCharCode; 2123 max_char = String::kMaxOneByteCharCode;
2108 } else { 2124 } else {
2109 max_char = String::kMaxUtf16CodeUnit; 2125 max_char = String::kMaxUtf16CodeUnit;
2110 } 2126 }
2111 2127
2112 int range_count = ranges->length(); 2128 int range_count = ranges->length();
2113 2129
(...skipping 21 matching lines...) Expand all
2135 if (cc->is_negated()) { 2151 if (cc->is_negated()) {
2136 macro_assembler->GoTo(on_failure); 2152 macro_assembler->GoTo(on_failure);
2137 } else { 2153 } else {
2138 // This is a common case hit by non-anchored expressions. 2154 // This is a common case hit by non-anchored expressions.
2139 if (check_offset) { 2155 if (check_offset) {
2140 macro_assembler->CheckPosition(cp_offset, on_failure); 2156 macro_assembler->CheckPosition(cp_offset, on_failure);
2141 } 2157 }
2142 } 2158 }
2143 return; 2159 return;
2144 } 2160 }
2145 if (last_valid_range == 0 &&
2146 !cc->is_negated() &&
2147 ranges->at(0).IsEverything(max_char)) {
2148 // This is a common case hit by non-anchored expressions.
2149 if (check_offset) {
2150 macro_assembler->CheckPosition(cp_offset, on_failure);
2151 }
2152 return;
2153 }
2154 2161
2155 if (!preloaded) { 2162 if (!preloaded) {
2156 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2163 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2157 } 2164 }
2158 2165
2159 if (cc->is_standard(zone) && 2166 if (cc->is_standard(zone) &&
2160 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2167 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2161 on_failure)) { 2168 on_failure)) {
2162 return; 2169 return;
2163 } 2170 }
2164 2171
2165 2172
2166 // A new list with ascending entries. Each entry is a code unit 2173 // A new list with ascending entries. Each entry is a code unit
2167 // where there is a boundary between code units that are part of 2174 // where there is a boundary between code units that are part of
2168 // the class and code units that are not. Normally we insert an 2175 // the class and code units that are not. Normally we insert an
2169 // entry at zero which goes to the failure label, but if there 2176 // entry at zero which goes to the failure label, but if there
2170 // was already one there we fall through for success on that entry. 2177 // was already one there we fall through for success on that entry.
2171 // Subsequent entries have alternating meaning (success/failure). 2178 // Subsequent entries have alternating meaning (success/failure).
(...skipping 619 matching lines...) Expand 10 before | Expand all | Expand 10 after
2791 // Character is outside Latin-1 completely 2798 // Character is outside Latin-1 completely
2792 if (converted == 0) return set_replacement(NULL); 2799 if (converted == 0) return set_replacement(NULL);
2793 // Convert quark to Latin-1 in place. 2800 // Convert quark to Latin-1 in place.
2794 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2801 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2795 copy[j] = converted; 2802 copy[j] = converted;
2796 } 2803 }
2797 } else { 2804 } else {
2798 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2805 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2799 RegExpCharacterClass* cc = elm.char_class(); 2806 RegExpCharacterClass* cc = elm.char_class();
2800 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2807 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2801 if (!CharacterRange::IsCanonical(ranges)) { 2808 CharacterRange::Canonicalize(ranges);
2802 CharacterRange::Canonicalize(ranges);
2803 }
2804 // Now they are in order so we only need to look at the first. 2809 // Now they are in order so we only need to look at the first.
2805 int range_count = ranges->length(); 2810 int range_count = ranges->length();
2806 if (cc->is_negated()) { 2811 if (cc->is_negated()) {
2807 if (range_count != 0 && 2812 if (range_count != 0 &&
2808 ranges->at(0).from() == 0 && 2813 ranges->at(0).from() == 0 &&
2809 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2814 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2810 // This will be handled in a later filter. 2815 // This will be handled in a later filter.
2811 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2816 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2812 return set_replacement(NULL); 2817 return set_replacement(NULL);
2813 } 2818 }
(...skipping 468 matching lines...) Expand 10 before | Expand all | Expand 10 after
3282 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3287 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3283 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3288 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3284 if (ignore_case) { 3289 if (ignore_case) {
3285 return pass == SIMPLE_CHARACTER_MATCH; 3290 return pass == SIMPLE_CHARACTER_MATCH;
3286 } else { 3291 } else {
3287 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3292 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3288 } 3293 }
3289 } 3294 }
3290 3295
3291 3296
3297 TextNode* TextNode::CharacterRanges(Zone* zone,
3298 ZoneList<CharacterRange>* ranges,
3299 bool read_backward,
3300 RegExpNode* on_success) {
3301 DCHECK_NOT_NULL(ranges);
3302 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3303 elms->Add(
3304 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3305 zone);
3306 return new (zone) TextNode(elms, read_backward, on_success);
3307 }
3308
3309
3310 TextNode* TextNode::SurrogatePair(Zone* zone, CharacterRange lead,
3311 CharacterRange trail, bool read_backward,
3312 RegExpNode* on_success) {
3313 ZoneList<CharacterRange>* lead_ranges =
3314 new (zone) ZoneList<CharacterRange>(1, zone);
3315 lead_ranges->Add(lead, zone);
3316 ZoneList<CharacterRange>* trail_ranges =
3317 new (zone) ZoneList<CharacterRange>(1, zone);
3318 trail_ranges->Add(trail, zone);
3319 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3320 elms->Add(TextElement::CharClass(
3321 new (zone) RegExpCharacterClass(lead_ranges, false)),
3322 zone);
3323 elms->Add(TextElement::CharClass(
3324 new (zone) RegExpCharacterClass(trail_ranges, false)),
3325 zone);
3326 return new (zone) TextNode(elms, read_backward, on_success);
3327 }
3328
3329
3292 // This generates the code to match a text node. A text node can contain 3330 // This generates the code to match a text node. A text node can contain
3293 // straight character sequences (possibly to be matched in a case-independent 3331 // straight character sequences (possibly to be matched in a case-independent
3294 // way) and character classes. For efficiency we do not do this in a single 3332 // way) and character classes. For efficiency we do not do this in a single
3295 // pass from left to right. Instead we pass over the text node several times, 3333 // pass from left to right. Instead we pass over the text node several times,
3296 // emitting code for some character positions every time. See the comment on 3334 // emitting code for some character positions every time. See the comment on
3297 // TextEmitPass for details. 3335 // TextEmitPass for details.
3298 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3336 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3299 LimitResult limit_result = LimitVersions(compiler, trace); 3337 LimitResult limit_result = LimitVersions(compiler, trace);
3300 if (limit_result == DONE) return; 3338 if (limit_result == DONE) return;
3301 DCHECK(limit_result == CONTINUE); 3339 DCHECK(limit_result == CONTINUE);
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after
3398 3436
3399 3437
3400 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3438 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3401 RegExpCompiler* compiler) { 3439 RegExpCompiler* compiler) {
3402 if (read_backward()) return NULL; 3440 if (read_backward()) return NULL;
3403 if (elements()->length() != 1) return NULL; 3441 if (elements()->length() != 1) return NULL;
3404 TextElement elm = elements()->at(0); 3442 TextElement elm = elements()->at(0);
3405 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3443 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3406 RegExpCharacterClass* node = elm.char_class(); 3444 RegExpCharacterClass* node = elm.char_class();
3407 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3445 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3408 if (!CharacterRange::IsCanonical(ranges)) { 3446 CharacterRange::Canonicalize(ranges);
3409 CharacterRange::Canonicalize(ranges);
3410 }
3411 if (node->is_negated()) { 3447 if (node->is_negated()) {
3412 return ranges->length() == 0 ? on_success() : NULL; 3448 return ranges->length() == 0 ? on_success() : NULL;
3413 } 3449 }
3414 if (ranges->length() != 1) return NULL; 3450 if (ranges->length() != 1) return NULL;
3415 uint32_t max_char; 3451 uint32_t max_char;
3416 if (compiler->one_byte()) { 3452 if (compiler->one_byte()) {
3417 max_char = String::kMaxOneByteCharCode; 3453 max_char = String::kMaxOneByteCharCode;
3418 } else { 3454 } else {
3419 max_char = String::kMaxUtf16CodeUnit; 3455 max_char = String::kMaxUtf16CodeUnit;
3420 } 3456 }
(...skipping 126 matching lines...) Expand 10 before | Expand all | Expand 10 after
3547 return alt_gens_[i]; 3583 return alt_gens_[i];
3548 } 3584 }
3549 3585
3550 private: 3586 private:
3551 static const int kAFew = 10; 3587 static const int kAFew = 10;
3552 ZoneList<AlternativeGeneration*> alt_gens_; 3588 ZoneList<AlternativeGeneration*> alt_gens_;
3553 AlternativeGeneration a_few_alt_gens_[kAFew]; 3589 AlternativeGeneration a_few_alt_gens_[kAFew];
3554 }; 3590 };
3555 3591
3556 3592
3593 static const uc32 kLeadSurrogateStart = 0xd800;
3594 static const uc32 kLeadSurrogateEnd = 0xdbff;
3595 static const uc32 kTrailSurrogateStart = 0xdc00;
3596 static const uc32 kTrailSurrogateEnd = 0xdfff;
3597 static const uc32 kNonBmpStart = 0x10000;
3598 static const uc32 kNonBmpEnd = 0x10ffff;
3599 static const uc32 kRangeEndMarker = 0x110000;
3600
3557 // The '2' variant is has inclusive from and exclusive to. 3601 // The '2' variant is has inclusive from and exclusive to.
3558 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3602 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3559 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3603 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3560 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 3604 static const int kSpaceRanges[] = {
3561 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 3605 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
3562 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 3606 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3563 0xFEFF, 0xFF00, 0x10000 }; 3607 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3564 static const int kSpaceRangeCount = arraysize(kSpaceRanges); 3608 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3565 3609
3566 static const int kWordRanges[] = { 3610 static const int kWordRanges[] = {
3567 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; 3611 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3568 static const int kWordRangeCount = arraysize(kWordRanges); 3612 static const int kWordRangeCount = arraysize(kWordRanges);
3569 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; 3613 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3570 static const int kDigitRangeCount = arraysize(kDigitRanges); 3614 static const int kDigitRangeCount = arraysize(kDigitRanges);
3571 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; 3615 static const int kSurrogateRanges[] = {
3616 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3572 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges); 3617 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3573 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, 3618 static const int kLineTerminatorRanges[] = {
3574 0x2028, 0x202A, 0x10000 }; 3619 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3575 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 3620 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3576 3621
3577
3578 void BoyerMoorePositionInfo::Set(int character) { 3622 void BoyerMoorePositionInfo::Set(int character) {
3579 SetInterval(Interval(character, character)); 3623 SetInterval(Interval(character, character));
3580 } 3624 }
3581 3625
3582 3626
3583 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3627 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3584 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3628 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3585 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3629 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3586 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3630 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3587 surrogate_ = 3631 surrogate_ =
(...skipping 1137 matching lines...) Expand 10 before | Expand all | Expand 10 after
4725 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4769 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4726 RegExpNode* on_success) { 4770 RegExpNode* on_success) {
4727 return new (compiler->zone()) 4771 return new (compiler->zone())
4728 TextNode(elements(), compiler->read_backward(), on_success); 4772 TextNode(elements(), compiler->read_backward(), on_success);
4729 } 4773 }
4730 4774
4731 4775
4732 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4776 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4733 const int* special_class, 4777 const int* special_class,
4734 int length) { 4778 int length) {
4735 length--; // Remove final 0x10000. 4779 length--; // Remove final marker.
4736 DCHECK(special_class[length] == 0x10000); 4780 DCHECK(special_class[length] == kRangeEndMarker);
4737 DCHECK(ranges->length() != 0); 4781 DCHECK(ranges->length() != 0);
4738 DCHECK(length != 0); 4782 DCHECK(length != 0);
4739 DCHECK(special_class[0] != 0); 4783 DCHECK(special_class[0] != 0);
4740 if (ranges->length() != (length >> 1) + 1) { 4784 if (ranges->length() != (length >> 1) + 1) {
4741 return false; 4785 return false;
4742 } 4786 }
4743 CharacterRange range = ranges->at(0); 4787 CharacterRange range = ranges->at(0);
4744 if (range.from() != 0) { 4788 if (range.from() != 0) {
4745 return false; 4789 return false;
4746 } 4790 }
4747 for (int i = 0; i < length; i += 2) { 4791 for (int i = 0; i < length; i += 2) {
4748 if (special_class[i] != (range.to() + 1)) { 4792 if (special_class[i] != (range.to() + 1)) {
4749 return false; 4793 return false;
4750 } 4794 }
4751 range = ranges->at((i >> 1) + 1); 4795 range = ranges->at((i >> 1) + 1);
4752 if (special_class[i+1] != range.from()) { 4796 if (special_class[i+1] != range.from()) {
4753 return false; 4797 return false;
4754 } 4798 }
4755 } 4799 }
4756 if (range.to() != 0xffff) { 4800 if (range.to() != 0xffff) {
4757 return false; 4801 return false;
4758 } 4802 }
4759 return true; 4803 return true;
4760 } 4804 }
4761 4805
4762 4806
4763 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4807 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4764 const int* special_class, 4808 const int* special_class,
4765 int length) { 4809 int length) {
4766 length--; // Remove final 0x10000. 4810 length--; // Remove final marker.
4767 DCHECK(special_class[length] == 0x10000); 4811 DCHECK(special_class[length] == kRangeEndMarker);
4768 if (ranges->length() * 2 != length) { 4812 if (ranges->length() * 2 != length) {
4769 return false; 4813 return false;
4770 } 4814 }
4771 for (int i = 0; i < length; i += 2) { 4815 for (int i = 0; i < length; i += 2) {
4772 CharacterRange range = ranges->at(i >> 1); 4816 CharacterRange range = ranges->at(i >> 1);
4773 if (range.from() != special_class[i] || 4817 if (range.from() != special_class[i] ||
4774 range.to() != special_class[i + 1] - 1) { 4818 range.to() != special_class[i + 1] - 1) {
4775 return false; 4819 return false;
4776 } 4820 }
4777 } 4821 }
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
4813 return true; 4857 return true;
4814 } 4858 }
4815 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4859 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4816 set_.set_standard_set_type('W'); 4860 set_.set_standard_set_type('W');
4817 return true; 4861 return true;
4818 } 4862 }
4819 return false; 4863 return false;
4820 } 4864 }
4821 4865
4822 4866
4867 bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) {
4868 ZoneList<CharacterRange>* ranges = this->ranges(zone);
4869 CharacterRange::Canonicalize(ranges);
4870 for (int i = ranges->length() - 1; i >= 0; i--) {
4871 uc32 from = ranges->at(i).from();
4872 uc32 to = ranges->at(i).to();
4873 // Check for non-BMP characters.
4874 if (to >= kNonBmpStart) return true;
4875 // Check for lone surrogates.
4876 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
4877 }
4878 return false;
4879 }
4880
4881
4882 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4883 ZoneList<CharacterRange>* base)
4884 : zone_(zone),
4885 table_(zone),
4886 bmp_(nullptr),
4887 lead_surrogates_(nullptr),
4888 trail_surrogates_(nullptr),
4889 non_bmp_(nullptr) {
erikcorry 2016/01/20 10:47:05 What is going on here? Some explanation needed, I
Yang 2016/01/20 13:06:20 Added a wall of comment.
4890 for (int i = 0; i < base->length(); i++) {
4891 table_.AddRange(base->at(i), kBase, zone_);
4892 }
4893 table_.AddRange(CharacterRange(0, kLeadSurrogateStart - 1), kBmpCodePoints,
4894 zone_);
4895 table_.AddRange(CharacterRange(kLeadSurrogateStart, kLeadSurrogateEnd),
4896 kLeadSurrogates, zone_);
4897 table_.AddRange(CharacterRange(kTrailSurrogateStart, kTrailSurrogateEnd),
4898 kTrailSurrogates, zone_);
4899 table_.AddRange(CharacterRange(kTrailSurrogateEnd, kNonBmpStart - 1),
4900 kBmpCodePoints, zone_);
4901 table_.AddRange(CharacterRange(kNonBmpStart, kNonBmpEnd), kNonBmpCodePoints,
4902 zone_);
4903 table_.ForEach(this);
4904 }
4905
4906
4907 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4908 OutSet* outset = entry.out_set();
4909 if (!outset->Get(kBase)) return;
4910 ZoneList<CharacterRange>** target = NULL;
4911 if (outset->Get(kBmpCodePoints)) {
4912 target = &bmp_;
4913 } else if (outset->Get(kLeadSurrogates)) {
4914 target = &lead_surrogates_;
4915 } else if (outset->Get(kTrailSurrogates)) {
4916 target = &trail_surrogates_;
4917 } else {
4918 DCHECK(outset->Get(kNonBmpCodePoints));
4919 target = &non_bmp_;
4920 }
4921 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4922 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4923 }
4924
4925
4926 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4927 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4928 ZoneList<CharacterRange>* bmp = splitter->bmp();
4929 if (bmp == nullptr) return;
4930 result->AddAlternative(GuardedAlternative(TextNode::CharacterRanges(
4931 compiler->zone(), bmp, compiler->read_backward(), on_success)));
4932 }
4933
4934
4935 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4936 RegExpNode* on_success,
4937 UnicodeRangeSplitter* splitter) {
4938 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4939 if (non_bmp == nullptr) return;
4940 DCHECK(compiler->unicode());
4941 DCHECK(!compiler->one_byte());
4942 Zone* zone = compiler->zone();
4943 CharacterRange::Canonicalize(non_bmp);
4944 for (int i = 0; i < non_bmp->length(); i++) {
4945 // Match surrogate pair.
4946 // E.g. [\u10005-\u11005] becomes
4947 // \ud800[\udc05-\udfff]|
4948 // [\ud801-\ud803][\udc00-\udfff]|
4949 // \ud804[\udc00-\udc05]
4950 uc32 from = non_bmp->at(i).from();
4951 uc32 to = non_bmp->at(i).to();
4952 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4953 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4954 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4955 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4956 if (from_l == to_l) {
4957 // The lead surrogate is the same.
4958 result->AddAlternative(GuardedAlternative(
4959 TextNode::SurrogatePair(zone, CharacterRange::Singleton(from_l),
4960 CharacterRange::Range(from_t, to_t),
4961 compiler->read_backward(), on_success)));
4962 } else if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
4963 // Can be represented as [<from_l>-<to_l>][\udc00-\udfff].
4964 // This would simplify /./u a bit.
4965 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(
4966 zone, CharacterRange::Range(from_l, to_l),
4967 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4968 compiler->read_backward(), on_success)));
4969 } else {
4970 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(
erikcorry 2016/01/20 10:47:05 This one we only need if from_t is not kTrailSurro
Yang 2016/01/20 13:06:20 Done.
4971 zone, CharacterRange::Singleton(from_l),
4972 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4973 compiler->read_backward(), on_success)));
4974 if (from_l + 1 < to_l) {
4975 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(
4976 zone, CharacterRange::Range(from_l + 1, to_l - 1),
4977 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4978 compiler->read_backward(), on_success)));
4979 }
4980 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(
4981 zone, CharacterRange::Singleton(to_l),
4982 CharacterRange::Range(kTrailSurrogateStart, to_t),
4983 compiler->read_backward(), on_success)));
4984 }
4985 }
4986 }
4987
4988
4989 RegExpNode* NegativeLookbehindAndMatch(RegExpCompiler* compiler,
4990 ZoneList<CharacterRange>* lookbehind,
4991 ZoneList<CharacterRange>* match,
4992 RegExpNode* on_success,
4993 bool read_backward) {
4994 Zone* zone = compiler->zone();
4995 RegExpNode* match_node =
4996 TextNode::CharacterRanges(zone, match, read_backward, on_success);
4997 int stack_register = compiler->UnicodeLookaroundStackRegister();
4998 int position_register = compiler->UnicodeLookaroundPositionRegister();
4999 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5000 position_register);
5001 RegExpNode* negative_match = TextNode::CharacterRanges(
5002 zone, lookbehind, !read_backward, lookaround.on_match_success());
5003 return lookaround.ForMatch(negative_match);
5004 }
5005
5006
5007 RegExpNode* MatchAndNegativeLookahead(RegExpCompiler* compiler,
5008 ZoneList<CharacterRange>* match,
5009 ZoneList<CharacterRange>* lookahead,
5010 RegExpNode* on_success,
5011 bool read_backward) {
5012 Zone* zone = compiler->zone();
5013 int stack_register = compiler->UnicodeLookaroundStackRegister();
5014 int position_register = compiler->UnicodeLookaroundPositionRegister();
5015 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5016 position_register);
5017 RegExpNode* negative_match = TextNode::CharacterRanges(
5018 zone, lookahead, read_backward, lookaround.on_match_success());
5019 return TextNode::CharacterRanges(zone, match, read_backward,
5020 lookaround.ForMatch(negative_match));
5021 }
5022
5023
5024 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5025 RegExpNode* on_success,
5026 UnicodeRangeSplitter* splitter) {
5027 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5028 if (lead_surrogates == nullptr) return;
5029 Zone* zone = compiler->zone();
5030 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5031 ZoneList<CharacterRange>* trail_surrogates =
5032 new (zone) ZoneList<CharacterRange>(1, zone);
5033 trail_surrogates->Add(
5034 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), zone);
5035
5036 RegExpNode* match =
5037 compiler->read_backward()
5038 // reading backward, make sure of no trail surrogate before the match.
5039 ? NegativeLookbehindAndMatch(compiler, trail_surrogates,
erikcorry 2016/01/20 10:47:05 I don't think so. Even if you are reading backwar
Yang 2016/01/20 13:06:20 I think it's the naming that's confusing. Inside t
5040 lead_surrogates, on_success, true)
5041 // reading forward, make sure of no trail surrogate after the match.
5042 : MatchAndNegativeLookahead(compiler, lead_surrogates,
5043 trail_surrogates, on_success, false);
5044 result->AddAlternative(GuardedAlternative(match));
5045 }
5046
5047
5048 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5049 RegExpNode* on_success,
5050 UnicodeRangeSplitter* splitter) {
5051 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5052 if (trail_surrogates == nullptr) return;
5053 Zone* zone = compiler->zone();
5054 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5055 ZoneList<CharacterRange>* lead_surrogates =
5056 new (zone) ZoneList<CharacterRange>(1, zone);
5057 lead_surrogates->Add(
5058 CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), zone);
5059
5060 RegExpNode* match =
5061 compiler->read_backward()
5062 // reading backward, make sure of no lead surrogate after the match.
5063 ? MatchAndNegativeLookahead(compiler, trail_surrogates,
5064 lead_surrogates, on_success, true)
5065 // reading forward, make sure of no lead surrogate before the match.
5066 : NegativeLookbehindAndMatch(compiler, lead_surrogates,
5067 trail_surrogates, on_success, false);
5068 result->AddAlternative(GuardedAlternative(match));
5069 }
5070
5071
4823 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5072 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4824 RegExpNode* on_success) { 5073 RegExpNode* on_success) {
4825 return new (compiler->zone()) 5074 set_.Canonicalize();
4826 TextNode(this, compiler->read_backward(), on_success); 5075 Zone* zone = compiler->zone();
4827 } 5076 ZoneList<CharacterRange>* ranges = this->ranges(zone);
4828 5077 if (compiler->unicode() && !compiler->one_byte()) {
4829 5078 if (is_negated()) {
5079 ZoneList<CharacterRange>* negated =
5080 new (zone) ZoneList<CharacterRange>(2, zone);
5081 CharacterRange::Negate(ranges, negated, zone);
5082 ranges = negated;
5083 }
5084 if (ranges->length() == 0) {
5085 // No matches possible.
5086 return new (zone) EndNode(EndNode::BACKTRACK, zone);
5087 }
5088 UnicodeRangeSplitter splitter(zone, ranges);
5089 ChoiceNode* result = new (compiler->zone()) ChoiceNode(2, compiler->zone());
5090 AddBmpCharacters(compiler, result, on_success, &splitter);
5091 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5092 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5093 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5094 return result;
5095 } else {
5096 // Limit the ranges to 0 .. 0xffff.
5097 int i;
5098 for (i = ranges->length() - 1; i >= 0; i--) {
5099 if (ranges->at(i).from() <= String::kMaxUtf16CodeUnit) break;
erikcorry 2016/01/20 10:47:05 For the one-byte case don't we want to have a lowe
Yang 2016/01/20 13:06:20 Actually this is not necessary since we do the lim
5100 }
5101 ranges->at(i).set_to(Min(ranges->at(i).to(), String::kMaxUtf16CodeUnit));
5102 ranges->Rewind(i + 1);
5103 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5104 }
5105 }
5106
5107
4830 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5108 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
4831 RegExpAtom* atom1 = (*a)->AsAtom(); 5109 RegExpAtom* atom1 = (*a)->AsAtom();
4832 RegExpAtom* atom2 = (*b)->AsAtom(); 5110 RegExpAtom* atom2 = (*b)->AsAtom();
4833 uc16 character1 = atom1->data().at(0); 5111 uc16 character1 = atom1->data().at(0);
4834 uc16 character2 = atom2->data().at(0); 5112 uc16 character2 = atom2->data().at(0);
4835 if (character1 < character2) return -1; 5113 if (character1 < character2) return -1;
4836 if (character1 > character2) return 1; 5114 if (character1 > character2) return 1;
4837 return 0; 5115 return 0;
4838 } 5116 }
4839 5117
(...skipping 491 matching lines...) Expand 10 before | Expand all | Expand 10 after
5331 compiler->read_backward(), on_success); 5609 compiler->read_backward(), on_success);
5332 } 5610 }
5333 5611
5334 5612
5335 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5613 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5336 RegExpNode* on_success) { 5614 RegExpNode* on_success) {
5337 return on_success; 5615 return on_success;
5338 } 5616 }
5339 5617
5340 5618
5619 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5620 int stack_pointer_register,
5621 int position_register,
5622 int capture_register_count,
5623 int capture_register_start)
5624 : is_positive_(is_positive),
5625 on_success_(on_success),
5626 stack_pointer_register_(stack_pointer_register),
5627 position_register_(position_register) {
5628 if (is_positive_) {
5629 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5630 stack_pointer_register, position_register, capture_register_count,
5631 capture_register_start, on_success_);
5632 } else {
5633 Zone* zone = on_success_->zone();
5634 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5635 stack_pointer_register, position_register, capture_register_count,
5636 capture_register_start, zone);
5637 }
5638 }
5639
5640
5641 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5642 if (is_positive_) {
5643 return ActionNode::BeginSubmatch(stack_pointer_register_,
5644 position_register_, match);
5645 } else {
5646 Zone* zone = on_success_->zone();
5647 // We use a ChoiceNode to represent the negative lookaround. The first
5648 // alternative is the negative match. On success, the end node backtracks.
5649 // On failure, the second alternative is tried and leads to success.
5650 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5651 // first exit when calculating quick checks.
5652 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5653 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5654 return ActionNode::BeginSubmatch(stack_pointer_register_,
5655 position_register_, choice_node);
5656 }
5657 }
5658
5659
5341 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 5660 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5342 RegExpNode* on_success) { 5661 RegExpNode* on_success) {
5343 int stack_pointer_register = compiler->AllocateRegister(); 5662 int stack_pointer_register = compiler->AllocateRegister();
5344 int position_register = compiler->AllocateRegister(); 5663 int position_register = compiler->AllocateRegister();
5345 5664
5346 const int registers_per_capture = 2; 5665 const int registers_per_capture = 2;
5347 const int register_of_first_capture = 2; 5666 const int register_of_first_capture = 2;
5348 int register_count = capture_count_ * registers_per_capture; 5667 int register_count = capture_count_ * registers_per_capture;
5349 int register_start = 5668 int register_start =
5350 register_of_first_capture + capture_from_ * registers_per_capture; 5669 register_of_first_capture + capture_from_ * registers_per_capture;
5351 5670
5352 RegExpNode* result; 5671 RegExpNode* result;
5353 bool was_reading_backward = compiler->read_backward(); 5672 bool was_reading_backward = compiler->read_backward();
5354 compiler->set_read_backward(type() == LOOKBEHIND); 5673 compiler->set_read_backward(type() == LOOKBEHIND);
5355 if (is_positive()) { 5674 Builder builder(is_positive(), on_success, stack_pointer_register,
5356 result = ActionNode::BeginSubmatch( 5675 position_register, register_count, register_start);
5357 stack_pointer_register, position_register, 5676 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5358 body()->ToNode(compiler, 5677 result = builder.ForMatch(match);
5359 ActionNode::PositiveSubmatchSuccess(
5360 stack_pointer_register, position_register,
5361 register_count, register_start, on_success)));
5362 } else {
5363 // We use a ChoiceNode for a negative lookahead because it has most of
5364 // the characteristics we need. It has the body of the lookahead as its
5365 // first alternative and the expression after the lookahead of the second
5366 // alternative. If the first alternative succeeds then the
5367 // NegativeSubmatchSuccess will unwind the stack including everything the
5368 // choice node set up and backtrack. If the first alternative fails then
5369 // the second alternative is tried, which is exactly the desired result
5370 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5371 // ChoiceNode that knows to ignore the first exit when calculating quick
5372 // checks.
5373 Zone* zone = compiler->zone();
5374
5375 GuardedAlternative body_alt(
5376 body()->ToNode(compiler, new (zone) NegativeSubmatchSuccess(
5377 stack_pointer_register, position_register,
5378 register_count, register_start, zone)));
5379 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5380 body_alt, GuardedAlternative(on_success), zone);
5381 result = ActionNode::BeginSubmatch(stack_pointer_register,
5382 position_register, choice_node);
5383 }
5384 compiler->set_read_backward(was_reading_backward); 5678 compiler->set_read_backward(was_reading_backward);
5385 return result; 5679 return result;
5386 } 5680 }
5387 5681
5388 5682
5389 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5683 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5390 RegExpNode* on_success) { 5684 RegExpNode* on_success) {
5391 return ToNode(body(), index(), compiler, on_success); 5685 return ToNode(body(), index(), compiler, on_success);
5392 } 5686 }
5393 5687
(...skipping 27 matching lines...) Expand all
5421 } 5715 }
5422 return current; 5716 return current;
5423 } 5717 }
5424 5718
5425 5719
5426 static void AddClass(const int* elmv, 5720 static void AddClass(const int* elmv,
5427 int elmc, 5721 int elmc,
5428 ZoneList<CharacterRange>* ranges, 5722 ZoneList<CharacterRange>* ranges,
5429 Zone* zone) { 5723 Zone* zone) {
5430 elmc--; 5724 elmc--;
5431 DCHECK(elmv[elmc] == 0x10000); 5725 DCHECK(elmv[elmc] == kRangeEndMarker);
5432 for (int i = 0; i < elmc; i += 2) { 5726 for (int i = 0; i < elmc; i += 2) {
5433 DCHECK(elmv[i] < elmv[i + 1]); 5727 DCHECK(elmv[i] < elmv[i + 1]);
5434 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone); 5728 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5435 } 5729 }
5436 } 5730 }
5437 5731
5438 5732
5439 static void AddClassNegated(const int *elmv, 5733 static void AddClassNegated(const int *elmv,
5440 int elmc, 5734 int elmc,
5441 ZoneList<CharacterRange>* ranges, 5735 ZoneList<CharacterRange>* ranges,
5442 Zone* zone) { 5736 Zone* zone) {
5443 elmc--; 5737 elmc--;
5444 DCHECK(elmv[elmc] == 0x10000); 5738 DCHECK(elmv[elmc] == kRangeEndMarker);
5445 DCHECK(elmv[0] != 0x0000); 5739 DCHECK(elmv[0] != 0x0000);
5446 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 5740 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
5447 uc16 last = 0x0000; 5741 uc16 last = 0x0000;
5448 for (int i = 0; i < elmc; i += 2) { 5742 for (int i = 0; i < elmc; i += 2) {
5449 DCHECK(last <= elmv[i] - 1); 5743 DCHECK(last <= elmv[i] - 1);
5450 DCHECK(elmv[i] < elmv[i + 1]); 5744 DCHECK(elmv[i] < elmv[i + 1]);
5451 ranges->Add(CharacterRange(last, elmv[i] - 1), zone); 5745 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5452 last = elmv[i + 1]; 5746 last = elmv[i + 1];
5453 } 5747 }
5454 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone); 5748 ranges->Add(CharacterRange(last, String::kMaxCodePoint), zone);
5455 } 5749 }
5456 5750
5457 5751
5458 void CharacterRange::AddClassEscape(uc16 type, 5752 void CharacterRange::AddClassEscape(uc16 type,
5459 ZoneList<CharacterRange>* ranges, 5753 ZoneList<CharacterRange>* ranges,
5460 Zone* zone) { 5754 Zone* zone) {
5461 switch (type) { 5755 switch (type) {
5462 case 's': 5756 case 's':
5463 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5757 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5464 break; 5758 break;
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
5501 UNREACHABLE(); 5795 UNREACHABLE();
5502 } 5796 }
5503 } 5797 }
5504 5798
5505 5799
5506 Vector<const int> CharacterRange::GetWordBounds() { 5800 Vector<const int> CharacterRange::GetWordBounds() {
5507 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5801 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5508 } 5802 }
5509 5803
5510 5804
5511 class CharacterRangeSplitter {
5512 public:
5513 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
5514 ZoneList<CharacterRange>** excluded,
5515 Zone* zone)
5516 : included_(included),
5517 excluded_(excluded),
5518 zone_(zone) { }
5519 void Call(uc16 from, DispatchTable::Entry entry);
5520
5521 static const int kInBase = 0;
5522 static const int kInOverlay = 1;
5523
5524 private:
5525 ZoneList<CharacterRange>** included_;
5526 ZoneList<CharacterRange>** excluded_;
5527 Zone* zone_;
5528 };
5529
5530
5531 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5532 if (!entry.out_set()->Get(kInBase)) return;
5533 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5534 ? included_
5535 : excluded_;
5536 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5537 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5538 }
5539
5540
5541 void CharacterRange::Split(ZoneList<CharacterRange>* base,
5542 Vector<const int> overlay,
5543 ZoneList<CharacterRange>** included,
5544 ZoneList<CharacterRange>** excluded,
5545 Zone* zone) {
5546 DCHECK_NULL(*included);
5547 DCHECK_NULL(*excluded);
5548 DispatchTable table(zone);
5549 for (int i = 0; i < base->length(); i++)
5550 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5551 for (int i = 0; i < overlay.length(); i += 2) {
5552 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5553 CharacterRangeSplitter::kInOverlay, zone);
5554 }
5555 CharacterRangeSplitter callback(included, excluded, zone);
5556 table.ForEach(&callback);
5557 }
5558
5559
5560 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 5805 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5561 ZoneList<CharacterRange>* ranges, 5806 ZoneList<CharacterRange>* ranges,
5562 bool is_one_byte) { 5807 bool is_one_byte) {
5563 uc16 bottom = from(); 5808 uc32 bottom = from();
5564 uc16 top = to(); 5809 uc32 top = to();
5810 // Nothing to be done for surrogates.
5811 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5565 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { 5812 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
5566 if (bottom > String::kMaxOneByteCharCode) return; 5813 if (bottom > String::kMaxOneByteCharCode) return;
5567 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5814 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5568 } 5815 }
5569 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5816 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5570 if (top == bottom) { 5817 if (top == bottom) {
5571 // If this is a singleton we just expand the one character. 5818 // If this is a singleton we just expand the one character.
5572 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5819 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5573 for (int i = 0; i < length; i++) { 5820 for (int i = 0; i < length; i++) {
5574 uc32 chr = chars[i]; 5821 uc32 chr = chars[i];
(...skipping 17 matching lines...) Expand all
5592 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 5839 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
5593 // add a range if it is not already contained in the input, so [c-f] 5840 // add a range if it is not already contained in the input, so [c-f]
5594 // will be skipped but [C-F] will be added. If this range is not 5841 // will be skipped but [C-F] will be added. If this range is not
5595 // completely contained in a block we do this for all the blocks 5842 // completely contained in a block we do this for all the blocks
5596 // covered by the range (handling characters that is not in a block 5843 // covered by the range (handling characters that is not in a block
5597 // as a "singleton block"). 5844 // as a "singleton block").
5598 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5845 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5599 int pos = bottom; 5846 int pos = bottom;
5600 while (pos <= top) { 5847 while (pos <= top) {
5601 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 5848 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5602 uc16 block_end; 5849 uc32 block_end;
5603 if (length == 0) { 5850 if (length == 0) {
5604 block_end = pos; 5851 block_end = pos;
5605 } else { 5852 } else {
5606 DCHECK_EQ(1, length); 5853 DCHECK_EQ(1, length);
5607 block_end = range[0]; 5854 block_end = range[0];
5608 } 5855 }
5609 int end = (block_end > top) ? top : block_end; 5856 int end = (block_end > top) ? top : block_end;
5610 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 5857 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5611 for (int i = 0; i < length; i++) { 5858 for (int i = 0; i < length; i++) {
5612 uc32 c = range[i]; 5859 uc32 c = range[i];
5613 uc16 range_from = c - (block_end - pos); 5860 uc32 range_from = c - (block_end - pos);
5614 uc16 range_to = c - (block_end - end); 5861 uc32 range_to = c - (block_end - end);
5615 if (!(bottom <= range_from && range_to <= top)) { 5862 if (!(bottom <= range_from && range_to <= top)) {
5616 ranges->Add(CharacterRange(range_from, range_to), zone); 5863 ranges->Add(CharacterRange(range_from, range_to), zone);
5617 } 5864 }
5618 } 5865 }
5619 pos = end + 1; 5866 pos = end + 1;
5620 } 5867 }
5621 } 5868 }
5622 } 5869 }
5623 5870
5624 5871
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
5665 5912
5666 5913
5667 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 5914 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5668 int count, 5915 int count,
5669 CharacterRange insert) { 5916 CharacterRange insert) {
5670 // Inserts a range into list[0..count[, which must be sorted 5917 // Inserts a range into list[0..count[, which must be sorted
5671 // by from value and non-overlapping and non-adjacent, using at most 5918 // by from value and non-overlapping and non-adjacent, using at most
5672 // list[0..count] for the result. Returns the number of resulting 5919 // list[0..count] for the result. Returns the number of resulting
5673 // canonicalized ranges. Inserting a range may collapse existing ranges into 5920 // canonicalized ranges. Inserting a range may collapse existing ranges into
5674 // fewer ranges, so the return value can be anything in the range 1..count+1. 5921 // fewer ranges, so the return value can be anything in the range 1..count+1.
5675 uc16 from = insert.from(); 5922 uc32 from = insert.from();
5676 uc16 to = insert.to(); 5923 uc32 to = insert.to();
5677 int start_pos = 0; 5924 int start_pos = 0;
5678 int end_pos = count; 5925 int end_pos = count;
5679 for (int i = count - 1; i >= 0; i--) { 5926 for (int i = count - 1; i >= 0; i--) {
5680 CharacterRange current = list->at(i); 5927 CharacterRange current = list->at(i);
5681 if (current.from() > to + 1) { 5928 if (current.from() > to + 1) {
5682 end_pos = i; 5929 end_pos = i;
5683 } else if (current.to() + 1 < from) { 5930 } else if (current.to() + 1 < from) {
5684 start_pos = i + 1; 5931 start_pos = i + 1;
5685 break; 5932 break;
5686 } 5933 }
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
5766 DCHECK(CharacterRange::IsCanonical(character_ranges)); 6013 DCHECK(CharacterRange::IsCanonical(character_ranges));
5767 } 6014 }
5768 6015
5769 6016
5770 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 6017 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5771 ZoneList<CharacterRange>* negated_ranges, 6018 ZoneList<CharacterRange>* negated_ranges,
5772 Zone* zone) { 6019 Zone* zone) {
5773 DCHECK(CharacterRange::IsCanonical(ranges)); 6020 DCHECK(CharacterRange::IsCanonical(ranges));
5774 DCHECK_EQ(0, negated_ranges->length()); 6021 DCHECK_EQ(0, negated_ranges->length());
5775 int range_count = ranges->length(); 6022 int range_count = ranges->length();
5776 uc16 from = 0; 6023 uc32 from = 0;
5777 int i = 0; 6024 int i = 0;
5778 if (range_count > 0 && ranges->at(0).from() == 0) { 6025 if (range_count > 0 && ranges->at(0).from() == 0) {
5779 from = ranges->at(0).to(); 6026 from = ranges->at(0).to();
5780 i = 1; 6027 i = 1;
5781 } 6028 }
5782 while (i < range_count) { 6029 while (i < range_count) {
5783 CharacterRange range = ranges->at(i); 6030 CharacterRange range = ranges->at(i);
5784 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone); 6031 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5785 from = range.to(); 6032 from = range.to();
5786 i++; 6033 i++;
5787 } 6034 }
5788 if (from < String::kMaxUtf16CodeUnit) { 6035 if (from < String::kMaxCodePoint) {
5789 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit), 6036 negated_ranges->Add(CharacterRange(from + 1, String::kMaxCodePoint), zone);
5790 zone);
5791 } 6037 }
5792 } 6038 }
5793 6039
5794 6040
5795 // ------------------------------------------------------------------- 6041 // -------------------------------------------------------------------
5796 // Splay tree 6042 // Splay tree
5797 6043
5798 6044
5799 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 6045 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5800 if (Get(value)) 6046 if (Get(value))
(...skipping 30 matching lines...) Expand all
5831 if (value < kFirstLimit) { 6077 if (value < kFirstLimit) {
5832 return (first_ & (1 << value)) != 0; 6078 return (first_ & (1 << value)) != 0;
5833 } else if (remaining_ == NULL) { 6079 } else if (remaining_ == NULL) {
5834 return false; 6080 return false;
5835 } else { 6081 } else {
5836 return remaining_->Contains(value); 6082 return remaining_->Contains(value);
5837 } 6083 }
5838 } 6084 }
5839 6085
5840 6086
5841 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 6087 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
5842 6088
5843 6089
5844 void DispatchTable::AddRange(CharacterRange full_range, int value, 6090 void DispatchTable::AddRange(CharacterRange full_range, int value,
5845 Zone* zone) { 6091 Zone* zone) {
5846 CharacterRange current = full_range; 6092 CharacterRange current = full_range;
5847 if (tree()->is_empty()) { 6093 if (tree()->is_empty()) {
5848 // If this is the first range we just insert into the table. 6094 // If this is the first range we just insert into the table.
5849 ZoneSplayTree<Config>::Locator loc; 6095 ZoneSplayTree<Config>::Locator loc;
5850 bool inserted = tree()->Insert(current.from(), &loc); 6096 bool inserted = tree()->Insert(current.from(), &loc);
5851 DCHECK(inserted); 6097 DCHECK(inserted);
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
5933 USE(inserted); 6179 USE(inserted);
5934 ins.set_value(Entry(current.from(), 6180 ins.set_value(Entry(current.from(),
5935 current.to(), 6181 current.to(),
5936 empty()->Extend(value, zone))); 6182 empty()->Extend(value, zone)));
5937 break; 6183 break;
5938 } 6184 }
5939 } 6185 }
5940 } 6186 }
5941 6187
5942 6188
5943 OutSet* DispatchTable::Get(uc16 value) { 6189 OutSet* DispatchTable::Get(uc32 value) {
5944 ZoneSplayTree<Config>::Locator loc; 6190 ZoneSplayTree<Config>::Locator loc;
5945 if (!tree()->FindGreatestLessThan(value, &loc)) 6191 if (!tree()->FindGreatestLessThan(value, &loc))
5946 return empty(); 6192 return empty();
5947 Entry* entry = &loc.value(); 6193 Entry* entry = &loc.value();
5948 if (value <= entry->to()) 6194 if (value <= entry->to())
5949 return entry->out_set(); 6195 return entry->out_set();
5950 else 6196 else
5951 return empty(); 6197 return empty();
5952 } 6198 }
5953 6199
(...skipping 297 matching lines...) Expand 10 before | Expand all | Expand 10 after
6251 } 6497 }
6252 6498
6253 6499
6254 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6500 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6255 RegExpNode* target = that->on_success(); 6501 RegExpNode* target = that->on_success();
6256 target->Accept(this); 6502 target->Accept(this);
6257 } 6503 }
6258 6504
6259 6505
6260 RegExpEngine::CompilationResult RegExpEngine::Compile( 6506 RegExpEngine::CompilationResult RegExpEngine::Compile(
6261 Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case, 6507 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6262 bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern, 6508 JSRegExp::Flags flags, Handle<String> pattern,
6263 Handle<String> sample_subject, bool is_one_byte) { 6509 Handle<String> sample_subject, bool is_one_byte) {
6264 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6510 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6265 return IrregexpRegExpTooBig(isolate); 6511 return IrregexpRegExpTooBig(isolate);
6266 } 6512 }
6267 RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case, 6513 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6514 bool is_sticky = flags & JSRegExp::kSticky;
6515 bool is_global = flags & JSRegExp::kGlobal;
6516 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6268 is_one_byte); 6517 is_one_byte);
6269 6518
6270 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6519 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6271 6520
6272 // Sample some characters from the middle of the string. 6521 // Sample some characters from the middle of the string.
6273 static const int kSampleSize = 128; 6522 static const int kSampleSize = 128;
6274 6523
6275 sample_subject = String::Flatten(sample_subject); 6524 sample_subject = String::Flatten(sample_subject);
6276 int chars_sampled = 0; 6525 int chars_sampled = 0;
6277 int half_way = (sample_subject->length() - kSampleSize) / 2; 6526 int half_way = (sample_subject->length() - kSampleSize) / 2;
(...skipping 222 matching lines...) Expand 10 before | Expand all | Expand 10 after
6500 6749
6501 6750
6502 void RegExpResultsCache::Clear(FixedArray* cache) { 6751 void RegExpResultsCache::Clear(FixedArray* cache) {
6503 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6752 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6504 cache->set(i, Smi::FromInt(0)); 6753 cache->set(i, Smi::FromInt(0));
6505 } 6754 }
6506 } 6755 }
6507 6756
6508 } // namespace internal 6757 } // namespace internal
6509 } // namespace v8 6758 } // namespace v8
OLDNEW
« src/regexp/jsregexp.h ('K') | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-ast.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698