src/regexp/jsregexp.cc - Issue 1578253005: [regexp] implement character classes for unicode regexps.

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: lookaround builder Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/jsregexp.h"	5 #include "src/regexp/jsregexp.h"

6	6

7 #include "src/ast/ast.h"	7 #include "src/ast/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
65 Handle<String> error_text) {	65 Handle<String> error_text) {

66 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));	66 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));

67 }	67 }

68	68

69	69

70 ContainedInLattice AddRange(ContainedInLattice containment,	70 ContainedInLattice AddRange(ContainedInLattice containment,

71 const int* ranges,	71 const int* ranges,

72 int ranges_length,	72 int ranges_length,

73 Interval new_range) {	73 Interval new_range) {

74 DCHECK((ranges_length & 1) == 1);	74 DCHECK((ranges_length & 1) == 1);

75 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);	75 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);

76 if (containment == kLatticeUnknown) return containment;	76 if (containment == kLatticeUnknown) return containment;

77 bool inside = false;	77 bool inside = false;

78 int last = 0;	78 int last = 0;

79 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {	79 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {

80 // Consider the range from last to ranges[i].	80 // Consider the range from last to ranges[i].

81 // We haven't got to the new range yet.	81 // We haven't got to the new range yet.

82 if (ranges[i] <= new_range.from()) continue;	82 if (ranges[i] <= new_range.from()) continue;

83 // New range is wholly inside last-ranges[i]. Note that new_range.to() is	83 // New range is wholly inside last-ranges[i]. Note that new_range.to() is

84 // inclusive, but the values in ranges are not.	84 // inclusive, but the values in ranges are not.

85 if (last <= new_range.from() && new_range.to() < ranges[i]) {	85 if (last <= new_range.from() && new_range.to() < ranges[i]) {

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
138	138

139 Handle<Object> result;	139 Handle<Object> result;

140 if (in_cache) {	140 if (in_cache) {

141 re->set_data(*cached);	141 re->set_data(*cached);

142 return re;	142 return re;

143 }	143 }

144 pattern = String::Flatten(pattern);	144 pattern = String::Flatten(pattern);

145 PostponeInterruptsScope postpone(isolate);	145 PostponeInterruptsScope postpone(isolate);

146 RegExpCompileData parse_result;	146 RegExpCompileData parse_result;

147 FlatStringReader reader(isolate, pattern);	147 FlatStringReader reader(isolate, pattern);

148 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader,	148 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,

149 flags & JSRegExp::kMultiline,	149 &parse_result)) {

150 flags & JSRegExp::kUnicode, &parse_result)) {

151 // Throw an exception if we fail to parse the pattern.	150 // Throw an exception if we fail to parse the pattern.

152 return ThrowRegExpException(re, pattern, parse_result.error);	151 return ThrowRegExpException(re, pattern, parse_result.error);

153 }	152 }

154	153

155 bool has_been_compiled = false;	154 bool has_been_compiled = false;

156	155

157 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&	156 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&

158 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {	157 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {

159 // Parse-tree is a single atom that is equal to the pattern.	158 // Parse-tree is a single atom that is equal to the pattern.

160 AtomCompile(re, pattern, flags, pattern);	159 AtomCompile(re, pattern, flags, pattern);

(...skipping 203 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
364 ThrowRegExpException(re, error_message);	363 ThrowRegExpException(re, error_message);

365 return false;	364 return false;

366 }	365 }

367	366

368 JSRegExp::Flags flags = re->GetFlags();	367 JSRegExp::Flags flags = re->GetFlags();

369	368

370 Handle<String> pattern(re->Pattern());	369 Handle<String> pattern(re->Pattern());

371 pattern = String::Flatten(pattern);	370 pattern = String::Flatten(pattern);

372 RegExpCompileData compile_data;	371 RegExpCompileData compile_data;

373 FlatStringReader reader(isolate, pattern);	372 FlatStringReader reader(isolate, pattern);

374 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader,	373 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,

375 flags & JSRegExp::kMultiline,	374 &compile_data)) {

376 flags & JSRegExp::kUnicode, &compile_data)) {

377 // Throw an exception if we fail to parse the pattern.	375 // Throw an exception if we fail to parse the pattern.

378 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.	376 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.

379 USE(ThrowRegExpException(re, pattern, compile_data.error));	377 USE(ThrowRegExpException(re, pattern, compile_data.error));

380 return false;	378 return false;

381 }	379 }

382 RegExpEngine::CompilationResult result = RegExpEngine::Compile(	380 RegExpEngine::CompilationResult result =

383 isolate, &zone, &compile_data, flags & JSRegExp::kIgnoreCase,	381 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,

384 flags & JSRegExp::kGlobal, flags & JSRegExp::kMultiline,	382 sample_subject, is_one_byte);

385 flags & JSRegExp::kSticky, pattern, sample_subject, is_one_byte);

386 if (result.error_message != NULL) {	383 if (result.error_message != NULL) {

387 // Unable to compile regexp.	384 // Unable to compile regexp.

388 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(	385 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(

389 CStrVector(result.error_message)).ToHandleChecked();	386 CStrVector(result.error_message)).ToHandleChecked();

390 ThrowRegExpException(re, error_message);	387 ThrowRegExpException(re, error_message);

391 return false;	388 return false;

392 }	389 }

393	390

394 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));	391 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));

395 data->set(JSRegExp::code_index(is_one_byte), result.code);	392 data->set(JSRegExp::code_index(is_one_byte), result.code);

(...skipping 542 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
938	935

939 private:	936 private:

940 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];	937 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];

941 int total_samples_;	938 int total_samples_;

942 };	939 };

943	940

944	941

945 class RegExpCompiler {	942 class RegExpCompiler {

946 public:	943 public:

947 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,	944 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,

948 bool ignore_case, bool is_one_byte);	945 JSRegExp::Flags flags, bool is_one_byte);

949	946

950 int AllocateRegister() {	947 int AllocateRegister() {

951 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {	948 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {

952 reg_exp_too_big_ = true;	949 reg_exp_too_big_ = true;

953 return next_register_;	950 return next_register_;

954 }	951 }

955 return next_register_++;	952 return next_register_++;

956 }	953 }

957	954

	955 // Lookarounds to match lone surrogates for unicode character class matches

	956 // are never nested. We can therefore reuse registers.

	957 int UnicodeLookaroundStackRegister() {

	958 if (unicode_lookaround_stack_register_ == kNoRegister) {

	959 unicode_lookaround_stack_register_ = AllocateRegister();

	960 }

	961 return unicode_lookaround_stack_register_;

	962 }

	963

	964 int UnicodeLookaroundPositionRegister() {

	965 if (unicode_lookaround_position_register_ == kNoRegister) {

	966 unicode_lookaround_position_register_ = AllocateRegister();

	967 }

	968 return unicode_lookaround_position_register_;

	969 }

	970

958 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,	971 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,

959 RegExpNode* start,	972 RegExpNode* start,

960 int capture_count,	973 int capture_count,

961 Handle<String> pattern);	974 Handle<String> pattern);

962	975

963 inline void AddWork(RegExpNode* node) {	976 inline void AddWork(RegExpNode* node) {

964 if (!node->on_work_list() && !node->label()->is_bound()) {	977 if (!node->on_work_list() && !node->label()->is_bound()) {

965 node->set_on_work_list(true);	978 node->set_on_work_list(true);

966 work_list_->Add(node);	979 work_list_->Add(node);

967 }	980 }

968 }	981 }

969	982

970 static const int kImplementationOffset = 0;	983 static const int kImplementationOffset = 0;

971 static const int kNumberOfRegistersOffset = 0;	984 static const int kNumberOfRegistersOffset = 0;

972 static const int kCodeOffset = 1;	985 static const int kCodeOffset = 1;

973	986

974 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }	987 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }

975 EndNode* accept() { return accept_; }	988 EndNode* accept() { return accept_; }

976	989

977 static const int kMaxRecursion = 100;	990 static const int kMaxRecursion = 100;

978 inline int recursion_depth() { return recursion_depth_; }	991 inline int recursion_depth() { return recursion_depth_; }

979 inline void IncrementRecursionDepth() { recursion_depth_++; }	992 inline void IncrementRecursionDepth() { recursion_depth_++; }

980 inline void DecrementRecursionDepth() { recursion_depth_--; }	993 inline void DecrementRecursionDepth() { recursion_depth_--; }

981	994

982 void SetRegExpTooBig() { reg_exp_too_big_ = true; }	995 void SetRegExpTooBig() { reg_exp_too_big_ = true; }

983	996

984 inline bool ignore_case() { return ignore_case_; }	997 inline bool ignore_case() { return flags_ & JSRegExp::kIgnoreCase; }
	erikcorry 2016/01/20 10:47:05 Do we allow implicit int->bool conversions? Do we allow implicit int->bool conversions? Yang 2016/01/20 13:06:20 We do this else where, for example in src/compiler Show quoted text On 2016/01/20 10:47:05, erikcorry wrote: > Do we allow implicit int->bool conversions? We do this else where, for example in src/compiler/machine-operator.h. But I guess it can't hurt to have it explicit.
	998 inline bool unicode() { return flags_ & JSRegExp::kUnicode; }

985 inline bool one_byte() { return one_byte_; }	999 inline bool one_byte() { return one_byte_; }

986 inline bool optimize() { return optimize_; }	1000 inline bool optimize() { return optimize_; }

987 inline void set_optimize(bool value) { optimize_ = value; }	1001 inline void set_optimize(bool value) { optimize_ = value; }

988 inline bool limiting_recursion() { return limiting_recursion_; }	1002 inline bool limiting_recursion() { return limiting_recursion_; }

989 inline void set_limiting_recursion(bool value) {	1003 inline void set_limiting_recursion(bool value) {

990 limiting_recursion_ = value;	1004 limiting_recursion_ = value;

991 }	1005 }

992 bool read_backward() { return read_backward_; }	1006 bool read_backward() { return read_backward_; }

993 void set_read_backward(bool value) { read_backward_ = value; }	1007 void set_read_backward(bool value) { read_backward_ = value; }

994 FrequencyCollator* frequency_collator() { return &frequency_collator_; }	1008 FrequencyCollator* frequency_collator() { return &frequency_collator_; }

995	1009

996 int current_expansion_factor() { return current_expansion_factor_; }	1010 int current_expansion_factor() { return current_expansion_factor_; }

997 void set_current_expansion_factor(int value) {	1011 void set_current_expansion_factor(int value) {

998 current_expansion_factor_ = value;	1012 current_expansion_factor_ = value;

999 }	1013 }

1000	1014

1001 Isolate* isolate() const { return isolate_; }	1015 Isolate* isolate() const { return isolate_; }

1002 Zone* zone() const { return zone_; }	1016 Zone* zone() const { return zone_; }

1003	1017

1004 static const int kNoRegister = -1;	1018 static const int kNoRegister = -1;

1005	1019

1006 private:	1020 private:

1007 EndNode* accept_;	1021 EndNode* accept_;

1008 int next_register_;	1022 int next_register_;

	1023 int unicode_lookaround_stack_register_;

	1024 int unicode_lookaround_position_register_;

1009 List<RegExpNode> work_list_;	1025 List<RegExpNode> work_list_;

1010 int recursion_depth_;	1026 int recursion_depth_;

1011 RegExpMacroAssembler* macro_assembler_;	1027 RegExpMacroAssembler* macro_assembler_;

1012 bool ignore_case_;	1028 JSRegExp::Flags flags_;

1013 bool one_byte_;	1029 bool one_byte_;

1014 bool reg_exp_too_big_;	1030 bool reg_exp_too_big_;

1015 bool limiting_recursion_;	1031 bool limiting_recursion_;

1016 bool optimize_;	1032 bool optimize_;

1017 bool read_backward_;	1033 bool read_backward_;

1018 int current_expansion_factor_;	1034 int current_expansion_factor_;

1019 FrequencyCollator frequency_collator_;	1035 FrequencyCollator frequency_collator_;

1020 Isolate* isolate_;	1036 Isolate* isolate_;

1021 Zone* zone_;	1037 Zone* zone_;

1022 };	1038 };

(...skipping 11 matching lines...) Expand all Loading...
1034	1050

1035	1051

1036 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {	1052 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {

1037 return RegExpEngine::CompilationResult(isolate, "RegExp too big");	1053 return RegExpEngine::CompilationResult(isolate, "RegExp too big");

1038 }	1054 }

1039	1055

1040	1056

1041 // Attempts to compile the regexp using an Irregexp code generator. Returns	1057 // Attempts to compile the regexp using an Irregexp code generator. Returns

1042 // a fixed array or a null handle depending on whether it succeeded.	1058 // a fixed array or a null handle depending on whether it succeeded.

1043 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,	1059 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,

1044 bool ignore_case, bool one_byte)	1060 JSRegExp::Flags flags, bool one_byte)

1045 : next_register_(2 * (capture_count + 1)),	1061 : next_register_(2 * (capture_count + 1)),

	1062 unicode_lookaround_stack_register_(kNoRegister),

	1063 unicode_lookaround_position_register_(kNoRegister),

1046 work_list_(NULL),	1064 work_list_(NULL),

1047 recursion_depth_(0),	1065 recursion_depth_(0),

1048 ignore_case_(ignore_case),	1066 flags_(flags),

1049 one_byte_(one_byte),	1067 one_byte_(one_byte),

1050 reg_exp_too_big_(false),	1068 reg_exp_too_big_(false),

1051 limiting_recursion_(false),	1069 limiting_recursion_(false),

1052 optimize_(FLAG_regexp_optimization),	1070 optimize_(FLAG_regexp_optimization),

1053 read_backward_(false),	1071 read_backward_(false),

1054 current_expansion_factor_(1),	1072 current_expansion_factor_(1),

1055 frequency_collator_(),	1073 frequency_collator_(),

1056 isolate_(isolate),	1074 isolate_(isolate),

1057 zone_(zone) {	1075 zone_(zone) {

1058 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);	1076 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);

(...skipping 629 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1688 typedef bool EmitCharacterFunction(Isolate* isolate,	1706 typedef bool EmitCharacterFunction(Isolate* isolate,

1689 RegExpCompiler* compiler,	1707 RegExpCompiler* compiler,

1690 uc16 c,	1708 uc16 c,

1691 Label* on_failure,	1709 Label* on_failure,

1692 int cp_offset,	1710 int cp_offset,

1693 bool check,	1711 bool check,

1694 bool preloaded);	1712 bool preloaded);

1695	1713

1696 // Only emits letters (things that have case). Only used for case independent	1714 // Only emits letters (things that have case). Only used for case independent

1697 // matches.	1715 // matches.

1698 static inline bool EmitAtomLetter(Isolate* isolate,	1716 static inline bool EmitAtomLetter(Isolate* isolate,
	erikcorry 2016/01/21 10:58:56 This is not /u-ified. Does it get called in /ui m This is not /u-ified. Does it get called in /ui mode? Yang 2016/01/21 11:49:04 No. This only gets called if the TextNode element Show quoted text On 2016/01/21 10:58:56, erikcorry wrote: > This is not /u-ified. Does it get called in /ui mode? No. This only gets called if the TextNode element kind is an TextElement::ATOM as opposed to TextElement::CHAR_CLASS. However, in all the cases where unicode is relevant (surrogate pair of lone surrogates), we create a RegExpCharacterClass as a standalone term, which then desugars into whatever we need. TextNodes it desugars to are all created via TextNode::CharClass though, EmitCharClass eventually does the job of emitting code, which generates good code for singleton character classes as well. For case independent matching, which is handled by EmitAtomLetter, I have a separate CL prepared.
1699 RegExpCompiler* compiler,	1717 RegExpCompiler* compiler,

1700 uc16 c,	1718 uc16 c,

1701 Label* on_failure,	1719 Label* on_failure,

1702 int cp_offset,	1720 int cp_offset,

1703 bool check,	1721 bool check,

1704 bool preloaded) {	1722 bool preloaded) {

1705 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	1723 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

1706 bool one_byte = compiler->one_byte();	1724 bool one_byte = compiler->one_byte();

1707 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1725 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1708 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);	1726 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);

(...skipping 382 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2091 flip ? even_label : odd_label);	2109 flip ? even_label : odd_label);

2092 }	2110 }

2093 }	2111 }

2094	2112

2095	2113

2096 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,	2114 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

2097 RegExpCharacterClass* cc, bool one_byte,	2115 RegExpCharacterClass* cc, bool one_byte,

2098 Label* on_failure, int cp_offset, bool check_offset,	2116 Label* on_failure, int cp_offset, bool check_offset,

2099 bool preloaded, Zone* zone) {	2117 bool preloaded, Zone* zone) {

2100 ZoneList<CharacterRange>* ranges = cc->ranges(zone);	2118 ZoneList<CharacterRange>* ranges = cc->ranges(zone);

2101 if (!CharacterRange::IsCanonical(ranges)) {	2119 CharacterRange::Canonicalize(ranges);

2102 CharacterRange::Canonicalize(ranges);

2103 }

2104	2120

2105 int max_char;	2121 int max_char;

2106 if (one_byte) {	2122 if (one_byte) {

2107 max_char = String::kMaxOneByteCharCode;	2123 max_char = String::kMaxOneByteCharCode;

2108 } else {	2124 } else {

2109 max_char = String::kMaxUtf16CodeUnit;	2125 max_char = String::kMaxUtf16CodeUnit;

2110 }	2126 }

2111	2127

2112 int range_count = ranges->length();	2128 int range_count = ranges->length();

2113	2129

(...skipping 21 matching lines...) Expand all Loading...
2135 if (cc->is_negated()) {	2151 if (cc->is_negated()) {

2136 macro_assembler->GoTo(on_failure);	2152 macro_assembler->GoTo(on_failure);

2137 } else {	2153 } else {

2138 // This is a common case hit by non-anchored expressions.	2154 // This is a common case hit by non-anchored expressions.

2139 if (check_offset) {	2155 if (check_offset) {

2140 macro_assembler->CheckPosition(cp_offset, on_failure);	2156 macro_assembler->CheckPosition(cp_offset, on_failure);

2141 }	2157 }

2142 }	2158 }

2143 return;	2159 return;

2144 }	2160 }

2145 if (last_valid_range == 0 &&

2146 !cc->is_negated() &&

2147 ranges->at(0).IsEverything(max_char)) {

2148 // This is a common case hit by non-anchored expressions.

2149 if (check_offset) {

2150 macro_assembler->CheckPosition(cp_offset, on_failure);

2151 }

2152 return;

2153 }

2154	2161

2155 if (!preloaded) {	2162 if (!preloaded) {

2156 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);	2163 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);

2157 }	2164 }

2158	2165

2159 if (cc->is_standard(zone) &&	2166 if (cc->is_standard(zone) &&

2160 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),	2167 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),

2161 on_failure)) {	2168 on_failure)) {

2162 return;	2169 return;

2163 }	2170 }

2164	2171

2165	2172

2166 // A new list with ascending entries. Each entry is a code unit	2173 // A new list with ascending entries. Each entry is a code unit

2167 // where there is a boundary between code units that are part of	2174 // where there is a boundary between code units that are part of

2168 // the class and code units that are not. Normally we insert an	2175 // the class and code units that are not. Normally we insert an

2169 // entry at zero which goes to the failure label, but if there	2176 // entry at zero which goes to the failure label, but if there

2170 // was already one there we fall through for success on that entry.	2177 // was already one there we fall through for success on that entry.

2171 // Subsequent entries have alternating meaning (success/failure).	2178 // Subsequent entries have alternating meaning (success/failure).

(...skipping 619 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2791 // Character is outside Latin-1 completely	2798 // Character is outside Latin-1 completely

2792 if (converted == 0) return set_replacement(NULL);	2799 if (converted == 0) return set_replacement(NULL);

2793 // Convert quark to Latin-1 in place.	2800 // Convert quark to Latin-1 in place.

2794 uint16_t* copy = const_cast<uint16_t*>(quarks.start());	2801 uint16_t* copy = const_cast<uint16_t*>(quarks.start());

2795 copy[j] = converted;	2802 copy[j] = converted;

2796 }	2803 }

2797 } else {	2804 } else {

2798 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);	2805 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);

2799 RegExpCharacterClass* cc = elm.char_class();	2806 RegExpCharacterClass* cc = elm.char_class();

2800 ZoneList<CharacterRange>* ranges = cc->ranges(zone());	2807 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

2801 if (!CharacterRange::IsCanonical(ranges)) {	2808 CharacterRange::Canonicalize(ranges);

2802 CharacterRange::Canonicalize(ranges);

2803 }

2804 // Now they are in order so we only need to look at the first.	2809 // Now they are in order so we only need to look at the first.

2805 int range_count = ranges->length();	2810 int range_count = ranges->length();

2806 if (cc->is_negated()) {	2811 if (cc->is_negated()) {

2807 if (range_count != 0 &&	2812 if (range_count != 0 &&

2808 ranges->at(0).from() == 0 &&	2813 ranges->at(0).from() == 0 &&

2809 ranges->at(0).to() >= String::kMaxOneByteCharCode) {	2814 ranges->at(0).to() >= String::kMaxOneByteCharCode) {

2810 // This will be handled in a later filter.	2815 // This will be handled in a later filter.

2811 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;	2816 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

2812 return set_replacement(NULL);	2817 return set_replacement(NULL);

2813 }	2818 }

(...skipping 468 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3282 bool TextNode::SkipPass(int int_pass, bool ignore_case) {	3287 bool TextNode::SkipPass(int int_pass, bool ignore_case) {

3283 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);	3288 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);

3284 if (ignore_case) {	3289 if (ignore_case) {

3285 return pass == SIMPLE_CHARACTER_MATCH;	3290 return pass == SIMPLE_CHARACTER_MATCH;

3286 } else {	3291 } else {

3287 return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;	3292 return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;

3288 }	3293 }

3289 }	3294 }

3290	3295

3291	3296

	3297 TextNode* TextNode::CharacterRanges(Zone* zone,

	3298 ZoneList<CharacterRange>* ranges,

	3299 bool read_backward,

	3300 RegExpNode* on_success) {

	3301 DCHECK_NOT_NULL(ranges);

	3302 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);

	3303 elms->Add(

	3304 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),

	3305 zone);

	3306 return new (zone) TextNode(elms, read_backward, on_success);

	3307 }

	3308

	3309

	3310 TextNode* TextNode::SurrogatePair(Zone* zone, CharacterRange lead,

	3311 CharacterRange trail, bool read_backward,

	3312 RegExpNode* on_success) {

	3313 ZoneList<CharacterRange>* lead_ranges =

	3314 new (zone) ZoneList<CharacterRange>(1, zone);

	3315 lead_ranges->Add(lead, zone);

	3316 ZoneList<CharacterRange>* trail_ranges =

	3317 new (zone) ZoneList<CharacterRange>(1, zone);

	3318 trail_ranges->Add(trail, zone);

	3319 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);

	3320 elms->Add(TextElement::CharClass(

	3321 new (zone) RegExpCharacterClass(lead_ranges, false)),

	3322 zone);

	3323 elms->Add(TextElement::CharClass(

	3324 new (zone) RegExpCharacterClass(trail_ranges, false)),

	3325 zone);

	3326 return new (zone) TextNode(elms, read_backward, on_success);

	3327 }

	3328

	3329

3292 // This generates the code to match a text node. A text node can contain	3330 // This generates the code to match a text node. A text node can contain

3293 // straight character sequences (possibly to be matched in a case-independent	3331 // straight character sequences (possibly to be matched in a case-independent

3294 // way) and character classes. For efficiency we do not do this in a single	3332 // way) and character classes. For efficiency we do not do this in a single

3295 // pass from left to right. Instead we pass over the text node several times,	3333 // pass from left to right. Instead we pass over the text node several times,

3296 // emitting code for some character positions every time. See the comment on	3334 // emitting code for some character positions every time. See the comment on

3297 // TextEmitPass for details.	3335 // TextEmitPass for details.

3298 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {	3336 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3299 LimitResult limit_result = LimitVersions(compiler, trace);	3337 LimitResult limit_result = LimitVersions(compiler, trace);

3300 if (limit_result == DONE) return;	3338 if (limit_result == DONE) return;

3301 DCHECK(limit_result == CONTINUE);	3339 DCHECK(limit_result == CONTINUE);

(...skipping 96 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3398	3436

3399	3437

3400 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(	3438 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(

3401 RegExpCompiler* compiler) {	3439 RegExpCompiler* compiler) {

3402 if (read_backward()) return NULL;	3440 if (read_backward()) return NULL;

3403 if (elements()->length() != 1) return NULL;	3441 if (elements()->length() != 1) return NULL;

3404 TextElement elm = elements()->at(0);	3442 TextElement elm = elements()->at(0);

3405 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;	3443 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;

3406 RegExpCharacterClass* node = elm.char_class();	3444 RegExpCharacterClass* node = elm.char_class();

3407 ZoneList<CharacterRange>* ranges = node->ranges(zone());	3445 ZoneList<CharacterRange>* ranges = node->ranges(zone());

3408 if (!CharacterRange::IsCanonical(ranges)) {	3446 CharacterRange::Canonicalize(ranges);

3409 CharacterRange::Canonicalize(ranges);

3410 }

3411 if (node->is_negated()) {	3447 if (node->is_negated()) {

3412 return ranges->length() == 0 ? on_success() : NULL;	3448 return ranges->length() == 0 ? on_success() : NULL;

3413 }	3449 }

3414 if (ranges->length() != 1) return NULL;	3450 if (ranges->length() != 1) return NULL;

3415 uint32_t max_char;	3451 uint32_t max_char;

3416 if (compiler->one_byte()) {	3452 if (compiler->one_byte()) {

3417 max_char = String::kMaxOneByteCharCode;	3453 max_char = String::kMaxOneByteCharCode;

3418 } else {	3454 } else {

3419 max_char = String::kMaxUtf16CodeUnit;	3455 max_char = String::kMaxUtf16CodeUnit;

3420 }	3456 }

(...skipping 126 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3547 return alt_gens_[i];	3583 return alt_gens_[i];

3548 }	3584 }

3549	3585

3550 private:	3586 private:

3551 static const int kAFew = 10;	3587 static const int kAFew = 10;

3552 ZoneList<AlternativeGeneration*> alt_gens_;	3588 ZoneList<AlternativeGeneration*> alt_gens_;

3553 AlternativeGeneration a_few_alt_gens_[kAFew];	3589 AlternativeGeneration a_few_alt_gens_[kAFew];

3554 };	3590 };

3555	3591

3556	3592

	3593 static const uc32 kLeadSurrogateStart = 0xd800;

	3594 static const uc32 kLeadSurrogateEnd = 0xdbff;

	3595 static const uc32 kTrailSurrogateStart = 0xdc00;

	3596 static const uc32 kTrailSurrogateEnd = 0xdfff;

	3597 static const uc32 kNonBmpStart = 0x10000;

	3598 static const uc32 kNonBmpEnd = 0x10ffff;

	3599 static const uc32 kRangeEndMarker = 0x110000;

	3600

3557 // The '2' variant is has inclusive from and exclusive to.	3601 // The '2' variant is has inclusive from and exclusive to.

3558 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,	3602 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,

3559 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.	3603 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.

3560 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,	3604 static const int kSpaceRanges[] = {

3561 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,	3605 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,

3562 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,	3606 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,

3563 0xFEFF, 0xFF00, 0x10000 };	3607 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};

3564 static const int kSpaceRangeCount = arraysize(kSpaceRanges);	3608 static const int kSpaceRangeCount = arraysize(kSpaceRanges);

3565	3609

3566 static const int kWordRanges[] = {	3610 static const int kWordRanges[] = {

3567 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };	3611 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};

3568 static const int kWordRangeCount = arraysize(kWordRanges);	3612 static const int kWordRangeCount = arraysize(kWordRanges);

3569 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };	3613 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};

3570 static const int kDigitRangeCount = arraysize(kDigitRanges);	3614 static const int kDigitRangeCount = arraysize(kDigitRanges);

3571 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };	3615 static const int kSurrogateRanges[] = {

	3616 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};

3572 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);	3617 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);

3573 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,	3618 static const int kLineTerminatorRanges[] = {

3574 0x2028, 0x202A, 0x10000 };	3619 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};

3575 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);	3620 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);

3576	3621

3577

3578 void BoyerMoorePositionInfo::Set(int character) {	3622 void BoyerMoorePositionInfo::Set(int character) {

3579 SetInterval(Interval(character, character));	3623 SetInterval(Interval(character, character));

3580 }	3624 }

3581	3625

3582	3626

3583 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {	3627 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {

3584 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);	3628 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);

3585 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);	3629 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);

3586 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);	3630 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);

3587 surrogate_ =	3631 surrogate_ =

(...skipping 1137 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4725 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,	4769 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,

4726 RegExpNode* on_success) {	4770 RegExpNode* on_success) {

4727 return new (compiler->zone())	4771 return new (compiler->zone())

4728 TextNode(elements(), compiler->read_backward(), on_success);	4772 TextNode(elements(), compiler->read_backward(), on_success);

4729 }	4773 }

4730	4774

4731	4775

4732 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,	4776 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,

4733 const int* special_class,	4777 const int* special_class,

4734 int length) {	4778 int length) {

4735 length--; // Remove final 0x10000.	4779 length--; // Remove final marker.

4736 DCHECK(special_class[length] == 0x10000);	4780 DCHECK(special_class[length] == kRangeEndMarker);

4737 DCHECK(ranges->length() != 0);	4781 DCHECK(ranges->length() != 0);

4738 DCHECK(length != 0);	4782 DCHECK(length != 0);

4739 DCHECK(special_class[0] != 0);	4783 DCHECK(special_class[0] != 0);

4740 if (ranges->length() != (length >> 1) + 1) {	4784 if (ranges->length() != (length >> 1) + 1) {

4741 return false;	4785 return false;

4742 }	4786 }

4743 CharacterRange range = ranges->at(0);	4787 CharacterRange range = ranges->at(0);

4744 if (range.from() != 0) {	4788 if (range.from() != 0) {

4745 return false;	4789 return false;

4746 }	4790 }

4747 for (int i = 0; i < length; i += 2) {	4791 for (int i = 0; i < length; i += 2) {

4748 if (special_class[i] != (range.to() + 1)) {	4792 if (special_class[i] != (range.to() + 1)) {

4749 return false;	4793 return false;

4750 }	4794 }

4751 range = ranges->at((i >> 1) + 1);	4795 range = ranges->at((i >> 1) + 1);

4752 if (special_class[i+1] != range.from()) {	4796 if (special_class[i+1] != range.from()) {

4753 return false;	4797 return false;

4754 }	4798 }

4755 }	4799 }

4756 if (range.to() != 0xffff) {	4800 if (range.to() != 0xffff) {

4757 return false;	4801 return false;

4758 }	4802 }

4759 return true;	4803 return true;

4760 }	4804 }

4761	4805

4762	4806

4763 static bool CompareRanges(ZoneList<CharacterRange>* ranges,	4807 static bool CompareRanges(ZoneList<CharacterRange>* ranges,

4764 const int* special_class,	4808 const int* special_class,

4765 int length) {	4809 int length) {

4766 length--; // Remove final 0x10000.	4810 length--; // Remove final marker.

4767 DCHECK(special_class[length] == 0x10000);	4811 DCHECK(special_class[length] == kRangeEndMarker);

4768 if (ranges->length() * 2 != length) {	4812 if (ranges->length() * 2 != length) {

4769 return false;	4813 return false;

4770 }	4814 }

4771 for (int i = 0; i < length; i += 2) {	4815 for (int i = 0; i < length; i += 2) {

4772 CharacterRange range = ranges->at(i >> 1);	4816 CharacterRange range = ranges->at(i >> 1);

4773 if (range.from() != special_class[i] \|\|	4817 if (range.from() != special_class[i] \|\|

4774 range.to() != special_class[i + 1] - 1) {	4818 range.to() != special_class[i + 1] - 1) {

4775 return false;	4819 return false;

4776 }	4820 }

4777 }	4821 }

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4813 return true;	4857 return true;

4814 }	4858 }

4815 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {	4859 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {

4816 set_.set_standard_set_type('W');	4860 set_.set_standard_set_type('W');

4817 return true;	4861 return true;

4818 }	4862 }

4819 return false;	4863 return false;

4820 }	4864 }

4821	4865

4822	4866

	4867 bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) {

	4868 ZoneList<CharacterRange>* ranges = this->ranges(zone);

	4869 CharacterRange::Canonicalize(ranges);

	4870 for (int i = ranges->length() - 1; i >= 0; i--) {

	4871 uc32 from = ranges->at(i).from();

	4872 uc32 to = ranges->at(i).to();

	4873 // Check for non-BMP characters.

	4874 if (to >= kNonBmpStart) return true;

	4875 // Check for lone surrogates.

	4876 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;

	4877 }

	4878 return false;

	4879 }

	4880

	4881

	4882 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,

	4883 ZoneList<CharacterRange>* base)

	4884 : zone_(zone),

	4885 table_(zone),

	4886 bmp_(nullptr),

	4887 lead_surrogates_(nullptr),

	4888 trail_surrogates_(nullptr),

	4889 non_bmp_(nullptr) {
	erikcorry 2016/01/20 10:47:05 What is going on here? Some explanation needed, I What is going on here? Some explanation needed, I feel. Yang 2016/01/20 13:06:20 Added a wall of comment. Show quoted text On 2016/01/20 10:47:05, erikcorry wrote: > What is going on here? Some explanation needed, I feel. Added a wall of comment.
	4890 for (int i = 0; i < base->length(); i++) {

	4891 table_.AddRange(base->at(i), kBase, zone_);

	4892 }

	4893 table_.AddRange(CharacterRange(0, kLeadSurrogateStart - 1), kBmpCodePoints,

	4894 zone_);

	4895 table_.AddRange(CharacterRange(kLeadSurrogateStart, kLeadSurrogateEnd),

	4896 kLeadSurrogates, zone_);

	4897 table_.AddRange(CharacterRange(kTrailSurrogateStart, kTrailSurrogateEnd),

	4898 kTrailSurrogates, zone_);

	4899 table_.AddRange(CharacterRange(kTrailSurrogateEnd, kNonBmpStart - 1),

	4900 kBmpCodePoints, zone_);

	4901 table_.AddRange(CharacterRange(kNonBmpStart, kNonBmpEnd), kNonBmpCodePoints,

	4902 zone_);

	4903 table_.ForEach(this);

	4904 }

	4905

	4906

	4907 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {

	4908 OutSet* outset = entry.out_set();

	4909 if (!outset->Get(kBase)) return;

	4910 ZoneList<CharacterRange>** target = NULL;

	4911 if (outset->Get(kBmpCodePoints)) {

	4912 target = &bmp_;

	4913 } else if (outset->Get(kLeadSurrogates)) {

	4914 target = &lead_surrogates_;

	4915 } else if (outset->Get(kTrailSurrogates)) {

	4916 target = &trail_surrogates_;

	4917 } else {

	4918 DCHECK(outset->Get(kNonBmpCodePoints));

	4919 target = &non_bmp_;

	4920 }

	4921 if (target == NULL) target = new (zone_) ZoneList<CharacterRange>(2, zone_);

	4922 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);

	4923 }

	4924

	4925

	4926 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,

	4927 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {

	4928 ZoneList<CharacterRange>* bmp = splitter->bmp();

	4929 if (bmp == nullptr) return;

	4930 result->AddAlternative(GuardedAlternative(TextNode::CharacterRanges(

	4931 compiler->zone(), bmp, compiler->read_backward(), on_success)));

	4932 }

	4933

	4934

	4935 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,

	4936 RegExpNode* on_success,

	4937 UnicodeRangeSplitter* splitter) {

	4938 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();

	4939 if (non_bmp == nullptr) return;

	4940 DCHECK(compiler->unicode());

	4941 DCHECK(!compiler->one_byte());

	4942 Zone* zone = compiler->zone();

	4943 CharacterRange::Canonicalize(non_bmp);

	4944 for (int i = 0; i < non_bmp->length(); i++) {

	4945 // Match surrogate pair.

	4946 // E.g. [\u10005-\u11005] becomes

	4947 // \ud800[\udc05-\udfff]\|

	4948 // [\ud801-\ud803][\udc00-\udfff]\|

	4949 // \ud804[\udc00-\udc05]

	4950 uc32 from = non_bmp->at(i).from();

	4951 uc32 to = non_bmp->at(i).to();

	4952 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);

	4953 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);

	4954 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);

	4955 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);

	4956 if (from_l == to_l) {

	4957 // The lead surrogate is the same.

	4958 result->AddAlternative(GuardedAlternative(

	4959 TextNode::SurrogatePair(zone, CharacterRange::Singleton(from_l),

	4960 CharacterRange::Range(from_t, to_t),

	4961 compiler->read_backward(), on_success)));

	4962 } else if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {

	4963 // Can be represented as [<from_l>-<to_l>][\udc00-\udfff].

	4964 // This would simplify /./u a bit.

	4965 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(

	4966 zone, CharacterRange::Range(from_l, to_l),

	4967 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),

	4968 compiler->read_backward(), on_success)));

	4969 } else {

	4970 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(
	erikcorry 2016/01/20 10:47:05 This one we only need if from_t is not kTrailSurro This one we only need if from_t is not kTrailSurrogateStart. Proposal: if (from_t != kTrailSurrogateStart) { result->AddAlternative(... from_l++; } if (to_t != kTrailSurrogateEnd) { result->AddAlternative(... to_l--; } result->AddAlternative(... This replaces the else-if and the else clause. Yang 2016/01/20 13:06:20 Done. Show quoted text On 2016/01/20 10:47:05, erikcorry wrote: > This one we only need if from_t is not kTrailSurrogateStart. > > Proposal: > > if (from_t != kTrailSurrogateStart) { > result->AddAlternative(... > from_l++; > } > if (to_t != kTrailSurrogateEnd) { > result->AddAlternative(... > to_l--; > } > result->AddAlternative(... > > This replaces the else-if and the else clause. Done.
	4971 zone, CharacterRange::Singleton(from_l),

	4972 CharacterRange::Range(from_t, kTrailSurrogateEnd),

	4973 compiler->read_backward(), on_success)));

	4974 if (from_l + 1 < to_l) {

	4975 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(

	4976 zone, CharacterRange::Range(from_l + 1, to_l - 1),

	4977 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),

	4978 compiler->read_backward(), on_success)));

	4979 }

	4980 result->AddAlternative(GuardedAlternative(TextNode::SurrogatePair(

	4981 zone, CharacterRange::Singleton(to_l),

	4982 CharacterRange::Range(kTrailSurrogateStart, to_t),

	4983 compiler->read_backward(), on_success)));

	4984 }

	4985 }

	4986 }

	4987

	4988

	4989 RegExpNode* NegativeLookbehindAndMatch(RegExpCompiler* compiler,

	4990 ZoneList<CharacterRange>* lookbehind,

	4991 ZoneList<CharacterRange>* match,

	4992 RegExpNode* on_success,

	4993 bool read_backward) {

	4994 Zone* zone = compiler->zone();

	4995 RegExpNode* match_node =

	4996 TextNode::CharacterRanges(zone, match, read_backward, on_success);

	4997 int stack_register = compiler->UnicodeLookaroundStackRegister();

	4998 int position_register = compiler->UnicodeLookaroundPositionRegister();

	4999 RegExpLookaround::Builder lookaround(false, match_node, stack_register,

	5000 position_register);

	5001 RegExpNode* negative_match = TextNode::CharacterRanges(

	5002 zone, lookbehind, !read_backward, lookaround.on_match_success());

	5003 return lookaround.ForMatch(negative_match);

	5004 }

	5005

	5006

	5007 RegExpNode* MatchAndNegativeLookahead(RegExpCompiler* compiler,

	5008 ZoneList<CharacterRange>* match,

	5009 ZoneList<CharacterRange>* lookahead,

	5010 RegExpNode* on_success,

	5011 bool read_backward) {

	5012 Zone* zone = compiler->zone();

	5013 int stack_register = compiler->UnicodeLookaroundStackRegister();

	5014 int position_register = compiler->UnicodeLookaroundPositionRegister();

	5015 RegExpLookaround::Builder lookaround(false, on_success, stack_register,

	5016 position_register);

	5017 RegExpNode* negative_match = TextNode::CharacterRanges(

	5018 zone, lookahead, read_backward, lookaround.on_match_success());

	5019 return TextNode::CharacterRanges(zone, match, read_backward,

	5020 lookaround.ForMatch(negative_match));

	5021 }

	5022

	5023

	5024 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,

	5025 RegExpNode* on_success,

	5026 UnicodeRangeSplitter* splitter) {

	5027 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();

	5028 if (lead_surrogates == nullptr) return;

	5029 Zone* zone = compiler->zone();

	5030 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).

	5031 ZoneList<CharacterRange>* trail_surrogates =

	5032 new (zone) ZoneList<CharacterRange>(1, zone);

	5033 trail_surrogates->Add(

	5034 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), zone);

	5035

	5036 RegExpNode* match =

	5037 compiler->read_backward()

	5038 // reading backward, make sure of no trail surrogate before the match.

	5039 ? NegativeLookbehindAndMatch(compiler, trail_surrogates,
	erikcorry 2016/01/20 10:47:05 I don't think so. Even if you are reading backwar I don't think so. Even if you are reading backwards, it is the position after the match that needs to be checked. But in the backwards case you want to check it chronologically before you check the actual character (and move the current position). Yang 2016/01/20 13:06:20 I think it's the naming that's confusing. Inside t Show quoted text On 2016/01/20 10:47:05, erikcorry wrote: > I don't think so. Even if you are reading backwards, it is the position after > the match that needs to be checked. But in the backwards case you want to check > it chronologically before you check the actual character (and move the current > position). I think it's the naming that's confusing. Inside the function, if the last parameter is true (read_backward), we first perform a negative lookahead for trailing surrogates, and then do a backward match for the lead surrogate. NegativeLookbehindAndMatch is actually to be understood as such in reading forward case. When reading backward, we would actually be doing a negative lookahead. I renamed it to be easier to understand, and elaborated in the comments.
	5040 lead_surrogates, on_success, true)

	5041 // reading forward, make sure of no trail surrogate after the match.

	5042 : MatchAndNegativeLookahead(compiler, lead_surrogates,

	5043 trail_surrogates, on_success, false);

	5044 result->AddAlternative(GuardedAlternative(match));

	5045 }

	5046

	5047

	5048 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,

	5049 RegExpNode* on_success,

	5050 UnicodeRangeSplitter* splitter) {

	5051 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();

	5052 if (trail_surrogates == nullptr) return;

	5053 Zone* zone = compiler->zone();

	5054 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01

	5055 ZoneList<CharacterRange>* lead_surrogates =

	5056 new (zone) ZoneList<CharacterRange>(1, zone);

	5057 lead_surrogates->Add(

	5058 CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), zone);

	5059

	5060 RegExpNode* match =

	5061 compiler->read_backward()

	5062 // reading backward, make sure of no lead surrogate after the match.

	5063 ? MatchAndNegativeLookahead(compiler, trail_surrogates,

	5064 lead_surrogates, on_success, true)

	5065 // reading forward, make sure of no lead surrogate before the match.

	5066 : NegativeLookbehindAndMatch(compiler, lead_surrogates,

	5067 trail_surrogates, on_success, false);

	5068 result->AddAlternative(GuardedAlternative(match));

	5069 }

	5070

	5071

4823 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,	5072 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

4824 RegExpNode* on_success) {	5073 RegExpNode* on_success) {

4825 return new (compiler->zone())	5074 set_.Canonicalize();

4826 TextNode(this, compiler->read_backward(), on_success);	5075 Zone* zone = compiler->zone();

4827 }	5076 ZoneList<CharacterRange>* ranges = this->ranges(zone);

4828	5077 if (compiler->unicode() && !compiler->one_byte()) {

4829	5078 if (is_negated()) {

	5079 ZoneList<CharacterRange>* negated =

	5080 new (zone) ZoneList<CharacterRange>(2, zone);

	5081 CharacterRange::Negate(ranges, negated, zone);

	5082 ranges = negated;

	5083 }

	5084 if (ranges->length() == 0) {

	5085 // No matches possible.

	5086 return new (zone) EndNode(EndNode::BACKTRACK, zone);

	5087 }

	5088 UnicodeRangeSplitter splitter(zone, ranges);

	5089 ChoiceNode* result = new (compiler->zone()) ChoiceNode(2, compiler->zone());

	5090 AddBmpCharacters(compiler, result, on_success, &splitter);

	5091 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);

	5092 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);

	5093 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);

	5094 return result;

	5095 } else {

	5096 // Limit the ranges to 0 .. 0xffff.

	5097 int i;

	5098 for (i = ranges->length() - 1; i >= 0; i--) {

	5099 if (ranges->at(i).from() <= String::kMaxUtf16CodeUnit) break;
	erikcorry 2016/01/20 10:47:05 For the one-byte case don't we want to have a lowe For the one-byte case don't we want to have a lower limit than kMaxUtf16CodeUnit? Yang 2016/01/20 13:06:20 Actually this is not necessary since we do the lim Show quoted text On 2016/01/20 10:47:05, erikcorry wrote: > For the one-byte case don't we want to have a lower limit than > kMaxUtf16CodeUnit? Actually this is not necessary since we do the limiting again in EmitCharClass, but then properly account for the one-byte case. Removed.
	5100 }

	5101 ranges->at(i).set_to(Min(ranges->at(i).to(), String::kMaxUtf16CodeUnit));

	5102 ranges->Rewind(i + 1);

	5103 return new (zone) TextNode(this, compiler->read_backward(), on_success);

	5104 }

	5105 }

	5106

	5107

4830 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {	5108 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {

4831 RegExpAtom* atom1 = (*a)->AsAtom();	5109 RegExpAtom* atom1 = (*a)->AsAtom();

4832 RegExpAtom* atom2 = (*b)->AsAtom();	5110 RegExpAtom* atom2 = (*b)->AsAtom();

4833 uc16 character1 = atom1->data().at(0);	5111 uc16 character1 = atom1->data().at(0);

4834 uc16 character2 = atom2->data().at(0);	5112 uc16 character2 = atom2->data().at(0);

4835 if (character1 < character2) return -1;	5113 if (character1 < character2) return -1;

4836 if (character1 > character2) return 1;	5114 if (character1 > character2) return 1;

4837 return 0;	5115 return 0;

4838 }	5116 }

4839	5117

(...skipping 491 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5331 compiler->read_backward(), on_success);	5609 compiler->read_backward(), on_success);

5332 }	5610 }

5333	5611

5334	5612

5335 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,	5613 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,

5336 RegExpNode* on_success) {	5614 RegExpNode* on_success) {

5337 return on_success;	5615 return on_success;

5338 }	5616 }

5339	5617

5340	5618

	5619 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,

	5620 int stack_pointer_register,

	5621 int position_register,

	5622 int capture_register_count,

	5623 int capture_register_start)

	5624 : is_positive_(is_positive),

	5625 on_success_(on_success),

	5626 stack_pointer_register_(stack_pointer_register),

	5627 position_register_(position_register) {

	5628 if (is_positive_) {

	5629 on_match_success_ = ActionNode::PositiveSubmatchSuccess(

	5630 stack_pointer_register, position_register, capture_register_count,

	5631 capture_register_start, on_success_);

	5632 } else {

	5633 Zone* zone = on_success_->zone();

	5634 on_match_success_ = new (zone) NegativeSubmatchSuccess(

	5635 stack_pointer_register, position_register, capture_register_count,

	5636 capture_register_start, zone);

	5637 }

	5638 }

	5639

	5640

	5641 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {

	5642 if (is_positive_) {

	5643 return ActionNode::BeginSubmatch(stack_pointer_register_,

	5644 position_register_, match);

	5645 } else {

	5646 Zone* zone = on_success_->zone();

	5647 // We use a ChoiceNode to represent the negative lookaround. The first

	5648 // alternative is the negative match. On success, the end node backtracks.

	5649 // On failure, the second alternative is tried and leads to success.

	5650 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the

	5651 // first exit when calculating quick checks.

	5652 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(

	5653 GuardedAlternative(match), GuardedAlternative(on_success_), zone);

	5654 return ActionNode::BeginSubmatch(stack_pointer_register_,

	5655 position_register_, choice_node);

	5656 }

	5657 }

	5658

	5659

5341 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,	5660 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,

5342 RegExpNode* on_success) {	5661 RegExpNode* on_success) {

5343 int stack_pointer_register = compiler->AllocateRegister();	5662 int stack_pointer_register = compiler->AllocateRegister();

5344 int position_register = compiler->AllocateRegister();	5663 int position_register = compiler->AllocateRegister();

5345	5664

5346 const int registers_per_capture = 2;	5665 const int registers_per_capture = 2;

5347 const int register_of_first_capture = 2;	5666 const int register_of_first_capture = 2;

5348 int register_count = capture_count_ * registers_per_capture;	5667 int register_count = capture_count_ * registers_per_capture;

5349 int register_start =	5668 int register_start =

5350 register_of_first_capture + capture_from_ * registers_per_capture;	5669 register_of_first_capture + capture_from_ * registers_per_capture;

5351	5670

5352 RegExpNode* result;	5671 RegExpNode* result;

5353 bool was_reading_backward = compiler->read_backward();	5672 bool was_reading_backward = compiler->read_backward();

5354 compiler->set_read_backward(type() == LOOKBEHIND);	5673 compiler->set_read_backward(type() == LOOKBEHIND);

5355 if (is_positive()) {	5674 Builder builder(is_positive(), on_success, stack_pointer_register,

5356 result = ActionNode::BeginSubmatch(	5675 position_register, register_count, register_start);

5357 stack_pointer_register, position_register,	5676 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());

5358 body()->ToNode(compiler,	5677 result = builder.ForMatch(match);

5359 ActionNode::PositiveSubmatchSuccess(

5360 stack_pointer_register, position_register,

5361 register_count, register_start, on_success)));

5362 } else {

5363 // We use a ChoiceNode for a negative lookahead because it has most of

5364 // the characteristics we need. It has the body of the lookahead as its

5365 // first alternative and the expression after the lookahead of the second

5366 // alternative. If the first alternative succeeds then the

5367 // NegativeSubmatchSuccess will unwind the stack including everything the

5368 // choice node set up and backtrack. If the first alternative fails then

5369 // the second alternative is tried, which is exactly the desired result

5370 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special

5371 // ChoiceNode that knows to ignore the first exit when calculating quick

5372 // checks.

5373 Zone* zone = compiler->zone();

5374

5375 GuardedAlternative body_alt(

5376 body()->ToNode(compiler, new (zone) NegativeSubmatchSuccess(

5377 stack_pointer_register, position_register,

5378 register_count, register_start, zone)));

5379 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(

5380 body_alt, GuardedAlternative(on_success), zone);

5381 result = ActionNode::BeginSubmatch(stack_pointer_register,

5382 position_register, choice_node);

5383 }

5384 compiler->set_read_backward(was_reading_backward);	5678 compiler->set_read_backward(was_reading_backward);

5385 return result;	5679 return result;

5386 }	5680 }

5387	5681

5388	5682

5389 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,	5683 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,

5390 RegExpNode* on_success) {	5684 RegExpNode* on_success) {

5391 return ToNode(body(), index(), compiler, on_success);	5685 return ToNode(body(), index(), compiler, on_success);

5392 }	5686 }

5393	5687

(...skipping 27 matching lines...) Expand all Loading...
5421 }	5715 }

5422 return current;	5716 return current;

5423 }	5717 }

5424	5718

5425	5719

5426 static void AddClass(const int* elmv,	5720 static void AddClass(const int* elmv,

5427 int elmc,	5721 int elmc,

5428 ZoneList<CharacterRange>* ranges,	5722 ZoneList<CharacterRange>* ranges,

5429 Zone* zone) {	5723 Zone* zone) {

5430 elmc--;	5724 elmc--;

5431 DCHECK(elmv[elmc] == 0x10000);	5725 DCHECK(elmv[elmc] == kRangeEndMarker);

5432 for (int i = 0; i < elmc; i += 2) {	5726 for (int i = 0; i < elmc; i += 2) {

5433 DCHECK(elmv[i] < elmv[i + 1]);	5727 DCHECK(elmv[i] < elmv[i + 1]);

5434 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);	5728 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);

5435 }	5729 }

5436 }	5730 }

5437	5731

5438	5732

5439 static void AddClassNegated(const int *elmv,	5733 static void AddClassNegated(const int *elmv,

5440 int elmc,	5734 int elmc,

5441 ZoneList<CharacterRange>* ranges,	5735 ZoneList<CharacterRange>* ranges,

5442 Zone* zone) {	5736 Zone* zone) {

5443 elmc--;	5737 elmc--;

5444 DCHECK(elmv[elmc] == 0x10000);	5738 DCHECK(elmv[elmc] == kRangeEndMarker);

5445 DCHECK(elmv[0] != 0x0000);	5739 DCHECK(elmv[0] != 0x0000);

5446 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);	5740 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);

5447 uc16 last = 0x0000;	5741 uc16 last = 0x0000;

5448 for (int i = 0; i < elmc; i += 2) {	5742 for (int i = 0; i < elmc; i += 2) {

5449 DCHECK(last <= elmv[i] - 1);	5743 DCHECK(last <= elmv[i] - 1);

5450 DCHECK(elmv[i] < elmv[i + 1]);	5744 DCHECK(elmv[i] < elmv[i + 1]);

5451 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);	5745 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);

5452 last = elmv[i + 1];	5746 last = elmv[i + 1];

5453 }	5747 }

5454 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);	5748 ranges->Add(CharacterRange(last, String::kMaxCodePoint), zone);

5455 }	5749 }

5456	5750

5457	5751

5458 void CharacterRange::AddClassEscape(uc16 type,	5752 void CharacterRange::AddClassEscape(uc16 type,

5459 ZoneList<CharacterRange>* ranges,	5753 ZoneList<CharacterRange>* ranges,

5460 Zone* zone) {	5754 Zone* zone) {

5461 switch (type) {	5755 switch (type) {

5462 case 's':	5756 case 's':

5463 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);	5757 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);

5464 break;	5758 break;

(...skipping 36 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5501 UNREACHABLE();	5795 UNREACHABLE();

5502 }	5796 }

5503 }	5797 }

5504	5798

5505	5799

5506 Vector<const int> CharacterRange::GetWordBounds() {	5800 Vector<const int> CharacterRange::GetWordBounds() {

5507 return Vector<const int>(kWordRanges, kWordRangeCount - 1);	5801 return Vector<const int>(kWordRanges, kWordRangeCount - 1);

5508 }	5802 }

5509	5803

5510	5804

5511 class CharacterRangeSplitter {

5512 public:

5513 CharacterRangeSplitter(ZoneList<CharacterRange>** included,

5514 ZoneList<CharacterRange>** excluded,

5515 Zone* zone)

5516 : included_(included),

5517 excluded_(excluded),

5518 zone_(zone) { }

5519 void Call(uc16 from, DispatchTable::Entry entry);

5520

5521 static const int kInBase = 0;

5522 static const int kInOverlay = 1;

5523

5524 private:

5525 ZoneList<CharacterRange>** included_;

5526 ZoneList<CharacterRange>** excluded_;

5527 Zone* zone_;

5528 };

5529

5530

5531 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {

5532 if (!entry.out_set()->Get(kInBase)) return;

5533 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)

5534 ? included_

5535 : excluded_;

5536 if (target == NULL) target = new(zone_) ZoneList<CharacterRange>(2, zone_);

5537 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);

5538 }

5539

5540

5541 void CharacterRange::Split(ZoneList<CharacterRange>* base,

5542 Vector<const int> overlay,

5543 ZoneList<CharacterRange>** included,

5544 ZoneList<CharacterRange>** excluded,

5545 Zone* zone) {

5546 DCHECK_NULL(*included);

5547 DCHECK_NULL(*excluded);

5548 DispatchTable table(zone);

5549 for (int i = 0; i < base->length(); i++)

5550 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);

5551 for (int i = 0; i < overlay.length(); i += 2) {

5552 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),

5553 CharacterRangeSplitter::kInOverlay, zone);

5554 }

5555 CharacterRangeSplitter callback(included, excluded, zone);

5556 table.ForEach(&callback);

5557 }

5558

5559

5560 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,	5805 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,

5561 ZoneList<CharacterRange>* ranges,	5806 ZoneList<CharacterRange>* ranges,

5562 bool is_one_byte) {	5807 bool is_one_byte) {

5563 uc16 bottom = from();	5808 uc32 bottom = from();

5564 uc16 top = to();	5809 uc32 top = to();

	5810 // Nothing to be done for surrogates.

	5811 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;

5565 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {	5812 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {

5566 if (bottom > String::kMaxOneByteCharCode) return;	5813 if (bottom > String::kMaxOneByteCharCode) return;

5567 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;	5814 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;

5568 }	5815 }

5569 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	5816 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5570 if (top == bottom) {	5817 if (top == bottom) {

5571 // If this is a singleton we just expand the one character.	5818 // If this is a singleton we just expand the one character.

5572 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);	5819 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);

5573 for (int i = 0; i < length; i++) {	5820 for (int i = 0; i < length; i++) {

5574 uc32 chr = chars[i];	5821 uc32 chr = chars[i];

(...skipping 17 matching lines...) Expand all Loading...
5592 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only	5839 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only

5593 // add a range if it is not already contained in the input, so [c-f]	5840 // add a range if it is not already contained in the input, so [c-f]

5594 // will be skipped but [C-F] will be added. If this range is not	5841 // will be skipped but [C-F] will be added. If this range is not

5595 // completely contained in a block we do this for all the blocks	5842 // completely contained in a block we do this for all the blocks

5596 // covered by the range (handling characters that is not in a block	5843 // covered by the range (handling characters that is not in a block

5597 // as a "singleton block").	5844 // as a "singleton block").

5598 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];	5845 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5599 int pos = bottom;	5846 int pos = bottom;

5600 while (pos <= top) {	5847 while (pos <= top) {

5601 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);	5848 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);

5602 uc16 block_end;	5849 uc32 block_end;

5603 if (length == 0) {	5850 if (length == 0) {

5604 block_end = pos;	5851 block_end = pos;

5605 } else {	5852 } else {

5606 DCHECK_EQ(1, length);	5853 DCHECK_EQ(1, length);

5607 block_end = range[0];	5854 block_end = range[0];

5608 }	5855 }

5609 int end = (block_end > top) ? top : block_end;	5856 int end = (block_end > top) ? top : block_end;

5610 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);	5857 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);

5611 for (int i = 0; i < length; i++) {	5858 for (int i = 0; i < length; i++) {

5612 uc32 c = range[i];	5859 uc32 c = range[i];

5613 uc16 range_from = c - (block_end - pos);	5860 uc32 range_from = c - (block_end - pos);

5614 uc16 range_to = c - (block_end - end);	5861 uc32 range_to = c - (block_end - end);

5615 if (!(bottom <= range_from && range_to <= top)) {	5862 if (!(bottom <= range_from && range_to <= top)) {

5616 ranges->Add(CharacterRange(range_from, range_to), zone);	5863 ranges->Add(CharacterRange(range_from, range_to), zone);

5617 }	5864 }

5618 }	5865 }

5619 pos = end + 1;	5866 pos = end + 1;

5620 }	5867 }

5621 }	5868 }

5622 }	5869 }

5623	5870

5624	5871

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5665	5912

5666	5913

5667 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,	5914 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,

5668 int count,	5915 int count,

5669 CharacterRange insert) {	5916 CharacterRange insert) {

5670 // Inserts a range into list[0..count[, which must be sorted	5917 // Inserts a range into list[0..count[, which must be sorted

5671 // by from value and non-overlapping and non-adjacent, using at most	5918 // by from value and non-overlapping and non-adjacent, using at most

5672 // list[0..count] for the result. Returns the number of resulting	5919 // list[0..count] for the result. Returns the number of resulting

5673 // canonicalized ranges. Inserting a range may collapse existing ranges into	5920 // canonicalized ranges. Inserting a range may collapse existing ranges into

5674 // fewer ranges, so the return value can be anything in the range 1..count+1.	5921 // fewer ranges, so the return value can be anything in the range 1..count+1.

5675 uc16 from = insert.from();	5922 uc32 from = insert.from();

5676 uc16 to = insert.to();	5923 uc32 to = insert.to();

5677 int start_pos = 0;	5924 int start_pos = 0;

5678 int end_pos = count;	5925 int end_pos = count;

5679 for (int i = count - 1; i >= 0; i--) {	5926 for (int i = count - 1; i >= 0; i--) {

5680 CharacterRange current = list->at(i);	5927 CharacterRange current = list->at(i);

5681 if (current.from() > to + 1) {	5928 if (current.from() > to + 1) {

5682 end_pos = i;	5929 end_pos = i;

5683 } else if (current.to() + 1 < from) {	5930 } else if (current.to() + 1 < from) {

5684 start_pos = i + 1;	5931 start_pos = i + 1;

5685 break;	5932 break;

5686 }	5933 }

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5766 DCHECK(CharacterRange::IsCanonical(character_ranges));	6013 DCHECK(CharacterRange::IsCanonical(character_ranges));

5767 }	6014 }

5768	6015

5769	6016

5770 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,	6017 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,

5771 ZoneList<CharacterRange>* negated_ranges,	6018 ZoneList<CharacterRange>* negated_ranges,

5772 Zone* zone) {	6019 Zone* zone) {

5773 DCHECK(CharacterRange::IsCanonical(ranges));	6020 DCHECK(CharacterRange::IsCanonical(ranges));

5774 DCHECK_EQ(0, negated_ranges->length());	6021 DCHECK_EQ(0, negated_ranges->length());

5775 int range_count = ranges->length();	6022 int range_count = ranges->length();

5776 uc16 from = 0;	6023 uc32 from = 0;

5777 int i = 0;	6024 int i = 0;

5778 if (range_count > 0 && ranges->at(0).from() == 0) {	6025 if (range_count > 0 && ranges->at(0).from() == 0) {

5779 from = ranges->at(0).to();	6026 from = ranges->at(0).to();

5780 i = 1;	6027 i = 1;

5781 }	6028 }

5782 while (i < range_count) {	6029 while (i < range_count) {

5783 CharacterRange range = ranges->at(i);	6030 CharacterRange range = ranges->at(i);

5784 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);	6031 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);

5785 from = range.to();	6032 from = range.to();

5786 i++;	6033 i++;

5787 }	6034 }

5788 if (from < String::kMaxUtf16CodeUnit) {	6035 if (from < String::kMaxCodePoint) {

5789 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),	6036 negated_ranges->Add(CharacterRange(from + 1, String::kMaxCodePoint), zone);

5790 zone);

5791 }	6037 }

5792 }	6038 }

5793	6039

5794	6040

5795 // -------------------------------------------------------------------	6041 // -------------------------------------------------------------------

5796 // Splay tree	6042 // Splay tree

5797	6043

5798	6044

5799 OutSet* OutSet::Extend(unsigned value, Zone* zone) {	6045 OutSet* OutSet::Extend(unsigned value, Zone* zone) {

5800 if (Get(value))	6046 if (Get(value))

(...skipping 30 matching lines...) Expand all Loading...
5831 if (value < kFirstLimit) {	6077 if (value < kFirstLimit) {

5832 return (first_ & (1 << value)) != 0;	6078 return (first_ & (1 << value)) != 0;

5833 } else if (remaining_ == NULL) {	6079 } else if (remaining_ == NULL) {

5834 return false;	6080 return false;

5835 } else {	6081 } else {

5836 return remaining_->Contains(value);	6082 return remaining_->Contains(value);

5837 }	6083 }

5838 }	6084 }

5839	6085

5840	6086

5841 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;	6087 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;

5842	6088

5843	6089

5844 void DispatchTable::AddRange(CharacterRange full_range, int value,	6090 void DispatchTable::AddRange(CharacterRange full_range, int value,

5845 Zone* zone) {	6091 Zone* zone) {

5846 CharacterRange current = full_range;	6092 CharacterRange current = full_range;

5847 if (tree()->is_empty()) {	6093 if (tree()->is_empty()) {

5848 // If this is the first range we just insert into the table.	6094 // If this is the first range we just insert into the table.

5849 ZoneSplayTree<Config>::Locator loc;	6095 ZoneSplayTree<Config>::Locator loc;

5850 bool inserted = tree()->Insert(current.from(), &loc);	6096 bool inserted = tree()->Insert(current.from(), &loc);

5851 DCHECK(inserted);	6097 DCHECK(inserted);

(...skipping 81 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5933 USE(inserted);	6179 USE(inserted);

5934 ins.set_value(Entry(current.from(),	6180 ins.set_value(Entry(current.from(),

5935 current.to(),	6181 current.to(),

5936 empty()->Extend(value, zone)));	6182 empty()->Extend(value, zone)));

5937 break;	6183 break;

5938 }	6184 }

5939 }	6185 }

5940 }	6186 }

5941	6187

5942	6188

5943 OutSet* DispatchTable::Get(uc16 value) {	6189 OutSet* DispatchTable::Get(uc32 value) {

5944 ZoneSplayTree<Config>::Locator loc;	6190 ZoneSplayTree<Config>::Locator loc;

5945 if (!tree()->FindGreatestLessThan(value, &loc))	6191 if (!tree()->FindGreatestLessThan(value, &loc))

5946 return empty();	6192 return empty();

5947 Entry* entry = &loc.value();	6193 Entry* entry = &loc.value();

5948 if (value <= entry->to())	6194 if (value <= entry->to())

5949 return entry->out_set();	6195 return entry->out_set();

5950 else	6196 else

5951 return empty();	6197 return empty();

5952 }	6198 }

5953	6199

(...skipping 297 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6251 }	6497 }

6252	6498

6253	6499

6254 void DispatchTableConstructor::VisitAction(ActionNode* that) {	6500 void DispatchTableConstructor::VisitAction(ActionNode* that) {

6255 RegExpNode* target = that->on_success();	6501 RegExpNode* target = that->on_success();

6256 target->Accept(this);	6502 target->Accept(this);

6257 }	6503 }

6258	6504

6259	6505

6260 RegExpEngine::CompilationResult RegExpEngine::Compile(	6506 RegExpEngine::CompilationResult RegExpEngine::Compile(

6261 Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case,	6507 Isolate* isolate, Zone* zone, RegExpCompileData* data,

6262 bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern,	6508 JSRegExp::Flags flags, Handle<String> pattern,

6263 Handle<String> sample_subject, bool is_one_byte) {	6509 Handle<String> sample_subject, bool is_one_byte) {

6264 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {	6510 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {

6265 return IrregexpRegExpTooBig(isolate);	6511 return IrregexpRegExpTooBig(isolate);

6266 }	6512 }

6267 RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case,	6513 bool ignore_case = flags & JSRegExp::kIgnoreCase;

	6514 bool is_sticky = flags & JSRegExp::kSticky;

	6515 bool is_global = flags & JSRegExp::kGlobal;

	6516 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,

6268 is_one_byte);	6517 is_one_byte);

6269	6518

6270 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));	6519 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));

6271	6520

6272 // Sample some characters from the middle of the string.	6521 // Sample some characters from the middle of the string.

6273 static const int kSampleSize = 128;	6522 static const int kSampleSize = 128;

6274	6523

6275 sample_subject = String::Flatten(sample_subject);	6524 sample_subject = String::Flatten(sample_subject);

6276 int chars_sampled = 0;	6525 int chars_sampled = 0;

6277 int half_way = (sample_subject->length() - kSampleSize) / 2;	6526 int half_way = (sample_subject->length() - kSampleSize) / 2;

(...skipping 222 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6500	6749

6501	6750

6502 void RegExpResultsCache::Clear(FixedArray* cache) {	6751 void RegExpResultsCache::Clear(FixedArray* cache) {

6503 for (int i = 0; i < kRegExpResultsCacheSize; i++) {	6752 for (int i = 0; i < kRegExpResultsCacheSize; i++) {

6504 cache->set(i, Smi::FromInt(0));	6753 cache->set(i, Smi::FromInt(0));

6505 }	6754 }

6506 }	6755 }

6507	6756

6508 } // namespace internal	6757 } // namespace internal

6509 } // namespace v8	6758 } // namespace v8

OLD	NEW

« src/regexp/jsregexp.h ('K') | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-ast.h » ('j') | no next file with comments »